author    Devtools Arcadia <[email protected]>  2022-02-07 18:08:42 +0300
committer Devtools Arcadia <[email protected]>  2022-02-07 18:08:42 +0300
commit    1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree      e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/apache/arrow/cpp
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/apache/arrow/cpp')
-rw-r--r-- contrib/libs/apache/arrow/cpp/CHANGELOG_PARQUET.md 501
-rw-r--r-- contrib/libs/apache/arrow/cpp/README.md 34
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc 595
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h 181
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc 1069
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h 57
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/api.h 44
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array.h 32
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/README.md 20
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc 308
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h 260
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc 108
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h 255
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc 63
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h 66
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc 442
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h 180
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc 757
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.h 523
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.cc 99
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h 135
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.cc 380
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h 203
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc 295
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h 276
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc 199
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h 670
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc 105
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h 94
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc 204
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h 572
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc 294
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h 482
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc 137
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h 479
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_time.h 43
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc 121
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h 235
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc 490
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.h 42
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc 331
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/data.h 258
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/dict_internal.h 193
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/diff.cc 784
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/diff.h 76
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc 754
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/util.h 78
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc 657
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h 55
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/buffer.cc 207
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/buffer.h 496
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h 450
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/builder.cc 222
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/builder.h 32
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/c/abi.h 103
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc 1712
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.h 197
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/c/helpers.h 117
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/c/util_internal.h 85
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc 294
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h 252
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compare.cc 1304
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compare.h 133
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/README.md 58
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/api.h 35
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc 197
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h 433
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc 498
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h 989
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc 283
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h 410
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc 273
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h 167
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h 43
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc 1050
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h 264
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc 823
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h 287
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc 1186
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h 269
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h 336
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc 268
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h 101
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc 1649
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h 635
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc 238
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h 94
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc 610
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h 172
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc 278
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h 171
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h 142
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc 330
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h 393
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc 113
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h 626
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc 486
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h 739
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc 604
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h 463
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h 172
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc 392
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc 493
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc 164
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc 326
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc 337
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h 1381
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/common.h 54
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc 1379
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc 1823
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc 563
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc 70
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc 126
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc 285
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h 88
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc 133
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc 727
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc 247
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc 452
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc 524
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc 244
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc 1730
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc 183
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc 513
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc 4145
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc 663
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc 230
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc 82
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h 166
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc 782
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc 102
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc 540
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc 2268
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc 1838
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc 199
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h 93
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h 63
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h 48
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/compute/util_internal.h 32
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/config.cc 78
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/config.h 72
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/api.h 26
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/chunker.cc 300
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/chunker.h 36
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/column_builder.cc 367
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/column_builder.h 78
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/column_decoder.cc 243
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/column_decoder.h 64
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/converter.cc 692
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/converter.h 82
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/inference_internal.h 150
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/options.cc 83
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/options.h 189
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/parser.cc 581
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/parser.h 202
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/reader.cc 1279
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/reader.h 123
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/type_fwd.h 28
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/writer.cc 460
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/csv/writer.h 73
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/datum.cc 284
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/datum.h 281
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/device.cc 209
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/device.h 226
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/extension_type.cc 169
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/extension_type.h 161
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/api.h 24
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc 489
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h 167
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc 318
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h 138
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc 450
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h 118
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/concurrency.h 263
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc 772
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/file.h 221
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc 469
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h 340
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc 388
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h 197
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/mman.h 169
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/slow.cc 148
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/slow.h 118
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc 95
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h 82
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc 162
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h 60
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h 79
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h 66
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/api.h 25
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc 412
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h 177
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc 819
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h 140
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h 61
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc 931
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h 536
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc 1486
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h 227
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.cc 41
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h 161
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc 2081
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h 536
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h 65
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/util.h 41
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc 1429
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h 459
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc 797
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h 185
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc 711
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h 125
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc 367
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h 238
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/result.cc 36
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/result.h 519
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/result_internal.h 22
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc 659
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/scalar.h 537
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/sparse_tensor.cc 478
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/sparse_tensor.h 624
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/status.cc 143
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/status.h 451
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/stl_allocator.h 153
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h 146
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/symbols.map 38
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/table.cc 640
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/table.h 295
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc 113
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/table_builder.h 110
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc 342
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/tensor.h 250
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/tensor/converter.h 67
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/tensor/converter_internal.h 88
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc 333
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc 289
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc 241
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/type.cc 2282
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/type.h 1930
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h 678
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h 1024
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h 33
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/align_util.h 68
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h 1614
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/atomic_shared_ptr.h 111
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/base64.h 34
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc 1344
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h 342
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc 80
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h 542
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc 54
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h 515
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h 433
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc 127
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h 354
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc 75
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h 461
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_builders.cc 72
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_builders.h 43
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h 111
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc 387
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h 206
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h 271
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h 88
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h 285
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc 178
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h 32
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking_default.h 4251
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h 626
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc 226
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h 102
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/checked_cast.h 61
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/compare.h 62
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc 261
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h 202
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc 245
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/compression_internal.h 80
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc 495
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc 102
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc 507
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc 249
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc 563
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h 143
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc 932
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h 291
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc 193
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h 181
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/dispatch.h 115
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/double_conversion.h 32
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h 181
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc 91
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h 426
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h 160
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc 421
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/future.h 957
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/hash_util.h 66
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h 886
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h 45
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc 952
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h 117
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h 153
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc 1685
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h 349
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h 568
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc 274
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h 99
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc 256
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h 259
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/macros.h 185
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/make_unique.h 42
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/memory.cc 74
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/memory.h 43
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.cc 54
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h 64
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/optional.h 33
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h 102
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h 29
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/range.h 155
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h 133
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h 826
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/simd.h 50
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/sort.h 78
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h 98
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc 191
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/string.h 79
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/string_builder.cc 40
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/string_builder.h 84
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/string_view.h 38
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc 224
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h 106
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc 417
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h 103
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc 442
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h 398
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/time.cc 68
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/time.h 82
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/trie.cc 211
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h 245
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h 62
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h 86
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/ubsan.h 88
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc 292
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h 104
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc 160
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h 570
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc 87
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h 780
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h 439
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h 172
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/visibility.h 45
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/windows_compatibility.h 42
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h 52
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h 217
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/base64.cpp 128
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime.h 26
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/README.md 21
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/date.h 7949
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/ios.h 53
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/ios.mm 340
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz.cpp 3877
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz.h 2804
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz_private.h 319
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/visibility.h 26
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/musl/README.md 25
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/musl/strptime.c 237
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/README.md 10
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h 1072
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/string_view.hpp 1531
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/strptime.h 35
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/README.md 28
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/checked.h 333
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/core.h 338
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/cpp11.h 103
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc 169
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/visitor.h 152
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h 449
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/generated/File_generated.h 200
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/generated/Message_generated.h 659
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/generated/Schema_generated.h 2265
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/generated/SparseTensor_generated.h 913
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/generated/Tensor_generated.h 387
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/generated/feather_generated.h 863
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp 17
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h 24
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp 7415
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h 2917
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/README 10
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc 900
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h 155
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc 1248
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h 343
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc 791
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h 122
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc 1087
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h 184
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc 222
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h 51
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc 482
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h 109
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc 162
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h 247
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/column_page.h 160
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc 1802
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h 376
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc 91
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h 262
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc 2067
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h 270
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc 2547
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/encoding.h 460
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc 412
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h 510
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h 116
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc 110
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc 240
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h 121
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc 170
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h 109
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/exception.cc 27
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/exception.h 158
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc 868
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h 188
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc 547
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h 234
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/hasher.h 72
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc 82
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h 40
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h 65
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc 183
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h 199
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h 357
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc 1783
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/metadata.h 484
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc 222
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h 54
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/platform.cc 41
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/platform.h 111
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/printer.cc 297
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/printer.h 46
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/properties.cc 64
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/properties.h 813
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/schema.cc 945
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/schema.h 494
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h 54
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc 885
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/statistics.h 342
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc 521
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h 299
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc 324
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h 243
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/symbols.map 40
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h 494
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h 43
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/types.cc 1567
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/types.h 765
-rw-r--r-- contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h 30
455 files changed, 194833 insertions, 0 deletions
diff --git a/contrib/libs/apache/arrow/cpp/CHANGELOG_PARQUET.md b/contrib/libs/apache/arrow/cpp/CHANGELOG_PARQUET.md
new file mode 100644
index 00000000000..06a09c20f0e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/CHANGELOG_PARQUET.md
@@ -0,0 +1,501 @@
+Parquet C++ 1.5.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-979] - [C++] Limit size of min, max or disable stats for long binary types
+ * [PARQUET-1071] - [C++] parquet::arrow::FileWriter::Close is not idempotent
+ * [PARQUET-1349] - [C++] PARQUET_RPATH_ORIGIN is not picked by the build
+ * [PARQUET-1334] - [C++] memory_map parameter seems misleading in parquet file opener
+ * [PARQUET-1333] - [C++] Reading of files with dictionary size 0 fails on Windows with bad_alloc
+ * [PARQUET-1283] - [C++] FormatStatValue appends trailing space to string and int96
+ * [PARQUET-1270] - [C++] Executable tools do not get installed
+ * [PARQUET-1272] - [C++] ScanFileContents reports wrong row count for nested columns
+ * [PARQUET-1268] - [C++] Conversion of Arrow null list columns fails
+ * [PARQUET-1255] - [C++] Exceptions thrown in some tests
+ * [PARQUET-1358] - [C++] index_page_offset should be unset as it is not supported.
+ * [PARQUET-1357] - [C++] FormatStatValue truncates binary statistics on zero character
+ * [PARQUET-1319] - [C++] Pass BISON_EXECUTABLE to Thrift EP for MacOS
+ * [PARQUET-1313] - [C++] Compilation failure with VS2017
+ * [PARQUET-1315] - [C++] ColumnChunkMetaData.has_dictionary_page() should return bool, not int64_t
+ * [PARQUET-1307] - [C++] memory-test fails with latest Arrow
+ * [PARQUET-1274] - [Python] SegFault in pyarrow.parquet.write_table with specific options
+ * [PARQUET-1209] - locally defined symbol ... imported in function ..
+ * [PARQUET-1245] - [C++] Segfault when writing Arrow table with duplicate columns
+ * [PARQUET-1273] - [Python] Error writing to partitioned Parquet dataset
+ * [PARQUET-1384] - [C++] Clang compiler warnings in bloom_filter-test.cc
+
+## Improvement
+ * [PARQUET-1348] - [C++] Allow Arrow FileWriter To Write FileMetaData
+ * [PARQUET-1346] - [C++] Protect against null values data in empty Arrow array
+ * [PARQUET-1340] - [C++] Fix Travis Ci valgrind errors related to std::random_device
+ * [PARQUET-1323] - [C++] Fix compiler warnings with clang-6.0
+ * [PARQUET-1279] - Use ASSERT_NO_FATAL_FAILURE in C++ unit tests
+ * [PARQUET-1262] - [C++] Use the same BOOST_ROOT and Boost_NAMESPACE for Thrift
+ * [PARQUET-1267] - replace "unsafe" std::equal by std::memcmp
+ * [PARQUET-1360] - [C++] Minor API + style changes follow up to PARQUET-1348
+ * [PARQUET-1166] - [API Proposal] Add GetRecordBatchReader in parquet/arrow/reader.h
+ * [PARQUET-1378] - [c++] Allow RowGroups with zero rows to be written
+ * [PARQUET-1256] - [C++] Add --print-key-value-metadata option to parquet_reader tool
+ * [PARQUET-1276] - [C++] Reduce the amount of memory used for writing null decimal values
+
+## New Feature
+ * [PARQUET-1392] - [C++] Supply row group indices to parquet::arrow::FileReader::ReadTable
+
+## Sub-task
+ * [PARQUET-1227] - Thrift crypto metadata structures
+ * [PARQUET-1332] - [C++] Add bloom filter utility class
+
+## Task
+ * [PARQUET-1350] - [C++] Use abstract ResizableBuffer instead of concrete PoolBuffer
+ * [PARQUET-1366] - [C++] Streamline use of Arrow bit-util.h
+ * [PARQUET-1308] - [C++] parquet::arrow should use thread pool, not ParallelFor
+ * [PARQUET-1382] - [C++] Prepare for arrow::test namespace removal
+ * [PARQUET-1372] - [C++] Add an API to allow writing RowGroups based on their size rather than num_rows
+
+
+Parquet C++ 1.4.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-1193] - [CPP] Implement ColumnOrder to support min_value and max_value
+ * [PARQUET-1180] - C++: Fix behaviour of num_children element of primitive nodes
+ * [PARQUET-1146] - C++: Add macOS-compatible sha512sum call to release verify script
+ * [PARQUET-1167] - [C++] FieldToNode function should return a status when throwing an exception
+ * [PARQUET-1175] - [C++] Fix usage of deprecated Arrow API
+ * [PARQUET-1113] - [C++] Incorporate fix from ARROW-1601 on bitmap read path
+ * [PARQUET-1111] - dev/release/verify-release-candidate has stale help
+ * [PARQUET-1109] - C++: Update release verification script to SHA512
+ * [PARQUET-1179] - [C++] Support Apache Thrift 0.11
+ * [PARQUET-1226] - [C++] Fix new build warnings with clang 5.0
+ * [PARQUET-1233] - [CPP] Enable option to switch between stl classes and boost classes for thrift header
+ * [PARQUET-1205] - Fix msvc static build
+ * [PARQUET-1210] - [C++] Boost 1.66 compilation fails on Windows on linkage stage
+
+## Improvement
+ * [PARQUET-1092] - [C++] Write Arrow tables with chunked columns
+ * [PARQUET-1086] - [C++] Remove usage of arrow/util/compiler-util.h after 1.3.0 release
+ * [PARQUET-1097] - [C++] Account for Arrow API deprecation in ARROW-1511
+ * [PARQUET-1150] - C++: Hide statically linked boost symbols
+ * [PARQUET-1151] - [C++] Add build options / configuration to use static runtime libraries with MSVC
+ * [PARQUET-1147] - [C++] Account for API deprecation / change in ARROW-1671
+ * [PARQUET-1162] - C++: Update dev/README after migration to Gitbox
+ * [PARQUET-1165] - [C++] Pin clang-format version to 4.0
+ * [PARQUET-1164] - [C++] Follow API changes in ARROW-1808
+ * [PARQUET-1177] - [C++] Add more extensive compiler warnings when using Clang
+ * [PARQUET-1110] - [C++] Release verification script for Windows
+ * [PARQUET-859] - [C++] Flatten parquet/file directory
+ * [PARQUET-1220] - [C++] Don't build Thrift examples and tutorials in the ExternalProject
+ * [PARQUET-1219] - [C++] Update release-candidate script links to gitbox
+ * [PARQUET-1196] - [C++] Provide a parquet_arrow example project incl. CMake setup
+ * [PARQUET-1200] - [C++] Support reading a single Arrow column from a Parquet file
+
+## New Feature
+ * [PARQUET-1095] - [C++] Read and write Arrow decimal values
+ * [PARQUET-970] - Add Lz4 and Zstd compression codecs
+
+## Task
+ * [PARQUET-1221] - [C++] Extend release README
+ * [PARQUET-1225] - NaN values may lead to incorrect filtering under certain circumstances
+
+
+Parquet C++ 1.3.1
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-1105] - [CPP] Remove libboost_system dependency
+ * [PARQUET-1138] - [C++] Fix compilation with Arrow 0.7.1
+ * [PARQUET-1123] - [C++] Update parquet-cpp to use Arrow's AssertArraysEqual
+ * [PARQUET-1121] - C++: DictionaryArrays of NullType cannot be written
+ * [PARQUET-1139] - Add license to cmake_modules/parquet-cppConfig.cmake.in
+
+## Improvement
+ * [PARQUET-1140] - [C++] Fail on RAT errors in CI
+ * [PARQUET-1070] - Add CPack support to the build
+
+
+Parquet C++ 1.3.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-1098] - [C++] Install new header in parquet/util
+ * [PARQUET-1085] - [C++] Backwards compatibility from macro cleanup in transitive dependencies in ARROW-1452
+ * [PARQUET-1074] - [C++] Switch to long key ids in KEYs file
+ * [PARQUET-1075] - C++: Coverage upload is broken
+ * [PARQUET-1088] - [CPP] remove parquet_version.h from version control since it gets auto-generated
+ * [PARQUET-1002] - [C++] Compute statistics based on Logical Types
+ * [PARQUET-1100] - [C++] Reading repeated types should decode number of records rather than number of values
+ * [PARQUET-1090] - [C++] Fix int32 overflow in Arrow table writer, add max row group size property
+ * [PARQUET-1108] - [C++] Fix Int96 comparators
+
+## Improvement
+ * [PARQUET-1104] - [C++] Upgrade to Apache Arrow 0.7.0 RC0
+ * [PARQUET-1072] - [C++] Add ARROW_NO_DEPRECATED_API to CI to check for deprecated API use
+ * [PARQUET-1096] - C++: Update sha{1, 256, 512} checksums per latest ASF release policy
+ * [PARQUET-1079] - [C++] Account for Arrow API change in ARROW-1335
+ * [PARQUET-1087] - [C++] Add wrapper for ScanFileContents in parquet::arrow that catches exceptions
+ * [PARQUET-1093] - C++: Improve Arrow level generation error message
+ * [PARQUET-1094] - C++: Add benchmark for boolean Arrow column I/O
+ * [PARQUET-1083] - [C++] Refactor core logic in parquet-scan.cc so that it can be used as a library function for benchmarking
+ * [PARQUET-1037] - Allow final RowGroup to be unfilled
+
+## New Feature
+ * [PARQUET-1078] - [C++] Add Arrow writer option to coerce timestamps to milliseconds or microseconds
+ * [PARQUET-929] - [C++] Handle arrow::DictionaryArray when writing Arrow data
+
+
+Parquet C++ 1.2.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-1029] - [C++] TypedColumnReader/TypeColumnWriter symbols are no longer being exported
+ * [PARQUET-997] - Fix override compiler warnings
+ * [PARQUET-1033] - Mismatched Read and Write
+ * [PARQUET-1007] - [C++] Update parquet.thrift from https://github.com/apache/parquet-format
+ * [PARQUET-1039] - PARQUET-911 Breaks Arrow
+ * [PARQUET-1038] - Key value metadata should be nullptr if not set
+ * [PARQUET-1018] - [C++] parquet.dll has runtime dependencies on one or more libraries in the build toolchain
+ * [PARQUET-1003] - [C++] Modify DEFAULT_CREATED_BY value for every new release version
+ * [PARQUET-1004] - CPP Building fails on windows
+ * [PARQUET-1040] - Missing writer method implementations
+ * [PARQUET-1054] - [C++] Account for Arrow API changes in ARROW-1199
+ * [PARQUET-1042] - C++: Compilation breaks on GCC 4.8
+ * [PARQUET-1048] - [C++] Static linking of libarrow is no longer supported
+ * [PARQUET-1013] - Fix ZLIB_INCLUDE_DIR
+ * [PARQUET-998] - C++: Release script is not usable
+ * [PARQUET-1023] - [C++] Brotli libraries are not being statically linked on Windows
+ * [PARQUET-1000] - [C++] Do not build thirdparty Arrow with /WX on MSVC
+ * [PARQUET-1052] - [C++] add_compiler_export_flags() throws warning with CMake >= 3.3
+ * [PARQUET-1069] - C++: ./dev/release/verify-release-candidate is broken due to missing Arrow dependencies
+
+## Improvement
+ * [PARQUET-996] - Improve MSVC build - ThirdpartyToolchain - Arrow
+ * [PARQUET-911] - C++: Support nested structs in parquet_arrow
+ * [PARQUET-986] - Improve MSVC build - ThirdpartyToolchain - Thrift
+ * [PARQUET-864] - [C++] Consolidate non-Parquet-specific bit utility code into Apache Arrow
+ * [PARQUET-1043] - [C++] Raise minimum supported CMake version to 3.2
+ * [PARQUET-1016] - Upgrade thirdparty Arrow to 0.4.0
+ * [PARQUET-858] - [C++] Flatten parquet/column directory, consolidate related code
+ * [PARQUET-978] - [C++] Minimizing footer reads for small(ish) metadata
+ * [PARQUET-991] - [C++] Fix compiler warnings on MSVC and build with /WX in Appveyor
+ * [PARQUET-863] - [C++] Move SIMD, CPU info, hashing, and other generic utilities into Apache Arrow
+ * [PARQUET-1053] - Fix unused result warnings due to unchecked Statuses
+ * [PARQUET-1067] - C++: Update arrow hash to 0.5.0
+ * [PARQUET-1041] - C++: Support Arrow's NullArray
+ * [PARQUET-1008] - Update TypedColumnReader::ReadBatch method to accept batch_size as int64_t
+ * [PARQUET-1044] - [C++] Use compression libraries from Apache Arrow
+ * [PARQUET-999] - Improve MSVC build - Enable PARQUET_BUILD_BENCHMARKS
+ * [PARQUET-967] - [C++] Combine libparquet/libparquet_arrow libraries
+ * [PARQUET-1045] - [C++] Refactor to account for computational utility code migration in ARROW-1154
+
+## New Feature
+ * [PARQUET-1035] - Write Int96 from Arrow Timestamp(ns)
+
+## Task
+ * [PARQUET-994] - C++: release-candidate script should not push to master
+ * [PARQUET-902] - [C++] Move compressor interfaces into Apache Arrow
+
+## Test
+ * [PARQUET-706] - [C++] Create test case that uses libparquet as a 3rd party library
+
+
+Parquet C++ 1.1.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-898] - [C++] Change Travis CI OS X image to Xcode 6.4 and fix our thirdparty build
+ * [PARQUET-976] - [C++] Pass unit test suite with MSVC, build in Appveyor
+ * [PARQUET-963] - [C++] Disallow reading struct types in Arrow reader for now
+ * [PARQUET-959] - [C++] Arrow thirdparty build fails on multiarch systems
+ * [PARQUET-962] - [C++] GTEST_MAIN_STATIC_LIB is not defined in FindGTest.cmake
+ * [PARQUET-958] - [C++] Print Parquet metadata in JSON format
+ * [PARQUET-956] - C++: BUILD_BYPRODUCTS not specified anymore for gtest
+ * [PARQUET-948] - [C++] Account for API changes in ARROW-782
+ * [PARQUET-947] - [C++] Refactor to account for ARROW-795 Arrow core library consolidation
+ * [PARQUET-965] - [C++] FIXED_LEN_BYTE_ARRAY types are unhandled in the Arrow reader
+ * [PARQUET-949] - [C++] Arrow version pinning seems to not be working properly
+ * [PARQUET-955] - [C++] pkg_check_modules will override $ARROW_HOME if it is set in the environment
+ * [PARQUET-945] - [C++] Thrift static libraries are not used with recent patch
+ * [PARQUET-943] - [C++] Overflow build error on x86
+ * [PARQUET-938] - [C++] There is a typo in cmake_modules/FindSnappy.cmake comment
+ * [PARQUET-936] - [C++] parquet::arrow::WriteTable can enter infinite loop if chunk_size is 0
+ * [PARQUET-981] - Repair usage of *_HOME 3rd-party dependency environment variables during Windows build
+ * [PARQUET-992] - [C++] parquet/compression.h leaks zlib.h
+ * [PARQUET-987] - [C++] Fix regressions caused by PARQUET-981
+ * [PARQUET-933] - [C++] Account for Arrow Table API changes coming in ARROW-728
+ * [PARQUET-915] - Support Arrow Time Types in Schema
+ * [PARQUET-914] - [C++] Throw more informative exception when user writes too many values to a column in a row group
+ * [PARQUET-923] - [C++] Account for Time metadata changes in ARROW-686
+ * [PARQUET-918] - FromParquetSchema API crashes on nested schemas
+ * [PARQUET-925] - [C++] FindArrow.cmake sets the wrong library path after ARROW-648
+ * [PARQUET-932] - [c++] Add option to build parquet library with minimal dependency
+ * [PARQUET-919] - [C++] Account for API changes in ARROW-683
+ * [PARQUET-995] - [C++] Int96 reader in parquet_arrow uses size of Int96Type instead of Int96
+
+## Improvement
+ * [PARQUET-508] - Add ParquetFilePrinter
+ * [PARQUET-595] - Add API for key-value metadata
+ * [PARQUET-897] - [C++] Only use designated public headers from libarrow
+ * [PARQUET-679] - [C++] Build and unit tests support for MSVC on Windows
+ * [PARQUET-977] - Improve MSVC build
+ * [PARQUET-957] - [C++] Add optional $PARQUET_BUILD_TOOLCHAIN environment variable option for configuring build environment
+ * [PARQUET-961] - [C++] Strip debug symbols from libparquet libraries in release builds by default
+ * [PARQUET-954] - C++: Use Brotli 0.6 release
+ * [PARQUET-953] - [C++] Change arrow::FileWriter API to be initialized from a Schema, and provide for writing multiple tables
+ * [PARQUET-941] - [C++] Stop needless Boost static library detection for CentOS 7 support
+ * [PARQUET-942] - [C++] Fix wrong variable use in FindSnappy
+ * [PARQUET-939] - [C++] Support Thrift_HOME CMake variable like FindSnappy does as Snappy_HOME
+ * [PARQUET-940] - [C++] Fix Arrow library path detection
+ * [PARQUET-937] - [C++] Support CMake < 3.4 again for Arrow detection
+ * [PARQUET-935] - [C++] Set shared library version for .deb packages
+ * [PARQUET-934] - [C++] Support multiarch on Debian
+ * [PARQUET-984] - C++: Add abi and so version to pkg-config
+ * [PARQUET-983] - C++: Update Thirdparty hash to Arrow 0.3.0
+ * [PARQUET-989] - [C++] Link dynamically to libarrow in toolchain build, set LD_LIBRARY_PATH
+ * [PARQUET-988] - [C++] Add Linux toolchain-based build to Travis CI
+ * [PARQUET-928] - [C++] Support pkg-config
+ * [PARQUET-927] - [C++] Specify shared library version of Apache Arrow
+ * [PARQUET-931] - [C++] Add option to pin thirdparty Arrow version used in ExternalProject
+ * [PARQUET-926] - [C++] Use pkg-config to find Apache Arrow
+ * [PARQUET-917] - C++: Build parquet_arrow by default
+ * [PARQUET-910] - C++: Support TIME logical type in parquet_arrow
+ * [PARQUET-909] - [CPP]: Reduce buffer allocations (mallocs) on critical path
+
+## New Feature
+ * [PARQUET-853] - [C++] Add option to link with shared boost libraries when building Arrow in the thirdparty toolchain
+ * [PARQUET-946] - [C++] Refactoring in parquet::arrow::FileReader to be able to read a single row group
+ * [PARQUET-930] - [C++] Account for all Arrow date/time types
+
+
+Parquet C++ 1.0.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-455] - Fix compiler warnings on OS X / Clang
+ * [PARQUET-558] - Support ZSH in build scripts
+ * [PARQUET-720] - Parquet-cpp fails to link when included in multiple TUs
+ * [PARQUET-718] - Reading boolean pages written by parquet-cpp fails
+ * [PARQUET-640] - [C++] Force the use of gcc 4.9 in conda builds
+ * [PARQUET-643] - Add const modifier to schema pointer reference in ParquetFileWriter
+ * [PARQUET-672] - [C++] Build testing conda artifacts in debug mode
+ * [PARQUET-661] - [C++] Do not assume that perl is found in /usr/bin
+ * [PARQUET-659] - [C++] Instantiated template visibility is broken on clang / OS X
+ * [PARQUET-657] - [C++] Don't define DISALLOW_COPY_AND_ASSIGN if already defined
+ * [PARQUET-656] - [C++] Revert PARQUET-653
+ * [PARQUET-676] - MAX_VALUES_PER_LITERAL_RUN causes RLE encoding failure
+ * [PARQUET-614] - C++: Remove unneeded LZ4-related code
+ * [PARQUET-604] - Install writer.h headers
+ * [PARQUET-621] - C++: Uninitialised DecimalMetadata is read
+ * [PARQUET-620] - C++: Duplicate calls to ParquetFileWriter::Close cause duplicate metadata writes
+ * [PARQUET-599] - ColumnWriter::RleEncodeLevels' size estimation might be wrong
+ * [PARQUET-617] - C++: Enable conda build to work on systems with non-default C++ toolchains
+ * [PARQUET-627] - Ensure that thrift headers are generated before source compilation
+ * [PARQUET-745] - TypedRowGroupStatistics fails to PlainDecode min and max in ByteArrayType
+ * [PARQUET-738] - Update arrow version that also supports newer Xcode
+ * [PARQUET-747] - [C++] TypedRowGroupStatistics are not being exported in libparquet.so
+ * [PARQUET-711] - Use metadata builders in parquet writer
+ * [PARQUET-732] - Building a subset of dependencies does not work
+ * [PARQUET-760] - On switching from dictionary to the fallback encoding, an incorrect encoding is set
+ * [PARQUET-691] - [C++] Write ColumnChunk metadata after each column chunk in the file
+ * [PARQUET-797] - [C++] Update for API changes in ARROW-418
+ * [PARQUET-837] - [C++] SerializedFile::ParseMetaData uses Seek, followed by Read, and could have race conditions
+ * [PARQUET-827] - [C++] Incorporate addition of arrow::MemoryPool::Reallocate
+ * [PARQUET-502] - Scanner segfaults when its batch size is smaller than the number of rows
+ * [PARQUET-469] - Roll back Thrift bindings to 0.9.0
+ * [PARQUET-889] - Fix compilation when PARQUET_USE_SSE is on
+ * [PARQUET-888] - C++ Memory leak in RowGroupSerializer
+ * [PARQUET-819] - C++: Trying to install non-existing parquet/arrow/utils.h
+ * [PARQUET-736] - XCode 8.0 breaks builds
+ * [PARQUET-505] - Column reader: automatically handle large data pages
+ * [PARQUET-615] - C++: Building static or shared libparquet should not be mutually exclusive
+ * [PARQUET-658] - ColumnReader has no virtual destructor
+ * [PARQUET-799] - concurrent usage of the file reader API
+ * [PARQUET-513] - Valgrind errors are not failing the Travis CI build
+ * [PARQUET-841] - [C++] Writing wrong format version when using ParquetVersion::PARQUET_1_0
+ * [PARQUET-742] - Add missing license headers
+ * [PARQUET-741] - compression_buffer_ is reused although it shouldn't be
+ * [PARQUET-700] - C++: Disable dictionary encoding for boolean columns
+ * [PARQUET-662] - [C++] ParquetException must be explicitly exported in dynamic libraries
+ * [PARQUET-704] - [C++] scan-all.h is not being installed
+ * [PARQUET-865] - C++: Pass all CXXFLAGS to Thrift ExternalProject
+ * [PARQUET-875] - [C++] Fix coveralls build given changes to thirdparty build procedure
+ * [PARQUET-709] - [C++] Fix conda dev binary builds
+ * [PARQUET-638] - [C++] Revert static linking of libstdc++ in conda builds until symbol visibility addressed
+ * [PARQUET-606] - Travis coverage is broken
+ * [PARQUET-880] - [CPP] Prevent destructors from throwing
+ * [PARQUET-886] - [C++] Revise build documentation and requirements in README.md
+ * [PARQUET-900] - C++: Fix NOTICE / LICENSE issues
+ * [PARQUET-885] - [C++] Do not search for Thrift in default system paths
+ * [PARQUET-879] - C++: ExternalProject compilation for Thrift fails on older CMake versions
+ * [PARQUET-635] - [C++] Statically link libstdc++ on Linux in conda recipe
+ * [PARQUET-710] - Remove unneeded private member variables from RowGroupReader ABI
+ * [PARQUET-766] - C++: Expose ParquetFileReader through Arrow reader as const
+ * [PARQUET-876] - C++: Correct snapshot version
+ * [PARQUET-821] - [C++] zlib download link is broken
+ * [PARQUET-818] - [C++] Refactor library to share IO, Buffer, and memory management abstractions with Apache Arrow
+ * [PARQUET-537] - LocalFileSource leaks resources
+ * [PARQUET-764] - [CPP] Parquet Writer does not write Boolean values correctly
+ * [PARQUET-812] - [C++] Failure reading BYTE_ARRAY data from file in parquet-compatibility project
+ * [PARQUET-759] - Cannot store columns consisting of empty strings
+ * [PARQUET-846] - [CPP] CpuInfo::Init() is not thread safe
+ * [PARQUET-694] - C++: Revert default data page size back to 1M
+ * [PARQUET-842] - [C++] Impala rejects DOUBLE columns if decimal metadata is set
+ * [PARQUET-708] - [C++] RleEncoder does not account for "worst case scenario" in MaxBufferSize for bit_width > 1
+ * [PARQUET-639] - Do not export DCHECK in public headers
+ * [PARQUET-828] - [C++] "version" field set improperly in file metadata
+ * [PARQUET-891] - [C++] Do not search for Snappy in default system paths
+ * [PARQUET-626] - Fix builds due to unavailable llvm.org apt mirror
+ * [PARQUET-629] - RowGroupSerializer should only close itself once
+ * [PARQUET-472] - Clean up InputStream ownership semantics in ColumnReader
+ * [PARQUET-739] - Rle-decoding uses static buffer that is shared across threads
+ * [PARQUET-561] - ParquetFileReader::Contents PIMPL missing a virtual destructor
+ * [PARQUET-892] - [C++] Clean up link library targets in CMake files
+ * [PARQUET-454] - Address inconsistencies in boolean decoding
+ * [PARQUET-816] - [C++] Failure decoding sample dict-encoded file from parquet-compatibility project
+ * [PARQUET-565] - Use PATH instead of DIRECTORY in get_filename_component to support CMake<2.8.12
+ * [PARQUET-446] - Hide thrift dependency in parquet-cpp
+ * [PARQUET-843] - [C++] Impala unable to read files created by parquet-cpp
+ * [PARQUET-555] - Dictionary page metadata handling inconsistencies
+ * [PARQUET-908] - Fix for PARQUET-890 introduces undefined symbol in libparquet_arrow.so
+ * [PARQUET-793] - [CPP] Do not return incorrect statistics
+ * [PARQUET-887] - C++: Fix issues in release scripts that arose in RC1
+
+## Improvement
+ * [PARQUET-277] - Remove boost dependency
+ * [PARQUET-500] - Enable coveralls.io for apache/parquet-cpp
+ * [PARQUET-497] - Decouple Parquet physical file structure from FileReader class
+ * [PARQUET-597] - Add data rates to benchmark output
+ * [PARQUET-522] - #include cleanup with include-what-you-use
+ * [PARQUET-515] - Add "Reset" to LevelEncoder and LevelDecoder
+ * [PARQUET-514] - Automate coveralls.io updates in Travis CI
+ * [PARQUET-551] - Handle compiler warnings due to disabled DCHECKs in release builds
+ * [PARQUET-559] - Enable InputStream as a source to the ParquetFileReader
+ * [PARQUET-562] - Simplified ZSH support in build scripts
+ * [PARQUET-538] - Improve ColumnReader Tests
+ * [PARQUET-541] - Portable build scripts
+ * [PARQUET-724] - Test more advanced properties setting
+ * [PARQUET-641] - Instantiate stringstream only if needed in SerializedPageReader::NextPage
+ * [PARQUET-636] - Expose selection for different encodings
+ * [PARQUET-603] - Implement missing information in schema descriptor
+ * [PARQUET-610] - Print ColumnMetaData for each RowGroup
+ * [PARQUET-600] - Add benchmarks for RLE-Level encoding
+ * [PARQUET-592] - Support compressed writes
+ * [PARQUET-593] - Add API for writing Page statistics
+ * [PARQUET-589] - Implement Chunked InMemoryInputStream for better memory usage
+ * [PARQUET-587] - Implement BufferReader::Read(int64_t,uint8_t*)
+ * [PARQUET-616] - C++: WriteBatch should accept const arrays
+ * [PARQUET-630] - C++: Support link flags for older CMake versions
+ * [PARQUET-634] - Consistent private linking of dependencies
+ * [PARQUET-633] - Add version to WriterProperties
+ * [PARQUET-625] - Improve RLE read performance
+ * [PARQUET-737] - Use absolute namespace in macros
+ * [PARQUET-762] - C++: Use optimistic allocation instead of Arrow Builders
+ * [PARQUET-773] - C++: Check licenses with RAT in CI
+ * [PARQUET-687] - C++: Switch to PLAIN encoding if dictionary grows too large
+ * [PARQUET-784] - C++: Reference Spark, Kudu and FrameOfReference in LICENSE
+ * [PARQUET-809] - [C++] Add API to determine if two files' schemas are compatible
+ * [PARQUET-778] - Standardize the schema output to match the parquet-mr format
+ * [PARQUET-463] - Add DCHECK* macros for assertions in debug builds
+ * [PARQUET-471] - Use the same environment setup script for Travis CI as local sandbox development
+ * [PARQUET-449] - Update to latest parquet.thrift
+ * [PARQUET-496] - Fix cpplint configuration to be more restrictive
+ * [PARQUET-468] - Add a cmake option to generate the Parquet thrift headers with the thriftc in the environment
+ * [PARQUET-482] - Organize src code file structure to have a very clear folder with public headers.
+ * [PARQUET-591] - Page size estimation during writes
+ * [PARQUET-518] - Review usages of size_t and unsigned integers generally per Google style guide
+ * [PARQUET-533] - Simplify RandomAccessSource API to combine Seek/Read
+ * [PARQUET-767] - Add release scripts for parquet-cpp
+ * [PARQUET-699] - Update parquet.thrift from https://github.com/apache/parquet-format
+ * [PARQUET-653] - [C++] Re-enable -static-libstdc++ in dev artifact builds
+ * [PARQUET-763] - C++: Expose ParquetFileReader through Arrow reader
+ * [PARQUET-857] - [C++] Flatten parquet/encodings directory
+ * [PARQUET-862] - Provide default cache size values if CPU info probing is not available
+ * [PARQUET-689] - C++: Compress DataPages eagerly
+ * [PARQUET-874] - [C++] Use default memory allocator from Arrow
+ * [PARQUET-267] - Detach thirdparty code from build configuration.
+ * [PARQUET-418] - Add a utility to print contents of a Parquet file to stdout
+ * [PARQUET-519] - Disable compiler warning suppressions and fix all DEBUG build warnings
+ * [PARQUET-447] - Add Debug and Release build types and associated compiler flags
+ * [PARQUET-868] - C++: Build snappy with optimizations
+ * [PARQUET-894] - Fix compilation warning
+ * [PARQUET-883] - C++: Support non-standard gcc version strings
+ * [PARQUET-607] - Public Writer header
+ * [PARQUET-731] - [CPP] Add API to return metadata size and Skip reading values
+ * [PARQUET-628] - Link thrift privately
+ * [PARQUET-877] - C++: Update Arrow Hash, update Version in metadata.
+ * [PARQUET-547] - Refactor most templates to use DataType structs rather than the Type::type enum
+ * [PARQUET-882] - [CPP] Improve Application Version parsing
+ * [PARQUET-448] - Add cmake option to skip building the unit tests
+ * [PARQUET-721] - Performance benchmarks for reading into Arrow structures
+ * [PARQUET-820] - C++: Decoders should directly emit arrays with spacing for null entries
+ * [PARQUET-813] - C++: Build dependencies using CMake External project
+ * [PARQUET-488] - Add SSE-related cmake options to manage compiler flags
+ * [PARQUET-564] - Add option to run unit tests with valgrind --tool=memcheck
+ * [PARQUET-572] - Rename parquet_cpp namespace to parquet
+ * [PARQUET-829] - C++: Make use of ARROW-469
+ * [PARQUET-501] - Add an OutputStream abstraction (capable of memory allocation) for Encoder public API
+ * [PARQUET-744] - Clarifications on build instructions
+ * [PARQUET-520] - Add version of LocalFileSource that uses memory-mapping for zero-copy reads
+ * [PARQUET-556] - Extend RowGroupStatistics to include "min" "max" statistics
+ * [PARQUET-671] - Improve performance of RLE/bit-packed decoding in parquet-cpp
+ * [PARQUET-681] - Add tool to scan a parquet file
+
+## New Feature
+ * [PARQUET-499] - Complete PlainEncoder implementation for all primitive types and test end to end
+ * [PARQUET-439] - Conform all copyright headers to ASF requirements
+ * [PARQUET-436] - Implement ParquetFileWriter class entry point for generating new Parquet files
+ * [PARQUET-435] - Provide vectorized ColumnReader interface
+ * [PARQUET-438] - Update RLE encoder/decoder modules from Impala upstream changes and adapt unit tests
+ * [PARQUET-512] - Add optional google/benchmark 3rd-party dependency for performance testing
+ * [PARQUET-566] - Add method to retrieve the full column path
+ * [PARQUET-613] - C++: Add conda packaging recipe
+ * [PARQUET-605] - Expose schema node in ColumnDescriptor
+ * [PARQUET-619] - C++: Add OutputStream for local files
+ * [PARQUET-583] - Implement Parquet to Thrift schema conversion
+ * [PARQUET-582] - Conversion functions for Parquet enums to Thrift enums
+ * [PARQUET-728] - [C++] Bring parquet::arrow up to date with API changes in arrow::io
+ * [PARQUET-752] - [C++] Conform parquet_arrow to upstream API changes
+ * [PARQUET-788] - [C++] Reference Impala / Apache Impala (incubating) in LICENSE
+ * [PARQUET-808] - [C++] Add API to read file given externally-provided FileMetadata
+ * [PARQUET-807] - [C++] Add API to read file metadata only from a file handle
+ * [PARQUET-805] - C++: Read Int96 into Arrow Timestamp(ns)
+ * [PARQUET-836] - [C++] Add column selection to parquet::arrow::FileReader
+ * [PARQUET-835] - [C++] Add option to parquet::arrow to read columns in parallel using a thread pool
+ * [PARQUET-830] - [C++] Add additional configuration options to parquet::arrow::OpenFile
+ * [PARQUET-769] - C++: Add support for Brotli Compression
+ * [PARQUET-489] - Add visibility macros to be used for public and internal APIs of libparquet
+ * [PARQUET-542] - Support memory allocation from external memory
+ * [PARQUET-844] - [C++] Consolidate encodings, schema, and compression subdirectories into fewer files
+ * [PARQUET-848] - [C++] Consolidate libparquet_thrift subcomponent
+ * [PARQUET-646] - [C++] Enable easier 3rd-party toolchain clang builds on Linux
+ * [PARQUET-598] - [C++] Test writing all primitive data types
+ * [PARQUET-442] - Convert flat SchemaElement vector to implied nested schema data structure
+ * [PARQUET-867] - [C++] Support writing sliced Arrow arrays
+ * [PARQUET-456] - Add zlib codec support
+ * [PARQUET-834] - C++: Support r/w of arrow::ListArray
+ * [PARQUET-485] - Decouple data page delimiting from column reader / scanner classes, create test fixtures
+ * [PARQUET-434] - Add a ParquetFileReader class to encapsulate some low-level details of interacting with Parquet files
+ * [PARQUET-666] - PLAIN_DICTIONARY write support
+ * [PARQUET-437] - Incorporate googletest thirdparty dependency and add cmake tools (ADD_PARQUET_TEST) to simplify adding new unit tests
+ * [PARQUET-866] - [C++] Account for API changes in ARROW-33
+ * [PARQUET-545] - Improve API to support Decimal type
+ * [PARQUET-579] - Add API for writing Column statistics
+ * [PARQUET-494] - Implement PLAIN_DICTIONARY encoding and decoding
+ * [PARQUET-618] - C++: Automatically upload conda build artifacts on commits to master
+ * [PARQUET-833] - C++: Provide API to write spaced arrays (e.g. Arrow)
+ * [PARQUET-903] - C++: Add option to set RPATH to ORIGIN
+ * [PARQUET-451] - Add a RowGroup reader interface class
+ * [PARQUET-785] - C++: List conversion for Arrow Schemas
+ * [PARQUET-712] - C++: Read into Arrow memory
+ * [PARQUET-890] - C++: Support I/O of DATE columns in parquet_arrow
+ * [PARQUET-782] - C++: Support writing to Arrow sinks
+ * [PARQUET-849] - [C++] Upgrade default Thrift in thirdparty toolchain to 0.9.3 or 0.10
+ * [PARQUET-573] - C++: Create a public API for reading and writing file metadata
+
+## Task
+ * [PARQUET-814] - C++: Remove Conda recipes
+ * [PARQUET-503] - Re-enable parquet 2.0 encodings
+ * [PARQUET-169] - Parquet-cpp: Implement support for bulk reading and writing repetition/definition levels.
+ * [PARQUET-878] - C++: Remove setup_build_env from rc-verification script
+ * [PARQUET-881] - C++: Update Arrow hash to 0.2.0-rc2
+ * [PARQUET-771] - C++: Sync KEYS file
+ * [PARQUET-901] - C++: Publish RCs in apache-parquet-VERSION in SVN
+
+## Test
+ * [PARQUET-525] - Test coverage for malformed file failure modes on the read path
+ * [PARQUET-703] - [C++] Validate num_values metadata for columns with nulls
+ * [PARQUET-507] - Improve runtime of rle-test.cc
+ * [PARQUET-549] - Add scanner and column reader tests for dictionary data pages
+ * [PARQUET-457] - Add compressed data page unit tests
diff --git a/contrib/libs/apache/arrow/cpp/README.md b/contrib/libs/apache/arrow/cpp/README.md
new file mode 100644
index 00000000000..b083f3fe78e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/README.md
@@ -0,0 +1,34 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Apache Arrow C++
+
+This directory contains the code and build system for the Arrow C++ libraries,
+as well as for the C++ libraries for Apache Parquet.
+
+## Installation
+
+See https://arrow.apache.org/install/ for the latest instructions on how
+to install pre-compiled binary versions of the library.
+
+## Source Builds and Development
+
+Please refer to our latest [C++ Development Documentation][1].
+
+[1]: https://github.com/apache/arrow/blob/master/docs/source/developers/cpp
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc
new file mode 100644
index 00000000000..2f74b40e40d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc
@@ -0,0 +1,595 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/adapters/orc/adapter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <list>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/adapters/orc/adapter_util.h"
+#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/table_builder.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/range.h"
+#include "arrow/util/visibility.h"
+#include "orc/Exceptions.hh"
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;
+
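+// Propagate Arrow errors as ORC exceptions: needed inside the liborc
+// InputStream/OutputStream implementations below, whose interfaces cannot
+// return a Status.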
+#define ORC_THROW_NOT_OK(s) \
+ do { \
+ Status _s = (s); \
+ if (!_s.ok()) { \
+ std::stringstream ss; \
+ ss << "Arrow error: " << _s.ToString(); \
+ throw liborc::ParseError(ss.str()); \
+ } \
+ } while (0)
+
+#define ORC_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
+ auto status_name = (rexpr); \
+ ORC_THROW_NOT_OK(status_name.status()); \
+ lhs = std::move(status_name).ValueOrDie();
+
+#define ORC_ASSIGN_OR_THROW(lhs, rexpr) \
+ ORC_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+ lhs, rexpr);
+
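+// Translate liborc exceptions back into Arrow Statuses around calls into the
+// ORC library.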
+#define ORC_BEGIN_CATCH_NOT_OK try {
+#define ORC_END_CATCH_NOT_OK \
+ } \
+ catch (const liborc::ParseError& e) { \
+ return Status::IOError(e.what()); \
+ } \
+ catch (const liborc::InvalidArgument& e) { \
+ return Status::Invalid(e.what()); \
+ } \
+ catch (const liborc::NotImplementedYet& e) { \
+ return Status::NotImplemented(e.what()); \
+ }
+
+#define ORC_CATCH_NOT_OK(_s) \
+ ORC_BEGIN_CATCH_NOT_OK(_s); \
+ ORC_END_CATCH_NOT_OK
+
+namespace arrow {
+namespace adapters {
+namespace orc {
+
+namespace {
+
+// The following are required by ORC to be uint64_t
+constexpr uint64_t kOrcWriterBatchSize = 128 * 1024;
+constexpr uint64_t kOrcNaturalWriteSize = 128 * 1024;
+
+using internal::checked_cast;
+
+class ArrowInputFile : public liborc::InputStream {
+ public:
+ explicit ArrowInputFile(const std::shared_ptr<io::RandomAccessFile>& file)
+ : file_(file) {}
+
+ uint64_t getLength() const override {
+ ORC_ASSIGN_OR_THROW(int64_t size, file_->GetSize());
+ return static_cast<uint64_t>(size);
+ }
+
+ uint64_t getNaturalReadSize() const override { return 128 * 1024; }
+
+ void read(void* buf, uint64_t length, uint64_t offset) override {
+ ORC_ASSIGN_OR_THROW(int64_t bytes_read, file_->ReadAt(offset, length, buf));
+
+ if (static_cast<uint64_t>(bytes_read) != length) {
+ throw liborc::ParseError("Short read from arrow input file");
+ }
+ }
+
+ const std::string& getName() const override {
+ static const std::string filename("ArrowInputFile");
+ return filename;
+ }
+
+ private:
+ std::shared_ptr<io::RandomAccessFile> file_;
+};
+
+struct StripeInformation {
+ uint64_t offset;
+ uint64_t length;
+ uint64_t num_rows;
+ uint64_t first_row_of_stripe;
+};
+
+// The number of rows to read in a ColumnVectorBatch
+constexpr int64_t kReadRowsBatch = 1000;
+
+class OrcStripeReader : public RecordBatchReader {
+ public:
+ OrcStripeReader(std::unique_ptr<liborc::RowReader> row_reader,
+ std::shared_ptr<Schema> schema, int64_t batch_size, MemoryPool* pool)
+ : row_reader_(std::move(row_reader)),
+ schema_(schema),
+ pool_(pool),
+ batch_size_{batch_size} {}
+
+ std::shared_ptr<Schema> schema() const override { return schema_; }
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* out) override {
+ std::unique_ptr<liborc::ColumnVectorBatch> batch;
+ ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_));
+
+ const liborc::Type& type = row_reader_->getSelectedType();
+ if (!row_reader_->next(*batch)) {
+ out->reset();
+ return Status::OK();
+ }
+
+ std::unique_ptr<RecordBatchBuilder> builder;
+ RETURN_NOT_OK(RecordBatchBuilder::Make(schema_, pool_, batch->numElements, &builder));
+
+ // The top-level type must be a struct to read into an arrow table
+ const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
+
+ for (int i = 0; i < builder->num_fields(); i++) {
+ RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
+ batch->numElements, builder->GetField(i)));
+ }
+
+ RETURN_NOT_OK(builder->Flush(out));
+ return Status::OK();
+ }
+
+ private:
+ std::unique_ptr<liborc::RowReader> row_reader_;
+ std::shared_ptr<Schema> schema_;
+ MemoryPool* pool_;
+ int64_t batch_size_;
+};
+
+} // namespace
+
+class ORCFileReader::Impl {
+ public:
+ Impl() {}
+ ~Impl() {}
+
+ Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool) {
+ std::unique_ptr<ArrowInputFile> io_wrapper(new ArrowInputFile(file));
+ liborc::ReaderOptions options;
+ std::unique_ptr<liborc::Reader> liborc_reader;
+ ORC_CATCH_NOT_OK(liborc_reader = createReader(std::move(io_wrapper), options));
+ pool_ = pool;
+ reader_ = std::move(liborc_reader);
+ current_row_ = 0;
+
+ return Init();
+ }
+
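+  // Cache each stripe's offset, length and row count, plus the absolute row
+  // index of its first row, so stripes can later be selected by index or by
+  // row number.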
+ Status Init() {
+ int64_t nstripes = reader_->getNumberOfStripes();
+ stripes_.resize(nstripes);
+ std::unique_ptr<liborc::StripeInformation> stripe;
+ uint64_t first_row_of_stripe = 0;
+ for (int i = 0; i < nstripes; ++i) {
+ stripe = reader_->getStripe(i);
+ stripes_[i] = StripeInformation({stripe->getOffset(), stripe->getLength(),
+ stripe->getNumberOfRows(), first_row_of_stripe});
+ first_row_of_stripe += stripe->getNumberOfRows();
+ }
+ return Status::OK();
+ }
+
+ int64_t NumberOfStripes() { return stripes_.size(); }
+
+ int64_t NumberOfRows() { return reader_->getNumberOfRows(); }
+
+ Status ReadSchema(std::shared_ptr<Schema>* out) {
+ const liborc::Type& type = reader_->getType();
+ return GetArrowSchema(type, out);
+ }
+
+ Status ReadSchema(const liborc::RowReaderOptions& opts, std::shared_ptr<Schema>* out) {
+ std::unique_ptr<liborc::RowReader> row_reader;
+ ORC_CATCH_NOT_OK(row_reader = reader_->createRowReader(opts));
+ const liborc::Type& type = row_reader->getSelectedType();
+ return GetArrowSchema(type, out);
+ }
+
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() {
+ const std::list<std::string> keys = reader_->getMetadataKeys();
+ auto metadata = std::make_shared<KeyValueMetadata>();
+ for (const auto& key : keys) {
+ metadata->Append(key, reader_->getMetadataValue(key));
+ }
+ return std::const_pointer_cast<const KeyValueMetadata>(metadata);
+ }
+
+ Status GetArrowSchema(const liborc::Type& type, std::shared_ptr<Schema>* out) {
+ if (type.getKind() != liborc::STRUCT) {
+ return Status::NotImplemented(
+ "Only ORC files with a top-level struct "
+ "can be handled");
+ }
+ int size = static_cast<int>(type.getSubtypeCount());
+ std::vector<std::shared_ptr<Field>> fields;
+ for (int child = 0; child < size; ++child) {
+ std::shared_ptr<DataType> elemtype;
+ RETURN_NOT_OK(GetArrowType(type.getSubtype(child), &elemtype));
+ std::string name = type.getFieldName(child);
+ fields.push_back(field(name, elemtype));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto metadata, ReadMetadata());
+ *out = std::make_shared<Schema>(std::move(fields), std::move(metadata));
+ return Status::OK();
+ }
+
+ Status Read(std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectStripe(&opts, stripe));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
+ }
+
+ Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatch>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ RETURN_NOT_OK(SelectStripe(&opts, stripe));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
+ }
+
+ Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
+ ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
+ Status::Invalid("Out of bounds stripe: ", stripe));
+
+ opts->range(stripes_[stripe].offset, stripes_[stripe].length);
+ return Status::OK();
+ }
+
+ Status SelectStripeWithRowNumber(liborc::RowReaderOptions* opts, int64_t row_number,
+ StripeInformation* out) {
+ ARROW_RETURN_IF(row_number >= NumberOfRows(),
+ Status::Invalid("Out of bounds row number: ", row_number));
+
+ for (auto it = stripes_.begin(); it != stripes_.end(); it++) {
+ if (static_cast<uint64_t>(row_number) >= it->first_row_of_stripe &&
+ static_cast<uint64_t>(row_number) < it->first_row_of_stripe + it->num_rows) {
+ opts->range(it->offset, it->length);
+ *out = *it;
+ return Status::OK();
+ }
+ }
+
+    return Status::Invalid("Invalid row number: ", row_number);
+ }
+
+ Status SelectIndices(liborc::RowReaderOptions* opts,
+ const std::vector<int>& include_indices) {
+ std::list<uint64_t> include_indices_list;
+ for (auto it = include_indices.begin(); it != include_indices.end(); ++it) {
+ ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index"));
+ include_indices_list.push_back(*it);
+ }
+ opts->includeTypes(include_indices_list);
+ return Status::OK();
+ }
+
+ Status ReadTable(const liborc::RowReaderOptions& row_opts,
+ const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts(row_opts);
+ std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
+ for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
+ opts.range(stripes_[stripe].offset, stripes_[stripe].length);
+ RETURN_NOT_OK(ReadBatch(opts, schema, stripes_[stripe].num_rows, &batches[stripe]));
+ }
+ return Table::FromRecordBatches(schema, std::move(batches)).Value(out);
+ }
+
+ Status ReadBatch(const liborc::RowReaderOptions& opts,
+ const std::shared_ptr<Schema>& schema, int64_t nrows,
+ std::shared_ptr<RecordBatch>* out) {
+ std::unique_ptr<liborc::RowReader> row_reader;
+ std::unique_ptr<liborc::ColumnVectorBatch> batch;
+
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch));
+ ORC_END_CATCH_NOT_OK
+
+ std::unique_ptr<RecordBatchBuilder> builder;
+ RETURN_NOT_OK(RecordBatchBuilder::Make(schema, pool_, nrows, &builder));
+
+ // The top-level type must be a struct to read into an arrow table
+ const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
+
+ const liborc::Type& type = row_reader->getSelectedType();
+ while (row_reader->next(*batch)) {
+ for (int i = 0; i < builder->num_fields(); i++) {
+ RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
+ batch->numElements, builder->GetField(i)));
+ }
+ }
+ RETURN_NOT_OK(builder->Flush(out));
+ return Status::OK();
+ }
+
+ Status Seek(int64_t row_number) {
+ ARROW_RETURN_IF(row_number >= NumberOfRows(),
+ Status::Invalid("Out of bounds row number: ", row_number));
+
+ current_row_ = row_number;
+ return Status::OK();
+ }
+
+ Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ if (current_row_ >= NumberOfRows()) {
+ out->reset();
+ return Status::OK();
+ }
+
+ liborc::RowReaderOptions opts;
+ if (!include_indices.empty()) {
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ }
+ StripeInformation stripe_info({0, 0, 0, 0});
+ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ std::unique_ptr<liborc::RowReader> row_reader;
+
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ row_reader->seekToRow(current_row_);
+ current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
+ ORC_END_CATCH_NOT_OK
+
+ *out = std::shared_ptr<RecordBatchReader>(
+ new OrcStripeReader(std::move(row_reader), schema, batch_size, pool_));
+ return Status::OK();
+ }
+
+ Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out) {
+ return NextStripeReader(batch_size, {}, out);
+ }
+
+ private:
+ MemoryPool* pool_;
+ std::unique_ptr<liborc::Reader> reader_;
+ std::vector<StripeInformation> stripes_;
+ int64_t current_row_;
+};
+
+ORCFileReader::ORCFileReader() { impl_.reset(new ORCFileReader::Impl()); }
+
+ORCFileReader::~ORCFileReader() {}
+
+Status ORCFileReader::Open(const std::shared_ptr<io::RandomAccessFile>& file,
+ MemoryPool* pool, std::unique_ptr<ORCFileReader>* reader) {
+ auto result = std::unique_ptr<ORCFileReader>(new ORCFileReader());
+ RETURN_NOT_OK(result->impl_->Open(file, pool));
+ *reader = std::move(result);
+ return Status::OK();
+}
+
+Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
+ return impl_->ReadMetadata();
+}
+
+Status ORCFileReader::ReadSchema(std::shared_ptr<Schema>* out) {
+ return impl_->ReadSchema(out);
+}
+
+Status ORCFileReader::Read(std::shared_ptr<Table>* out) { return impl_->Read(out); }
+
+Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(schema, out);
+}
+
+Status ORCFileReader::Read(const std::vector<int>& include_indices,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(include_indices, out);
+}
+
+Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(schema, include_indices, out);
+}
+
+Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
+ return impl_->ReadStripe(stripe, out);
+}
+
+Status ORCFileReader::ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatch>* out) {
+ return impl_->ReadStripe(stripe, include_indices, out);
+}
+
+Status ORCFileReader::Seek(int64_t row_number) { return impl_->Seek(row_number); }
+
+Status ORCFileReader::NextStripeReader(int64_t batch_size,
+                                       std::shared_ptr<RecordBatchReader>* out) {
+  return impl_->NextStripeReader(batch_size, out);
+}
+
+Status ORCFileReader::NextStripeReader(int64_t batch_size,
+ const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ return impl_->NextStripeReader(batch_size, include_indices, out);
+}
+
+int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
+
+int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
+
+namespace {
+
+class ArrowOutputStream : public liborc::OutputStream {
+ public:
+ explicit ArrowOutputStream(arrow::io::OutputStream& output_stream)
+ : output_stream_(output_stream), length_(0) {}
+
+ uint64_t getLength() const override { return length_; }
+
+ uint64_t getNaturalWriteSize() const override { return kOrcNaturalWriteSize; }
+
+ void write(const void* buf, size_t length) override {
+ ORC_THROW_NOT_OK(output_stream_.Write(buf, static_cast<int64_t>(length)));
+ length_ += static_cast<int64_t>(length);
+ }
+
+  // Mandatory because we implement an ORC virtual class.
+  // Used by ORC for error messages; not used by Arrow.
+ const std::string& getName() const override {
+ static const std::string filename("ArrowOutputFile");
+ return filename;
+ }
+
+ void close() override {
+ if (!output_stream_.closed()) {
+ ORC_THROW_NOT_OK(output_stream_.Close());
+ }
+ }
+
+ void set_length(int64_t length) { length_ = length; }
+
+ private:
+ arrow::io::OutputStream& output_stream_;
+ int64_t length_;
+};
+
+} // namespace
+
+class ORCFileWriter::Impl {
+ public:
+ Status Open(arrow::io::OutputStream* output_stream) {
+ out_stream_ = std::unique_ptr<liborc::OutputStream>(
+ checked_cast<liborc::OutputStream*>(new ArrowOutputStream(*output_stream)));
+ return Status::OK();
+ }
+
+ Status Write(const Table& table) {
+ std::unique_ptr<liborc::WriterOptions> orc_options =
+ std::unique_ptr<liborc::WriterOptions>(new liborc::WriterOptions());
+ ARROW_ASSIGN_OR_RAISE(auto orc_schema, GetOrcType(*(table.schema())));
+ ORC_CATCH_NOT_OK(
+ writer_ = liborc::createWriter(*orc_schema, out_stream_.get(), *orc_options))
+
+ int64_t num_rows = table.num_rows();
+ const int num_cols_ = table.num_columns();
+ std::vector<int64_t> arrow_index_offset(num_cols_, 0);
+ std::vector<int> arrow_chunk_offset(num_cols_, 0);
+ std::unique_ptr<liborc::ColumnVectorBatch> batch =
+ writer_->createRowBatch(kOrcWriterBatchSize);
+ liborc::StructVectorBatch* root =
+ internal::checked_cast<liborc::StructVectorBatch*>(batch.get());
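+    // Write the table in fixed-size row batches; arrow_chunk_offset and
+    // arrow_index_offset track the per-column read position across the chunk
+    // boundaries of each ChunkedArray.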
+ while (num_rows > 0) {
+ for (int i = 0; i < num_cols_; i++) {
+ RETURN_NOT_OK(adapters::orc::WriteBatch(
+ *(table.column(i)), kOrcWriterBatchSize, &(arrow_chunk_offset[i]),
+ &(arrow_index_offset[i]), (root->fields)[i]));
+ }
+ root->numElements = (root->fields)[0]->numElements;
+ writer_->add(*batch);
+ batch->clear();
+ num_rows -= kOrcWriterBatchSize;
+ }
+ return Status::OK();
+ }
+
+ Status Close() {
+ writer_->close();
+ return Status::OK();
+ }
+
+ private:
+ std::unique_ptr<liborc::Writer> writer_;
+ std::unique_ptr<liborc::OutputStream> out_stream_;
+};
+
+ORCFileWriter::~ORCFileWriter() {}
+
+ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); }
+
+Result<std::unique_ptr<ORCFileWriter>> ORCFileWriter::Open(
+ io::OutputStream* output_stream) {
+ std::unique_ptr<ORCFileWriter> result =
+ std::unique_ptr<ORCFileWriter>(new ORCFileWriter());
+ Status status = result->impl_->Open(output_stream);
+ RETURN_NOT_OK(status);
+ return std::move(result);
+}
+
+Status ORCFileWriter::Write(const Table& table) { return impl_->Write(table); }
+
+Status ORCFileWriter::Close() { return impl_->Close(); }
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h
new file mode 100644
index 00000000000..012c1701980
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h
@@ -0,0 +1,181 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace adapters {
+namespace orc {
+
+/// \class ORCFileReader
+/// \brief Read an Arrow Table or RecordBatch from an ORC file.
+class ARROW_EXPORT ORCFileReader {
+ public:
+ ~ORCFileReader();
+
+ /// \brief Creates a new ORC reader.
+ ///
+ /// \param[in] file the data source
+ /// \param[in] pool a MemoryPool to use for buffer allocations
+ /// \param[out] reader the returned reader object
+ /// \return Status
+ static Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool,
+ std::unique_ptr<ORCFileReader>* reader);
+
+ /// \brief Return the metadata read from the ORC file
+ ///
+ /// \return A KeyValueMetadata object containing the ORC metadata
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
+
+ /// \brief Return the schema read from the ORC file
+ ///
+ /// \param[out] out the returned Schema object
+ Status ReadSchema(std::shared_ptr<Schema>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[out] out the returned Table
+ Status Read(std::shared_ptr<Table>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[in] schema the Table schema
+ /// \param[out] out the returned Table
+ Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned Table
+ Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[in] schema the Table schema
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned Table
+ Status Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
+
+ /// \brief Read a single stripe as a RecordBatch
+ ///
+ /// \param[in] stripe the stripe index
+ /// \param[out] out the returned RecordBatch
+ Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out);
+
+ /// \brief Read a single stripe as a RecordBatch
+ ///
+ /// \param[in] stripe the stripe index
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned RecordBatch
+ Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatch>* out);
+
+  /// \brief Seek to the designated row. Invoking NextStripeReader() after a
+  ///        seek will return a stripe reader starting from the designated row.
+  ///
+  /// \param[in] row_number the row number to seek to
+ Status Seek(int64_t row_number);
+
+  /// \brief Get a stripe-level record batch iterator with the specified row
+  ///        count in each record batch. NextStripeReader serves as a
+  ///        fine-grained alternative to ReadStripe, which may cause OOM issues
+  ///        by loading whole stripes into memory.
+  ///
+  /// \param[in] batch_size the maximum number of rows in each returned
+  /// record batch.
+ /// \param[out] out the returned stripe reader
+ Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out);
+
+  /// \brief Get a stripe-level record batch iterator with the specified row
+  ///        count in each record batch. NextStripeReader serves as a
+  ///        fine-grained alternative to ReadStripe, which may cause OOM issues
+  ///        by loading whole stripes into memory.
+  ///
+  /// \param[in] batch_size the maximum number of rows in each returned
+  /// record batch.
+ ///
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned stripe reader
+ Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatchReader>* out);
+
+ /// \brief The number of stripes in the file
+ int64_t NumberOfStripes();
+
+ /// \brief The number of rows in the file
+ int64_t NumberOfRows();
+
+ private:
+ class Impl;
+ std::unique_ptr<Impl> impl_;
+ ORCFileReader();
+};
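+
+// A minimal usage sketch (illustrative only; assumes `input` is an
+// already-opened std::shared_ptr<arrow::io::RandomAccessFile>):
+//
+//   std::unique_ptr<ORCFileReader> reader;
+//   ARROW_RETURN_NOT_OK(
+//       ORCFileReader::Open(input, default_memory_pool(), &reader));
+//   std::shared_ptr<Table> table;
+//   ARROW_RETURN_NOT_OK(reader->Read(&table));
+//
+//   // Or, to bound memory usage, iterate stripe readers instead:
+//   std::shared_ptr<RecordBatchReader> stripe_reader;
+//   ARROW_RETURN_NOT_OK(reader->NextStripeReader(1024, &stripe_reader));
+//   while (stripe_reader != nullptr) {
+//     // ... consume record batches from stripe_reader ...
+//     ARROW_RETURN_NOT_OK(reader->NextStripeReader(1024, &stripe_reader));
+//   }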
+
+/// \class ORCFileWriter
+/// \brief Write an Arrow Table or RecordBatch to an ORC file.
+class ARROW_EXPORT ORCFileWriter {
+ public:
+ ~ORCFileWriter();
+ /// \brief Creates a new ORC writer.
+ ///
+ /// \param[in] output_stream a pointer to the io::OutputStream to write into
+ /// \return the returned writer object
+ static Result<std::unique_ptr<ORCFileWriter>> Open(io::OutputStream* output_stream);
+
+ /// \brief Write a table
+ ///
+ /// \param[in] table the Arrow table from which data is extracted
+ /// \return Status
+ Status Write(const Table& table);
+
+ /// \brief Close an ORC writer (orc::Writer)
+ ///
+ /// \return Status
+ Status Close();
+
+ private:
+ class Impl;
+ std::unique_ptr<Impl> impl_;
+
+ private:
+ ORCFileWriter();
+};
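+
+// A minimal usage sketch (illustrative only; assumes `output` is a raw pointer
+// to an open arrow::io::OutputStream and `table` is an arrow::Table):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto writer, ORCFileWriter::Open(output));
+//   ARROW_RETURN_NOT_OK(writer->Write(table));
+//   ARROW_RETURN_NOT_OK(writer->Close());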
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc
new file mode 100644
index 00000000000..f956a6f6217
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc
@@ -0,0 +1,1069 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/adapters/orc/adapter_util.h"
+
+#include <cmath>
+#include <string>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/range.h"
+#include "arrow/util/string_view.h"
+#include "arrow/visitor_inline.h"
+#include "orc/Exceptions.hh"
+#include "orc/MemoryPool.hh"
+#include "orc/OrcFile.hh"
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace adapters {
+namespace orc {
+
+namespace {
+
+// Conversion factors between seconds, milliseconds, microseconds and nanoseconds
+constexpr int64_t kOneSecondMillis = 1000LL;
+constexpr int64_t kOneMicroNanos = 1000LL;
+constexpr int64_t kOneSecondMicros = 1000000LL;
+constexpr int64_t kOneMilliNanos = 1000000LL;
+constexpr int64_t kOneSecondNanos = 1000000000LL;
+
+Status AppendStructBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<StructBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::StructVectorBatch*>(column_vector_batch);
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ RETURN_NOT_OK(builder->AppendValues(length, valid_bytes));
+
+ for (int i = 0; i < builder->num_fields(); i++) {
+ RETURN_NOT_OK(AppendBatch(type->getSubtype(i), batch->fields[i], offset, length,
+ builder->field_builder(i)));
+ }
+ return Status::OK();
+}
+
+Status AppendListBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<ListBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::ListVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* elements = batch->elements.get();
+ const liborc::Type* elemtype = type->getSubtype(0);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ int64_t start = batch->offsets[i];
+ int64_t end = batch->offsets[i + 1];
+ RETURN_NOT_OK(builder->Append());
+ RETURN_NOT_OK(
+ AppendBatch(elemtype, elements, start, end - start, builder->value_builder()));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+Status AppendMapBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<MapBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::MapVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* keys = batch->keys.get();
+ liborc::ColumnVectorBatch* items = batch->elements.get();
+ const liborc::Type* key_type = type->getSubtype(0);
+ const liborc::Type* item_type = type->getSubtype(1);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ int64_t start = batch->offsets[i];
+ int64_t end = batch->offsets[i + 1];
+ RETURN_NOT_OK(builder->Append());
+ RETURN_NOT_OK(
+ AppendBatch(key_type, keys, start, end - start, builder->key_builder()));
+ RETURN_NOT_OK(
+ AppendBatch(item_type, items, start, end - start, builder->item_builder()));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+template <class BuilderType, class BatchType, class ElemType>
+Status AppendNumericBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BuilderType*>(abuilder);
+ auto batch = checked_cast<BatchType*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ const ElemType* source = batch->data.data() + offset;
+ RETURN_NOT_OK(builder->AppendValues(source, length, valid_bytes));
+ return Status::OK();
+}
+
+template <class BuilderType, class TargetType, class BatchType, class SourceType>
+Status AppendNumericBatchCast(liborc::ColumnVectorBatch* column_vector_batch,
+ int64_t offset, int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BuilderType*>(abuilder);
+ auto batch = checked_cast<BatchType*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ const SourceType* source = batch->data.data() + offset;
+ auto cast_iter = internal::MakeLazyRange(
+ [&source](int64_t index) { return static_cast<TargetType>(source[index]); },
+ length);
+
+ RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));
+
+ return Status::OK();
+}
+
+Status AppendBoolBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BooleanBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::LongVectorBatch*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ const int64_t* source = batch->data.data() + offset;
+
+ auto cast_iter = internal::MakeLazyRange(
+ [&source](int64_t index) { return static_cast<bool>(source[index]); }, length);
+
+ RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));
+
+ return Status::OK();
+}
+
+Status AppendTimestampBatch(liborc::ColumnVectorBatch* column_vector_batch,
+ int64_t offset, int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<TimestampBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+
+ const int64_t* seconds = batch->data.data() + offset;
+ const int64_t* nanos = batch->nanoseconds.data() + offset;
+
+ auto transform_timestamp = [seconds, nanos](int64_t index) {
+ return seconds[index] * kOneSecondNanos + nanos[index];
+ };
+
+ auto transform_range = internal::MakeLazyRange(transform_timestamp, length);
+
+ RETURN_NOT_OK(
+ builder->AppendValues(transform_range.begin(), transform_range.end(), valid_bytes));
+ return Status::OK();
+}
+
+template <class BuilderType>
+Status AppendBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BuilderType*>(abuilder);
+ auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(
+ builder->Append(batch->data[i], static_cast<int32_t>(batch->length[i])));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+Status AppendFixedBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch,
+ int64_t offset, int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<FixedSizeBinaryBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(builder->Append(batch->data[i]));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+Status AppendDecimalBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<Decimal128Builder*>(abuilder);
+
+ const bool has_nulls = column_vector_batch->hasNulls;
+ if (type->getPrecision() == 0 || type->getPrecision() > 18) {
+ auto batch = checked_cast<liborc::Decimal128VectorBatch*>(column_vector_batch);
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(builder->Append(
+ Decimal128(batch->values[i].getHighBits(), batch->values[i].getLowBits())));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ } else {
+ auto batch = checked_cast<liborc::Decimal64VectorBatch*>(column_vector_batch);
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(builder->Append(Decimal128(batch->values[i])));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ }
+ return Status::OK();
+}
+
+} // namespace
+
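+// Dispatch on the ORC type kind and append rows [offset, offset + length)
+// from the ORC column batch to the corresponding Arrow builder.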
+Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
+ int64_t offset, int64_t length, ArrayBuilder* builder) {
+ if (type == nullptr) {
+ return Status::OK();
+ }
+ liborc::TypeKind kind = type->getKind();
+ switch (kind) {
+ case liborc::STRUCT:
+ return AppendStructBatch(type, batch, offset, length, builder);
+ case liborc::LIST:
+ return AppendListBatch(type, batch, offset, length, builder);
+ case liborc::MAP:
+ return AppendMapBatch(type, batch, offset, length, builder);
+ case liborc::LONG:
+ return AppendNumericBatch<Int64Builder, liborc::LongVectorBatch, int64_t>(
+ batch, offset, length, builder);
+ case liborc::INT:
+ return AppendNumericBatchCast<Int32Builder, int32_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::SHORT:
+ return AppendNumericBatchCast<Int16Builder, int16_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::BYTE:
+ return AppendNumericBatchCast<Int8Builder, int8_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::DOUBLE:
+ return AppendNumericBatch<DoubleBuilder, liborc::DoubleVectorBatch, double>(
+ batch, offset, length, builder);
+ case liborc::FLOAT:
+ return AppendNumericBatchCast<FloatBuilder, float, liborc::DoubleVectorBatch,
+ double>(batch, offset, length, builder);
+ case liborc::BOOLEAN:
+ return AppendBoolBatch(batch, offset, length, builder);
+ case liborc::VARCHAR:
+ case liborc::STRING:
+ return AppendBinaryBatch<StringBuilder>(batch, offset, length, builder);
+ case liborc::BINARY:
+ return AppendBinaryBatch<BinaryBuilder>(batch, offset, length, builder);
+ case liborc::CHAR:
+ return AppendFixedBinaryBatch(batch, offset, length, builder);
+ case liborc::DATE:
+ return AppendNumericBatchCast<Date32Builder, int32_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::TIMESTAMP:
+ return AppendTimestampBatch(batch, offset, length, builder);
+ case liborc::DECIMAL:
+ return AppendDecimalBatch(type, batch, offset, length, builder);
+ default:
+ return Status::NotImplemented("Not implemented type kind: ", kind);
+ }
+}
+
+namespace {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+Status WriteBatch(const Array& parray, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch);
+
+// Make sure the children of a StructArray have null bitmaps consistent with
+// the parent: a slot that is null in the parent must also be marked null in
+// every child, so the parent bitmap is ANDed into each child's bitmap.
+Result<std::shared_ptr<Array>> NormalizeArray(const std::shared_ptr<Array>& array) {
+ Type::type kind = array->type_id();
+ switch (kind) {
+ case Type::type::STRUCT: {
+ if (array->null_count() == 0) {
+ return array;
+ } else {
+ auto struct_array = checked_pointer_cast<StructArray>(array);
+ const std::shared_ptr<Buffer> bitmap = struct_array->null_bitmap();
+ std::shared_ptr<DataType> struct_type = struct_array->type();
+ std::size_t size = struct_type->fields().size();
+ std::vector<std::shared_ptr<Array>> new_children(size, nullptr);
+ for (std::size_t i = 0; i < size; i++) {
+ std::shared_ptr<Array> child = struct_array->field(i);
+ const std::shared_ptr<Buffer> child_bitmap = child->null_bitmap();
+ std::shared_ptr<Buffer> final_child_bitmap;
+ if (child_bitmap == nullptr) {
+ final_child_bitmap = bitmap;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ final_child_bitmap,
+ internal::BitmapAnd(default_memory_pool(), bitmap->data(), 0,
+ child_bitmap->data(), 0, struct_array->length(), 0));
+ }
+ std::shared_ptr<ArrayData> child_array_data = child->data();
+ std::vector<std::shared_ptr<Buffer>> child_buffers = child_array_data->buffers;
+ child_buffers[0] = final_child_bitmap;
+ std::shared_ptr<ArrayData> new_child_array_data =
+ ArrayData::Make(child->type(), child->length(), child_buffers,
+ child_array_data->child_data, child_array_data->dictionary);
+ ARROW_ASSIGN_OR_RAISE(new_children[i],
+ NormalizeArray(MakeArray(new_child_array_data)));
+ }
+ return std::make_shared<StructArray>(struct_type, struct_array->length(),
+ new_children, bitmap);
+ }
+ }
+ case Type::type::LIST: {
+ auto list_array = checked_pointer_cast<ListArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
+ return std::make_shared<ListArray>(list_array->type(), list_array->length(),
+ list_array->value_offsets(), value_array,
+ list_array->null_bitmap());
+ }
+ case Type::type::LARGE_LIST: {
+ auto list_array = checked_pointer_cast<LargeListArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
+ return std::make_shared<LargeListArray>(list_array->type(), list_array->length(),
+ list_array->value_offsets(), value_array,
+ list_array->null_bitmap());
+ }
+ case Type::type::FIXED_SIZE_LIST: {
+ auto list_array = checked_pointer_cast<FixedSizeListArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
+ return std::make_shared<FixedSizeListArray>(list_array->type(),
+ list_array->length(), value_array,
+ list_array->null_bitmap());
+ }
+ case Type::type::MAP: {
+ auto map_array = checked_pointer_cast<MapArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto key_array, NormalizeArray(map_array->keys()));
+ ARROW_ASSIGN_OR_RAISE(auto item_array, NormalizeArray(map_array->items()));
+ return std::make_shared<MapArray>(map_array->type(), map_array->length(),
+ map_array->value_offsets(), key_array, item_array,
+ map_array->null_bitmap());
+ }
+ default: {
+ return array;
+ }
+ }
+}
+
+template <class DataType, class BatchType, typename Enable = void>
+struct Appender {};
+
+// Types for long/double-like Appender, that is, numeric, boolean or date32
+template <typename T>
+using is_generic_type =
+ std::integral_constant<bool, is_number_type<T>::value ||
+ std::is_same<Date32Type, T>::value ||
+ is_boolean_type<T>::value>;
+template <typename T, typename R = void>
+using enable_if_generic = enable_if_t<is_generic_type<T>::value, R>;
+
+// Number-like
+template <class DataType, class BatchType>
+struct Appender<DataType, BatchType, enable_if_generic<DataType>> {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ using ValueType = typename TypeTraits<DataType>::CType;
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(ValueType v) {
+ batch->data[running_orc_offset] = array.Value(running_arrow_offset);
+ batch->notNull[running_orc_offset] = true;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const ArrayType& array;
+ BatchType* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+// Binary
+template <class DataType>
+struct Appender<DataType, liborc::StringVectorBatch> {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ using COffsetType = typename TypeTraits<DataType>::OffsetType::c_type;
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ COffsetType data_length = 0;
+ batch->data[running_orc_offset] = reinterpret_cast<char*>(
+ const_cast<uint8_t*>(array.GetValue(running_arrow_offset, &data_length)));
+ batch->length[running_orc_offset] = data_length;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const ArrayType& array;
+ liborc::StringVectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+// Decimal
+template <>
+struct Appender<Decimal128Type, liborc::Decimal64VectorBatch> {
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ const Decimal128 dec_value(array.GetValue(running_arrow_offset));
+ batch->values[running_orc_offset] = static_cast<int64_t>(dec_value.low_bits());
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const Decimal128Array& array;
+ liborc::Decimal64VectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+template <>
+struct Appender<Decimal128Type, liborc::Decimal128VectorBatch> {
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ const Decimal128 dec_value(array.GetValue(running_arrow_offset));
+ batch->values[running_orc_offset] =
+ liborc::Int128(dec_value.high_bits(), dec_value.low_bits());
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const Decimal128Array& array;
+ liborc::Decimal128VectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+// Date64 and Timestamp
+template <class DataType>
+struct TimestampAppender {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(int64_t v) {
+ int64_t data = array.Value(running_arrow_offset);
+ batch->notNull[running_orc_offset] = true;
+ batch->data[running_orc_offset] =
+ static_cast<int64_t>(std::floor(data / conversion_factor_from_second));
+ batch->nanoseconds[running_orc_offset] =
+ (data - conversion_factor_from_second * batch->data[running_orc_offset]) *
+ conversion_factor_to_nano;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const ArrayType& array;
+ liborc::TimestampVectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+ int64_t conversion_factor_from_second, conversion_factor_to_nano;
+};
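+
+// Editor's note (worked example, not in the original patch): for a
+// milliseconds timestamp, conversion_factor_from_second is 1000 and
+// conversion_factor_to_nano is 1000000, so a value of 1500 ms splits into
+// data = 1500 / 1000 = 1 (seconds) and
+// nanoseconds = (1500 - 1000 * 1) * 1000000 = 500000000.
+// Since both factors are int64_t, the division truncates before std::floor
+// is applied.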
+
+// FSB
+struct FixedSizeBinaryAppender {
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ batch->data[running_orc_offset] = reinterpret_cast<char*>(
+ const_cast<uint8_t*>(array.GetValue(running_arrow_offset)));
+ batch->length[running_orc_offset] = data_length;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const FixedSizeBinaryArray& array;
+ liborc::StringVectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+ const int32_t data_length;
+};
+
+// static_cast from int64_t or double to itself shouldn't introduce overhead
+// Please see
+// https://stackoverflow.com/questions/19106826/
+// can-static-cast-to-same-type-introduce-runtime-overhead
+template <class DataType, class BatchType>
+Status WriteGenericBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ const ArrayType& array_(checked_cast<const ArrayType&>(array));
+ auto batch = checked_cast<BatchType*>(column_vector_batch);
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ Appender<DataType, BatchType> appender{array_, batch, orc_offset, 0};
+ ArrayDataVisitor<DataType> visitor;
+ RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
+ return Status::OK();
+}
+
+template <class DataType>
+Status WriteTimestampBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch,
+                           int64_t conversion_factor_from_second,
+                           int64_t conversion_factor_to_nano) {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ const ArrayType& array_(checked_cast<const ArrayType&>(array));
+ auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ TimestampAppender<DataType> appender{array_,
+ batch,
+ orc_offset,
+ 0,
+ conversion_factor_from_second,
+ conversion_factor_to_nano};
+ ArrayDataVisitor<DataType> visitor;
+ RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
+ return Status::OK();
+}
+
+Status WriteFixedSizeBinaryBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ const FixedSizeBinaryArray& array_(checked_cast<const FixedSizeBinaryArray&>(array));
+ auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ FixedSizeBinaryAppender appender{array_, batch, orc_offset, 0, array_.byte_width()};
+ ArrayDataVisitor<FixedSizeBinaryType> visitor;
+ RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
+ return Status::OK();
+}
+
+Status WriteStructBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ std::shared_ptr<Array> array_ = MakeArray(array.data());
+ std::shared_ptr<StructArray> struct_array(checked_pointer_cast<StructArray>(array_));
+ auto batch = checked_cast<liborc::StructVectorBatch*>(column_vector_batch);
+ std::size_t size = array.type()->fields().size();
+ int64_t arrow_length = array.length();
+ int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
+ // First fill fields of ColumnVectorBatch
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ for (; running_arrow_offset < arrow_length;
+ running_orc_offset++, running_arrow_offset++) {
+ if (array.IsNull(running_arrow_offset)) {
+ batch->notNull[running_orc_offset] = false;
+ } else {
+ batch->notNull[running_orc_offset] = true;
+ }
+ }
+ // Fill the fields
+ for (std::size_t i = 0; i < size; i++) {
+ batch->fields[i]->resize(orc_offset + arrow_length);
+ RETURN_NOT_OK(WriteBatch(*(struct_array->field(i)), orc_offset, batch->fields[i]));
+ }
+ return Status::OK();
+}
+
+template <class ArrayType>
+Status WriteListBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ const ArrayType& list_array(checked_cast<const ArrayType&>(array));
+ auto batch = checked_cast<liborc::ListVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* element_batch = (batch->elements).get();
+ int64_t arrow_length = array.length();
+ int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
+ if (orc_offset == 0) {
+ batch->offsets[0] = 0;
+ }
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ for (; running_arrow_offset < arrow_length;
+ running_orc_offset++, running_arrow_offset++) {
+ if (array.IsNull(running_arrow_offset)) {
+ batch->notNull[running_orc_offset] = false;
+ batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset];
+ } else {
+ batch->notNull[running_orc_offset] = true;
+ batch->offsets[running_orc_offset + 1] =
+ batch->offsets[running_orc_offset] +
+ list_array.value_offset(running_arrow_offset + 1) -
+ list_array.value_offset(running_arrow_offset);
+ element_batch->resize(batch->offsets[running_orc_offset + 1]);
+ int64_t subarray_arrow_offset = list_array.value_offset(running_arrow_offset),
+ subarray_orc_offset = batch->offsets[running_orc_offset],
+ subarray_orc_length =
+ batch->offsets[running_orc_offset + 1] - subarray_orc_offset;
+ RETURN_NOT_OK(WriteBatch(
+ *(list_array.values()->Slice(subarray_arrow_offset, subarray_orc_length)),
+ subarray_orc_offset, element_batch));
+ }
+ }
+ return Status::OK();
+}
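+
+// Editor's note (hedged illustration): given the Arrow list array
+// [[1, 2], null, [3]] written at orc_offset 0, the loop above yields ORC
+// offsets [0, 2, 2, 3]: a null row repeats the previous offset, while a valid
+// row extends it by the Arrow value length and then recurses into the child
+// batch over exactly that slice.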
+
+Status WriteMapBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ const MapArray& map_array(checked_cast<const MapArray&>(array));
+ auto batch = checked_cast<liborc::MapVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* key_batch = (batch->keys).get();
+ liborc::ColumnVectorBatch* element_batch = (batch->elements).get();
+ std::shared_ptr<Array> key_array = map_array.keys();
+ std::shared_ptr<Array> element_array = map_array.items();
+ int64_t arrow_length = array.length();
+ int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
+ if (orc_offset == 0) {
+ batch->offsets[0] = 0;
+ }
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ for (; running_arrow_offset < arrow_length;
+ running_orc_offset++, running_arrow_offset++) {
+ if (array.IsNull(running_arrow_offset)) {
+ batch->notNull[running_orc_offset] = false;
+ batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset];
+ } else {
+ batch->notNull[running_orc_offset] = true;
+ batch->offsets[running_orc_offset + 1] =
+ batch->offsets[running_orc_offset] +
+ map_array.value_offset(running_arrow_offset + 1) -
+ map_array.value_offset(running_arrow_offset);
+ int64_t subarray_arrow_offset = map_array.value_offset(running_arrow_offset),
+ subarray_orc_offset = batch->offsets[running_orc_offset],
+ new_subarray_orc_offset = batch->offsets[running_orc_offset + 1],
+ subarray_orc_length = new_subarray_orc_offset - subarray_orc_offset;
+ key_batch->resize(new_subarray_orc_offset);
+ element_batch->resize(new_subarray_orc_offset);
+ RETURN_NOT_OK(
+ WriteBatch(*(key_array->Slice(subarray_arrow_offset, subarray_orc_length)),
+ subarray_orc_offset, key_batch));
+ RETURN_NOT_OK(
+ WriteBatch(*(element_array->Slice(subarray_arrow_offset, subarray_orc_length)),
+ subarray_orc_offset, element_batch));
+ }
+ }
+ return Status::OK();
+}
+
+Status WriteBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ Type::type kind = array.type_id();
+ column_vector_batch->numElements = orc_offset;
+ switch (kind) {
+ case Type::type::BOOL:
+ return WriteGenericBatch<BooleanType, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT8:
+ return WriteGenericBatch<Int8Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT16:
+ return WriteGenericBatch<Int16Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT32:
+ return WriteGenericBatch<Int32Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT64:
+ return WriteGenericBatch<Int64Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::FLOAT:
+ return WriteGenericBatch<FloatType, liborc::DoubleVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::DOUBLE:
+ return WriteGenericBatch<DoubleType, liborc::DoubleVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::BINARY:
+ return WriteGenericBatch<BinaryType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::LARGE_BINARY:
+ return WriteGenericBatch<LargeBinaryType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::STRING:
+ return WriteGenericBatch<StringType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::LARGE_STRING:
+ return WriteGenericBatch<LargeStringType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::FIXED_SIZE_BINARY:
+ return WriteFixedSizeBinaryBatch(array, orc_offset, column_vector_batch);
+ case Type::type::DATE32:
+ return WriteGenericBatch<Date32Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::DATE64:
+ return WriteTimestampBatch<Date64Type>(array, orc_offset, column_vector_batch,
+ kOneSecondMillis, kOneMilliNanos);
+ case Type::type::TIMESTAMP: {
+ switch (internal::checked_pointer_cast<TimestampType>(array.type())->unit()) {
+ case TimeUnit::type::SECOND:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, 1, kOneSecondNanos);
+ case TimeUnit::type::MILLI:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, kOneSecondMillis, kOneMilliNanos);
+ case TimeUnit::type::MICRO:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, kOneSecondMicros, kOneMicroNanos);
+ case TimeUnit::type::NANO:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, kOneSecondNanos, 1);
+ default:
+ return Status::TypeError("Unknown or unsupported Arrow type: ",
+ array.type()->ToString());
+ }
+ }
+ case Type::type::DECIMAL128: {
+ int32_t precision = checked_pointer_cast<Decimal128Type>(array.type())->precision();
+ if (precision > 18) {
+ return WriteGenericBatch<Decimal128Type, liborc::Decimal128VectorBatch>(
+ array, orc_offset, column_vector_batch);
+ } else {
+ return WriteGenericBatch<Decimal128Type, liborc::Decimal64VectorBatch>(
+ array, orc_offset, column_vector_batch);
+ }
+ }
+ case Type::type::STRUCT:
+ return WriteStructBatch(array, orc_offset, column_vector_batch);
+ case Type::type::LIST:
+ return WriteListBatch<ListArray>(array, orc_offset, column_vector_batch);
+ case Type::type::LARGE_LIST:
+ return WriteListBatch<LargeListArray>(array, orc_offset, column_vector_batch);
+ case Type::type::FIXED_SIZE_LIST:
+ return WriteListBatch<FixedSizeListArray>(array, orc_offset, column_vector_batch);
+ case Type::type::MAP:
+ return WriteMapBatch(array, orc_offset, column_vector_batch);
+ default: {
+ return Status::NotImplemented("Unknown or unsupported Arrow type: ",
+ array.type()->ToString());
+ }
+ }
+ return Status::OK();
+}
+
+Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const DataType& type) {
+ Type::type kind = type.id();
+ switch (kind) {
+ case Type::type::BOOL:
+ return liborc::createPrimitiveType(liborc::TypeKind::BOOLEAN);
+ case Type::type::INT8:
+ return liborc::createPrimitiveType(liborc::TypeKind::BYTE);
+ case Type::type::INT16:
+ return liborc::createPrimitiveType(liborc::TypeKind::SHORT);
+ case Type::type::INT32:
+ return liborc::createPrimitiveType(liborc::TypeKind::INT);
+ case Type::type::INT64:
+ return liborc::createPrimitiveType(liborc::TypeKind::LONG);
+ case Type::type::FLOAT:
+ return liborc::createPrimitiveType(liborc::TypeKind::FLOAT);
+ case Type::type::DOUBLE:
+ return liborc::createPrimitiveType(liborc::TypeKind::DOUBLE);
+ // Use STRING instead of VARCHAR for now, both use UTF-8
+ case Type::type::STRING:
+ case Type::type::LARGE_STRING:
+ return liborc::createPrimitiveType(liborc::TypeKind::STRING);
+ case Type::type::BINARY:
+ case Type::type::LARGE_BINARY:
+ case Type::type::FIXED_SIZE_BINARY:
+ return liborc::createPrimitiveType(liborc::TypeKind::BINARY);
+ case Type::type::DATE32:
+ return liborc::createPrimitiveType(liborc::TypeKind::DATE);
+ case Type::type::DATE64:
+ case Type::type::TIMESTAMP:
+ return liborc::createPrimitiveType(liborc::TypeKind::TIMESTAMP);
+ case Type::type::DECIMAL128: {
+ const uint64_t precision =
+ static_cast<uint64_t>(checked_cast<const Decimal128Type&>(type).precision());
+ const uint64_t scale =
+ static_cast<uint64_t>(checked_cast<const Decimal128Type&>(type).scale());
+ return liborc::createDecimalType(precision, scale);
+ }
+ case Type::type::LIST:
+ case Type::type::FIXED_SIZE_LIST:
+ case Type::type::LARGE_LIST: {
+ std::shared_ptr<DataType> arrow_child_type =
+ checked_cast<const BaseListType&>(type).value_type();
+ ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
+ return liborc::createListType(std::move(orc_subtype));
+ }
+ case Type::type::STRUCT: {
+ ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createStructType();
+ std::vector<std::shared_ptr<Field>> arrow_fields =
+ checked_cast<const StructType&>(type).fields();
+ for (std::vector<std::shared_ptr<Field>>::iterator it = arrow_fields.begin();
+ it != arrow_fields.end(); ++it) {
+ std::string field_name = (*it)->name();
+ std::shared_ptr<DataType> arrow_child_type = (*it)->type();
+ ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
+ out_type->addStructField(field_name, std::move(orc_subtype));
+ }
+ return std::move(out_type);
+ }
+ case Type::type::MAP: {
+ std::shared_ptr<DataType> key_arrow_type =
+ checked_cast<const MapType&>(type).key_type();
+ std::shared_ptr<DataType> item_arrow_type =
+ checked_cast<const MapType&>(type).item_type();
+ ARROW_ASSIGN_OR_RAISE(auto key_orc_type, GetOrcType(*key_arrow_type));
+ ARROW_ASSIGN_OR_RAISE(auto item_orc_type, GetOrcType(*item_arrow_type));
+ return liborc::createMapType(std::move(key_orc_type), std::move(item_orc_type));
+ }
+ case Type::type::DENSE_UNION:
+ case Type::type::SPARSE_UNION: {
+ ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createUnionType();
+ std::vector<std::shared_ptr<Field>> arrow_fields =
+ checked_cast<const UnionType&>(type).fields();
+ for (std::vector<std::shared_ptr<Field>>::iterator it = arrow_fields.begin();
+ it != arrow_fields.end(); ++it) {
+ std::string field_name = (*it)->name();
+ std::shared_ptr<DataType> arrow_child_type = (*it)->type();
+ ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
+ out_type->addUnionChild(std::move(orc_subtype));
+ }
+ return std::move(out_type);
+ }
+ default: {
+ return Status::NotImplemented("Unknown or unsupported Arrow type: ",
+ type.ToString());
+ }
+ }
+}
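+
+// Editor's note (illustrative example, assuming liborc's usual Hive-style
+// type rendering): GetOrcType maps an Arrow struct<a: int32, b: list<string>>
+// to the ORC type struct<a:int,b:array<string>>; Date64 and all Timestamp
+// units collapse to TIMESTAMP, and every binary-like type maps to BINARY, per
+// the cases above.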
+
+} // namespace
+
+Status WriteBatch(const ChunkedArray& chunked_array, int64_t length,
+ int* arrow_chunk_offset, int64_t* arrow_index_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ int num_batch = chunked_array.num_chunks();
+ int64_t orc_offset = 0;
+ while (*arrow_chunk_offset < num_batch && orc_offset < length) {
+ ARROW_ASSIGN_OR_RAISE(auto array,
+ NormalizeArray(chunked_array.chunk(*arrow_chunk_offset)));
+ int64_t num_written_elements =
+ std::min(length - orc_offset, array->length() - *arrow_index_offset);
+ if (num_written_elements > 0) {
+ RETURN_NOT_OK(WriteBatch(*(array->Slice(*arrow_index_offset, num_written_elements)),
+ orc_offset, column_vector_batch));
+ orc_offset += num_written_elements;
+ *arrow_index_offset += num_written_elements;
+ }
+ if (orc_offset < length) { // Another Arrow Array done
+ *arrow_index_offset = 0;
+ (*arrow_chunk_offset)++;
+ }
+ }
+ column_vector_batch->numElements = orc_offset;
+ return Status::OK();
+}
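+
+// Editor's usage sketch (under stated assumptions, not part of the original
+// patch): a writer calls this once per ORC batch, letting the two offsets
+// carry state across calls. Assuming a ChunkedArray `chunks` and a compatible
+// `batch` sized for 1024 rows:
+//
+//   int chunk_offset = 0;
+//   int64_t index_offset = 0;
+//   RETURN_NOT_OK(WriteBatch(chunks, /*length=*/1024, &chunk_offset,
+//                            &index_offset, batch));
+//   // batch->numElements now holds the number of rows actually written; the
+//   // offsets point at the first unwritten element, ready for the next call.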
+
+Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out) {
+ // When subselecting fields on read, liborc will set some nodes to nullptr,
+  // so we need to check for nullptr before proceeding
+ if (type == nullptr) {
+ *out = null();
+ return Status::OK();
+ }
+ liborc::TypeKind kind = type->getKind();
+ const int subtype_count = static_cast<int>(type->getSubtypeCount());
+
+ switch (kind) {
+ case liborc::BOOLEAN:
+ *out = boolean();
+ break;
+ case liborc::BYTE:
+ *out = int8();
+ break;
+ case liborc::SHORT:
+ *out = int16();
+ break;
+ case liborc::INT:
+ *out = int32();
+ break;
+ case liborc::LONG:
+ *out = int64();
+ break;
+ case liborc::FLOAT:
+ *out = float32();
+ break;
+ case liborc::DOUBLE:
+ *out = float64();
+ break;
+ case liborc::VARCHAR:
+ case liborc::STRING:
+ *out = utf8();
+ break;
+ case liborc::BINARY:
+ *out = binary();
+ break;
+ case liborc::CHAR:
+ *out = fixed_size_binary(static_cast<int>(type->getMaximumLength()));
+ break;
+ case liborc::TIMESTAMP:
+ *out = timestamp(TimeUnit::NANO);
+ break;
+ case liborc::DATE:
+ *out = date32();
+ break;
+ case liborc::DECIMAL: {
+ const int precision = static_cast<int>(type->getPrecision());
+ const int scale = static_cast<int>(type->getScale());
+ if (precision == 0) {
+        // In Hive 0.11/0.12 the precision is set to 0, which means maximum precision
+ *out = decimal128(38, 6);
+ } else {
+ *out = decimal128(precision, scale);
+ }
+ break;
+ }
+ case liborc::LIST: {
+ if (subtype_count != 1) {
+ return Status::TypeError("Invalid Orc List type");
+ }
+ std::shared_ptr<DataType> elemtype;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &elemtype));
+ *out = list(elemtype);
+ break;
+ }
+ case liborc::MAP: {
+ if (subtype_count != 2) {
+ return Status::TypeError("Invalid Orc Map type");
+ }
+ std::shared_ptr<DataType> key_type, item_type;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &key_type));
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(1), &item_type));
+ *out = map(key_type, item_type);
+ break;
+ }
+ case liborc::STRUCT: {
+ std::vector<std::shared_ptr<Field>> fields;
+ for (int child = 0; child < subtype_count; ++child) {
+ std::shared_ptr<DataType> elem_type;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type));
+ std::string name = type->getFieldName(child);
+ fields.push_back(field(name, elem_type));
+ }
+ *out = struct_(fields);
+ break;
+ }
+ case liborc::UNION: {
+ std::vector<std::shared_ptr<Field>> fields;
+ std::vector<int8_t> type_codes;
+ for (int child = 0; child < subtype_count; ++child) {
+ std::shared_ptr<DataType> elem_type;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type));
+ fields.push_back(field("_union_" + std::to_string(child), elem_type));
+ type_codes.push_back(static_cast<int8_t>(child));
+ }
+ *out = sparse_union(fields, type_codes);
+ break;
+ }
+ default: {
+ return Status::TypeError("Unknown Orc type kind: ", type->toString());
+ }
+ }
+ return Status::OK();
+}
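+
+// Editor's note (illustration): the reverse mapping is intentionally lossy in
+// places. For example, ORC CHAR(16) becomes fixed_size_binary(16), every ORC
+// TIMESTAMP is read back as timestamp(TimeUnit::NANO), and a Hive 0.11/0.12
+// decimal written with precision 0 is read back as decimal128(38, 6).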
+
+Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const Schema& schema) {
+  int num_fields = schema.num_fields();
+  ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createStructType();
+  for (int i = 0; i < num_fields; i++) {
+ std::shared_ptr<Field> field = schema.field(i);
+ std::string field_name = field->name();
+ std::shared_ptr<DataType> arrow_child_type = field->type();
+ ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
+ out_type->addStructField(field_name, std::move(orc_subtype));
+ }
+ return std::move(out_type);
+}
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h
new file mode 100644
index 00000000000..3e6d0fcc660
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/status.h"
+#include "orc/OrcFile.hh"
+
+namespace liborc = orc;
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out);
+
+Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const Schema& schema);
+
+Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
+ int64_t offset, int64_t length, arrow::ArrayBuilder* builder);
+
+/// \brief Write a chunked array to an orc::ColumnVectorBatch
+///
+/// \param[in] chunked_array the chunked array
+/// \param[in] length the maximum number of elements to write into the
+/// orc::ColumnVectorBatch
+/// \param[in,out] arrow_chunk_offset The index of the chunk currently being
+/// processed, advanced as chunks are exhausted
+/// \param[in,out] arrow_index_offset The element index into the current chunk,
+/// advanced past the elements written by this call
+/// \param[in,out] column_vector_batch the orc::ColumnVectorBatch to be filled
+/// \return Status
+Status WriteBatch(const ChunkedArray& chunked_array, int64_t length,
+ int* arrow_chunk_offset, int64_t* arrow_index_offset,
+ liborc::ColumnVectorBatch* column_vector_batch);
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/api.h b/contrib/libs/apache/arrow/cpp/src/arrow/api.h
new file mode 100644
index 00000000000..8958eaf1c9a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/api.h
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Coarse public API while the library is in development
+
+#pragma once
+
+#include "arrow/array.h" // IYWU pragma: export
+#include "arrow/array/concatenate.h" // IYWU pragma: export
+#include "arrow/buffer.h" // IYWU pragma: export
+#include "arrow/builder.h" // IYWU pragma: export
+#include "arrow/chunked_array.h" // IYWU pragma: export
+#include "arrow/compare.h" // IYWU pragma: export
+#include "arrow/config.h" // IYWU pragma: export
+#include "arrow/datum.h" // IYWU pragma: export
+#include "arrow/extension_type.h" // IYWU pragma: export
+#include "arrow/memory_pool.h" // IYWU pragma: export
+#include "arrow/pretty_print.h" // IYWU pragma: export
+#include "arrow/record_batch.h" // IYWU pragma: export
+#include "arrow/result.h" // IYWU pragma: export
+#include "arrow/status.h" // IYWU pragma: export
+#include "arrow/table.h" // IYWU pragma: export
+#include "arrow/table_builder.h" // IYWU pragma: export
+#include "arrow/tensor.h" // IYWU pragma: export
+#include "arrow/type.h" // IYWU pragma: export
+#include "arrow/util/key_value_metadata.h" // IWYU pragma: export
+#include "arrow/visitor.h" // IYWU pragma: export
+
+/// \brief Top-level namespace for Apache Arrow C++ API
+namespace arrow {}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array.h b/contrib/libs/apache/arrow/cpp/src/arrow/array.h
new file mode 100644
index 00000000000..739d65e0a5d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Kitchen-sink public API for arrow::Array data structures. C++ library code
+// (especially header files) in Apache Arrow should use more specific headers
+// unless it's a file that uses most or all Array types in which case using
+// arrow/array.h is fine.
+
+#pragma once
+
+#include "arrow/array/array_base.h" // IWYU pragma: keep
+#include "arrow/array/array_binary.h" // IWYU pragma: keep
+#include "arrow/array/array_decimal.h" // IWYU pragma: keep
+#include "arrow/array/array_dict.h" // IWYU pragma: keep
+#include "arrow/array/array_nested.h" // IWYU pragma: keep
+#include "arrow/array/array_primitive.h" // IWYU pragma: keep
+#include "arrow/array/data.h" // IWYU pragma: keep
+#include "arrow/array/util.h" // IWYU pragma: keep
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/README.md b/contrib/libs/apache/arrow/cpp/src/arrow/array/README.md
new file mode 100644
index 00000000000..01ffa104eb4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/README.md
@@ -0,0 +1,20 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+## Implementation details related to columnar (array) data structures
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc
new file mode 100644
index 00000000000..67c5ca84e1f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc
@@ -0,0 +1,308 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/array_base.h"
+
+#include <cstdint>
+#include <memory>
+#include <sstream> // IWYU pragma: keep
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/array/array_binary.h"
+#include "arrow/array/array_dict.h"
+#include "arrow/array/array_nested.h"
+#include "arrow/array/array_primitive.h"
+#include "arrow/array/util.h"
+#include "arrow/array/validate.h"
+#include "arrow/buffer.h"
+#include "arrow/compare.h"
+#include "arrow/pretty_print.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/logging.h"
+#include "arrow/visitor.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+class ExtensionArray;
+
+// ----------------------------------------------------------------------
+// Base array class
+
+int64_t Array::null_count() const { return data_->GetNullCount(); }
+
+namespace internal {
+
+struct ScalarFromArraySlotImpl {
+ template <typename T>
+ using ScalarType = typename TypeTraits<T>::ScalarType;
+
+ Status Visit(const NullArray& a) {
+ out_ = std::make_shared<NullScalar>();
+ return Status::OK();
+ }
+
+ Status Visit(const BooleanArray& a) { return Finish(a.Value(index_)); }
+
+ template <typename T>
+ Status Visit(const NumericArray<T>& a) {
+ return Finish(a.Value(index_));
+ }
+
+ Status Visit(const Decimal128Array& a) {
+ return Finish(Decimal128(a.GetValue(index_)));
+ }
+
+ Status Visit(const Decimal256Array& a) {
+ return Finish(Decimal256(a.GetValue(index_)));
+ }
+
+ template <typename T>
+ Status Visit(const BaseBinaryArray<T>& a) {
+ return Finish(a.GetString(index_));
+ }
+
+ Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); }
+
+ Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); }
+
+ template <typename T>
+ Status Visit(const BaseListArray<T>& a) {
+ return Finish(a.value_slice(index_));
+ }
+
+ Status Visit(const FixedSizeListArray& a) { return Finish(a.value_slice(index_)); }
+
+ Status Visit(const StructArray& a) {
+ ScalarVector children;
+ for (const auto& child : a.fields()) {
+ children.emplace_back();
+ ARROW_ASSIGN_OR_RAISE(children.back(), child->GetScalar(index_));
+ }
+ return Finish(std::move(children));
+ }
+
+ Status Visit(const SparseUnionArray& a) {
+ // child array which stores the actual value
+ auto arr = a.field(a.child_id(index_));
+ // no need to adjust the index
+ ARROW_ASSIGN_OR_RAISE(auto value, arr->GetScalar(index_));
+ if (value->is_valid) {
+ out_ = std::shared_ptr<Scalar>(new SparseUnionScalar(value, a.type()));
+ } else {
+ out_ = MakeNullScalar(a.type());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DenseUnionArray& a) {
+ // child array which stores the actual value
+ auto arr = a.field(a.child_id(index_));
+ // need to look up the value based on offsets
+ auto offset = a.value_offset(index_);
+ ARROW_ASSIGN_OR_RAISE(auto value, arr->GetScalar(offset));
+ if (value->is_valid) {
+ out_ = std::shared_ptr<Scalar>(new DenseUnionScalar(value, a.type()));
+ } else {
+ out_ = MakeNullScalar(a.type());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryArray& a) {
+ auto ty = a.type();
+
+ ARROW_ASSIGN_OR_RAISE(auto index,
+ MakeScalar(checked_cast<DictionaryType&>(*ty).index_type(),
+ a.GetValueIndex(index_)));
+
+ auto scalar = DictionaryScalar(ty);
+ scalar.is_valid = a.IsValid(index_);
+ scalar.value.index = index;
+ scalar.value.dictionary = a.dictionary();
+
+ out_ = std::make_shared<DictionaryScalar>(std::move(scalar));
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionArray& a) {
+ return Status::NotImplemented("Non-null ExtensionScalar");
+ }
+
+ template <typename Arg>
+ Status Finish(Arg&& arg) {
+ return MakeScalar(array_.type(), std::forward<Arg>(arg)).Value(&out_);
+ }
+
+ Status Finish(std::string arg) {
+ return MakeScalar(array_.type(), Buffer::FromString(std::move(arg))).Value(&out_);
+ }
+
+ Result<std::shared_ptr<Scalar>> Finish() && {
+ if (index_ >= array_.length()) {
+ return Status::IndexError("tried to refer to element ", index_,
+ " but array is only ", array_.length(), " long");
+ }
+
+ if (array_.IsNull(index_)) {
+ auto null = MakeNullScalar(array_.type());
+ if (is_dictionary(array_.type()->id())) {
+ auto& dict_null = checked_cast<DictionaryScalar&>(*null);
+ const auto& dict_array = checked_cast<const DictionaryArray&>(array_);
+ dict_null.value.dictionary = dict_array.dictionary();
+ }
+ return null;
+ }
+
+ RETURN_NOT_OK(VisitArrayInline(array_, this));
+ return std::move(out_);
+ }
+
+ ScalarFromArraySlotImpl(const Array& array, int64_t index)
+ : array_(array), index_(index) {}
+
+ const Array& array_;
+ int64_t index_;
+ std::shared_ptr<Scalar> out_;
+};
+
+} // namespace internal
+
+Result<std::shared_ptr<Scalar>> Array::GetScalar(int64_t i) const {
+ return internal::ScalarFromArraySlotImpl{*this, i}.Finish();
+}
+
+std::string Array::Diff(const Array& other) const {
+ std::stringstream diff;
+ ARROW_IGNORE_EXPR(Equals(other, EqualOptions().diff_sink(&diff)));
+ return diff.str();
+}
+
+bool Array::Equals(const Array& arr, const EqualOptions& opts) const {
+ return ArrayEquals(*this, arr, opts);
+}
+
+bool Array::Equals(const std::shared_ptr<Array>& arr, const EqualOptions& opts) const {
+ if (!arr) {
+ return false;
+ }
+ return Equals(*arr, opts);
+}
+
+bool Array::ApproxEquals(const Array& arr, const EqualOptions& opts) const {
+ return ArrayApproxEquals(*this, arr, opts);
+}
+
+bool Array::ApproxEquals(const std::shared_ptr<Array>& arr,
+ const EqualOptions& opts) const {
+ if (!arr) {
+ return false;
+ }
+ return ApproxEquals(*arr, opts);
+}
+
+bool Array::RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
+ int64_t other_start_idx, const EqualOptions& opts) const {
+ return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, opts);
+}
+
+bool Array::RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
+ int64_t end_idx, int64_t other_start_idx,
+ const EqualOptions& opts) const {
+ if (!other) {
+ return false;
+ }
+ return ArrayRangeEquals(*this, *other, start_idx, end_idx, other_start_idx, opts);
+}
+
+bool Array::RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
+ const Array& other, const EqualOptions& opts) const {
+ return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, opts);
+}
+
+bool Array::RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
+ const std::shared_ptr<Array>& other,
+ const EqualOptions& opts) const {
+ if (!other) {
+ return false;
+ }
+ return ArrayRangeEquals(*this, *other, start_idx, end_idx, other_start_idx, opts);
+}
+
+std::shared_ptr<Array> Array::Slice(int64_t offset, int64_t length) const {
+ return MakeArray(data_->Slice(offset, length));
+}
+
+std::shared_ptr<Array> Array::Slice(int64_t offset) const {
+ int64_t slice_length = data_->length - offset;
+ return Slice(offset, slice_length);
+}
+
+Result<std::shared_ptr<Array>> Array::SliceSafe(int64_t offset, int64_t length) const {
+ ARROW_ASSIGN_OR_RAISE(auto sliced_data, data_->SliceSafe(offset, length));
+ return MakeArray(std::move(sliced_data));
+}
+
+Result<std::shared_ptr<Array>> Array::SliceSafe(int64_t offset) const {
+ if (offset < 0) {
+ // Avoid UBSAN in subtraction below
+ return Status::Invalid("Negative buffer slice offset");
+ }
+ return SliceSafe(offset, data_->length - offset);
+}
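+
+// Editor's usage sketch (hedged, not part of the original patch): slices are
+// zero-copy views that share buffers and only adjust (offset, length) in the
+// ArrayData. For an array `arr` of length 10:
+//
+//   std::shared_ptr<Array> tail = arr->Slice(7);  // offset 7, length 3
+//   ARROW_ASSIGN_OR_RAISE(auto mid, arr->SliceSafe(3, 4));  // bounds-checked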
+
+std::string Array::ToString() const {
+ std::stringstream ss;
+ ARROW_CHECK_OK(PrettyPrint(*this, 0, &ss));
+ return ss.str();
+}
+
+Result<std::shared_ptr<Array>> Array::View(
+ const std::shared_ptr<DataType>& out_type) const {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> result,
+ internal::GetArrayView(data_, out_type));
+ return MakeArray(result);
+}
+
+// ----------------------------------------------------------------------
+// NullArray
+
+NullArray::NullArray(int64_t length) {
+ SetData(ArrayData::Make(null(), length, {nullptr}, length));
+}
+
+// ----------------------------------------------------------------------
+// Implement Array::Accept as inline visitor
+
+Status Array::Accept(ArrayVisitor* visitor) const {
+ return VisitArrayInline(*this, visitor);
+}
+
+Status Array::Validate() const { return internal::ValidateArray(*this); }
+
+Status Array::ValidateFull() const {
+ RETURN_NOT_OK(internal::ValidateArray(*this));
+ return internal::ValidateArrayFull(*this);
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h
new file mode 100644
index 00000000000..2add572e7a4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h
@@ -0,0 +1,260 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+#include "arrow/visitor.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// User array accessor types
+
+/// \brief Array base type
+/// Immutable data array with some logical type and some length.
+///
+/// Any memory is owned by the respective Buffer instance (or its parents).
+///
+/// The base class is only required to have a null bitmap buffer if the null
+/// count is greater than 0
+///
+/// If known, the null count can be provided in the base Array constructor. If
+/// the null count is not known, pass -1 to indicate that the null count is to
+/// be computed on the first call to null_count()
+class ARROW_EXPORT Array {
+ public:
+ virtual ~Array() = default;
+
+ /// \brief Return true if value at index is null. Does not boundscheck
+ bool IsNull(int64_t i) const {
+ return null_bitmap_data_ != NULLPTR
+ ? !BitUtil::GetBit(null_bitmap_data_, i + data_->offset)
+ : data_->null_count == data_->length;
+ }
+
+ /// \brief Return true if value at index is valid (not null). Does not
+ /// boundscheck
+ bool IsValid(int64_t i) const {
+ return null_bitmap_data_ != NULLPTR
+ ? BitUtil::GetBit(null_bitmap_data_, i + data_->offset)
+ : data_->null_count != data_->length;
+ }
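+
+  // Editor's note (worked example, assuming a zero slice offset; not in the
+  // original header): if the first byte of the validity bitmap is 0b00000101,
+  // IsValid(0) and IsValid(2) return true and IsValid(1) returns false. When
+  // no bitmap is present, validity is decided solely by whether null_count
+  // equals length (all null) or not (all valid).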
+
+ /// \brief Return a Scalar containing the value of this array at i
+ Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;
+
+ /// Size in the number of elements this array contains.
+ int64_t length() const { return data_->length; }
+
+ /// A relative position into another array's data, to enable zero-copy
+ /// slicing. This value defaults to zero
+ int64_t offset() const { return data_->offset; }
+
+ /// The number of null entries in the array. If the null count was not known
+ /// at time of construction (and set to a negative value), then the null
+ /// count will be computed and cached on the first invocation of this
+ /// function
+ int64_t null_count() const;
+
+ std::shared_ptr<DataType> type() const { return data_->type; }
+ Type::type type_id() const { return data_->type->id(); }
+
+ /// Buffer for the validity (null) bitmap, if any. Note that Union types
+ /// never have a null bitmap.
+ ///
+ /// Note that for `null_count == 0` or for null type, this will be null.
+ /// This buffer does not account for any slice offset
+ const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
+
+ /// Raw pointer to the null bitmap.
+ ///
+ /// Note that for `null_count == 0` or for null type, this will be null.
+ /// This buffer does not account for any slice offset
+ const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }
+
+ /// Equality comparison with another array
+ bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const;
+ bool Equals(const std::shared_ptr<Array>& arr,
+ const EqualOptions& = EqualOptions::Defaults()) const;
+
+ /// \brief Return the formatted unified diff of arrow::Diff between this
+ /// Array and another Array
+ std::string Diff(const Array& other) const;
+
+ /// Approximate equality comparison with another array
+ ///
+ /// epsilon is only used if this is FloatArray or DoubleArray
+ bool ApproxEquals(const std::shared_ptr<Array>& arr,
+ const EqualOptions& = EqualOptions::Defaults()) const;
+ bool ApproxEquals(const Array& arr,
+ const EqualOptions& = EqualOptions::Defaults()) const;
+
+ /// Compare if the range of slots specified are equal for the given array and
+  /// this array. end_idx is exclusive. This method does not bounds check.
+ bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
+ const Array& other,
+ const EqualOptions& = EqualOptions::Defaults()) const;
+ bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
+ const std::shared_ptr<Array>& other,
+ const EqualOptions& = EqualOptions::Defaults()) const;
+ bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
+ int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults()) const;
+ bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
+ int64_t end_idx, int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults()) const;
+
+ Status Accept(ArrayVisitor* visitor) const;
+
+ /// Construct a zero-copy view of this array with the given type.
+ ///
+ /// This method checks if the types are layout-compatible.
+ /// Nested types are traversed in depth-first order. Data buffers must have
+ /// the same item sizes, even though the logical types may be different.
+ /// An error is returned if the types are not layout-compatible.
+ Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;
+
+ /// Construct a zero-copy slice of the array with the indicated offset and
+ /// length
+ ///
+ /// \param[in] offset the position of the first element in the constructed
+ /// slice
+ /// \param[in] length the length of the slice. If there are not enough
+ /// elements in the array, the length will be adjusted accordingly
+ ///
+ /// \return a new object wrapped in std::shared_ptr<Array>
+ std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;
+
+ /// Slice from offset until end of the array
+ std::shared_ptr<Array> Slice(int64_t offset) const;
+
+ /// Input-checking variant of Array::Slice
+ Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const;
+ /// Input-checking variant of Array::Slice
+ Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;
+
+ const std::shared_ptr<ArrayData>& data() const { return data_; }
+
+ int num_fields() const { return static_cast<int>(data_->child_data.size()); }
+
+ /// \return PrettyPrint representation of array suitable for debugging
+ std::string ToString() const;
+
+ /// \brief Perform cheap validation checks to determine obvious inconsistencies
+ /// within the array's internal data.
+ ///
+  /// This is O(k) where k is the number of descendants.
+ ///
+ /// \return Status
+ Status Validate() const;
+
+ /// \brief Perform extensive validation checks to determine inconsistencies
+ /// within the array's internal data.
+ ///
+  /// This is potentially O(k*n) where k is the number of descendants and n
+ /// is the array length.
+ ///
+ /// \return Status
+ Status ValidateFull() const;
+
+ protected:
+ Array() : null_bitmap_data_(NULLPTR) {}
+
+ std::shared_ptr<ArrayData> data_;
+ const uint8_t* null_bitmap_data_;
+
+ /// Protected method for constructors
+ void SetData(const std::shared_ptr<ArrayData>& data) {
+ if (data->buffers.size() > 0) {
+ null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
+ } else {
+ null_bitmap_data_ = NULLPTR;
+ }
+ data_ = data;
+ }
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Array);
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
+ os << x.ToString();
+ return os;
+}
+
+/// Base class for non-nested arrays
+class ARROW_EXPORT FlatArray : public Array {
+ protected:
+ using Array::Array;
+};
+
+/// Base class for arrays of fixed-size logical types
+class ARROW_EXPORT PrimitiveArray : public FlatArray {
+ public:
+ PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// Does not account for any slice offset
+ std::shared_ptr<Buffer> values() const { return data_->buffers[1]; }
+
+ protected:
+ PrimitiveArray() : raw_values_(NULLPTR) {}
+
+ void SetData(const std::shared_ptr<ArrayData>& data) {
+ this->Array::SetData(data);
+ raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
+ }
+
+ explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+
+ const uint8_t* raw_values_;
+};
+
+/// Degenerate null type Array
+class ARROW_EXPORT NullArray : public FlatArray {
+ public:
+ using TypeClass = NullType;
+
+ explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+ explicit NullArray(int64_t length);
+
+ private:
+ void SetData(const std::shared_ptr<ArrayData>& data) {
+ null_bitmap_data_ = NULLPTR;
+ data->null_count = data->length;
+ data_ = data;
+ }
+};
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc
new file mode 100644
index 00000000000..9466b5a48f9
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/array_binary.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/validate.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK(is_binary_like(data->type->id()));
+ SetData(data);
+}
+
+BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
+ int64_t offset) {
+ SetData(ArrayData::Make(binary(), length, {null_bitmap, value_offsets, data},
+ null_count, offset));
+}
+
+LargeBinaryArray::LargeBinaryArray(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK(is_large_binary_like(data->type->id()));
+ SetData(data);
+}
+
+LargeBinaryArray::LargeBinaryArray(int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap,
+ int64_t null_count, int64_t offset) {
+ SetData(ArrayData::Make(large_binary(), length, {null_bitmap, value_offsets, data},
+ null_count, offset));
+}
+
+StringArray::StringArray(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::STRING);
+ SetData(data);
+}
+
+StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
+ int64_t offset) {
+ SetData(ArrayData::Make(utf8(), length, {null_bitmap, value_offsets, data}, null_count,
+ offset));
+}
+
+Status StringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
+
+LargeStringArray::LargeStringArray(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING);
+ SetData(data);
+}
+
+LargeStringArray::LargeStringArray(int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap,
+ int64_t null_count, int64_t offset) {
+ SetData(ArrayData::Make(large_utf8(), length, {null_bitmap, value_offsets, data},
+ null_count, offset));
+}
+
+Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
+
+FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
+ SetData(data);
+}
+
+FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<DataType>& type,
+ int64_t length,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap,
+ int64_t null_count, int64_t offset)
+ : PrimitiveArray(type, length, data, null_bitmap, null_count, offset),
+ byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
+
+const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const {
+ return raw_values_ + (i + data_->offset) * byte_width_;
+}
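+
+// Editor's note (illustration): with byte_width_ = 4 and a slice offset of 2,
+// GetValue(3) returns raw_values_ + (3 + 2) * 4 = raw_values_ + 20, i.e. the
+// start of the sixth 4-byte value in the underlying buffer.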
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h
new file mode 100644
index 00000000000..f8e8c4f8a44
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h
@@ -0,0 +1,255 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Array accessor classes for Binary, LargeBinary, String, LargeString,
+// FixedSizeBinary
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/stl_iterator.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h" // IWYU pragma: export
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Binary and String
+
+/// Base class for variable-sized binary arrays, regardless of offset size
+/// and logical interpretation.
+template <typename TYPE>
+class BaseBinaryArray : public FlatArray {
+ public:
+ using TypeClass = TYPE;
+ using offset_type = typename TypeClass::offset_type;
+ using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
+
+  /// Return a pointer to the given element's bytes
+ // XXX should GetValue(int64_t i) return a string_view?
+ const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
+ // Account for base offset
+ i += data_->offset;
+ const offset_type pos = raw_value_offsets_[i];
+ *out_length = raw_value_offsets_[i + 1] - pos;
+ return raw_data_ + pos;
+ }
+
+ /// \brief Get binary value as a string_view
+ ///
+ /// \param i the value index
+ /// \return the view over the selected value
+ util::string_view GetView(int64_t i) const {
+ // Account for base offset
+ i += data_->offset;
+ const offset_type pos = raw_value_offsets_[i];
+ return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
+ raw_value_offsets_[i + 1] - pos);
+ }
+
+ /// \brief Get binary value as a string_view
+ /// Provided for consistency with other arrays.
+ ///
+ /// \param i the value index
+ /// \return the view over the selected value
+ util::string_view Value(int64_t i) const { return GetView(i); }
+
+ /// \brief Get binary value as a std::string
+ ///
+ /// \param i the value index
+ /// \return the value copied into a std::string
+ std::string GetString(int64_t i) const { return std::string(GetView(i)); }
+
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
+
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
+
+ const offset_type* raw_value_offsets() const {
+ return raw_value_offsets_ + data_->offset;
+ }
+
+ const uint8_t* raw_data() const { return raw_data_; }
+
+ /// \brief Return the data buffer absolute offset of the data for the value
+ /// at the passed index.
+ ///
+ /// Does not perform boundschecking
+ offset_type value_offset(int64_t i) const {
+ return raw_value_offsets_[i + data_->offset];
+ }
+
+ /// \brief Return the length of the data for the value at the passed index.
+ ///
+ /// Does not perform boundschecking
+ offset_type value_length(int64_t i) const {
+ i += data_->offset;
+ return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
+ }
+
+ /// \brief Return the total length of the memory in the data buffer
+ /// referenced by this array. If the array has been sliced then this may be
+ /// less than the size of the data buffer (data_->buffers[2]).
+ offset_type total_values_length() const {
+ if (data_->length > 0) {
+ return raw_value_offsets_[data_->length + data_->offset] -
+ raw_value_offsets_[data_->offset];
+ } else {
+ return 0;
+ }
+ }
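+
+  // Editor's note (worked example, assuming a zero slice offset; not in the
+  // original header): for offsets [0, 3, 3, 7] over the data "foobaaz",
+  // value_length(0) == 3 ("foo"), value_length(1) == 0 (an empty value),
+  // value_length(2) == 4 ("baaz"), and total_values_length() == 7 - 0 == 7.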
+
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+ // For subclasses
+ BaseBinaryArray() = default;
+
+ // Protected method for constructors
+ void SetData(const std::shared_ptr<ArrayData>& data) {
+ this->Array::SetData(data);
+ raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0);
+ raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
+ }
+
+ const offset_type* raw_value_offsets_ = NULLPTR;
+ const uint8_t* raw_data_ = NULLPTR;
+};
+
+/// Concrete Array class for variable-size binary data
+class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
+ public:
+ explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
+
+ BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ protected:
+ // For subclasses such as StringArray
+ BinaryArray() : BaseBinaryArray() {}
+};
+
+/// Concrete Array class for variable-size string (utf-8) data
+class ARROW_EXPORT StringArray : public BinaryArray {
+ public:
+ using TypeClass = StringType;
+
+ explicit StringArray(const std::shared_ptr<ArrayData>& data);
+
+ StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Validate that this array contains only valid UTF8 entries
+ ///
+ /// This check is also implied by ValidateFull()
+ Status ValidateUTF8() const;
+};
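[Editor's note: a minimal usage sketch for the string accessors above; it is
not part of the upstream diff. It assumes only public Arrow APIs
(arrow/api.h, StringBuilder); the function name is illustrative.]

    #include <iostream>
    #include <memory>
    #include "arrow/api.h"

    arrow::Status StringArrayExample() {
      arrow::StringBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Append("foo"));
      ARROW_RETURN_NOT_OK(builder.AppendNull());
      ARROW_RETURN_NOT_OK(builder.Append("bar"));
      std::shared_ptr<arrow::Array> array;
      ARROW_RETURN_NOT_OK(builder.Finish(&array));

      const auto& strings = static_cast<const arrow::StringArray&>(*array);
      for (int64_t i = 0; i < strings.length(); ++i) {
        if (strings.IsNull(i)) continue;
        // GetView is zero-copy; GetString would copy into a std::string.
        std::cout << strings.GetView(i) << "\n";
      }
      return strings.ValidateUTF8();
    }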
+
+/// Concrete Array class for large variable-size binary data
+class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
+ public:
+ explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
+
+ LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ protected:
+ // For subclasses such as LargeStringArray
+ LargeBinaryArray() : BaseBinaryArray() {}
+};
+
+/// Concrete Array class for large variable-size string (utf-8) data
+class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
+ public:
+ using TypeClass = LargeStringType;
+
+ explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
+
+ LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Validate that this array contains only valid UTF8 entries
+ ///
+ /// This check is also implied by ValidateFull()
+ Status ValidateUTF8() const;
+};
+
+// ----------------------------------------------------------------------
+// Fixed width binary
+
+/// Concrete Array class for fixed-size binary data
+class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
+ public:
+ using TypeClass = FixedSizeBinaryType;
+ using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
+
+ explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
+
+ FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ const uint8_t* GetValue(int64_t i) const;
+ const uint8_t* Value(int64_t i) const { return GetValue(i); }
+
+ util::string_view GetView(int64_t i) const {
+ return util::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width());
+ }
+
+ std::string GetString(int64_t i) const { return std::string(GetView(i)); }
+
+ int32_t byte_width() const { return byte_width_; }
+
+ const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }
+
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+ void SetData(const std::shared_ptr<ArrayData>& data) {
+ this->PrimitiveArray::SetData(data);
+ byte_width_ =
+ internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
+ }
+
+ int32_t byte_width_;
+};
+
+} // namespace arrow
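[Editor's note: a sketch of FixedSizeBinaryArray in use, not part of the
upstream diff; it assumes arrow/api.h and FixedSizeBinaryBuilder.]

    #include <cassert>
    #include <memory>
    #include "arrow/api.h"

    arrow::Status FixedSizeBinaryExample() {
      // Every value is exactly 4 bytes wide, so no offsets buffer is needed.
      arrow::FixedSizeBinaryBuilder builder(arrow::fixed_size_binary(4));
      ARROW_RETURN_NOT_OK(builder.Append("abcd"));
      ARROW_RETURN_NOT_OK(builder.Append("wxyz"));
      std::shared_ptr<arrow::Array> array;
      ARROW_RETURN_NOT_OK(builder.Finish(&array));

      const auto& fsb = static_cast<const arrow::FixedSizeBinaryArray&>(*array);
      assert(fsb.byte_width() == 4);
      assert(fsb.GetView(1) == "wxyz");  // view over the 4 bytes at slot 1
      return arrow::Status::OK();
    }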
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc
new file mode 100644
index 00000000000..d65f6ee5356
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/array_decimal.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "arrow/array/array_binary.h"
+#include "arrow/array/data.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+// ----------------------------------------------------------------------
+// Decimal128
+
+Decimal128Array::Decimal128Array(const std::shared_ptr<ArrayData>& data)
+ : FixedSizeBinaryArray(data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL128);
+}
+
+std::string Decimal128Array::FormatValue(int64_t i) const {
+ const auto& type_ = checked_cast<const Decimal128Type&>(*type());
+ const Decimal128 value(GetValue(i));
+ return value.ToString(type_.scale());
+}
+
+// ----------------------------------------------------------------------
+// Decimal256
+
+Decimal256Array::Decimal256Array(const std::shared_ptr<ArrayData>& data)
+ : FixedSizeBinaryArray(data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL256);
+}
+
+std::string Decimal256Array::FormatValue(int64_t i) const {
+ const auto& type_ = checked_cast<const Decimal256Type&>(*type());
+ const Decimal256 value(GetValue(i));
+ return value.ToString(type_.scale());
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h
new file mode 100644
index 00000000000..8d7d1c59cd0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "arrow/array/array_binary.h"
+#include "arrow/array/data.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Decimal128Array
+
+/// Concrete Array class for 128-bit decimal data
+class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
+ public:
+ using TypeClass = Decimal128Type;
+
+ using FixedSizeBinaryArray::FixedSizeBinaryArray;
+
+ /// \brief Construct Decimal128Array from ArrayData instance
+ explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);
+
+ std::string FormatValue(int64_t i) const;
+};
+
+// Backward compatibility
+using DecimalArray = Decimal128Array;
+
+// ----------------------------------------------------------------------
+// Decimal256Array
+
+/// Concrete Array class for 256-bit decimal data
+class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
+ public:
+ using TypeClass = Decimal256Type;
+
+ using FixedSizeBinaryArray::FixedSizeBinaryArray;
+
+ /// \brief Construct Decimal256Array from ArrayData instance
+ explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);
+
+ std::string FormatValue(int64_t i) const;
+};
+
+} // namespace arrow
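[Editor's note: a sketch of FormatValue in use, not part of the upstream
diff; it assumes the decimal128() type factory and Decimal128Builder are
available in this Arrow version.]

    #include <cassert>
    #include <memory>
    #include "arrow/api.h"

    arrow::Status DecimalExample() {
      // scale 2 => two fractional digits; values are stored unscaled.
      arrow::Decimal128Builder builder(arrow::decimal128(/*precision=*/10,
                                                         /*scale=*/2));
      ARROW_RETURN_NOT_OK(builder.Append(arrow::Decimal128(12345)));
      std::shared_ptr<arrow::Array> array;
      ARROW_RETURN_NOT_OK(builder.Finish(&array));

      const auto& decimals = static_cast<const arrow::Decimal128Array&>(*array);
      assert(decimals.FormatValue(0) == "123.45");  // 12345 rescaled by 10^-2
      return arrow::Status::OK();
    }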
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc
new file mode 100644
index 00000000000..2fa95e9a176
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc
@@ -0,0 +1,442 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/array_dict.h"
+
+#include <algorithm>
+#include <climits>
+#include <cstdint>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_primitive.h"
+#include "arrow/array/data.h"
+#include "arrow/array/dict_internal.h"
+#include "arrow/array/util.h"
+#include "arrow/buffer.h"
+#include "arrow/chunked_array.h"
+#include "arrow/datum.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/int_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::CopyBitmap;
+
+// ----------------------------------------------------------------------
+// DictionaryArray
+
+std::shared_ptr<Array> DictionaryArray::indices() const { return indices_; }
+
+int64_t DictionaryArray::GetValueIndex(int64_t i) const {
+ const uint8_t* indices_data = data_->buffers[1]->data();
+ // If the value is non-negative then we can use the unsigned path
+ switch (indices_->type_id()) {
+ case Type::UINT8:
+ case Type::INT8:
+ return static_cast<int64_t>(indices_data[data_->offset + i]);
+ case Type::UINT16:
+ case Type::INT16:
+ return static_cast<int64_t>(
+ reinterpret_cast<const uint16_t*>(indices_data)[data_->offset + i]);
+ case Type::UINT32:
+ case Type::INT32:
+ return static_cast<int64_t>(
+ reinterpret_cast<const uint32_t*>(indices_data)[data_->offset + i]);
+ case Type::UINT64:
+ case Type::INT64:
+ return static_cast<int64_t>(
+ reinterpret_cast<const uint64_t*>(indices_data)[data_->offset + i]);
+ default:
+ ARROW_CHECK(false) << "unreachable";
+ return -1;
+ }
+}
+
+DictionaryArray::DictionaryArray(const std::shared_ptr<ArrayData>& data)
+ : dict_type_(checked_cast<const DictionaryType*>(data->type.get())) {
+ ARROW_CHECK_EQ(data->type->id(), Type::DICTIONARY);
+ ARROW_CHECK_NE(data->dictionary, nullptr);
+ SetData(data);
+}
+
+void DictionaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
+ this->Array::SetData(data);
+ auto indices_data = data_->Copy();
+ indices_data->type = dict_type_->index_type();
+ indices_data->dictionary = nullptr;
+ indices_ = MakeArray(indices_data);
+}
+
+DictionaryArray::DictionaryArray(const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& indices,
+ const std::shared_ptr<Array>& dictionary)
+ : dict_type_(checked_cast<const DictionaryType*>(type.get())) {
+ ARROW_CHECK_EQ(type->id(), Type::DICTIONARY);
+ ARROW_CHECK_EQ(indices->type_id(), dict_type_->index_type()->id());
+ ARROW_CHECK_EQ(dict_type_->value_type()->id(), dictionary->type()->id());
+ DCHECK(dict_type_->value_type()->Equals(*dictionary->type()));
+ auto data = indices->data()->Copy();
+ data->type = type;
+ data->dictionary = dictionary->data();
+ SetData(data);
+}
+
+std::shared_ptr<Array> DictionaryArray::dictionary() const {
+ if (!dictionary_) {
+ dictionary_ = MakeArray(data_->dictionary);
+ }
+ return dictionary_;
+}
+
+Result<std::shared_ptr<Array>> DictionaryArray::FromArrays(
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
+ const std::shared_ptr<Array>& dictionary) {
+ if (type->id() != Type::DICTIONARY) {
+ return Status::TypeError("Expected a dictionary type");
+ }
+ const auto& dict = checked_cast<const DictionaryType&>(*type);
+ if (indices->type_id() != dict.index_type()->id()) {
+ return Status::TypeError(
+ "Dictionary type's index type does not match "
+ "indices array's type");
+ }
+ RETURN_NOT_OK(internal::CheckIndexBounds(*indices->data(),
+ static_cast<uint64_t>(dictionary->length())));
+ return std::make_shared<DictionaryArray>(type, indices, dictionary);
+}
+
+bool DictionaryArray::CanCompareIndices(const DictionaryArray& other) const {
+ DCHECK(dictionary()->type()->Equals(other.dictionary()->type()))
+ << "dictionaries have differing type " << *dictionary()->type() << " vs "
+ << *other.dictionary()->type();
+
+ if (!indices()->type()->Equals(other.indices()->type())) {
+ return false;
+ }
+
+ auto min_length = std::min(dictionary()->length(), other.dictionary()->length());
+ return dictionary()->RangeEquals(other.dictionary(), 0, min_length, 0);
+}
+
+// ----------------------------------------------------------------------
+// Dictionary transposition
+
+namespace {
+
+inline bool IsTrivialTransposition(const int32_t* transpose_map,
+ int64_t input_dict_size) {
+ for (int64_t i = 0; i < input_dict_size; ++i) {
+ if (transpose_map[i] != i) {
+ return false;
+ }
+ }
+ return true;
+}
+
+Result<std::shared_ptr<ArrayData>> TransposeDictIndices(
+ const std::shared_ptr<ArrayData>& data, const std::shared_ptr<DataType>& in_type,
+ const std::shared_ptr<DataType>& out_type,
+ const std::shared_ptr<ArrayData>& dictionary, const int32_t* transpose_map,
+ MemoryPool* pool) {
+ // Note that in_type may be different from data->type if data is of type ExtensionType
+ if (in_type->id() != Type::DICTIONARY || out_type->id() != Type::DICTIONARY) {
+ return Status::TypeError("Expected dictionary type");
+ }
+ const int64_t in_dict_len = data->dictionary->length;
+ const auto& in_dict_type = checked_cast<const DictionaryType&>(*in_type);
+ const auto& out_dict_type = checked_cast<const DictionaryType&>(*out_type);
+
+ const auto& in_index_type = *in_dict_type.index_type();
+ const auto& out_index_type =
+ checked_cast<const FixedWidthType&>(*out_dict_type.index_type());
+
+ if (in_index_type.id() == out_index_type.id() &&
+ IsTrivialTransposition(transpose_map, in_dict_len)) {
+ // Index type and values will be identical => we can simply reuse
+ // the existing buffers.
+ auto out_data =
+ ArrayData::Make(out_type, data->length, {data->buffers[0], data->buffers[1]},
+ data->null_count, data->offset);
+ out_data->dictionary = dictionary;
+ return out_data;
+ }
+
+ // Default path: compute a buffer of transposed indices.
+ ARROW_ASSIGN_OR_RAISE(
+ auto out_buffer,
+ AllocateBuffer(data->length * (out_index_type.bit_width() / CHAR_BIT), pool));
+
+ // Shift null buffer if the original offset is non-zero
+ std::shared_ptr<Buffer> null_bitmap;
+ if (data->offset != 0 && data->null_count != 0) {
+ ARROW_ASSIGN_OR_RAISE(null_bitmap, CopyBitmap(pool, data->buffers[0]->data(),
+ data->offset, data->length));
+ } else {
+ null_bitmap = data->buffers[0];
+ }
+
+ auto out_data = ArrayData::Make(out_type, data->length,
+ {null_bitmap, std::move(out_buffer)}, data->null_count);
+ out_data->dictionary = dictionary;
+ RETURN_NOT_OK(internal::TransposeInts(
+ in_index_type, out_index_type, data->GetValues<uint8_t>(1, 0),
+ out_data->GetMutableValues<uint8_t>(1, 0), data->offset, out_data->offset,
+ data->length, transpose_map));
+ return out_data;
+}
+
+} // namespace
+
+Result<std::shared_ptr<Array>> DictionaryArray::Transpose(
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
+ const int32_t* transpose_map, MemoryPool* pool) const {
+ ARROW_ASSIGN_OR_RAISE(auto transposed,
+ TransposeDictIndices(data_, data_->type, type, dictionary->data(),
+ transpose_map, pool));
+ return MakeArray(std::move(transposed));
+}
+
+// ----------------------------------------------------------------------
+// Dictionary unification
+
+namespace {
+
+template <typename T>
+class DictionaryUnifierImpl : public DictionaryUnifier {
+ public:
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+ using DictTraits = typename internal::DictionaryTraits<T>;
+ using MemoTableType = typename DictTraits::MemoTableType;
+
+ DictionaryUnifierImpl(MemoryPool* pool, std::shared_ptr<DataType> value_type)
+ : pool_(pool), value_type_(value_type), memo_table_(pool) {}
+
+ Status Unify(const Array& dictionary, std::shared_ptr<Buffer>* out) override {
+ if (dictionary.null_count() > 0) {
+ return Status::Invalid("Cannot yet unify dictionaries with nulls");
+ }
+ if (!dictionary.type()->Equals(*value_type_)) {
+ return Status::Invalid("Dictionary type different from unifier: ",
+ dictionary.type()->ToString());
+ }
+ const ArrayType& values = checked_cast<const ArrayType&>(dictionary);
+ if (out != nullptr) {
+ ARROW_ASSIGN_OR_RAISE(auto result,
+ AllocateBuffer(dictionary.length() * sizeof(int32_t), pool_));
+ auto result_raw = reinterpret_cast<int32_t*>(result->mutable_data());
+ for (int64_t i = 0; i < values.length(); ++i) {
+ RETURN_NOT_OK(memo_table_.GetOrInsert(values.GetView(i), &result_raw[i]));
+ }
+ *out = std::move(result);
+ } else {
+ for (int64_t i = 0; i < values.length(); ++i) {
+ int32_t unused_memo_index;
+ RETURN_NOT_OK(memo_table_.GetOrInsert(values.GetView(i), &unused_memo_index));
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Unify(const Array& dictionary) override { return Unify(dictionary, nullptr); }
+
+ Status GetResult(std::shared_ptr<DataType>* out_type,
+ std::shared_ptr<Array>* out_dict) override {
+ int64_t dict_length = memo_table_.size();
+ std::shared_ptr<DataType> index_type;
+ if (dict_length <= std::numeric_limits<int8_t>::max()) {
+ index_type = int8();
+ } else if (dict_length <= std::numeric_limits<int16_t>::max()) {
+ index_type = int16();
+ } else if (dict_length <= std::numeric_limits<int32_t>::max()) {
+ index_type = int32();
+ } else {
+ index_type = int64();
+ }
+ // Build unified dictionary type with the right index type
+ *out_type = arrow::dictionary(index_type, value_type_);
+
+ // Build unified dictionary array
+ std::shared_ptr<ArrayData> data;
+ RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_,
+ 0 /* start_offset */, &data));
+ *out_dict = MakeArray(data);
+ return Status::OK();
+ }
+
+ Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
+ std::shared_ptr<Array>* out_dict) override {
+ int64_t dict_length = memo_table_.size();
+ if (!internal::IntegersCanFit(Datum(dict_length), *index_type).ok()) {
+ return Status::Invalid(
+ "These dictionaries cannot be combined. The unified dictionary requires a "
+ "larger index type.");
+ }
+
+ // Build unified dictionary array
+ std::shared_ptr<ArrayData> data;
+ RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_,
+ 0 /* start_offset */, &data));
+ *out_dict = MakeArray(data);
+ return Status::OK();
+ }
+
+ private:
+ MemoryPool* pool_;
+ std::shared_ptr<DataType> value_type_;
+ MemoTableType memo_table_;
+};
+
+struct MakeUnifier {
+ MemoryPool* pool;
+ std::shared_ptr<DataType> value_type;
+ std::unique_ptr<DictionaryUnifier> result;
+
+ MakeUnifier(MemoryPool* pool, std::shared_ptr<DataType> value_type)
+ : pool(pool), value_type(value_type) {}
+
+ template <typename T>
+ enable_if_no_memoize<T, Status> Visit(const T&) {
+ // Default implementation for datatypes that do not support dictionary encoding
+ return Status::NotImplemented("Unification of ", *value_type,
+ " dictionaries is not implemented");
+ }
+
+ template <typename T>
+ enable_if_memoize<T, Status> Visit(const T&) {
+ result.reset(new DictionaryUnifierImpl<T>(pool, value_type));
+ return Status::OK();
+ }
+};
+
+struct RecursiveUnifier {
+ MemoryPool* pool;
+
+ // Return true if any of the arrays was changed (including descendants)
+ Result<bool> Unify(std::shared_ptr<DataType> type, ArrayDataVector* chunks) {
+ DCHECK(!chunks->empty());
+ bool changed = false;
+ std::shared_ptr<DataType> ext_type = nullptr;
+
+ if (type->id() == Type::EXTENSION) {
+ ext_type = std::move(type);
+ type = checked_cast<const ExtensionType&>(*ext_type).storage_type();
+ }
+
+ // Unify all child dictionaries (if any)
+ if (type->num_fields() > 0) {
+ ArrayDataVector children(chunks->size());
+ for (int i = 0; i < type->num_fields(); ++i) {
+ std::transform(chunks->begin(), chunks->end(), children.begin(),
+ [i](const std::shared_ptr<ArrayData>& array) {
+ return array->child_data[i];
+ });
+ ARROW_ASSIGN_OR_RAISE(bool child_changed,
+ Unify(type->field(i)->type(), &children));
+ if (child_changed) {
+ // Only do this when unification actually occurred
+ for (size_t j = 0; j < chunks->size(); ++j) {
+ (*chunks)[j]->child_data[i] = std::move(children[j]);
+ }
+ changed = true;
+ }
+ }
+ }
+
+ // Unify this dictionary
+ if (type->id() == Type::DICTIONARY) {
+ const auto& dict_type = checked_cast<const DictionaryType&>(*type);
+ // XXX Ideally, we should unify dictionaries nested in value_type first,
+ // but DictionaryUnifier doesn't support nested dictionaries anyway,
+ // so this will fail.
+ ARROW_ASSIGN_OR_RAISE(auto unifier,
+ DictionaryUnifier::Make(dict_type.value_type(), this->pool));
+ // Unify all dictionary array chunks
+ BufferVector transpose_maps(chunks->size());
+ for (size_t j = 0; j < chunks->size(); ++j) {
+ DCHECK_NE((*chunks)[j]->dictionary, nullptr);
+ RETURN_NOT_OK(
+ unifier->Unify(*MakeArray((*chunks)[j]->dictionary), &transpose_maps[j]));
+ }
+ std::shared_ptr<Array> dictionary;
+ RETURN_NOT_OK(unifier->GetResultWithIndexType(dict_type.index_type(), &dictionary));
+ for (size_t j = 0; j < chunks->size(); ++j) {
+ ARROW_ASSIGN_OR_RAISE(
+ (*chunks)[j],
+ TransposeDictIndices(
+ (*chunks)[j], type, type, dictionary->data(),
+ reinterpret_cast<const int32_t*>(transpose_maps[j]->data()), this->pool));
+ if (ext_type) {
+ (*chunks)[j]->type = ext_type;
+ }
+ }
+ changed = true;
+ }
+
+ return changed;
+ }
+};
+
+} // namespace
+
+Result<std::unique_ptr<DictionaryUnifier>> DictionaryUnifier::Make(
+ std::shared_ptr<DataType> value_type, MemoryPool* pool) {
+ MakeUnifier maker(pool, value_type);
+ RETURN_NOT_OK(VisitTypeInline(*value_type, &maker));
+ return std::move(maker.result);
+}
+
+Result<std::shared_ptr<ChunkedArray>> DictionaryUnifier::UnifyChunkedArray(
+ const std::shared_ptr<ChunkedArray>& array, MemoryPool* pool) {
+ if (array->num_chunks() <= 1) {
+ return array;
+ }
+
+ ArrayDataVector data_chunks(array->num_chunks());
+ std::transform(array->chunks().begin(), array->chunks().end(), data_chunks.begin(),
+ [](const std::shared_ptr<Array>& array) { return array->data(); });
+ ARROW_ASSIGN_OR_RAISE(bool changed,
+ RecursiveUnifier{pool}.Unify(array->type(), &data_chunks));
+ if (!changed) {
+ return array;
+ }
+ ArrayVector chunks(array->num_chunks());
+ std::transform(data_chunks.begin(), data_chunks.end(), chunks.begin(),
+ [](const std::shared_ptr<ArrayData>& data) { return MakeArray(data); });
+ return std::make_shared<ChunkedArray>(std::move(chunks), array->type());
+}
+
+Result<std::shared_ptr<Table>> DictionaryUnifier::UnifyTable(const Table& table,
+ MemoryPool* pool) {
+ ChunkedArrayVector columns = table.columns();
+ for (auto& col : columns) {
+ ARROW_ASSIGN_OR_RAISE(col, DictionaryUnifier::UnifyChunkedArray(col, pool));
+ }
+ return Table::Make(table.schema(), std::move(columns), table.num_rows());
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h
new file mode 100644
index 00000000000..8791eaa07db
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h
@@ -0,0 +1,180 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// DictionaryArray
+
+/// \brief Array type for dictionary-encoded data with a
+/// data-dependent dictionary
+///
+/// A dictionary array contains an array of non-negative integers (the
+/// "dictionary indices") along with a data type containing a "dictionary"
+/// corresponding to the distinct values represented in the data.
+///
+/// For example, the array
+///
+/// ["foo", "bar", "foo", "bar", "foo", "bar"]
+///
+/// with dictionary ["bar", "foo"], would have dictionary array representation
+///
+/// indices: [1, 0, 1, 0, 1, 0]
+/// dictionary: ["bar", "foo"]
+///
+/// The indices in principle may be any integer type.
+class ARROW_EXPORT DictionaryArray : public Array {
+ public:
+ using TypeClass = DictionaryType;
+
+ explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);
+
+ DictionaryArray(const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& indices,
+ const std::shared_ptr<Array>& dictionary);
+
+ /// \brief Construct a DictionaryArray from dictionary and indices
+ /// arrays, with validation
+ ///
+ /// This function validates the indices and the input type: it checks that
+ /// all indices are non-negative and smaller than the size of the dictionary.
+ ///
+ /// \param[in] type a dictionary type
+ /// \param[in] indices an array of non-negative integers smaller than the
+ /// size of the dictionary
+ /// \param[in] dictionary the dictionary with the same value type as the
+ /// type object
+ static Result<std::shared_ptr<Array>> FromArrays(
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
+ const std::shared_ptr<Array>& dictionary);
+
+ static Result<std::shared_ptr<Array>> FromArrays(
+ const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
+ return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
+ dictionary);
+ }
+
+ /// \brief Transpose this DictionaryArray
+ ///
+ /// This method constructs a new dictionary array with the given dictionary
+ /// type, transposing indices using the transpose map. The type and the
+ /// transpose map are typically computed using DictionaryUnifier.
+ ///
+ /// \param[in] type the new type object
+ /// \param[in] dictionary the new dictionary
+ /// \param[in] transpose_map transposition array of this array's indices
+ /// into the target array's indices
+ /// \param[in] pool a pool to allocate the array data from
+ Result<std::shared_ptr<Array>> Transpose(
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
+ const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;
+
+ /// \brief Determine whether dictionary arrays may be compared without unification
+ bool CanCompareIndices(const DictionaryArray& other) const;
+
+ /// \brief Return the dictionary for this array, which is stored as
+ /// a member of the ArrayData internal structure
+ std::shared_ptr<Array> dictionary() const;
+ std::shared_ptr<Array> indices() const;
+
+ /// \brief Return the ith value of indices, cast to int64_t. Not recommended
+ /// for use in performance-sensitive code. Does not validate whether the
+ /// value is null or out-of-bounds.
+ int64_t GetValueIndex(int64_t i) const;
+
+ const DictionaryType* dict_type() const { return dict_type_; }
+
+ private:
+ void SetData(const std::shared_ptr<ArrayData>& data);
+ const DictionaryType* dict_type_;
+ std::shared_ptr<Array> indices_;
+
+ // Lazily initialized when invoking dictionary()
+ mutable std::shared_ptr<Array> dictionary_;
+};
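[Editor's note: a sketch of the validating FromArrays factory above, not
part of the upstream diff; it assumes arrow/api.h and the numeric/string
builders.]

    #include <cassert>
    #include <memory>
    #include "arrow/api.h"

    arrow::Status DictionaryExample() {
      // dictionary ["bar", "foo"] with indices [1, 0, 1] => ["foo", "bar", "foo"]
      arrow::StringBuilder dict_builder;
      ARROW_RETURN_NOT_OK(dict_builder.AppendValues({"bar", "foo"}));
      std::shared_ptr<arrow::Array> dictionary;
      ARROW_RETURN_NOT_OK(dict_builder.Finish(&dictionary));

      arrow::Int32Builder index_builder;
      ARROW_RETURN_NOT_OK(index_builder.AppendValues({1, 0, 1}));
      std::shared_ptr<arrow::Array> indices;
      ARROW_RETURN_NOT_OK(index_builder.Finish(&indices));

      // FromArrays checks that every index is within the dictionary bounds.
      ARROW_ASSIGN_OR_RAISE(auto array,
                            arrow::DictionaryArray::FromArrays(indices, dictionary));
      const auto& dict_array = static_cast<const arrow::DictionaryArray&>(*array);
      assert(dict_array.GetValueIndex(0) == 1);  // slot 0 points at "foo"
      return arrow::Status::OK();
    }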
+
+/// \brief Helper class for incremental dictionary unification
+class ARROW_EXPORT DictionaryUnifier {
+ public:
+ virtual ~DictionaryUnifier() = default;
+
+ /// \brief Construct a DictionaryUnifier
+ /// \param[in] value_type the data type of the dictionaries
+ /// \param[in] pool MemoryPool to use for memory allocations
+ static Result<std::unique_ptr<DictionaryUnifier>> Make(
+ std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
+
+ /// \brief Unify dictionaries across array chunks
+ ///
+ /// The dictionaries in the array chunks will be unified, and their indices
+ /// transposed accordingly.
+ ///
+ /// Only dictionaries with a primitive value type are currently supported.
+ /// However, dictionaries nested inside a more complex type are correctly unified.
+ static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
+ const std::shared_ptr<ChunkedArray>& array,
+ MemoryPool* pool = default_memory_pool());
+
+ /// \brief Unify dictionaries across the chunks of each table column
+ ///
+ /// The dictionaries in each table column will be unified, and their indices
+ /// transposed accordingly.
+ ///
+ /// Only dictionaries with a primitive value type are currently supported.
+ /// However, dictionaries nested inside a more complex type are correctly unified.
+ static Result<std::shared_ptr<Table>> UnifyTable(
+ const Table& table, MemoryPool* pool = default_memory_pool());
+
+ /// \brief Append dictionary to the internal memo
+ virtual Status Unify(const Array& dictionary) = 0;
+
+ /// \brief Append dictionary and compute transpose indices
+ /// \param[in] dictionary the dictionary values to unify
+ /// \param[out] out_transpose a Buffer containing computed transpose indices
+ /// as int32_t values equal in length to the passed dictionary. The value in
+ /// each slot corresponds to the new index value for each original index
+ /// for a DictionaryArray with the old dictionary
+ virtual Status Unify(const Array& dictionary,
+ std::shared_ptr<Buffer>* out_transpose) = 0;
+
+ /// \brief Return a result DictionaryType with the smallest possible index
+ /// type to accommodate the unified dictionary. The unifier cannot be used
+ /// after this is called
+ virtual Status GetResult(std::shared_ptr<DataType>* out_type,
+ std::shared_ptr<Array>* out_dict) = 0;
+
+ /// \brief Return a unified dictionary with the given index type. If
+ /// the index type is not large enough then an invalid status will be returned.
+ /// The unifier cannot be used after this is called
+ virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
+ std::shared_ptr<Array>* out_dict) = 0;
+};
+
+} // namespace arrow
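[Editor's note: a sketch of the incremental unifier workflow declared above,
not part of the upstream diff; dict_a and dict_b stand for two utf8
dictionary-value arrays to be unified.]

    #include <memory>
    #include "arrow/api.h"

    arrow::Status UnifierExample(const std::shared_ptr<arrow::Array>& dict_a,
                                 const std::shared_ptr<arrow::Array>& dict_b) {
      // Both inputs must have the value type passed to Make().
      ARROW_ASSIGN_OR_RAISE(auto unifier,
                            arrow::DictionaryUnifier::Make(arrow::utf8()));

      std::shared_ptr<arrow::Buffer> transpose_a, transpose_b;
      ARROW_RETURN_NOT_OK(unifier->Unify(*dict_a, &transpose_a));
      ARROW_RETURN_NOT_OK(unifier->Unify(*dict_b, &transpose_b));

      // The unified dictionary plus each transpose map can then be passed to
      // DictionaryArray::Transpose on the corresponding original array.
      std::shared_ptr<arrow::DataType> out_type;
      std::shared_ptr<arrow::Array> out_dict;
      return unifier->GetResult(&out_type, &out_dict);
    }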
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc
new file mode 100644
index 00000000000..f967127c5f1
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc
@@ -0,0 +1,757 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/array_nested.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_primitive.h"
+#include "arrow/array/concatenate.h"
+#include "arrow/array/util.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/atomic_shared_ptr.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::BitmapAnd;
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+using internal::CopyBitmap;
+
+// ----------------------------------------------------------------------
+// ListArray / LargeListArray
+
+namespace {
+
+template <typename TYPE>
+Status CleanListOffsets(const Array& offsets, MemoryPool* pool,
+ std::shared_ptr<Buffer>* offset_buf_out,
+ std::shared_ptr<Buffer>* validity_buf_out) {
+ using offset_type = typename TYPE::offset_type;
+ using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
+ using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
+
+ const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets);
+ const int64_t num_offsets = offsets.length();
+
+ if (offsets.null_count() > 0) {
+ if (!offsets.IsValid(num_offsets - 1)) {
+ return Status::Invalid("Last list offset should be non-null");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto clean_offsets,
+ AllocateBuffer(num_offsets * sizeof(offset_type), pool));
+
+ // Copy valid bits, ignoring the final offset (since for a length N list array,
+ // we have N + 1 offsets)
+ ARROW_ASSIGN_OR_RAISE(
+ auto clean_valid_bits,
+ offsets.null_bitmap()->CopySlice(0, BitUtil::BytesForBits(num_offsets - 1)));
+ *validity_buf_out = clean_valid_bits;
+
+ const offset_type* raw_offsets = typed_offsets.raw_values();
+ auto clean_raw_offsets =
+ reinterpret_cast<offset_type*>(clean_offsets->mutable_data());
+
+ // Must work backwards so we can tell how many values were in the last non-null slot
+ offset_type current_offset = raw_offsets[num_offsets - 1];
+ for (int64_t i = num_offsets - 1; i >= 0; --i) {
+ if (offsets.IsValid(i)) {
+ current_offset = raw_offsets[i];
+ }
+ clean_raw_offsets[i] = current_offset;
+ }
+
+ *offset_buf_out = std::move(clean_offsets);
+ } else {
+ *validity_buf_out = offsets.null_bitmap();
+ *offset_buf_out = typed_offsets.values();
+ }
+
+ return Status::OK();
+}
+
+template <typename TYPE>
+Result<std::shared_ptr<typename TypeTraits<TYPE>::ArrayType>> ListArrayFromArrays(
+ const Array& offsets, const Array& values, MemoryPool* pool) {
+ using offset_type = typename TYPE::offset_type;
+ using ArrayType = typename TypeTraits<TYPE>::ArrayType;
+ using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
+
+ if (offsets.length() == 0) {
+ return Status::Invalid("List offsets must have non-zero length");
+ }
+
+ if (offsets.type_id() != OffsetArrowType::type_id) {
+ return Status::TypeError("List offsets must be ", OffsetArrowType::type_name());
+ }
+
+ std::shared_ptr<Buffer> offset_buf, validity_buf;
+ RETURN_NOT_OK(CleanListOffsets<TYPE>(offsets, pool, &offset_buf, &validity_buf));
+ BufferVector buffers = {validity_buf, offset_buf};
+
+ auto list_type = std::make_shared<TYPE>(values.type());
+ auto internal_data =
+ ArrayData::Make(list_type, offsets.length() - 1, std::move(buffers),
+ offsets.null_count(), offsets.offset());
+ internal_data->child_data.push_back(values.data());
+
+ return std::make_shared<ArrayType>(internal_data);
+}
+
+static std::shared_ptr<Array> SliceArrayWithOffsets(const Array& array, int64_t begin,
+ int64_t end) {
+ return array.Slice(begin, end - begin);
+}
+
+template <typename ListArrayT>
+Result<std::shared_ptr<Array>> FlattenListArray(const ListArrayT& list_array,
+ MemoryPool* memory_pool) {
+ const int64_t list_array_length = list_array.length();
+ std::shared_ptr<arrow::Array> value_array = list_array.values();
+
+ // Shortcut: if a ListArray does not contain nulls, then simply slice its
+ // value array with the first and the last offsets.
+ if (list_array.null_count() == 0) {
+ return SliceArrayWithOffsets(*value_array, list_array.value_offset(0),
+ list_array.value_offset(list_array_length));
+ }
+
+ // The ListArray contains nulls: there may be a non-empty sub-list behind
+ // a null and it must not be contained in the result.
+ std::vector<std::shared_ptr<Array>> non_null_fragments;
+ int64_t valid_begin = 0;
+ while (valid_begin < list_array_length) {
+ int64_t valid_end = valid_begin;
+ while (valid_end < list_array_length &&
+ (list_array.IsValid(valid_end) || list_array.value_length(valid_end) == 0)) {
+ ++valid_end;
+ }
+ if (valid_begin < valid_end) {
+ non_null_fragments.push_back(
+ SliceArrayWithOffsets(*value_array, list_array.value_offset(valid_begin),
+ list_array.value_offset(valid_end)));
+ }
+ valid_begin = valid_end + 1; // skip null entry
+ }
+
+ // Final attempt to avoid invoking Concatenate().
+ if (non_null_fragments.size() == 1) {
+ return non_null_fragments[0];
+ }
+
+ return Concatenate(non_null_fragments, memory_pool);
+}
+
+} // namespace
+
+namespace internal {
+
+template <typename TYPE>
+inline void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
+ Type::type expected_type_id) {
+ ARROW_CHECK_EQ(data->buffers.size(), 2);
+ ARROW_CHECK_EQ(data->type->id(), expected_type_id);
+ ARROW_CHECK_EQ(data->child_data.size(), 1);
+
+ self->Array::SetData(data);
+
+ self->list_type_ = checked_cast<const TYPE*>(data->type.get());
+ self->raw_value_offsets_ =
+ data->GetValuesSafe<typename TYPE::offset_type>(1, /*offset=*/0);
+
+ ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id());
+ DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type));
+ self->values_ = MakeArray(self->data_->child_data[0]);
+}
+
+} // namespace internal
+
+ListArray::ListArray(std::shared_ptr<ArrayData> data) { SetData(std::move(data)); }
+
+LargeListArray::LargeListArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+
+ListArray::ListArray(std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
+ std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
+ int64_t offset) {
+ ARROW_CHECK_EQ(type->id(), Type::LIST);
+ auto internal_data = ArrayData::Make(
+ std::move(type), length,
+ BufferVector{std::move(null_bitmap), std::move(value_offsets)}, null_count, offset);
+ internal_data->child_data.emplace_back(values->data());
+ SetData(std::move(internal_data));
+}
+
+void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
+ internal::SetListData(this, data);
+}
+
+LargeListArray::LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Buffer>& null_bitmap,
+ int64_t null_count, int64_t offset) {
+ ARROW_CHECK_EQ(type->id(), Type::LARGE_LIST);
+ auto internal_data =
+ ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset);
+ internal_data->child_data.emplace_back(values->data());
+ SetData(internal_data);
+}
+
+void LargeListArray::SetData(const std::shared_ptr<ArrayData>& data) {
+ internal::SetListData(this, data);
+}
+
+Result<std::shared_ptr<ListArray>> ListArray::FromArrays(const Array& offsets,
+ const Array& values,
+ MemoryPool* pool) {
+ return ListArrayFromArrays<ListType>(offsets, values, pool);
+}
+
+Result<std::shared_ptr<LargeListArray>> LargeListArray::FromArrays(const Array& offsets,
+ const Array& values,
+ MemoryPool* pool) {
+ return ListArrayFromArrays<LargeListType>(offsets, values, pool);
+}
+
+Result<std::shared_ptr<Array>> ListArray::Flatten(MemoryPool* memory_pool) const {
+ return FlattenListArray(*this, memory_pool);
+}
+
+Result<std::shared_ptr<Array>> LargeListArray::Flatten(MemoryPool* memory_pool) const {
+ return FlattenListArray(*this, memory_pool);
+}
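[Editor's note: a sketch of FromArrays and Flatten, not part of the upstream
diff; it assumes arrow/api.h, the numeric builders, and the default memory
pool argument of FromArrays.]

    #include <cassert>
    #include <memory>
    #include "arrow/api.h"

    arrow::Status ListExample() {
      // Offsets [0, 2, 2, 5] describe three lists: [1 2], [], [3 4 5].
      arrow::Int32Builder offsets_builder;
      ARROW_RETURN_NOT_OK(offsets_builder.AppendValues({0, 2, 2, 5}));
      std::shared_ptr<arrow::Array> offsets;
      ARROW_RETURN_NOT_OK(offsets_builder.Finish(&offsets));

      arrow::Int64Builder values_builder;
      ARROW_RETURN_NOT_OK(values_builder.AppendValues({1, 2, 3, 4, 5}));
      std::shared_ptr<arrow::Array> values;
      ARROW_RETURN_NOT_OK(values_builder.Finish(&values));

      ARROW_ASSIGN_OR_RAISE(auto list_array,
                            arrow::ListArray::FromArrays(*offsets, *values));
      // With no nulls, Flatten() reduces to a slice of the values array.
      ARROW_ASSIGN_OR_RAISE(auto flat, list_array->Flatten());
      assert(flat->length() == 5);
      return arrow::Status::OK();
    }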
+
+static std::shared_ptr<Array> BoxOffsets(const std::shared_ptr<DataType>& boxed_type,
+ const ArrayData& data) {
+ std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, data.buffers[1]};
+ auto offsets_data =
+ std::make_shared<ArrayData>(boxed_type, data.length + 1, std::move(buffers),
+ /*null_count=*/0, data.offset);
+ return MakeArray(offsets_data);
+}
+
+std::shared_ptr<Array> ListArray::offsets() const { return BoxOffsets(int32(), *data_); }
+
+std::shared_ptr<Array> LargeListArray::offsets() const {
+ return BoxOffsets(int64(), *data_);
+}
+
+// ----------------------------------------------------------------------
+// MapArray
+
+MapArray::MapArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+
+MapArray::MapArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& offsets,
+ const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
+ int64_t offset) {
+ SetData(ArrayData::Make(type, length, {null_bitmap, offsets}, {values->data()},
+ null_count, offset));
+}
+
+MapArray::MapArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& offsets,
+ const std::shared_ptr<Array>& keys,
+ const std::shared_ptr<Array>& items,
+ const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
+ int64_t offset) {
+ auto pair_data = ArrayData::Make(type->fields()[0]->type(), keys->data()->length,
+ {nullptr}, {keys->data(), items->data()}, 0, offset);
+ auto map_data = ArrayData::Make(type, length, {null_bitmap, offsets}, {pair_data},
+ null_count, offset);
+ SetData(map_data);
+}
+
+Result<std::shared_ptr<Array>> MapArray::FromArraysInternal(
+ std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
+ const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
+ MemoryPool* pool) {
+ using offset_type = typename MapType::offset_type;
+ using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
+
+ if (offsets->length() == 0) {
+ return Status::Invalid("Map offsets must have non-zero length");
+ }
+
+ if (offsets->type_id() != OffsetArrowType::type_id) {
+ return Status::TypeError("Map offsets must be ", OffsetArrowType::type_name());
+ }
+
+ if (keys->null_count() != 0) {
+ return Status::Invalid("Map can not contain NULL valued keys");
+ }
+
+ if (keys->length() != items->length()) {
+ return Status::Invalid("Map key and item arrays must be equal length");
+ }
+
+ std::shared_ptr<Buffer> offset_buf, validity_buf;
+ RETURN_NOT_OK(CleanListOffsets<MapType>(*offsets, pool, &offset_buf, &validity_buf));
+
+ return std::make_shared<MapArray>(type, offsets->length() - 1, offset_buf, keys, items,
+ validity_buf, offsets->null_count(),
+ offsets->offset());
+}
+
+Result<std::shared_ptr<Array>> MapArray::FromArrays(const std::shared_ptr<Array>& offsets,
+ const std::shared_ptr<Array>& keys,
+ const std::shared_ptr<Array>& items,
+ MemoryPool* pool) {
+ return FromArraysInternal(std::make_shared<MapType>(keys->type(), items->type()),
+ offsets, keys, items, pool);
+}
+
+Result<std::shared_ptr<Array>> MapArray::FromArrays(std::shared_ptr<DataType> type,
+ const std::shared_ptr<Array>& offsets,
+ const std::shared_ptr<Array>& keys,
+ const std::shared_ptr<Array>& items,
+ MemoryPool* pool) {
+ if (type->id() != Type::MAP) {
+ return Status::TypeError("Expected map type, got ", type->ToString());
+ }
+ const auto& map_type = checked_cast<const MapType&>(*type);
+ if (!map_type.key_type()->Equals(keys->type())) {
+ return Status::TypeError("Mismatching map keys type");
+ }
+ if (!map_type.item_type()->Equals(items->type())) {
+ return Status::TypeError("Mismatching map items type");
+ }
+ return FromArraysInternal(std::move(type), offsets, keys, items, pool);
+}
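[Editor's note: a sketch of MapArray::FromArrays, not part of the upstream
diff; it assumes arrow/api.h and the builders used below.]

    #include <memory>
    #include "arrow/api.h"

    arrow::Status MapExample() {
      // Offsets [0, 2] describe a single map holding two key/item pairs.
      arrow::Int32Builder offsets_builder;
      ARROW_RETURN_NOT_OK(offsets_builder.AppendValues({0, 2}));
      std::shared_ptr<arrow::Array> offsets;
      ARROW_RETURN_NOT_OK(offsets_builder.Finish(&offsets));

      arrow::StringBuilder key_builder;
      ARROW_RETURN_NOT_OK(key_builder.AppendValues({"width", "height"}));
      std::shared_ptr<arrow::Array> keys;
      ARROW_RETURN_NOT_OK(key_builder.Finish(&keys));

      arrow::Int32Builder item_builder;
      ARROW_RETURN_NOT_OK(item_builder.AppendValues({640, 480}));
      std::shared_ptr<arrow::Array> items;
      ARROW_RETURN_NOT_OK(item_builder.Finish(&items));

      // Keys must be non-null and equal in length to items, as checked above.
      ARROW_ASSIGN_OR_RAISE(auto map_array,
                            arrow::MapArray::FromArrays(offsets, keys, items));
      return map_array->ValidateFull();
    }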
+
+Status MapArray::ValidateChildData(
+ const std::vector<std::shared_ptr<ArrayData>>& child_data) {
+ if (child_data.size() != 1) {
+ return Status::Invalid("Expected one child array for map array");
+ }
+ const auto& pair_data = child_data[0];
+ if (pair_data->type->id() != Type::STRUCT) {
+ return Status::Invalid("Map array child array should have struct type");
+ }
+ if (pair_data->null_count != 0) {
+ return Status::Invalid("Map array child array should have no nulls");
+ }
+ if (pair_data->child_data.size() != 2) {
+ return Status::Invalid("Map array child array should have two fields");
+ }
+ if (pair_data->child_data[0]->null_count != 0) {
+ return Status::Invalid("Map array keys array should have no nulls");
+ }
+ return Status::OK();
+}
+
+void MapArray::SetData(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK_OK(ValidateChildData(data->child_data));
+
+ internal::SetListData(this, data, Type::MAP);
+ map_type_ = checked_cast<const MapType*>(data->type.get());
+ const auto& pair_data = data->child_data[0];
+ keys_ = MakeArray(pair_data->child_data[0]);
+ items_ = MakeArray(pair_data->child_data[1]);
+}
+
+// ----------------------------------------------------------------------
+// FixedSizeListArray
+
+FixedSizeListArray::FixedSizeListArray(const std::shared_ptr<ArrayData>& data) {
+ SetData(data);
+}
+
+FixedSizeListArray::FixedSizeListArray(const std::shared_ptr<DataType>& type,
+ int64_t length,
+ const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Buffer>& null_bitmap,
+ int64_t null_count, int64_t offset) {
+ auto internal_data = ArrayData::Make(type, length, {null_bitmap}, null_count, offset);
+ internal_data->child_data.emplace_back(values->data());
+ SetData(internal_data);
+}
+
+void FixedSizeListArray::SetData(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::FIXED_SIZE_LIST);
+ this->Array::SetData(data);
+
+ ARROW_CHECK_EQ(list_type()->value_type()->id(), data->child_data[0]->type->id());
+ DCHECK(list_type()->value_type()->Equals(data->child_data[0]->type));
+ list_size_ = list_type()->list_size();
+
+ ARROW_CHECK_EQ(data_->child_data.size(), 1);
+ values_ = MakeArray(data_->child_data[0]);
+}
+
+const FixedSizeListType* FixedSizeListArray::list_type() const {
+ return checked_cast<const FixedSizeListType*>(data_->type.get());
+}
+
+std::shared_ptr<DataType> FixedSizeListArray::value_type() const {
+ return list_type()->value_type();
+}
+
+std::shared_ptr<Array> FixedSizeListArray::values() const { return values_; }
+
+Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
+ const std::shared_ptr<Array>& values, int32_t list_size) {
+ if (list_size <= 0) {
+ return Status::Invalid("list_size needs to be a strict positive integer");
+ }
+
+ if ((values->length() % list_size) != 0) {
+ return Status::Invalid(
+ "The length of the values Array needs to be a multiple of the list_size");
+ }
+ int64_t length = values->length() / list_size;
+ auto list_type = std::make_shared<FixedSizeListType>(values->type(), list_size);
+ std::shared_ptr<Buffer> validity_buf;
+
+ return std::make_shared<FixedSizeListArray>(list_type, length, values, validity_buf,
+ /*null_count=*/0, /*offset=*/0);
+}
+
+// ----------------------------------------------------------------------
+// Struct
+
+StructArray::StructArray(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::STRUCT);
+ SetData(data);
+ boxed_fields_.resize(data->child_data.size());
+}
+
+StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::vector<std::shared_ptr<Array>>& children,
+ std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
+ int64_t offset) {
+ ARROW_CHECK_EQ(type->id(), Type::STRUCT);
+ SetData(ArrayData::Make(type, length, {null_bitmap}, null_count, offset));
+ for (const auto& child : children) {
+ data_->child_data.push_back(child->data());
+ }
+ boxed_fields_.resize(children.size());
+}
+
+Result<std::shared_ptr<StructArray>> StructArray::Make(
+ const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::shared_ptr<Field>>& fields,
+ std::shared_ptr<Buffer> null_bitmap, int64_t null_count, int64_t offset) {
+ if (children.size() != fields.size()) {
+ return Status::Invalid("Mismatching number of fields and child arrays");
+ }
+ int64_t length = 0;
+ if (children.size() == 0) {
+ return Status::Invalid("Can't infer struct array length with 0 child arrays");
+ }
+ length = children.front()->length();
+ for (const auto& child : children) {
+ if (length != child->length()) {
+ return Status::Invalid("Mismatching child array lengths");
+ }
+ }
+ if (offset > length) {
+ return Status::IndexError("Offset greater than length of child arrays");
+ }
+ if (null_bitmap == nullptr) {
+ if (null_count > 0) {
+ return Status::Invalid("null_count = ", null_count, " but no null bitmap given");
+ }
+ null_count = 0;
+ }
+ return std::make_shared<StructArray>(struct_(fields), length - offset, children,
+ null_bitmap, null_count, offset);
+}
+
+Result<std::shared_ptr<StructArray>> StructArray::Make(
+ const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names, std::shared_ptr<Buffer> null_bitmap,
+ int64_t null_count, int64_t offset) {
+ if (children.size() != field_names.size()) {
+ return Status::Invalid("Mismatching number of field names and child arrays");
+ }
+ std::vector<std::shared_ptr<Field>> fields(children.size());
+ for (size_t i = 0; i < children.size(); ++i) {
+ fields[i] = ::arrow::field(field_names[i], children[i]->type());
+ }
+ return Make(children, fields, std::move(null_bitmap), null_count, offset);
+}
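[Editor's note: a sketch of StructArray::Make and Flatten, not part of the
upstream diff; it assumes arrow/api.h.]

    #include <cassert>
    #include <memory>
    #include "arrow/api.h"

    arrow::Status StructExample() {
      arrow::Int32Builder id_builder;
      ARROW_RETURN_NOT_OK(id_builder.AppendValues({1, 2}));
      std::shared_ptr<arrow::Array> ids;
      ARROW_RETURN_NOT_OK(id_builder.Finish(&ids));

      arrow::StringBuilder name_builder;
      ARROW_RETURN_NOT_OK(name_builder.AppendValues({"a", "b"}));
      std::shared_ptr<arrow::Array> names;
      ARROW_RETURN_NOT_OK(name_builder.Finish(&names));

      // Make() infers the struct type and checks that child lengths match.
      ARROW_ASSIGN_OR_RAISE(auto struct_array,
                            arrow::StructArray::Make({ids, names}, {"id", "name"}));
      // Flatten() pushes the struct-level validity down into each field.
      ARROW_ASSIGN_OR_RAISE(auto fields, struct_array->Flatten());
      assert(fields.size() == 2);
      return arrow::Status::OK();
    }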
+
+const StructType* StructArray::struct_type() const {
+ return checked_cast<const StructType*>(data_->type.get());
+}
+
+const ArrayVector& StructArray::fields() const {
+ for (int i = 0; i < num_fields(); ++i) {
+ (void)field(i);
+ }
+ return boxed_fields_;
+}
+
+std::shared_ptr<Array> StructArray::field(int i) const {
+ std::shared_ptr<Array> result = internal::atomic_load(&boxed_fields_[i]);
+ if (!result) {
+ std::shared_ptr<ArrayData> field_data;
+ if (data_->offset != 0 || data_->child_data[i]->length != data_->length) {
+ field_data = data_->child_data[i]->Slice(data_->offset, data_->length);
+ } else {
+ field_data = data_->child_data[i];
+ }
+ result = MakeArray(field_data);
+ internal::atomic_store(&boxed_fields_[i], result);
+ }
+ return result;
+}
+
+std::shared_ptr<Array> StructArray::GetFieldByName(const std::string& name) const {
+ int i = struct_type()->GetFieldIndex(name);
+ return i == -1 ? nullptr : field(i);
+}
+
+Result<ArrayVector> StructArray::Flatten(MemoryPool* pool) const {
+ ArrayVector flattened;
+ flattened.reserve(data_->child_data.size());
+ std::shared_ptr<Buffer> null_bitmap = data_->buffers[0];
+
+ for (const auto& child_data_ptr : data_->child_data) {
+ auto child_data = child_data_ptr->Copy();
+
+ std::shared_ptr<Buffer> flattened_null_bitmap;
+ int64_t flattened_null_count = kUnknownNullCount;
+
+ // Need to adjust for parent offset
+ if (data_->offset != 0 || data_->length != child_data->length) {
+ child_data = child_data->Slice(data_->offset, data_->length);
+ }
+ std::shared_ptr<Buffer> child_null_bitmap = child_data->buffers[0];
+ const int64_t child_offset = child_data->offset;
+
+ // The validity of a flattened datum is the logical AND of the struct
+ // element's validity and the individual field element's validity.
+ if (null_bitmap && child_null_bitmap) {
+ ARROW_ASSIGN_OR_RAISE(
+ flattened_null_bitmap,
+ BitmapAnd(pool, child_null_bitmap->data(), child_offset, null_bitmap_data_,
+ data_->offset, data_->length, child_offset));
+ } else if (child_null_bitmap) {
+ flattened_null_bitmap = child_null_bitmap;
+ flattened_null_count = child_data->null_count;
+ } else if (null_bitmap) {
+ if (child_offset == data_->offset) {
+ flattened_null_bitmap = null_bitmap;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ flattened_null_bitmap,
+ CopyBitmap(pool, null_bitmap_data_, data_->offset, data_->length));
+ }
+ flattened_null_count = data_->null_count;
+ } else {
+ flattened_null_count = 0;
+ }
+
+ auto flattened_data = child_data->Copy();
+ flattened_data->buffers[0] = flattened_null_bitmap;
+ flattened_data->null_count = flattened_null_count;
+
+ flattened.push_back(MakeArray(flattened_data));
+ }
+
+ return flattened;
+}
+
+// ----------------------------------------------------------------------
+// UnionArray
+
+void UnionArray::SetData(std::shared_ptr<ArrayData> data) {
+ this->Array::SetData(std::move(data));
+
+ union_type_ = checked_cast<const UnionType*>(data_->type.get());
+
+ ARROW_CHECK_GE(data_->buffers.size(), 2);
+ raw_type_codes_ = data->GetValuesSafe<int8_t>(1, /*offset=*/0);
+ boxed_fields_.resize(data_->child_data.size());
+}
+
+void SparseUnionArray::SetData(std::shared_ptr<ArrayData> data) {
+ this->UnionArray::SetData(std::move(data));
+ ARROW_CHECK_EQ(data_->type->id(), Type::SPARSE_UNION);
+ ARROW_CHECK_EQ(data_->buffers.size(), 2);
+
+ // No validity bitmap
+ ARROW_CHECK_EQ(data_->buffers[0], nullptr);
+}
+
+void DenseUnionArray::SetData(const std::shared_ptr<ArrayData>& data) {
+ this->UnionArray::SetData(std::move(data));
+
+ ARROW_CHECK_EQ(data_->type->id(), Type::DENSE_UNION);
+ ARROW_CHECK_EQ(data_->buffers.size(), 3);
+
+ // No validity bitmap
+ ARROW_CHECK_EQ(data_->buffers[0], nullptr);
+
+ raw_value_offsets_ = data->GetValuesSafe<int32_t>(2, /*offset=*/0);
+}
+
+SparseUnionArray::SparseUnionArray(std::shared_ptr<ArrayData> data) {
+ SetData(std::move(data));
+}
+
+SparseUnionArray::SparseUnionArray(std::shared_ptr<DataType> type, int64_t length,
+ ArrayVector children,
+ std::shared_ptr<Buffer> type_codes, int64_t offset) {
+ auto internal_data = ArrayData::Make(std::move(type), length,
+ BufferVector{nullptr, std::move(type_codes)},
+ /*null_count=*/0, offset);
+ for (const auto& child : children) {
+ internal_data->child_data.push_back(child->data());
+ }
+ SetData(std::move(internal_data));
+}
+
+DenseUnionArray::DenseUnionArray(const std::shared_ptr<ArrayData>& data) {
+ SetData(data);
+}
+
+DenseUnionArray::DenseUnionArray(std::shared_ptr<DataType> type, int64_t length,
+ ArrayVector children, std::shared_ptr<Buffer> type_ids,
+ std::shared_ptr<Buffer> value_offsets, int64_t offset) {
+ auto internal_data = ArrayData::Make(
+ std::move(type), length,
+ BufferVector{nullptr, std::move(type_ids), std::move(value_offsets)},
+ /*null_count=*/0, offset);
+ for (const auto& child : children) {
+ internal_data->child_data.push_back(child->data());
+ }
+ SetData(internal_data);
+}
+
+Result<std::shared_ptr<Array>> DenseUnionArray::Make(
+ const Array& type_ids, const Array& value_offsets, ArrayVector children,
+ std::vector<std::string> field_names, std::vector<type_code_t> type_codes) {
+ if (value_offsets.length() == 0) {
+ return Status::Invalid("UnionArray offsets must have non-zero length");
+ }
+
+ if (value_offsets.type_id() != Type::INT32) {
+ return Status::TypeError("UnionArray offsets must be signed int32");
+ }
+
+ if (type_ids.type_id() != Type::INT8) {
+ return Status::TypeError("UnionArray type_ids must be signed int8");
+ }
+
+ if (type_ids.null_count() != 0) {
+ return Status::Invalid("Union type ids may not have nulls");
+ }
+
+ if (value_offsets.null_count() != 0) {
+ return Status::Invalid("Make does not allow nulls in value_offsets");
+ }
+
+ if (field_names.size() > 0 && field_names.size() != children.size()) {
+ return Status::Invalid("field_names must have the same length as children");
+ }
+
+ if (type_codes.size() > 0 && type_codes.size() != children.size()) {
+ return Status::Invalid("type_codes must have the same length as children");
+ }
+
+ BufferVector buffers = {nullptr, checked_cast<const Int8Array&>(type_ids).values(),
+ checked_cast<const Int32Array&>(value_offsets).values()};
+
+ auto union_type = dense_union(children, std::move(field_names), std::move(type_codes));
+ auto internal_data =
+ ArrayData::Make(std::move(union_type), type_ids.length(), std::move(buffers),
+ /*null_count=*/0, type_ids.offset());
+ for (const auto& child : children) {
+ internal_data->child_data.push_back(child->data());
+ }
+ return std::make_shared<DenseUnionArray>(std::move(internal_data));
+}
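+
+// Illustrative usage sketch (not part of the upstream sources), assuming
+// `ids`, `offsets`, `int_child` and `str_child` arrays built elsewhere. In a
+// dense union, type_ids picks the child for each slot and value_offsets
+// indexes into that child:
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       std::shared_ptr<arrow::Array> union_array,
+//       arrow::DenseUnionArray::Make(*ids, *offsets, {int_child, str_child},
+//                                    {"i", "s"}, /*type_codes=*/{0, 1}));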
+
+Result<std::shared_ptr<Array>> SparseUnionArray::Make(
+ const Array& type_ids, ArrayVector children, std::vector<std::string> field_names,
+ std::vector<int8_t> type_codes) {
+ if (type_ids.type_id() != Type::INT8) {
+ return Status::TypeError("UnionArray type_ids must be signed int8");
+ }
+
+ if (type_ids.null_count() != 0) {
+ return Status::Invalid("Union type ids may not have nulls");
+ }
+
+ if (field_names.size() > 0 && field_names.size() != children.size()) {
+ return Status::Invalid("field_names must have the same length as children");
+ }
+
+ if (type_codes.size() > 0 && type_codes.size() != children.size()) {
+ return Status::Invalid("type_codes must have the same length as children");
+ }
+
+ BufferVector buffers = {nullptr, checked_cast<const Int8Array&>(type_ids).values()};
+ auto union_type = sparse_union(children, std::move(field_names), std::move(type_codes));
+ auto internal_data =
+ ArrayData::Make(std::move(union_type), type_ids.length(), std::move(buffers),
+ /*null_count=*/0, type_ids.offset());
+ for (const auto& child : children) {
+ internal_data->child_data.push_back(child->data());
+ if (child->length() != type_ids.length()) {
+ return Status::Invalid(
+ "Sparse UnionArray must have len(child) == len(type_ids) for all children");
+ }
+ }
+ return std::make_shared<SparseUnionArray>(std::move(internal_data));
+}
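+
+// Illustrative usage sketch (not part of the upstream sources), assuming the
+// same children as above. For a sparse union every child must be as long as
+// type_ids; only the slot selected by each type id is logically active:
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       std::shared_ptr<arrow::Array> union_array,
+//       arrow::SparseUnionArray::Make(*ids, {int_child, str_child},
+//                                     {"i", "s"}, /*type_codes=*/{0, 1}));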
+
+std::shared_ptr<Array> UnionArray::child(int i) const { return field(i); }
+
+std::shared_ptr<Array> UnionArray::field(int i) const {
+ if (i < 0 ||
+ static_cast<decltype(boxed_fields_)::size_type>(i) >= boxed_fields_.size()) {
+ return nullptr;
+ }
+ std::shared_ptr<Array> result = internal::atomic_load(&boxed_fields_[i]);
+ if (!result) {
+ std::shared_ptr<ArrayData> child_data = data_->child_data[i]->Copy();
+ if (mode() == UnionMode::SPARSE) {
+ // Sparse union: need to adjust child if union is sliced
+      // (for dense unions, the need to look up values through the offsets
+ // makes this unnecessary)
+ if (data_->offset != 0 || child_data->length > data_->length) {
+ child_data = child_data->Slice(data_->offset, data_->length);
+ }
+ }
+ result = MakeArray(child_data);
+ internal::atomic_store(&boxed_fields_[i], result);
+ }
+ return result;
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.h
new file mode 100644
index 00000000000..d39f33f4702
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.h
@@ -0,0 +1,523 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and
+// Union
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// ListArray
+
+template <typename TYPE>
+class BaseListArray;
+
+namespace internal {
+
+// Private helper for ListArray::SetData.
+// Unfortunately, trying to define BaseListArray::SetData outside of this header
+// doesn't play well with MSVC.
+template <typename TYPE>
+void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
+ Type::type expected_type_id = TYPE::type_id);
+
+} // namespace internal
+
+/// Base class for variable-sized list arrays, regardless of offset size.
+template <typename TYPE>
+class BaseListArray : public Array {
+ public:
+ using TypeClass = TYPE;
+ using offset_type = typename TypeClass::offset_type;
+
+ const TypeClass* list_type() const { return list_type_; }
+
+ /// \brief Return array object containing the list's values
+ std::shared_ptr<Array> values() const { return values_; }
+
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
+
+ std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }
+
+ /// Return pointer to raw value offsets accounting for any slice offset
+ const offset_type* raw_value_offsets() const {
+ return raw_value_offsets_ + data_->offset;
+ }
+
+  // The following functions will not perform bounds checking
+ offset_type value_offset(int64_t i) const {
+ return raw_value_offsets_[i + data_->offset];
+ }
+ offset_type value_length(int64_t i) const {
+ i += data_->offset;
+ return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
+ }
+ std::shared_ptr<Array> value_slice(int64_t i) const {
+ return values_->Slice(value_offset(i), value_length(i));
+ }
+
+ protected:
+ friend void internal::SetListData<TYPE>(BaseListArray<TYPE>* self,
+ const std::shared_ptr<ArrayData>& data,
+ Type::type expected_type_id);
+
+ const TypeClass* list_type_ = NULLPTR;
+ std::shared_ptr<Array> values_;
+ const offset_type* raw_value_offsets_ = NULLPTR;
+};
+
+/// Concrete Array class for list data
+class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
+ public:
+ explicit ListArray(std::shared_ptr<ArrayData> data);
+
+ ListArray(std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Construct ListArray from array of offsets and child value array
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types, and will allocate a new offsets array if necessary (i.e. if
+ /// the offsets contain any nulls). If the offsets do not have nulls, they
+ /// are assumed to be well-formed
+ ///
+ /// \param[in] offsets Array containing n + 1 offsets encoding length and
+ /// size. Must be of int32 type
+ /// \param[in] values Array containing list values
+ /// \param[in] pool MemoryPool in case new offsets array needs to be
+ /// allocated because of null values
+ static Result<std::shared_ptr<ListArray>> FromArrays(
+ const Array& offsets, const Array& values,
+ MemoryPool* pool = default_memory_pool());
+
+ /// \brief Return an Array that is a concatenation of the lists in this array.
+ ///
+  /// Note that this differs from `values()` in that it takes this array's
+  /// offsets into account, and that null elements backed by non-empty lists
+  /// are skipped (so copying may be needed).
+ Result<std::shared_ptr<Array>> Flatten(
+ MemoryPool* memory_pool = default_memory_pool()) const;
+
+ /// \brief Return list offsets as an Int32Array
+ std::shared_ptr<Array> offsets() const;
+
+ protected:
+ // This constructor defers SetData to a derived array class
+ ListArray() = default;
+
+ void SetData(const std::shared_ptr<ArrayData>& data);
+};
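+
+// Illustrative usage sketch (not part of the upstream sources), assuming
+// int32 `offsets` and child `values` arrays built elsewhere. With offsets
+// [0, 2, 2, 5], the three lists have lengths 2, 0 and 3:
+//
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::ListArray> list_array,
+//                         arrow::ListArray::FromArrays(*offsets, *values));
+//   int32_t len0 = list_array->value_length(0);            // 2
+//   std::shared_ptr<arrow::Array> slice0 = list_array->value_slice(0);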
+
+/// Concrete Array class for large list data (with 64-bit offsets)
+class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
+ public:
+ explicit LargeListArray(const std::shared_ptr<ArrayData>& data);
+
+ LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Construct LargeListArray from array of offsets and child value array
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types, and will allocate a new offsets array if necessary (i.e. if
+ /// the offsets contain any nulls). If the offsets do not have nulls, they
+ /// are assumed to be well-formed
+ ///
+ /// \param[in] offsets Array containing n + 1 offsets encoding length and
+ /// size. Must be of int64 type
+ /// \param[in] values Array containing list values
+ /// \param[in] pool MemoryPool in case new offsets array needs to be
+ /// allocated because of null values
+ static Result<std::shared_ptr<LargeListArray>> FromArrays(
+ const Array& offsets, const Array& values,
+ MemoryPool* pool = default_memory_pool());
+
+ /// \brief Return an Array that is a concatenation of the lists in this array.
+ ///
+  /// Note that this differs from `values()` in that it takes this array's
+  /// offsets into account, and that null elements backed by non-empty lists
+  /// are skipped (so copying may be needed).
+ Result<std::shared_ptr<Array>> Flatten(
+ MemoryPool* memory_pool = default_memory_pool()) const;
+
+ /// \brief Return list offsets as an Int64Array
+ std::shared_ptr<Array> offsets() const;
+
+ protected:
+ void SetData(const std::shared_ptr<ArrayData>& data);
+};
+
+// ----------------------------------------------------------------------
+// MapArray
+
+/// Concrete Array class for map data
+///
+/// NB: "value" in this context refers to a pair of a key and the corresponding item
+class ARROW_EXPORT MapArray : public ListArray {
+ public:
+ using TypeClass = MapType;
+
+ explicit MapArray(const std::shared_ptr<ArrayData>& data);
+
+ MapArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ MapArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Construct MapArray from array of offsets and child key, item arrays
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types, and will allocate a new offsets array if necessary (i.e. if
+ /// the offsets contain any nulls). If the offsets do not have nulls, they
+ /// are assumed to be well-formed
+ ///
+ /// \param[in] offsets Array containing n + 1 offsets encoding length and
+ /// size. Must be of int32 type
+ /// \param[in] keys Array containing key values
+ /// \param[in] items Array containing item values
+ /// \param[in] pool MemoryPool in case new offsets array needs to be
+ /// allocated because of null values
+ static Result<std::shared_ptr<Array>> FromArrays(
+ const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
+ const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool());
+
+ static Result<std::shared_ptr<Array>> FromArrays(
+ std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
+ const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
+ MemoryPool* pool = default_memory_pool());
+
+ const MapType* map_type() const { return map_type_; }
+
+ /// \brief Return array object containing all map keys
+ std::shared_ptr<Array> keys() const { return keys_; }
+
+ /// \brief Return array object containing all mapped items
+ std::shared_ptr<Array> items() const { return items_; }
+
+ /// Validate child data before constructing the actual MapArray.
+ static Status ValidateChildData(
+ const std::vector<std::shared_ptr<ArrayData>>& child_data);
+
+ protected:
+ void SetData(const std::shared_ptr<ArrayData>& data);
+
+ static Result<std::shared_ptr<Array>> FromArraysInternal(
+ std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
+ const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
+ MemoryPool* pool);
+
+ private:
+ const MapType* map_type_;
+ std::shared_ptr<Array> keys_, items_;
+};
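+
+// Illustrative usage sketch (not part of the upstream sources), assuming
+// `offsets`, `keys` and `items` arrays built elsewhere, with keys and items
+// of equal length. Each map slot is the run of key/item pairs between two
+// consecutive offsets:
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       std::shared_ptr<arrow::Array> map_array,
+//       arrow::MapArray::FromArrays(offsets, keys, items));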
+
+// ----------------------------------------------------------------------
+// FixedSizeListArray
+
+/// Concrete Array class for fixed size list data
+class ARROW_EXPORT FixedSizeListArray : public Array {
+ public:
+ using TypeClass = FixedSizeListType;
+ using offset_type = TypeClass::offset_type;
+
+ explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);
+
+ FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ const FixedSizeListType* list_type() const;
+
+ /// \brief Return array object containing the list's values
+ std::shared_ptr<Array> values() const;
+
+ std::shared_ptr<DataType> value_type() const;
+
+  // The following functions will not perform bounds checking
+ int32_t value_offset(int64_t i) const {
+ i += data_->offset;
+ return static_cast<int32_t>(list_size_ * i);
+ }
+ int32_t value_length(int64_t i = 0) const {
+ ARROW_UNUSED(i);
+ return list_size_;
+ }
+ std::shared_ptr<Array> value_slice(int64_t i) const {
+ return values_->Slice(value_offset(i), value_length(i));
+ }
+
+ /// \brief Construct FixedSizeListArray from child value array and value_length
+ ///
+ /// \param[in] values Array containing list values
+ /// \param[in] list_size The fixed length of each list
+ /// \return Will have length equal to values.length() / list_size
+ static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
+ int32_t list_size);
+
+ protected:
+ void SetData(const std::shared_ptr<ArrayData>& data);
+ int32_t list_size_;
+
+ private:
+ std::shared_ptr<Array> values_;
+};
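+
+// Illustrative usage sketch (not part of the upstream sources), assuming a
+// child `values` array built elsewhere. FromArrays infers the result length
+// as values.length() / list_size, so a 6-element child with list_size 3
+// yields a 2-element FixedSizeListArray:
+//
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> fsl,
+//                         arrow::FixedSizeListArray::FromArrays(values, 3));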
+
+// ----------------------------------------------------------------------
+// Struct
+
+/// Concrete Array class for struct data
+class ARROW_EXPORT StructArray : public Array {
+ public:
+ using TypeClass = StructType;
+
+ explicit StructArray(const std::shared_ptr<ArrayData>& data);
+
+ StructArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::vector<std::shared_ptr<Array>>& children,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Return a StructArray from child arrays and field names.
+ ///
+ /// The length and data type are automatically inferred from the arguments.
+ /// There should be at least one child array.
+ static Result<std::shared_ptr<StructArray>> Make(
+ const ArrayVector& children, const std::vector<std::string>& field_names,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Return a StructArray from child arrays and fields.
+ ///
+ /// The length is automatically inferred from the arguments.
+ /// There should be at least one child array. This method does not
+ /// check that field types and child array types are consistent.
+ static Result<std::shared_ptr<StructArray>> Make(
+ const ArrayVector& children, const FieldVector& fields,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ const StructType* struct_type() const;
+
+ // Return a shared pointer in case the requestor desires to share ownership
+ // with this array. The returned array has its offset, length and null
+ // count adjusted.
+ std::shared_ptr<Array> field(int pos) const;
+
+ const ArrayVector& fields() const;
+
+ /// Returns null if name not found
+ std::shared_ptr<Array> GetFieldByName(const std::string& name) const;
+
+ /// \brief Flatten this array as a vector of arrays, one for each field
+ ///
+ /// \param[in] pool The pool to allocate null bitmaps from, if necessary
+ Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const;
+
+ private:
+ // For caching boxed child data
+ // XXX This is not handled in a thread-safe manner.
+ mutable ArrayVector boxed_fields_;
+};
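+
+// Illustrative usage sketch (not part of the upstream sources), assuming
+// equal-length `ints` and `strs` child arrays built elsewhere. Make infers
+// the struct type and length from the children:
+//
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::StructArray> struct_array,
+//                         arrow::StructArray::Make({ints, strs}, {"i", "s"}));
+//   std::shared_ptr<arrow::Array> i = struct_array->GetFieldByName("i");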
+
+// ----------------------------------------------------------------------
+// Union
+
+/// Base class for SparseUnionArray and DenseUnionArray
+class ARROW_EXPORT UnionArray : public Array {
+ public:
+ using type_code_t = int8_t;
+
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> type_codes() const { return data_->buffers[1]; }
+
+ const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; }
+
+ /// The physical child id containing value at index.
+ int child_id(int64_t i) const {
+ return union_type_->child_ids()[raw_type_codes_[i + data_->offset]];
+ }
+
+ const UnionType* union_type() const { return union_type_; }
+
+ UnionMode::type mode() const { return union_type_->mode(); }
+
+ // Return the given field as an individual array.
+ // For sparse unions, the returned array has its offset, length and null
+ // count adjusted.
+ ARROW_DEPRECATED("Deprecated in 1.0.0. Use field(pos)")
+ std::shared_ptr<Array> child(int pos) const;
+
+ /// \brief Return the given field as an individual array.
+ ///
+ /// For sparse unions, the returned array has its offset, length and null
+ /// count adjusted.
+ std::shared_ptr<Array> field(int pos) const;
+
+ protected:
+ void SetData(std::shared_ptr<ArrayData> data);
+
+ const type_code_t* raw_type_codes_;
+ const UnionType* union_type_;
+
+ // For caching boxed child data
+ mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
+};
+
+/// Concrete Array class for sparse union data
+class ARROW_EXPORT SparseUnionArray : public UnionArray {
+ public:
+ using TypeClass = SparseUnionType;
+
+ explicit SparseUnionArray(std::shared_ptr<ArrayData> data);
+
+ SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
+ std::shared_ptr<Buffer> type_ids, int64_t offset = 0);
+
+ /// \brief Construct SparseUnionArray from type_ids and children
+ ///
+ /// This function does the bare minimum of validation of the input types.
+ ///
+ /// \param[in] type_ids An array of logical type ids for the union type
+ /// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] type_codes Vector of type codes.
+ static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
+ std::vector<type_code_t> type_codes) {
+ return Make(std::move(type_ids), std::move(children), std::vector<std::string>{},
+ std::move(type_codes));
+ }
+
+ /// \brief Construct SparseUnionArray with custom field names from type_ids and children
+ ///
+ /// This function does the bare minimum of validation of the input types.
+ ///
+ /// \param[in] type_ids An array of logical type ids for the union type
+ /// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] field_names Vector of strings containing the name of each field.
+ /// \param[in] type_codes Vector of type codes.
+ static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
+ std::vector<std::string> field_names = {},
+ std::vector<type_code_t> type_codes = {});
+
+ const SparseUnionType* union_type() const {
+ return internal::checked_cast<const SparseUnionType*>(union_type_);
+ }
+
+ protected:
+ void SetData(std::shared_ptr<ArrayData> data);
+};
+
+/// \brief Concrete Array class for dense union data
+///
+/// Note that union types do not have a validity bitmap
+class ARROW_EXPORT DenseUnionArray : public UnionArray {
+ public:
+ using TypeClass = DenseUnionType;
+
+ explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);
+
+ DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
+ std::shared_ptr<Buffer> type_ids,
+ std::shared_ptr<Buffer> value_offsets = NULLPTR, int64_t offset = 0);
+
+ /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types.
+ ///
+ /// \param[in] type_ids An array of logical type ids for the union type
+ /// \param[in] value_offsets An array of signed int32 values indicating the
+ /// relative offset into the respective child array for the type in a given slot.
+ /// The respective offsets for each child value array must be in order / increasing.
+ /// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] type_codes Vector of type codes.
+ static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
+ const Array& value_offsets,
+ ArrayVector children,
+ std::vector<type_code_t> type_codes) {
+ return Make(type_ids, value_offsets, std::move(children), std::vector<std::string>{},
+ std::move(type_codes));
+ }
+
+ /// \brief Construct DenseUnionArray with custom field names from type_ids,
+ /// value_offsets, and children
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types.
+ ///
+ /// \param[in] type_ids An array of logical type ids for the union type
+ /// \param[in] value_offsets An array of signed int32 values indicating the
+ /// relative offset into the respective child array for the type in a given slot.
+ /// The respective offsets for each child value array must be in order / increasing.
+ /// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] field_names Vector of strings containing the name of each field.
+ /// \param[in] type_codes Vector of type codes.
+ static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
+ const Array& value_offsets,
+ ArrayVector children,
+ std::vector<std::string> field_names = {},
+ std::vector<type_code_t> type_codes = {});
+
+ const DenseUnionType* union_type() const {
+ return internal::checked_cast<const DenseUnionType*>(union_type_);
+ }
+
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }
+
+ int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
+
+ const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
+
+ protected:
+ const int32_t* raw_value_offsets_;
+
+ void SetData(const std::shared_ptr<ArrayData>& data);
+};
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.cc
new file mode 100644
index 00000000000..a1aff933af4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.cc
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/array_primitive.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/array_base.h"
+#include "arrow/type.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Primitive array base
+
+PrimitiveArray::PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap,
+ int64_t null_count, int64_t offset) {
+ SetData(ArrayData::Make(type, length, {null_bitmap, data}, null_count, offset));
+}
+
+// ----------------------------------------------------------------------
+// BooleanArray
+
+BooleanArray::BooleanArray(const std::shared_ptr<ArrayData>& data)
+ : PrimitiveArray(data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::BOOL);
+}
+
+BooleanArray::BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
+ int64_t offset)
+ : PrimitiveArray(boolean(), length, data, null_bitmap, null_count, offset) {}
+
+int64_t BooleanArray::false_count() const {
+ return this->length() - this->null_count() - this->true_count();
+}
+
+int64_t BooleanArray::true_count() const {
+ if (data_->null_count.load() != 0) {
+ DCHECK(data_->buffers[0]);
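+    // Count bits set in both the validity bitmap (buffers[0]) and the value
+    // bitmap (buffers[1]): a slot counts iff it is valid and true.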
+ internal::BinaryBitBlockCounter bit_counter(data_->buffers[0]->data(), data_->offset,
+ data_->buffers[1]->data(), data_->offset,
+ data_->length);
+ int64_t count = 0;
+ while (true) {
+ internal::BitBlockCount block = bit_counter.NextAndWord();
+ if (block.length == 0) {
+ break;
+ }
+ count += block.popcount;
+ }
+ return count;
+ } else {
+ return internal::CountSetBits(data_->buffers[1]->data(), data_->offset,
+ data_->length);
+ }
+}
+
+// ----------------------------------------------------------------------
+// Day time interval
+
+DayTimeIntervalArray::DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data) {
+ SetData(data);
+}
+
+DayTimeIntervalArray::DayTimeIntervalArray(const std::shared_ptr<DataType>& type,
+ int64_t length,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap,
+ int64_t null_count, int64_t offset)
+ : PrimitiveArray(type, length, data, null_bitmap, null_count, offset) {}
+
+DayTimeIntervalType::DayMilliseconds DayTimeIntervalArray::GetValue(int64_t i) const {
+  DCHECK_LT(i, length());
+ return *reinterpret_cast<const DayTimeIntervalType::DayMilliseconds*>(
+ raw_values_ + (i + data_->offset) * byte_width());
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h
new file mode 100644
index 00000000000..b601eb770c3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Array accessor types for primitive/C-type-based arrays, such as numbers,
+// boolean, and temporal types.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/stl_iterator.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h" // IWYU pragma: export
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// Concrete Array class for numeric data.
+template <typename TYPE>
+class NumericArray : public PrimitiveArray {
+ public:
+ using TypeClass = TYPE;
+ using value_type = typename TypeClass::c_type;
+ using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
+
+ explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {}
+
+ // Only enable this constructor without a type argument for types without additional
+ // metadata
+ template <typename T1 = TYPE>
+ NumericArray(enable_if_parameter_free<T1, int64_t> length,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+ : PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap,
+ null_count, offset) {}
+
+ const value_type* raw_values() const {
+ return reinterpret_cast<const value_type*>(raw_values_) + data_->offset;
+ }
+
+ value_type Value(int64_t i) const { return raw_values()[i]; }
+
+ // For API compatibility with BinaryArray etc.
+ value_type GetView(int64_t i) const { return Value(i); }
+
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+ using PrimitiveArray::PrimitiveArray;
+};
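+
+// Illustrative usage sketch (not part of the upstream sources), assuming a
+// std::shared_ptr<arrow::Array> `array` of type float64. NumericArray gives
+// zero-copy element access plus STL-style iteration over optional values:
+//
+//   auto doubles = std::static_pointer_cast<arrow::DoubleArray>(array);
+//   double first = doubles->Value(0);  // no bounds or null check
+//   for (auto maybe_value : *doubles) {
+//     // maybe_value is empty for null slots
+//   }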
+
+/// Concrete Array class for boolean data
+class ARROW_EXPORT BooleanArray : public PrimitiveArray {
+ public:
+ using TypeClass = BooleanType;
+ using IteratorType = stl::ArrayIterator<BooleanArray>;
+
+ explicit BooleanArray(const std::shared_ptr<ArrayData>& data);
+
+ BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ bool Value(int64_t i) const {
+ return BitUtil::GetBit(reinterpret_cast<const uint8_t*>(raw_values_),
+ i + data_->offset);
+ }
+
+ bool GetView(int64_t i) const { return Value(i); }
+
+ /// \brief Return the number of false (0) values among the valid
+ /// values. Result is not cached.
+ int64_t false_count() const;
+
+ /// \brief Return the number of true (1) values among the valid
+ /// values. Result is not cached.
+ int64_t true_count() const;
+
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+ using PrimitiveArray::PrimitiveArray;
+};
+
+/// Concrete Array class for day-time interval data, where each value is a
+/// pair of a day count and a millisecond count
+class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
+ public:
+ using TypeClass = DayTimeIntervalType;
+
+ explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data);
+
+ DayTimeIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ TypeClass::DayMilliseconds GetValue(int64_t i) const;
+ TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); }
+
+ // For compatibility with Take kernel.
+ TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); }
+
+ int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); }
+
+ const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
+};
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.cc
new file mode 100644
index 00000000000..36e5546a749
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.cc
@@ -0,0 +1,380 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_adaptive.h"
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/int_util.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::AdaptiveIntBuilderBase;
+
+AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool)
+ : ArrayBuilder(pool), start_int_size_(start_int_size), int_size_(start_int_size) {}
+
+void AdaptiveIntBuilderBase::Reset() {
+ ArrayBuilder::Reset();
+ data_.reset();
+ raw_data_ = nullptr;
+ pending_pos_ = 0;
+ pending_has_nulls_ = false;
+ int_size_ = start_int_size_;
+}
+
+Status AdaptiveIntBuilderBase::Resize(int64_t capacity) {
+ RETURN_NOT_OK(CheckCapacity(capacity));
+ capacity = std::max(capacity, kMinBuilderCapacity);
+
+ int64_t nbytes = capacity * int_size_;
+ if (capacity_ == 0) {
+ ARROW_ASSIGN_OR_RAISE(data_, AllocateResizableBuffer(nbytes, pool_));
+ } else {
+ RETURN_NOT_OK(data_->Resize(nbytes));
+ }
+ raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
+
+ return ArrayBuilder::Resize(capacity);
+}
+
+template <typename new_type, typename old_type>
+typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
+AdaptiveIntBuilderBase::ExpandIntSizeInternal() {
+ return Status::OK();
+}
+
+template <typename new_type, typename old_type>
+typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
+AdaptiveIntBuilderBase::ExpandIntSizeInternal() {
+ int_size_ = sizeof(new_type);
+ RETURN_NOT_OK(Resize(data_->size() / sizeof(old_type)));
+
+ const old_type* src = reinterpret_cast<old_type*>(raw_data_);
+ new_type* dst = reinterpret_cast<new_type*>(raw_data_);
+  // Copying backward ensures that no source element is overwritten before it
+  // has been copied, even though the copy happens in place.
+ std::copy_backward(src, src + length_, dst + length_);
+
+ return Status::OK();
+}
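+
+// Illustration (not part of the upstream sources): widening int8 -> int16 in
+// place over values [1, 2, 3], a forward copy would clobber src[1] (byte 1)
+// while writing dst[0] (bytes 0-1); the backward copy writes dst[2], dst[1],
+// dst[0] in that order, reading every source byte before overwriting it.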
+
+std::shared_ptr<DataType> AdaptiveUIntBuilder::type() const {
+ auto int_size = int_size_;
+ if (pending_pos_ != 0) {
+ const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr;
+ int_size =
+ internal::DetectUIntWidth(pending_data_, valid_bytes, pending_pos_, int_size_);
+ }
+ switch (int_size) {
+ case 1:
+ return uint8();
+ case 2:
+ return uint16();
+ case 4:
+ return uint32();
+ case 8:
+ return uint64();
+ default:
+ DCHECK(false);
+ }
+ return nullptr;
+}
+
+std::shared_ptr<DataType> AdaptiveIntBuilder::type() const {
+ auto int_size = int_size_;
+ if (pending_pos_ != 0) {
+ const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr;
+ int_size = internal::DetectIntWidth(reinterpret_cast<const int64_t*>(pending_data_),
+ valid_bytes, pending_pos_, int_size_);
+ }
+ switch (int_size) {
+ case 1:
+ return int8();
+ case 2:
+ return int16();
+ case 4:
+ return int32();
+ case 8:
+ return int64();
+ default:
+ DCHECK(false);
+ }
+ return nullptr;
+}
+
+AdaptiveIntBuilder::AdaptiveIntBuilder(uint8_t start_int_size, MemoryPool* pool)
+ : AdaptiveIntBuilderBase(start_int_size, pool) {}
+
+Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ RETURN_NOT_OK(CommitPendingData());
+
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));
+
+ *out = ArrayData::Make(type(), length_, {null_bitmap, data_}, null_count_);
+
+ data_ = nullptr;
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+}
+
+Status AdaptiveIntBuilder::CommitPendingData() {
+ if (pending_pos_ == 0) {
+ return Status::OK();
+ }
+ RETURN_NOT_OK(Reserve(pending_pos_));
+ const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr;
+ RETURN_NOT_OK(AppendValuesInternal(reinterpret_cast<const int64_t*>(pending_data_),
+ pending_pos_, valid_bytes));
+ pending_has_nulls_ = false;
+ pending_pos_ = 0;
+ return Status::OK();
+}
+
+static constexpr int64_t kAdaptiveIntChunkSize = 8192;
+
+Status AdaptiveIntBuilder::AppendValuesInternal(const int64_t* values, int64_t length,
+ const uint8_t* valid_bytes) {
+ if (pending_pos_ > 0) {
+ // UnsafeAppendToBitmap expects length_ to be the pre-update value, satisfy it
+ DCHECK_EQ(length, pending_pos_) << "AppendValuesInternal called while data pending";
+ length_ -= pending_pos_;
+ }
+
+ while (length > 0) {
+ // In case `length` is very large, we don't want to trash the cache by
+ // scanning it twice (first to detect int width, second to copy the data).
+ // Instead, process data in L2-cacheable chunks.
+ const int64_t chunk_size = std::min(length, kAdaptiveIntChunkSize);
+
+    const uint8_t new_int_size =
+        internal::DetectIntWidth(values, valid_bytes, chunk_size, int_size_);
+
+ DCHECK_GE(new_int_size, int_size_);
+ if (new_int_size > int_size_) {
+ // This updates int_size_
+ RETURN_NOT_OK(ExpandIntSize(new_int_size));
+ }
+
+ switch (int_size_) {
+ case 1:
+ internal::DowncastInts(values, reinterpret_cast<int8_t*>(raw_data_) + length_,
+ chunk_size);
+ break;
+ case 2:
+ internal::DowncastInts(values, reinterpret_cast<int16_t*>(raw_data_) + length_,
+ chunk_size);
+ break;
+ case 4:
+ internal::DowncastInts(values, reinterpret_cast<int32_t*>(raw_data_) + length_,
+ chunk_size);
+ break;
+ case 8:
+ internal::DowncastInts(values, reinterpret_cast<int64_t*>(raw_data_) + length_,
+ chunk_size);
+ break;
+ default:
+ DCHECK(false);
+ }
+
+ // UnsafeAppendToBitmap increments length_ by chunk_size
+ ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, chunk_size);
+ values += chunk_size;
+ if (valid_bytes != nullptr) {
+ valid_bytes += chunk_size;
+ }
+ length -= chunk_size;
+ }
+
+ return Status::OK();
+}
+
+Status AdaptiveUIntBuilder::CommitPendingData() {
+ if (pending_pos_ == 0) {
+ return Status::OK();
+ }
+ RETURN_NOT_OK(Reserve(pending_pos_));
+ const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr;
+ RETURN_NOT_OK(AppendValuesInternal(pending_data_, pending_pos_, valid_bytes));
+ pending_has_nulls_ = false;
+ pending_pos_ = 0;
+ return Status::OK();
+}
+
+Status AdaptiveIntBuilder::AppendValues(const int64_t* values, int64_t length,
+ const uint8_t* valid_bytes) {
+ RETURN_NOT_OK(CommitPendingData());
+ RETURN_NOT_OK(Reserve(length));
+
+ return AppendValuesInternal(values, length, valid_bytes);
+}
+
+template <typename new_type>
+Status AdaptiveIntBuilder::ExpandIntSizeN() {
+ switch (int_size_) {
+ case 1:
+ return ExpandIntSizeInternal<new_type, int8_t>();
+ case 2:
+ return ExpandIntSizeInternal<new_type, int16_t>();
+ case 4:
+ return ExpandIntSizeInternal<new_type, int32_t>();
+ case 8:
+ return ExpandIntSizeInternal<new_type, int64_t>();
+ default:
+ DCHECK(false);
+ }
+ return Status::OK();
+}
+
+Status AdaptiveIntBuilder::ExpandIntSize(uint8_t new_int_size) {
+ switch (new_int_size) {
+ case 1:
+ return ExpandIntSizeN<int8_t>();
+ case 2:
+ return ExpandIntSizeN<int16_t>();
+ case 4:
+ return ExpandIntSizeN<int32_t>();
+ case 8:
+ return ExpandIntSizeN<int64_t>();
+ default:
+ DCHECK(false);
+ }
+ return Status::OK();
+}
+
+AdaptiveUIntBuilder::AdaptiveUIntBuilder(uint8_t start_int_size, MemoryPool* pool)
+ : AdaptiveIntBuilderBase(start_int_size, pool) {}
+
+Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ RETURN_NOT_OK(CommitPendingData());
+
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));
+
+ *out = ArrayData::Make(type(), length_, {null_bitmap, data_}, null_count_);
+
+ data_ = nullptr;
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+}
+
+Status AdaptiveUIntBuilder::AppendValuesInternal(const uint64_t* values, int64_t length,
+ const uint8_t* valid_bytes) {
+ if (pending_pos_ > 0) {
+ // UnsafeAppendToBitmap expects length_ to be the pre-update value, satisfy it
+ DCHECK_EQ(length, pending_pos_) << "AppendValuesInternal called while data pending";
+ length_ -= pending_pos_;
+ }
+
+ while (length > 0) {
+ // See AdaptiveIntBuilder::AppendValuesInternal
+ const int64_t chunk_size = std::min(length, kAdaptiveIntChunkSize);
+
+    const uint8_t new_int_size =
+        internal::DetectUIntWidth(values, valid_bytes, chunk_size, int_size_);
+
+ DCHECK_GE(new_int_size, int_size_);
+ if (new_int_size > int_size_) {
+ // This updates int_size_
+ RETURN_NOT_OK(ExpandIntSize(new_int_size));
+ }
+
+ switch (int_size_) {
+ case 1:
+ internal::DowncastUInts(values, reinterpret_cast<uint8_t*>(raw_data_) + length_,
+ chunk_size);
+ break;
+ case 2:
+ internal::DowncastUInts(values, reinterpret_cast<uint16_t*>(raw_data_) + length_,
+ chunk_size);
+ break;
+ case 4:
+ internal::DowncastUInts(values, reinterpret_cast<uint32_t*>(raw_data_) + length_,
+ chunk_size);
+ break;
+ case 8:
+ internal::DowncastUInts(values, reinterpret_cast<uint64_t*>(raw_data_) + length_,
+ chunk_size);
+ break;
+ default:
+ DCHECK(false);
+ }
+
+ // UnsafeAppendToBitmap increments length_ by chunk_size
+ ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, chunk_size);
+ values += chunk_size;
+ if (valid_bytes != nullptr) {
+ valid_bytes += chunk_size;
+ }
+ length -= chunk_size;
+ }
+
+ return Status::OK();
+}
+
+Status AdaptiveUIntBuilder::AppendValues(const uint64_t* values, int64_t length,
+                                         const uint8_t* valid_bytes) {
+  RETURN_NOT_OK(CommitPendingData());
+  RETURN_NOT_OK(Reserve(length));
+
+ return AppendValuesInternal(values, length, valid_bytes);
+}
+
+template <typename new_type>
+Status AdaptiveUIntBuilder::ExpandIntSizeN() {
+ switch (int_size_) {
+ case 1:
+ return ExpandIntSizeInternal<new_type, uint8_t>();
+ case 2:
+ return ExpandIntSizeInternal<new_type, uint16_t>();
+ case 4:
+ return ExpandIntSizeInternal<new_type, uint32_t>();
+ case 8:
+ return ExpandIntSizeInternal<new_type, uint64_t>();
+ default:
+ DCHECK(false);
+ }
+ return Status::OK();
+}
+
+Status AdaptiveUIntBuilder::ExpandIntSize(uint8_t new_int_size) {
+ switch (new_int_size) {
+ case 1:
+ return ExpandIntSizeN<uint8_t>();
+ case 2:
+ return ExpandIntSizeN<uint16_t>();
+ case 4:
+ return ExpandIntSizeN<uint32_t>();
+ case 8:
+ return ExpandIntSizeN<uint64_t>();
+ default:
+ DCHECK(false);
+ }
+ return Status::OK();
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h
new file mode 100644
index 00000000000..c0df797256d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h
@@ -0,0 +1,203 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <type_traits>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace internal {
+
+class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
+ public:
+ AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool);
+
+ explicit AdaptiveIntBuilderBase(MemoryPool* pool)
+ : AdaptiveIntBuilderBase(sizeof(uint8_t), pool) {}
+
+ /// \brief Append multiple nulls
+ /// \param[in] length the number of nulls to append
+ Status AppendNulls(int64_t length) final {
+ ARROW_RETURN_NOT_OK(CommitPendingData());
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
+ UnsafeSetNull(length);
+ return Status::OK();
+ }
+
+ Status AppendNull() final {
+ pending_data_[pending_pos_] = 0;
+ pending_valid_[pending_pos_] = 0;
+ pending_has_nulls_ = true;
+ ++pending_pos_;
+ ++length_;
+ ++null_count_;
+
+ if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
+ return CommitPendingData();
+ }
+ return Status::OK();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(CommitPendingData());
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
+ pending_data_[pending_pos_] = 0;
+ pending_valid_[pending_pos_] = 1;
+ ++pending_pos_;
+ ++length_;
+
+ if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
+ return CommitPendingData();
+ }
+ return Status::OK();
+ }
+
+ void Reset() override;
+ Status Resize(int64_t capacity) override;
+
+ protected:
+ Status AppendInternal(const uint64_t val) {
+ pending_data_[pending_pos_] = val;
+ pending_valid_[pending_pos_] = 1;
+ ++pending_pos_;
+ ++length_;
+
+ if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
+ return CommitPendingData();
+ }
+ return Status::OK();
+ }
+
+ virtual Status CommitPendingData() = 0;
+
+ template <typename new_type, typename old_type>
+ typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
+ ExpandIntSizeInternal();
+ template <typename new_type, typename old_type>
+ typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
+ ExpandIntSizeInternal();
+
+ std::shared_ptr<ResizableBuffer> data_;
+ uint8_t* raw_data_ = NULLPTR;
+
+ const uint8_t start_int_size_;
+ uint8_t int_size_;
+
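+  // Scalar appends are staged in these fixed-size pending buffers and
+  // committed in batches, so integer width detection runs once per batch
+  // rather than once per appended value.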
+ static constexpr int32_t pending_size_ = 1024;
+ uint8_t pending_valid_[pending_size_];
+ uint64_t pending_data_[pending_size_];
+ int32_t pending_pos_ = 0;
+ bool pending_has_nulls_ = false;
+};
+
+} // namespace internal
+
+class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
+ public:
+ explicit AdaptiveUIntBuilder(uint8_t start_int_size,
+ MemoryPool* pool = default_memory_pool());
+
+ explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool())
+ : AdaptiveUIntBuilder(sizeof(uint8_t), pool) {}
+
+ using ArrayBuilder::Advance;
+ using internal::AdaptiveIntBuilderBase::Reset;
+
+ /// Scalar append
+ Status Append(const uint64_t val) { return AppendInternal(val); }
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous C array of values
+ /// \param[in] length the number of values to append
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value
+ /// \return Status
+ Status AppendValues(const uint64_t* values, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR);
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ std::shared_ptr<DataType> type() const override;
+
+ protected:
+ Status CommitPendingData() override;
+ Status ExpandIntSize(uint8_t new_int_size);
+
+ Status AppendValuesInternal(const uint64_t* values, int64_t length,
+ const uint8_t* valid_bytes);
+
+ template <typename new_type>
+ Status ExpandIntSizeN();
+};
+
+class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
+ public:
+ explicit AdaptiveIntBuilder(uint8_t start_int_size,
+ MemoryPool* pool = default_memory_pool());
+
+ explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool())
+ : AdaptiveIntBuilder(sizeof(uint8_t), pool) {}
+
+ using ArrayBuilder::Advance;
+ using internal::AdaptiveIntBuilderBase::Reset;
+
+ /// Scalar append
+ Status Append(const int64_t val) { return AppendInternal(static_cast<uint64_t>(val)); }
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous C array of values
+ /// \param[in] length the number of values to append
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value
+ /// \return Status
+ Status AppendValues(const int64_t* values, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR);
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ std::shared_ptr<DataType> type() const override;
+
+ protected:
+ Status CommitPendingData() override;
+ Status ExpandIntSize(uint8_t new_int_size);
+
+ Status AppendValuesInternal(const int64_t* values, int64_t length,
+ const uint8_t* valid_bytes);
+
+ template <typename new_type>
+ Status ExpandIntSizeN();
+};
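+
+// Illustrative usage sketch (not part of the upstream sources): the adaptive
+// builders start at one byte per value and widen lazily, so the finished
+// array receives the narrowest integer type that fits all appended values:
+//
+//   arrow::AdaptiveIntBuilder builder;
+//   ARROW_RETURN_NOT_OK(builder.Append(7));        // still fits in int8
+//   ARROW_RETURN_NOT_OK(builder.Append(1 << 20));  // forces at least int32
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> out, builder.Finish());
+//   // out->type()->id() == arrow::Type::INT32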
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc
new file mode 100644
index 00000000000..c892e3d664b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc
@@ -0,0 +1,295 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_base.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/array/util.h"
+#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+Status ArrayBuilder::CheckArrayType(const std::shared_ptr<DataType>& expected_type,
+ const Array& array, const char* message) {
+ if (!expected_type->Equals(*array.type())) {
+ return Status::TypeError(message);
+ }
+ return Status::OK();
+}
+
+Status ArrayBuilder::CheckArrayType(Type::type expected_type, const Array& array,
+ const char* message) {
+ if (array.type_id() != expected_type) {
+ return Status::TypeError(message);
+ }
+ return Status::OK();
+}
+
+Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) {
+ if (buffer) {
+ if (bytes_filled < buffer->size()) {
+ // Trim buffer
+ RETURN_NOT_OK(buffer->Resize(bytes_filled));
+ }
+ // zero the padding
+ buffer->ZeroPadding();
+ } else {
+ // Null buffers are allowed in place of 0-byte buffers
+ DCHECK_EQ(bytes_filled, 0);
+ }
+ return Status::OK();
+}
+
+Status ArrayBuilder::AppendToBitmap(bool is_valid) {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(is_valid);
+ return Status::OK();
+}
+
+Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(valid_bytes, length);
+ return Status::OK();
+}
+
+Status ArrayBuilder::AppendToBitmap(int64_t num_bits, bool value) {
+ RETURN_NOT_OK(Reserve(num_bits));
+ UnsafeAppendToBitmap(num_bits, value);
+ return Status::OK();
+}
+
+Status ArrayBuilder::Resize(int64_t capacity) {
+ RETURN_NOT_OK(CheckCapacity(capacity));
+ capacity_ = capacity;
+ return null_bitmap_builder_.Resize(capacity);
+}
+
+Status ArrayBuilder::Advance(int64_t elements) {
+ if (length_ + elements > capacity_) {
+ return Status::Invalid("Builder must be expanded");
+ }
+ length_ += elements;
+ return null_bitmap_builder_.Advance(elements);
+}
+
+namespace {
+struct AppendScalarImpl {
+ template <typename T>
+ enable_if_t<has_c_type<T>::value || is_decimal_type<T>::value ||
+ is_fixed_size_binary_type<T>::value,
+ Status>
+ Visit(const T&) {
+ auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
+ RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
+
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar =
+ internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ builder->UnsafeAppend(scalar->value);
+ } else {
+ builder->UnsafeAppendNull();
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_base_binary<T, Status> Visit(const T&) {
+ int64_t data_size = 0;
+ for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar =
+ internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ data_size += scalar->value->size();
+ }
+ }
+
+ auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
+ RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
+ RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size));
+
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar =
+ internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ builder->UnsafeAppend(util::string_view{*scalar->value});
+ } else {
+ builder->UnsafeAppendNull();
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_list_like<T, Status> Visit(const T&) {
+ auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
+ int64_t num_children = 0;
+ for (const std::shared_ptr<Scalar>* scalar = scalars_begin_; scalar != scalars_end_;
+ scalar++) {
+ if (!(*scalar)->is_valid) continue;
+ num_children +=
+ internal::checked_cast<const BaseListScalar&>(**scalar).value->length();
+ }
+ RETURN_NOT_OK(builder->value_builder()->Reserve(num_children * n_repeats_));
+
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* scalar = scalars_begin_; scalar != scalars_end_;
+ scalar++) {
+ if ((*scalar)->is_valid) {
+ RETURN_NOT_OK(builder->Append());
+ const Array& list =
+ *internal::checked_cast<const BaseListScalar&>(**scalar).value;
+ for (int64_t i = 0; i < list.length(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i));
+ RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar));
+ }
+ } else {
+ RETURN_NOT_OK(builder_->AppendNull());
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ auto* builder = internal::checked_cast<StructBuilder*>(builder_);
+ auto count = n_repeats_ * (scalars_end_ - scalars_begin_);
+ RETURN_NOT_OK(builder->Reserve(count));
+ for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
+ RETURN_NOT_OK(builder->field_builder(field_index)->Reserve(count));
+ }
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* s = scalars_begin_; s != scalars_end_; s++) {
+ const auto& scalar = internal::checked_cast<const StructScalar&>(**s);
+ for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
+ if (!scalar.is_valid || !scalar.value[field_index]) {
+ RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull());
+ } else {
+ RETURN_NOT_OK(builder->field_builder(field_index)
+ ->AppendScalar(*scalar.value[field_index]));
+ }
+ }
+ RETURN_NOT_OK(builder->Append(scalar.is_valid));
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("AppendScalar for type ", type);
+ }
+
+ Status Convert() { return VisitTypeInline(*(*scalars_begin_)->type, this); }
+
+ const std::shared_ptr<Scalar>* scalars_begin_;
+ const std::shared_ptr<Scalar>* scalars_end_;
+ int64_t n_repeats_;
+ ArrayBuilder* builder_;
+};
+} // namespace
+
+Status ArrayBuilder::AppendScalar(const Scalar& scalar) {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
+ return AppendScalarImpl{&shared, &shared + 1, /*n_repeats=*/1, this}.Convert();
+}
+
+Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
+ return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert();
+}
+
+Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
+ if (scalars.empty()) return Status::OK();
+ const auto ty = type();
+ for (const auto& scalar : scalars) {
+ if (!scalar->type->Equals(ty)) {
+ return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ }
+ return AppendScalarImpl{scalars.data(), scalars.data() + scalars.size(),
+ /*n_repeats=*/1, this}
+ .Convert();
+}
+
+Status ArrayBuilder::Finish(std::shared_ptr<Array>* out) {
+ std::shared_ptr<ArrayData> internal_data;
+ RETURN_NOT_OK(FinishInternal(&internal_data));
+ *out = MakeArray(internal_data);
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> ArrayBuilder::Finish() {
+ std::shared_ptr<Array> out;
+ RETURN_NOT_OK(Finish(&out));
+ return out;
+}
+
+void ArrayBuilder::Reset() {
+ capacity_ = length_ = null_count_ = 0;
+ null_bitmap_builder_.Reset();
+}
+
+Status ArrayBuilder::SetNotNull(int64_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeSetNotNull(length);
+ return Status::OK();
+}
+
+void ArrayBuilder::UnsafeAppendToBitmap(const std::vector<bool>& is_valid) {
+ for (bool element_valid : is_valid) {
+ UnsafeAppendToBitmap(element_valid);
+ }
+}
+
+void ArrayBuilder::UnsafeSetNotNull(int64_t length) {
+ length_ += length;
+ null_bitmap_builder_.UnsafeAppend(length, true);
+}
+
+void ArrayBuilder::UnsafeSetNull(int64_t length) {
+ length_ += length;
+ null_count_ += length;
+ null_bitmap_builder_.UnsafeAppend(length, false);
+}
+
+} // namespace arrow
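For reference, a minimal sketch (not part of the diff) of how the AppendScalar machinery implemented above is used through the generic ArrayBuilder interface. It assumes only the public Arrow C++ headers; the wrapper function name is illustrative:

```cpp
#include <arrow/api.h>

// Append a scalar once, then three more times, via the generic builder API.
arrow::Status AppendScalarExample() {
  arrow::Int64Builder builder;
  std::shared_ptr<arrow::Scalar> one = arrow::MakeScalar(int64_t{1});
  ARROW_RETURN_NOT_OK(builder.AppendScalar(*one));
  ARROW_RETURN_NOT_OK(builder.AppendScalar(*one, /*n_repeats=*/3));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> array, builder.Finish());
  // array->length() == 4; a type mismatch would have returned Status::Invalid.
  return arrow::Status::OK();
}
```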
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h
new file mode 100644
index 00000000000..905b3c1b491
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h
@@ -0,0 +1,276 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm> // IWYU pragma: keep
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_primitive.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+constexpr int64_t kMinBuilderCapacity = 1 << 5;
+constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
+
+/// Base class for all data array builders.
+///
+/// This class provides facilities for incrementally building the null bitmap
+/// (see the Append methods) and, as a side effect, tracks the current number
+/// of slots and the null count.
+///
+/// \note Users are expected to use builders as one of the concrete types below.
+/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
+class ARROW_EXPORT ArrayBuilder {
+ public:
+ explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {}
+
+ virtual ~ArrayBuilder() = default;
+ ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
+
+ /// For nested types. Since the objects are owned by this class instance, we
+ /// skip shared pointers and just return a raw pointer
+ ArrayBuilder* child(int i) { return children_[i].get(); }
+
+ const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }
+
+ int num_children() const { return static_cast<int>(children_.size()); }
+
+ virtual int64_t length() const { return length_; }
+ int64_t null_count() const { return null_count_; }
+ int64_t capacity() const { return capacity_; }
+
+ /// \brief Ensure that enough memory has been allocated to fit the indicated
+ /// number of total elements in the builder, including any that have already
+ /// been appended. Does not account for reallocations that may be due to
+ /// variable size data, like binary values. To make space for incremental
+ /// appends, use Reserve instead.
+ ///
+ /// \param[in] capacity the minimum number of total array values to
+ /// accommodate. Must be greater than the current capacity.
+ /// \return Status
+ virtual Status Resize(int64_t capacity);
+
+ /// \brief Ensure that there is enough space allocated to append the indicated
+ /// number of elements without any further reallocation. Overallocation is
+ /// used in order to minimize the impact of incremental Reserve() calls.
+ /// Note that additional_capacity is relative to the current number of elements
+ /// rather than to the current capacity, so calls to Reserve() which are not
+ /// interspersed with addition of new elements may not increase the capacity.
+ ///
+ /// \param[in] additional_capacity the number of additional array values
+ /// \return Status
+ Status Reserve(int64_t additional_capacity) {
+ auto current_capacity = capacity();
+ auto min_capacity = length() + additional_capacity;
+ if (min_capacity <= current_capacity) return Status::OK();
+
+ // leave growth factor up to BufferBuilder
+ auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
+ return Resize(new_capacity);
+ }
+
+ /// Reset the builder.
+ virtual void Reset();
+
+ /// \brief Append a null value to builder
+ virtual Status AppendNull() = 0;
+ /// \brief Append a number of null values to builder
+ virtual Status AppendNulls(int64_t length) = 0;
+
+ /// \brief Append a non-null value to builder
+ ///
+ /// The appended value is an implementation detail, but the corresponding
+ /// memory slot is guaranteed to be initialized.
+ /// This method is useful when appending a null value to a parent nested type.
+ virtual Status AppendEmptyValue() = 0;
+
+ /// \brief Append a number of non-null values to builder
+ ///
+ /// The appended values are an implementation detail, but the corresponding
+ /// memory slot is guaranteed to be initialized.
+ /// This method is useful when appending null values to a parent nested type.
+ virtual Status AppendEmptyValues(int64_t length) = 0;
+
+ /// \brief Append a value from a scalar
+ Status AppendScalar(const Scalar& scalar);
+ Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
+ Status AppendScalars(const ScalarVector& scalars);
+
+  /// For cases where raw data was memcpy'd into the internal buffers, allows us
+  /// to advance the length of the builder. It is the caller's responsibility to
+  /// ensure the buffers actually hold that many valid elements.
+ Status Advance(int64_t elements);
+
+ /// \brief Return result of builder as an internal generic ArrayData
+ /// object. Resets builder except for dictionary builder
+ ///
+ /// \param[out] out the finalized ArrayData object
+ /// \return Status
+ virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
+
+ /// \brief Return result of builder as an Array object.
+ ///
+ /// The builder is reset except for DictionaryBuilder.
+ ///
+ /// \param[out] out the finalized Array object
+ /// \return Status
+ Status Finish(std::shared_ptr<Array>* out);
+
+ /// \brief Return result of builder as an Array object.
+ ///
+ /// The builder is reset except for DictionaryBuilder.
+ ///
+ /// \return The finalized Array object
+ Result<std::shared_ptr<Array>> Finish();
+
+ /// \brief Return the type of the built Array
+ virtual std::shared_ptr<DataType> type() const = 0;
+
+ protected:
+ /// Append to null bitmap
+ Status AppendToBitmap(bool is_valid);
+
+ /// Vector append. Treat each zero byte as a null. If valid_bytes is null
+ /// assume all of length bits are valid.
+ Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
+
+ /// Uniform append. Append N times the same validity bit.
+ Status AppendToBitmap(int64_t num_bits, bool value);
+
+ /// Set the next length bits to not null (i.e. valid).
+ Status SetNotNull(int64_t length);
+
+ // Unsafe operations (don't check capacity/don't resize)
+
+ void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
+
+ // Append to null bitmap, update the length
+ void UnsafeAppendToBitmap(bool is_valid) {
+ null_bitmap_builder_.UnsafeAppend(is_valid);
+ ++length_;
+ if (!is_valid) ++null_count_;
+ }
+
+  // Vector append. Treat each zero byte as a null. If valid_bytes is null
+ // assume all of length bits are valid.
+ void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
+ if (valid_bytes == NULLPTR) {
+ return UnsafeSetNotNull(length);
+ }
+ null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
+ length_ += length;
+ null_count_ = null_bitmap_builder_.false_count();
+ }
+
+ // Append the same validity value a given number of times.
+ void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
+ if (value) {
+ UnsafeSetNotNull(num_bits);
+ } else {
+ UnsafeSetNull(num_bits);
+ }
+ }
+
+ void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
+
+ // Set the next validity bits to not null (i.e. valid).
+ void UnsafeSetNotNull(int64_t length);
+
+ // Set the next validity bits to null (i.e. invalid).
+ void UnsafeSetNull(int64_t length);
+
+ static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
+
+ /// \brief Finish to an array of the specified ArrayType
+ template <typename ArrayType>
+ Status FinishTyped(std::shared_ptr<ArrayType>* out) {
+ std::shared_ptr<Array> out_untyped;
+ ARROW_RETURN_NOT_OK(Finish(&out_untyped));
+ *out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
+ return Status::OK();
+ }
+
+ // Check the requested capacity for validity
+ Status CheckCapacity(int64_t new_capacity) {
+ if (ARROW_PREDICT_FALSE(new_capacity < 0)) {
+ return Status::Invalid(
+ "Resize capacity must be positive (requested: ", new_capacity, ")");
+ }
+
+ if (ARROW_PREDICT_FALSE(new_capacity < length_)) {
+ return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
+ ", current length: ", length_, ")");
+ }
+
+ return Status::OK();
+ }
+
+ // Check for array type
+ Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
+ const Array& array, const char* message);
+ Status CheckArrayType(Type::type expected_type, const Array& array,
+ const char* message);
+
+ MemoryPool* pool_;
+
+ TypedBufferBuilder<bool> null_bitmap_builder_;
+ int64_t null_count_ = 0;
+
+ // Array length, so far. Also, the index of the next element to be added
+ int64_t length_ = 0;
+ int64_t capacity_ = 0;
+
+ // Child value array builders. These are owned by this class
+ std::vector<std::shared_ptr<ArrayBuilder>> children_;
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
+};
+
+/// \brief Construct an empty ArrayBuilder corresponding to the data
+/// type
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the data type to create the builder for
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ std::unique_ptr<ArrayBuilder>* out);
+
+/// \brief Construct an empty DictionaryBuilder initialized optionally
+/// with a pre-existing dictionary
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the dictionary type to create the builder for
+/// \param[in] dictionary the initial dictionary, if any. May be nullptr
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& dictionary,
+ std::unique_ptr<ArrayBuilder>* out);
+
+} // namespace arrow
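The header above documents the Reserve()/Resize() contract; the following sketch (not part of the diff) shows the intended pattern, assuming the public Arrow C++ headers. The function name is illustrative:

```cpp
#include <arrow/api.h>

// Reserve() takes capacity *additional* to the current length; afterwards the
// Unsafe* methods may be called without further capacity checks.
arrow::Status ReserveExample() {
  arrow::Int64Builder builder;
  ARROW_RETURN_NOT_OK(builder.Reserve(3));
  builder.UnsafeAppend(1);
  builder.UnsafeAppend(2);
  builder.UnsafeAppendNull();

  // MakeBuilder (declared above) constructs a builder from a runtime type.
  std::unique_ptr<arrow::ArrayBuilder> generic;
  ARROW_RETURN_NOT_OK(
      arrow::MakeBuilder(arrow::default_memory_pool(), arrow::utf8(), &generic));

  std::shared_ptr<arrow::Array> out;
  return builder.Finish(&out);
}
```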
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc
new file mode 100644
index 00000000000..6822dc89903
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc
@@ -0,0 +1,199 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_binary.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+// ----------------------------------------------------------------------
+// Fixed width binary
+
+FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool)
+ : ArrayBuilder(pool),
+ byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()),
+ byte_builder_(pool) {}
+
+void FixedSizeBinaryBuilder::CheckValueSize(int64_t size) {
+ DCHECK_EQ(size, byte_width_) << "Appending wrong size to FixedSizeBinaryBuilder";
+}
+
+Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length,
+ const uint8_t* valid_bytes) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(valid_bytes, length);
+ return byte_builder_.Append(data, length * byte_width_);
+}
+
+Status FixedSizeBinaryBuilder::AppendNull() {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendNull();
+ return Status::OK();
+}
+
+Status FixedSizeBinaryBuilder::AppendNulls(int64_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, false);
+ byte_builder_.UnsafeAppend(/*num_copies=*/length * byte_width_, 0);
+ return Status::OK();
+}
+
+Status FixedSizeBinaryBuilder::AppendEmptyValue() {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(true);
+ byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
+ return Status::OK();
+}
+
+Status FixedSizeBinaryBuilder::AppendEmptyValues(int64_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, true);
+ byte_builder_.UnsafeAppend(/*num_copies=*/length * byte_width_, 0);
+ return Status::OK();
+}
+
+void FixedSizeBinaryBuilder::Reset() {
+ ArrayBuilder::Reset();
+ byte_builder_.Reset();
+}
+
+Status FixedSizeBinaryBuilder::Resize(int64_t capacity) {
+ RETURN_NOT_OK(CheckCapacity(capacity));
+ RETURN_NOT_OK(byte_builder_.Resize(capacity * byte_width_));
+ return ArrayBuilder::Resize(capacity);
+}
+
+Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(byte_builder_.Finish(&data));
+
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
+
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+}
+
+const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const {
+ const uint8_t* data_ptr = byte_builder_.data();
+ return data_ptr + i * byte_width_;
+}
+
+util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const {
+ const uint8_t* data_ptr = byte_builder_.data();
+ return util::string_view(reinterpret_cast<const char*>(data_ptr + i * byte_width_),
+ byte_width_);
+}
+
+// ----------------------------------------------------------------------
+// ChunkedArray builders
+
+namespace internal {
+
+ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_value_length,
+ MemoryPool* pool)
+ : max_chunk_value_length_(max_chunk_value_length), builder_(new BinaryBuilder(pool)) {
+ DCHECK_LE(max_chunk_value_length, kBinaryMemoryLimit);
+}
+
+ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_value_length,
+ int32_t max_chunk_length, MemoryPool* pool)
+ : ChunkedBinaryBuilder(max_chunk_value_length, pool) {
+ max_chunk_length_ = max_chunk_length;
+}
+
+Status ChunkedBinaryBuilder::Finish(ArrayVector* out) {
+ if (builder_->length() > 0 || chunks_.size() == 0) {
+ std::shared_ptr<Array> chunk;
+ RETURN_NOT_OK(builder_->Finish(&chunk));
+ chunks_.emplace_back(std::move(chunk));
+ }
+ *out = std::move(chunks_);
+ return Status::OK();
+}
+
+Status ChunkedBinaryBuilder::NextChunk() {
+ std::shared_ptr<Array> chunk;
+ RETURN_NOT_OK(builder_->Finish(&chunk));
+ chunks_.emplace_back(std::move(chunk));
+
+ if (auto capacity = extra_capacity_) {
+ extra_capacity_ = 0;
+ return Reserve(capacity);
+ }
+
+ return Status::OK();
+}
+
+Status ChunkedStringBuilder::Finish(ArrayVector* out) {
+ RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out));
+
+ // Change data type to string/utf8
+ for (size_t i = 0; i < out->size(); ++i) {
+ std::shared_ptr<ArrayData> data = (*out)[i]->data();
+ data->type = ::arrow::utf8();
+ (*out)[i] = std::make_shared<StringArray>(data);
+ }
+ return Status::OK();
+}
+
+Status ChunkedBinaryBuilder::Reserve(int64_t values) {
+ if (ARROW_PREDICT_FALSE(extra_capacity_ != 0)) {
+ extra_capacity_ += values;
+ return Status::OK();
+ }
+
+ auto current_capacity = builder_->capacity();
+ auto min_capacity = builder_->length() + values;
+ if (current_capacity >= min_capacity) {
+ return Status::OK();
+ }
+
+ auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
+ if (ARROW_PREDICT_TRUE(new_capacity <= max_chunk_length_)) {
+ return builder_->Resize(new_capacity);
+ }
+
+ extra_capacity_ = new_capacity - max_chunk_length_;
+ return builder_->Resize(max_chunk_length_);
+}
+
+} // namespace internal
+
+} // namespace arrow
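A sketch (not part of the diff) of the FixedSizeBinaryBuilder implemented above: every value must be exactly byte_width() bytes, and AppendNull() still zero-fills a slot in the data buffer. Assumes the public Arrow C++ headers; the function name is illustrative:

```cpp
#include <arrow/api.h>

arrow::Status FixedSizeBinaryExample() {
  arrow::FixedSizeBinaryBuilder builder(arrow::fixed_size_binary(4));
  ARROW_RETURN_NOT_OK(builder.Append("abcd"));  // reads exactly 4 bytes
  ARROW_RETURN_NOT_OK(builder.AppendNull());    // slot of 4 zeroed bytes
  std::shared_ptr<arrow::Array> out;
  return builder.Finish(&out);
}
```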
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h
new file mode 100644
index 00000000000..62edc69fb8e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h
@@ -0,0 +1,670 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_binary.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h" // IWYU pragma: export
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Binary and String
+
+template <typename TYPE>
+class BaseBinaryBuilder : public ArrayBuilder {
+ public:
+ using TypeClass = TYPE;
+ using offset_type = typename TypeClass::offset_type;
+
+ explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool), offsets_builder_(pool), value_data_builder_(pool) {}
+
+ BaseBinaryBuilder(const std::shared_ptr<DataType>& /*type*/, MemoryPool* pool)
+ : BaseBinaryBuilder(pool) {}
+
+ Status Append(const uint8_t* value, offset_type length) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ ARROW_RETURN_NOT_OK(AppendNextOffset());
+ // Safety check for UBSAN.
+ if (ARROW_PREDICT_TRUE(length > 0)) {
+ ARROW_RETURN_NOT_OK(ValidateOverflow(length));
+ ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
+ }
+
+ UnsafeAppendToBitmap(true);
+ return Status::OK();
+ }
+
+ Status Append(const char* value, offset_type length) {
+ return Append(reinterpret_cast<const uint8_t*>(value), length);
+ }
+
+ Status Append(util::string_view value) {
+ return Append(value.data(), static_cast<offset_type>(value.size()));
+ }
+
+ /// Extend the last appended value by appending more data at the end
+ ///
+ /// Unlike Append, this does not create a new offset.
+ Status ExtendCurrent(const uint8_t* value, offset_type length) {
+ // Safety check for UBSAN.
+ if (ARROW_PREDICT_TRUE(length > 0)) {
+ ARROW_RETURN_NOT_OK(ValidateOverflow(length));
+ ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
+ }
+ return Status::OK();
+ }
+
+ Status ExtendCurrent(util::string_view value) {
+ return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
+ static_cast<offset_type>(value.size()));
+ }
+
+ Status AppendNulls(int64_t length) final {
+ const int64_t num_bytes = value_data_builder_.length();
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+ }
+ UnsafeAppendToBitmap(length, false);
+ return Status::OK();
+ }
+
+ Status AppendNull() final {
+ ARROW_RETURN_NOT_OK(AppendNextOffset());
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(false);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
+ ARROW_RETURN_NOT_OK(AppendNextOffset());
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(true);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ const int64_t num_bytes = value_data_builder_.length();
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+ }
+ UnsafeAppendToBitmap(length, true);
+ return Status::OK();
+ }
+
+ /// \brief Append without checking capacity
+ ///
+ /// Offsets and data should have been presized using Reserve() and
+ /// ReserveData(), respectively.
+ void UnsafeAppend(const uint8_t* value, offset_type length) {
+ UnsafeAppendNextOffset();
+ value_data_builder_.UnsafeAppend(value, length);
+ UnsafeAppendToBitmap(true);
+ }
+
+ void UnsafeAppend(const char* value, offset_type length) {
+ UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
+ }
+
+ void UnsafeAppend(const std::string& value) {
+ UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
+ }
+
+ void UnsafeAppend(util::string_view value) {
+ UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
+ }
+
+ /// Like ExtendCurrent, but do not check capacity
+ void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
+ value_data_builder_.UnsafeAppend(value, length);
+ }
+
+ void UnsafeExtendCurrent(util::string_view value) {
+ UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
+ static_cast<offset_type>(value.size()));
+ }
+
+ void UnsafeAppendNull() {
+ const int64_t num_bytes = value_data_builder_.length();
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+ UnsafeAppendToBitmap(false);
+ }
+
+ void UnsafeAppendEmptyValue() {
+ const int64_t num_bytes = value_data_builder_.length();
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+ UnsafeAppendToBitmap(true);
+ }
+
+ /// \brief Append a sequence of strings in one shot.
+ ///
+ /// \param[in] values a vector of strings
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value
+ /// \return Status
+ Status AppendValues(const std::vector<std::string>& values,
+ const uint8_t* valid_bytes = NULLPTR) {
+ std::size_t total_length = std::accumulate(
+ values.begin(), values.end(), 0ULL,
+ [](uint64_t sum, const std::string& str) { return sum + str.size(); });
+ ARROW_RETURN_NOT_OK(Reserve(values.size()));
+ ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
+ ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size()));
+
+ if (valid_bytes != NULLPTR) {
+ for (std::size_t i = 0; i < values.size(); ++i) {
+ UnsafeAppendNextOffset();
+ if (valid_bytes[i]) {
+ value_data_builder_.UnsafeAppend(
+ reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
+ }
+ }
+ } else {
+ for (std::size_t i = 0; i < values.size(); ++i) {
+ UnsafeAppendNextOffset();
+ value_data_builder_.UnsafeAppend(
+ reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
+ }
+ }
+
+ UnsafeAppendToBitmap(valid_bytes, values.size());
+ return Status::OK();
+ }
+
+ /// \brief Append a sequence of nul-terminated strings in one shot.
+ /// If one of the values is NULL, it is processed as a null
+ /// value even if the corresponding valid_bytes entry is 1.
+ ///
+ /// \param[in] values a contiguous C array of nul-terminated char *
+ /// \param[in] length the number of values to append
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value
+ /// \return Status
+ Status AppendValues(const char** values, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR) {
+ std::size_t total_length = 0;
+ std::vector<std::size_t> value_lengths(length);
+ bool have_null_value = false;
+ for (int64_t i = 0; i < length; ++i) {
+ if (values[i] != NULLPTR) {
+ auto value_length = strlen(values[i]);
+ value_lengths[i] = value_length;
+ total_length += value_length;
+ } else {
+ have_null_value = true;
+ }
+ }
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ ARROW_RETURN_NOT_OK(ReserveData(total_length));
+
+ if (valid_bytes) {
+ int64_t valid_bytes_offset = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ UnsafeAppendNextOffset();
+ if (valid_bytes[i]) {
+ if (values[i]) {
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
+ value_lengths[i]);
+ } else {
+ UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
+ i - valid_bytes_offset);
+ UnsafeAppendToBitmap(false);
+ valid_bytes_offset = i + 1;
+ }
+ }
+ }
+ UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
+ } else {
+ if (have_null_value) {
+ std::vector<uint8_t> valid_vector(length, 0);
+ for (int64_t i = 0; i < length; ++i) {
+ UnsafeAppendNextOffset();
+ if (values[i]) {
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
+ value_lengths[i]);
+ valid_vector[i] = 1;
+ }
+ }
+ UnsafeAppendToBitmap(valid_vector.data(), length);
+ } else {
+ for (int64_t i = 0; i < length; ++i) {
+ UnsafeAppendNextOffset();
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
+ value_lengths[i]);
+ }
+ UnsafeAppendToBitmap(NULLPTR, length);
+ }
+ }
+ return Status::OK();
+ }
+
+ void Reset() override {
+ ArrayBuilder::Reset();
+ offsets_builder_.Reset();
+ value_data_builder_.Reset();
+ }
+
+ Status ValidateOverflow(int64_t new_bytes) {
+ auto new_size = value_data_builder_.length() + new_bytes;
+ if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
+ return Status::CapacityError("array cannot contain more than ", memory_limit(),
+ " bytes, have ", new_size);
+ } else {
+ return Status::OK();
+ }
+ }
+
+ Status Resize(int64_t capacity) override {
+ ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+ // One more than requested for offsets
+ ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
+ return ArrayBuilder::Resize(capacity);
+ }
+
+ /// \brief Ensures there is enough allocated capacity to append the indicated
+ /// number of bytes to the value data buffer without additional allocations
+ Status ReserveData(int64_t elements) {
+ ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
+ return value_data_builder_.Reserve(elements);
+ }
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+ // Write final offset (values length)
+ ARROW_RETURN_NOT_OK(AppendNextOffset());
+
+    // These buffers' padding is zeroed by BufferBuilder
+ std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
+ ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+ ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
+ ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+
+ *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
+ null_count_, 0);
+ Reset();
+ return Status::OK();
+ }
+
+  /// \return data pointer of the value data builder
+ const uint8_t* value_data() const { return value_data_builder_.data(); }
+ /// \return size of values buffer so far
+ int64_t value_data_length() const { return value_data_builder_.length(); }
+ /// \return capacity of values buffer
+ int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
+
+  /// \return data pointer of the offsets builder
+ const offset_type* offsets_data() const { return offsets_builder_.data(); }
+
+ /// Temporary access to a value.
+ ///
+ /// This pointer becomes invalid on the next modifying operation.
+ const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
+ const offset_type* offsets = offsets_builder_.data();
+ const auto offset = offsets[i];
+ if (i == (length_ - 1)) {
+ *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
+ } else {
+ *out_length = offsets[i + 1] - offset;
+ }
+ return value_data_builder_.data() + offset;
+ }
+
+ offset_type offset(int64_t i) const { return offsets_data()[i]; }
+
+ /// Temporary access to a value.
+ ///
+ /// This view becomes invalid on the next modifying operation.
+ util::string_view GetView(int64_t i) const {
+ offset_type value_length;
+ const uint8_t* value_data = GetValue(i, &value_length);
+ return util::string_view(reinterpret_cast<const char*>(value_data), value_length);
+ }
+
+ // Cannot make this a static attribute because of linking issues
+ static constexpr int64_t memory_limit() {
+ return std::numeric_limits<offset_type>::max() - 1;
+ }
+
+ protected:
+ TypedBufferBuilder<offset_type> offsets_builder_;
+ TypedBufferBuilder<uint8_t> value_data_builder_;
+
+ Status AppendNextOffset() {
+ const int64_t num_bytes = value_data_builder_.length();
+ return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
+ }
+
+ void UnsafeAppendNextOffset() {
+ const int64_t num_bytes = value_data_builder_.length();
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+ }
+};
+
+/// \class BinaryBuilder
+/// \brief Builder class for variable-length binary data
+class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
+ public:
+ using BaseBinaryBuilder::BaseBinaryBuilder;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
+
+ std::shared_ptr<DataType> type() const override { return binary(); }
+};
+
+/// \class StringBuilder
+/// \brief Builder class for UTF8 strings
+class ARROW_EXPORT StringBuilder : public BinaryBuilder {
+ public:
+ using BinaryBuilder::BinaryBuilder;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
+
+ std::shared_ptr<DataType> type() const override { return utf8(); }
+};
+
+/// \class LargeBinaryBuilder
+/// \brief Builder class for large variable-length binary data
+class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
+ public:
+ using BaseBinaryBuilder::BaseBinaryBuilder;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
+
+ std::shared_ptr<DataType> type() const override { return large_binary(); }
+};
+
+/// \class LargeStringBuilder
+/// \brief Builder class for large UTF8 strings
+class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
+ public:
+ using LargeBinaryBuilder::LargeBinaryBuilder;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
+
+ std::shared_ptr<DataType> type() const override { return large_utf8(); }
+};
+
+// ----------------------------------------------------------------------
+// FixedSizeBinaryBuilder
+
+class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
+ public:
+ using TypeClass = FixedSizeBinaryType;
+
+ explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool = default_memory_pool());
+
+ Status Append(const uint8_t* value) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(value);
+ return Status::OK();
+ }
+
+ Status Append(const char* value) {
+ return Append(reinterpret_cast<const uint8_t*>(value));
+ }
+
+ Status Append(const util::string_view& view) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(view);
+ return Status::OK();
+ }
+
+ Status Append(const std::string& s) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(s);
+ return Status::OK();
+ }
+
+ Status Append(const Buffer& s) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(util::string_view(s));
+ return Status::OK();
+ }
+
+ Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }
+
+ template <size_t NBYTES>
+ Status Append(const std::array<uint8_t, NBYTES>& value) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(
+ util::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
+ return Status::OK();
+ }
+
+ Status AppendValues(const uint8_t* data, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR);
+
+ Status AppendNull() final;
+ Status AppendNulls(int64_t length) final;
+
+ Status AppendEmptyValue() final;
+ Status AppendEmptyValues(int64_t length) final;
+
+ void UnsafeAppend(const uint8_t* value) {
+ UnsafeAppendToBitmap(true);
+ if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
+ byte_builder_.UnsafeAppend(value, byte_width_);
+ }
+ }
+
+ void UnsafeAppend(const char* value) {
+ UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
+ }
+
+ void UnsafeAppend(util::string_view value) {
+#ifndef NDEBUG
+ CheckValueSize(static_cast<size_t>(value.size()));
+#endif
+ UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
+ }
+
+ void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); }
+
+ void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }
+
+ void UnsafeAppendNull() {
+ UnsafeAppendToBitmap(false);
+ byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
+ }
+
+ Status ValidateOverflow(int64_t new_bytes) const {
+ auto new_size = byte_builder_.length() + new_bytes;
+ if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
+ return Status::CapacityError("array cannot contain more than ", memory_limit(),
+ " bytes, have ", new_size);
+ } else {
+ return Status::OK();
+ }
+ }
+
+ /// \brief Ensures there is enough allocated capacity to append the indicated
+ /// number of bytes to the value data buffer without additional allocations
+ Status ReserveData(int64_t elements) {
+ ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
+ return byte_builder_.Reserve(elements);
+ }
+
+ void Reset() override;
+ Status Resize(int64_t capacity) override;
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }
+
+ /// \return size of values buffer so far
+ int64_t value_data_length() const { return byte_builder_.length(); }
+
+ int32_t byte_width() const { return byte_width_; }
+
+ /// Temporary access to a value.
+ ///
+ /// This pointer becomes invalid on the next modifying operation.
+ const uint8_t* GetValue(int64_t i) const;
+
+ /// Temporary access to a value.
+ ///
+ /// This view becomes invalid on the next modifying operation.
+ util::string_view GetView(int64_t i) const;
+
+ static constexpr int64_t memory_limit() {
+ return std::numeric_limits<int64_t>::max() - 1;
+ }
+
+ std::shared_ptr<DataType> type() const override {
+ return fixed_size_binary(byte_width_);
+ }
+
+ protected:
+ int32_t byte_width_;
+ BufferBuilder byte_builder_;
+
+ /// Temporary access to a value.
+ ///
+ /// This pointer becomes invalid on the next modifying operation.
+ uint8_t* GetMutableValue(int64_t i) {
+ uint8_t* data_ptr = byte_builder_.mutable_data();
+ return data_ptr + i * byte_width_;
+ }
+
+ void CheckValueSize(int64_t size);
+};
+
+// ----------------------------------------------------------------------
+// Chunked builders: build a sequence of BinaryArray or StringArray that are
+// limited to a particular size (with an upper limit of 2GB)
+
+namespace internal {
+
+class ARROW_EXPORT ChunkedBinaryBuilder {
+ public:
+ explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
+ MemoryPool* pool = default_memory_pool());
+
+ ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
+ MemoryPool* pool = default_memory_pool());
+
+ virtual ~ChunkedBinaryBuilder() = default;
+
+ Status Append(const uint8_t* value, int32_t length) {
+ if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
+ max_chunk_value_length_)) {
+ if (builder_->value_data_length() == 0) {
+      // The current item is larger than max_chunk_value_length_;
+      // this chunk will be oversized and hold *only* this item
+ ARROW_RETURN_NOT_OK(builder_->Append(value, length));
+ return NextChunk();
+ }
+    // The current item would cause builder_->value_data_length() to exceed
+    // max_chunk_value_length_, so finish this chunk and append the current item
+    // to the next chunk
+ ARROW_RETURN_NOT_OK(NextChunk());
+ return Append(value, length);
+ }
+
+ if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
+ // The current item would cause builder_->length() to exceed max_chunk_length_, so
+ // finish this chunk and append the current item to the next chunk
+ ARROW_RETURN_NOT_OK(NextChunk());
+ }
+
+ return builder_->Append(value, length);
+ }
+
+ Status Append(const util::string_view& value) {
+ return Append(reinterpret_cast<const uint8_t*>(value.data()),
+ static_cast<int32_t>(value.size()));
+ }
+
+ Status AppendNull() {
+ if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
+ ARROW_RETURN_NOT_OK(NextChunk());
+ }
+ return builder_->AppendNull();
+ }
+
+ Status Reserve(int64_t values);
+
+ virtual Status Finish(ArrayVector* out);
+
+ protected:
+ Status NextChunk();
+
+ // maximum total character data size per chunk
+ int64_t max_chunk_value_length_;
+
+ // maximum elements allowed per chunk
+ int64_t max_chunk_length_ = kListMaximumElements;
+
+ // when Reserve() would cause builder_ to exceed its max_chunk_length_,
+ // add to extra_capacity_ instead and wait to reserve until the next chunk
+ int64_t extra_capacity_ = 0;
+
+ std::unique_ptr<BinaryBuilder> builder_;
+ std::vector<std::shared_ptr<Array>> chunks_;
+};
+
+class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
+ public:
+ using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
+
+ Status Finish(ArrayVector* out) override;
+};
+
+} // namespace internal
+
+} // namespace arrow
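A sketch (not part of the diff) of the bulk AppendValues() overload and the typed Finish() shortcut declared above, assuming the public Arrow C++ headers. The function name is illustrative:

```cpp
#include <arrow/api.h>

#include <string>
#include <vector>

arrow::Status StringBuilderExample() {
  arrow::StringBuilder builder;
  std::vector<std::string> values = {"foo", "bar", "baz"};
  ARROW_RETURN_NOT_OK(builder.AppendValues(values));
  // FinishTyped() downcasts internally, so callers get a StringArray directly.
  std::shared_ptr<arrow::StringArray> out;
  return builder.Finish(&out);
}
```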
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc
new file mode 100644
index 00000000000..bd7615a7309
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_decimal.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/status.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+
+// ----------------------------------------------------------------------
+// Decimal128Builder
+
+Decimal128Builder::Decimal128Builder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool)
+ : FixedSizeBinaryBuilder(type, pool),
+ decimal_type_(internal::checked_pointer_cast<Decimal128Type>(type)) {}
+
+Status Decimal128Builder::Append(Decimal128 value) {
+ RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1));
+ UnsafeAppend(value);
+ return Status::OK();
+}
+
+void Decimal128Builder::UnsafeAppend(Decimal128 value) {
+ value.ToBytes(GetMutableValue(length()));
+ byte_builder_.UnsafeAdvance(16);
+ UnsafeAppendToBitmap(true);
+}
+
+void Decimal128Builder::UnsafeAppend(util::string_view value) {
+ FixedSizeBinaryBuilder::UnsafeAppend(value);
+}
+
+Status Decimal128Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(byte_builder_.Finish(&data));
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+
+ *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Decimal256Builder
+
+Decimal256Builder::Decimal256Builder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool)
+ : FixedSizeBinaryBuilder(type, pool),
+ decimal_type_(internal::checked_pointer_cast<Decimal256Type>(type)) {}
+
+Status Decimal256Builder::Append(const Decimal256& value) {
+ RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1));
+ UnsafeAppend(value);
+ return Status::OK();
+}
+
+void Decimal256Builder::UnsafeAppend(const Decimal256& value) {
+ value.ToBytes(GetMutableValue(length()));
+ byte_builder_.UnsafeAdvance(32);
+ UnsafeAppendToBitmap(true);
+}
+
+void Decimal256Builder::UnsafeAppend(util::string_view value) {
+ FixedSizeBinaryBuilder::UnsafeAppend(value);
+}
+
+Status Decimal256Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(byte_builder_.Finish(&data));
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+
+ *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+}
+
+} // namespace arrow
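A sketch (not part of the diff) of the Decimal128Builder implemented above. The builder is constructed with a concrete decimal type; Decimal128::FromString is assumed from the public Arrow C++ API, and the function name is illustrative:

```cpp
#include <arrow/api.h>

arrow::Status DecimalExample() {
  std::shared_ptr<arrow::DataType> type =
      arrow::decimal(/*precision=*/10, /*scale=*/2);
  arrow::Decimal128Builder builder(type);
  ARROW_ASSIGN_OR_RAISE(arrow::Decimal128 value,
                        arrow::Decimal128::FromString("123.45"));
  ARROW_RETURN_NOT_OK(builder.Append(value));  // writes 16 bytes per value
  std::shared_ptr<arrow::Array> out;
  return builder.Finish(&out);
}
```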
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h
new file mode 100644
index 00000000000..f48392ed001
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/array/array_decimal.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/data.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
+ public:
+ using TypeClass = Decimal128Type;
+ using ValueType = Decimal128;
+
+ explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool = default_memory_pool());
+
+ using FixedSizeBinaryBuilder::Append;
+ using FixedSizeBinaryBuilder::AppendValues;
+ using FixedSizeBinaryBuilder::Reset;
+
+ Status Append(Decimal128 val);
+ void UnsafeAppend(Decimal128 val);
+ void UnsafeAppend(util::string_view val);
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }
+
+ std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+ std::shared_ptr<Decimal128Type> decimal_type_;
+};
+
+class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
+ public:
+ using TypeClass = Decimal256Type;
+ using ValueType = Decimal256;
+
+ explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool = default_memory_pool());
+
+ using FixedSizeBinaryBuilder::Append;
+ using FixedSizeBinaryBuilder::AppendValues;
+ using FixedSizeBinaryBuilder::Reset;
+
+ Status Append(const Decimal256& val);
+ void UnsafeAppend(const Decimal256& val);
+ void UnsafeAppend(util::string_view val);
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }
+
+ std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+ std::shared_ptr<Decimal256Type> decimal_type_;
+};
+
+using DecimalBuilder = Decimal128Builder;
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc
new file mode 100644
index 00000000000..b13f6a2db34
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc
@@ -0,0 +1,204 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_dict.h"
+
+#include <cstdint>
+#include <utility>
+
+#include "arrow/array/dict_internal.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// DictionaryBuilder
+
+namespace internal {
+
+class DictionaryMemoTable::DictionaryMemoTableImpl {
+ // Type-dependent visitor for memo table initialization
+ struct MemoTableInitializer {
+ std::shared_ptr<DataType> value_type_;
+ MemoryPool* pool_;
+ std::unique_ptr<MemoTable>* memo_table_;
+
+ template <typename T>
+ enable_if_no_memoize<T, Status> Visit(const T&) {
+ return Status::NotImplemented("Initialization of ", value_type_->ToString(),
+ " memo table is not implemented");
+ }
+
+ template <typename T>
+ enable_if_memoize<T, Status> Visit(const T&) {
+ using MemoTable = typename DictionaryTraits<T>::MemoTableType;
+ memo_table_->reset(new MemoTable(pool_, 0));
+ return Status::OK();
+ }
+ };
+
+ // Type-dependent visitor for memo table insertion
+ struct ArrayValuesInserter {
+ DictionaryMemoTableImpl* impl_;
+ const Array& values_;
+
+ template <typename T>
+ Status Visit(const T& type) {
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+ return InsertValues(type, checked_cast<const ArrayType&>(values_));
+ }
+
+ private:
+ template <typename T, typename ArrayType>
+ enable_if_no_memoize<T, Status> InsertValues(const T& type, const ArrayType&) {
+ return Status::NotImplemented("Inserting array values of ", type,
+ " is not implemented");
+ }
+
+ template <typename T, typename ArrayType>
+ enable_if_memoize<T, Status> InsertValues(const T&, const ArrayType& array) {
+ if (array.null_count() > 0) {
+ return Status::Invalid("Cannot insert dictionary values containing nulls");
+ }
+ for (int64_t i = 0; i < array.length(); ++i) {
+ int32_t unused_memo_index;
+ RETURN_NOT_OK(impl_->GetOrInsert<T>(array.GetView(i), &unused_memo_index));
+ }
+ return Status::OK();
+ }
+ };
+
+ // Type-dependent visitor for building ArrayData from memo table
+ struct ArrayDataGetter {
+ std::shared_ptr<DataType> value_type_;
+ MemoTable* memo_table_;
+ MemoryPool* pool_;
+ int64_t start_offset_;
+ std::shared_ptr<ArrayData>* out_;
+
+ template <typename T>
+ enable_if_no_memoize<T, Status> Visit(const T&) {
+ return Status::NotImplemented("Getting array data of ", value_type_,
+ " is not implemented");
+ }
+
+ template <typename T>
+ enable_if_memoize<T, Status> Visit(const T&) {
+ using ConcreteMemoTable = typename DictionaryTraits<T>::MemoTableType;
+ auto memo_table = checked_cast<ConcreteMemoTable*>(memo_table_);
+ return DictionaryTraits<T>::GetDictionaryArrayData(pool_, value_type_, *memo_table,
+ start_offset_, out_);
+ }
+ };
+
+ public:
+ DictionaryMemoTableImpl(MemoryPool* pool, std::shared_ptr<DataType> type)
+ : pool_(pool), type_(std::move(type)), memo_table_(nullptr) {
+ MemoTableInitializer visitor{type_, pool_, &memo_table_};
+ ARROW_CHECK_OK(VisitTypeInline(*type_, &visitor));
+ }
+
+ Status InsertValues(const Array& array) {
+ if (!array.type()->Equals(*type_)) {
+ return Status::Invalid("Array value type does not match memo type: ",
+ array.type()->ToString());
+ }
+ ArrayValuesInserter visitor{this, array};
+ return VisitTypeInline(*array.type(), &visitor);
+ }
+
+ template <typename PhysicalType,
+ typename CType = typename DictionaryValue<PhysicalType>::type>
+ Status GetOrInsert(CType value, int32_t* out) {
+ using ConcreteMemoTable = typename DictionaryTraits<PhysicalType>::MemoTableType;
+ return checked_cast<ConcreteMemoTable*>(memo_table_.get())->GetOrInsert(value, out);
+ }
+
+ Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out) {
+ ArrayDataGetter visitor{type_, memo_table_.get(), pool_, start_offset, out};
+ return VisitTypeInline(*type_, &visitor);
+ }
+
+ int32_t size() const { return memo_table_->size(); }
+
+ private:
+ MemoryPool* pool_;
+ std::shared_ptr<DataType> type_;
+ std::unique_ptr<MemoTable> memo_table_;
+};
+
+DictionaryMemoTable::DictionaryMemoTable(MemoryPool* pool,
+ const std::shared_ptr<DataType>& type)
+ : impl_(new DictionaryMemoTableImpl(pool, type)) {}
+
+DictionaryMemoTable::DictionaryMemoTable(MemoryPool* pool,
+ const std::shared_ptr<Array>& dictionary)
+ : impl_(new DictionaryMemoTableImpl(pool, dictionary->type())) {
+ ARROW_CHECK_OK(impl_->InsertValues(*dictionary));
+}
+
+DictionaryMemoTable::~DictionaryMemoTable() = default;
+
+#define GET_OR_INSERT(C_TYPE) \
+ Status DictionaryMemoTable::GetOrInsert( \
+ const typename CTypeTraits<C_TYPE>::ArrowType*, C_TYPE value, int32_t* out) { \
+ return impl_->GetOrInsert<typename CTypeTraits<C_TYPE>::ArrowType>(value, out); \
+ }
+
+GET_OR_INSERT(bool)
+GET_OR_INSERT(int8_t)
+GET_OR_INSERT(int16_t)
+GET_OR_INSERT(int32_t)
+GET_OR_INSERT(int64_t)
+GET_OR_INSERT(uint8_t)
+GET_OR_INSERT(uint16_t)
+GET_OR_INSERT(uint32_t)
+GET_OR_INSERT(uint64_t)
+GET_OR_INSERT(float)
+GET_OR_INSERT(double)
+
+#undef GET_OR_INSERT
+
+Status DictionaryMemoTable::GetOrInsert(const BinaryType*, util::string_view value,
+ int32_t* out) {
+ return impl_->GetOrInsert<BinaryType>(value, out);
+}
+
+Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, util::string_view value,
+ int32_t* out) {
+ return impl_->GetOrInsert<LargeBinaryType>(value, out);
+}
+
+Status DictionaryMemoTable::GetArrayData(int64_t start_offset,
+ std::shared_ptr<ArrayData>* out) {
+ return impl_->GetArrayData(start_offset, out);
+}
+
+Status DictionaryMemoTable::InsertValues(const Array& array) {
+ return impl_->InsertValues(array);
+}
+
+int32_t DictionaryMemoTable::size() const { return impl_->size(); }
+
+} // namespace internal
+} // namespace arrow
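The memo table above backs the public DictionaryBuilder templates declared in builder_dict.h below; repeated values are memoized so they share a single dictionary entry. A sketch (not part of the diff) assuming the StringDictionaryBuilder alias from the public Arrow C++ headers, with an illustrative function name:

```cpp
#include <arrow/api.h>

arrow::Status DictionaryExample() {
  arrow::StringDictionaryBuilder builder;  // dictionary-encoded utf8
  ARROW_RETURN_NOT_OK(builder.Append("apple"));
  ARROW_RETURN_NOT_OK(builder.Append("banana"));
  ARROW_RETURN_NOT_OK(builder.Append("apple"));  // memoized: reuses entry 0
  std::shared_ptr<arrow::Array> out;
  return builder.Finish(&out);  // DictionaryArray with 2 dictionary values
}
```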
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h
new file mode 100644
index 00000000000..eb96482dbf7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h
@@ -0,0 +1,572 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_binary.h"
+#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
+#include "arrow/array/builder_base.h" // IWYU pragma: export
+#include "arrow/array/builder_primitive.h" // IWYU pragma: export
+#include "arrow/array/data.h"
+#include "arrow/array/util.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Dictionary builder
+
+namespace internal {
+
+template <typename T, typename Enable = void>
+struct DictionaryValue {
+ using type = typename T::c_type;
+ using PhysicalType = T;
+};
+
+template <typename T>
+struct DictionaryValue<T, enable_if_base_binary<T>> {
+ using type = util::string_view;
+ using PhysicalType =
+ typename std::conditional<std::is_same<typename T::offset_type, int32_t>::value,
+ BinaryType, LargeBinaryType>::type;
+};
+
+template <typename T>
+struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
+ using type = util::string_view;
+ using PhysicalType = BinaryType;
+};
+
+class ARROW_EXPORT DictionaryMemoTable {
+ public:
+ DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<DataType>& type);
+ DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<Array>& dictionary);
+ ~DictionaryMemoTable();
+
+ Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out);
+
+ /// \brief Insert new memo values
+ Status InsertValues(const Array& values);
+
+ int32_t size() const;
+
+ template <typename T>
+ Status GetOrInsert(typename DictionaryValue<T>::type value, int32_t* out) {
+    // We want to keep the DictionaryMemoTable implementation private; we also
+    // can't use extern template classes because of compiler issues (MinGW?).
+    // Instead, we expose explicit function overloads for each supported
+    // physical type.
+ const typename DictionaryValue<T>::PhysicalType* physical_type = NULLPTR;
+ return GetOrInsert(physical_type, value, out);
+ }
+
+ private:
+ Status GetOrInsert(const BooleanType*, bool value, int32_t* out);
+ Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out);
+ Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out);
+ Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out);
+ Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out);
+ Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out);
+ Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out);
+ Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out);
+ Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out);
+ Status GetOrInsert(const FloatType*, float value, int32_t* out);
+ Status GetOrInsert(const DoubleType*, double value, int32_t* out);
+
+ Status GetOrInsert(const BinaryType*, util::string_view value, int32_t* out);
+ Status GetOrInsert(const LargeBinaryType*, util::string_view value, int32_t* out);
+
+ class DictionaryMemoTableImpl;
+ std::unique_ptr<DictionaryMemoTableImpl> impl_;
+};
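+
+// A minimal usage sketch of the overload dispatch above, assuming a utf8
+// value type (illustrative only; `memo` and `idx` are hypothetical names):
+//
+//   DictionaryMemoTable memo(default_memory_pool(), utf8());
+//   int32_t idx;
+//   ARROW_RETURN_NOT_OK(memo.GetOrInsert<StringType>("a", &idx));  // idx == 0
+//   ARROW_RETURN_NOT_OK(memo.GetOrInsert<StringType>("a", &idx));  // memoized: idx == 0
+//   ARROW_RETURN_NOT_OK(memo.GetOrInsert<StringType>("b", &idx));  // idx == 1
+//
+// GetOrInsert<StringType> maps StringType to its BinaryType physical type and
+// dispatches to the private overload taking a BinaryType tag pointer.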
+
+/// \brief Array builder for creating a dictionary-encoded DictionaryArray
+/// from a dense array
+///
+/// Unlike other builders, the dictionary builder does not completely
+/// reset its state on Finish calls. (A usage sketch follows the
+/// DictionaryBuilder subclass below.)
+template <typename BuilderType, typename T>
+class DictionaryBuilderBase : public ArrayBuilder {
+ public:
+ using TypeClass = DictionaryType;
+ using Value = typename DictionaryValue<T>::type;
+
+ // WARNING: the type given below is the value type, not the DictionaryType.
+ // The DictionaryType is instantiated on the Finish() call.
+ template <typename B = BuilderType, typename T1 = T>
+ DictionaryBuilderBase(uint8_t start_int_size,
+ enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
+ !is_fixed_size_binary_type<T1>::value,
+ const std::shared_ptr<DataType>&>
+ value_type,
+ MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool),
+ memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
+ delta_offset_(0),
+ byte_width_(-1),
+ indices_builder_(start_int_size, pool),
+ value_type_(value_type) {}
+
+ template <typename T1 = T>
+ explicit DictionaryBuilderBase(
+ enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
+ value_type,
+ MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool),
+ memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
+ delta_offset_(0),
+ byte_width_(-1),
+ indices_builder_(pool),
+ value_type_(value_type) {}
+
+ template <typename B = BuilderType, typename T1 = T>
+ DictionaryBuilderBase(uint8_t start_int_size,
+ enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
+ is_fixed_size_binary_type<T1>::value,
+ const std::shared_ptr<DataType>&>
+ value_type,
+ MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool),
+ memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
+ delta_offset_(0),
+ byte_width_(static_cast<const T1&>(*value_type).byte_width()),
+ indices_builder_(start_int_size, pool),
+ value_type_(value_type) {}
+
+ template <typename T1 = T>
+ explicit DictionaryBuilderBase(
+ enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
+ MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool),
+ memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
+ delta_offset_(0),
+ byte_width_(static_cast<const T1&>(*value_type).byte_width()),
+ indices_builder_(pool),
+ value_type_(value_type) {}
+
+ template <typename T1 = T>
+ explicit DictionaryBuilderBase(
+ enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
+ : DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}
+
+ // This constructor doesn't check for errors. Use InsertMemoValues instead.
+ explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
+ MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool),
+ memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
+ delta_offset_(0),
+ byte_width_(-1),
+ indices_builder_(pool),
+ value_type_(dictionary->type()) {}
+
+ ~DictionaryBuilderBase() override = default;
+
+ /// \brief The current number of entries in the dictionary
+ int64_t dictionary_length() const { return memo_table_->size(); }
+
+ /// \brief The value byte width (for FixedSizeBinaryType)
+ template <typename T1 = T>
+ enable_if_fixed_size_binary<T1, int32_t> byte_width() const {
+ return byte_width_;
+ }
+
+ /// \brief Append a scalar value
+ Status Append(Value value) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+
+ int32_t memo_index;
+ ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert<T>(value, &memo_index));
+ ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index));
+ length_ += 1;
+
+ return Status::OK();
+ }
+
+ /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
+ template <typename T1 = T>
+ enable_if_fixed_size_binary<T1, Status> Append(const uint8_t* value) {
+ return Append(util::string_view(reinterpret_cast<const char*>(value), byte_width_));
+ }
+
+ /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
+ template <typename T1 = T>
+ enable_if_fixed_size_binary<T1, Status> Append(const char* value) {
+ return Append(util::string_view(value, byte_width_));
+ }
+
+ /// \brief Append a string (only for binary types)
+ template <typename T1 = T>
+ enable_if_binary_like<T1, Status> Append(const uint8_t* value, int32_t length) {
+ return Append(reinterpret_cast<const char*>(value), length);
+ }
+
+ /// \brief Append a string (only for binary types)
+ template <typename T1 = T>
+ enable_if_binary_like<T1, Status> Append(const char* value, int32_t length) {
+ return Append(util::string_view(value, length));
+ }
+
+ /// \brief Append a string (only for string types)
+ template <typename T1 = T>
+ enable_if_string_like<T1, Status> Append(const char* value, int32_t length) {
+ return Append(util::string_view(value, length));
+ }
+
+ /// \brief Append a decimal (only for Decimal128Type)
+ template <typename T1 = T>
+ enable_if_decimal128<T1, Status> Append(const Decimal128& value) {
+ uint8_t data[16];
+ value.ToBytes(data);
+ return Append(data, 16);
+ }
+
+  /// \brief Append a decimal (only for Decimal256Type)
+ template <typename T1 = T>
+ enable_if_decimal256<T1, Status> Append(const Decimal256& value) {
+ uint8_t data[32];
+ value.ToBytes(data);
+ return Append(data, 32);
+ }
+
+ /// \brief Append a scalar null value
+ Status AppendNull() final {
+ length_ += 1;
+ null_count_ += 1;
+
+ return indices_builder_.AppendNull();
+ }
+
+ Status AppendNulls(int64_t length) final {
+ length_ += length;
+ null_count_ += length;
+
+ return indices_builder_.AppendNulls(length);
+ }
+
+ Status AppendEmptyValue() final {
+ length_ += 1;
+
+ return indices_builder_.AppendEmptyValue();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ length_ += length;
+
+ return indices_builder_.AppendEmptyValues(length);
+ }
+
+ /// \brief Insert values into the dictionary's memo, but do not append any
+ /// indices. Can be used to initialize a new builder with known dictionary
+ /// values
+ /// \param[in] values dictionary values to add to memo. Type must match
+ /// builder type
+ Status InsertMemoValues(const Array& values) {
+ return memo_table_->InsertValues(values);
+ }
+
+ /// \brief Append a whole dense array to the builder
+ template <typename T1 = T>
+ enable_if_t<!is_fixed_size_binary_type<T1>::value, Status> AppendArray(
+ const Array& array) {
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+
+#ifndef NDEBUG
+ ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
+ value_type_, array, "Wrong value type of array to be appended"));
+#endif
+
+ const auto& concrete_array = static_cast<const ArrayType&>(array);
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (array.IsNull(i)) {
+ ARROW_RETURN_NOT_OK(AppendNull());
+ } else {
+ ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i)));
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename T1 = T>
+ enable_if_fixed_size_binary<T1, Status> AppendArray(const Array& array) {
+#ifndef NDEBUG
+ ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
+ value_type_, array, "Wrong value type of array to be appended"));
+#endif
+
+ const auto& concrete_array = static_cast<const FixedSizeBinaryArray&>(array);
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (array.IsNull(i)) {
+ ARROW_RETURN_NOT_OK(AppendNull());
+ } else {
+ ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i)));
+ }
+ }
+ return Status::OK();
+ }
+
+ void Reset() override {
+ // Perform a partial reset. Call ResetFull to also reset the accumulated
+ // dictionary values
+ ArrayBuilder::Reset();
+ indices_builder_.Reset();
+ }
+
+ /// \brief Reset and also clear accumulated dictionary values in memo table
+ void ResetFull() {
+ Reset();
+ memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_));
+ }
+
+ Status Resize(int64_t capacity) override {
+ ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+ capacity = std::max(capacity, kMinBuilderCapacity);
+ ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
+ capacity_ = indices_builder_.capacity();
+ return Status::OK();
+ }
+
+ /// \brief Return dictionary indices and a delta dictionary since the last
+ /// time that Finish or FinishDelta were called, and reset state of builder
+ /// (except the memo table)
+ Status FinishDelta(std::shared_ptr<Array>* out_indices,
+ std::shared_ptr<Array>* out_delta) {
+ std::shared_ptr<ArrayData> indices_data;
+ std::shared_ptr<ArrayData> delta_data;
+ ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data));
+ *out_indices = MakeArray(indices_data);
+ *out_delta = MakeArray(delta_data);
+ return Status::OK();
+ }
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
+
+ std::shared_ptr<DataType> type() const override {
+ return ::arrow::dictionary(indices_builder_.type(), value_type_);
+ }
+
+ protected:
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+ std::shared_ptr<ArrayData> dictionary;
+ ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary));
+
+ // Set type of array data to the right dictionary type
+ (*out)->type = type();
+ (*out)->dictionary = dictionary;
+ return Status::OK();
+ }
+
+ Status FinishWithDictOffset(int64_t dict_offset,
+ std::shared_ptr<ArrayData>* out_indices,
+ std::shared_ptr<ArrayData>* out_dictionary) {
+ // Finalize indices array
+ ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices));
+
+ // Generate dictionary array from hash table contents
+ ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary));
+ delta_offset_ = memo_table_->size();
+
+ // Update internals for further uses of this DictionaryBuilder
+ ArrayBuilder::Reset();
+ return Status::OK();
+ }
+
+ std::unique_ptr<DictionaryMemoTable> memo_table_;
+
+ // The size of the dictionary memo at last invocation of Finish, to use in
+ // FinishDelta for computing dictionary deltas
+ int32_t delta_offset_;
+
+ // Only used for FixedSizeBinaryType
+ int32_t byte_width_;
+
+ BuilderType indices_builder_;
+ std::shared_ptr<DataType> value_type_;
+};
+
+template <typename BuilderType>
+class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
+ public:
+ template <typename B = BuilderType>
+ DictionaryBuilderBase(
+ enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
+ start_int_size,
+ const std::shared_ptr<DataType>& /*value_type*/,
+ MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
+
+ explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& /*value_type*/,
+ MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool), indices_builder_(pool) {}
+
+ template <typename B = BuilderType>
+ explicit DictionaryBuilderBase(
+ enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
+ start_int_size,
+ MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
+
+ explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool), indices_builder_(pool) {}
+
+ explicit DictionaryBuilderBase(const std::shared_ptr<Array>& /*dictionary*/,
+ MemoryPool* pool = default_memory_pool())
+ : ArrayBuilder(pool), indices_builder_(pool) {}
+
+ /// \brief Append a scalar null value
+ Status AppendNull() final {
+ length_ += 1;
+ null_count_ += 1;
+
+ return indices_builder_.AppendNull();
+ }
+
+ Status AppendNulls(int64_t length) final {
+ length_ += length;
+ null_count_ += length;
+
+ return indices_builder_.AppendNulls(length);
+ }
+
+ Status AppendEmptyValue() final {
+ length_ += 1;
+
+ return indices_builder_.AppendEmptyValue();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ length_ += length;
+
+ return indices_builder_.AppendEmptyValues(length);
+ }
+
+ /// \brief Append a whole dense array to the builder
+ Status AppendArray(const Array& array) {
+#ifndef NDEBUG
+ ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
+ Type::NA, array, "Wrong value type of array to be appended"));
+#endif
+ for (int64_t i = 0; i < array.length(); i++) {
+ ARROW_RETURN_NOT_OK(AppendNull());
+ }
+ return Status::OK();
+ }
+
+ Status Resize(int64_t capacity) override {
+ ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+ capacity = std::max(capacity, kMinBuilderCapacity);
+
+ ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
+ capacity_ = indices_builder_.capacity();
+ return Status::OK();
+ }
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+ ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
+ (*out)->type = dictionary((*out)->type, null());
+ (*out)->dictionary = NullArray(0).data();
+ return Status::OK();
+ }
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
+
+ std::shared_ptr<DataType> type() const override {
+ return ::arrow::dictionary(indices_builder_.type(), null());
+ }
+
+ protected:
+ BuilderType indices_builder_;
+};
+
+} // namespace internal
+
+/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
+/// smallest index size that can accommodate the dictionary indices
+template <typename T>
+class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
+ public:
+ using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
+ using BASE::BASE;
+
+ /// \brief Append dictionary indices directly without modifying memo
+ ///
+ /// NOTE: Experimental API
+ Status AppendIndices(const int64_t* values, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR) {
+ int64_t null_count_before = this->indices_builder_.null_count();
+ ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
+ this->capacity_ = this->indices_builder_.capacity();
+ this->length_ += length;
+ this->null_count_ += this->indices_builder_.null_count() - null_count_before;
+ return Status::OK();
+ }
+};
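+
+// A usage sketch for building a dictionary-encoded string array, including a
+// delta dictionary (illustrative only; variable names are hypothetical):
+//
+//   DictionaryBuilder<StringType> builder;
+//   ARROW_RETURN_NOT_OK(builder.Append("foo"));
+//   ARROW_RETURN_NOT_OK(builder.Append("bar"));
+//   ARROW_RETURN_NOT_OK(builder.Append("foo"));  // memoized, reuses index 0
+//   std::shared_ptr<Array> encoded;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&encoded));  // dictionary: ["foo", "bar"]
+//
+//   // The memo table survives Finish, so a delta dictionary can be emitted:
+//   ARROW_RETURN_NOT_OK(builder.Append("baz"));
+//   std::shared_ptr<Array> indices, delta;
+//   ARROW_RETURN_NOT_OK(builder.FinishDelta(&indices, &delta));  // delta: ["baz"]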
+
+/// \brief A DictionaryArray builder that always returns int32 dictionary
+/// indices so that data cast to dictionary form will have a consistent index
+/// type, e.g. for creating a ChunkedArray
+template <typename T>
+class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
+ public:
+ using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
+ using BASE::BASE;
+
+ /// \brief Append dictionary indices directly without modifying memo
+ ///
+ /// NOTE: Experimental API
+ Status AppendIndices(const int32_t* values, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR) {
+ int64_t null_count_before = this->indices_builder_.null_count();
+ ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
+ this->capacity_ = this->indices_builder_.capacity();
+ this->length_ += length;
+ this->null_count_ += this->indices_builder_.null_count() - null_count_before;
+ return Status::OK();
+ }
+};
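+
+// A sketch of the experimental AppendIndices path, assuming `dict` is an
+// existing std::shared_ptr<Array> of utf8 dictionary values (hypothetical):
+//
+//   Dictionary32Builder<StringType> builder(dict);  // seeds the memo table
+//   const int32_t indices[] = {0, 1, 1, 0};
+//   ARROW_RETURN_NOT_OK(builder.AppendIndices(indices, 4));
+//   std::shared_ptr<Array> out;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&out));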
+
+// ----------------------------------------------------------------------
+// Binary / Unicode builders
+// (compatibility aliases; those used to be derived classes with additional
+// Append() overloads, but they have been folded into DictionaryBuilderBase)
+
+using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
+using StringDictionaryBuilder = DictionaryBuilder<StringType>;
+using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
+using StringDictionary32Builder = Dictionary32Builder<StringType>;
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc
new file mode 100644
index 00000000000..a3bcde0381a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc
@@ -0,0 +1,294 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_nested.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// MapBuilder
+
+MapBuilder::MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
+ std::shared_ptr<ArrayBuilder> const& item_builder,
+ const std::shared_ptr<DataType>& type)
+ : ArrayBuilder(pool), key_builder_(key_builder), item_builder_(item_builder) {
+ auto map_type = internal::checked_cast<const MapType*>(type.get());
+ keys_sorted_ = map_type->keys_sorted();
+
+ std::vector<std::shared_ptr<ArrayBuilder>> child_builders{key_builder, item_builder};
+ auto struct_builder =
+ std::make_shared<StructBuilder>(map_type->value_type(), pool, child_builders);
+
+ list_builder_ =
+ std::make_shared<ListBuilder>(pool, struct_builder, struct_builder->type());
+}
+
+MapBuilder::MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
+ const std::shared_ptr<ArrayBuilder>& item_builder,
+ bool keys_sorted)
+ : MapBuilder(pool, key_builder, item_builder,
+ map(key_builder->type(), item_builder->type(), keys_sorted)) {}
+
+MapBuilder::MapBuilder(MemoryPool* pool,
+ const std::shared_ptr<ArrayBuilder>& struct_builder,
+ const std::shared_ptr<DataType>& type)
+ : ArrayBuilder(pool) {
+ auto map_type = internal::checked_cast<const MapType*>(type.get());
+ keys_sorted_ = map_type->keys_sorted();
+ key_builder_ = struct_builder->child_builder(0);
+ item_builder_ = struct_builder->child_builder(1);
+ list_builder_ =
+ std::make_shared<ListBuilder>(pool, struct_builder, struct_builder->type());
+}
+
+Status MapBuilder::Resize(int64_t capacity) {
+ RETURN_NOT_OK(list_builder_->Resize(capacity));
+ capacity_ = list_builder_->capacity();
+ return Status::OK();
+}
+
+void MapBuilder::Reset() {
+ list_builder_->Reset();
+ ArrayBuilder::Reset();
+}
+
+Status MapBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ ARROW_CHECK_EQ(item_builder_->length(), key_builder_->length())
+ << "keys and items builders don't have the same size in MapBuilder";
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->FinishInternal(out));
+ (*out)->type = type();
+ ArrayBuilder::Reset();
+ return Status::OK();
+}
+
+Status MapBuilder::AppendValues(const int32_t* offsets, int64_t length,
+ const uint8_t* valid_bytes) {
+ DCHECK_EQ(item_builder_->length(), key_builder_->length());
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->AppendValues(offsets, length, valid_bytes));
+ length_ = list_builder_->length();
+ null_count_ = list_builder_->null_count();
+ return Status::OK();
+}
+
+Status MapBuilder::Append() {
+ DCHECK_EQ(item_builder_->length(), key_builder_->length());
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->Append());
+ length_ = list_builder_->length();
+ return Status::OK();
+}
+
+Status MapBuilder::AppendNull() {
+ DCHECK_EQ(item_builder_->length(), key_builder_->length());
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->AppendNull());
+ length_ = list_builder_->length();
+ null_count_ = list_builder_->null_count();
+ return Status::OK();
+}
+
+Status MapBuilder::AppendNulls(int64_t length) {
+ DCHECK_EQ(item_builder_->length(), key_builder_->length());
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->AppendNulls(length));
+ length_ = list_builder_->length();
+ null_count_ = list_builder_->null_count();
+ return Status::OK();
+}
+
+Status MapBuilder::AppendEmptyValue() {
+ DCHECK_EQ(item_builder_->length(), key_builder_->length());
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->AppendEmptyValue());
+ length_ = list_builder_->length();
+ null_count_ = list_builder_->null_count();
+ return Status::OK();
+}
+
+Status MapBuilder::AppendEmptyValues(int64_t length) {
+ DCHECK_EQ(item_builder_->length(), key_builder_->length());
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->AppendEmptyValues(length));
+ length_ = list_builder_->length();
+ null_count_ = list_builder_->null_count();
+ return Status::OK();
+}
+
+Status MapBuilder::AdjustStructBuilderLength() {
+  // If the key/item builders have been appended to, adjust the struct builder
+  // length to match. Struct and key are non-nullable, so append all-valid values.
+ auto struct_builder =
+ internal::checked_cast<StructBuilder*>(list_builder_->value_builder());
+ if (struct_builder->length() < key_builder_->length()) {
+ int64_t length_diff = key_builder_->length() - struct_builder->length();
+ RETURN_NOT_OK(struct_builder->AppendValues(length_diff, NULLPTR));
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// FixedSizeListBuilder
+
+FixedSizeListBuilder::FixedSizeListBuilder(
+ MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& value_builder,
+ const std::shared_ptr<DataType>& type)
+ : ArrayBuilder(pool),
+ value_field_(type->field(0)),
+ list_size_(
+ internal::checked_cast<const FixedSizeListType*>(type.get())->list_size()),
+ value_builder_(value_builder) {}
+
+FixedSizeListBuilder::FixedSizeListBuilder(
+ MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& value_builder,
+ int32_t list_size)
+ : FixedSizeListBuilder(pool, value_builder,
+ fixed_size_list(value_builder->type(), list_size)) {}
+
+void FixedSizeListBuilder::Reset() {
+ ArrayBuilder::Reset();
+ value_builder_->Reset();
+}
+
+Status FixedSizeListBuilder::Append() {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(true);
+ return Status::OK();
+}
+
+Status FixedSizeListBuilder::AppendValues(int64_t length, const uint8_t* valid_bytes) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(valid_bytes, length);
+ return Status::OK();
+}
+
+Status FixedSizeListBuilder::AppendNull() {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(false);
+ return value_builder_->AppendNulls(list_size_);
+}
+
+Status FixedSizeListBuilder::AppendNulls(int64_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, false);
+ return value_builder_->AppendNulls(list_size_ * length);
+}
+
+Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) {
+ auto new_length = value_builder_->length() + new_elements;
+ if (new_elements != list_size_) {
+ return Status::Invalid("Length of item not correct: expected ", list_size_,
+ " but got array of size ", new_elements);
+ }
+ if (new_length > maximum_elements()) {
+ return Status::CapacityError("array cannot contain more than ", maximum_elements(),
+ " elements, have ", new_elements);
+ }
+ return Status::OK();
+}
+
+Status FixedSizeListBuilder::AppendEmptyValue() {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(true);
+ return value_builder_->AppendEmptyValues(list_size_);
+}
+
+Status FixedSizeListBuilder::AppendEmptyValues(int64_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, true);
+ return value_builder_->AppendEmptyValues(list_size_ * length);
+}
+
+Status FixedSizeListBuilder::Resize(int64_t capacity) {
+ RETURN_NOT_OK(CheckCapacity(capacity));
+ return ArrayBuilder::Resize(capacity);
+}
+
+Status FixedSizeListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ std::shared_ptr<ArrayData> items;
+
+ if (value_builder_->length() == 0) {
+ // Try to make sure we get a non-null values buffer (ARROW-2744)
+ RETURN_NOT_OK(value_builder_->Resize(0));
+ }
+ RETURN_NOT_OK(value_builder_->FinishInternal(&items));
+
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ *out = ArrayData::Make(type(), length_, {null_bitmap}, {std::move(items)}, null_count_);
+ Reset();
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Struct
+
+StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
+ std::vector<std::shared_ptr<ArrayBuilder>> field_builders)
+ : ArrayBuilder(pool), type_(type) {
+ children_ = std::move(field_builders);
+}
+
+void StructBuilder::Reset() {
+ ArrayBuilder::Reset();
+ for (const auto& field_builder : children_) {
+ field_builder->Reset();
+ }
+}
+
+Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+
+ std::vector<std::shared_ptr<ArrayData>> child_data(children_.size());
+ for (size_t i = 0; i < children_.size(); ++i) {
+ if (length_ == 0) {
+ // Try to make sure the child buffers are initialized
+ RETURN_NOT_OK(children_[i]->Resize(0));
+ }
+ RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i]));
+ }
+
+ *out = ArrayData::Make(type(), length_, {null_bitmap}, null_count_);
+ (*out)->child_data = std::move(child_data);
+
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+}
+
+std::shared_ptr<DataType> StructBuilder::type() const {
+ DCHECK_EQ(type_->fields().size(), children_.size());
+ std::vector<std::shared_ptr<Field>> fields(children_.size());
+ for (int i = 0; i < static_cast<int>(fields.size()); ++i) {
+ fields[i] = type_->field(i)->WithType(children_[i]->type());
+ }
+ return struct_(std::move(fields));
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h
new file mode 100644
index 00000000000..12b999b786e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h
@@ -0,0 +1,482 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// List builder
+
+template <typename TYPE>
+class BaseListBuilder : public ArrayBuilder {
+ public:
+ using TypeClass = TYPE;
+ using offset_type = typename TypeClass::offset_type;
+
+ /// Use this constructor to incrementally build the value array along with offsets and
+ /// null bitmap.
+ BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
+ const std::shared_ptr<DataType>& type)
+ : ArrayBuilder(pool),
+ offsets_builder_(pool),
+ value_builder_(value_builder),
+ value_field_(type->field(0)->WithType(NULLPTR)) {}
+
+ BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder)
+ : BaseListBuilder(pool, value_builder, list(value_builder->type())) {}
+
+ Status Resize(int64_t capacity) override {
+ if (capacity > maximum_elements()) {
+      return Status::CapacityError("List array cannot reserve space for more than ",
+                                   maximum_elements(), " elements, got ", capacity);
+ }
+ ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+
+ // One more than requested for offsets
+ ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
+ return ArrayBuilder::Resize(capacity);
+ }
+
+ void Reset() override {
+ ArrayBuilder::Reset();
+ offsets_builder_.Reset();
+ value_builder_->Reset();
+ }
+
+ /// \brief Vector append
+ ///
+ /// If passed, valid_bytes is of equal length to values, and any zero byte
+ /// will be considered as a null for that slot
+ Status AppendValues(const offset_type* offsets, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(valid_bytes, length);
+ offsets_builder_.UnsafeAppend(offsets, length);
+ return Status::OK();
+ }
+
+ /// \brief Start a new variable-length list slot
+ ///
+ /// This function should be called before beginning to append elements to the
+ /// value builder
+ Status Append(bool is_valid = true) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(is_valid);
+ return AppendNextOffset();
+ }
+
+ Status AppendNull() final { return Append(false); }
+
+ Status AppendNulls(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ ARROW_RETURN_NOT_OK(ValidateOverflow(0));
+ UnsafeAppendToBitmap(length, false);
+ const int64_t num_values = value_builder_->length();
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
+ }
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final { return Append(true); }
+
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ ARROW_RETURN_NOT_OK(ValidateOverflow(0));
+ UnsafeAppendToBitmap(length, true);
+ const int64_t num_values = value_builder_->length();
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
+ }
+ return Status::OK();
+ }
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+ ARROW_RETURN_NOT_OK(AppendNextOffset());
+
+ // Offset padding zeroed by BufferBuilder
+ std::shared_ptr<Buffer> offsets, null_bitmap;
+ ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+ ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+
+ if (value_builder_->length() == 0) {
+ // Try to make sure we get a non-null values buffer (ARROW-2744)
+ ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
+ }
+
+ std::shared_ptr<ArrayData> items;
+ ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items));
+
+ *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)},
+ null_count_);
+ Reset();
+ return Status::OK();
+ }
+
+ Status ValidateOverflow(int64_t new_elements) const {
+ auto new_length = value_builder_->length() + new_elements;
+ if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
+ return Status::CapacityError("List array cannot contain more than ",
+ maximum_elements(), " elements, have ", new_elements);
+ } else {
+ return Status::OK();
+ }
+ }
+
+ ArrayBuilder* value_builder() const { return value_builder_.get(); }
+
+ // Cannot make this a static attribute because of linking issues
+ static constexpr int64_t maximum_elements() {
+ return std::numeric_limits<offset_type>::max() - 1;
+ }
+
+ std::shared_ptr<DataType> type() const override {
+ return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
+ }
+
+ protected:
+ TypedBufferBuilder<offset_type> offsets_builder_;
+ std::shared_ptr<ArrayBuilder> value_builder_;
+ std::shared_ptr<Field> value_field_;
+
+ Status AppendNextOffset() {
+ ARROW_RETURN_NOT_OK(ValidateOverflow(0));
+ const int64_t num_values = value_builder_->length();
+ return offsets_builder_.Append(static_cast<offset_type>(num_values));
+ }
+};
+
+/// \class ListBuilder
+/// \brief Builder class for variable-length list array value types
+///
+/// To use this class, you must append values to the child array builder and use
+/// the Append function to delimit each distinct list value (once the values
+/// have been appended to the child array) or use the bulk API to append
+/// a sequence of offsets and null values.
+///
+/// A note on types. Per arrow/type.h, all types in the C++ implementation are
+/// logical, so even though this class always builds a list array, it can
+/// represent multiple different logical types. If no logical type is provided
+/// at construction time, the class defaults to List<T> where T is taken from the
+/// value_builder/values that the object is constructed with.
+class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
+ public:
+ using BaseListBuilder::BaseListBuilder;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
+};
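+
+// A usage sketch, assuming int64 list values (illustrative; Int64Builder
+// comes from builder_primitive.h, variable names are hypothetical):
+//
+//   auto pool = default_memory_pool();
+//   auto values = std::make_shared<Int64Builder>(pool);
+//   ListBuilder builder(pool, values);
+//   ARROW_RETURN_NOT_OK(builder.Append());      // start the list [1, 2]
+//   ARROW_RETURN_NOT_OK(values->Append(1));
+//   ARROW_RETURN_NOT_OK(values->Append(2));
+//   ARROW_RETURN_NOT_OK(builder.AppendNull());  // a null slot
+//   ARROW_RETURN_NOT_OK(builder.Append());      // an empty list []
+//   std::shared_ptr<ListArray> out;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&out));  // [[1, 2], null, []]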
+
+/// \class LargeListBuilder
+/// \brief Builder class for large variable-length list array value types
+///
+/// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
+class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
+ public:
+ using BaseListBuilder::BaseListBuilder;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
+};
+
+// ----------------------------------------------------------------------
+// Map builder
+
+/// \class MapBuilder
+/// \brief Builder class for arrays of variable-size maps
+///
+/// To use this class, you must append values to the key and item array builders
+/// and use the Append function to delimit each distinct map (once the keys and items
+/// have been appended) or use the bulk API to append a sequence of offsets and null
+/// maps.
+///
+/// Key uniqueness and ordering are not validated.
+class ARROW_EXPORT MapBuilder : public ArrayBuilder {
+ public:
+ /// Use this constructor to define the built array's type explicitly. If key_builder
+ /// or item_builder has indeterminate type, this builder will also.
+ MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
+ const std::shared_ptr<ArrayBuilder>& item_builder,
+ const std::shared_ptr<DataType>& type);
+
+ /// Use this constructor to infer the built array's type. If key_builder or
+ /// item_builder has indeterminate type, this builder will also.
+ MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
+ const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);
+
+ MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
+ const std::shared_ptr<DataType>& type);
+
+ Status Resize(int64_t capacity) override;
+ void Reset() override;
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); }
+
+ /// \brief Vector append
+ ///
+ /// If passed, valid_bytes is of equal length to values, and any zero byte
+ /// will be considered as a null for that slot
+ Status AppendValues(const int32_t* offsets, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR);
+
+ /// \brief Start a new variable-length map slot
+ ///
+ /// This function should be called before beginning to append elements to the
+ /// key and item builders
+ Status Append();
+
+ Status AppendNull() final;
+
+ Status AppendNulls(int64_t length) final;
+
+ Status AppendEmptyValue() final;
+
+ Status AppendEmptyValues(int64_t length) final;
+
+  /// \brief Get the builder to append keys.
+  ///
+  /// Appending a key with this builder should be followed by appending
+  /// an item or null value with item_builder().
+  ArrayBuilder* key_builder() const { return key_builder_.get(); }
+
+  /// \brief Get the builder to append items
+  ///
+  /// Appending an item with this builder should be preceded
+  /// by appending a key with key_builder().
+  ArrayBuilder* item_builder() const { return item_builder_.get(); }
+
+ /// \brief Get builder to add Map entries as struct values.
+ ///
+ /// This is used instead of key_builder()/item_builder() and allows
+ /// the Map to be built as a list of struct values.
+ ArrayBuilder* value_builder() const { return list_builder_->value_builder(); }
+
+ std::shared_ptr<DataType> type() const override {
+ return map(key_builder_->type(), item_builder_->type(), keys_sorted_);
+ }
+
+ Status ValidateOverflow(int64_t new_elements) {
+ return list_builder_->ValidateOverflow(new_elements);
+ }
+
+ protected:
+ inline Status AdjustStructBuilderLength();
+
+ protected:
+ bool keys_sorted_ = false;
+ std::shared_ptr<ListBuilder> list_builder_;
+ std::shared_ptr<ArrayBuilder> key_builder_;
+ std::shared_ptr<ArrayBuilder> item_builder_;
+};
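+
+// A usage sketch (illustrative; StringBuilder and Int32Builder come from
+// sibling builder headers, variable names are hypothetical):
+//
+//   auto pool = default_memory_pool();
+//   auto keys = std::make_shared<StringBuilder>(pool);
+//   auto items = std::make_shared<Int32Builder>(pool);
+//   MapBuilder builder(pool, keys, items);
+//   ARROW_RETURN_NOT_OK(builder.Append());  // start the map {"a": 1, "b": 2}
+//   ARROW_RETURN_NOT_OK(keys->Append("a"));
+//   ARROW_RETURN_NOT_OK(items->Append(1));
+//   ARROW_RETURN_NOT_OK(keys->Append("b"));
+//   ARROW_RETURN_NOT_OK(items->Append(2));
+//   std::shared_ptr<MapArray> out;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&out));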
+
+// ----------------------------------------------------------------------
+// FixedSizeList builder
+
+/// \class FixedSizeListBuilder
+/// \brief Builder class for fixed-length list array value types
+class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
+ public:
+ /// Use this constructor to define the built array's type explicitly. If value_builder
+ /// has indeterminate type, this builder will also.
+ FixedSizeListBuilder(MemoryPool* pool,
+ std::shared_ptr<ArrayBuilder> const& value_builder,
+ int32_t list_size);
+
+ /// Use this constructor to infer the built array's type. If value_builder has
+ /// indeterminate type, this builder will also.
+ FixedSizeListBuilder(MemoryPool* pool,
+ std::shared_ptr<ArrayBuilder> const& value_builder,
+ const std::shared_ptr<DataType>& type);
+
+ Status Resize(int64_t capacity) override;
+ void Reset() override;
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); }
+
+ /// \brief Append a valid fixed length list.
+ ///
+ /// This function affects only the validity bitmap; the child values must be appended
+ /// using the child array builder.
+ Status Append();
+
+ /// \brief Vector append
+ ///
+  /// If passed, valid_bytes will be read and any zero byte
+  /// will cause the corresponding slot to be null
+ ///
+ /// This function affects only the validity bitmap; the child values must be appended
+ /// using the child array builder. This includes appending nulls for null lists.
+ /// XXX this restriction is confusing, should this method be omitted?
+ Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR);
+
+ /// \brief Append a null fixed length list.
+ ///
+ /// The child array builder will have the appropriate number of nulls appended
+ /// automatically.
+ Status AppendNull() final;
+
+ /// \brief Append length null fixed length lists.
+ ///
+ /// The child array builder will have the appropriate number of nulls appended
+ /// automatically.
+ Status AppendNulls(int64_t length) final;
+
+ Status ValidateOverflow(int64_t new_elements);
+
+ Status AppendEmptyValue() final;
+
+ Status AppendEmptyValues(int64_t length) final;
+
+ ArrayBuilder* value_builder() const { return value_builder_.get(); }
+
+ std::shared_ptr<DataType> type() const override {
+ return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
+ }
+
+ // Cannot make this a static attribute because of linking issues
+ static constexpr int64_t maximum_elements() {
+ return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
+ }
+
+ protected:
+ std::shared_ptr<Field> value_field_;
+ const int32_t list_size_;
+ std::shared_ptr<ArrayBuilder> value_builder_;
+};
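+
+// A usage sketch for fixed-size lists of two floats (illustrative;
+// FloatBuilder comes from builder_primitive.h, names are hypothetical):
+//
+//   auto pool = default_memory_pool();
+//   auto values = std::make_shared<FloatBuilder>(pool);
+//   FixedSizeListBuilder builder(pool, values, /*list_size=*/2);
+//   ARROW_RETURN_NOT_OK(builder.Append());      // slot [1.0, 2.0]
+//   ARROW_RETURN_NOT_OK(values->Append(1.0f));
+//   ARROW_RETURN_NOT_OK(values->Append(2.0f));
+//   ARROW_RETURN_NOT_OK(builder.AppendNull());  // child nulls added automatically
+//   std::shared_ptr<FixedSizeListArray> out;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&out));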
+
+// ----------------------------------------------------------------------
+// StructArray builder
+
+/// The Append, Resize and Reserve methods act on the StructBuilder itself.
+/// Please make sure the corresponding methods of all child builders are
+/// called consistently to maintain data-structure consistency.
+class ARROW_EXPORT StructBuilder : public ArrayBuilder {
+ public:
+ /// If any of field_builders has indeterminate type, this builder will also
+ StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
+ std::vector<std::shared_ptr<ArrayBuilder>> field_builders);
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); }
+
+  /// The null bitmap is of equal length to every child field, and any zero
+  /// byte will be considered as a null for that field. However, users must
+  /// independently use the append or advance methods of the child builders
+  /// to insert data.
+ Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(valid_bytes, length);
+ return Status::OK();
+ }
+
+ /// Append an element to the Struct. All child-builders' Append method must
+ /// be called independently to maintain data-structure consistency.
+ Status Append(bool is_valid = true) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(is_valid);
+ return Status::OK();
+ }
+
+ /// \brief Append a null value. Automatically appends an empty value to each child
+ /// builder.
+ Status AppendNull() final {
+ for (const auto& field : children_) {
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
+ }
+ return Append(false);
+ }
+
+ /// \brief Append multiple null values. Automatically appends empty values to each
+ /// child builder.
+ Status AppendNulls(int64_t length) final {
+ for (const auto& field : children_) {
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
+ }
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, false);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
+ for (const auto& field : children_) {
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
+ }
+ return Append(true);
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ for (const auto& field : children_) {
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
+ }
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, true);
+ return Status::OK();
+ }
+
+ void Reset() override;
+
+ ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
+
+ int num_fields() const { return static_cast<int>(children_.size()); }
+
+ std::shared_ptr<DataType> type() const override;
+
+ private:
+ std::shared_ptr<DataType> type_;
+};
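+
+// A usage sketch for a struct<x: int32, y: bool> (illustrative; Int32Builder
+// and BooleanBuilder come from builder_primitive.h, names are hypothetical):
+//
+//   auto pool = default_memory_pool();
+//   auto x = std::make_shared<Int32Builder>(pool);
+//   auto y = std::make_shared<BooleanBuilder>(pool);
+//   StructBuilder builder(struct_({field("x", int32()), field("y", boolean())}),
+//                         pool, {x, y});
+//   ARROW_RETURN_NOT_OK(x->Append(42));
+//   ARROW_RETURN_NOT_OK(y->Append(true));
+//   ARROW_RETURN_NOT_OK(builder.Append());  // one valid slot {x: 42, y: true}
+//   std::shared_ptr<StructArray> out;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&out));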
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc
new file mode 100644
index 00000000000..e403c42411d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc
@@ -0,0 +1,137 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_primitive.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/int_util.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Null builder
+
+Status NullBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ *out = ArrayData::Make(null(), length_, {nullptr}, length_);
+ length_ = null_count_ = 0;
+ return Status::OK();
+}
+
+BooleanBuilder::BooleanBuilder(MemoryPool* pool)
+ : ArrayBuilder(pool), data_builder_(pool) {}
+
+BooleanBuilder::BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+ : BooleanBuilder(pool) {
+ ARROW_CHECK_EQ(Type::BOOL, type->id());
+}
+
+void BooleanBuilder::Reset() {
+ ArrayBuilder::Reset();
+ data_builder_.Reset();
+}
+
+Status BooleanBuilder::Resize(int64_t capacity) {
+ RETURN_NOT_OK(CheckCapacity(capacity));
+ capacity = std::max(capacity, kMinBuilderCapacity);
+ RETURN_NOT_OK(data_builder_.Resize(capacity));
+ return ArrayBuilder::Resize(capacity);
+}
+
+Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_));
+ ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
+
+ *out = ArrayData::Make(boolean(), length_, {null_bitmap, data}, null_count_);
+
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+}
+
+Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
+ const uint8_t* valid_bytes) {
+ RETURN_NOT_OK(Reserve(length));
+
+ int64_t i = 0;
+ data_builder_.UnsafeAppend<false>(length,
+ [values, &i]() -> bool { return values[i++] != 0; });
+ ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
+ return Status::OK();
+}
+
+Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
+ const std::vector<bool>& is_valid) {
+ RETURN_NOT_OK(Reserve(length));
+ DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
+ int64_t i = 0;
+ data_builder_.UnsafeAppend<false>(length,
+ [values, &i]() -> bool { return values[i++]; });
+ ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+ return Status::OK();
+}
+
+Status BooleanBuilder::AppendValues(const std::vector<uint8_t>& values,
+ const std::vector<bool>& is_valid) {
+ return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
+}
+
+Status BooleanBuilder::AppendValues(const std::vector<uint8_t>& values) {
+ return AppendValues(values.data(), static_cast<int64_t>(values.size()));
+}
+
+Status BooleanBuilder::AppendValues(const std::vector<bool>& values,
+ const std::vector<bool>& is_valid) {
+ const int64_t length = static_cast<int64_t>(values.size());
+ RETURN_NOT_OK(Reserve(length));
+ DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
+ int64_t i = 0;
+ data_builder_.UnsafeAppend<false>(length,
+ [&values, &i]() -> bool { return values[i++]; });
+ ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+ return Status::OK();
+}
+
+Status BooleanBuilder::AppendValues(const std::vector<bool>& values) {
+ const int64_t length = static_cast<int64_t>(values.size());
+ RETURN_NOT_OK(Reserve(length));
+ int64_t i = 0;
+ data_builder_.UnsafeAppend<false>(length,
+ [&values, &i]() -> bool { return values[i++]; });
+ ArrayBuilder::UnsafeSetNotNull(length);
+ return Status::OK();
+}
+
+Status BooleanBuilder::AppendValues(int64_t length, bool value) {
+ RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(length, value);
+ ArrayBuilder::UnsafeSetNotNull(length);
+ return Status::OK();
+}
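+
+// A usage sketch for the bulk appenders above (illustrative only; variable
+// names are hypothetical):
+//
+//   BooleanBuilder builder;
+//   std::vector<uint8_t> values{1, 0, 1};
+//   ARROW_RETURN_NOT_OK(builder.AppendValues(values, {true, false, true}));
+//   std::shared_ptr<Array> out;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&out));  // [true, null, true]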
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h
new file mode 100644
index 00000000000..80cfc4061bb
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h
@@ -0,0 +1,479 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+
+namespace arrow {
+
+class ARROW_EXPORT NullBuilder : public ArrayBuilder {
+ public:
+ explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {}
+ explicit NullBuilder(const std::shared_ptr<DataType>& /*type*/,
+ MemoryPool* pool = default_memory_pool())
+ : NullBuilder(pool) {}
+
+ /// \brief Append the specified number of null elements
+ Status AppendNulls(int64_t length) final {
+    if (length < 0) return Status::Invalid("length must be non-negative");
+ null_count_ += length;
+ length_ += length;
+ return Status::OK();
+ }
+
+ /// \brief Append a single null element
+ Status AppendNull() final { return AppendNulls(1); }
+
+ Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }
+
+ Status AppendEmptyValue() final { return AppendEmptyValues(1); }
+
+ Status Append(std::nullptr_t) { return AppendNull(); }
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ std::shared_ptr<DataType> type() const override { return null(); }
+
+ Status Finish(std::shared_ptr<NullArray>* out) { return FinishTyped(out); }
+};
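+
+// A minimal usage sketch (illustrative only):
+//
+//   NullBuilder builder;
+//   ARROW_RETURN_NOT_OK(builder.AppendNulls(3));
+//   std::shared_ptr<NullArray> out;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&out));  // a length-3 all-null array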
+
+/// Base class for all Builders that emit an Array of a scalar numerical type.
+template <typename T>
+class NumericBuilder : public ArrayBuilder {
+ public:
+ using TypeClass = T;
+ using value_type = typename T::c_type;
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+
+ template <typename T1 = T>
+ explicit NumericBuilder(
+ enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
+ : ArrayBuilder(pool), type_(TypeTraits<T>::type_singleton()), data_builder_(pool) {}
+
+ NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+ : ArrayBuilder(pool), type_(type), data_builder_(pool) {}
+
+ /// Append a single scalar and increase the size if necessary.
+ Status Append(const value_type val) {
+ ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
+ UnsafeAppend(val);
+ return Status::OK();
+ }
+
+  /// \brief Append the specified number of null elements
+  ///
+  /// The memory at the corresponding data slots is set to 0 to prevent
+  /// uninitialized memory access
+ Status AppendNulls(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(length, value_type{}); // zero
+ UnsafeSetNull(length);
+ return Status::OK();
+ }
+
+ /// \brief Append a single null element
+ Status AppendNull() final {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ data_builder_.UnsafeAppend(value_type{}); // zero
+ UnsafeAppendToBitmap(false);
+ return Status::OK();
+ }
+
+  /// \brief Append an empty element
+ Status AppendEmptyValue() final {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ data_builder_.UnsafeAppend(value_type{}); // zero
+ UnsafeAppendToBitmap(true);
+ return Status::OK();
+ }
+
+ /// \brief Append several empty elements
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(length, value_type{}); // zero
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
+ value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }
+
+ void Reset() override { data_builder_.Reset(); }
+
+ Status Resize(int64_t capacity) override {
+ ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+ capacity = std::max(capacity, kMinBuilderCapacity);
+ ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
+ return ArrayBuilder::Resize(capacity);
+ }
+
+ value_type operator[](int64_t index) const { return GetValue(index); }
+
+ value_type& operator[](int64_t index) {
+ return reinterpret_cast<value_type*>(data_builder_.mutable_data())[index];
+ }
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous C array of values
+ /// \param[in] length the number of values to append
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value
+ /// \return Status
+ Status AppendValues(const value_type* values, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(values, length);
+    // length_ is updated by this call
+ ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
+ return Status::OK();
+ }
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous C array of values
+ /// \param[in] length the number of values to append
+  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
+ /// (0). Equal in length to values
+ /// \return Status
+ Status AppendValues(const value_type* values, int64_t length,
+ const std::vector<bool>& is_valid) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(values, length);
+    // length_ is updated by this call
+ ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+ return Status::OK();
+ }
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a std::vector of values
+  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
+ /// (0). Equal in length to values
+ /// \return Status
+ Status AppendValues(const std::vector<value_type>& values,
+ const std::vector<bool>& is_valid) {
+ return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
+ }
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a std::vector of values
+ /// \return Status
+ Status AppendValues(const std::vector<value_type>& values) {
+ return AppendValues(values.data(), static_cast<int64_t>(values.size()));
+ }
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
+ null_bitmap_builder_.FinishWithLength(length_));
+ ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
+ *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+ }
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<ArrayType>* out) { return FinishTyped(out); }
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values_begin InputIterator to the beginning of the values
+ /// \param[in] values_end InputIterator pointing to the end of the values
+ /// \return Status
+ template <typename ValuesIter>
+ Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
+ int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(values_begin, values_end);
+    // this updates length_
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
+ /// \brief Append a sequence of elements in one shot, with a specified nullmap
+ /// \param[in] values_begin InputIterator to the beginning of the values
+ /// \param[in] values_end InputIterator pointing to the end of the values
+  /// \param[in] valid_begin InputIterator with elements indicating valid (1)
+  ///  or null (0) values.
+ /// \return Status
+ template <typename ValuesIter, typename ValidIter>
+ enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
+ ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
+ static_assert(!internal::is_null_pointer<ValidIter>::value,
+ "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
+ "version instead");
+ int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(values_begin, values_end);
+ null_bitmap_builder_.UnsafeAppend<true>(
+ length, [&valid_begin]() -> bool { return *valid_begin++; });
+ length_ = null_bitmap_builder_.length();
+ null_count_ = null_bitmap_builder_.false_count();
+ return Status::OK();
+ }
+
+ // Same as above, with a pointer type ValidIter
+ template <typename ValuesIter, typename ValidIter>
+ enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
+ ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
+ int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(values_begin, values_end);
+    // this updates length_
+ if (valid_begin == NULLPTR) {
+ UnsafeSetNotNull(length);
+ } else {
+ null_bitmap_builder_.UnsafeAppend<true>(
+ length, [&valid_begin]() -> bool { return *valid_begin++; });
+ length_ = null_bitmap_builder_.length();
+ null_count_ = null_bitmap_builder_.false_count();
+ }
+
+ return Status::OK();
+ }
+
+ /// Append a single scalar under the assumption that the underlying Buffer is
+ /// large enough.
+ ///
+ /// This method does not capacity-check; make sure to call Reserve
+ /// beforehand.
+ void UnsafeAppend(const value_type val) {
+ ArrayBuilder::UnsafeAppendToBitmap(true);
+ data_builder_.UnsafeAppend(val);
+ }
+
+ void UnsafeAppendNull() {
+ ArrayBuilder::UnsafeAppendToBitmap(false);
+ data_builder_.UnsafeAppend(value_type{}); // zero
+ }
+
+ std::shared_ptr<DataType> type() const override { return type_; }
+
+ protected:
+ std::shared_ptr<DataType> type_;
+ TypedBufferBuilder<value_type> data_builder_;
+};
+
+// Builders
+
+using UInt8Builder = NumericBuilder<UInt8Type>;
+using UInt16Builder = NumericBuilder<UInt16Type>;
+using UInt32Builder = NumericBuilder<UInt32Type>;
+using UInt64Builder = NumericBuilder<UInt64Type>;
+
+using Int8Builder = NumericBuilder<Int8Type>;
+using Int16Builder = NumericBuilder<Int16Type>;
+using Int32Builder = NumericBuilder<Int32Type>;
+using Int64Builder = NumericBuilder<Int64Type>;
+
+using HalfFloatBuilder = NumericBuilder<HalfFloatType>;
+using FloatBuilder = NumericBuilder<FloatType>;
+using DoubleBuilder = NumericBuilder<DoubleType>;
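+
+// Editorial usage sketch, relying only on the NumericBuilder API above:
+// building an Int64Array with a mix of valid and null slots.
+//
+//   Int64Builder builder;
+//   ARROW_RETURN_NOT_OK(builder.Append(1));
+//   ARROW_RETURN_NOT_OK(builder.AppendNull());
+//   ARROW_RETURN_NOT_OK(builder.AppendValues({2, 3, 4}));
+//   std::shared_ptr<Int64Array> array;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&array));  // length 5, null_count 1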
+
+class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
+ public:
+ using TypeClass = BooleanType;
+ using value_type = bool;
+
+ explicit BooleanBuilder(MemoryPool* pool = default_memory_pool());
+
+ BooleanBuilder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool = default_memory_pool());
+
+  /// \brief Append the given number of null elements
+ Status AppendNulls(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(length, false);
+ UnsafeSetNull(length);
+ return Status::OK();
+ }
+
+ Status AppendNull() final {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendNull();
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ data_builder_.UnsafeAppend(false);
+ UnsafeSetNotNull(1);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(length, false);
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
+ /// Scalar append
+ Status Append(const bool val) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(val);
+ return Status::OK();
+ }
+
+ Status Append(const uint8_t val) { return Append(val != 0); }
+
+ /// Scalar append, without checking for capacity
+ void UnsafeAppend(const bool val) {
+ data_builder_.UnsafeAppend(val);
+ UnsafeAppendToBitmap(true);
+ }
+
+ void UnsafeAppendNull() {
+ data_builder_.UnsafeAppend(false);
+ UnsafeAppendToBitmap(false);
+ }
+
+ void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous array of bytes (non-zero is 1)
+ /// \param[in] length the number of values to append
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value
+ /// \return Status
+ Status AppendValues(const uint8_t* values, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR);
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a contiguous C array of values
+ /// \param[in] length the number of values to append
+  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
+ /// (0). Equal in length to values
+ /// \return Status
+ Status AppendValues(const uint8_t* values, int64_t length,
+ const std::vector<bool>& is_valid);
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a std::vector of bytes
+  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
+ /// (0). Equal in length to values
+ /// \return Status
+ Status AppendValues(const std::vector<uint8_t>& values,
+ const std::vector<bool>& is_valid);
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values a std::vector of bytes
+ /// \return Status
+ Status AppendValues(const std::vector<uint8_t>& values);
+
+ /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a std::vector<bool> indicating true (1) or false
+  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
+ /// (0). Equal in length to values
+ /// \return Status
+ Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);
+
+ /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a std::vector<bool> indicating true (1) or false
+ /// \return Status
+ Status AppendValues(const std::vector<bool>& values);
+
+ /// \brief Append a sequence of elements in one shot
+ /// \param[in] values_begin InputIterator to the beginning of the values
+  /// \param[in] values_end InputIterator pointing to the end of the values
+ /// \return Status
+ template <typename ValuesIter>
+ Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
+ int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend<false>(
+ length, [&values_begin]() -> bool { return *values_begin++; });
+ // this updates length_
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
+ /// \brief Append a sequence of elements in one shot, with a specified nullmap
+ /// \param[in] values_begin InputIterator to the beginning of the values
+ /// \param[in] values_end InputIterator pointing to the end of the values
+  /// \param[in] valid_begin InputIterator with elements indicating valid (1)
+  /// or null (0) values
+ /// \return Status
+ template <typename ValuesIter, typename ValidIter>
+ enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
+ ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
+ static_assert(!internal::is_null_pointer<ValidIter>::value,
+ "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
+ "version instead");
+ int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+ ARROW_RETURN_NOT_OK(Reserve(length));
+
+ data_builder_.UnsafeAppend<false>(
+ length, [&values_begin]() -> bool { return *values_begin++; });
+ null_bitmap_builder_.UnsafeAppend<true>(
+ length, [&valid_begin]() -> bool { return *valid_begin++; });
+ length_ = null_bitmap_builder_.length();
+ null_count_ = null_bitmap_builder_.false_count();
+ return Status::OK();
+ }
+
+ // Same as above, for a pointer type ValidIter
+ template <typename ValuesIter, typename ValidIter>
+ enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
+ ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
+ int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend<false>(
+ length, [&values_begin]() -> bool { return *values_begin++; });
+
+ if (valid_begin == NULLPTR) {
+ UnsafeSetNotNull(length);
+ } else {
+ null_bitmap_builder_.UnsafeAppend<true>(
+ length, [&valid_begin]() -> bool { return *valid_begin++; });
+ }
+ length_ = null_bitmap_builder_.length();
+ null_count_ = null_bitmap_builder_.false_count();
+ return Status::OK();
+ }
+
+ Status AppendValues(int64_t length, bool value);
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<BooleanArray>* out) { return FinishTyped(out); }
+
+ void Reset() override;
+ Status Resize(int64_t capacity) override;
+
+ std::shared_ptr<DataType> type() const override { return boolean(); }
+
+ protected:
+ TypedBufferBuilder<bool> data_builder_;
+};
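+
+// Editorial usage sketch: appending booleans with a parallel validity vector
+// via the std::vector overload declared above.
+//
+//   BooleanBuilder builder;
+//   std::vector<bool> values = {true, false, true};
+//   std::vector<bool> is_valid = {true, true, false};
+//   ARROW_RETURN_NOT_OK(builder.AppendValues(values, is_valid));
+//   std::shared_ptr<BooleanArray> out;
+//   ARROW_RETURN_NOT_OK(builder.Finish(&out));  // third slot is null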
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_time.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_time.h
new file mode 100644
index 00000000000..ccd11c22345
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_time.h
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Contains declarations of time related Arrow builder types.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_primitive.h"
+
+namespace arrow {
+
+// TODO(ARROW-7938): this class is untested
+
+class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
+ public:
+ using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;
+
+ explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool())
+ : DayTimeIntervalBuilder(day_time_interval(), pool) {}
+
+ explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
+ MemoryPool* pool = default_memory_pool())
+ : NumericBuilder<DayTimeIntervalType>(type, pool) {}
+};
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc
new file mode 100644
index 00000000000..8617cb73fce
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_union.h"
+
+#include <cstddef>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+Status BasicUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ int64_t length = types_builder_.length();
+
+ std::shared_ptr<Buffer> types;
+ RETURN_NOT_OK(types_builder_.Finish(&types));
+
+ std::vector<std::shared_ptr<ArrayData>> child_data(children_.size());
+ for (size_t i = 0; i < children_.size(); ++i) {
+ RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i]));
+ }
+
+ *out = ArrayData::Make(type(), length, {nullptr, types}, /*null_count=*/0);
+ (*out)->child_data = std::move(child_data);
+ return Status::OK();
+}
+
+Status DenseUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out));
+ (*out)->buffers.resize(3);
+ ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&(*out)->buffers[2]));
+ return Status::OK();
+}
+
+BasicUnionBuilder::BasicUnionBuilder(
+ MemoryPool* pool, const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+ const std::shared_ptr<DataType>& type)
+ : ArrayBuilder(pool), child_fields_(children.size()), types_builder_(pool) {
+ const auto& union_type = checked_cast<const UnionType&>(*type);
+ mode_ = union_type.mode();
+
+ DCHECK_EQ(children.size(), union_type.type_codes().size());
+
+ type_codes_ = union_type.type_codes();
+ children_ = children;
+
+ type_id_to_children_.resize(union_type.max_type_code() + 1, nullptr);
+ DCHECK_LE(
+ type_id_to_children_.size() - 1,
+ static_cast<decltype(type_id_to_children_)::size_type>(UnionType::kMaxTypeCode));
+
+ for (size_t i = 0; i < children.size(); ++i) {
+ child_fields_[i] = union_type.field(static_cast<int>(i));
+
+ auto type_id = union_type.type_codes()[i];
+ type_id_to_children_[type_id] = children[i].get();
+ }
+}
+
+int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
+ const std::string& field_name) {
+ children_.push_back(new_child);
+ auto new_type_id = NextTypeId();
+
+ type_id_to_children_[new_type_id] = new_child.get();
+ child_fields_.push_back(field(field_name, nullptr));
+ type_codes_.push_back(static_cast<int8_t>(new_type_id));
+
+ return new_type_id;
+}
+
+std::shared_ptr<DataType> BasicUnionBuilder::type() const {
+ std::vector<std::shared_ptr<Field>> child_fields(child_fields_.size());
+ for (size_t i = 0; i < child_fields.size(); ++i) {
+ child_fields[i] = child_fields_[i]->WithType(children_[i]->type());
+ }
+ return mode_ == UnionMode::SPARSE ? sparse_union(std::move(child_fields), type_codes_)
+ : dense_union(std::move(child_fields), type_codes_);
+}
+
+int8_t BasicUnionBuilder::NextTypeId() {
+ // Find type_id such that type_id_to_children_[type_id] == nullptr
+ // and use that for the new child. Start searching at dense_type_id_
+  // since type_id_to_children_ is densely packed at least up to dense_type_id_
+ for (; static_cast<size_t>(dense_type_id_) < type_id_to_children_.size();
+ ++dense_type_id_) {
+ if (type_id_to_children_[dense_type_id_] == nullptr) {
+ return dense_type_id_++;
+ }
+ }
+
+ DCHECK_LT(
+ type_id_to_children_.size(),
+ static_cast<decltype(type_id_to_children_)::size_type>(UnionType::kMaxTypeCode));
+
+ // type_id_to_children_ is already densely packed, so just append the new child
+ type_id_to_children_.resize(type_id_to_children_.size() + 1);
+ return dense_type_id_++;
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h
new file mode 100644
index 00000000000..060be474fb8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h
@@ -0,0 +1,235 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/data.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief Base class for union array builders.
+///
+/// Note that although we subclass ArrayBuilder, union types do not have a
+/// validity bitmap, so the bitmap builder member of ArrayBuilder is not used.
+class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
+ public:
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
+
+ /// \brief Make a new child builder available to the UnionArray
+ ///
+ /// \param[in] new_child the child builder
+ /// \param[in] field_name the name of the field in the union array type
+ /// if type inference is used
+  /// \return the type code for the new child, which is the "type" argument
+  /// that must be passed to the "Append" method to add a new element to
+  /// the union array.
+ int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
+ const std::string& field_name = "");
+
+ std::shared_ptr<DataType> type() const override;
+
+ int64_t length() const override { return types_builder_.length(); }
+
+ protected:
+ BasicUnionBuilder(MemoryPool* pool,
+ const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+ const std::shared_ptr<DataType>& type);
+
+ int8_t NextTypeId();
+
+ std::vector<std::shared_ptr<Field>> child_fields_;
+ std::vector<int8_t> type_codes_;
+ UnionMode::type mode_;
+
+ std::vector<ArrayBuilder*> type_id_to_children_;
+ // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr
+ int8_t dense_type_id_ = 0;
+ TypedBufferBuilder<int8_t> types_builder_;
+};
+
+/// \class DenseUnionBuilder
+///
+/// This API is EXPERIMENTAL.
+class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
+ public:
+ /// Use this constructor to initialize the UnionBuilder with no child builders,
+ /// allowing type to be inferred. You will need to call AppendChild for each of the
+  /// child builders you want to use.
+ explicit DenseUnionBuilder(MemoryPool* pool)
+ : BasicUnionBuilder(pool, {}, dense_union(FieldVector{})), offsets_builder_(pool) {}
+
+ /// Use this constructor to specify the type explicitly.
+ /// You can still add child builders to the union after using this constructor
+ DenseUnionBuilder(MemoryPool* pool,
+ const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+ const std::shared_ptr<DataType>& type)
+ : BasicUnionBuilder(pool, children, type), offsets_builder_(pool) {}
+
+ Status AppendNull() final {
+ const int8_t first_child_code = type_codes_[0];
+ ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
+ ARROW_RETURN_NOT_OK(
+ offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
+ // Append a null arbitrarily to the first child
+ return child_builder->AppendNull();
+ }
+
+ Status AppendNulls(int64_t length) final {
+ const int8_t first_child_code = type_codes_[0];
+ ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
+ ARROW_RETURN_NOT_OK(
+ offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
+    // Append only a single null to the first child; all appended slots share it via offsets
+ return child_builder->AppendNull();
+ }
+
+ Status AppendEmptyValue() final {
+ const int8_t first_child_code = type_codes_[0];
+ ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
+ ARROW_RETURN_NOT_OK(
+ offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
+ // Append an empty value arbitrarily to the first child
+ return child_builder->AppendEmptyValue();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ const int8_t first_child_code = type_codes_[0];
+ ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
+ ARROW_RETURN_NOT_OK(
+ offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
+    // Append only a single empty value to the first child; all appended slots share it via offsets
+ return child_builder->AppendEmptyValue();
+ }
+
+ /// \brief Append an element to the UnionArray. This must be followed
+ /// by an append to the appropriate child builder.
+ ///
+ /// \param[in] next_type type_id of the child to which the next value will be appended.
+ ///
+ /// The corresponding child builder must be appended to independently after this method
+ /// is called.
+ Status Append(int8_t next_type) {
+ ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
+ if (type_id_to_children_[next_type]->length() == kListMaximumElements) {
+ return Status::CapacityError(
+ "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
+ "child");
+ }
+ auto offset = static_cast<int32_t>(type_id_to_children_[next_type]->length());
+ return offsets_builder_.Append(offset);
+ }
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ private:
+ TypedBufferBuilder<int32_t> offsets_builder_;
+};
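+
+// Editorial sketch of the dense-union call pattern (names are illustrative).
+// Each Append(type_code) records a type code plus an offset into the selected
+// child, which must then be appended to independently:
+//
+//   DenseUnionBuilder builder(default_memory_pool());
+//   auto ints = std::make_shared<Int32Builder>();
+//   int8_t int_code = builder.AppendChild(ints, "ints");
+//   ARROW_RETURN_NOT_OK(builder.Append(int_code));  // select the child...
+//   ARROW_RETURN_NOT_OK(ints->Append(42));          // ...then append the value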
+
+/// \class SparseUnionBuilder
+///
+/// This API is EXPERIMENTAL.
+class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
+ public:
+ /// Use this constructor to initialize the UnionBuilder with no child builders,
+ /// allowing type to be inferred. You will need to call AppendChild for each of the
+  /// child builders you want to use.
+ explicit SparseUnionBuilder(MemoryPool* pool)
+ : BasicUnionBuilder(pool, {}, sparse_union(FieldVector{})) {}
+
+ /// Use this constructor to specify the type explicitly.
+ /// You can still add child builders to the union after using this constructor
+ SparseUnionBuilder(MemoryPool* pool,
+ const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+ const std::shared_ptr<DataType>& type)
+ : BasicUnionBuilder(pool, children, type) {}
+
+ /// \brief Append a null value.
+ ///
+ /// A null is appended to the first child, empty values to the other children.
+ Status AppendNull() final {
+ const auto first_child_code = type_codes_[0];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
+ ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull());
+ for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
+ ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue());
+ }
+ return Status::OK();
+ }
+
+ /// \brief Append multiple null values.
+ ///
+ /// Nulls are appended to the first child, empty values to the other children.
+ Status AppendNulls(int64_t length) final {
+ const auto first_child_code = type_codes_[0];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
+ ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length));
+ for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
+ ARROW_RETURN_NOT_OK(
+ type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length));
+ }
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
+ ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
+ for (int8_t code : type_codes_) {
+ ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue());
+ }
+ return Status::OK();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
+ for (int8_t code : type_codes_) {
+ ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length));
+ }
+ return Status::OK();
+ }
+
+ /// \brief Append an element to the UnionArray. This must be followed
+ /// by an append to the appropriate child builder.
+ ///
+ /// \param[in] next_type type_id of the child to which the next value will be appended.
+ ///
+  /// The corresponding child builder must be appended to independently after this method
+  /// is called, and every other child builder must have a null or empty value appended.
+ Status Append(int8_t next_type) { return types_builder_.Append(next_type); }
+};
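+
+// Editorial sketch (illustrative names): in a sparse union every child must
+// grow in lockstep, so after selecting one child the others receive nulls or
+// empty values:
+//
+//   SparseUnionBuilder builder(default_memory_pool());
+//   auto ints = std::make_shared<Int32Builder>();
+//   auto floats = std::make_shared<FloatBuilder>();
+//   int8_t int_code = builder.AppendChild(ints, "i");
+//   int8_t float_code = builder.AppendChild(floats, "f");
+//   ARROW_RETURN_NOT_OK(builder.Append(int_code));
+//   ARROW_RETURN_NOT_OK(ints->Append(7));
+//   ARROW_RETURN_NOT_OK(floats->AppendNull());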
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc
new file mode 100644
index 00000000000..32478783394
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc
@@ -0,0 +1,490 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/concatenate.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/array/util.h"
+#include "arrow/buffer.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/int_util.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::SafeSignedAdd;
+
+namespace {
+/// offset, length pair for representing a Range of a buffer or array
+struct Range {
+ int64_t offset = -1, length = 0;
+
+ Range() = default;
+ Range(int64_t o, int64_t l) : offset(o), length(l) {}
+};
+
+/// non-owning view into a range of bits
+struct Bitmap {
+ Bitmap() = default;
+ Bitmap(const uint8_t* d, Range r) : data(d), range(r) {}
+ explicit Bitmap(const std::shared_ptr<Buffer>& buffer, Range r)
+ : Bitmap(buffer ? buffer->data() : nullptr, r) {}
+
+ const uint8_t* data = nullptr;
+ Range range;
+
+ bool AllSet() const { return data == nullptr; }
+};
+
+// Allocate a buffer and concatenate bitmaps into it.
+Status ConcatenateBitmaps(const std::vector<Bitmap>& bitmaps, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out) {
+ int64_t out_length = 0;
+ for (const auto& bitmap : bitmaps) {
+ if (internal::AddWithOverflow(out_length, bitmap.range.length, &out_length)) {
+ return Status::Invalid("Length overflow when concatenating arrays");
+ }
+ }
+ ARROW_ASSIGN_OR_RAISE(*out, AllocateBitmap(out_length, pool));
+ uint8_t* dst = (*out)->mutable_data();
+
+ int64_t bitmap_offset = 0;
+ for (auto bitmap : bitmaps) {
+ if (bitmap.AllSet()) {
+ BitUtil::SetBitsTo(dst, bitmap_offset, bitmap.range.length, true);
+ } else {
+ internal::CopyBitmap(bitmap.data, bitmap.range.offset, bitmap.range.length, dst,
+ bitmap_offset);
+ }
+ bitmap_offset += bitmap.range.length;
+ }
+
+ return Status::OK();
+}
+
+// Write offsets in src into dst, adjusting them such that first_offset
+// will be the first offset written.
+template <typename Offset>
+Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offset* dst,
+ Range* values_range);
+
+// Concatenate buffers holding offsets into a single buffer of offsets,
+// also computing the ranges of values spanned by each buffer of offsets.
+template <typename Offset>
+Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out,
+ std::vector<Range>* values_ranges) {
+ values_ranges->resize(buffers.size());
+
+ // allocate output buffer
+ int64_t out_length = 0;
+ for (const auto& buffer : buffers) {
+ out_length += buffer->size() / sizeof(Offset);
+ }
+ ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer((out_length + 1) * sizeof(Offset), pool));
+ auto dst = reinterpret_cast<Offset*>((*out)->mutable_data());
+
+ int64_t elements_length = 0;
+ Offset values_length = 0;
+ for (size_t i = 0; i < buffers.size(); ++i) {
+ // the first offset from buffers[i] will be adjusted to values_length
+ // (the cumulative length of values spanned by offsets in previous buffers)
+ RETURN_NOT_OK(PutOffsets<Offset>(buffers[i], values_length, &dst[elements_length],
+ &values_ranges->at(i)));
+ elements_length += buffers[i]->size() / sizeof(Offset);
+ values_length += static_cast<Offset>(values_ranges->at(i).length);
+ }
+
+ // the final element in dst is the length of all values spanned by the offsets
+ dst[out_length] = values_length;
+ return Status::OK();
+}
+
+template <typename Offset>
+Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offset* dst,
+ Range* values_range) {
+ if (src->size() == 0) {
+ // It's allowed to have an empty offsets buffer for a 0-length array
+ // (see Array::Validate)
+ values_range->offset = 0;
+ values_range->length = 0;
+ return Status::OK();
+ }
+
+ // Get the range of offsets to transfer from src
+ auto src_begin = reinterpret_cast<const Offset*>(src->data());
+ auto src_end = reinterpret_cast<const Offset*>(src->data() + src->size());
+
+ // Compute the range of values which is spanned by this range of offsets
+ values_range->offset = src_begin[0];
+ values_range->length = *src_end - values_range->offset;
+ if (first_offset > std::numeric_limits<Offset>::max() - values_range->length) {
+ return Status::Invalid("offset overflow while concatenating arrays");
+ }
+
+ // Write offsets into dst, ensuring that the first offset written is
+ // first_offset
+ auto adjustment = first_offset - src_begin[0];
+ // NOTE: Concatenate can be called during IPC reads to append delta dictionaries.
+ // Avoid UB on non-validated input by doing the addition in the unsigned domain.
+ // (the result can later be validated using Array::ValidateFull)
+ std::transform(src_begin, src_end, dst, [adjustment](Offset offset) {
+ return SafeSignedAdd(offset, adjustment);
+ });
+ return Status::OK();
+}
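+
+// Worked example (editorial): concatenating a length-2 string array with
+// offsets [0, 2, 5] and a length-1 string array with offsets [0, 3]. The
+// second array's offsets are shifted by the first array's total value length
+// (5), so the combined offsets buffer becomes [0, 2, 5, 8].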
+
+class ConcatenateImpl {
+ public:
+ ConcatenateImpl(const ArrayDataVector& in, MemoryPool* pool)
+ : in_(std::move(in)), pool_(pool), out_(std::make_shared<ArrayData>()) {
+ out_->type = in[0]->type;
+ for (size_t i = 0; i < in_.size(); ++i) {
+ out_->length = SafeSignedAdd(out_->length, in[i]->length);
+ if (out_->null_count == kUnknownNullCount ||
+ in[i]->null_count == kUnknownNullCount) {
+ out_->null_count = kUnknownNullCount;
+ continue;
+ }
+ out_->null_count = SafeSignedAdd(out_->null_count.load(), in[i]->null_count.load());
+ }
+ out_->buffers.resize(in[0]->buffers.size());
+ out_->child_data.resize(in[0]->child_data.size());
+ for (auto& data : out_->child_data) {
+ data = std::make_shared<ArrayData>();
+ }
+ }
+
+ Status Concatenate(std::shared_ptr<ArrayData>* out) && {
+ if (out_->null_count != 0 && internal::HasValidityBitmap(out_->type->id())) {
+ RETURN_NOT_OK(ConcatenateBitmaps(Bitmaps(0), pool_, &out_->buffers[0]));
+ }
+ RETURN_NOT_OK(VisitTypeInline(*out_->type, this));
+ *out = std::move(out_);
+ return Status::OK();
+ }
+
+ Status Visit(const NullType&) { return Status::OK(); }
+
+ Status Visit(const BooleanType&) {
+ return ConcatenateBitmaps(Bitmaps(1), pool_, &out_->buffers[1]);
+ }
+
+ Status Visit(const FixedWidthType& fixed) {
+ // Handles numbers, decimal128, decimal256, fixed_size_binary
+ ARROW_ASSIGN_OR_RAISE(auto buffers, Buffers(1, fixed));
+ return ConcatenateBuffers(buffers, pool_).Value(&out_->buffers[1]);
+ }
+
+ Status Visit(const BinaryType&) {
+ std::vector<Range> value_ranges;
+ ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t)));
+ RETURN_NOT_OK(ConcatenateOffsets<int32_t>(index_buffers, pool_, &out_->buffers[1],
+ &value_ranges));
+ ARROW_ASSIGN_OR_RAISE(auto value_buffers, Buffers(2, value_ranges));
+ return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
+ }
+
+ Status Visit(const LargeBinaryType&) {
+ std::vector<Range> value_ranges;
+ ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int64_t)));
+ RETURN_NOT_OK(ConcatenateOffsets<int64_t>(index_buffers, pool_, &out_->buffers[1],
+ &value_ranges));
+ ARROW_ASSIGN_OR_RAISE(auto value_buffers, Buffers(2, value_ranges));
+ return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
+ }
+
+ Status Visit(const ListType&) {
+ std::vector<Range> value_ranges;
+ ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t)));
+ RETURN_NOT_OK(ConcatenateOffsets<int32_t>(index_buffers, pool_, &out_->buffers[1],
+ &value_ranges));
+ ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, value_ranges));
+ return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]);
+ }
+
+ Status Visit(const LargeListType&) {
+ std::vector<Range> value_ranges;
+ ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int64_t)));
+ RETURN_NOT_OK(ConcatenateOffsets<int64_t>(index_buffers, pool_, &out_->buffers[1],
+ &value_ranges));
+ ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, value_ranges));
+ return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]);
+ }
+
+ Status Visit(const FixedSizeListType& fixed_size_list) {
+ ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size()));
+ return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]);
+ }
+
+ Status Visit(const StructType& s) {
+ for (int i = 0; i < s.num_fields(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i));
+ RETURN_NOT_OK(ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i]));
+ }
+ return Status::OK();
+ }
+
+ Result<BufferVector> UnifyDictionaries(const DictionaryType& d) {
+ BufferVector new_index_lookup;
+ ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(d.value_type()));
+ new_index_lookup.resize(in_.size());
+ for (size_t i = 0; i < in_.size(); i++) {
+ auto item = in_[i];
+ auto dictionary_array = MakeArray(item->dictionary);
+ RETURN_NOT_OK(unifier->Unify(*dictionary_array, &new_index_lookup[i]));
+ }
+ std::shared_ptr<Array> out_dictionary;
+ RETURN_NOT_OK(unifier->GetResultWithIndexType(d.index_type(), &out_dictionary));
+ out_->dictionary = out_dictionary->data();
+ return new_index_lookup;
+ }
+
+ // Transpose and concatenate dictionary indices
+ Result<std::shared_ptr<Buffer>> ConcatenateDictionaryIndices(
+ const DataType& index_type, const BufferVector& index_transpositions) {
+ const auto index_width =
+ internal::checked_cast<const FixedWidthType&>(index_type).bit_width() / 8;
+ int64_t out_length = 0;
+ for (const auto& data : in_) {
+ out_length += data->length;
+ }
+ ARROW_ASSIGN_OR_RAISE(auto out, AllocateBuffer(out_length * index_width, pool_));
+ uint8_t* out_data = out->mutable_data();
+ for (size_t i = 0; i < in_.size(); i++) {
+ const auto& data = in_[i];
+ auto transpose_map =
+ reinterpret_cast<const int32_t*>(index_transpositions[i]->data());
+ RETURN_NOT_OK(internal::TransposeInts(index_type, index_type,
+ /*src=*/data->GetValues<uint8_t>(1, 0),
+ /*dest=*/out_data,
+ /*src_offset=*/data->offset,
+ /*dest_offset=*/0, /*length=*/data->length,
+ transpose_map));
+ out_data += data->length * index_width;
+ }
+ return std::move(out);
+ }
+
+ Status Visit(const DictionaryType& d) {
+ auto fixed = internal::checked_cast<const FixedWidthType*>(d.index_type().get());
+
+ // Two cases: all the dictionaries are the same, or unification is
+ // required
+ bool dictionaries_same = true;
+ std::shared_ptr<Array> dictionary0 = MakeArray(in_[0]->dictionary);
+ for (size_t i = 1; i < in_.size(); ++i) {
+ if (!MakeArray(in_[i]->dictionary)->Equals(dictionary0)) {
+ dictionaries_same = false;
+ break;
+ }
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed));
+ if (dictionaries_same) {
+ out_->dictionary = in_[0]->dictionary;
+ return ConcatenateBuffers(index_buffers, pool_).Value(&out_->buffers[1]);
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto index_lookup, UnifyDictionaries(d));
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
+ ConcatenateDictionaryIndices(*fixed, index_lookup));
+ return Status::OK();
+ }
+ }
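+
+  // Worked example (editorial): concatenating two dictionary arrays whose
+  // dictionaries are ["a", "b"] and ["b", "c"]. Unification yields the
+  // combined dictionary ["a", "b", "c"] with transpose maps [0, 1] and
+  // [1, 2], which are applied to each input's indices before concatenation.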
+
+ Status Visit(const UnionType& u) {
+ return Status::NotImplemented("concatenation of ", u);
+ }
+
+ Status Visit(const ExtensionType& e) {
+ // XXX can we just concatenate their storage?
+ return Status::NotImplemented("concatenation of ", e);
+ }
+
+ private:
+ // NOTE: Concatenate() can be called during IPC reads to append delta dictionaries
+ // on non-validated input. Therefore, the input-checking SliceBufferSafe and
+ // ArrayData::SliceSafe are used below.
+
+ // Gather the index-th buffer of each input into a vector.
+ // Bytes are sliced with that input's offset and length.
+ // Note that BufferVector will not contain the buffer of in_[i] if it's
+ // nullptr.
+ Result<BufferVector> Buffers(size_t index) {
+ BufferVector buffers;
+ buffers.reserve(in_.size());
+ for (const auto& array_data : in_) {
+ const auto& buffer = array_data->buffers[index];
+ if (buffer != nullptr) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto sliced_buffer,
+ SliceBufferSafe(buffer, array_data->offset, array_data->length));
+ buffers.push_back(std::move(sliced_buffer));
+ }
+ }
+ return buffers;
+ }
+
+ // Gather the index-th buffer of each input into a vector.
+ // Bytes are sliced with the explicitly passed ranges.
+ // Note that BufferVector will not contain the buffer of in_[i] if it's
+ // nullptr.
+ Result<BufferVector> Buffers(size_t index, const std::vector<Range>& ranges) {
+ DCHECK_EQ(in_.size(), ranges.size());
+ BufferVector buffers;
+ buffers.reserve(in_.size());
+ for (size_t i = 0; i < in_.size(); ++i) {
+ const auto& buffer = in_[i]->buffers[index];
+ if (buffer != nullptr) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto sliced_buffer,
+ SliceBufferSafe(buffer, ranges[i].offset, ranges[i].length));
+ buffers.push_back(std::move(sliced_buffer));
+ } else {
+ DCHECK_EQ(ranges[i].length, 0);
+ }
+ }
+ return buffers;
+ }
+
+ // Gather the index-th buffer of each input into a vector.
+  // Buffers are assumed to contain elements of the given byte_width;
+  // those elements are sliced with that input's offset and length.
+ // Note that BufferVector will not contain the buffer of in_[i] if it's
+ // nullptr.
+ Result<BufferVector> Buffers(size_t index, int byte_width) {
+ BufferVector buffers;
+ buffers.reserve(in_.size());
+ for (const auto& array_data : in_) {
+ const auto& buffer = array_data->buffers[index];
+ if (buffer != nullptr) {
+ ARROW_ASSIGN_OR_RAISE(auto sliced_buffer,
+ SliceBufferSafe(buffer, array_data->offset * byte_width,
+ array_data->length * byte_width));
+ buffers.push_back(std::move(sliced_buffer));
+ }
+ }
+ return buffers;
+ }
+
+ // Gather the index-th buffer of each input into a vector.
+  // Buffers are assumed to contain elements of fixed.bit_width();
+  // those elements are sliced with that input's offset and length.
+ // Note that BufferVector will not contain the buffer of in_[i] if it's
+ // nullptr.
+ Result<BufferVector> Buffers(size_t index, const FixedWidthType& fixed) {
+ DCHECK_EQ(fixed.bit_width() % 8, 0);
+ return Buffers(index, fixed.bit_width() / 8);
+ }
+
+ // Gather the index-th buffer of each input as a Bitmap
+ // into a vector of Bitmaps.
+ std::vector<Bitmap> Bitmaps(size_t index) {
+ std::vector<Bitmap> bitmaps(in_.size());
+ for (size_t i = 0; i < in_.size(); ++i) {
+ Range range(in_[i]->offset, in_[i]->length);
+ bitmaps[i] = Bitmap(in_[i]->buffers[index], range);
+ }
+ return bitmaps;
+ }
+
+ // Gather the index-th child_data of each input into a vector.
+ // Elements are sliced with that input's offset and length.
+ Result<ArrayDataVector> ChildData(size_t index) {
+ ArrayDataVector child_data(in_.size());
+ for (size_t i = 0; i < in_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(child_data[i], in_[i]->child_data[index]->SliceSafe(
+ in_[i]->offset, in_[i]->length));
+ }
+ return child_data;
+ }
+
+ // Gather the index-th child_data of each input into a vector.
+ // Elements are sliced with that input's offset and length multiplied by multiplier.
+ Result<ArrayDataVector> ChildData(size_t index, size_t multiplier) {
+ ArrayDataVector child_data(in_.size());
+ for (size_t i = 0; i < in_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ child_data[i], in_[i]->child_data[index]->SliceSafe(
+ in_[i]->offset * multiplier, in_[i]->length * multiplier));
+ }
+ return child_data;
+ }
+
+ // Gather the index-th child_data of each input into a vector.
+ // Elements are sliced with the explicitly passed ranges.
+ Result<ArrayDataVector> ChildData(size_t index, const std::vector<Range>& ranges) {
+ DCHECK_EQ(in_.size(), ranges.size());
+ ArrayDataVector child_data(in_.size());
+ for (size_t i = 0; i < in_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(child_data[i], in_[i]->child_data[index]->SliceSafe(
+ ranges[i].offset, ranges[i].length));
+ }
+ return child_data;
+ }
+
+ const ArrayDataVector& in_;
+ MemoryPool* pool_;
+ std::shared_ptr<ArrayData> out_;
+};
+
+} // namespace
+
+Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays, MemoryPool* pool) {
+ if (arrays.size() == 0) {
+ return Status::Invalid("Must pass at least one array");
+ }
+
+ // gather ArrayData of input arrays
+ ArrayDataVector data(arrays.size());
+ for (size_t i = 0; i < arrays.size(); ++i) {
+ if (!arrays[i]->type()->Equals(*arrays[0]->type())) {
+ return Status::Invalid("arrays to be concatenated must be identically typed, but ",
+ *arrays[0]->type(), " and ", *arrays[i]->type(),
+ " were encountered.");
+ }
+ data[i] = arrays[i]->data();
+ }
+
+ std::shared_ptr<ArrayData> out_data;
+ RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data));
+ return MakeArray(std::move(out_data));
+}
+
+Status Concatenate(const ArrayVector& arrays, MemoryPool* pool,
+ std::shared_ptr<Array>* out) {
+ return Concatenate(arrays, pool).Value(out);
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.h
new file mode 100644
index 00000000000..a6c1c3cf3c1
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.h
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief Concatenate arrays
+///
+/// \param[in] arrays a vector of arrays to be concatenated
+/// \param[in] pool the memory pool from which the result's storage will be allocated
+/// \return the concatenated array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
+ MemoryPool* pool = default_memory_pool());
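+
+// Editorial usage sketch, assuming two pre-built arrays of identical type
+// (array1 and array2 are hypothetical std::shared_ptr<Array> values):
+//
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> combined,
+//                         Concatenate({array1, array2}));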
+
+ARROW_DEPRECATED("Use Result-returning version")
+ARROW_EXPORT
+Status Concatenate(const ArrayVector& arrays, MemoryPool* pool,
+ std::shared_ptr<Array>* out);
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc
new file mode 100644
index 00000000000..5a214473972
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc
@@ -0,0 +1,331 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/data.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+using internal::CountSetBits;
+
+static inline void AdjustNonNullable(Type::type type_id, int64_t length,
+ std::vector<std::shared_ptr<Buffer>>* buffers,
+ int64_t* null_count) {
+ if (type_id == Type::NA) {
+ *null_count = length;
+ (*buffers)[0] = nullptr;
+ } else if (internal::HasValidityBitmap(type_id)) {
+ if (*null_count == 0) {
+ // In case there are no nulls, don't keep an allocated null bitmap around
+ (*buffers)[0] = nullptr;
+ } else if (*null_count == kUnknownNullCount && buffers->at(0) == nullptr) {
+ // Conversely, if no null bitmap is provided, set the null count to 0
+ *null_count = 0;
+ }
+ } else {
+ *null_count = 0;
+ }
+}
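+
+// Editorial example: ArrayData::Make(int32(), 4, {bitmap, data},
+// /*null_count=*/0) drops the now-redundant bitmap buffer, so downstream code
+// can rely on "no validity buffer" meaning "no nulls" for nullable types.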
+
+std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
+ std::vector<std::shared_ptr<Buffer>> buffers,
+ int64_t null_count, int64_t offset) {
+ AdjustNonNullable(type->id(), length, &buffers, &null_count);
+ return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
+ null_count, offset);
+}
+
+std::shared_ptr<ArrayData> ArrayData::Make(
+ std::shared_ptr<DataType> type, int64_t length,
+ std::vector<std::shared_ptr<Buffer>> buffers,
+ std::vector<std::shared_ptr<ArrayData>> child_data, int64_t null_count,
+ int64_t offset) {
+ AdjustNonNullable(type->id(), length, &buffers, &null_count);
+ return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
+ std::move(child_data), null_count, offset);
+}
+
+std::shared_ptr<ArrayData> ArrayData::Make(
+ std::shared_ptr<DataType> type, int64_t length,
+ std::vector<std::shared_ptr<Buffer>> buffers,
+ std::vector<std::shared_ptr<ArrayData>> child_data,
+ std::shared_ptr<ArrayData> dictionary, int64_t null_count, int64_t offset) {
+ AdjustNonNullable(type->id(), length, &buffers, &null_count);
+ auto data = std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
+ std::move(child_data), null_count, offset);
+ data->dictionary = std::move(dictionary);
+ return data;
+}
+
+std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
+ int64_t null_count, int64_t offset) {
+ return std::make_shared<ArrayData>(std::move(type), length, null_count, offset);
+}
+
+std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
+ ARROW_CHECK_LE(off, length) << "Slice offset greater than array length";
+ len = std::min(length - off, len);
+ off += offset;
+
+ auto copy = this->Copy();
+ copy->length = len;
+ copy->offset = off;
+ if (null_count == length) {
+ copy->null_count = len;
+ } else if (off == offset && len == length) { // A copy of current.
+ copy->null_count = null_count.load();
+ } else {
+ copy->null_count = null_count != 0 ? kUnknownNullCount : 0;
+ }
+ return copy;
+}
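+
+// Editorial note: slicing is zero-copy. For example, data->Slice(3, 4) on a
+// length-10 ArrayData shares the parent's buffers and only adjusts offset and
+// length; SliceSafe below additionally bounds-checks the requested range.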
+
+Result<std::shared_ptr<ArrayData>> ArrayData::SliceSafe(int64_t off, int64_t len) const {
+ RETURN_NOT_OK(internal::CheckSliceParams(length, off, len, "array"));
+ return Slice(off, len);
+}
+
+int64_t ArrayData::GetNullCount() const {
+ int64_t precomputed = this->null_count.load();
+ if (ARROW_PREDICT_FALSE(precomputed == kUnknownNullCount)) {
+ if (this->buffers[0]) {
+ precomputed = this->length -
+ CountSetBits(this->buffers[0]->data(), this->offset, this->length);
+ } else {
+ precomputed = 0;
+ }
+ this->null_count.store(precomputed);
+ }
+ return precomputed;
+}
+
+// ----------------------------------------------------------------------
+// Implement ArrayData::View
+
+namespace {
+
+void AccumulateLayouts(const std::shared_ptr<DataType>& type,
+ std::vector<DataTypeLayout>* layouts) {
+ layouts->push_back(type->layout());
+ for (const auto& child : type->fields()) {
+ AccumulateLayouts(child->type(), layouts);
+ }
+}
+
+void AccumulateArrayData(const std::shared_ptr<ArrayData>& data,
+ std::vector<std::shared_ptr<ArrayData>>* out) {
+ out->push_back(data);
+ for (const auto& child : data->child_data) {
+ AccumulateArrayData(child, out);
+ }
+}
+
+struct ViewDataImpl {
+ std::shared_ptr<DataType> root_in_type;
+ std::shared_ptr<DataType> root_out_type;
+ std::vector<DataTypeLayout> in_layouts;
+ std::vector<std::shared_ptr<ArrayData>> in_data;
+ int64_t in_data_length;
+ size_t in_layout_idx = 0;
+ size_t in_buffer_idx = 0;
+ bool input_exhausted = false;
+
+ Status InvalidView(const std::string& msg) {
+ return Status::Invalid("Can't view array of type ", root_in_type->ToString(), " as ",
+ root_out_type->ToString(), ": ", msg);
+ }
+
+ void AdjustInputPointer() {
+ if (input_exhausted) {
+ return;
+ }
+ while (true) {
+ // Skip exhausted layout (might be empty layout)
+ while (in_buffer_idx >= in_layouts[in_layout_idx].buffers.size()) {
+ in_buffer_idx = 0;
+ ++in_layout_idx;
+ if (in_layout_idx >= in_layouts.size()) {
+ input_exhausted = true;
+ return;
+ }
+ }
+ const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx];
+ if (in_spec.kind != DataTypeLayout::ALWAYS_NULL) {
+ return;
+ }
+ // Skip always-null input buffers
+ // (e.g. buffer 0 of a null type or buffer 2 of a sparse union)
+ ++in_buffer_idx;
+ }
+ }
+
+ Status CheckInputAvailable() {
+ if (input_exhausted) {
+ return InvalidView("not enough buffers for view type");
+ }
+ return Status::OK();
+ }
+
+ Status CheckInputExhausted() {
+ if (!input_exhausted) {
+ return InvalidView("too many buffers for view type");
+ }
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> GetDictionaryView(const DataType& out_type) {
+ if (in_data[in_layout_idx]->type->id() != Type::DICTIONARY) {
+ return InvalidView("Cannot get view as dictionary type");
+ }
+ const auto& dict_out_type = static_cast<const DictionaryType&>(out_type);
+ return internal::GetArrayView(in_data[in_layout_idx]->dictionary,
+ dict_out_type.value_type());
+ }
+
+ Status MakeDataView(const std::shared_ptr<Field>& out_field,
+ std::shared_ptr<ArrayData>* out) {
+ const auto& out_type = out_field->type();
+ const auto out_layout = out_type->layout();
+
+ AdjustInputPointer();
+ int64_t out_length = in_data_length;
+ int64_t out_offset = 0;
+ int64_t out_null_count;
+
+ std::shared_ptr<ArrayData> dictionary;
+ if (out_type->id() == Type::DICTIONARY) {
+ ARROW_ASSIGN_OR_RAISE(dictionary, GetDictionaryView(*out_type));
+ }
+
+ // No type has a purely empty layout
+ DCHECK_GT(out_layout.buffers.size(), 0);
+
+ std::vector<std::shared_ptr<Buffer>> out_buffers;
+
+ // Process null bitmap
+ if (in_buffer_idx == 0 && out_layout.buffers[0].kind == DataTypeLayout::BITMAP) {
+ // Copy input null bitmap
+ RETURN_NOT_OK(CheckInputAvailable());
+ const auto& in_data_item = in_data[in_layout_idx];
+ if (!out_field->nullable() && in_data_item->GetNullCount() != 0) {
+ return InvalidView("nulls in input cannot be viewed as non-nullable");
+ }
+ DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx);
+ out_buffers.push_back(in_data_item->buffers[in_buffer_idx]);
+ out_length = in_data_item->length;
+ out_offset = in_data_item->offset;
+ out_null_count = in_data_item->null_count;
+ ++in_buffer_idx;
+ AdjustInputPointer();
+ } else {
+ // No null bitmap in input, append no-nulls bitmap
+ out_buffers.push_back(nullptr);
+ if (out_type->id() == Type::NA) {
+ out_null_count = out_length;
+ } else {
+ out_null_count = 0;
+ }
+ }
+
+ // Process other buffers in output layout
+ for (size_t out_buffer_idx = 1; out_buffer_idx < out_layout.buffers.size();
+ ++out_buffer_idx) {
+ const auto& out_spec = out_layout.buffers[out_buffer_idx];
+ // If always-null buffer is expected, just construct it
+ if (out_spec.kind == DataTypeLayout::ALWAYS_NULL) {
+ out_buffers.push_back(nullptr);
+ continue;
+ }
+
+ // If the next input buffer is a null bitmap, it can only be skipped
+ // when the input contains no nulls
+ while (in_buffer_idx == 0) {
+ RETURN_NOT_OK(CheckInputAvailable());
+ if (in_data[in_layout_idx]->GetNullCount() != 0) {
+ return InvalidView("cannot represent nested nulls");
+ }
+ ++in_buffer_idx;
+ AdjustInputPointer();
+ }
+
+ RETURN_NOT_OK(CheckInputAvailable());
+ const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx];
+ if (out_spec != in_spec) {
+ return InvalidView("incompatible layouts");
+ }
+ // Copy input buffer
+ const auto& in_data_item = in_data[in_layout_idx];
+ out_length = in_data_item->length;
+ out_offset = in_data_item->offset;
+ DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx);
+ out_buffers.push_back(in_data_item->buffers[in_buffer_idx]);
+ ++in_buffer_idx;
+ AdjustInputPointer();
+ }
+
+ std::shared_ptr<ArrayData> out_data = ArrayData::Make(
+ out_type, out_length, std::move(out_buffers), out_null_count, out_offset);
+ out_data->dictionary = dictionary;
+
+ // Process children recursively, depth-first
+ for (const auto& child_field : out_type->fields()) {
+ std::shared_ptr<ArrayData> child_data;
+ RETURN_NOT_OK(MakeDataView(child_field, &child_data));
+ out_data->child_data.push_back(std::move(child_data));
+ }
+ *out = std::move(out_data);
+ return Status::OK();
+ }
+};
+
+} // namespace
+
+namespace internal {
+
+Result<std::shared_ptr<ArrayData>> GetArrayView(
+ const std::shared_ptr<ArrayData>& data, const std::shared_ptr<DataType>& out_type) {
+ ViewDataImpl impl;
+ impl.root_in_type = data->type;
+ impl.root_out_type = out_type;
+ AccumulateLayouts(impl.root_in_type, &impl.in_layouts);
+ AccumulateArrayData(data, &impl.in_data);
+ impl.in_data_length = data->length;
+
+ std::shared_ptr<ArrayData> out_data;
+ // Dummy field for output type
+ auto out_field = field("", out_type);
+ RETURN_NOT_OK(impl.MakeDataView(out_field, &out_data));
+ RETURN_NOT_OK(impl.CheckInputExhausted());
+ return out_data;
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h
new file mode 100644
index 00000000000..418d09def6b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h
@@ -0,0 +1,258 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic> // IWYU pragma: export
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// When slicing, we do not know the null count of the sliced range without
+// doing some computation. To avoid doing this eagerly, we set the null count
+// to -1 (any negative number will do). When Array::null_count is called the
+// first time, the null count will be computed. See ARROW-33
+constexpr int64_t kUnknownNullCount = -1;
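+//
+// For example (editor's sketch, not part of the upstream sources): slicing an
+// ArrayData defers the null count computation until it is first requested:
+//
+//   std::shared_ptr<ArrayData> sliced = data->Slice(16, 100);
+//   // sliced->null_count is typically kUnknownNullCount (-1) at this point
+//   int64_t n = sliced->GetNullCount();  // computed and cached on first use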
+
+// ----------------------------------------------------------------------
+// Generic array data container
+
+/// \class ArrayData
+/// \brief Mutable container for generic Arrow array data
+///
+/// This data structure is a self-contained representation of the memory and
+/// metadata inside an Arrow array data structure (called vectors in Java). The
+/// classes arrow::Array and its subclasses provide strongly-typed accessors
+/// with support for the visitor pattern and other affordances.
+///
+/// This class is designed for easy internal data manipulation, analytical data
+/// processing, and data transport to and from IPC messages. For example, we
+/// could reinterpret the data of an int64 array as float64 (zero-copy, since
+/// both types are 8 bytes wide) like so:
+///
+/// Int64Array arr = GetMyData();
+/// auto new_data = arr.data()->Copy();
+/// new_data->type = arrow::float64();
+/// DoubleArray double_arr(new_data);
+///
+/// This object is also useful in an analytics setting where memory may be
+/// reused. For example, if we had a group of operations all returning doubles,
+/// say:
+///
+/// Log(Sqrt(Expr(arr)))
+///
+/// Then the low-level implementations of each of these functions could have
+/// the signatures
+///
+/// void Log(const ArrayData& values, ArrayData* out);
+///
+/// As another example, a function may consume one or more memory buffers in an
+/// input array and replace them with newly-allocated data, changing the output
+/// data type as well.
+struct ARROW_EXPORT ArrayData {
+ ArrayData() = default;
+
+ ArrayData(std::shared_ptr<DataType> type, int64_t length,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+ : type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
+
+ ArrayData(std::shared_ptr<DataType> type, int64_t length,
+ std::vector<std::shared_ptr<Buffer>> buffers,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+ : ArrayData(std::move(type), length, null_count, offset) {
+ this->buffers = std::move(buffers);
+ }
+
+ ArrayData(std::shared_ptr<DataType> type, int64_t length,
+ std::vector<std::shared_ptr<Buffer>> buffers,
+ std::vector<std::shared_ptr<ArrayData>> child_data,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+ : ArrayData(std::move(type), length, null_count, offset) {
+ this->buffers = std::move(buffers);
+ this->child_data = std::move(child_data);
+ }
+
+ static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
+ std::vector<std::shared_ptr<Buffer>> buffers,
+ int64_t null_count = kUnknownNullCount,
+ int64_t offset = 0);
+
+ static std::shared_ptr<ArrayData> Make(
+ std::shared_ptr<DataType> type, int64_t length,
+ std::vector<std::shared_ptr<Buffer>> buffers,
+ std::vector<std::shared_ptr<ArrayData>> child_data,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ static std::shared_ptr<ArrayData> Make(
+ std::shared_ptr<DataType> type, int64_t length,
+ std::vector<std::shared_ptr<Buffer>> buffers,
+ std::vector<std::shared_ptr<ArrayData>> child_data,
+ std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
+ int64_t offset = 0);
+
+ static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
+ int64_t null_count = kUnknownNullCount,
+ int64_t offset = 0);
+
+ // Move constructor
+ ArrayData(ArrayData&& other) noexcept
+ : type(std::move(other.type)),
+ length(other.length),
+ offset(other.offset),
+ buffers(std::move(other.buffers)),
+ child_data(std::move(other.child_data)),
+ dictionary(std::move(other.dictionary)) {
+ SetNullCount(other.null_count);
+ }
+
+ // Copy constructor
+ ArrayData(const ArrayData& other) noexcept
+ : type(other.type),
+ length(other.length),
+ offset(other.offset),
+ buffers(other.buffers),
+ child_data(other.child_data),
+ dictionary(other.dictionary) {
+ SetNullCount(other.null_count);
+ }
+
+ // Move assignment
+ ArrayData& operator=(ArrayData&& other) {
+ type = std::move(other.type);
+ length = other.length;
+ SetNullCount(other.null_count);
+ offset = other.offset;
+ buffers = std::move(other.buffers);
+ child_data = std::move(other.child_data);
+ dictionary = std::move(other.dictionary);
+ return *this;
+ }
+
+ // Copy assignment
+ ArrayData& operator=(const ArrayData& other) {
+ type = other.type;
+ length = other.length;
+ SetNullCount(other.null_count);
+ offset = other.offset;
+ buffers = other.buffers;
+ child_data = other.child_data;
+ dictionary = other.dictionary;
+ return *this;
+ }
+
+ std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
+
+ // Access a buffer's data as a typed C pointer
+ template <typename T>
+ inline const T* GetValues(int i, int64_t absolute_offset) const {
+ if (buffers[i]) {
+ return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
+ } else {
+ return NULLPTR;
+ }
+ }
+
+ template <typename T>
+ inline const T* GetValues(int i) const {
+ return GetValues<T>(i, offset);
+ }
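+ // For example (editor's note): for a primitive int32 array, buffer 0 is the
+ // validity bitmap and buffer 1 holds the values, so data.GetValues<int32_t>(1)
+ // returns a pointer to the first logical value (already adjusted by offset).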
+
+ // Like GetValues, but returns NULLPTR instead of aborting if the underlying
+ // buffer is not a CPU buffer.
+ template <typename T>
+ inline const T* GetValuesSafe(int i, int64_t absolute_offset) const {
+ if (buffers[i] && buffers[i]->is_cpu()) {
+ return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
+ } else {
+ return NULLPTR;
+ }
+ }
+
+ template <typename T>
+ inline const T* GetValuesSafe(int i) const {
+ return GetValuesSafe<T>(i, offset);
+ }
+
+ // Access a buffer's data as a mutable typed C pointer
+ template <typename T>
+ inline T* GetMutableValues(int i, int64_t absolute_offset) {
+ if (buffers[i]) {
+ return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
+ } else {
+ return NULLPTR;
+ }
+ }
+
+ template <typename T>
+ inline T* GetMutableValues(int i) {
+ return GetMutableValues<T>(i, offset);
+ }
+
+ /// \brief Construct a zero-copy slice of the data with the given offset and length
+ std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;
+
+ /// \brief Input-checking variant of Slice
+ ///
+ /// An Invalid Status is returned if the requested slice falls out of bounds.
+ /// Note that unlike Slice, `length` isn't clamped to the available buffer size.
+ Result<std::shared_ptr<ArrayData>> SliceSafe(int64_t offset, int64_t length) const;
+
+ void SetNullCount(int64_t v) { null_count.store(v); }
+
+ /// \brief Return null count, or compute and set it if it's not known
+ int64_t GetNullCount() const;
+
+ bool MayHaveNulls() const {
+ // If an ArrayData is slightly malformed it may have kUnknownNullCount set
+ // but no buffer
+ return null_count.load() != 0 && buffers[0] != NULLPTR;
+ }
+
+ std::shared_ptr<DataType> type;
+ int64_t length = 0;
+ mutable std::atomic<int64_t> null_count{0};
+ // The logical start point into the physical buffers (in values, not bytes).
+ // Note that, for child data, this must be *added* to the child data's own offset.
+ int64_t offset = 0;
+ std::vector<std::shared_ptr<Buffer>> buffers;
+ std::vector<std::shared_ptr<ArrayData>> child_data;
+
+ // The dictionary for this Array, if any. Only used for dictionary type
+ std::shared_ptr<ArrayData> dictionary;
+};
+
+namespace internal {
+
+/// Construct a zero-copy view of this ArrayData with the given type.
+///
+/// This method checks if the types are layout-compatible.
+/// Nested types are traversed in depth-first order. Data buffers must have
+/// the same item sizes, even though the logical types may be different.
+/// An error is returned if the types are not layout-compatible.
+ARROW_EXPORT
+Result<std::shared_ptr<ArrayData>> GetArrayView(const std::shared_ptr<ArrayData>& data,
+ const std::shared_ptr<DataType>& type);
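+
+// For example (editor's sketch, not part of the upstream sources): int32 data
+// can be viewed as float32, whose layout is identical (a validity bitmap
+// followed by 4-byte values):
+//
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> view,
+//                         internal::GetArrayView(int32_array->data(), float32()));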
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/dict_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/dict_internal.h
new file mode 100644
index 00000000000..aa027ac22de
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/dict_internal.h
@@ -0,0 +1,193 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/array/builder_dict.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+template <typename T, typename Enable = void>
+struct DictionaryTraits {
+ using MemoTableType = void;
+};
+
+} // namespace internal
+
+template <typename T, typename Out = void>
+using enable_if_memoize = enable_if_t<
+ !std::is_same<typename internal::DictionaryTraits<T>::MemoTableType, void>::value,
+ Out>;
+
+template <typename T, typename Out = void>
+using enable_if_no_memoize = enable_if_t<
+ std::is_same<typename internal::DictionaryTraits<T>::MemoTableType, void>::value,
+ Out>;
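+// (Editor's note: these aliases SFINAE-select overloads based on whether
+// DictionaryTraits<T> defines a usable memo table, i.e. whether values of type
+// T can be dictionary-encoded. For example, enable_if_memoize<Int32Type, Status>
+// resolves to Status, while enable_if_memoize<NullType, Status> removes the
+// overload from resolution.)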
+
+namespace internal {
+
+template <>
+struct DictionaryTraits<BooleanType> {
+ using T = BooleanType;
+ using MemoTableType = typename HashTraits<T>::MemoTableType;
+
+ static Status GetDictionaryArrayData(MemoryPool* pool,
+ const std::shared_ptr<DataType>& type,
+ const MemoTableType& memo_table,
+ int64_t start_offset,
+ std::shared_ptr<ArrayData>* out) {
+ if (start_offset < 0) {
+ return Status::Invalid("invalid start_offset ", start_offset);
+ }
+
+ BooleanBuilder builder(pool);
+ const auto& bool_values = memo_table.values();
+ const auto null_index = memo_table.GetNull();
+
+ // A boolean memo table has at most three entries (false, true, null),
+ // so this loop iterates at most three times.
+ for (int64_t i = start_offset; i < memo_table.size(); i++) {
+ RETURN_NOT_OK(i == null_index ? builder.AppendNull()
+ : builder.Append(bool_values[i]));
+ }
+
+ return builder.FinishInternal(out);
+ }
+};
+
+template <typename T>
+struct DictionaryTraits<T, enable_if_has_c_type<T>> {
+ using c_type = typename T::c_type;
+ using MemoTableType = typename HashTraits<T>::MemoTableType;
+
+ static Status GetDictionaryArrayData(MemoryPool* pool,
+ const std::shared_ptr<DataType>& type,
+ const MemoTableType& memo_table,
+ int64_t start_offset,
+ std::shared_ptr<ArrayData>* out) {
+ auto dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
+ // This makes a copy, but we assume a dictionary array is usually small
+ // compared to the size of the dictionary-using array.
+ // (also, copying the dictionary values is cheap compared to the cost
+ // of building the memo table)
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Buffer> dict_buffer,
+ AllocateBuffer(TypeTraits<T>::bytes_required(dict_length), pool));
+ memo_table.CopyValues(static_cast<int32_t>(start_offset),
+ reinterpret_cast<c_type*>(dict_buffer->mutable_data()));
+
+ int64_t null_count = 0;
+ std::shared_ptr<Buffer> null_bitmap = nullptr;
+ RETURN_NOT_OK(
+ ComputeNullBitmap(pool, memo_table, start_offset, &null_count, &null_bitmap));
+
+ *out = ArrayData::Make(type, dict_length, {null_bitmap, dict_buffer}, null_count);
+ return Status::OK();
+ }
+};
+
+template <typename T>
+struct DictionaryTraits<T, enable_if_base_binary<T>> {
+ using MemoTableType = typename HashTraits<T>::MemoTableType;
+
+ static Status GetDictionaryArrayData(MemoryPool* pool,
+ const std::shared_ptr<DataType>& type,
+ const MemoTableType& memo_table,
+ int64_t start_offset,
+ std::shared_ptr<ArrayData>* out) {
+ using offset_type = typename T::offset_type;
+
+ // Create the offsets buffer
+ auto dict_length = static_cast<int64_t>(memo_table.size() - start_offset);
+ ARROW_ASSIGN_OR_RAISE(auto dict_offsets,
+ AllocateBuffer(sizeof(offset_type) * (dict_length + 1), pool));
+ auto raw_offsets = reinterpret_cast<offset_type*>(dict_offsets->mutable_data());
+ memo_table.CopyOffsets(static_cast<int32_t>(start_offset), raw_offsets);
+
+ // Create the data buffer
+ auto values_size = memo_table.values_size();
+ ARROW_ASSIGN_OR_RAISE(auto dict_data, AllocateBuffer(values_size, pool));
+ if (values_size > 0) {
+ memo_table.CopyValues(static_cast<int32_t>(start_offset), dict_data->size(),
+ dict_data->mutable_data());
+ }
+
+ int64_t null_count = 0;
+ std::shared_ptr<Buffer> null_bitmap = nullptr;
+ RETURN_NOT_OK(
+ ComputeNullBitmap(pool, memo_table, start_offset, &null_count, &null_bitmap));
+
+ *out = ArrayData::Make(type, dict_length,
+ {null_bitmap, std::move(dict_offsets), std::move(dict_data)},
+ null_count);
+
+ return Status::OK();
+ }
+};
+
+template <typename T>
+struct DictionaryTraits<T, enable_if_fixed_size_binary<T>> {
+ using MemoTableType = typename HashTraits<T>::MemoTableType;
+
+ static Status GetDictionaryArrayData(MemoryPool* pool,
+ const std::shared_ptr<DataType>& type,
+ const MemoTableType& memo_table,
+ int64_t start_offset,
+ std::shared_ptr<ArrayData>* out) {
+ const T& concrete_type = internal::checked_cast<const T&>(*type);
+
+ // Create the data buffer
+ auto dict_length = static_cast<int64_t>(memo_table.size() - start_offset);
+ auto width_length = concrete_type.byte_width();
+ auto data_length = dict_length * width_length;
+ ARROW_ASSIGN_OR_RAISE(auto dict_data, AllocateBuffer(data_length, pool));
+ auto data = dict_data->mutable_data();
+
+ memo_table.CopyFixedWidthValues(static_cast<int32_t>(start_offset), width_length,
+ data_length, data);
+
+ int64_t null_count = 0;
+ std::shared_ptr<Buffer> null_bitmap = nullptr;
+ RETURN_NOT_OK(
+ ComputeNullBitmap(pool, memo_table, start_offset, &null_count, &null_bitmap));
+
+ *out = ArrayData::Make(type, dict_length, {null_bitmap, std::move(dict_data)},
+ null_count);
+ return Status::OK();
+ }
+};
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/diff.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/diff.cc
new file mode 100644
index 00000000000..a94ca178a40
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/diff.cc
@@ -0,0 +1,784 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/diff.h"
+
+#include <algorithm>
+#include <chrono>
+#include <functional>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_decimal.h"
+#include "arrow/array/array_nested.h"
+#include "arrow/array/array_primitive.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/extension_type.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/range.h"
+#include "arrow/util/string.h"
+#include "arrow/util/string_view.h"
+#include "arrow/vendored/datetime.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+using internal::MakeLazyRange;
+
+template <typename ArrayType>
+auto GetView(const ArrayType& array, int64_t index) -> decltype(array.GetView(index)) {
+ return array.GetView(index);
+}
+
+struct Slice {
+ const Array* array_;
+ int64_t offset_, length_;
+
+ bool operator==(const Slice& other) const {
+ return length_ == other.length_ &&
+ array_->RangeEquals(offset_, offset_ + length_, other.offset_, *other.array_);
+ }
+ bool operator!=(const Slice& other) const { return !(*this == other); }
+};
+
+template <typename ArrayType, typename T = typename ArrayType::TypeClass,
+ typename = enable_if_list_like<T>>
+static Slice GetView(const ArrayType& array, int64_t index) {
+ return Slice{array.values().get(), array.value_offset(index),
+ array.value_length(index)};
+}
+
+struct UnitSlice {
+ const Array* array_;
+ int64_t offset_;
+
+ bool operator==(const UnitSlice& other) const {
+ return array_->RangeEquals(offset_, offset_ + 1, other.offset_, *other.array_);
+ }
+ bool operator!=(const UnitSlice& other) const { return !(*this == other); }
+};
+
+// FIXME(bkietz) this is inefficient;
+// StructArray's fields can be diffed independently then merged
+static UnitSlice GetView(const StructArray& array, int64_t index) {
+ return UnitSlice{&array, index};
+}
+
+static UnitSlice GetView(const UnionArray& array, int64_t index) {
+ return UnitSlice{&array, index};
+}
+
+using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+
+struct ValueComparatorVisitor {
+ template <typename T>
+ Status Visit(const T&) {
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+ out = [](const Array& base, int64_t base_index, const Array& target,
+ int64_t target_index) {
+ return (GetView(checked_cast<const ArrayType&>(base), base_index) ==
+ GetView(checked_cast<const ArrayType&>(target), target_index));
+ };
+ return Status::OK();
+ }
+
+ Status Visit(const NullType&) { return Status::NotImplemented("null type"); }
+
+ Status Visit(const ExtensionType&) { return Status::NotImplemented("extension type"); }
+
+ Status Visit(const DictionaryType&) {
+ return Status::NotImplemented("dictionary type");
+ }
+
+ ValueComparator Create(const DataType& type) {
+ DCHECK_OK(VisitTypeInline(type, this));
+ return out;
+ }
+
+ ValueComparator out;
+};
+
+ValueComparator GetValueComparator(const DataType& type) {
+ ValueComparatorVisitor type_visitor;
+ return type_visitor.Create(type);
+}
+
+// represents an intermediate state in the comparison of two arrays
+struct EditPoint {
+ int64_t base, target;
+ bool operator==(EditPoint other) const {
+ return base == other.base && target == other.target;
+ }
+};
+
+/// A generic sequence difference algorithm, based on
+///
+/// E. W. Myers, "An O(ND) difference algorithm and its variations,"
+/// Algorithmica, vol. 1, no. 1-4, pp. 251–266, 1986.
+///
+/// To summarize, an edit script is computed by maintaining the furthest set of EditPoints
+/// which are reachable in a given number of edits D. This is used to compute the furthest
+/// set reachable with D+1 edits, and the process continues inductively until a complete
+/// edit script is discovered.
+///
+/// From each edit point a single deletion and a single insertion are tried, then
+/// as many shared elements as possible are skipped, recording only the endpoint
+/// of the run. This
+/// representation is minimal in the common case where the sequences differ only slightly,
+/// since most of the elements are shared between base and target and are represented
+/// implicitly.
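+///
+/// For example (editor's note): diffing base "hlloo" against target "hello"
+/// needs D = 2 edits (one insertion, one deletion), so only the furthest
+/// endpoints reachable with 0, 1 and 2 edits are ever stored.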
+class QuadraticSpaceMyersDiff {
+ public:
+ QuadraticSpaceMyersDiff(const Array& base, const Array& target, MemoryPool* pool)
+ : base_(base),
+ target_(target),
+ pool_(pool),
+ value_comparator_(GetValueComparator(*base.type())),
+ base_begin_(0),
+ base_end_(base.length()),
+ target_begin_(0),
+ target_end_(target.length()),
+ endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
+ insert_({true}) {
+ if ((base_end_ - base_begin_ == target_end_ - target_begin_) &&
+ endpoint_base_[0] == base_end_) {
+ // trivial case: base == target
+ finish_index_ = 0;
+ }
+ }
+
+ bool ValuesEqual(int64_t base_index, int64_t target_index) const {
+ bool base_null = base_.IsNull(base_index);
+ bool target_null = target_.IsNull(target_index);
+ if (base_null || target_null) {
+ // If only one is null, then this is false, otherwise true
+ return base_null && target_null;
+ }
+ return value_comparator_(base_, base_index, target_, target_index);
+ }
+
+ // increment the position within base (the element pointed to was deleted)
+ // then extend maximally
+ EditPoint DeleteOne(EditPoint p) const {
+ if (p.base != base_end_) {
+ ++p.base;
+ }
+ return ExtendFrom(p);
+ }
+
+ // increment the position within target (the element pointed to was inserted)
+ // then extend maximally
+ EditPoint InsertOne(EditPoint p) const {
+ if (p.target != target_end_) {
+ ++p.target;
+ }
+ return ExtendFrom(p);
+ }
+
+ // increment the position within base and target (the elements skipped in this way were
+ // present in both sequences)
+ EditPoint ExtendFrom(EditPoint p) const {
+ for (; p.base != base_end_ && p.target != target_end_; ++p.base, ++p.target) {
+ if (!ValuesEqual(p.base, p.target)) {
+ break;
+ }
+ }
+ return p;
+ }
+
+ // beginning of a range for storing per-edit state in endpoint_base_ and insert_
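+ // (for example, StorageOffset(0) == 0, StorageOffset(1) == 1,
+ // StorageOffset(2) == 3: the state for edit_count d occupies the half-open
+ // index range [d * (d + 1) / 2, (d + 1) * (d + 2) / 2))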
+ int64_t StorageOffset(int64_t edit_count) const {
+ return edit_count * (edit_count + 1) / 2;
+ }
+
+ // given edit_count and index, augment endpoint_base_[index] with the corresponding
+ // position in target (which is only implicitly represented in edit_count, index)
+ EditPoint GetEditPoint(int64_t edit_count, int64_t index) const {
+ DCHECK_GE(index, StorageOffset(edit_count));
+ DCHECK_LT(index, StorageOffset(edit_count + 1));
+ auto insertions_minus_deletions =
+ 2 * (index - StorageOffset(edit_count)) - edit_count;
+ auto maximal_base = endpoint_base_[index];
+ auto maximal_target = std::min(
+ target_begin_ + ((maximal_base - base_begin_) + insertions_minus_deletions),
+ target_end_);
+ return {maximal_base, maximal_target};
+ }
+
+ void Next() {
+ ++edit_count_;
+ // base_begin_ is only a dummy fill value; the newly allocated range is
+ // completely overwritten below.
+ endpoint_base_.resize(StorageOffset(edit_count_ + 1), base_begin_);
+ insert_.resize(StorageOffset(edit_count_ + 1), false);
+
+ auto previous_offset = StorageOffset(edit_count_ - 1);
+ auto current_offset = StorageOffset(edit_count_);
+
+ // try deleting from base first
+ for (int64_t i = 0, i_out = 0; i < edit_count_; ++i, ++i_out) {
+ auto previous_endpoint = GetEditPoint(edit_count_ - 1, i + previous_offset);
+ endpoint_base_[i_out + current_offset] = DeleteOne(previous_endpoint).base;
+ }
+
+ // check if inserting from target could do better
+ for (int64_t i = 0, i_out = 1; i < edit_count_; ++i, ++i_out) {
+ // retrieve the previously computed best endpoint for (edit_count_, i_out)
+ // for comparison with the best endpoint achievable with an insertion
+ auto endpoint_after_deletion = GetEditPoint(edit_count_, i_out + current_offset);
+
+ auto previous_endpoint = GetEditPoint(edit_count_ - 1, i + previous_offset);
+ auto endpoint_after_insertion = InsertOne(previous_endpoint);
+
+ if (endpoint_after_insertion.base - endpoint_after_deletion.base >= 0) {
+ // the insertion reached at least as far; keep it and mark the insertion in insert_
+ insert_[i_out + current_offset] = true;
+ endpoint_base_[i_out + current_offset] = endpoint_after_insertion.base;
+ }
+ }
+
+ // check for completion
+ EditPoint finish = {base_end_, target_end_};
+ for (int64_t i_out = 0; i_out < edit_count_ + 1; ++i_out) {
+ if (GetEditPoint(edit_count_, i_out + current_offset) == finish) {
+ finish_index_ = i_out + current_offset;
+ return;
+ }
+ }
+ }
+
+ bool Done() { return finish_index_ != -1; }
+
+ Result<std::shared_ptr<StructArray>> GetEdits(MemoryPool* pool) {
+ DCHECK(Done());
+
+ int64_t length = edit_count_ + 1;
+ ARROW_ASSIGN_OR_RAISE(auto insert_buf, AllocateEmptyBitmap(length, pool));
+ ARROW_ASSIGN_OR_RAISE(auto run_length_buf,
+ AllocateBuffer(length * sizeof(int64_t), pool));
+ auto run_length = reinterpret_cast<int64_t*>(run_length_buf->mutable_data());
+
+ auto index = finish_index_;
+ auto endpoint = GetEditPoint(edit_count_, finish_index_);
+
+ for (int64_t i = edit_count_; i > 0; --i) {
+ bool insert = insert_[index];
+ BitUtil::SetBitTo(insert_buf->mutable_data(), i, insert);
+
+ auto insertions_minus_deletions =
+ (endpoint.base - base_begin_) - (endpoint.target - target_begin_);
+ if (insert) {
+ ++insertions_minus_deletions;
+ } else {
+ --insertions_minus_deletions;
+ }
+ index = (i - 1 - insertions_minus_deletions) / 2 + StorageOffset(i - 1);
+
+ // endpoint of previous edit
+ auto previous = GetEditPoint(i - 1, index);
+ run_length[i] = endpoint.base - previous.base - !insert;
+ DCHECK_GE(run_length[i], 0);
+
+ endpoint = previous;
+ }
+ BitUtil::SetBitTo(insert_buf->mutable_data(), 0, false);
+ run_length[0] = endpoint.base - base_begin_;
+
+ return StructArray::Make(
+ {std::make_shared<BooleanArray>(length, std::move(insert_buf)),
+ std::make_shared<Int64Array>(length, std::move(run_length_buf))},
+ {field("insert", boolean()), field("run_length", int64())});
+ }
+
+ Result<std::shared_ptr<StructArray>> Diff() {
+ while (!Done()) {
+ Next();
+ }
+ return GetEdits(pool_);
+ }
+
+ private:
+ const Array& base_;
+ const Array& target_;
+ MemoryPool* pool_;
+ ValueComparator value_comparator_;
+ int64_t finish_index_ = -1;
+ int64_t edit_count_ = 0;
+ int64_t base_begin_, base_end_;
+ int64_t target_begin_, target_end_;
+ // each element of endpoint_base_ is the furthest position in base reachable given an
+ // edit_count and (# insertions) - (# deletions). Each bit of insert_ records whether
+ // the corresponding furthest position was reached via an insertion or a deletion
+ // (followed by a run of shared elements). See StorageOffset for the
+ // layout of these vectors
+ std::vector<int64_t> endpoint_base_;
+ std::vector<bool> insert_;
+};
+
+Result<std::shared_ptr<StructArray>> NullDiff(const Array& base, const Array& target,
+ MemoryPool* pool) {
+ bool insert = base.length() < target.length();
+ auto run_length = std::min(base.length(), target.length());
+ auto edit_count = std::max(base.length(), target.length()) - run_length;
+
+ TypedBufferBuilder<bool> insert_builder(pool);
+ RETURN_NOT_OK(insert_builder.Resize(edit_count + 1));
+ insert_builder.UnsafeAppend(false);
+ TypedBufferBuilder<int64_t> run_length_builder(pool);
+ RETURN_NOT_OK(run_length_builder.Resize(edit_count + 1));
+ run_length_builder.UnsafeAppend(run_length);
+ if (edit_count > 0) {
+ insert_builder.UnsafeAppend(edit_count, insert);
+ run_length_builder.UnsafeAppend(edit_count, 0);
+ }
+
+ std::shared_ptr<Buffer> insert_buf, run_length_buf;
+ RETURN_NOT_OK(insert_builder.Finish(&insert_buf));
+ RETURN_NOT_OK(run_length_builder.Finish(&run_length_buf));
+
+ return StructArray::Make({std::make_shared<BooleanArray>(edit_count + 1, insert_buf),
+ std::make_shared<Int64Array>(edit_count + 1, run_length_buf)},
+ {field("insert", boolean()), field("run_length", int64())});
+}
+
+Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
+ MemoryPool* pool) {
+ if (!base.type()->Equals(target.type())) {
+ return Status::TypeError("only taking the diff of like-typed arrays is supported.");
+ }
+
+ if (base.type()->id() == Type::NA) {
+ return NullDiff(base, target, pool);
+ } else if (base.type()->id() == Type::EXTENSION) {
+ auto base_storage = checked_cast<const ExtensionArray&>(base).storage();
+ auto target_storage = checked_cast<const ExtensionArray&>(target).storage();
+ return Diff(*base_storage, *target_storage, pool);
+ } else if (base.type()->id() == Type::DICTIONARY) {
+ return Status::NotImplemented("diffing arrays of type ", *base.type());
+ } else {
+ return QuadraticSpaceMyersDiff(base, target, pool).Diff();
+ }
+}
+
+using Formatter = std::function<void(const Array&, int64_t index, std::ostream*)>;
+
+static Result<Formatter> MakeFormatter(const DataType& type);
+
+class MakeFormatterImpl {
+ public:
+ Result<Formatter> Make(const DataType& type) && {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
+ return std::move(impl_);
+ }
+
+ private:
+ template <typename VISITOR>
+ friend Status VisitTypeInline(const DataType&, VISITOR*);
+
+ // factory implementation
+ Status Visit(const BooleanType&) {
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ *os << (checked_cast<const BooleanArray&>(array).Value(index) ? "true" : "false");
+ };
+ return Status::OK();
+ }
+
+ // format Numerics with std::ostream defaults
+ template <typename T>
+ enable_if_number<T, Status> Visit(const T&) {
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ const auto& numeric = checked_cast<const NumericArray<T>&>(array);
+ if (sizeof(decltype(numeric.Value(index))) == sizeof(char)) {
+ // override the std::ostream defaults for int8_t/uint8_t, which would
+ // otherwise be formatted as raw (potentially unprintable) characters
+ *os << static_cast<int16_t>(numeric.Value(index));
+ } else {
+ *os << numeric.Value(index);
+ }
+ };
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_date<T, Status> Visit(const T&) {
+ using unit = typename std::conditional<std::is_same<T, Date32Type>::value,
+ arrow_vendored::date::days,
+ std::chrono::milliseconds>::type;
+
+ static arrow_vendored::date::sys_days epoch{arrow_vendored::date::jan / 1 / 1970};
+
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ unit value(checked_cast<const NumericArray<T>&>(array).Value(index));
+ *os << arrow_vendored::date::format("%F", value + epoch);
+ };
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_time<T, Status> Visit(const T&) {
+ impl_ = MakeTimeFormatter<T, false>("%T");
+ return Status::OK();
+ }
+
+ Status Visit(const TimestampType&) {
+ impl_ = MakeTimeFormatter<TimestampType, true>("%F %T");
+ return Status::OK();
+ }
+
+ Status Visit(const DayTimeIntervalType&) {
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ auto day_millis = checked_cast<const DayTimeIntervalArray&>(array).Value(index);
+ *os << day_millis.days << "d" << day_millis.milliseconds << "ms";
+ };
+ return Status::OK();
+ }
+
+ // format Binary, LargeBinary and FixedSizeBinary in hexadecimal
+ template <typename T>
+ enable_if_binary_like<T, Status> Visit(const T&) {
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ *os << HexEncode(checked_cast<const ArrayType&>(array).GetView(index));
+ };
+ return Status::OK();
+ }
+
+ // format Strings with ", \n, \r, \t and \ escaped
+ template <typename T>
+ enable_if_string_like<T, Status> Visit(const T&) {
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ *os << "\"" << Escape(checked_cast<const ArrayType&>(array).GetView(index)) << "\"";
+ };
+ return Status::OK();
+ }
+
+ // format Decimals with Decimal128Array::FormatValue
+ Status Visit(const Decimal128Type&) {
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ *os << checked_cast<const Decimal128Array&>(array).FormatValue(index);
+ };
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_list_like<T, Status> Visit(const T& t) {
+ struct ListImpl {
+ explicit ListImpl(Formatter f) : values_formatter_(std::move(f)) {}
+
+ void operator()(const Array& array, int64_t index, std::ostream* os) {
+ const auto& list_array =
+ checked_cast<const typename TypeTraits<T>::ArrayType&>(array);
+ *os << "[";
+ for (int32_t i = 0; i < list_array.value_length(index); ++i) {
+ if (i != 0) {
+ *os << ", ";
+ }
+ values_formatter_(*list_array.values(), i + list_array.value_offset(index), os);
+ }
+ *os << "]";
+ }
+
+ Formatter values_formatter_;
+ };
+
+ ARROW_ASSIGN_OR_RAISE(auto values_formatter, MakeFormatter(*t.value_type()));
+ impl_ = ListImpl(std::move(values_formatter));
+ return Status::OK();
+ }
+
+ // TODO(bkietz) format maps better
+
+ Status Visit(const StructType& t) {
+ struct StructImpl {
+ explicit StructImpl(std::vector<Formatter> f) : field_formatters_(std::move(f)) {}
+
+ void operator()(const Array& array, int64_t index, std::ostream* os) {
+ const auto& struct_array = checked_cast<const StructArray&>(array);
+ *os << "{";
+ for (int i = 0, printed = 0; i < struct_array.num_fields(); ++i) {
+ if (struct_array.field(i)->IsNull(index)) {
+ continue;
+ }
+ if (printed != 0) {
+ *os << ", ";
+ }
+ ++printed;
+ *os << struct_array.struct_type()->field(i)->name() << ": ";
+ field_formatters_[i](*struct_array.field(i), index, os);
+ }
+ *os << "}";
+ }
+
+ std::vector<Formatter> field_formatters_;
+ };
+
+ std::vector<Formatter> field_formatters(t.num_fields());
+ for (int i = 0; i < t.num_fields(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(field_formatters[i], MakeFormatter(*t.field(i)->type()));
+ }
+
+ impl_ = StructImpl(std::move(field_formatters));
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& t) {
+ struct UnionImpl {
+ explicit UnionImpl(std::vector<Formatter> f) : field_formatters_(std::move(f)) {}
+
+ void DoFormat(const UnionArray& array, int64_t index, int64_t child_index,
+ std::ostream* os) {
+ auto type_code = array.raw_type_codes()[index];
+ auto child = array.field(array.child_id(index));
+
+ *os << "{" << static_cast<int16_t>(type_code) << ": ";
+ if (child->IsNull(child_index)) {
+ *os << "null";
+ } else {
+ field_formatters_[type_code](*child, child_index, os);
+ }
+ *os << "}";
+ }
+
+ std::vector<Formatter> field_formatters_;
+ };
+
+ struct SparseImpl : UnionImpl {
+ using UnionImpl::UnionImpl;
+
+ void operator()(const Array& array, int64_t index, std::ostream* os) {
+ const auto& union_array = checked_cast<const SparseUnionArray&>(array);
+ DoFormat(union_array, index, index, os);
+ }
+ };
+
+ struct DenseImpl : UnionImpl {
+ using UnionImpl::UnionImpl;
+
+ void operator()(const Array& array, int64_t index, std::ostream* os) {
+ const auto& union_array = checked_cast<const DenseUnionArray&>(array);
+ DoFormat(union_array, index, union_array.raw_value_offsets()[index], os);
+ }
+ };
+
+ std::vector<Formatter> field_formatters(t.max_type_code() + 1);
+ for (int i = 0; i < t.num_fields(); ++i) {
+ auto type_id = t.type_codes()[i];
+ ARROW_ASSIGN_OR_RAISE(field_formatters[type_id],
+ MakeFormatter(*t.field(i)->type()));
+ }
+
+ if (t.mode() == UnionMode::SPARSE) {
+ impl_ = SparseImpl(std::move(field_formatters));
+ } else {
+ impl_ = DenseImpl(std::move(field_formatters));
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const NullType& t) {
+ return Status::NotImplemented("formatting diffs between arrays of type ", t);
+ }
+
+ Status Visit(const DictionaryType& t) {
+ return Status::NotImplemented("formatting diffs between arrays of type ", t);
+ }
+
+ Status Visit(const ExtensionType& t) {
+ return Status::NotImplemented("formatting diffs between arrays of type ", t);
+ }
+
+ Status Visit(const DurationType& t) {
+ return Status::NotImplemented("formatting diffs between arrays of type ", t);
+ }
+
+ Status Visit(const MonthIntervalType& t) {
+ return Status::NotImplemented("formatting diffs between arrays of type ", t);
+ }
+
+ template <typename T, bool AddEpoch>
+ Formatter MakeTimeFormatter(const std::string& fmt_str) {
+ return [fmt_str](const Array& array, int64_t index, std::ostream* os) {
+ auto fmt = fmt_str.c_str();
+ auto unit = checked_cast<const T&>(*array.type()).unit();
+ auto value = checked_cast<const NumericArray<T>&>(array).Value(index);
+ using arrow_vendored::date::format;
+ using std::chrono::nanoseconds;
+ using std::chrono::microseconds;
+ using std::chrono::milliseconds;
+ using std::chrono::seconds;
+ if (AddEpoch) {
+ static arrow_vendored::date::sys_days epoch{arrow_vendored::date::jan / 1 / 1970};
+
+ switch (unit) {
+ case TimeUnit::NANO:
+ *os << format(fmt, static_cast<nanoseconds>(value) + epoch);
+ break;
+ case TimeUnit::MICRO:
+ *os << format(fmt, static_cast<microseconds>(value) + epoch);
+ break;
+ case TimeUnit::MILLI:
+ *os << format(fmt, static_cast<milliseconds>(value) + epoch);
+ break;
+ case TimeUnit::SECOND:
+ *os << format(fmt, static_cast<seconds>(value) + epoch);
+ break;
+ }
+ return;
+ }
+ switch (unit) {
+ case TimeUnit::NANO:
+ *os << format(fmt, static_cast<nanoseconds>(value));
+ break;
+ case TimeUnit::MICRO:
+ *os << format(fmt, static_cast<microseconds>(value));
+ break;
+ case TimeUnit::MILLI:
+ *os << format(fmt, static_cast<milliseconds>(value));
+ break;
+ case TimeUnit::SECOND:
+ *os << format(fmt, static_cast<seconds>(value));
+ break;
+ }
+ };
+ }
+
+ Formatter impl_;
+};
+
+static Result<Formatter> MakeFormatter(const DataType& type) {
+ return MakeFormatterImpl{}.Make(type);
+}
+
+Status VisitEditScript(
+ const Array& edits,
+ const std::function<Status(int64_t delete_begin, int64_t delete_end,
+ int64_t insert_begin, int64_t insert_end)>& visitor) {
+ static const auto edits_type =
+ struct_({field("insert", boolean()), field("run_length", int64())});
+ DCHECK(edits.type()->Equals(*edits_type));
+ DCHECK_GE(edits.length(), 1);
+
+ auto insert = checked_pointer_cast<BooleanArray>(
+ checked_cast<const StructArray&>(edits).field(0));
+ auto run_lengths =
+ checked_pointer_cast<Int64Array>(checked_cast<const StructArray&>(edits).field(1));
+
+ DCHECK(!insert->Value(0));
+
+ auto length = run_lengths->Value(0);
+ int64_t base_begin, base_end, target_begin, target_end;
+ base_begin = base_end = target_begin = target_end = length;
+ for (int64_t i = 1; i < edits.length(); ++i) {
+ if (insert->Value(i)) {
+ ++target_end;
+ } else {
+ ++base_end;
+ }
+ length = run_lengths->Value(i);
+ if (length != 0) {
+ RETURN_NOT_OK(visitor(base_begin, base_end, target_begin, target_end));
+ base_begin = base_end = base_end + length;
+ target_begin = target_end = target_end + length;
+ }
+ }
+ if (length == 0) {
+ return visitor(base_begin, base_end, target_begin, target_end);
+ }
+ return Status::OK();
+}
+
+class UnifiedDiffFormatter {
+ public:
+ UnifiedDiffFormatter(std::ostream* os, Formatter formatter)
+ : os_(os), formatter_(std::move(formatter)) {}
+
+ Status operator()(int64_t delete_begin, int64_t delete_end, int64_t insert_begin,
+ int64_t insert_end) {
+ *os_ << "@@ -" << delete_begin << ", +" << insert_begin << " @@" << std::endl;
+
+ for (int64_t i = delete_begin; i < delete_end; ++i) {
+ *os_ << "-";
+ if (base_->IsValid(i)) {
+ formatter_(*base_, i, &*os_);
+ } else {
+ *os_ << "null";
+ }
+ *os_ << std::endl;
+ }
+
+ for (int64_t i = insert_begin; i < insert_end; ++i) {
+ *os_ << "+";
+ if (target_->IsValid(i)) {
+ formatter_(*target_, i, &*os_);
+ } else {
+ *os_ << "null";
+ }
+ *os_ << std::endl;
+ }
+
+ return Status::OK();
+ }
+
+ Status operator()(const Array& edits, const Array& base, const Array& target) {
+ if (edits.length() == 1) {
+ return Status::OK();
+ }
+ base_ = &base;
+ target_ = &target;
+ *os_ << std::endl;
+ return VisitEditScript(edits, *this);
+ }
+
+ private:
+ std::ostream* os_ = nullptr;
+ const Array* base_ = nullptr;
+ const Array* target_ = nullptr;
+ Formatter formatter_;
+};
+
+Result<std::function<Status(const Array& edits, const Array& base, const Array& target)>>
+MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os) {
+ if (type.id() == Type::NA) {
+ return [os](const Array& edits, const Array& base, const Array& target) {
+ if (base.length() != target.length()) {
+ *os << "# Null arrays differed" << std::endl
+ << "-" << base.length() << " nulls" << std::endl
+ << "+" << target.length() << " nulls" << std::endl;
+ }
+ return Status::OK();
+ };
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto formatter, MakeFormatter(type));
+ return UnifiedDiffFormatter(os, std::move(formatter));
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/diff.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/diff.h
new file mode 100644
index 00000000000..a405164b333
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/diff.h
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <iosfwd>
+#include <memory>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_nested.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief Compare two arrays, returning an edit script which expresses the difference
+/// between them
+///
+/// An edit script is an array of struct(insert: bool, run_length: int64_t).
+/// Each element of "insert" determines whether an element was inserted into (true)
+/// or deleted from (false) base. Each insertion or deletion is followed by a run of
+/// elements which are unchanged from base to target; the length of this run is stored
+/// in "run_length". (Note that the edit script begins and ends with a run of shared
+/// elements but both fields of the struct must have the same length. To accommodate this
+/// the first element of "insert" should be ignored.)
+///
+/// For example for base "hlloo" and target "hello", the edit script would be
+/// [
+/// {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
+/// {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
+/// {"insert": false, "run_length": 0} // delete("o") then an empty run
+/// ]
+///
+/// Diffing arrays containing nulls is not currently supported.
+///
+/// \param[in] base baseline for comparison
+/// \param[in] target an array of identical type to base whose elements differ from base's
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return an edit script array which can be applied to base to produce target
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
+ MemoryPool* pool = default_memory_pool());
+
+/// \brief visitor interface for easy traversal of an edit script
+///
+/// visitor will be called for each hunk of insertions and deletions.
+ARROW_EXPORT Status VisitEditScript(
+ const Array& edits,
+ const std::function<Status(int64_t delete_begin, int64_t delete_end,
+ int64_t insert_begin, int64_t insert_end)>& visitor);
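+
+// For example (editor's sketch, not part of the upstream sources): counting
+// the hunks in an edit script produced by Diff:
+//
+//   int64_t hunks = 0;
+//   RETURN_NOT_OK(VisitEditScript(
+//       *edits, [&](int64_t, int64_t, int64_t, int64_t) {
+//         ++hunks;
+//         return Status::OK();
+//       }));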
+
+/// \brief return a function which will format an edit script in unified
+/// diff format to os, given base and target arrays of type
+ARROW_EXPORT Result<
+ std::function<Status(const Array& edits, const Array& base, const Array& target)>>
+MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);
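+
+// For example (editor's sketch, not part of the upstream sources; assumes
+// <iostream> is included and `base` and `target` are like-typed Arrays):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto edits, Diff(base, target));
+//   ARROW_ASSIGN_OR_RAISE(auto format,
+//                         MakeUnifiedDiffFormatter(*base.type(), &std::cout));
+//   RETURN_NOT_OK(format(*edits, base, target));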
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc
new file mode 100644
index 00000000000..ed26ecff4e0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc
@@ -0,0 +1,754 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/util.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_dict.h"
+#include "arrow/array/array_primitive.h"
+#include "arrow/array/concatenate.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/extension_type.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+// ----------------------------------------------------------------------
+// Loading from ArrayData
+
+namespace {
+
+class ArrayDataWrapper {
+ public:
+ ArrayDataWrapper(const std::shared_ptr<ArrayData>& data, std::shared_ptr<Array>* out)
+ : data_(data), out_(out) {}
+
+ template <typename T>
+ Status Visit(const T&) {
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+ *out_ = std::make_shared<ArrayType>(data_);
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionType& type) {
+ *out_ = type.MakeArray(data_);
+ return Status::OK();
+ }
+
+ const std::shared_ptr<ArrayData>& data_;
+ std::shared_ptr<Array>* out_;
+};
+
+class ArrayDataEndianSwapper {
+ public:
+ ArrayDataEndianSwapper(const std::shared_ptr<ArrayData>& data, int64_t length)
+ : data_(data), length_(length) {
+ out_ = data->Copy();
+ }
+
+ Status SwapType(const DataType& type) {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
+ RETURN_NOT_OK(SwapChildren(type.fields()));
+ if (internal::HasValidityBitmap(type.id())) {
+ // Copy null bitmap
+ out_->buffers[0] = data_->buffers[0];
+ }
+ return Status::OK();
+ }
+
+ Status SwapChildren(const FieldVector& child_fields) {
+ for (size_t i = 0; i < child_fields.size(); i++) {
+ ARROW_ASSIGN_OR_RAISE(out_->child_data[i],
+ internal::SwapEndianArrayData(data_->child_data[i]));
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ Result<std::shared_ptr<Buffer>> ByteSwapBuffer(
+ const std::shared_ptr<Buffer>& in_buffer) {
+ if (sizeof(T) == 1) {
+ // single-byte elements need no swapping; reuse the original buffer
+ return in_buffer;
+ }
+ auto in_data = reinterpret_cast<const T*>(in_buffer->data());
+ ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size()));
+ auto out_data = reinterpret_cast<T*>(out_buffer->mutable_data());
+ int64_t length = in_buffer->size() / sizeof(T);
+ for (int64_t i = 0; i < length; i++) {
+ out_data[i] = BitUtil::ByteSwap(in_data[i]);
+ }
+ return std::move(out_buffer);
+ }
+
+ template <typename VALUE_TYPE>
+ Status SwapOffsets(int index) {
+ if (data_->buffers[index] == nullptr || data_->buffers[index]->size() == 0) {
+ out_->buffers[index] = data_->buffers[index];
+ return Status::OK();
+ }
+ // Except for unions, an offsets buffer has one more element than data->length
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[index],
+ ByteSwapBuffer<VALUE_TYPE>(data_->buffers[index]));
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<std::is_base_of<FixedWidthType, T>::value &&
+ !std::is_base_of<FixedSizeBinaryType, T>::value &&
+ !std::is_base_of<DictionaryType, T>::value,
+ Status>
+ Visit(const T& type) {
+ using value_type = typename T::c_type;
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
+ ByteSwapBuffer<value_type>(data_->buffers[1]));
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal128Type& type) {
+ auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
+ ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
+ auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
+ int64_t length = data_->buffers[1]->size() / (sizeof(uint64_t) * 2);
+ for (int64_t i = 0; i < length; i++) {
+ uint64_t tmp;
+ auto idx = i * 2;
+#if ARROW_LITTLE_ENDIAN
+ tmp = BitUtil::FromBigEndian(data[idx]);
+ new_data[idx] = BitUtil::FromBigEndian(data[idx + 1]);
+ new_data[idx + 1] = tmp;
+#else
+ tmp = BitUtil::FromLittleEndian(data[idx]);
+ new_data[idx] = BitUtil::FromLittleEndian(data[idx + 1]);
+ new_data[idx + 1] = tmp;
+#endif
+ }
+ out_->buffers[1] = std::move(new_buffer);
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal256Type& type) {
+ auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
+ ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
+ auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
+ int64_t length = data_->buffers[1]->size() / (sizeof(uint64_t) * 4);
+ for (int64_t i = 0; i < length; i++) {
+ uint64_t tmp0, tmp1, tmp2;
+ auto idx = i * 4;
+#if ARROW_LITTLE_ENDIAN
+ tmp0 = BitUtil::FromBigEndian(data[idx]);
+ tmp1 = BitUtil::FromBigEndian(data[idx + 1]);
+ tmp2 = BitUtil::FromBigEndian(data[idx + 2]);
+ new_data[idx] = BitUtil::FromBigEndian(data[idx + 3]);
+ new_data[idx + 1] = tmp2;
+ new_data[idx + 2] = tmp1;
+ new_data[idx + 3] = tmp0;
+#else
+ tmp0 = BitUtil::FromLittleEndian(data[idx]);
+ tmp1 = BitUtil::FromLittleEndian(data[idx + 1]);
+ tmp2 = BitUtil::FromLittleEndian(data[idx + 2]);
+ new_data[idx] = BitUtil::FromLittleEndian(data[idx + 3]);
+ new_data[idx + 1] = tmp2;
+ new_data[idx + 2] = tmp1;
+ new_data[idx + 3] = tmp0;
+#endif
+ }
+ out_->buffers[1] = std::move(new_buffer);
+ return Status::OK();
+ }
+
+ Status Visit(const DayTimeIntervalType& type) {
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1], ByteSwapBuffer<uint32_t>(data_->buffers[1]));
+ return Status::OK();
+ }
+
+ Status Visit(const NullType& type) { return Status::OK(); }
+ Status Visit(const BooleanType& type) { return Status::OK(); }
+ Status Visit(const Int8Type& type) { return Status::OK(); }
+ Status Visit(const UInt8Type& type) { return Status::OK(); }
+ Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); }
+ Status Visit(const FixedSizeListType& type) { return Status::OK(); }
+ Status Visit(const StructType& type) { return Status::OK(); }
+ Status Visit(const UnionType& type) {
+ out_->buffers[1] = data_->buffers[1];
+ if (type.mode() == UnionMode::DENSE) {
+ RETURN_NOT_OK(SwapOffsets<int32_t>(2));
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<std::is_same<BinaryType, T>::value || std::is_same<StringType, T>::value,
+ Status>
+ Visit(const T& type) {
+ RETURN_NOT_OK(SwapOffsets<int32_t>(1));
+ out_->buffers[2] = data_->buffers[2];
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<std::is_same<LargeBinaryType, T>::value ||
+ std::is_same<LargeStringType, T>::value,
+ Status>
+ Visit(const T& type) {
+ RETURN_NOT_OK(SwapOffsets<int64_t>(1));
+ out_->buffers[2] = data_->buffers[2];
+ return Status::OK();
+ }
+
+ Status Visit(const ListType& type) {
+ RETURN_NOT_OK(SwapOffsets<int32_t>(1));
+ return Status::OK();
+ }
+ Status Visit(const LargeListType& type) {
+ RETURN_NOT_OK(SwapOffsets<int64_t>(1));
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& type) {
+ // dictionary was already swapped in ReadDictionary() in ipc/reader.cc
+ RETURN_NOT_OK(SwapType(*type.index_type()));
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionType& type) {
+ RETURN_NOT_OK(SwapType(*type.storage_type()));
+ return Status::OK();
+ }
+
+ const std::shared_ptr<ArrayData>& data_;
+ int64_t length_;
+ std::shared_ptr<ArrayData> out_;
+};
+
+} // namespace
+
+namespace internal {
+
+Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
+ const std::shared_ptr<ArrayData>& data) {
+ if (data->offset != 0) {
+ return Status::Invalid("Unsupported data format: data.offset != 0");
+ }
+ ArrayDataEndianSwapper swapper(data, data->length);
+ RETURN_NOT_OK(swapper.SwapType(*data->type));
+ return std::move(swapper.out_);
+}
+
+} // namespace internal
+
+std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data) {
+ std::shared_ptr<Array> out;
+ ArrayDataWrapper wrapper_visitor(data, &out);
+ DCHECK_OK(VisitTypeInline(*data->type, &wrapper_visitor));
+ DCHECK(out);
+ return out;
+}
+
+// ----------------------------------------------------------------------
+// Misc APIs
+
+namespace {
+
+// Computes the maximum buffer length required, then allocates a single zeroed
+// buffer that is shared anywhere a buffer is required
+class NullArrayFactory {
+ public:
+ struct GetBufferLength {
+ GetBufferLength(const std::shared_ptr<DataType>& type, int64_t length)
+ : type_(*type), length_(length), buffer_length_(BitUtil::BytesForBits(length)) {}
+
+ Result<int64_t> Finish() && {
+ RETURN_NOT_OK(VisitTypeInline(type_, this));
+ return buffer_length_;
+ }
+
+ template <typename T, typename = decltype(TypeTraits<T>::bytes_required(0))>
+ Status Visit(const T&) {
+ return MaxOf(TypeTraits<T>::bytes_required(length_));
+ }
+
+ template <typename T>
+ enable_if_var_size_list<T, Status> Visit(const T&) {
+ // values array may be empty, but there must be at least one offset of 0
+ return MaxOf(sizeof(typename T::offset_type) * (length_ + 1));
+ }
+
+ template <typename T>
+ enable_if_base_binary<T, Status> Visit(const T&) {
+ // values buffer may be empty, but there must be at least one offset of 0
+ return MaxOf(sizeof(typename T::offset_type) * (length_ + 1));
+ }
+
+ Status Visit(const FixedSizeListType& type) {
+ return MaxOf(GetBufferLength(type.value_type(), type.list_size() * length_));
+ }
+
+ Status Visit(const FixedSizeBinaryType& type) {
+ return MaxOf(type.byte_width() * length_);
+ }
+
+ Status Visit(const StructType& type) {
+ for (const auto& child : type.fields()) {
+ RETURN_NOT_OK(MaxOf(GetBufferLength(child->type(), length_)));
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ // type codes
+ RETURN_NOT_OK(MaxOf(length_));
+ if (type.mode() == UnionMode::DENSE) {
+ // offsets
+ RETURN_NOT_OK(MaxOf(sizeof(int32_t) * length_));
+ }
+ for (const auto& child : type.fields()) {
+ RETURN_NOT_OK(MaxOf(GetBufferLength(child->type(), length_)));
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& type) {
+ RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_)));
+ return MaxOf(GetBufferLength(type.index_type(), length_));
+ }
+
+ Status Visit(const ExtensionType& type) {
+    // XXX is an extension array's length always == storage length?
+ return MaxOf(GetBufferLength(type.storage_type(), length_));
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("construction of all-null ", type);
+ }
+
+ private:
+ Status MaxOf(GetBufferLength&& other) {
+ ARROW_ASSIGN_OR_RAISE(int64_t buffer_length, std::move(other).Finish());
+ return MaxOf(buffer_length);
+ }
+
+ Status MaxOf(int64_t buffer_length) {
+ if (buffer_length > buffer_length_) {
+ buffer_length_ = buffer_length;
+ }
+ return Status::OK();
+ }
+
+ const DataType& type_;
+ int64_t length_, buffer_length_;
+ };
+
+ NullArrayFactory(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ int64_t length)
+ : pool_(pool), type_(type), length_(length) {}
+
+ Status CreateBuffer() {
+ ARROW_ASSIGN_OR_RAISE(int64_t buffer_length,
+ GetBufferLength(type_, length_).Finish());
+ ARROW_ASSIGN_OR_RAISE(buffer_, AllocateBuffer(buffer_length, pool_));
+ std::memset(buffer_->mutable_data(), 0, buffer_->size());
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> Create() {
+ if (buffer_ == nullptr) {
+ RETURN_NOT_OK(CreateBuffer());
+ }
+ std::vector<std::shared_ptr<ArrayData>> child_data(type_->num_fields());
+ out_ = ArrayData::Make(type_, length_, {buffer_}, child_data, length_, 0);
+ RETURN_NOT_OK(VisitTypeInline(*type_, this));
+ return out_;
+ }
+
+ Status Visit(const NullType&) {
+ out_->buffers.resize(1, nullptr);
+ return Status::OK();
+ }
+
+ Status Visit(const FixedWidthType&) {
+ out_->buffers.resize(2, buffer_);
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_base_binary<T, Status> Visit(const T&) {
+ out_->buffers.resize(3, buffer_);
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_var_size_list<T, Status> Visit(const T& type) {
+ out_->buffers.resize(2, buffer_);
+ ARROW_ASSIGN_OR_RAISE(out_->child_data[0], CreateChild(0, /*length=*/0));
+ return Status::OK();
+ }
+
+ Status Visit(const FixedSizeListType& type) {
+ ARROW_ASSIGN_OR_RAISE(out_->child_data[0],
+ CreateChild(0, length_ * type.list_size()));
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ for (int i = 0; i < type_->num_fields(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(out_->child_data[i], CreateChild(i, length_));
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ out_->buffers.resize(2);
+
+ // First buffer is always null
+ out_->buffers[0] = nullptr;
+
+    // Type codes are all zero, so we can reuse buffer_, whose memory has
+    // already been zeroed
+ out_->buffers[1] = buffer_;
+
+ // For sparse unions, we now create children with the same length as the
+ // parent
+ int64_t child_length = length_;
+ if (type.mode() == UnionMode::DENSE) {
+ // For dense unions, we set the offsets to all zero and create children
+ // with length 1
+ out_->buffers.resize(3);
+ out_->buffers[2] = buffer_;
+
+ child_length = 1;
+ }
+ for (int i = 0; i < type_->num_fields(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(out_->child_data[i], CreateChild(i, child_length));
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& type) {
+ out_->buffers.resize(2, buffer_);
+ ARROW_ASSIGN_OR_RAISE(auto typed_null_dict, MakeArrayOfNull(type.value_type(), 0));
+ out_->dictionary = typed_null_dict->data();
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("construction of all-null ", type);
+ }
+
+ Result<std::shared_ptr<ArrayData>> CreateChild(int i, int64_t length) {
+ NullArrayFactory child_factory(pool_, type_->field(i)->type(), length);
+ child_factory.buffer_ = buffer_;
+ return child_factory.Create();
+ }
+
+ MemoryPool* pool_;
+ std::shared_ptr<DataType> type_;
+ int64_t length_;
+ std::shared_ptr<ArrayData> out_;
+ std::shared_ptr<Buffer> buffer_;
+};
+
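+// Builds an array whose slots all equal the given scalar: fixed-width values
+// are repeated byte-wise, while nested values are repeated by concatenating
+// copies of the scalar's value array(s).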
+class RepeatedArrayFactory {
+ public:
+ RepeatedArrayFactory(MemoryPool* pool, const Scalar& scalar, int64_t length)
+ : pool_(pool), scalar_(scalar), length_(length) {}
+
+ Result<std::shared_ptr<Array>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(*scalar_.type, this));
+ return out_;
+ }
+
+ Status Visit(const NullType& type) {
+ DCHECK(false); // already forwarded to MakeArrayOfNull
+ return Status::OK();
+ }
+
+ Status Visit(const BooleanType&) {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBitmap(length_, pool_));
+ BitUtil::SetBitsTo(buffer->mutable_data(), 0, length_,
+ checked_cast<const BooleanScalar&>(scalar_).value);
+ out_ = std::make_shared<BooleanArray>(length_, buffer);
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<is_number_type<T>::value || is_temporal_type<T>::value, Status> Visit(
+ const T&) {
+ auto value = checked_cast<const typename TypeTraits<T>::ScalarType&>(scalar_).value;
+ return FinishFixedWidth(&value, sizeof(value));
+ }
+
+ Status Visit(const FixedSizeBinaryType& type) {
+ auto value = checked_cast<const FixedSizeBinaryScalar&>(scalar_).value;
+ return FinishFixedWidth(value->data(), type.byte_width());
+ }
+
+ template <typename T>
+ enable_if_decimal<T, Status> Visit(const T&) {
+ using ScalarType = typename TypeTraits<T>::ScalarType;
+ auto value = checked_cast<const ScalarType&>(scalar_).value.ToBytes();
+ return FinishFixedWidth(value.data(), value.size());
+ }
+
+ Status Visit(const Decimal256Type&) {
+ auto value = checked_cast<const Decimal256Scalar&>(scalar_).value.ToBytes();
+ return FinishFixedWidth(value.data(), value.size());
+ }
+
+ template <typename T>
+ enable_if_base_binary<T, Status> Visit(const T&) {
+ std::shared_ptr<Buffer> value =
+ checked_cast<const typename TypeTraits<T>::ScalarType&>(scalar_).value;
+ std::shared_ptr<Buffer> values_buffer, offsets_buffer;
+ RETURN_NOT_OK(CreateBufferOf(value->data(), value->size(), &values_buffer));
+ auto size = static_cast<typename T::offset_type>(value->size());
+ RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer));
+ out_ = std::make_shared<typename TypeTraits<T>::ArrayType>(length_, offsets_buffer,
+ values_buffer);
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_var_size_list<T, Status> Visit(const T& type) {
+ using ScalarType = typename TypeTraits<T>::ScalarType;
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+
+ auto value = checked_cast<const ScalarType&>(scalar_).value;
+
+ ArrayVector values(length_, value);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, Concatenate(values, pool_));
+
+ std::shared_ptr<Buffer> offsets_buffer;
+ auto size = static_cast<typename T::offset_type>(value->length());
+ RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer));
+
+ out_ =
+ std::make_shared<ArrayType>(scalar_.type, length_, offsets_buffer, value_array);
+ return Status::OK();
+ }
+
+ Status Visit(const FixedSizeListType& type) {
+ auto value = checked_cast<const FixedSizeListScalar&>(scalar_).value;
+
+ ArrayVector values(length_, value);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, Concatenate(values, pool_));
+
+ out_ = std::make_shared<FixedSizeListArray>(scalar_.type, length_, value_array);
+ return Status::OK();
+ }
+
+ Status Visit(const MapType& type) {
+ auto map_scalar = checked_cast<const MapScalar&>(scalar_);
+ auto struct_array = checked_cast<const StructArray*>(map_scalar.value.get());
+
+ ArrayVector keys(length_, struct_array->field(0));
+ ArrayVector values(length_, struct_array->field(1));
+
+ ARROW_ASSIGN_OR_RAISE(auto key_array, Concatenate(keys, pool_));
+ ARROW_ASSIGN_OR_RAISE(auto value_array, Concatenate(values, pool_));
+
+ std::shared_ptr<Buffer> offsets_buffer;
+ auto size = static_cast<typename MapType::offset_type>(struct_array->length());
+ RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer));
+
+ out_ = std::make_shared<MapArray>(scalar_.type, length_, std::move(offsets_buffer),
+ std::move(key_array), std::move(value_array));
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& type) {
+ const auto& value = checked_cast<const DictionaryScalar&>(scalar_).value;
+ ARROW_ASSIGN_OR_RAISE(auto indices,
+ MakeArrayFromScalar(*value.index, length_, pool_));
+ out_ = std::make_shared<DictionaryArray>(scalar_.type, std::move(indices),
+ value.dictionary);
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ ArrayVector fields;
+ for (const auto& value : checked_cast<const StructScalar&>(scalar_).value) {
+ fields.emplace_back();
+ ARROW_ASSIGN_OR_RAISE(fields.back(), MakeArrayFromScalar(*value, length_, pool_));
+ }
+ out_ = std::make_shared<StructArray>(scalar_.type, length_, std::move(fields));
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionType& type) {
+ return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+ }
+
+ Status Visit(const DenseUnionType& type) {
+ return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+ }
+
+ Status Visit(const SparseUnionType& type) {
+ return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+ }
+
+ template <typename OffsetType>
+ Status CreateOffsetsBuffer(OffsetType value_length, std::shared_ptr<Buffer>* out) {
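+    // Writes length_ + 1 offsets: 0, value_length, 2 * value_length, ...,
+    // length_ * value_length.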
+ TypedBufferBuilder<OffsetType> builder(pool_);
+ RETURN_NOT_OK(builder.Resize(length_ + 1));
+ OffsetType offset = 0;
+ for (int64_t i = 0; i < length_ + 1; ++i, offset += value_length) {
+ builder.UnsafeAppend(offset);
+ }
+ return builder.Finish(out);
+ }
+
+ Status CreateBufferOf(const void* data, size_t data_length,
+ std::shared_ptr<Buffer>* out) {
+ BufferBuilder builder(pool_);
+ RETURN_NOT_OK(builder.Resize(length_ * data_length));
+ for (int64_t i = 0; i < length_; ++i) {
+ builder.UnsafeAppend(data, data_length);
+ }
+ return builder.Finish(out);
+ }
+
+ Status FinishFixedWidth(const void* data, size_t data_length) {
+ std::shared_ptr<Buffer> buffer;
+ RETURN_NOT_OK(CreateBufferOf(data, data_length, &buffer));
+ out_ = MakeArray(
+ ArrayData::Make(scalar_.type, length_, {nullptr, std::move(buffer)}, 0));
+ return Status::OK();
+ }
+
+ MemoryPool* pool_;
+ const Scalar& scalar_;
+ int64_t length_;
+ std::shared_ptr<Array> out_;
+};
+
+} // namespace
+
+Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
+ int64_t length, MemoryPool* pool) {
+ ARROW_ASSIGN_OR_RAISE(auto data, NullArrayFactory(pool, type, length).Create());
+ return MakeArray(data);
+}
+
+Result<std::shared_ptr<Array>> MakeArrayFromScalar(const Scalar& scalar, int64_t length,
+ MemoryPool* pool) {
+ if (!scalar.is_valid) {
+ return MakeArrayOfNull(scalar.type, length, pool);
+ }
+ return RepeatedArrayFactory(pool, scalar, length).Create();
+}
+
+namespace internal {
+
+std::vector<ArrayVector> RechunkArraysConsistently(
+ const std::vector<ArrayVector>& groups) {
+ if (groups.size() <= 1) {
+ return groups;
+ }
+ int64_t total_length = 0;
+ for (const auto& array : groups.front()) {
+ total_length += array->length();
+ }
+#ifndef NDEBUG
+ for (const auto& group : groups) {
+ int64_t group_length = 0;
+ for (const auto& array : group) {
+ group_length += array->length();
+ }
+ DCHECK_EQ(group_length, total_length)
+ << "Array groups should have the same total number of elements";
+ }
+#endif
+ if (total_length == 0) {
+ return groups;
+ }
+
+ // Set up result vectors
+ std::vector<ArrayVector> rechunked_groups(groups.size());
+
+ // Set up progress counters
+ std::vector<ArrayVector::const_iterator> current_arrays;
+ std::vector<int64_t> array_offsets;
+ for (const auto& group : groups) {
+ current_arrays.emplace_back(group.cbegin());
+ array_offsets.emplace_back(0);
+ }
+
+ // Scan all array vectors at once, rechunking along the way
+ int64_t start = 0;
+ while (start < total_length) {
+ // First compute max possible length for next chunk
+ int64_t chunk_length = std::numeric_limits<int64_t>::max();
+ for (size_t i = 0; i < groups.size(); i++) {
+ auto& arr_it = current_arrays[i];
+ auto& offset = array_offsets[i];
+ // Skip any done arrays (including 0-length arrays)
+ while (offset == (*arr_it)->length()) {
+ ++arr_it;
+ offset = 0;
+ }
+ const auto& array = *arr_it;
+ DCHECK_GT(array->length(), offset);
+ chunk_length = std::min(chunk_length, array->length() - offset);
+ }
+ DCHECK_GT(chunk_length, 0);
+
+ // Then slice all arrays along this chunk size
+ for (size_t i = 0; i < groups.size(); i++) {
+ const auto& array = *current_arrays[i];
+ auto& offset = array_offsets[i];
+ if (offset == 0 && array->length() == chunk_length) {
+ // Slice spans entire array
+ rechunked_groups[i].emplace_back(array);
+ } else {
+        DCHECK_LE(offset + chunk_length, array->length());
+ rechunked_groups[i].emplace_back(array->Slice(offset, chunk_length));
+ }
+ offset += chunk_length;
+ }
+ start += chunk_length;
+ }
+
+ return rechunked_groups;
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h
new file mode 100644
index 00000000000..3ef4e08828f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief Create a strongly-typed Array instance from generic ArrayData
+/// \param[in] data the array contents
+/// \return the resulting Array instance
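+///
+/// A minimal usage sketch (hypothetical caller code, not part of this header):
+///
+///   auto data = ArrayData::Make(int32(), /*length=*/0, {nullptr, nullptr});
+///   std::shared_ptr<Array> arr = MakeArray(data);  // an empty Int32Array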
+ARROW_EXPORT
+std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
+
+/// \brief Create a strongly-typed Array instance with all elements null
+/// \param[in] type the array type
+/// \param[in] length the array length
+/// \param[in] pool the memory pool to allocate memory from
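+///
+/// Usage sketch (hypothetical caller code):
+///
+///   ARROW_ASSIGN_OR_RAISE(auto arr, MakeArrayOfNull(utf8(), 3));
+///   // arr->null_count() == 3; all three slots are null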
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
+ int64_t length,
+ MemoryPool* pool = default_memory_pool());
+
+/// \brief Create an Array instance whose slots are the given scalar
+/// \param[in] scalar the value with which to fill the array
+/// \param[in] length the array length
+/// \param[in] pool the memory pool to allocate memory from
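+///
+/// Usage sketch (hypothetical caller code):
+///
+///   Int64Scalar value(42);
+///   ARROW_ASSIGN_OR_RAISE(auto arr, MakeArrayFromScalar(value, 8));
+///   // arr is an Int64Array with eight slots, each equal to 42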
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> MakeArrayFromScalar(
+ const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool());
+
+namespace internal {
+
+/// \brief Swap endian of each element in a generic ArrayData
+///
+/// As dictionaries are often shared between different arrays, dictionaries
+/// are not swapped by this function and should be handled separately.
+///
+/// \param[in] data the array contents
+/// \return the resulting ArrayData whose elements were swapped
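+///
+/// Sketch of the intended call site (assumption: IPC reading code that has
+/// detected a non-native-endian stream):
+///
+///   ARROW_ASSIGN_OR_RAISE(data, SwapEndianArrayData(data));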
+ARROW_EXPORT
+Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
+ const std::shared_ptr<ArrayData>& data);
+
+/// Given a number of ArrayVectors, treat each ArrayVector as the
+/// chunks of a chunked array. Then rechunk each ArrayVector such that
+/// all ArrayVectors are chunked identically. It is mandatory that
+/// all ArrayVectors contain the same total number of elements.
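+///
+/// E.g. (a sketch): if one group is chunked as [3, 7] and another as [5, 5],
+/// both come back rechunked as [3, 2, 5], so chunk boundaries line up.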
+ARROW_EXPORT
+std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc
new file mode 100644
index 00000000000..5cc3bacf282
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc
@@ -0,0 +1,657 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/validate.h"
+
+#include <vector>
+
+#include "arrow/array.h" // IWYU pragma: keep
+#include "arrow/extension_type.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/utf8.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+///////////////////////////////////////////////////////////////////////////
+// ValidateArray: cheap validation checks
+
+namespace {
+
+struct ValidateArrayImpl {
+ const ArrayData& data;
+
+ Status Validate() { return ValidateWithType(*data.type); }
+
+ Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); }
+
+ Status Visit(const NullType&) {
+ if (data.null_count != data.length) {
+ return Status::Invalid("Null array null_count unequal to its length");
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const FixedWidthType&) {
+ if (data.length > 0) {
+ if (!IsBufferValid(1)) {
+ return Status::Invalid("Missing values buffer in non-empty array");
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const StringType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const LargeStringType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const ListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const LargeListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const MapType& type) { return ValidateListLike(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const ArrayData& values = *data.child_data[0];
+ const int64_t list_size = type.list_size();
+ if (list_size < 0) {
+ return Status::Invalid("Fixed size list has negative list size");
+ }
+
+ int64_t expected_values_length = -1;
+ if (MultiplyWithOverflow(data.length, list_size, &expected_values_length) ||
+ values.length != expected_values_length) {
+ return Status::Invalid("Values length (", values.length,
+ ") is not equal to the length (", data.length,
+ ") multiplied by the value size (", list_size, ")");
+ }
+
+ const Status child_valid = ValidateArray(values);
+ if (!child_valid.ok()) {
+ return Status::Invalid("Fixed size list child array invalid: ",
+ child_valid.ToString());
+ }
+
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ for (int i = 0; i < type.num_fields(); ++i) {
+ const auto& field_data = *data.child_data[i];
+
+ // Validate child first, to catch nonsensical length / offset etc.
+ const Status field_valid = ValidateArray(field_data);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Struct child array #", i,
+ " invalid: ", field_valid.ToString());
+ }
+
+ if (field_data.length < data.length + data.offset) {
+ return Status::Invalid("Struct child array #", i,
+ " has length smaller than expected for struct array (",
+ field_data.length, " < ", data.length + data.offset, ")");
+ }
+
+ const auto& field_type = type.field(i)->type();
+ if (!field_data.type->Equals(*field_type)) {
+ return Status::Invalid("Struct child array #", i, " does not match type field: ",
+ field_data.type->ToString(), " vs ",
+ field_type->ToString());
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ for (int i = 0; i < type.num_fields(); ++i) {
+ const auto& field_data = *data.child_data[i];
+
+ // Validate child first, to catch nonsensical length / offset etc.
+ const Status field_valid = ValidateArray(field_data);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Union child array #", i,
+ " invalid: ", field_valid.ToString());
+ }
+
+ if (type.mode() == UnionMode::SPARSE &&
+ field_data.length < data.length + data.offset) {
+ return Status::Invalid("Sparse union child array #", i,
+ " has length smaller than expected for union array (",
+ field_data.length, " < ", data.length + data.offset, ")");
+ }
+
+ const auto& field_type = type.field(i)->type();
+ if (!field_data.type->Equals(*field_type)) {
+ return Status::Invalid("Union child array #", i, " does not match type field: ",
+ field_data.type->ToString(), " vs ",
+ field_type->ToString());
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& type) {
+ Type::type index_type_id = type.index_type()->id();
+ if (!is_integer(index_type_id)) {
+ return Status::Invalid("Dictionary indices must be integer type");
+ }
+ if (!data.dictionary) {
+ return Status::Invalid("Dictionary values must be non-null");
+ }
+ const Status dict_valid = ValidateArray(*data.dictionary);
+ if (!dict_valid.ok()) {
+ return Status::Invalid("Dictionary array invalid: ", dict_valid.ToString());
+ }
+ // Visit indices
+ return ValidateWithType(*type.index_type());
+ }
+
+ Status Visit(const ExtensionType& type) {
+ // Visit storage
+ return ValidateWithType(*type.storage_type());
+ }
+
+ private:
+ bool IsBufferValid(int index) { return IsBufferValid(data, index); }
+
+ static bool IsBufferValid(const ArrayData& data, int index) {
+ return data.buffers[index] != nullptr && data.buffers[index]->address() != 0;
+ }
+
+ template <typename BinaryType>
+ Status ValidateBinaryLike(const BinaryType& type) {
+ if (!IsBufferValid(2)) {
+ return Status::Invalid("Value data buffer is null");
+ }
+ // First validate offsets, to make sure the accesses below are valid
+ RETURN_NOT_OK(ValidateOffsets(type));
+
+ if (data.length > 0 && data.buffers[1]->is_cpu()) {
+ using offset_type = typename BinaryType::offset_type;
+
+ const auto offsets = data.GetValues<offset_type>(1);
+ const Buffer& values = *data.buffers[2];
+
+ const auto first_offset = offsets[0];
+ const auto last_offset = offsets[data.length];
+ // This early test avoids undefined behaviour when computing `data_extent`
+ if (first_offset < 0 || last_offset < 0) {
+ return Status::Invalid("Negative offsets in binary array");
+ }
+ const auto data_extent = last_offset - first_offset;
+ const auto values_length = values.size();
+ if (values_length < data_extent) {
+ return Status::Invalid("Length spanned by binary offsets (", data_extent,
+ ") larger than values array (size ", values_length, ")");
+ }
+ // These tests ensure that array concatenation is safe if Validate() succeeds
+ // (for delta dictionaries)
+ if (first_offset > values_length || last_offset > values_length) {
+ return Status::Invalid("First or last binary offset out of bounds");
+ }
+ if (first_offset > last_offset) {
+ return Status::Invalid("First offset larger than last offset in binary array");
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename ListType>
+ Status ValidateListLike(const ListType& type) {
+ // First validate offsets, to make sure the accesses below are valid
+ RETURN_NOT_OK(ValidateOffsets(type));
+
+ const ArrayData& values = *data.child_data[0];
+
+ // An empty list array can have 0 offsets
+ if (data.length > 0 && data.buffers[1]->is_cpu()) {
+ using offset_type = typename ListType::offset_type;
+
+ const auto offsets = data.GetValues<offset_type>(1);
+
+ const auto first_offset = offsets[0];
+ const auto last_offset = offsets[data.length];
+ // This early test avoids undefined behaviour when computing `data_extent`
+ if (first_offset < 0 || last_offset < 0) {
+ return Status::Invalid("Negative offsets in list array");
+ }
+ const auto data_extent = last_offset - first_offset;
+ const auto values_length = values.length;
+ if (values_length < data_extent) {
+ return Status::Invalid("Length spanned by list offsets (", data_extent,
+ ") larger than values array (length ", values_length, ")");
+ }
+ // These tests ensure that array concatenation is safe if Validate() succeeds
+ // (for delta dictionaries)
+ if (first_offset > values_length || last_offset > values_length) {
+ return Status::Invalid("First or last list offset out of bounds");
+ }
+ if (first_offset > last_offset) {
+ return Status::Invalid("First offset larger than last offset in list array");
+ }
+ }
+
+ const Status child_valid = ValidateArray(values);
+ if (!child_valid.ok()) {
+ return Status::Invalid("List child array invalid: ", child_valid.ToString());
+ }
+ return Status::OK();
+ }
+
+ template <typename TypeClass>
+ Status ValidateOffsets(const TypeClass& type) {
+ using offset_type = typename TypeClass::offset_type;
+
+ const Buffer* offsets = data.buffers[1].get();
+ if (offsets == nullptr) {
+ // For length 0, an empty offsets buffer seems accepted as a special case
+ // (ARROW-544)
+ if (data.length > 0) {
+ return Status::Invalid("Non-empty array but offsets are null");
+ }
+ return Status::OK();
+ }
+
+ // An empty list array can have 0 offsets
+ auto required_offsets = (data.length > 0) ? data.length + data.offset + 1 : 0;
+ if (offsets->size() / static_cast<int32_t>(sizeof(offset_type)) < required_offsets) {
+ return Status::Invalid("Offsets buffer size (bytes): ", offsets->size(),
+ " isn't large enough for length: ", data.length);
+ }
+
+ return Status::OK();
+ }
+};
+
+} // namespace
+
+ARROW_EXPORT
+Status ValidateArray(const ArrayData& data) {
+ // First check the data layout conforms to the spec
+ const DataType& type = *data.type;
+ const auto layout = type.layout();
+
+ if (data.length < 0) {
+ return Status::Invalid("Array length is negative");
+ }
+
+ if (data.buffers.size() != layout.buffers.size()) {
+ return Status::Invalid("Expected ", layout.buffers.size(),
+ " buffers in array "
+ "of type ",
+ type.ToString(), ", got ", data.buffers.size());
+ }
+
+ // This check is required to avoid addition overflow below
+ int64_t length_plus_offset = -1;
+ if (AddWithOverflow(data.length, data.offset, &length_plus_offset)) {
+ return Status::Invalid("Array of type ", type.ToString(),
+ " has impossibly large length and offset");
+ }
+
+ for (int i = 0; i < static_cast<int>(data.buffers.size()); ++i) {
+ const auto& buffer = data.buffers[i];
+ const auto& spec = layout.buffers[i];
+
+ if (buffer == nullptr) {
+ continue;
+ }
+ int64_t min_buffer_size = -1;
+ switch (spec.kind) {
+ case DataTypeLayout::BITMAP:
+ min_buffer_size = BitUtil::BytesForBits(length_plus_offset);
+ break;
+ case DataTypeLayout::FIXED_WIDTH:
+ if (MultiplyWithOverflow(length_plus_offset, spec.byte_width, &min_buffer_size)) {
+ return Status::Invalid("Array of type ", type.ToString(),
+ " has impossibly large length and offset");
+ }
+ break;
+ case DataTypeLayout::ALWAYS_NULL:
+ // XXX Should we raise on non-null buffer?
+ continue;
+ default:
+ continue;
+ }
+ if (buffer->size() < min_buffer_size) {
+ return Status::Invalid("Buffer #", i, " too small in array of type ",
+ type.ToString(), " and length ", data.length,
+ ": expected at least ", min_buffer_size, " byte(s), got ",
+ buffer->size());
+ }
+ }
+ if (type.id() != Type::NA && data.null_count > 0 && data.buffers[0] == nullptr) {
+ return Status::Invalid("Array of type ", type.ToString(), " has ", data.null_count,
+ " nulls but no null bitmap");
+ }
+
+ // Check null_count() *after* validating the buffer sizes, to avoid
+ // reading out of bounds.
+ if (data.null_count > data.length) {
+ return Status::Invalid("Null count exceeds array length");
+ }
+ if (data.null_count < 0 && data.null_count != kUnknownNullCount) {
+ return Status::Invalid("Negative null count");
+ }
+
+ if (type.id() != Type::EXTENSION) {
+ if (data.child_data.size() != static_cast<size_t>(type.num_fields())) {
+ return Status::Invalid("Expected ", type.num_fields(),
+ " child arrays in array "
+ "of type ",
+ type.ToString(), ", got ", data.child_data.size());
+ }
+ }
+ if (layout.has_dictionary && !data.dictionary) {
+ return Status::Invalid("Array of type ", type.ToString(),
+ " must have dictionary values");
+ }
+ if (!layout.has_dictionary && data.dictionary) {
+ return Status::Invalid("Unexpected dictionary values in array of type ",
+ type.ToString());
+ }
+
+ ValidateArrayImpl validator{data};
+ return validator.Validate();
+}
+
+ARROW_EXPORT
+Status ValidateArray(const Array& array) { return ValidateArray(*array.data()); }
+
+///////////////////////////////////////////////////////////////////////////
+// ValidateArrayFull: expensive validation checks
+
+namespace {
+
+struct UTF8DataValidator {
+ const ArrayData& data;
+
+ Status Visit(const DataType&) {
+ // Default, should be unreachable
+ return Status::NotImplemented("");
+ }
+
+ template <typename StringType>
+ enable_if_string<StringType, Status> Visit(const StringType&) {
+ util::InitializeUTF8();
+
+ int64_t i = 0;
+ return VisitArrayDataInline<StringType>(
+ data,
+ [&](util::string_view v) {
+ if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
+ return Status::Invalid("Invalid UTF8 sequence at string index ", i);
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
+ }
+};
+
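+// Checks that every non-null integer value lies in [min_value, max_value].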
+struct BoundsChecker {
+ const ArrayData& data;
+ int64_t min_value;
+ int64_t max_value;
+
+ Status Visit(const DataType&) {
+ // Default, should be unreachable
+ return Status::NotImplemented("");
+ }
+
+ template <typename IntegerType>
+ enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
+ using c_type = typename IntegerType::c_type;
+
+ int64_t i = 0;
+ return VisitArrayDataInline<IntegerType>(
+ data,
+ [&](c_type value) {
+ const auto v = static_cast<int64_t>(value);
+ if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) {
+ return Status::Invalid("Value at position ", i, " out of bounds: ", v,
+ " (should be in [", min_value, ", ", max_value, "])");
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
+ }
+};
+
+struct ValidateArrayFullImpl {
+ const ArrayData& data;
+
+ Status Validate() { return ValidateWithType(*data.type); }
+
+ Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); }
+
+ Status Visit(const NullType& type) { return Status::OK(); }
+
+ Status Visit(const FixedWidthType& type) { return Status::OK(); }
+
+ Status Visit(const StringType& type) {
+ RETURN_NOT_OK(ValidateBinaryLike(type));
+ return ValidateUTF8(data);
+ }
+
+ Status Visit(const LargeStringType& type) {
+ RETURN_NOT_OK(ValidateBinaryLike(type));
+ return ValidateUTF8(data);
+ }
+
+ Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const ListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const LargeListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const MapType& type) { return ValidateListLike(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const ArrayData& child = *data.child_data[0];
+ const Status child_valid = ValidateArrayFull(child);
+ if (!child_valid.ok()) {
+ return Status::Invalid("Fixed size list child array invalid: ",
+ child_valid.ToString());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ // Validate children
+ for (int64_t i = 0; i < type.num_fields(); ++i) {
+ const ArrayData& field = *data.child_data[i];
+ const Status field_valid = ValidateArrayFull(field);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Struct child array #", i,
+ " invalid: ", field_valid.ToString());
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const auto& type_codes_map = type.type_codes();
+
+ const int8_t* type_codes = data.GetValues<int8_t>(1);
+
+ for (int64_t i = 0; i < data.length; ++i) {
+ // Note that union arrays never have top-level nulls
+ const int32_t code = type_codes[i];
+ if (code < 0 || child_ids[code] == UnionType::kInvalidChildId) {
+ return Status::Invalid("Union value at position ", i, " has invalid type id ",
+ code);
+ }
+ }
+
+ if (type.mode() == UnionMode::DENSE) {
+ // Map logical type id to child length
+ std::vector<int64_t> child_lengths(256);
+ for (int child_id = 0; child_id < type.num_fields(); ++child_id) {
+ child_lengths[type_codes_map[child_id]] = data.child_data[child_id]->length;
+ }
+
+ // Check offsets are in bounds
+ std::vector<int64_t> last_child_offsets(256, 0);
+ const int32_t* offsets = data.GetValues<int32_t>(2);
+ for (int64_t i = 0; i < data.length; ++i) {
+ const int32_t code = type_codes[i];
+ const int32_t offset = offsets[i];
+ if (offset < 0) {
+ return Status::Invalid("Union value at position ", i, " has negative offset ",
+ offset);
+ }
+ if (offset >= child_lengths[code]) {
+ return Status::Invalid("Union value at position ", i,
+ " has offset larger "
+ "than child length (",
+ offset, " >= ", child_lengths[code], ")");
+ }
+ if (offset < last_child_offsets[code]) {
+ return Status::Invalid("Union value at position ", i,
+ " has non-monotonic offset ", offset);
+ }
+ last_child_offsets[code] = offset;
+ }
+ }
+
+ // Validate children
+ for (int64_t i = 0; i < type.num_fields(); ++i) {
+ const ArrayData& field = *data.child_data[i];
+ const Status field_valid = ValidateArrayFull(field);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Union child array #", i,
+ " invalid: ", field_valid.ToString());
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& type) {
+ const Status indices_status =
+ CheckBounds(*type.index_type(), 0, data.dictionary->length - 1);
+ if (!indices_status.ok()) {
+ return Status::Invalid("Dictionary indices invalid: ", indices_status.ToString());
+ }
+ return ValidateArrayFull(*data.dictionary);
+ }
+
+ Status Visit(const ExtensionType& type) {
+ return ValidateWithType(*type.storage_type());
+ }
+
+ protected:
+ template <typename BinaryType>
+ Status ValidateBinaryLike(const BinaryType& type) {
+ const auto& data_buffer = data.buffers[2];
+ if (data_buffer == nullptr) {
+ return Status::Invalid("Binary data buffer is null");
+ }
+ return ValidateOffsets(type, data_buffer->size());
+ }
+
+ template <typename ListType>
+ Status ValidateListLike(const ListType& type) {
+ const ArrayData& child = *data.child_data[0];
+ const Status child_valid = ValidateArrayFull(child);
+ if (!child_valid.ok()) {
+ return Status::Invalid("List child array invalid: ", child_valid.ToString());
+ }
+ return ValidateOffsets(type, child.offset + child.length);
+ }
+
+ template <typename TypeClass>
+ Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) {
+ using offset_type = typename TypeClass::offset_type;
+ if (data.length == 0) {
+ return Status::OK();
+ }
+
+ const offset_type* offsets = data.GetValues<offset_type>(1);
+ if (offsets == nullptr) {
+ return Status::Invalid("Non-empty array but offsets are null");
+ }
+
+ auto prev_offset = offsets[0];
+ if (prev_offset < 0) {
+ return Status::Invalid("Offset invariant failure: array starts at negative offset ",
+ prev_offset);
+ }
+ for (int64_t i = 1; i <= data.length; ++i) {
+ const auto current_offset = offsets[i];
+ if (current_offset < prev_offset) {
+ return Status::Invalid("Offset invariant failure: non-monotonic offset at slot ",
+ i, ": ", current_offset, " < ", prev_offset);
+ }
+ if (current_offset > offset_limit) {
+ return Status::Invalid("Offset invariant failure: offset for slot ", i,
+ " out of bounds: ", current_offset, " > ", offset_limit);
+ }
+ prev_offset = current_offset;
+ }
+ return Status::OK();
+ }
+
+ Status CheckBounds(const DataType& type, int64_t min_value, int64_t max_value) {
+ BoundsChecker checker{data, min_value, max_value};
+ return VisitTypeInline(type, &checker);
+ }
+};
+
+} // namespace
+
+ARROW_EXPORT
+Status ValidateArrayFull(const ArrayData& data) {
+ return ValidateArrayFullImpl{data}.Validate();
+}
+
+ARROW_EXPORT
+Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.data()); }
+
+ARROW_EXPORT
+Status ValidateUTF8(const ArrayData& data) {
+ DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING);
+ UTF8DataValidator validator{data};
+ return VisitTypeInline(*data.type, &validator);
+}
+
+ARROW_EXPORT
+Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); }
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h
new file mode 100644
index 00000000000..cae3e16b3c5
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+// Internal functions implementing Array::Validate() and friends.
+
+// O(1) array metadata validation
+
+ARROW_EXPORT
+Status ValidateArray(const Array& array);
+
+ARROW_EXPORT
+Status ValidateArray(const ArrayData& data);
+
+// O(N) array data validation.
+// Note the "full" routines don't validate metadata. It should be done
+// beforehand using ValidateArray(), otherwise invalid memory accesses
+// may occur.
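+//
+// Typical call pattern (a sketch; ValidateArrayFull() assumes the metadata
+// already passed ValidateArray()):
+//
+//   RETURN_NOT_OK(ValidateArray(array));      // cheap O(1) checks
+//   RETURN_NOT_OK(ValidateArrayFull(array));  // expensive O(N) checks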
+
+ARROW_EXPORT
+Status ValidateArrayFull(const Array& array);
+
+ARROW_EXPORT
+Status ValidateArrayFull(const ArrayData& data);
+
+ARROW_EXPORT
+Status ValidateUTF8(const Array& array);
+
+ARROW_EXPORT
+Status ValidateUTF8(const ArrayData& data);
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/buffer.cc b/contrib/libs/apache/arrow/cpp/src/arrow/buffer.cc
new file mode 100644
index 00000000000..b1b2945d0f5
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/buffer.cc
@@ -0,0 +1,207 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/buffer.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string.h"
+
+namespace arrow {
+
+Result<std::shared_ptr<Buffer>> Buffer::CopySlice(const int64_t start,
+ const int64_t nbytes,
+ MemoryPool* pool) const {
+ // Sanity checks
+ ARROW_CHECK_LE(start, size_);
+ ARROW_CHECK_LE(nbytes, size_ - start);
+ DCHECK_GE(nbytes, 0);
+
+ ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateResizableBuffer(nbytes, pool));
+ std::memcpy(new_buffer->mutable_data(), data() + start, static_cast<size_t>(nbytes));
+ return std::move(new_buffer);
+}
+
+namespace {
+
+Status CheckBufferSlice(const Buffer& buffer, int64_t offset, int64_t length) {
+ return internal::CheckSliceParams(buffer.size(), offset, length, "buffer");
+}
+
+Status CheckBufferSlice(const Buffer& buffer, int64_t offset) {
+ if (ARROW_PREDICT_FALSE(offset < 0)) {
+ // Avoid UBSAN in subtraction below
+ return Status::Invalid("Negative buffer slice offset");
+ }
+ return CheckBufferSlice(buffer, offset, buffer.size() - offset);
+}
+
+} // namespace
+
+Result<std::shared_ptr<Buffer>> SliceBufferSafe(const std::shared_ptr<Buffer>& buffer,
+ int64_t offset) {
+ RETURN_NOT_OK(CheckBufferSlice(*buffer, offset));
+ return SliceBuffer(buffer, offset);
+}
+
+Result<std::shared_ptr<Buffer>> SliceBufferSafe(const std::shared_ptr<Buffer>& buffer,
+ int64_t offset, int64_t length) {
+ RETURN_NOT_OK(CheckBufferSlice(*buffer, offset, length));
+ return SliceBuffer(buffer, offset, length);
+}
+
+Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(
+ const std::shared_ptr<Buffer>& buffer, int64_t offset) {
+ RETURN_NOT_OK(CheckBufferSlice(*buffer, offset));
+ return SliceMutableBuffer(buffer, offset);
+}
+
+Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(
+ const std::shared_ptr<Buffer>& buffer, int64_t offset, int64_t length) {
+ RETURN_NOT_OK(CheckBufferSlice(*buffer, offset, length));
+ return SliceMutableBuffer(buffer, offset, length);
+}
+
+std::string Buffer::ToHexString() {
+ return HexEncode(data(), static_cast<size_t>(size()));
+}
+
+bool Buffer::Equals(const Buffer& other, const int64_t nbytes) const {
+ return this == &other || (size_ >= nbytes && other.size_ >= nbytes &&
+ (data_ == other.data_ ||
+ !memcmp(data_, other.data_, static_cast<size_t>(nbytes))));
+}
+
+bool Buffer::Equals(const Buffer& other) const {
+ return this == &other || (size_ == other.size_ &&
+ (data_ == other.data_ ||
+ !memcmp(data_, other.data_, static_cast<size_t>(size_))));
+}
+
+std::string Buffer::ToString() const {
+ return std::string(reinterpret_cast<const char*>(data_), static_cast<size_t>(size_));
+}
+
+void Buffer::CheckMutable() const { DCHECK(is_mutable()) << "buffer not mutable"; }
+
+void Buffer::CheckCPU() const {
+ DCHECK(is_cpu()) << "not a CPU buffer (device: " << device()->ToString() << ")";
+}
+
+Result<std::shared_ptr<io::RandomAccessFile>> Buffer::GetReader(
+ std::shared_ptr<Buffer> buf) {
+ return buf->memory_manager_->GetBufferReader(buf);
+}
+
+Result<std::shared_ptr<io::OutputStream>> Buffer::GetWriter(std::shared_ptr<Buffer> buf) {
+ if (!buf->is_mutable()) {
+ return Status::Invalid("Expected mutable buffer");
+ }
+ return buf->memory_manager_->GetBufferWriter(buf);
+}
+
+Result<std::shared_ptr<Buffer>> Buffer::Copy(std::shared_ptr<Buffer> source,
+ const std::shared_ptr<MemoryManager>& to) {
+ return MemoryManager::CopyBuffer(source, to);
+}
+
+Result<std::shared_ptr<Buffer>> Buffer::View(std::shared_ptr<Buffer> source,
+ const std::shared_ptr<MemoryManager>& to) {
+ return MemoryManager::ViewBuffer(source, to);
+}
+
+Result<std::shared_ptr<Buffer>> Buffer::ViewOrCopy(
+ std::shared_ptr<Buffer> source, const std::shared_ptr<MemoryManager>& to) {
+ auto maybe_buffer = MemoryManager::ViewBuffer(source, to);
+ if (maybe_buffer.ok()) {
+ return maybe_buffer;
+ }
+ return MemoryManager::CopyBuffer(source, to);
+}
+
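+// An immutable Buffer backed by an std::string moved into it; used by
+// Buffer::FromString() below.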
+class StlStringBuffer : public Buffer {
+ public:
+ explicit StlStringBuffer(std::string data)
+ : Buffer(nullptr, 0), input_(std::move(data)) {
+ data_ = reinterpret_cast<const uint8_t*>(input_.c_str());
+ size_ = static_cast<int64_t>(input_.size());
+ capacity_ = size_;
+ }
+
+ private:
+ std::string input_;
+};
+
+std::shared_ptr<Buffer> Buffer::FromString(std::string data) {
+ return std::make_shared<StlStringBuffer>(std::move(data));
+}
+
+std::shared_ptr<Buffer> SliceMutableBuffer(const std::shared_ptr<Buffer>& buffer,
+ const int64_t offset, const int64_t length) {
+ return std::make_shared<MutableBuffer>(buffer, offset, length);
+}
+
+MutableBuffer::MutableBuffer(const std::shared_ptr<Buffer>& parent, const int64_t offset,
+ const int64_t size)
+ : MutableBuffer(reinterpret_cast<uint8_t*>(parent->mutable_address()) + offset,
+ size) {
+ DCHECK(parent->is_mutable()) << "Must pass mutable buffer";
+ parent_ = parent;
+}
+
+Result<std::shared_ptr<Buffer>> AllocateBitmap(int64_t length, MemoryPool* pool) {
+ ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(BitUtil::BytesForBits(length), pool));
+ // Zero out any trailing bits
+ if (buf->size() > 0) {
+ buf->mutable_data()[buf->size() - 1] = 0;
+ }
+ return std::move(buf);
+}
+
+Result<std::shared_ptr<Buffer>> AllocateEmptyBitmap(int64_t length, MemoryPool* pool) {
+ ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(BitUtil::BytesForBits(length), pool));
+ memset(buf->mutable_data(), 0, static_cast<size_t>(buf->size()));
+ return std::move(buf);
+}
+
+Status AllocateEmptyBitmap(int64_t length, std::shared_ptr<Buffer>* out) {
+ return AllocateEmptyBitmap(length).Value(out);
+}
+
+Result<std::shared_ptr<Buffer>> ConcatenateBuffers(
+ const std::vector<std::shared_ptr<Buffer>>& buffers, MemoryPool* pool) {
+ int64_t out_length = 0;
+ for (const auto& buffer : buffers) {
+ out_length += buffer->size();
+ }
+ ARROW_ASSIGN_OR_RAISE(auto out, AllocateBuffer(out_length, pool));
+ auto out_data = out->mutable_data();
+ for (const auto& buffer : buffers) {
+ std::memcpy(out_data, buffer->data(), buffer->size());
+ out_data += buffer->size();
+ }
+ return std::move(out);
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h b/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h
new file mode 100644
index 00000000000..6c47a464b1d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h
@@ -0,0 +1,496 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/device.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Buffer classes
+
+/// \class Buffer
+/// \brief Object containing a pointer to a piece of contiguous memory with a
+/// particular size.
+///
+/// Buffers have two related notions of length: size and capacity. Size is
+/// the number of bytes that might have valid data. Capacity is the number
+/// of bytes that were allocated for the buffer in total.
+///
+/// The Buffer base class does not own its memory, but subclasses often do.
+///
+/// The following invariant is always true: Size <= Capacity
+class ARROW_EXPORT Buffer {
+ public:
+ /// \brief Construct from buffer and size without copying memory
+ ///
+ /// \param[in] data a memory buffer
+ /// \param[in] size buffer size
+ ///
+ /// \note The passed memory must be kept alive through some other means
+ Buffer(const uint8_t* data, int64_t size)
+ : is_mutable_(false), is_cpu_(true), data_(data), size_(size), capacity_(size) {
+ SetMemoryManager(default_cpu_memory_manager());
+ }
+
+ Buffer(const uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm,
+ std::shared_ptr<Buffer> parent = NULLPTR)
+ : is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(parent) {
+ SetMemoryManager(std::move(mm));
+ }
+
+ Buffer(uintptr_t address, int64_t size, std::shared_ptr<MemoryManager> mm,
+ std::shared_ptr<Buffer> parent = NULLPTR)
+ : Buffer(reinterpret_cast<const uint8_t*>(address), size, std::move(mm),
+ std::move(parent)) {}
+
+ /// \brief Construct from string_view without copying memory
+ ///
+ /// \param[in] data a string_view object
+ ///
+ /// \note The memory viewed by data must not be deallocated in the lifetime of the
+ /// Buffer; temporary rvalue strings must be stored in an lvalue somewhere
+ explicit Buffer(util::string_view data)
+ : Buffer(reinterpret_cast<const uint8_t*>(data.data()),
+ static_cast<int64_t>(data.size())) {}
+
+ virtual ~Buffer() = default;
+
+  /// Construct a slice: an offset into data that is owned by another (parent)
+  /// buffer, retaining a valid pointer to that data even after other
+  /// shared_ptrs to the parent buffer have been destroyed.
+  ///
+  /// This method makes no assertions about alignment or padding of the buffer,
+  /// but in general we expect buffers to be aligned and padded to 64 bytes. In
+  /// the future we might add utility methods to help determine if a buffer
+  /// satisfies this contract.
+ Buffer(const std::shared_ptr<Buffer>& parent, const int64_t offset, const int64_t size)
+ : Buffer(parent->data_ + offset, size) {
+ parent_ = parent;
+ SetMemoryManager(parent->memory_manager_);
+ }
+
+ uint8_t operator[](std::size_t i) const { return data_[i]; }
+
+ /// \brief Construct a new std::string with a hexadecimal representation of the buffer.
+ /// \return std::string
+ std::string ToHexString();
+
+  /// Return true if both buffers contain at least `nbytes` bytes and their
+  /// first `nbytes` bytes are equal
+ bool Equals(const Buffer& other, int64_t nbytes) const;
+
+ /// Return true if both buffers are the same size and contain the same bytes
+ bool Equals(const Buffer& other) const;
+
+ /// Copy a section of the buffer into a new Buffer.
+ Result<std::shared_ptr<Buffer>> CopySlice(
+ const int64_t start, const int64_t nbytes,
+ MemoryPool* pool = default_memory_pool()) const;
+
+ /// Zero bytes in padding, i.e. bytes between size_ and capacity_.
+ void ZeroPadding() {
+#ifndef NDEBUG
+ CheckMutable();
+#endif
+ // A zero-capacity buffer can have a null data pointer
+ if (capacity_ != 0) {
+ memset(mutable_data() + size_, 0, static_cast<size_t>(capacity_ - size_));
+ }
+ }
+
+ /// \brief Construct an immutable buffer that takes ownership of the contents
+ /// of an std::string (without copying it).
+ ///
+ /// \param[in] data a string to own
+ /// \return a new Buffer instance
+ static std::shared_ptr<Buffer> FromString(std::string data);
+
+ /// \brief Create buffer referencing typed memory with some length without
+ /// copying
+ /// \param[in] data the typed memory as C array
+ /// \param[in] length the number of values in the array
+ /// \return a new shared_ptr<Buffer>
+ template <typename T, typename SizeType = int64_t>
+ static std::shared_ptr<Buffer> Wrap(const T* data, SizeType length) {
+ return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(data),
+ static_cast<int64_t>(sizeof(T) * length));
+ }
+
+ /// \brief Create buffer referencing std::vector with some length without
+ /// copying
+ /// \param[in] data the vector to be referenced. If this vector is changed,
+ /// the buffer may become invalid
+ /// \return a new shared_ptr<Buffer>
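+  ///
+  /// Usage sketch (the vector must stay alive and unmodified while the buffer
+  /// is in use):
+  ///
+  ///   std::vector<int32_t> values = {1, 2, 3};
+  ///   auto buf = Buffer::Wrap(values);  // buf->size() == 12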
+ template <typename T>
+ static std::shared_ptr<Buffer> Wrap(const std::vector<T>& data) {
+ return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(data.data()),
+ static_cast<int64_t>(sizeof(T) * data.size()));
+ }
+
+ /// \brief Copy buffer contents into a new std::string
+ /// \return std::string
+ /// \note Can throw std::bad_alloc if buffer is large
+ std::string ToString() const;
+
+ /// \brief View buffer contents as a util::string_view
+ /// \return util::string_view
+ explicit operator util::string_view() const {
+ return util::string_view(reinterpret_cast<const char*>(data_), size_);
+ }
+
+ /// \brief View buffer contents as a util::bytes_view
+ /// \return util::bytes_view
+ explicit operator util::bytes_view() const { return util::bytes_view(data_, size_); }
+
+ /// \brief Return a pointer to the buffer's data
+ ///
+ /// The buffer has to be a CPU buffer (`is_cpu()` is true).
+  /// Otherwise, a debug assertion may fire or a null pointer may be returned.
+ ///
+ /// To get the buffer's data address regardless of its device, call `address()`.
+ const uint8_t* data() const {
+#ifndef NDEBUG
+ CheckCPU();
+#endif
+ return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR;
+ }
+
+ /// \brief Return a writable pointer to the buffer's data
+ ///
+ /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
+  /// are true). Otherwise, a debug assertion may fire or a null pointer may
+  /// be returned.
+ ///
+ /// To get the buffer's mutable data address regardless of its device, call
+ /// `mutable_address()`.
+ uint8_t* mutable_data() {
+#ifndef NDEBUG
+ CheckCPU();
+ CheckMutable();
+#endif
+ return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
+ : NULLPTR;
+ }
+
+ /// \brief Return the device address of the buffer's data
+ uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); }
+
+ /// \brief Return a writable device address to the buffer's data
+ ///
+  /// The buffer has to be a mutable buffer (`is_mutable()` is true).
+  /// Otherwise, an assertion may fail or 0 may be returned.
+ uintptr_t mutable_address() const {
+#ifndef NDEBUG
+ CheckMutable();
+#endif
+ return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast<uintptr_t>(data_) : 0;
+ }
+
+ /// \brief Return the buffer's size in bytes
+ int64_t size() const { return size_; }
+
+ /// \brief Return the buffer's capacity (number of allocated bytes)
+ int64_t capacity() const { return capacity_; }
+
+ /// \brief Whether the buffer is directly CPU-accessible
+ ///
+ /// If this function returns true, you can read directly from the buffer's
+ /// `data()` pointer. Otherwise, you'll have to `View()` or `Copy()` it.
+ bool is_cpu() const { return is_cpu_; }
+
+ /// \brief Whether the buffer is mutable
+ ///
+ /// If this function returns true, you are allowed to modify buffer contents
+ /// using the pointer returned by `mutable_data()` or `mutable_address()`.
+ bool is_mutable() const { return is_mutable_; }
+
+ const std::shared_ptr<Device>& device() const { return memory_manager_->device(); }
+
+ const std::shared_ptr<MemoryManager>& memory_manager() const { return memory_manager_; }
+
+ std::shared_ptr<Buffer> parent() const { return parent_; }
+
+ /// \brief Get a RandomAccessFile for reading a buffer
+ ///
+ /// The returned file object reads from this buffer's underlying memory.
+ static Result<std::shared_ptr<io::RandomAccessFile>> GetReader(std::shared_ptr<Buffer>);
+
+  /// \brief Get an OutputStream for writing to a buffer
+ ///
+ /// The buffer must be mutable. The returned stream object writes into the buffer's
+ /// underlying memory (but it won't resize it).
+ static Result<std::shared_ptr<io::OutputStream>> GetWriter(std::shared_ptr<Buffer>);
+
+ /// \brief Copy buffer
+ ///
+ /// The buffer contents will be copied into a new buffer allocated by the
+ /// given MemoryManager. This function supports cross-device copies.
+ static Result<std::shared_ptr<Buffer>> Copy(std::shared_ptr<Buffer> source,
+ const std::shared_ptr<MemoryManager>& to);
+
+ /// \brief View buffer
+ ///
+ /// Return a Buffer that reflects this buffer, seen potentially from another
+ /// device, without making an explicit copy of the contents. The underlying
+ /// mechanism is typically implemented by the kernel or device driver, and may
+ /// involve lazy caching of parts of the buffer contents on the destination
+ /// device's memory.
+ ///
+ /// If a non-copy view is unsupported for the buffer on the given device,
+ /// nullptr is returned. An error can be returned if some low-level
+ /// operation fails (such as an out-of-memory condition).
+ static Result<std::shared_ptr<Buffer>> View(std::shared_ptr<Buffer> source,
+ const std::shared_ptr<MemoryManager>& to);
+
+ /// \brief View or copy buffer
+ ///
+ /// Try to view buffer contents on the given MemoryManager's device, but
+ /// fall back to copying if a no-copy view isn't supported.
+ static Result<std::shared_ptr<Buffer>> ViewOrCopy(
+ std::shared_ptr<Buffer> source, const std::shared_ptr<MemoryManager>& to);
+
+ protected:
+ bool is_mutable_;
+ bool is_cpu_;
+ const uint8_t* data_;
+ int64_t size_;
+ int64_t capacity_;
+
+ // null by default, but may be set
+ std::shared_ptr<Buffer> parent_;
+
+ private:
+ // private so that subclasses are forced to call SetMemoryManager()
+ std::shared_ptr<MemoryManager> memory_manager_;
+
+ protected:
+ void CheckMutable() const;
+ void CheckCPU() const;
+
+ void SetMemoryManager(std::shared_ptr<MemoryManager> mm) {
+ memory_manager_ = std::move(mm);
+ is_cpu_ = memory_manager_->is_cpu();
+ }
+
+ private:
+ Buffer() = delete;
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer);
+};
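+
+// A minimal usage sketch (assuming only the declarations above; the wrapped
+// memory must outlive the Buffer, since Wrap does not copy):
+//
+//   std::vector<int32_t> values = {1, 2, 3, 4};
+//   std::shared_ptr<Buffer> wrapped = Buffer::Wrap(values);
+//   std::shared_ptr<Buffer> owned = Buffer::FromString("some bytes");
+//   bool same = wrapped->Equals(*owned);  // false: sizes and contents differ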
+
+/// \defgroup buffer-slicing-functions Functions for slicing buffers
+///
+/// @{
+
+/// \brief Construct a view on a buffer at the given offset and length.
+///
+/// This function cannot fail and does not check for errors (except in debug builds)
+static inline std::shared_ptr<Buffer> SliceBuffer(const std::shared_ptr<Buffer>& buffer,
+ const int64_t offset,
+ const int64_t length) {
+ return std::make_shared<Buffer>(buffer, offset, length);
+}
+
+/// \brief Construct a view on a buffer at the given offset, up to the buffer's end.
+///
+/// This function cannot fail and does not check for errors (except in debug builds)
+static inline std::shared_ptr<Buffer> SliceBuffer(const std::shared_ptr<Buffer>& buffer,
+ const int64_t offset) {
+ int64_t length = buffer->size() - offset;
+ return SliceBuffer(buffer, offset, length);
+}
+
+/// \brief Input-checking version of SliceBuffer
+///
+/// An Invalid Status is returned if the requested slice falls out of bounds.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SliceBufferSafe(const std::shared_ptr<Buffer>& buffer,
+ int64_t offset);
+/// \brief Input-checking version of SliceBuffer
+///
+/// An Invalid Status is returned if the requested slice falls out of bounds.
+/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SliceBufferSafe(const std::shared_ptr<Buffer>& buffer,
+ int64_t offset, int64_t length);
+
+/// \brief Like SliceBuffer, but construct a mutable buffer slice.
+///
+/// If the parent buffer is not mutable, behavior is undefined (it may abort
+/// in debug builds).
+ARROW_EXPORT
+std::shared_ptr<Buffer> SliceMutableBuffer(const std::shared_ptr<Buffer>& buffer,
+ const int64_t offset, const int64_t length);
+
+/// \brief Like SliceBuffer, but construct a mutable buffer slice.
+///
+/// If the parent buffer is not mutable, behavior is undefined (it may abort
+/// in debug builds).
+static inline std::shared_ptr<Buffer> SliceMutableBuffer(
+ const std::shared_ptr<Buffer>& buffer, const int64_t offset) {
+ int64_t length = buffer->size() - offset;
+ return SliceMutableBuffer(buffer, offset, length);
+}
+
+/// \brief Input-checking version of SliceMutableBuffer
+///
+/// An Invalid Status is returned if the requested slice falls out of bounds.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(
+ const std::shared_ptr<Buffer>& buffer, int64_t offset);
+/// \brief Input-checking version of SliceMutableBuffer
+///
+/// An Invalid Status is returned if the requested slice falls out of bounds.
+/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(
+ const std::shared_ptr<Buffer>& buffer, int64_t offset, int64_t length);
+
+/// @}
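+
+// A minimal slicing sketch (assuming a `buffer` of at least 16 bytes; slices
+// share the parent's memory, so no bytes are copied):
+//
+//   std::shared_ptr<Buffer> head = SliceBuffer(buffer, 0, 8);
+//   std::shared_ptr<Buffer> tail = SliceBuffer(buffer, 8);
+//   // The checked variant returns an Invalid Status instead of undefined
+//   // behavior on out-of-bounds input:
+//   ARROW_ASSIGN_OR_RAISE(auto checked, SliceBufferSafe(buffer, 8, 8));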
+
+/// \class MutableBuffer
+/// \brief A Buffer whose contents can be mutated. May or may not own its data.
+class ARROW_EXPORT MutableBuffer : public Buffer {
+ public:
+ MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) {
+ is_mutable_ = true;
+ }
+
+ MutableBuffer(uint8_t* data, const int64_t size, std::shared_ptr<MemoryManager> mm)
+ : Buffer(data, size, std::move(mm)) {
+ is_mutable_ = true;
+ }
+
+ MutableBuffer(const std::shared_ptr<Buffer>& parent, const int64_t offset,
+ const int64_t size);
+
+ /// \brief Create buffer referencing typed memory with some length
+ /// \param[in] data the typed memory as C array
+ /// \param[in] length the number of values in the array
+ /// \return a new shared_ptr<Buffer>
+ template <typename T, typename SizeType = int64_t>
+ static std::shared_ptr<Buffer> Wrap(T* data, SizeType length) {
+ return std::make_shared<MutableBuffer>(reinterpret_cast<uint8_t*>(data),
+ static_cast<int64_t>(sizeof(T) * length));
+ }
+
+ protected:
+ MutableBuffer() : Buffer(NULLPTR, 0) {}
+};
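+
+// A minimal sketch of wrapping writable memory (the caller keeps ownership
+// of `values` and must keep it alive while the buffer is in use):
+//
+//   int32_t values[4] = {0, 0, 0, 0};
+//   std::shared_ptr<Buffer> buf = MutableBuffer::Wrap(values, 4);
+//   if (buf->is_mutable()) {
+//     buf->mutable_data()[0] = 1;  // writes into the first byte of values
+//   }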
+
+/// \class ResizableBuffer
+/// \brief A mutable buffer that can be resized
+class ARROW_EXPORT ResizableBuffer : public MutableBuffer {
+ public:
+  /// \brief Change the buffer's reported size to the indicated size,
+  /// allocating memory if necessary
+  ///
+  /// This will ensure that the capacity of the buffer is a multiple of 64
+  /// bytes as defined in Layout.md. Consider using ZeroPadding afterwards,
+  /// to conform to the Arrow layout specification.
+  ///
+  /// \param[in] new_size the new size for the buffer
+  /// \param[in] shrink_to_fit whether to shrink the capacity if new size < current size
+  virtual Status Resize(const int64_t new_size, bool shrink_to_fit = true) = 0;
+
+  /// \brief Ensure that the buffer has enough memory allocated to fit the
+  /// indicated capacity (and meets the 64 byte padding requirement in
+  /// Layout.md)
+  ///
+  /// This does not change the buffer's reported size and doesn't zero the padding.
+  virtual Status Reserve(const int64_t new_capacity) = 0;
+
+ template <class T>
+ Status TypedResize(const int64_t new_nb_elements, bool shrink_to_fit = true) {
+ return Resize(sizeof(T) * new_nb_elements, shrink_to_fit);
+ }
+
+ template <class T>
+ Status TypedReserve(const int64_t new_nb_elements) {
+ return Reserve(sizeof(T) * new_nb_elements);
+ }
+
+ protected:
+ ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {}
+ ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm)
+ : MutableBuffer(data, size, std::move(mm)) {}
+};
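+
+// A minimal resizing sketch (assuming AllocateResizableBuffer, declared
+// below, as the way to obtain a concrete ResizableBuffer):
+//
+//   ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ResizableBuffer> buf,
+//                         AllocateResizableBuffer(64));
+//   ARROW_RETURN_NOT_OK(buf->Reserve(256));              // capacity only
+//   ARROW_RETURN_NOT_OK(buf->TypedResize<int64_t>(16));  // size = 128 bytes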
+
+/// \defgroup buffer-allocation-functions Functions for allocating buffers
+///
+/// @{
+
+/// \brief Allocate a fixed-size mutable buffer from a memory pool, zero its padding.
+///
+/// \param[in] size size of buffer to allocate
+/// \param[in] pool a memory pool
+ARROW_EXPORT
+Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size,
+ MemoryPool* pool = NULLPTR);
+
+/// \brief Allocate a resizable buffer from a memory pool, zero its padding.
+///
+/// \param[in] size size of buffer to allocate
+/// \param[in] pool a memory pool
+ARROW_EXPORT
+Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(
+ const int64_t size, MemoryPool* pool = NULLPTR);
+
+/// \brief Allocate a bitmap buffer from a memory pool
+///
+/// No guarantee on the memory's initial values is provided.
+///
+/// \param[in] length size in bits of bitmap to allocate
+/// \param[in] pool memory pool to allocate memory from
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> AllocateBitmap(int64_t length,
+ MemoryPool* pool = NULLPTR);
+
+ARROW_EXPORT
+Status AllocateBitmap(MemoryPool* pool, int64_t length, std::shared_ptr<Buffer>* out);
+
+/// \brief Allocate a zero-initialized bitmap buffer from a memory pool
+///
+/// \param[in] length size in bits of bitmap to allocate
+/// \param[in] pool memory pool to allocate memory from
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> AllocateEmptyBitmap(int64_t length,
+ MemoryPool* pool = NULLPTR);
+
+/// \brief Concatenate multiple buffers into a single buffer
+///
+/// \param[in] buffers the buffers to be concatenated
+/// \param[in] pool memory pool to allocate the new buffer from
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> ConcatenateBuffers(const BufferVector& buffers,
+ MemoryPool* pool = NULLPTR);
+
+ARROW_EXPORT
+Status ConcatenateBuffers(const BufferVector& buffers, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out);
+
+/// @}
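+
+// A minimal allocation sketch (AllocateBuffer zeroes its padding;
+// AllocateEmptyBitmap zero-initializes the whole bitmap):
+//
+//   ARROW_ASSIGN_OR_RAISE(std::unique_ptr<Buffer> buf, AllocateBuffer(128));
+//   ARROW_ASSIGN_OR_RAISE(auto bitmap, AllocateEmptyBitmap(100));
+//   BufferVector pieces = {std::move(buf), bitmap};
+//   ARROW_ASSIGN_OR_RAISE(auto joined, ConcatenateBuffers(pieces));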
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h b/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h
new file mode 100644
index 00000000000..c6250ae2b76
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h
@@ -0,0 +1,450 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_generate.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Buffer builder classes
+
+/// \class BufferBuilder
+/// \brief A class for incrementally building a contiguous chunk of in-memory
+/// data
+class ARROW_EXPORT BufferBuilder {
+ public:
+ explicit BufferBuilder(MemoryPool* pool = default_memory_pool())
+ : pool_(pool),
+ data_(/*ensure never null to make ubsan happy and avoid check penalties below*/
+ &util::internal::non_null_filler),
+ capacity_(0),
+ size_(0) {}
+
+  /// \brief Construct a builder that starts from the provided buffer, using
+  /// it until Finish or Reset is called.
+  /// The buffer is not resized on construction.
+ explicit BufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
+ MemoryPool* pool = default_memory_pool())
+ : buffer_(std::move(buffer)),
+ pool_(pool),
+ data_(buffer_->mutable_data()),
+ capacity_(buffer_->capacity()),
+ size_(buffer_->size()) {}
+
+  /// \brief Resize the buffer to the given capacity, rounded up to the
+  /// nearest multiple of 64 bytes
+  ///
+  /// \param new_capacity the new capacity of the builder. Will be
+  /// rounded up to a multiple of 64 bytes for padding
+  /// \param shrink_to_fit if the new capacity is smaller than the existing one,
+  /// reallocate the internal buffer. Set to false to avoid reallocations when
+  /// shrinking the builder.
+ /// \return Status
+ Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
+ if (buffer_ == NULLPTR) {
+ ARROW_ASSIGN_OR_RAISE(buffer_, AllocateResizableBuffer(new_capacity, pool_));
+ } else {
+ ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit));
+ }
+ capacity_ = buffer_->capacity();
+ data_ = buffer_->mutable_data();
+ return Status::OK();
+ }
+
+  /// \brief Ensure that the builder can accommodate the additional number of
+  /// bytes without needing to perform further allocations
+ ///
+ /// \param[in] additional_bytes number of additional bytes to make space for
+ /// \return Status
+ Status Reserve(const int64_t additional_bytes) {
+ auto min_capacity = size_ + additional_bytes;
+ if (min_capacity <= capacity_) {
+ return Status::OK();
+ }
+ return Resize(GrowByFactor(capacity_, min_capacity), false);
+ }
+
+ /// \brief Return a capacity expanded by the desired growth factor
+ static int64_t GrowByFactor(int64_t current_capacity, int64_t new_capacity) {
+ // Doubling capacity except for large Reserve requests. 2x growth strategy
+ // (versus 1.5x) seems to have slightly better performance when using
+ // jemalloc, but significantly better performance when using the system
+ // allocator. See ARROW-6450 for further discussion
+ return std::max(new_capacity, current_capacity * 2);
+ }
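+  // For example, GrowByFactor(64, 100) returns 128 (doubling dominates),
+  // while GrowByFactor(64, 300) returns 300 (the large request dominates).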
+
+ /// \brief Append the given data to the buffer
+ ///
+ /// The buffer is automatically expanded if necessary.
+ Status Append(const void* data, const int64_t length) {
+ if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) {
+ ARROW_RETURN_NOT_OK(Resize(GrowByFactor(capacity_, size_ + length), false));
+ }
+ UnsafeAppend(data, length);
+ return Status::OK();
+ }
+
+ /// \brief Append copies of a value to the buffer
+ ///
+ /// The buffer is automatically expanded if necessary.
+ Status Append(const int64_t num_copies, uint8_t value) {
+ ARROW_RETURN_NOT_OK(Reserve(num_copies));
+ UnsafeAppend(num_copies, value);
+ return Status::OK();
+ }
+
+ // Advance pointer and zero out memory
+ Status Advance(const int64_t length) { return Append(length, 0); }
+
+ // Advance pointer, but don't allocate or zero memory
+ void UnsafeAdvance(const int64_t length) { size_ += length; }
+
+ // Unsafe methods don't check existing size
+ void UnsafeAppend(const void* data, const int64_t length) {
+ memcpy(data_ + size_, data, static_cast<size_t>(length));
+ size_ += length;
+ }
+
+ void UnsafeAppend(const int64_t num_copies, uint8_t value) {
+ memset(data_ + size_, value, static_cast<size_t>(num_copies));
+ size_ += num_copies;
+ }
+
+ /// \brief Return result of builder as a Buffer object.
+ ///
+ /// The builder is reset and can be reused afterwards.
+ ///
+ /// \param[out] out the finalized Buffer object
+ /// \param shrink_to_fit if the buffer size is smaller than its capacity,
+ /// reallocate to fit more tightly in memory. Set to false to avoid
+ /// a reallocation, at the expense of potentially more memory consumption.
+ /// \return Status
+ Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+ ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
+ if (size_ != 0) buffer_->ZeroPadding();
+ *out = buffer_;
+ if (*out == NULLPTR) {
+ ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(0, pool_));
+ }
+ Reset();
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+ std::shared_ptr<Buffer> out;
+ ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+ return out;
+ }
+
+ /// \brief Like Finish, but override the final buffer size
+ ///
+ /// This is useful after writing data directly into the builder memory
+  /// without calling the Append methods (basically, when using BufferBuilder
+  /// only for memory allocation).
+ Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+ bool shrink_to_fit = true) {
+ size_ = final_length;
+ return Finish(shrink_to_fit);
+ }
+
+ void Reset() {
+ buffer_ = NULLPTR;
+ capacity_ = size_ = 0;
+ }
+
+  /// \brief Set size to a smaller value without modifying builder
+  /// contents. Useful when reusing a BufferBuilder.
+ /// \param[in] position must be non-negative and less than or equal
+ /// to the current length()
+ void Rewind(int64_t position) { size_ = position; }
+
+ int64_t capacity() const { return capacity_; }
+ int64_t length() const { return size_; }
+ const uint8_t* data() const { return data_; }
+ uint8_t* mutable_data() { return data_; }
+
+ private:
+ std::shared_ptr<ResizableBuffer> buffer_;
+ MemoryPool* pool_;
+ uint8_t* data_;
+ int64_t capacity_;
+ int64_t size_;
+};
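+
+// A minimal usage sketch (a sketch only; errors are propagated with the
+// ARROW_* macros):
+//
+//   BufferBuilder builder;
+//   ARROW_RETURN_NOT_OK(builder.Reserve(1024));
+//   builder.UnsafeAppend("abcd", 4);  // safe here: capacity was reserved
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> buf, builder.Finish());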
+
+template <typename T, typename Enable = void>
+class TypedBufferBuilder;
+
+/// \brief A BufferBuilder for building a buffer of arithmetic elements
+template <typename T>
+class TypedBufferBuilder<
+ T, typename std::enable_if<std::is_arithmetic<T>::value ||
+ std::is_standard_layout<T>::value>::type> {
+ public:
+ explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool())
+ : bytes_builder_(pool) {}
+
+ explicit TypedBufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
+ MemoryPool* pool = default_memory_pool())
+ : bytes_builder_(std::move(buffer), pool) {}
+
+ explicit TypedBufferBuilder(BufferBuilder builder)
+ : bytes_builder_(std::move(builder)) {}
+
+ BufferBuilder* bytes_builder() { return &bytes_builder_; }
+
+ Status Append(T value) {
+ return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value), sizeof(T));
+ }
+
+ Status Append(const T* values, int64_t num_elements) {
+ return bytes_builder_.Append(reinterpret_cast<const uint8_t*>(values),
+ num_elements * sizeof(T));
+ }
+
+ Status Append(const int64_t num_copies, T value) {
+ ARROW_RETURN_NOT_OK(Reserve(num_copies + length()));
+ UnsafeAppend(num_copies, value);
+ return Status::OK();
+ }
+
+ void UnsafeAppend(T value) {
+ bytes_builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&value), sizeof(T));
+ }
+
+ void UnsafeAppend(const T* values, int64_t num_elements) {
+ bytes_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values),
+ num_elements * sizeof(T));
+ }
+
+ template <typename Iter>
+ void UnsafeAppend(Iter values_begin, Iter values_end) {
+ int64_t num_elements = static_cast<int64_t>(std::distance(values_begin, values_end));
+ auto data = mutable_data() + length();
+ bytes_builder_.UnsafeAdvance(num_elements * sizeof(T));
+ std::copy(values_begin, values_end, data);
+ }
+
+ void UnsafeAppend(const int64_t num_copies, T value) {
+ auto data = mutable_data() + length();
+ bytes_builder_.UnsafeAdvance(num_copies * sizeof(T));
+ std::fill(data, data + num_copies, value);
+ }
+
+ Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
+ return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit);
+ }
+
+ Status Reserve(const int64_t additional_elements) {
+ return bytes_builder_.Reserve(additional_elements * sizeof(T));
+ }
+
+ Status Advance(const int64_t length) {
+ return bytes_builder_.Advance(length * sizeof(T));
+ }
+
+ Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+ return bytes_builder_.Finish(out, shrink_to_fit);
+ }
+
+ Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+ std::shared_ptr<Buffer> out;
+ ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+ return out;
+ }
+
+ /// \brief Like Finish, but override the final buffer size
+ ///
+ /// This is useful after writing data directly into the builder memory
+ /// without calling the Append methods (basically, when using TypedBufferBuilder
+ /// only for memory allocation).
+ Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+ bool shrink_to_fit = true) {
+ return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit);
+ }
+
+ void Reset() { bytes_builder_.Reset(); }
+
+ int64_t length() const { return bytes_builder_.length() / sizeof(T); }
+ int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); }
+ const T* data() const { return reinterpret_cast<const T*>(bytes_builder_.data()); }
+ T* mutable_data() { return reinterpret_cast<T*>(bytes_builder_.mutable_data()); }
+
+ private:
+ BufferBuilder bytes_builder_;
+};
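+
+// A minimal usage sketch for the arithmetic specialization:
+//
+//   TypedBufferBuilder<int32_t> builder;
+//   ARROW_RETURN_NOT_OK(builder.Append(7));
+//   ARROW_RETURN_NOT_OK(builder.Append(/*num_copies=*/3, /*value=*/0));
+//   // builder.length() is now 4 elements (16 bytes)
+//   ARROW_ASSIGN_OR_RAISE(auto buf, builder.Finish());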
+
+/// \brief A BufferBuilder for building a buffer containing a bitmap
+template <>
+class TypedBufferBuilder<bool> {
+ public:
+ explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool())
+ : bytes_builder_(pool) {}
+
+ explicit TypedBufferBuilder(BufferBuilder builder)
+ : bytes_builder_(std::move(builder)) {}
+
+ BufferBuilder* bytes_builder() { return &bytes_builder_; }
+
+ Status Append(bool value) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(value);
+ return Status::OK();
+ }
+
+ Status Append(const uint8_t* valid_bytes, int64_t num_elements) {
+ ARROW_RETURN_NOT_OK(Reserve(num_elements));
+ UnsafeAppend(valid_bytes, num_elements);
+ return Status::OK();
+ }
+
+ Status Append(const int64_t num_copies, bool value) {
+ ARROW_RETURN_NOT_OK(Reserve(num_copies));
+ UnsafeAppend(num_copies, value);
+ return Status::OK();
+ }
+
+ void UnsafeAppend(bool value) {
+ BitUtil::SetBitTo(mutable_data(), bit_length_, value);
+ if (!value) {
+ ++false_count_;
+ }
+ ++bit_length_;
+ }
+
+ void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) {
+ if (num_elements == 0) return;
+ int64_t i = 0;
+ internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
+ bool value = bytes[i++];
+ false_count_ += !value;
+ return value;
+ });
+ bit_length_ += num_elements;
+ }
+
+ void UnsafeAppend(const int64_t num_copies, bool value) {
+ BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value);
+ false_count_ += num_copies * !value;
+ bit_length_ += num_copies;
+ }
+
+ template <bool count_falses, typename Generator>
+ void UnsafeAppend(const int64_t num_elements, Generator&& gen) {
+ if (num_elements == 0) return;
+
+ if (count_falses) {
+ internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
+ bool value = gen();
+ false_count_ += !value;
+ return value;
+ });
+ } else {
+ internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements,
+ std::forward<Generator>(gen));
+ }
+ bit_length_ += num_elements;
+ }
+
+ Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
+ const int64_t old_byte_capacity = bytes_builder_.capacity();
+ ARROW_RETURN_NOT_OK(
+ bytes_builder_.Resize(BitUtil::BytesForBits(new_capacity), shrink_to_fit));
+ // Resize() may have chosen a larger capacity (e.g. for padding),
+ // so ask it again before calling memset().
+ const int64_t new_byte_capacity = bytes_builder_.capacity();
+ if (new_byte_capacity > old_byte_capacity) {
+ // The additional buffer space is 0-initialized for convenience,
+ // so that other methods can simply bump the length.
+ memset(mutable_data() + old_byte_capacity, 0,
+ static_cast<size_t>(new_byte_capacity - old_byte_capacity));
+ }
+ return Status::OK();
+ }
+
+ Status Reserve(const int64_t additional_elements) {
+ return Resize(
+ BufferBuilder::GrowByFactor(bit_length_, bit_length_ + additional_elements),
+ false);
+ }
+
+ Status Advance(const int64_t length) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ bit_length_ += length;
+ false_count_ += length;
+ return Status::OK();
+ }
+
+ Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+    // Advance bytes_builder_ so its length equals the byte size of the bitmap
+ bytes_builder_.UnsafeAdvance(BitUtil::BytesForBits(bit_length_) -
+ bytes_builder_.length());
+ bit_length_ = false_count_ = 0;
+ return bytes_builder_.Finish(out, shrink_to_fit);
+ }
+
+ Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+ std::shared_ptr<Buffer> out;
+ ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+ return out;
+ }
+
+ /// \brief Like Finish, but override the final buffer size
+ ///
+ /// This is useful after writing data directly into the builder memory
+ /// without calling the Append methods (basically, when using TypedBufferBuilder
+ /// only for memory allocation).
+ Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+ bool shrink_to_fit = true) {
+ const auto final_byte_length = BitUtil::BytesForBits(final_length);
+ bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length());
+ bit_length_ = false_count_ = 0;
+ return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit);
+ }
+
+ void Reset() {
+ bytes_builder_.Reset();
+ bit_length_ = false_count_ = 0;
+ }
+
+ int64_t length() const { return bit_length_; }
+ int64_t capacity() const { return bytes_builder_.capacity() * 8; }
+ const uint8_t* data() const { return bytes_builder_.data(); }
+ uint8_t* mutable_data() { return bytes_builder_.mutable_data(); }
+ int64_t false_count() const { return false_count_; }
+
+ private:
+ BufferBuilder bytes_builder_;
+ int64_t bit_length_ = 0;
+ int64_t false_count_ = 0;
+};
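+
+// A minimal bitmap-building sketch (bits are packed; false_count() tracks
+// unset bits):
+//
+//   TypedBufferBuilder<bool> bits;
+//   ARROW_RETURN_NOT_OK(bits.Append(true));
+//   ARROW_RETURN_NOT_OK(bits.Append(/*num_copies=*/7, /*value=*/false));
+//   // bits.length() == 8, bits.false_count() == 7, one byte of storage
+//   ARROW_ASSIGN_OR_RAISE(auto bitmap, bits.Finish());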
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc b/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc
new file mode 100644
index 00000000000..f22228a4588
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc
@@ -0,0 +1,222 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/builder.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hashing.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+// ----------------------------------------------------------------------
+// Helper functions
+
+struct DictionaryBuilderCase {
+ template <typename ValueType, typename Enable = typename ValueType::c_type>
+ Status Visit(const ValueType&) {
+ return CreateFor<ValueType>();
+ }
+
+ Status Visit(const NullType&) { return CreateFor<NullType>(); }
+ Status Visit(const BinaryType&) { return Create<BinaryDictionaryBuilder>(); }
+ Status Visit(const StringType&) { return Create<StringDictionaryBuilder>(); }
+ Status Visit(const LargeBinaryType&) {
+ return Create<DictionaryBuilder<LargeBinaryType>>();
+ }
+ Status Visit(const LargeStringType&) {
+ return Create<DictionaryBuilder<LargeStringType>>();
+ }
+ Status Visit(const FixedSizeBinaryType&) { return CreateFor<FixedSizeBinaryType>(); }
+ Status Visit(const Decimal128Type&) { return CreateFor<Decimal128Type>(); }
+ Status Visit(const Decimal256Type&) { return CreateFor<Decimal256Type>(); }
+
+ Status Visit(const DataType& value_type) { return NotImplemented(value_type); }
+ Status Visit(const HalfFloatType& value_type) { return NotImplemented(value_type); }
+ Status NotImplemented(const DataType& value_type) {
+ return Status::NotImplemented(
+ "MakeBuilder: cannot construct builder for dictionaries with value type ",
+ value_type);
+ }
+
+ template <typename ValueType>
+ Status CreateFor() {
+ return Create<DictionaryBuilder<ValueType>>();
+ }
+
+ template <typename BuilderType>
+ Status Create() {
+ BuilderType* builder;
+ if (dictionary != nullptr) {
+ builder = new BuilderType(dictionary, pool);
+ } else {
+ auto start_int_size = internal::GetByteWidth(*index_type);
+ builder = new BuilderType(start_int_size, value_type, pool);
+ }
+ out->reset(builder);
+ return Status::OK();
+ }
+
+ Status Make() { return VisitTypeInline(*value_type, this); }
+
+ MemoryPool* pool;
+ const std::shared_ptr<DataType>& index_type;
+ const std::shared_ptr<DataType>& value_type;
+ const std::shared_ptr<Array>& dictionary;
+ std::unique_ptr<ArrayBuilder>* out;
+};
+
+#define BUILDER_CASE(TYPE_CLASS) \
+ case TYPE_CLASS##Type::type_id: \
+ out->reset(new TYPE_CLASS##Builder(type, pool)); \
+ return Status::OK();
+
+Result<std::vector<std::shared_ptr<ArrayBuilder>>> FieldBuilders(const DataType& type,
+ MemoryPool* pool) {
+ std::vector<std::shared_ptr<ArrayBuilder>> field_builders;
+
+ for (const auto& field : type.fields()) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(pool, field->type(), &builder));
+ field_builders.emplace_back(std::move(builder));
+ }
+
+ return field_builders;
+}
+
+Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ std::unique_ptr<ArrayBuilder>* out) {
+ switch (type->id()) {
+ case Type::NA: {
+ out->reset(new NullBuilder(pool));
+ return Status::OK();
+ }
+ BUILDER_CASE(UInt8);
+ BUILDER_CASE(Int8);
+ BUILDER_CASE(UInt16);
+ BUILDER_CASE(Int16);
+ BUILDER_CASE(UInt32);
+ BUILDER_CASE(Int32);
+ BUILDER_CASE(UInt64);
+ BUILDER_CASE(Int64);
+ BUILDER_CASE(Date32);
+ BUILDER_CASE(Date64);
+ BUILDER_CASE(Duration);
+ BUILDER_CASE(Time32);
+ BUILDER_CASE(Time64);
+ BUILDER_CASE(Timestamp);
+ BUILDER_CASE(MonthInterval);
+ BUILDER_CASE(DayTimeInterval);
+ BUILDER_CASE(Boolean);
+ BUILDER_CASE(HalfFloat);
+ BUILDER_CASE(Float);
+ BUILDER_CASE(Double);
+ BUILDER_CASE(String);
+ BUILDER_CASE(Binary);
+ BUILDER_CASE(LargeString);
+ BUILDER_CASE(LargeBinary);
+ BUILDER_CASE(FixedSizeBinary);
+ BUILDER_CASE(Decimal128);
+ BUILDER_CASE(Decimal256);
+
+ case Type::DICTIONARY: {
+ const auto& dict_type = static_cast<const DictionaryType&>(*type);
+ DictionaryBuilderCase visitor = {pool, dict_type.index_type(),
+ dict_type.value_type(), nullptr, out};
+ return visitor.Make();
+ }
+
+ case Type::LIST: {
+ std::unique_ptr<ArrayBuilder> value_builder;
+ std::shared_ptr<DataType> value_type =
+ internal::checked_cast<const ListType&>(*type).value_type();
+ RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder));
+ out->reset(new ListBuilder(pool, std::move(value_builder), type));
+ return Status::OK();
+ }
+
+ case Type::LARGE_LIST: {
+ std::unique_ptr<ArrayBuilder> value_builder;
+ std::shared_ptr<DataType> value_type =
+ internal::checked_cast<const LargeListType&>(*type).value_type();
+ RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder));
+ out->reset(new LargeListBuilder(pool, std::move(value_builder), type));
+ return Status::OK();
+ }
+
+ case Type::MAP: {
+ const auto& map_type = internal::checked_cast<const MapType&>(*type);
+ std::unique_ptr<ArrayBuilder> key_builder, item_builder;
+ RETURN_NOT_OK(MakeBuilder(pool, map_type.key_type(), &key_builder));
+ RETURN_NOT_OK(MakeBuilder(pool, map_type.item_type(), &item_builder));
+ out->reset(
+ new MapBuilder(pool, std::move(key_builder), std::move(item_builder), type));
+ return Status::OK();
+ }
+
+ case Type::FIXED_SIZE_LIST: {
+ const auto& list_type = internal::checked_cast<const FixedSizeListType&>(*type);
+ std::unique_ptr<ArrayBuilder> value_builder;
+ auto value_type = list_type.value_type();
+ RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder));
+ out->reset(new FixedSizeListBuilder(pool, std::move(value_builder), type));
+ return Status::OK();
+ }
+
+ case Type::STRUCT: {
+ ARROW_ASSIGN_OR_RAISE(auto field_builders, FieldBuilders(*type, pool));
+ out->reset(new StructBuilder(type, pool, std::move(field_builders)));
+ return Status::OK();
+ }
+
+ case Type::SPARSE_UNION: {
+ ARROW_ASSIGN_OR_RAISE(auto field_builders, FieldBuilders(*type, pool));
+ out->reset(new SparseUnionBuilder(pool, std::move(field_builders), type));
+ return Status::OK();
+ }
+
+ case Type::DENSE_UNION: {
+ ARROW_ASSIGN_OR_RAISE(auto field_builders, FieldBuilders(*type, pool));
+ out->reset(new DenseUnionBuilder(pool, std::move(field_builders), type));
+ return Status::OK();
+ }
+
+ default:
+ break;
+ }
+ return Status::NotImplemented("MakeBuilder: cannot construct builder for type ",
+ type->ToString());
+}
+
+Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& dictionary,
+ std::unique_ptr<ArrayBuilder>* out) {
+ const auto& dict_type = static_cast<const DictionaryType&>(*type);
+ DictionaryBuilderCase visitor = {pool, dict_type.index_type(), dict_type.value_type(),
+ dictionary, out};
+ return visitor.Make();
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/builder.h b/contrib/libs/apache/arrow/cpp/src/arrow/builder.h
new file mode 100644
index 00000000000..4b80e558004
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/builder.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/array/builder_adaptive.h" // IWYU pragma: keep
+#include "arrow/array/builder_base.h" // IWYU pragma: keep
+#include "arrow/array/builder_binary.h" // IWYU pragma: keep
+#include "arrow/array/builder_decimal.h" // IWYU pragma: keep
+#include "arrow/array/builder_dict.h" // IWYU pragma: keep
+#include "arrow/array/builder_nested.h" // IWYU pragma: keep
+#include "arrow/array/builder_primitive.h" // IWYU pragma: keep
+#include "arrow/array/builder_time.h" // IWYU pragma: keep
+#include "arrow/array/builder_union.h" // IWYU pragma: keep
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/c/abi.h b/contrib/libs/apache/arrow/cpp/src/arrow/c/abi.h
new file mode 100644
index 00000000000..a78170dbdbc
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/c/abi.h
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ARROW_FLAG_DICTIONARY_ORDERED 1
+#define ARROW_FLAG_NULLABLE 2
+#define ARROW_FLAG_MAP_KEYS_SORTED 4
+
+struct ArrowSchema {
+ // Array type description
+ const char* format;
+ const char* name;
+ const char* metadata;
+ int64_t flags;
+ int64_t n_children;
+ struct ArrowSchema** children;
+ struct ArrowSchema* dictionary;
+
+ // Release callback
+ void (*release)(struct ArrowSchema*);
+ // Opaque producer-specific data
+ void* private_data;
+};
+
+struct ArrowArray {
+ // Array data description
+ int64_t length;
+ int64_t null_count;
+ int64_t offset;
+ int64_t n_buffers;
+ int64_t n_children;
+ const void** buffers;
+ struct ArrowArray** children;
+ struct ArrowArray* dictionary;
+
+ // Release callback
+ void (*release)(struct ArrowArray*);
+ // Opaque producer-specific data
+ void* private_data;
+};
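+
+// Per the release-callback contract, a consumer that is finished with one of
+// these structures calls its release callback if still set, e.g.:
+//
+//   if (array->release != NULL) {
+//     array->release(array);  // the callback marks the struct as released
+//   }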
+
+// EXPERIMENTAL: C stream interface
+
+struct ArrowArrayStream {
+ // Callback to get the stream type
+ // (will be the same for all arrays in the stream).
+ //
+ // Return value: 0 if successful, an `errno`-compatible error code otherwise.
+ //
+ // If successful, the ArrowSchema must be released independently from the stream.
+ int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);
+
+ // Callback to get the next array
+ // (if no error and the array is released, the stream has ended)
+ //
+ // Return value: 0 if successful, an `errno`-compatible error code otherwise.
+ //
+ // If successful, the ArrowArray must be released independently from the stream.
+ int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);
+
+ // Callback to get optional detailed error information.
+ // This must only be called if the last stream operation failed
+ // with a non-0 return code.
+ //
+ // Return value: pointer to a null-terminated character array describing
+ // the last error, or NULL if no description is available.
+ //
+ // The returned pointer is only valid until the next operation on this stream
+ // (including release).
+ const char* (*get_last_error)(struct ArrowArrayStream*);
+
+ // Release callback: release the stream's own resources.
+ // Note that arrays returned by `get_next` must be individually released.
+ void (*release)(struct ArrowArrayStream*);
+
+ // Opaque producer-specific data
+ void* private_data;
+};
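+
+// A minimal consumer-loop sketch for the stream interface (assumes a valid
+// `stream` pointer; handling of the errno-style codes elided):
+//
+//   struct ArrowSchema schema;
+//   if (stream->get_schema(stream, &schema) != 0) { /* handle error */ }
+//   struct ArrowArray array;
+//   while (stream->get_next(stream, &array) == 0 && array.release != NULL) {
+//     /* consume array, then release it */
+//     array.release(&array);
+//   }
+//   stream->release(stream);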
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc b/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc
new file mode 100644
index 00000000000..a43bf8104f2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc
@@ -0,0 +1,1712 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/c/bridge.h"
+
+#include <algorithm>
+#include <cerrno>
+#include <cstring>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/c/helpers.h"
+#include "arrow/c/util_internal.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/stl_allocator.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/value_parsing.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+using internal::ArrayExportGuard;
+using internal::ArrayExportTraits;
+using internal::SchemaExportGuard;
+using internal::SchemaExportTraits;
+
+// TODO export / import Extension types and arrays
+
+namespace {
+
+Status ExportingNotImplemented(const DataType& type) {
+ return Status::NotImplemented("Exporting ", type.ToString(), " array not supported");
+}
+
+// Allocate exported private data using MemoryPool,
+// to allow accounting memory and checking for memory leaks.
+
+// XXX use Gandiva's SimpleArena?
+
+template <typename T>
+using PoolVector = std::vector<T, ::arrow::stl::allocator<T>>;
+
+template <typename Derived>
+struct PoolAllocationMixin {
+ static void* operator new(size_t size) {
+ DCHECK_EQ(size, sizeof(Derived));
+ uint8_t* data;
+ ARROW_CHECK_OK(default_memory_pool()->Allocate(static_cast<int64_t>(size), &data));
+ return data;
+ }
+
+ static void operator delete(void* ptr) {
+ default_memory_pool()->Free(reinterpret_cast<uint8_t*>(ptr), sizeof(Derived));
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+// C schema export
+
+struct ExportedSchemaPrivateData : PoolAllocationMixin<ExportedSchemaPrivateData> {
+ std::string format_;
+ std::string name_;
+ std::string metadata_;
+ struct ArrowSchema dictionary_;
+ PoolVector<struct ArrowSchema> children_;
+ PoolVector<struct ArrowSchema*> child_pointers_;
+
+ ExportedSchemaPrivateData() = default;
+ ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedSchemaPrivateData);
+ ARROW_DISALLOW_COPY_AND_ASSIGN(ExportedSchemaPrivateData);
+};
+
+void ReleaseExportedSchema(struct ArrowSchema* schema) {
+ if (ArrowSchemaIsReleased(schema)) {
+ return;
+ }
+ for (int64_t i = 0; i < schema->n_children; ++i) {
+ struct ArrowSchema* child = schema->children[i];
+ ArrowSchemaRelease(child);
+ DCHECK(ArrowSchemaIsReleased(child))
+ << "Child release callback should have marked it released";
+ }
+ struct ArrowSchema* dict = schema->dictionary;
+ if (dict != nullptr) {
+ ArrowSchemaRelease(dict);
+ DCHECK(ArrowSchemaIsReleased(dict))
+ << "Dictionary release callback should have marked it released";
+ }
+ DCHECK_NE(schema->private_data, nullptr);
+ delete reinterpret_cast<ExportedSchemaPrivateData*>(schema->private_data);
+
+ ArrowSchemaMarkReleased(schema);
+}
+
+template <typename SizeType>
+Result<int32_t> DowncastMetadataSize(SizeType size) {
+ auto res = static_cast<int32_t>(size);
+ if (res < 0 || static_cast<SizeType>(res) != size) {
+ return Status::Invalid("Metadata too large (more than 2**31 items or bytes)");
+ }
+ return res;
+}
+
+Result<std::string> EncodeMetadata(const KeyValueMetadata& metadata) {
+ ARROW_ASSIGN_OR_RAISE(auto npairs, DowncastMetadataSize(metadata.size()));
+ std::string exported;
+
+ // Pre-compute total string size
+ size_t total_size = 4;
+ for (int32_t i = 0; i < npairs; ++i) {
+ total_size += 8 + metadata.key(i).length() + metadata.value(i).length();
+ }
+ exported.resize(total_size);
+
+ char* data_start = &exported[0];
+ char* data = data_start;
+ auto write_int32 = [&](int32_t v) -> void {
+ memcpy(data, &v, 4);
+ data += 4;
+ };
+ auto write_string = [&](const std::string& s) -> Status {
+ ARROW_ASSIGN_OR_RAISE(auto len, DowncastMetadataSize(s.length()));
+ write_int32(len);
+ if (len > 0) {
+ memcpy(data, s.data(), len);
+ data += len;
+ }
+ return Status::OK();
+ };
+
+ write_int32(npairs);
+ for (int32_t i = 0; i < npairs; ++i) {
+ RETURN_NOT_OK(write_string(metadata.key(i)));
+ RETURN_NOT_OK(write_string(metadata.value(i)));
+ }
+ DCHECK_EQ(static_cast<size_t>(data - data_start), total_size);
+ return exported;
+}
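+
+// For illustration, metadata {"k1": "v"} encodes (native-endian int32s) as:
+//
+//   int32 npairs = 1
+//   int32 key_length = 2,   bytes "k1"
+//   int32 value_length = 1, bytes "v"
+//
+// i.e. 4 + (4 + 2) + (4 + 1) = 15 bytes in total.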
+
+struct SchemaExporter {
+ Status ExportField(const Field& field) {
+ export_.name_ = field.name();
+ flags_ = field.nullable() ? ARROW_FLAG_NULLABLE : 0;
+
+ const DataType& type = *field.type();
+ RETURN_NOT_OK(ExportFormat(type));
+ RETURN_NOT_OK(ExportChildren(type.fields()));
+ RETURN_NOT_OK(ExportMetadata(field.metadata().get()));
+ return Status::OK();
+ }
+
+ Status ExportType(const DataType& type) {
+ flags_ = ARROW_FLAG_NULLABLE;
+
+ RETURN_NOT_OK(ExportFormat(type));
+ RETURN_NOT_OK(ExportChildren(type.fields()));
+ return Status::OK();
+ }
+
+ Status ExportSchema(const Schema& schema) {
+ static StructType dummy_struct_type({});
+ flags_ = 0;
+
+ RETURN_NOT_OK(ExportFormat(dummy_struct_type));
+ RETURN_NOT_OK(ExportChildren(schema.fields()));
+ RETURN_NOT_OK(ExportMetadata(schema.metadata().get()));
+ return Status::OK();
+ }
+
+ // Finalize exporting by setting C struct fields and allocating
+ // autonomous private data for each schema node.
+ //
+ // This function can't fail, as properly reclaiming memory in case of error
+ // would be too fragile. After this function returns, memory is reclaimed
+ // by calling the release() pointer in the top level ArrowSchema struct.
+ void Finish(struct ArrowSchema* c_struct) {
+ // First, create permanent ExportedSchemaPrivateData
+ auto pdata = new ExportedSchemaPrivateData(std::move(export_));
+
+ // Second, finish dictionary and children.
+ if (dict_exporter_) {
+ dict_exporter_->Finish(&pdata->dictionary_);
+ }
+ pdata->child_pointers_.resize(child_exporters_.size(), nullptr);
+ for (size_t i = 0; i < child_exporters_.size(); ++i) {
+ auto ptr = pdata->child_pointers_[i] = &pdata->children_[i];
+ child_exporters_[i].Finish(ptr);
+ }
+
+ // Third, fill C struct.
+ DCHECK_NE(c_struct, nullptr);
+ memset(c_struct, 0, sizeof(*c_struct));
+
+ c_struct->format = pdata->format_.c_str();
+ c_struct->name = pdata->name_.c_str();
+ c_struct->metadata = pdata->metadata_.empty() ? nullptr : pdata->metadata_.c_str();
+ c_struct->flags = flags_;
+
+ c_struct->n_children = static_cast<int64_t>(child_exporters_.size());
+ c_struct->children = pdata->child_pointers_.data();
+ c_struct->dictionary = dict_exporter_ ? &pdata->dictionary_ : nullptr;
+ c_struct->private_data = pdata;
+ c_struct->release = ReleaseExportedSchema;
+ }
+
+ Status ExportFormat(const DataType& type) {
+ if (type.id() == Type::DICTIONARY) {
+ const auto& dict_type = checked_cast<const DictionaryType&>(type);
+ if (dict_type.ordered()) {
+ flags_ |= ARROW_FLAG_DICTIONARY_ORDERED;
+ }
+ // Dictionary type: parent struct describes index type,
+ // child dictionary struct describes value type.
+ RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), this));
+ dict_exporter_.reset(new SchemaExporter());
+ RETURN_NOT_OK(dict_exporter_->ExportType(*dict_type.value_type()));
+ } else {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
+ }
+ DCHECK(!export_.format_.empty());
+ return Status::OK();
+ }
+
+ Status ExportChildren(const std::vector<std::shared_ptr<Field>>& fields) {
+ export_.children_.resize(fields.size());
+ child_exporters_.resize(fields.size());
+ for (size_t i = 0; i < fields.size(); ++i) {
+ RETURN_NOT_OK(child_exporters_[i].ExportField(*fields[i]));
+ }
+ return Status::OK();
+ }
+
+ Status ExportMetadata(const KeyValueMetadata* metadata) {
+    if (metadata != nullptr && metadata->size() > 0) {
+ ARROW_ASSIGN_OR_RAISE(export_.metadata_, EncodeMetadata(*metadata));
+ }
+ return Status::OK();
+ }
+
+ Status SetFormat(std::string s) {
+ export_.format_ = std::move(s);
+ return Status::OK();
+ }
+
+ // Type-specific visitors
+
+ Status Visit(const DataType& type) { return ExportingNotImplemented(type); }
+
+ Status Visit(const NullType& type) { return SetFormat("n"); }
+
+ Status Visit(const BooleanType& type) { return SetFormat("b"); }
+
+ Status Visit(const Int8Type& type) { return SetFormat("c"); }
+
+ Status Visit(const UInt8Type& type) { return SetFormat("C"); }
+
+ Status Visit(const Int16Type& type) { return SetFormat("s"); }
+
+ Status Visit(const UInt16Type& type) { return SetFormat("S"); }
+
+ Status Visit(const Int32Type& type) { return SetFormat("i"); }
+
+ Status Visit(const UInt32Type& type) { return SetFormat("I"); }
+
+ Status Visit(const Int64Type& type) { return SetFormat("l"); }
+
+ Status Visit(const UInt64Type& type) { return SetFormat("L"); }
+
+ Status Visit(const HalfFloatType& type) { return SetFormat("e"); }
+
+ Status Visit(const FloatType& type) { return SetFormat("f"); }
+
+ Status Visit(const DoubleType& type) { return SetFormat("g"); }
+
+ Status Visit(const FixedSizeBinaryType& type) {
+ return SetFormat("w:" + std::to_string(type.byte_width()));
+ }
+
+ Status Visit(const DecimalType& type) {
+ if (type.bit_width() == 128) {
+ // 128 is the default bit-width
+ return SetFormat("d:" + std::to_string(type.precision()) + "," +
+ std::to_string(type.scale()));
+ } else {
+ return SetFormat("d:" + std::to_string(type.precision()) + "," +
+ std::to_string(type.scale()) + "," +
+ std::to_string(type.bit_width()));
+ }
+ }
+
+ Status Visit(const BinaryType& type) { return SetFormat("z"); }
+
+ Status Visit(const LargeBinaryType& type) { return SetFormat("Z"); }
+
+ Status Visit(const StringType& type) { return SetFormat("u"); }
+
+ Status Visit(const LargeStringType& type) { return SetFormat("U"); }
+
+ Status Visit(const Date32Type& type) { return SetFormat("tdD"); }
+
+ Status Visit(const Date64Type& type) { return SetFormat("tdm"); }
+
+ Status Visit(const Time32Type& type) {
+ switch (type.unit()) {
+ case TimeUnit::SECOND:
+ export_.format_ = "tts";
+ break;
+ case TimeUnit::MILLI:
+ export_.format_ = "ttm";
+ break;
+ default:
+ return Status::Invalid("Invalid time unit for Time32: ", type.unit());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const Time64Type& type) {
+ switch (type.unit()) {
+ case TimeUnit::MICRO:
+ export_.format_ = "ttu";
+ break;
+ case TimeUnit::NANO:
+ export_.format_ = "ttn";
+ break;
+ default:
+ return Status::Invalid("Invalid time unit for Time64: ", type.unit());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const TimestampType& type) {
+ switch (type.unit()) {
+ case TimeUnit::SECOND:
+ export_.format_ = "tss:";
+ break;
+ case TimeUnit::MILLI:
+ export_.format_ = "tsm:";
+ break;
+ case TimeUnit::MICRO:
+ export_.format_ = "tsu:";
+ break;
+ case TimeUnit::NANO:
+ export_.format_ = "tsn:";
+ break;
+ default:
+ return Status::Invalid("Invalid time unit for Timestamp: ", type.unit());
+ }
+ export_.format_ += type.timezone();
+ return Status::OK();
+ }
+
+ Status Visit(const DurationType& type) {
+ switch (type.unit()) {
+ case TimeUnit::SECOND:
+ export_.format_ = "tDs";
+ break;
+ case TimeUnit::MILLI:
+ export_.format_ = "tDm";
+ break;
+ case TimeUnit::MICRO:
+ export_.format_ = "tDu";
+ break;
+ case TimeUnit::NANO:
+ export_.format_ = "tDn";
+ break;
+ default:
+ return Status::Invalid("Invalid time unit for Duration: ", type.unit());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const MonthIntervalType& type) { return SetFormat("tiM"); }
+
+ Status Visit(const DayTimeIntervalType& type) { return SetFormat("tiD"); }
+
+ Status Visit(const ListType& type) { return SetFormat("+l"); }
+
+ Status Visit(const LargeListType& type) { return SetFormat("+L"); }
+
+ Status Visit(const FixedSizeListType& type) {
+ return SetFormat("+w:" + std::to_string(type.list_size()));
+ }
+
+ Status Visit(const StructType& type) { return SetFormat("+s"); }
+
+ Status Visit(const MapType& type) {
+ export_.format_ = "+m";
+ if (type.keys_sorted()) {
+ flags_ |= ARROW_FLAG_MAP_KEYS_SORTED;
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ std::string& s = export_.format_;
+ s = "+u";
+ if (type.mode() == UnionMode::DENSE) {
+ s += "d:";
+ } else {
+ DCHECK_EQ(type.mode(), UnionMode::SPARSE);
+ s += "s:";
+ }
+ bool first = true;
+ for (const auto code : type.type_codes()) {
+ if (!first) {
+ s += ",";
+ }
+ s += std::to_string(code);
+ first = false;
+ }
+ return Status::OK();
+ }
+
+ ExportedSchemaPrivateData export_;
+ int64_t flags_ = 0;
+ std::unique_ptr<SchemaExporter> dict_exporter_;
+ std::vector<SchemaExporter> child_exporters_;
+};
+
+} // namespace
+
+Status ExportType(const DataType& type, struct ArrowSchema* out) {
+ SchemaExporter exporter;
+ RETURN_NOT_OK(exporter.ExportType(type));
+ exporter.Finish(out);
+ return Status::OK();
+}
+
+Status ExportField(const Field& field, struct ArrowSchema* out) {
+ SchemaExporter exporter;
+ RETURN_NOT_OK(exporter.ExportField(field));
+ exporter.Finish(out);
+ return Status::OK();
+}
+
+Status ExportSchema(const Schema& schema, struct ArrowSchema* out) {
+ SchemaExporter exporter;
+ RETURN_NOT_OK(exporter.ExportSchema(schema));
+ exporter.Finish(out);
+ return Status::OK();
+}
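+
+// A minimal export sketch (assumes a `schema` of type std::shared_ptr<Schema>;
+// the consumer becomes responsible for calling the release callback):
+//
+//   struct ArrowSchema c_schema;
+//   RETURN_NOT_OK(ExportSchema(*schema, &c_schema));
+//   // ... hand c_schema to a C consumer, which eventually calls
+//   // c_schema.release(&c_schema);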
+
+//////////////////////////////////////////////////////////////////////////
+// C data export
+
+namespace {
+
+struct ExportedArrayPrivateData : PoolAllocationMixin<ExportedArrayPrivateData> {
+ // The buffers are owned by the ArrayData member
+ PoolVector<const void*> buffers_;
+ struct ArrowArray dictionary_;
+ PoolVector<struct ArrowArray> children_;
+ PoolVector<struct ArrowArray*> child_pointers_;
+
+ std::shared_ptr<ArrayData> data_;
+
+ ExportedArrayPrivateData() = default;
+ ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedArrayPrivateData);
+ ARROW_DISALLOW_COPY_AND_ASSIGN(ExportedArrayPrivateData);
+};
+
+void ReleaseExportedArray(struct ArrowArray* array) {
+ if (ArrowArrayIsReleased(array)) {
+ return;
+ }
+ for (int64_t i = 0; i < array->n_children; ++i) {
+ struct ArrowArray* child = array->children[i];
+ ArrowArrayRelease(child);
+ DCHECK(ArrowArrayIsReleased(child))
+ << "Child release callback should have marked it released";
+ }
+ struct ArrowArray* dict = array->dictionary;
+ if (dict != nullptr) {
+ ArrowArrayRelease(dict);
+ DCHECK(ArrowArrayIsReleased(dict))
+ << "Dictionary release callback should have marked it released";
+ }
+ DCHECK_NE(array->private_data, nullptr);
+ delete reinterpret_cast<ExportedArrayPrivateData*>(array->private_data);
+
+ ArrowArrayMarkReleased(array);
+}
+
+struct ArrayExporter {
+ Status Export(const std::shared_ptr<ArrayData>& data) {
+    // Force computing the null count. This is needed because of ARROW-9037:
+    // versions 0.17 and 0.17.1 cannot import arrays that lack a null bitmap
+    // when null_count == -1.
+ data->GetNullCount();
+ // Store buffer pointers
+ export_.buffers_.resize(data->buffers.size());
+ std::transform(data->buffers.begin(), data->buffers.end(), export_.buffers_.begin(),
+ [](const std::shared_ptr<Buffer>& buffer) -> const void* {
+ return buffer ? buffer->data() : nullptr;
+ });
+
+ // Export dictionary
+ if (data->dictionary != nullptr) {
+ dict_exporter_.reset(new ArrayExporter());
+ RETURN_NOT_OK(dict_exporter_->Export(data->dictionary));
+ }
+
+ // Export children
+ export_.children_.resize(data->child_data.size());
+ child_exporters_.resize(data->child_data.size());
+ for (size_t i = 0; i < data->child_data.size(); ++i) {
+ RETURN_NOT_OK(child_exporters_[i].Export(data->child_data[i]));
+ }
+
+ // Store owning pointer to ArrayData
+ export_.data_ = data;
+
+ return Status::OK();
+ }
+
+ // Finalize exporting by setting C struct fields and allocating
+ // autonomous private data for each array node.
+ //
+ // This function can't fail, as properly reclaiming memory in case of error
+ // would be too fragile. After this function returns, memory is reclaimed
+ // by calling the release() pointer in the top level ArrowArray struct.
+ void Finish(struct ArrowArray* c_struct_) {
+ // First, create permanent ExportedArrayPrivateData, to make sure that
+ // child ArrayData pointers don't get invalidated.
+ auto pdata = new ExportedArrayPrivateData(std::move(export_));
+ const ArrayData& data = *pdata->data_;
+
+ // Second, finish dictionary and children.
+ if (dict_exporter_) {
+ dict_exporter_->Finish(&pdata->dictionary_);
+ }
+ pdata->child_pointers_.resize(data.child_data.size(), nullptr);
+ for (size_t i = 0; i < data.child_data.size(); ++i) {
+ auto ptr = &pdata->children_[i];
+ pdata->child_pointers_[i] = ptr;
+ child_exporters_[i].Finish(ptr);
+ }
+
+ // Third, fill C struct.
+ DCHECK_NE(c_struct_, nullptr);
+ memset(c_struct_, 0, sizeof(*c_struct_));
+
+ c_struct_->length = data.length;
+ c_struct_->null_count = data.null_count;
+ c_struct_->offset = data.offset;
+ c_struct_->n_buffers = static_cast<int64_t>(pdata->buffers_.size());
+ c_struct_->n_children = static_cast<int64_t>(pdata->child_pointers_.size());
+ c_struct_->buffers = pdata->buffers_.data();
+ c_struct_->children = pdata->child_pointers_.data();
+ c_struct_->dictionary = dict_exporter_ ? &pdata->dictionary_ : nullptr;
+ c_struct_->private_data = pdata;
+ c_struct_->release = ReleaseExportedArray;
+ }
+
+ ExportedArrayPrivateData export_;
+ std::unique_ptr<ArrayExporter> dict_exporter_;
+ std::vector<ArrayExporter> child_exporters_;
+};
+
+} // namespace
+
+Status ExportArray(const Array& array, struct ArrowArray* out,
+ struct ArrowSchema* out_schema) {
+ SchemaExportGuard guard(out_schema);
+ if (out_schema != nullptr) {
+ RETURN_NOT_OK(ExportType(*array.type(), out_schema));
+ }
+ ArrayExporter exporter;
+ RETURN_NOT_OK(exporter.Export(array.data()));
+ exporter.Finish(out);
+ guard.Detach();
+ return Status::OK();
+}
+
+Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out,
+ struct ArrowSchema* out_schema) {
+ // XXX perhaps bypass ToStructArray() for speed?
+ ARROW_ASSIGN_OR_RAISE(auto array, batch.ToStructArray());
+
+ SchemaExportGuard guard(out_schema);
+ if (out_schema != nullptr) {
+ // Export the schema, not the struct type, so as not to lose top-level metadata
+ RETURN_NOT_OK(ExportSchema(*batch.schema(), out_schema));
+ }
+ ArrayExporter exporter;
+ RETURN_NOT_OK(exporter.Export(array->data()));
+ exporter.Finish(out);
+ guard.Detach();
+ return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+// C schema import
+
+namespace {
+
+static constexpr int64_t kMaxImportRecursionLevel = 64;
+
+Status InvalidFormatString(util::string_view v) {
+ return Status::Invalid("Invalid or unsupported format string: '", v, "'");
+}
+
+class FormatStringParser {
+ public:
+  FormatStringParser() : index_(0) {}
+
+ explicit FormatStringParser(util::string_view v) : view_(v), index_(0) {}
+
+ bool AtEnd() const { return index_ >= view_.length(); }
+
+ char Next() { return view_[index_++]; }
+
+ util::string_view Rest() { return view_.substr(index_); }
+
+ Status CheckNext(char c) {
+ if (AtEnd() || Next() != c) {
+ return Invalid();
+ }
+ return Status::OK();
+ }
+
+ Status CheckHasNext() {
+ if (AtEnd()) {
+ return Invalid();
+ }
+ return Status::OK();
+ }
+
+ Status CheckAtEnd() {
+ if (!AtEnd()) {
+ return Invalid();
+ }
+ return Status::OK();
+ }
+
+ template <typename IntType = int32_t>
+ Result<IntType> ParseInt(util::string_view v) {
+ using ArrowIntType = typename CTypeTraits<IntType>::ArrowType;
+ IntType value;
+ if (!internal::ParseValue<ArrowIntType>(v.data(), v.size(), &value)) {
+ return Invalid();
+ }
+ return value;
+ }
+
+ Result<TimeUnit::type> ParseTimeUnit() {
+ RETURN_NOT_OK(CheckHasNext());
+ switch (Next()) {
+ case 's':
+ return TimeUnit::SECOND;
+ case 'm':
+ return TimeUnit::MILLI;
+ case 'u':
+ return TimeUnit::MICRO;
+ case 'n':
+ return TimeUnit::NANO;
+ default:
+ return Invalid();
+ }
+ }
+
+ std::vector<util::string_view> Split(util::string_view v, char delim = ',') {
+ std::vector<util::string_view> parts;
+ size_t start = 0, end;
+ while (true) {
+ end = v.find_first_of(delim, start);
+ parts.push_back(v.substr(start, end - start));
+ if (end == util::string_view::npos) {
+ break;
+ }
+ start = end + 1;
+ }
+ return parts;
+ }
+
+ template <typename IntType = int32_t>
+ Result<std::vector<IntType>> ParseInts(util::string_view v) {
+ auto parts = Split(v);
+ std::vector<IntType> result;
+ result.reserve(parts.size());
+ for (const auto& p : parts) {
+ ARROW_ASSIGN_OR_RAISE(auto i, ParseInt<IntType>(p));
+ result.push_back(i);
+ }
+ return result;
+ }
+
+ Status Invalid() { return InvalidFormatString(view_); }
+
+ protected:
+ util::string_view view_;
+ size_t index_;
+};
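+
+// As a reference, a few format strings accepted by ProcessFormat() below
+// (a non-exhaustive sketch, derived from the switch cases):
+//   "i"        -> int32()
+//   "u"        -> utf8()
+//   "d:19,10"  -> decimal128(19, 10)
+//   "tsu:UTC"  -> timestamp(TimeUnit::MICRO, "UTC")
+//   "+l"       -> list of the single child field
+//   "+ud:0,1"  -> dense union with type codes {0, 1}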
+
+Result<std::shared_ptr<KeyValueMetadata>> DecodeMetadata(const char* metadata) {
+ auto read_int32 = [&](int32_t* out) -> Status {
+ int32_t v;
+ memcpy(&v, metadata, 4);
+ metadata += 4;
+ *out = v;
+ if (*out < 0) {
+ return Status::Invalid("Invalid encoded metadata string");
+ }
+ return Status::OK();
+ };
+
+ auto read_string = [&](std::string* out) -> Status {
+ int32_t len;
+ RETURN_NOT_OK(read_int32(&len));
+ out->resize(len);
+ if (len > 0) {
+ memcpy(&(*out)[0], metadata, len);
+ metadata += len;
+ }
+ return Status::OK();
+ };
+
+ if (metadata == nullptr) {
+ return nullptr;
+ }
+ int32_t npairs;
+ RETURN_NOT_OK(read_int32(&npairs));
+ if (npairs == 0) {
+ return nullptr;
+ }
+ std::vector<std::string> keys(npairs);
+ std::vector<std::string> values(npairs);
+ for (int32_t i = 0; i < npairs; ++i) {
+ RETURN_NOT_OK(read_string(&keys[i]));
+ RETURN_NOT_OK(read_string(&values[i]));
+ }
+ return key_value_metadata(std::move(keys), std::move(values));
+}
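+
+// For reference, the layout decoded above (as defined by the C data
+// interface): an int32 pair count, then for each pair an int32 key length,
+// the key bytes, an int32 value length and the value bytes, all in native
+// endianness. For example, {"k": "v"} on a little-endian machine encodes as
+//   01 00 00 00 | 01 00 00 00 'k' | 01 00 00 00 'v'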
+
+struct SchemaImporter {
+ SchemaImporter() : c_struct_(nullptr), guard_(nullptr) {}
+
+ Status Import(struct ArrowSchema* src) {
+ if (ArrowSchemaIsReleased(src)) {
+ return Status::Invalid("Cannot import released ArrowSchema");
+ }
+ guard_.Reset(src);
+ recursion_level_ = 0;
+ c_struct_ = src;
+ return DoImport();
+ }
+
+ Result<std::shared_ptr<Field>> MakeField() const {
+ ARROW_ASSIGN_OR_RAISE(auto metadata, DecodeMetadata(c_struct_->metadata));
+ const char* name = c_struct_->name ? c_struct_->name : "";
+ bool nullable = (c_struct_->flags & ARROW_FLAG_NULLABLE) != 0;
+ return field(name, type_, nullable, std::move(metadata));
+ }
+
+ Result<std::shared_ptr<Schema>> MakeSchema() const {
+ if (type_->id() != Type::STRUCT) {
+ return Status::Invalid(
+ "Cannot import schema: ArrowSchema describes non-struct type ",
+ type_->ToString());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto metadata, DecodeMetadata(c_struct_->metadata));
+ return schema(type_->fields(), std::move(metadata));
+ }
+
+ Result<std::shared_ptr<DataType>> MakeType() const { return type_; }
+
+ protected:
+ Status ImportChild(const SchemaImporter* parent, struct ArrowSchema* src) {
+ if (ArrowSchemaIsReleased(src)) {
+ return Status::Invalid("Cannot import released ArrowSchema");
+ }
+ recursion_level_ = parent->recursion_level_ + 1;
+ if (recursion_level_ >= kMaxImportRecursionLevel) {
+ return Status::Invalid("Recursion level in ArrowSchema struct exceeded");
+ }
+ // The ArrowSchema is owned by its parent, so don't release it ourselves
+ c_struct_ = src;
+ return DoImport();
+ }
+
+ Status ImportDict(const SchemaImporter* parent, struct ArrowSchema* src) {
+ return ImportChild(parent, src);
+ }
+
+ Status DoImport() {
+ // First import children (required for reconstituting parent type)
+ child_importers_.resize(c_struct_->n_children);
+ for (int64_t i = 0; i < c_struct_->n_children; ++i) {
+ DCHECK_NE(c_struct_->children[i], nullptr);
+ RETURN_NOT_OK(child_importers_[i].ImportChild(this, c_struct_->children[i]));
+ }
+
+ // Import main type
+ RETURN_NOT_OK(ProcessFormat());
+ DCHECK_NE(type_, nullptr);
+
+ // Import dictionary type
+ if (c_struct_->dictionary != nullptr) {
+      // The dictionary index type must be an integer type
+      if (!is_integer(type_->id())) {
+        return Status::Invalid(
+            "ArrowSchema struct has a dictionary but its index type is not integer: ",
+ type_->ToString());
+ }
+ SchemaImporter dict_importer;
+ RETURN_NOT_OK(dict_importer.ImportDict(this, c_struct_->dictionary));
+ bool ordered = (c_struct_->flags & ARROW_FLAG_DICTIONARY_ORDERED) != 0;
+ type_ = dictionary(type_, dict_importer.type_, ordered);
+ }
+ return Status::OK();
+ }
+
+ Status ProcessFormat() {
+ f_parser_ = FormatStringParser(c_struct_->format);
+ RETURN_NOT_OK(f_parser_.CheckHasNext());
+ switch (f_parser_.Next()) {
+ case 'n':
+ return ProcessPrimitive(null());
+ case 'b':
+ return ProcessPrimitive(boolean());
+ case 'c':
+ return ProcessPrimitive(int8());
+ case 'C':
+ return ProcessPrimitive(uint8());
+ case 's':
+ return ProcessPrimitive(int16());
+ case 'S':
+ return ProcessPrimitive(uint16());
+ case 'i':
+ return ProcessPrimitive(int32());
+ case 'I':
+ return ProcessPrimitive(uint32());
+ case 'l':
+ return ProcessPrimitive(int64());
+ case 'L':
+ return ProcessPrimitive(uint64());
+ case 'e':
+ return ProcessPrimitive(float16());
+ case 'f':
+ return ProcessPrimitive(float32());
+ case 'g':
+ return ProcessPrimitive(float64());
+ case 'u':
+ return ProcessPrimitive(utf8());
+ case 'U':
+ return ProcessPrimitive(large_utf8());
+ case 'z':
+ return ProcessPrimitive(binary());
+ case 'Z':
+ return ProcessPrimitive(large_binary());
+ case 'w':
+ return ProcessFixedSizeBinary();
+ case 'd':
+ return ProcessDecimal();
+ case 't':
+ return ProcessTemporal();
+ case '+':
+ return ProcessNested();
+ }
+ return f_parser_.Invalid();
+ }
+
+ Status ProcessTemporal() {
+ RETURN_NOT_OK(f_parser_.CheckHasNext());
+ switch (f_parser_.Next()) {
+ case 'd':
+ return ProcessDate();
+ case 't':
+ return ProcessTime();
+ case 'D':
+ return ProcessDuration();
+ case 'i':
+ return ProcessInterval();
+ case 's':
+ return ProcessTimestamp();
+ }
+ return f_parser_.Invalid();
+ }
+
+ Status ProcessNested() {
+ RETURN_NOT_OK(f_parser_.CheckHasNext());
+ switch (f_parser_.Next()) {
+ case 'l':
+ return ProcessListLike<ListType>();
+ case 'L':
+ return ProcessListLike<LargeListType>();
+ case 'w':
+ return ProcessFixedSizeList();
+ case 's':
+ return ProcessStruct();
+ case 'm':
+ return ProcessMap();
+ case 'u':
+ return ProcessUnion();
+ }
+ return f_parser_.Invalid();
+ }
+
+ Status ProcessDate() {
+ RETURN_NOT_OK(f_parser_.CheckHasNext());
+ switch (f_parser_.Next()) {
+ case 'D':
+ return ProcessPrimitive(date32());
+ case 'm':
+ return ProcessPrimitive(date64());
+ }
+ return f_parser_.Invalid();
+ }
+
+ Status ProcessInterval() {
+ RETURN_NOT_OK(f_parser_.CheckHasNext());
+ switch (f_parser_.Next()) {
+ case 'D':
+ return ProcessPrimitive(day_time_interval());
+ case 'M':
+ return ProcessPrimitive(month_interval());
+ }
+ return f_parser_.Invalid();
+ }
+
+ Status ProcessTime() {
+ ARROW_ASSIGN_OR_RAISE(auto unit, f_parser_.ParseTimeUnit());
+ if (unit == TimeUnit::SECOND || unit == TimeUnit::MILLI) {
+ return ProcessPrimitive(time32(unit));
+ } else {
+ return ProcessPrimitive(time64(unit));
+ }
+ }
+
+ Status ProcessDuration() {
+ ARROW_ASSIGN_OR_RAISE(auto unit, f_parser_.ParseTimeUnit());
+ return ProcessPrimitive(duration(unit));
+ }
+
+ Status ProcessTimestamp() {
+ ARROW_ASSIGN_OR_RAISE(auto unit, f_parser_.ParseTimeUnit());
+ RETURN_NOT_OK(f_parser_.CheckNext(':'));
+ type_ = timestamp(unit, std::string(f_parser_.Rest()));
+ return Status::OK();
+ }
+
+ Status ProcessFixedSizeBinary() {
+ RETURN_NOT_OK(f_parser_.CheckNext(':'));
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, f_parser_.ParseInt(f_parser_.Rest()));
+ if (byte_width < 0) {
+ return f_parser_.Invalid();
+ }
+ type_ = fixed_size_binary(byte_width);
+ return Status::OK();
+ }
+
+ Status ProcessDecimal() {
+ RETURN_NOT_OK(f_parser_.CheckNext(':'));
+ ARROW_ASSIGN_OR_RAISE(auto prec_scale, f_parser_.ParseInts(f_parser_.Rest()));
+    // A third element, if present, communicates the bit width.
+ if (prec_scale.size() != 2 && prec_scale.size() != 3) {
+ return f_parser_.Invalid();
+ }
+ if (prec_scale[0] <= 0) {
+ return f_parser_.Invalid();
+ }
+ if (prec_scale.size() == 2 || prec_scale[2] == 128) {
+ type_ = decimal128(prec_scale[0], prec_scale[1]);
+ } else if (prec_scale[2] == 256) {
+ type_ = decimal256(prec_scale[0], prec_scale[1]);
+ } else {
+ return f_parser_.Invalid();
+ }
+ return Status::OK();
+ }
+
+ Status ProcessPrimitive(const std::shared_ptr<DataType>& type) {
+ RETURN_NOT_OK(f_parser_.CheckAtEnd());
+ type_ = type;
+ return CheckNoChildren(type);
+ }
+
+ template <typename ListType>
+ Status ProcessListLike() {
+ RETURN_NOT_OK(f_parser_.CheckAtEnd());
+ RETURN_NOT_OK(CheckNumChildren(1));
+ ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0));
+ type_ = std::make_shared<ListType>(field);
+ return Status::OK();
+ }
+
+ Status ProcessMap() {
+ RETURN_NOT_OK(f_parser_.CheckAtEnd());
+ RETURN_NOT_OK(CheckNumChildren(1));
+ ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0));
+ const auto& value_type = field->type();
+ if (value_type->id() != Type::STRUCT) {
+ return Status::Invalid("Imported map array has unexpected child field type: ",
+ field->ToString());
+ }
+    if (value_type->num_fields() != 2) {
+      return Status::Invalid(
+          "Imported map array child struct should have exactly 2 fields (key, value): ",
+          field->ToString());
+    }
+
+    bool keys_sorted = (c_struct_->flags & ARROW_FLAG_MAP_KEYS_SORTED) != 0;
+ type_ = map(value_type->field(0)->type(), value_type->field(1)->type(), keys_sorted);
+ return Status::OK();
+ }
+
+ Status ProcessFixedSizeList() {
+ RETURN_NOT_OK(f_parser_.CheckNext(':'));
+ ARROW_ASSIGN_OR_RAISE(auto list_size, f_parser_.ParseInt(f_parser_.Rest()));
+ if (list_size < 0) {
+ return f_parser_.Invalid();
+ }
+ RETURN_NOT_OK(CheckNumChildren(1));
+ ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0));
+ type_ = fixed_size_list(field, list_size);
+ return Status::OK();
+ }
+
+ Status ProcessStruct() {
+ RETURN_NOT_OK(f_parser_.CheckAtEnd());
+ ARROW_ASSIGN_OR_RAISE(auto fields, MakeChildFields());
+ type_ = struct_(std::move(fields));
+ return Status::OK();
+ }
+
+ Status ProcessUnion() {
+ RETURN_NOT_OK(f_parser_.CheckHasNext());
+ UnionMode::type mode;
+ switch (f_parser_.Next()) {
+ case 'd':
+ mode = UnionMode::DENSE;
+ break;
+ case 's':
+ mode = UnionMode::SPARSE;
+ break;
+ default:
+ return f_parser_.Invalid();
+ }
+ RETURN_NOT_OK(f_parser_.CheckNext(':'));
+ ARROW_ASSIGN_OR_RAISE(auto type_codes, f_parser_.ParseInts<int8_t>(f_parser_.Rest()));
+ ARROW_ASSIGN_OR_RAISE(auto fields, MakeChildFields());
+ if (fields.size() != type_codes.size()) {
+ return Status::Invalid(
+ "ArrowArray struct number of children incompatible with format string "
+ "(mismatching number of union type codes) ",
+ "'", c_struct_->format, "'");
+ }
+ for (const auto code : type_codes) {
+ if (code < 0) {
+ return Status::Invalid("Negative type code in union: format string '",
+ c_struct_->format, "'");
+ }
+ }
+ if (mode == UnionMode::SPARSE) {
+ type_ = sparse_union(std::move(fields), std::move(type_codes));
+ } else {
+ type_ = dense_union(std::move(fields), std::move(type_codes));
+ }
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<Field>> MakeChildField(int64_t child_id) {
+ const auto& child = child_importers_[child_id];
+ if (child.c_struct_->name == nullptr) {
+ return Status::Invalid("Expected non-null name in imported array child");
+ }
+ return child.MakeField();
+ }
+
+ Result<std::vector<std::shared_ptr<Field>>> MakeChildFields() {
+ std::vector<std::shared_ptr<Field>> fields(child_importers_.size());
+ for (int64_t i = 0; i < static_cast<int64_t>(child_importers_.size()); ++i) {
+ ARROW_ASSIGN_OR_RAISE(fields[i], MakeChildField(i));
+ }
+ return fields;
+ }
+
+ Status CheckNoChildren(const std::shared_ptr<DataType>& type) {
+ return CheckNumChildren(type, 0);
+ }
+
+ Status CheckNumChildren(const std::shared_ptr<DataType>& type, int64_t n_children) {
+ if (c_struct_->n_children != n_children) {
+ return Status::Invalid("Expected ", n_children, " children for imported type ",
+                             *type, ", ArrowSchema struct has ", c_struct_->n_children);
+ }
+ return Status::OK();
+ }
+
+ Status CheckNumChildren(int64_t n_children) {
+ if (c_struct_->n_children != n_children) {
+ return Status::Invalid("Expected ", n_children, " children for imported format '",
+                             c_struct_->format, "', ArrowSchema struct has ",
+ c_struct_->n_children);
+ }
+ return Status::OK();
+ }
+
+ struct ArrowSchema* c_struct_;
+ SchemaExportGuard guard_;
+ FormatStringParser f_parser_;
+ int64_t recursion_level_;
+ std::vector<SchemaImporter> child_importers_;
+ std::shared_ptr<DataType> type_;
+};
+
+} // namespace
+
+Result<std::shared_ptr<DataType>> ImportType(struct ArrowSchema* schema) {
+ SchemaImporter importer;
+ RETURN_NOT_OK(importer.Import(schema));
+ return importer.MakeType();
+}
+
+Result<std::shared_ptr<Field>> ImportField(struct ArrowSchema* schema) {
+ SchemaImporter importer;
+ RETURN_NOT_OK(importer.Import(schema));
+ return importer.MakeField();
+}
+
+Result<std::shared_ptr<Schema>> ImportSchema(struct ArrowSchema* schema) {
+ SchemaImporter importer;
+ RETURN_NOT_OK(importer.Import(schema));
+ return importer.MakeSchema();
+}
+
+//////////////////////////////////////////////////////////////////////////
+// C data import
+
+namespace {
+
+// A wrapper struct for an imported C ArrowArray.
+// The ArrowArray is released on destruction.
+struct ImportedArrayData {
+ struct ArrowArray array_;
+
+ ImportedArrayData() {
+ ArrowArrayMarkReleased(&array_); // Initially released
+ }
+
+ void Release() {
+ if (!ArrowArrayIsReleased(&array_)) {
+ ArrowArrayRelease(&array_);
+ DCHECK(ArrowArrayIsReleased(&array_));
+ }
+ }
+
+ ~ImportedArrayData() { Release(); }
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(ImportedArrayData);
+};
+
+// A buffer wrapping an imported piece of data.
+class ImportedBuffer : public Buffer {
+ public:
+ ImportedBuffer(const uint8_t* data, int64_t size,
+ std::shared_ptr<ImportedArrayData> import)
+ : Buffer(data, size), import_(std::move(import)) {}
+
+ ~ImportedBuffer() override {}
+
+ protected:
+ std::shared_ptr<ImportedArrayData> import_;
+};
+
+struct ArrayImporter {
+ explicit ArrayImporter(const std::shared_ptr<DataType>& type) : type_(type) {}
+
+ Status Import(struct ArrowArray* src) {
+ if (ArrowArrayIsReleased(src)) {
+ return Status::Invalid("Cannot import released ArrowArray");
+ }
+ recursion_level_ = 0;
+ import_ = std::make_shared<ImportedArrayData>();
+ c_struct_ = &import_->array_;
+ ArrowArrayMove(src, c_struct_);
+ return DoImport();
+ }
+
+ Result<std::shared_ptr<Array>> MakeArray() {
+ DCHECK_NE(data_, nullptr);
+ return ::arrow::MakeArray(data_);
+ }
+
+ std::shared_ptr<ArrayData> GetArrayData() {
+ DCHECK_NE(data_, nullptr);
+ return data_;
+ }
+
+ Result<std::shared_ptr<RecordBatch>> MakeRecordBatch(std::shared_ptr<Schema> schema) {
+ DCHECK_NE(data_, nullptr);
+ if (data_->GetNullCount() != 0) {
+ return Status::Invalid(
+ "ArrowArray struct has non-zero null count, "
+ "cannot be imported as RecordBatch");
+ }
+ if (data_->offset != 0) {
+ return Status::Invalid(
+ "ArrowArray struct has non-zero offset, "
+ "cannot be imported as RecordBatch");
+ }
+ return RecordBatch::Make(std::move(schema), data_->length,
+ std::move(data_->child_data));
+ }
+
+ Status ImportChild(const ArrayImporter* parent, struct ArrowArray* src) {
+ if (ArrowArrayIsReleased(src)) {
+ return Status::Invalid("Cannot import released ArrowArray");
+ }
+ recursion_level_ = parent->recursion_level_ + 1;
+ if (recursion_level_ >= kMaxImportRecursionLevel) {
+ return Status::Invalid("Recursion level in ArrowArray struct exceeded");
+ }
+ // Child buffers will keep the entire parent import alive.
+ // Perhaps we can move the child structs to an owned area
+ // when the parent ImportedArrayData::Release() gets called,
+ // but that is another level of complication.
+ import_ = parent->import_;
+ // The ArrowArray shouldn't be moved, it's owned by its parent
+ c_struct_ = src;
+ return DoImport();
+ }
+
+ Status ImportDict(const ArrayImporter* parent, struct ArrowArray* src) {
+ return ImportChild(parent, src);
+ }
+
+ Status DoImport() {
+ // First import children (required for reconstituting parent array data)
+ const auto& fields = type_->fields();
+ if (c_struct_->n_children != static_cast<int64_t>(fields.size())) {
+ return Status::Invalid("ArrowArray struct has ", c_struct_->n_children,
+ " children, expected ", fields.size(), " for type ",
+ type_->ToString());
+ }
+ child_importers_.reserve(fields.size());
+ for (int64_t i = 0; i < c_struct_->n_children; ++i) {
+ DCHECK_NE(c_struct_->children[i], nullptr);
+ child_importers_.emplace_back(fields[i]->type());
+ RETURN_NOT_OK(child_importers_.back().ImportChild(this, c_struct_->children[i]));
+ }
+
+ // Import main data
+ RETURN_NOT_OK(ImportMainData());
+
+ bool is_dict_type = (type_->id() == Type::DICTIONARY);
+ if (c_struct_->dictionary != nullptr) {
+ if (!is_dict_type) {
+ return Status::Invalid("Import type is ", type_->ToString(),
+ " but dictionary field in ArrowArray struct is not null");
+ }
+ const auto& dict_type = checked_cast<const DictionaryType&>(*type_);
+ // Import dictionary values
+ ArrayImporter dict_importer(dict_type.value_type());
+ RETURN_NOT_OK(dict_importer.ImportDict(this, c_struct_->dictionary));
+ data_->dictionary = dict_importer.GetArrayData();
+ } else {
+ if (is_dict_type) {
+ return Status::Invalid("Import type is ", type_->ToString(),
+ " but dictionary field in ArrowArray struct is null");
+ }
+ }
+ return Status::OK();
+ }
+
+ Status ImportMainData() { return VisitTypeInline(*type_, this); }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("Cannot import array of type ", type_->ToString());
+ }
+
+ Status Visit(const FixedWidthType& type) { return ImportFixedSizePrimitive(); }
+
+ Status Visit(const NullType& type) {
+ RETURN_NOT_OK(CheckNoChildren());
+ // XXX should we be lenient on the number of buffers?
+ RETURN_NOT_OK(CheckNumBuffers(1));
+ RETURN_NOT_OK(AllocateArrayData());
+ RETURN_NOT_OK(ImportBitsBuffer(0));
+ return Status::OK();
+ }
+
+ Status Visit(const StringType& type) { return ImportStringLike(type); }
+
+ Status Visit(const BinaryType& type) { return ImportStringLike(type); }
+
+ Status Visit(const LargeStringType& type) { return ImportStringLike(type); }
+
+ Status Visit(const LargeBinaryType& type) { return ImportStringLike(type); }
+
+ Status Visit(const ListType& type) { return ImportListLike(type); }
+
+ Status Visit(const LargeListType& type) { return ImportListLike(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ RETURN_NOT_OK(CheckNumChildren(1));
+ RETURN_NOT_OK(CheckNumBuffers(1));
+ RETURN_NOT_OK(AllocateArrayData());
+ RETURN_NOT_OK(ImportNullBitmap());
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ RETURN_NOT_OK(CheckNumBuffers(1));
+ RETURN_NOT_OK(AllocateArrayData());
+ RETURN_NOT_OK(ImportNullBitmap());
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ auto mode = type.mode();
+ if (mode == UnionMode::SPARSE) {
+ RETURN_NOT_OK(CheckNumBuffers(2));
+ } else {
+ RETURN_NOT_OK(CheckNumBuffers(3));
+ }
+ RETURN_NOT_OK(AllocateArrayData());
+ RETURN_NOT_OK(ImportNullBitmap());
+ RETURN_NOT_OK(ImportFixedSizeBuffer(1, sizeof(int8_t)));
+ if (mode == UnionMode::DENSE) {
+ RETURN_NOT_OK(ImportFixedSizeBuffer(2, sizeof(int32_t)));
+ }
+ return Status::OK();
+ }
+
+ Status ImportFixedSizePrimitive() {
+ const auto& fw_type = checked_cast<const FixedWidthType&>(*type_);
+ RETURN_NOT_OK(CheckNoChildren());
+ RETURN_NOT_OK(CheckNumBuffers(2));
+ RETURN_NOT_OK(AllocateArrayData());
+ RETURN_NOT_OK(ImportNullBitmap());
+ if (BitUtil::IsMultipleOf8(fw_type.bit_width())) {
+ RETURN_NOT_OK(ImportFixedSizeBuffer(1, fw_type.bit_width() / 8));
+ } else {
+ DCHECK_EQ(fw_type.bit_width(), 1);
+ RETURN_NOT_OK(ImportBitsBuffer(1));
+ }
+ return Status::OK();
+ }
+
+ template <typename StringType>
+ Status ImportStringLike(const StringType& type) {
+ RETURN_NOT_OK(CheckNoChildren());
+ RETURN_NOT_OK(CheckNumBuffers(3));
+ RETURN_NOT_OK(AllocateArrayData());
+ RETURN_NOT_OK(ImportNullBitmap());
+ RETURN_NOT_OK(ImportOffsetsBuffer<typename StringType::offset_type>(1));
+ RETURN_NOT_OK(ImportStringValuesBuffer<typename StringType::offset_type>(1, 2));
+ return Status::OK();
+ }
+
+ template <typename ListType>
+ Status ImportListLike(const ListType& type) {
+ RETURN_NOT_OK(CheckNumChildren(1));
+ RETURN_NOT_OK(CheckNumBuffers(2));
+ RETURN_NOT_OK(AllocateArrayData());
+ RETURN_NOT_OK(ImportNullBitmap());
+ RETURN_NOT_OK(ImportOffsetsBuffer<typename ListType::offset_type>(1));
+ return Status::OK();
+ }
+
+ Status CheckNoChildren() { return CheckNumChildren(0); }
+
+ Status CheckNumChildren(int64_t n_children) {
+ if (c_struct_->n_children != n_children) {
+ return Status::Invalid("Expected ", n_children, " children for imported type ",
+ type_->ToString(), ", ArrowArray struct has ",
+ c_struct_->n_children);
+ }
+ return Status::OK();
+ }
+
+ Status CheckNumBuffers(int64_t n_buffers) {
+ if (n_buffers != c_struct_->n_buffers) {
+ return Status::Invalid("Expected ", n_buffers, " buffers for imported type ",
+ type_->ToString(), ", ArrowArray struct has ",
+ c_struct_->n_buffers);
+ }
+ return Status::OK();
+ }
+
+ Status AllocateArrayData() {
+ DCHECK_EQ(data_, nullptr);
+ data_ = std::make_shared<ArrayData>(type_, c_struct_->length, c_struct_->null_count,
+ c_struct_->offset);
+ data_->buffers.resize(static_cast<size_t>(c_struct_->n_buffers));
+ data_->child_data.resize(static_cast<size_t>(c_struct_->n_children));
+ DCHECK_EQ(child_importers_.size(), data_->child_data.size());
+ std::transform(child_importers_.begin(), child_importers_.end(),
+ data_->child_data.begin(),
+ [](const ArrayImporter& child) { return child.data_; });
+ return Status::OK();
+ }
+
+ Status ImportNullBitmap(int32_t buffer_id = 0) {
+ RETURN_NOT_OK(ImportBitsBuffer(buffer_id));
+ if (data_->null_count > 0 && data_->buffers[buffer_id] == nullptr) {
+ return Status::Invalid(
+ "ArrowArray struct has null bitmap buffer but non-zero null_count ",
+ data_->null_count);
+ }
+ return Status::OK();
+ }
+
+ Status ImportBitsBuffer(int32_t buffer_id) {
+ // Compute visible size of buffer
+ int64_t buffer_size = BitUtil::BytesForBits(c_struct_->length + c_struct_->offset);
+ return ImportBuffer(buffer_id, buffer_size);
+ }
+
+ Status ImportFixedSizeBuffer(int32_t buffer_id, int64_t byte_width) {
+ // Compute visible size of buffer
+ int64_t buffer_size = byte_width * (c_struct_->length + c_struct_->offset);
+ return ImportBuffer(buffer_id, buffer_size);
+ }
+
+ template <typename OffsetType>
+ Status ImportOffsetsBuffer(int32_t buffer_id) {
+ // Compute visible size of buffer
+ int64_t buffer_size =
+ sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + 1);
+ return ImportBuffer(buffer_id, buffer_size);
+ }
+
+ template <typename OffsetType>
+ Status ImportStringValuesBuffer(int32_t offsets_buffer_id, int32_t buffer_id,
+ int64_t byte_width = 1) {
+ auto offsets = data_->GetValues<OffsetType>(offsets_buffer_id);
+ // Compute visible size of buffer
+ int64_t buffer_size = byte_width * offsets[c_struct_->length];
+ return ImportBuffer(buffer_id, buffer_size);
+ }
+
+ Status ImportBuffer(int32_t buffer_id, int64_t buffer_size) {
+ std::shared_ptr<Buffer>* out = &data_->buffers[buffer_id];
+ auto data = reinterpret_cast<const uint8_t*>(c_struct_->buffers[buffer_id]);
+ if (data != nullptr) {
+ *out = std::make_shared<ImportedBuffer>(data, buffer_size, import_);
+ } else {
+ out->reset();
+ }
+ return Status::OK();
+ }
+
+ struct ArrowArray* c_struct_;
+ int64_t recursion_level_;
+ const std::shared_ptr<DataType>& type_;
+
+ std::shared_ptr<ImportedArrayData> import_;
+ std::shared_ptr<ArrayData> data_;
+ std::vector<ArrayImporter> child_importers_;
+};
+
+} // namespace
+
+Result<std::shared_ptr<Array>> ImportArray(struct ArrowArray* array,
+ std::shared_ptr<DataType> type) {
+ ArrayImporter importer(type);
+ RETURN_NOT_OK(importer.Import(array));
+ return importer.MakeArray();
+}
+
+Result<std::shared_ptr<Array>> ImportArray(struct ArrowArray* array,
+ struct ArrowSchema* type) {
+ auto maybe_type = ImportType(type);
+ if (!maybe_type.ok()) {
+ ArrowArrayRelease(array);
+ return maybe_type.status();
+ }
+ return ImportArray(array, *maybe_type);
+}
+
+Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
+ std::shared_ptr<Schema> schema) {
+ auto type = struct_(schema->fields());
+ ArrayImporter importer(type);
+ RETURN_NOT_OK(importer.Import(array));
+ return importer.MakeRecordBatch(std::move(schema));
+}
+
+Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
+ struct ArrowSchema* schema) {
+ auto maybe_schema = ImportSchema(schema);
+ if (!maybe_schema.ok()) {
+ ArrowArrayRelease(array);
+ return maybe_schema.status();
+ }
+ return ImportRecordBatch(array, *maybe_schema);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// C stream export
+
+namespace {
+
+class ExportedArrayStream {
+ public:
+ struct PrivateData {
+ explicit PrivateData(std::shared_ptr<RecordBatchReader> reader)
+ : reader_(std::move(reader)) {}
+
+ std::shared_ptr<RecordBatchReader> reader_;
+ std::string last_error_;
+
+ PrivateData() = default;
+ ARROW_DISALLOW_COPY_AND_ASSIGN(PrivateData);
+ };
+
+ explicit ExportedArrayStream(struct ArrowArrayStream* stream) : stream_(stream) {}
+
+ Status GetSchema(struct ArrowSchema* out_schema) {
+ return ExportSchema(*reader()->schema(), out_schema);
+ }
+
+ Status GetNext(struct ArrowArray* out_array) {
+ std::shared_ptr<RecordBatch> batch;
+ RETURN_NOT_OK(reader()->ReadNext(&batch));
+ if (batch == nullptr) {
+ // End of stream
+ ArrowArrayMarkReleased(out_array);
+ return Status::OK();
+ } else {
+ return ExportRecordBatch(*batch, out_array);
+ }
+ }
+
+ const char* GetLastError() {
+ const auto& last_error = private_data()->last_error_;
+ return last_error.empty() ? nullptr : last_error.c_str();
+ }
+
+ void Release() {
+ if (ArrowArrayStreamIsReleased(stream_)) {
+ return;
+ }
+ DCHECK_NE(private_data(), nullptr);
+ delete private_data();
+
+ ArrowArrayStreamMarkReleased(stream_);
+ }
+
+ // C-compatible callbacks
+
+ static int StaticGetSchema(struct ArrowArrayStream* stream,
+ struct ArrowSchema* out_schema) {
+ ExportedArrayStream self{stream};
+ return self.ToCError(self.GetSchema(out_schema));
+ }
+
+ static int StaticGetNext(struct ArrowArrayStream* stream,
+ struct ArrowArray* out_array) {
+ ExportedArrayStream self{stream};
+ return self.ToCError(self.GetNext(out_array));
+ }
+
+ static void StaticRelease(struct ArrowArrayStream* stream) {
+ ExportedArrayStream{stream}.Release();
+ }
+
+ static const char* StaticGetLastError(struct ArrowArrayStream* stream) {
+ return ExportedArrayStream{stream}.GetLastError();
+ }
+
+ private:
+ int ToCError(const Status& status) {
+ if (ARROW_PREDICT_TRUE(status.ok())) {
+ private_data()->last_error_.clear();
+ return 0;
+ }
+ private_data()->last_error_ = status.ToString();
+ switch (status.code()) {
+ case StatusCode::IOError:
+ return EIO;
+ case StatusCode::NotImplemented:
+ return ENOSYS;
+ case StatusCode::OutOfMemory:
+ return ENOMEM;
+ default:
+ return EINVAL; // Fallback for Invalid, TypeError, etc.
+ }
+ }
+
+ PrivateData* private_data() {
+ return reinterpret_cast<PrivateData*>(stream_->private_data);
+ }
+
+ const std::shared_ptr<RecordBatchReader>& reader() { return private_data()->reader_; }
+
+ struct ArrowArrayStream* stream_;
+};
+
+} // namespace
+
+Status ExportRecordBatchReader(std::shared_ptr<RecordBatchReader> reader,
+ struct ArrowArrayStream* out) {
+ out->get_schema = ExportedArrayStream::StaticGetSchema;
+ out->get_next = ExportedArrayStream::StaticGetNext;
+ out->get_last_error = ExportedArrayStream::StaticGetLastError;
+ out->release = ExportedArrayStream::StaticRelease;
+ out->private_data = new ExportedArrayStream::PrivateData{std::move(reader)};
+ return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+// C stream import
+
+namespace {
+
+class ArrayStreamBatchReader : public RecordBatchReader {
+ public:
+ explicit ArrayStreamBatchReader(struct ArrowArrayStream* stream) {
+ ArrowArrayStreamMove(stream, &stream_);
+ DCHECK(!ArrowArrayStreamIsReleased(&stream_));
+ }
+
+ ~ArrayStreamBatchReader() {
+ ArrowArrayStreamRelease(&stream_);
+ DCHECK(ArrowArrayStreamIsReleased(&stream_));
+ }
+
+ std::shared_ptr<Schema> schema() const override { return CacheSchema(); }
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* batch) override {
+ struct ArrowArray c_array;
+ RETURN_NOT_OK(StatusFromCError(stream_.get_next(&stream_, &c_array)));
+ if (ArrowArrayIsReleased(&c_array)) {
+ // End of stream
+ batch->reset();
+ return Status::OK();
+ } else {
+ return ImportRecordBatch(&c_array, CacheSchema()).Value(batch);
+ }
+ }
+
+ private:
+ std::shared_ptr<Schema> CacheSchema() const {
+ if (!schema_) {
+ struct ArrowSchema c_schema;
+ ARROW_CHECK_OK(StatusFromCError(stream_.get_schema(&stream_, &c_schema)));
+ schema_ = ImportSchema(&c_schema).ValueOrDie();
+ }
+ return schema_;
+ }
+
+ Status StatusFromCError(int errno_like) const {
+ if (ARROW_PREDICT_TRUE(errno_like == 0)) {
+ return Status::OK();
+ }
+ StatusCode code;
+ switch (errno_like) {
+ case EDOM:
+ case EINVAL:
+ case ERANGE:
+ code = StatusCode::Invalid;
+ break;
+ case ENOMEM:
+ code = StatusCode::OutOfMemory;
+ break;
+      case ENOSYS:
+        code = StatusCode::NotImplemented;
+        break;
+      default:
+ code = StatusCode::IOError;
+ break;
+ }
+ const char* last_error = stream_.get_last_error(&stream_);
+ return Status(code, last_error ? std::string(last_error) : "");
+ }
+
+ mutable struct ArrowArrayStream stream_;
+ mutable std::shared_ptr<Schema> schema_;
+};
+
+} // namespace
+
+Result<std::shared_ptr<RecordBatchReader>> ImportRecordBatchReader(
+ struct ArrowArrayStream* stream) {
+ if (ArrowArrayStreamIsReleased(stream)) {
+ return Status::Invalid("Cannot import released ArrowArrayStream");
+ }
+ // XXX should we call get_schema() here to avoid crashing on error?
+ return std::make_shared<ArrayStreamBatchReader>(stream);
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.h b/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.h
new file mode 100644
index 00000000000..294f53e49fb
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.h
@@ -0,0 +1,197 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/c/abi.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \defgroup c-data-interface Functions for working with the C data interface.
+///
+/// @{
+
+/// \brief Export C++ DataType using the C data interface format.
+///
+/// The root type is considered to have empty name and metadata.
+/// If you want the root type to have a name and/or metadata, pass
+/// a Field instead.
+///
+/// \param[in] type DataType object to export
+/// \param[out] out C struct where to export the datatype
+ARROW_EXPORT
+Status ExportType(const DataType& type, struct ArrowSchema* out);
+
+/// \brief Export C++ Field using the C data interface format.
+///
+/// \param[in] field Field object to export
+/// \param[out] out C struct where to export the field
+ARROW_EXPORT
+Status ExportField(const Field& field, struct ArrowSchema* out);
+
+/// \brief Export C++ Schema using the C data interface format.
+///
+/// \param[in] schema Schema object to export
+/// \param[out] out C struct where to export the schema
+ARROW_EXPORT
+Status ExportSchema(const Schema& schema, struct ArrowSchema* out);
+
+/// \brief Export C++ Array using the C data interface format.
+///
+/// The resulting ArrowArray struct keeps the array data and buffers alive
+/// until its release callback is called by the consumer.
+///
+/// \param[in] array Array object to export
+/// \param[out] out C struct where to export the array
+/// \param[out] out_schema optional C struct where to export the array type
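+///
+/// A minimal usage sketch (assuming `arr` is a valid std::shared_ptr<Array>;
+/// error handling elided):
+///
+///     struct ArrowArray c_array;
+///     struct ArrowSchema c_schema;
+///     ARROW_RETURN_NOT_OK(ExportArray(*arr, &c_array, &c_schema));
+///     // The consumer must eventually call c_array.release(&c_array) and
+///     // c_schema.release(&c_schema), e.g. through the helpers.h functions.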
+ARROW_EXPORT
+Status ExportArray(const Array& array, struct ArrowArray* out,
+ struct ArrowSchema* out_schema = NULLPTR);
+
+/// \brief Export C++ RecordBatch using the C data interface format.
+///
+/// The record batch is exported as if it were a struct array.
+/// The resulting ArrowArray struct keeps the record batch data and buffers alive
+/// until its release callback is called by the consumer.
+///
+/// \param[in] batch Record batch to export
+/// \param[out] out C struct where to export the record batch
+/// \param[out] out_schema optional C struct where to export the record batch schema
+ARROW_EXPORT
+Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out,
+ struct ArrowSchema* out_schema = NULLPTR);
+
+/// \brief Import C++ DataType from the C data interface.
+///
+/// The given ArrowSchema struct is released (as per the C data interface
+/// specification), even if this function fails.
+///
+/// \param[in,out] schema C data interface struct representing the data type
+/// \return Imported type object
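+///
+/// A short sketch (the struct typically comes from a foreign producer; note
+/// that it is consumed even on error):
+///
+///     struct ArrowSchema c_schema;
+///     // ... filled by a producer ...
+///     ARROW_ASSIGN_OR_RAISE(auto type, ImportType(&c_schema));
+///     // c_schema has been released; do not use it again.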
+ARROW_EXPORT
+Result<std::shared_ptr<DataType>> ImportType(struct ArrowSchema* schema);
+
+/// \brief Import C++ Field from the C data interface.
+///
+/// The given ArrowSchema struct is released (as per the C data interface
+/// specification), even if this function fails.
+///
+/// \param[in,out] schema C data interface struct representing the field
+/// \return Imported field object
+ARROW_EXPORT
+Result<std::shared_ptr<Field>> ImportField(struct ArrowSchema* schema);
+
+/// \brief Import C++ Schema from the C data interface.
+///
+/// The given ArrowSchema struct is released (as per the C data interface
+/// specification), even if this function fails.
+///
+/// \param[in,out] schema C data interface struct representing the schema
+/// \return Imported schema object
+ARROW_EXPORT
+Result<std::shared_ptr<Schema>> ImportSchema(struct ArrowSchema* schema);
+
+/// \brief Import C++ array from the C data interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting array.
+///
+/// \param[in,out] array C data interface struct holding the array data
+/// \param[in] type type of the imported array
+/// \return Imported array object
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ImportArray(struct ArrowArray* array,
+ std::shared_ptr<DataType> type);
+
+/// \brief Import C++ array and its type from the C data interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting array.
+/// The ArrowSchema struct is released, even if this function fails.
+///
+/// \param[in,out] array C data interface struct holding the array data
+/// \param[in,out] type C data interface struct holding the array type
+/// \return Imported array object
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ImportArray(struct ArrowArray* array,
+ struct ArrowSchema* type);
+
+/// \brief Import C++ record batch from the C data interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting record batch.
+///
+/// \param[in,out] array C data interface struct holding the record batch data
+/// \param[in] schema schema of the imported record batch
+/// \return Imported record batch object
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
+ std::shared_ptr<Schema> schema);
+
+/// \brief Import C++ record batch and its schema from the C data interface.
+///
+/// The type represented by the ArrowSchema struct must be a struct type.
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting record batch.
+/// The ArrowSchema struct is released, even if this function fails.
+///
+/// \param[in,out] array C data interface struct holding the record batch data
+/// \param[in,out] schema C data interface struct holding the record batch schema
+/// \return Imported record batch object
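+///
+/// A round-trip sketch (both structs received from a producer):
+///
+///     ARROW_ASSIGN_OR_RAISE(auto batch, ImportRecordBatch(&c_array, &c_schema));
+///     // c_array's contents were moved and c_schema was released;
+///     // the resulting batch keeps the imported buffers alive.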
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
+ struct ArrowSchema* schema);
+
+/// @}
+
+/// \defgroup c-stream-interface Functions for working with the C stream interface.
+///
+/// @{
+
+/// \brief EXPERIMENTAL: Export C++ RecordBatchReader using the C stream interface.
+///
+/// The resulting ArrowArrayStream struct keeps the record batch reader alive
+/// until its release callback is called by the consumer.
+///
+/// \param[in] reader RecordBatchReader object to export
+/// \param[out] out C struct where to export the stream
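+///
+/// A minimal sketch (assuming `reader` is a valid
+/// std::shared_ptr<RecordBatchReader>):
+///
+///     struct ArrowArrayStream c_stream;
+///     ARROW_RETURN_NOT_OK(ExportRecordBatchReader(reader, &c_stream));
+///     // The consumer pulls batches through c_stream.get_next() and must
+///     // finally call c_stream.release(&c_stream).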
+ARROW_EXPORT
+Status ExportRecordBatchReader(std::shared_ptr<RecordBatchReader> reader,
+ struct ArrowArrayStream* out);
+
+/// \brief EXPERIMENTAL: Import C++ RecordBatchReader from the C stream interface.
+///
+/// The ArrowArrayStream struct has its contents moved to a private object
+/// held alive by the resulting record batch reader.
+///
+/// \param[in,out] stream C stream interface struct
+/// \return Imported RecordBatchReader object
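+///
+/// A minimal sketch (stream received from a producer):
+///
+///     ARROW_ASSIGN_OR_RAISE(auto reader, ImportRecordBatchReader(&c_stream));
+///     std::shared_ptr<RecordBatch> batch;
+///     ARROW_RETURN_NOT_OK(reader->ReadNext(&batch));  // nullptr means end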
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchReader>> ImportRecordBatchReader(
+ struct ArrowArrayStream* stream);
+
+/// @}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/c/helpers.h b/contrib/libs/apache/arrow/cpp/src/arrow/c/helpers.h
new file mode 100644
index 00000000000..a5c1f6fe4ba
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/c/helpers.h
@@ -0,0 +1,117 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <assert.h>
+#include <string.h>
+
+#include "arrow/c/abi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// Query whether the C schema is released
+inline int ArrowSchemaIsReleased(const struct ArrowSchema* schema) {
+ return schema->release == NULL;
+}
+
+/// Mark the C schema released (for use in release callbacks)
+inline void ArrowSchemaMarkReleased(struct ArrowSchema* schema) {
+ schema->release = NULL;
+}
+
+/// Move the C schema from `src` to `dest`
+///
+/// Note `dest` must *not* point to a valid schema already, otherwise there
+/// will be a memory leak.
+inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dest) {
+ assert(dest != src);
+ assert(!ArrowSchemaIsReleased(src));
+ memcpy(dest, src, sizeof(struct ArrowSchema));
+ ArrowSchemaMarkReleased(src);
+}
+
+/// Release the C schema, if necessary, by calling its release callback
+inline void ArrowSchemaRelease(struct ArrowSchema* schema) {
+ if (!ArrowSchemaIsReleased(schema)) {
+ schema->release(schema);
+ assert(ArrowSchemaIsReleased(schema));
+ }
+}
+
+/// Query whether the C array is released
+inline int ArrowArrayIsReleased(const struct ArrowArray* array) {
+ return array->release == NULL;
+}
+
+/// Mark the C array released (for use in release callbacks)
+inline void ArrowArrayMarkReleased(struct ArrowArray* array) { array->release = NULL; }
+
+/// Move the C array from `src` to `dest`
+///
+/// Note `dest` must *not* point to a valid array already, otherwise there
+/// will be a memory leak.
+inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dest) {
+ assert(dest != src);
+ assert(!ArrowArrayIsReleased(src));
+ memcpy(dest, src, sizeof(struct ArrowArray));
+ ArrowArrayMarkReleased(src);
+}
+
+/// Release the C array, if necessary, by calling its release callback
+inline void ArrowArrayRelease(struct ArrowArray* array) {
+ if (!ArrowArrayIsReleased(array)) {
+ array->release(array);
+ assert(ArrowArrayIsReleased(array));
+ }
+}
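+
+/* A typical consumer-side pattern (a sketch; `produce_array` stands in for
+   whatever producer fills the struct):
+
+     struct ArrowArray array;
+     produce_array(&array);
+     // ... read array.length, array.buffers, array.children ...
+     ArrowArrayRelease(&array);  // no-op if the producer already released it
+*/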
+
+/// Query whether the C array stream is released
+inline int ArrowArrayStreamIsReleased(const struct ArrowArrayStream* stream) {
+ return stream->release == NULL;
+}
+
+/// Mark the C array stream released (for use in release callbacks)
+inline void ArrowArrayStreamMarkReleased(struct ArrowArrayStream* stream) {
+ stream->release = NULL;
+}
+
+/// Move the C array stream from `src` to `dest`
+///
+/// Note `dest` must *not* point to a valid stream already, otherwise there
+/// will be a memory leak.
+inline void ArrowArrayStreamMove(struct ArrowArrayStream* src,
+ struct ArrowArrayStream* dest) {
+ assert(dest != src);
+ assert(!ArrowArrayStreamIsReleased(src));
+ memcpy(dest, src, sizeof(struct ArrowArrayStream));
+ ArrowArrayStreamMarkReleased(src);
+}
+
+/// Release the C array stream, if necessary, by calling its release callback
+inline void ArrowArrayStreamRelease(struct ArrowArrayStream* stream) {
+ if (!ArrowArrayStreamIsReleased(stream)) {
+ stream->release(stream);
+ assert(ArrowArrayStreamIsReleased(stream));
+ }
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/c/util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/c/util_internal.h
new file mode 100644
index 00000000000..6a33be9b0da
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/c/util_internal.h
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/c/helpers.h"
+
+namespace arrow {
+namespace internal {
+
+struct SchemaExportTraits {
+ typedef struct ArrowSchema CType;
+ static constexpr auto IsReleasedFunc = &ArrowSchemaIsReleased;
+ static constexpr auto ReleaseFunc = &ArrowSchemaRelease;
+};
+
+struct ArrayExportTraits {
+ typedef struct ArrowArray CType;
+ static constexpr auto IsReleasedFunc = &ArrowArrayIsReleased;
+ static constexpr auto ReleaseFunc = &ArrowArrayRelease;
+};
+
+struct ArrayStreamExportTraits {
+ typedef struct ArrowArrayStream CType;
+ static constexpr auto IsReleasedFunc = &ArrowArrayStreamIsReleased;
+ static constexpr auto ReleaseFunc = &ArrowArrayStreamRelease;
+};
+
+// A RAII-style object to release a C Array / Schema struct at block scope exit.
+template <typename Traits>
+class ExportGuard {
+ public:
+ using CType = typename Traits::CType;
+
+ explicit ExportGuard(CType* c_export) : c_export_(c_export) {}
+
+ ExportGuard(ExportGuard&& other) : c_export_(other.c_export_) {
+ other.c_export_ = nullptr;
+ }
+
+ ExportGuard& operator=(ExportGuard&& other) {
+ Release();
+ c_export_ = other.c_export_;
+ other.c_export_ = nullptr;
+ }
+
+ ~ExportGuard() { Release(); }
+
+ void Detach() { c_export_ = nullptr; }
+
+ void Reset(CType* c_export) { c_export_ = c_export; }
+
+ void Release() {
+ if (c_export_) {
+ Traits::ReleaseFunc(c_export_);
+ c_export_ = nullptr;
+ }
+ }
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(ExportGuard);
+
+ CType* c_export_;
+};
+
+using SchemaExportGuard = ExportGuard<SchemaExportTraits>;
+using ArrayExportGuard = ExportGuard<ArrayExportTraits>;
+using ArrayStreamExportGuard = ExportGuard<ArrayStreamExportTraits>;
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc
new file mode 100644
index 00000000000..142bd0d8c89
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc
@@ -0,0 +1,294 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/chunked_array.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <memory>
+#include <sstream>
+#include <utility>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_nested.h"
+#include "arrow/array/validate.h"
+#include "arrow/pretty_print.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+class MemoryPool;
+
+// ----------------------------------------------------------------------
+// ChunkedArray methods
+
+ChunkedArray::ChunkedArray(ArrayVector chunks) : chunks_(std::move(chunks)) {
+ length_ = 0;
+ null_count_ = 0;
+
+ ARROW_CHECK_GT(chunks_.size(), 0)
+ << "cannot construct ChunkedArray from empty vector and omitted type";
+ type_ = chunks_[0]->type();
+ for (const std::shared_ptr<Array>& chunk : chunks_) {
+ length_ += chunk->length();
+ null_count_ += chunk->null_count();
+ }
+}
+
+ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr<DataType> type)
+ : chunks_(std::move(chunks)), type_(std::move(type)) {
+ length_ = 0;
+ null_count_ = 0;
+ for (const std::shared_ptr<Array>& chunk : chunks_) {
+ length_ += chunk->length();
+ null_count_ += chunk->null_count();
+ }
+}
+
+Result<std::shared_ptr<ChunkedArray>> ChunkedArray::Make(ArrayVector chunks,
+ std::shared_ptr<DataType> type) {
+ if (type == nullptr) {
+ if (chunks.size() == 0) {
+ return Status::Invalid(
+ "cannot construct ChunkedArray from empty vector "
+ "and omitted type");
+ }
+ type = chunks[0]->type();
+ }
+ for (size_t i = 0; i < chunks.size(); ++i) {
+ if (!chunks[i]->type()->Equals(*type)) {
+ return Status::Invalid("Array chunks must all be same type");
+ }
+ }
+ return std::make_shared<ChunkedArray>(std::move(chunks), std::move(type));
+}
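+
+// Example (a sketch; `a1` and `a2` are std::shared_ptr<Array> chunks of the
+// same type, built elsewhere):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto chunked, ChunkedArray::Make({a1, a2}));
+//   // chunked->length() == a1->length() + a2->length()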
+
+bool ChunkedArray::Equals(const ChunkedArray& other) const {
+ if (length_ != other.length()) {
+ return false;
+ }
+ if (null_count_ != other.null_count()) {
+ return false;
+ }
+ // We cannot toggle check_metadata here yet, so we don't check it
+ if (!type_->Equals(*other.type_, /*check_metadata=*/false)) {
+ return false;
+ }
+
+ // Check contents of the underlying arrays. This checks for equality of
+ // the underlying data independently of the chunk size.
+ return internal::ApplyBinaryChunked(
+ *this, other,
+ [](const Array& left_piece, const Array& right_piece,
+ int64_t ARROW_ARG_UNUSED(position)) {
+ if (!left_piece.Equals(right_piece)) {
+ return Status::Invalid("Unequal piece");
+ }
+ return Status::OK();
+ })
+ .ok();
+}
+
+bool ChunkedArray::Equals(const std::shared_ptr<ChunkedArray>& other) const {
+ if (this == other.get()) {
+ return true;
+ }
+ if (!other) {
+ return false;
+ }
+ return Equals(*other.get());
+}
+
+bool ChunkedArray::ApproxEquals(const ChunkedArray& other,
+ const EqualOptions& equal_options) const {
+ if (length_ != other.length()) {
+ return false;
+ }
+ if (null_count_ != other.null_count()) {
+ return false;
+ }
+ // We cannot toggle check_metadata here yet, so we don't check it
+ if (!type_->Equals(*other.type_, /*check_metadata=*/false)) {
+ return false;
+ }
+
+ // Check contents of the underlying arrays. This checks for equality of
+ // the underlying data independently of the chunk size.
+ return internal::ApplyBinaryChunked(
+ *this, other,
+ [&](const Array& left_piece, const Array& right_piece,
+ int64_t ARROW_ARG_UNUSED(position)) {
+ if (!left_piece.ApproxEquals(right_piece, equal_options)) {
+ return Status::Invalid("Unequal piece");
+ }
+ return Status::OK();
+ })
+ .ok();
+}
+
+std::shared_ptr<ChunkedArray> ChunkedArray::Slice(int64_t offset, int64_t length) const {
+ ARROW_CHECK_LE(offset, length_) << "Slice offset greater than array length";
+ bool offset_equals_length = offset == length_;
+ int curr_chunk = 0;
+ while (curr_chunk < num_chunks() && offset >= chunk(curr_chunk)->length()) {
+ offset -= chunk(curr_chunk)->length();
+ curr_chunk++;
+ }
+
+ ArrayVector new_chunks;
+ if (num_chunks() > 0 && (offset_equals_length || length == 0)) {
+    // Special-case the zero-length slice to make sure there is at least one
+    // Array in the result. When there are zero chunks, we return zero chunks.
+ new_chunks.push_back(chunk(std::min(curr_chunk, num_chunks() - 1))->Slice(0, 0));
+ } else {
+ while (curr_chunk < num_chunks() && length > 0) {
+ new_chunks.push_back(chunk(curr_chunk)->Slice(offset, length));
+ length -= chunk(curr_chunk)->length() - offset;
+ offset = 0;
+ curr_chunk++;
+ }
+ }
+
+ return std::make_shared<ChunkedArray>(new_chunks, type_);
+}
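+
+// For example, with chunk lengths {2, 3}, Slice(1, 3) yields two chunks of
+// lengths {1, 2}: the tail of the first chunk and the head of the second.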
+
+std::shared_ptr<ChunkedArray> ChunkedArray::Slice(int64_t offset) const {
+ return Slice(offset, length_);
+}
+
+Result<std::vector<std::shared_ptr<ChunkedArray>>> ChunkedArray::Flatten(
+ MemoryPool* pool) const {
+ if (type()->id() != Type::STRUCT) {
+ // Emulate nonexistent copy constructor
+ return std::vector<std::shared_ptr<ChunkedArray>>{
+ std::make_shared<ChunkedArray>(chunks_, type_)};
+ }
+
+ std::vector<ArrayVector> flattened_chunks(type()->num_fields());
+ for (const auto& chunk : chunks_) {
+ ARROW_ASSIGN_OR_RAISE(auto arrays,
+ checked_cast<const StructArray&>(*chunk).Flatten(pool));
+ DCHECK_EQ(arrays.size(), flattened_chunks.size());
+ for (size_t i = 0; i < arrays.size(); ++i) {
+ flattened_chunks[i].push_back(arrays[i]);
+ }
+ }
+
+ std::vector<std::shared_ptr<ChunkedArray>> flattened(type()->num_fields());
+ for (size_t i = 0; i < flattened.size(); ++i) {
+ auto child_type = type()->field(static_cast<int>(i))->type();
+ flattened[i] =
+ std::make_shared<ChunkedArray>(std::move(flattened_chunks[i]), child_type);
+ }
+ return flattened;
+}
+
+Result<std::shared_ptr<ChunkedArray>> ChunkedArray::View(
+ const std::shared_ptr<DataType>& type) const {
+ ArrayVector out_chunks(this->num_chunks());
+ for (int i = 0; i < this->num_chunks(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(out_chunks[i], chunks_[i]->View(type));
+ }
+ return std::make_shared<ChunkedArray>(out_chunks, type);
+}
+
+std::string ChunkedArray::ToString() const {
+ std::stringstream ss;
+ ARROW_CHECK_OK(PrettyPrint(*this, 0, &ss));
+ return ss.str();
+}
+
+Status ChunkedArray::Validate() const {
+ if (chunks_.size() == 0) {
+ return Status::OK();
+ }
+
+ const auto& type = *chunks_[0]->type();
+ // Make sure chunks all have the same type
+ for (size_t i = 1; i < chunks_.size(); ++i) {
+ const Array& chunk = *chunks_[i];
+ if (!chunk.type()->Equals(type)) {
+ return Status::Invalid("In chunk ", i, " expected type ", type.ToString(),
+ " but saw ", chunk.type()->ToString());
+ }
+ }
+ // Validate the chunks themselves
+ for (size_t i = 0; i < chunks_.size(); ++i) {
+ const Array& chunk = *chunks_[i];
+ const Status st = internal::ValidateArray(chunk);
+ if (!st.ok()) {
+ return Status::Invalid("In chunk ", i, ": ", st.ToString());
+ }
+ }
+ return Status::OK();
+}
+
+Status ChunkedArray::ValidateFull() const {
+ RETURN_NOT_OK(Validate());
+ for (size_t i = 0; i < chunks_.size(); ++i) {
+ const Array& chunk = *chunks_[i];
+ const Status st = internal::ValidateArrayFull(chunk);
+ if (!st.ok()) {
+ return Status::Invalid("In chunk ", i, ": ", st.ToString());
+ }
+ }
+ return Status::OK();
+}
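
A short sketch contrasting the two validation levels implemented above; the function name is hypothetical:

```cpp
#include <arrow/api.h>

arrow::Status CheckChunked(const arrow::ChunkedArray& chunked) {
  // Validate() checks structural invariants only (e.g. consistent chunk
  // types); its cost scales with the number of chunks.
  ARROW_RETURN_NOT_OK(chunked.Validate());
  // ValidateFull() additionally inspects the data itself; its cost scales
  // with the number of elements.
  return chunked.ValidateFull();
}
```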
+
+namespace internal {
+
+bool MultipleChunkIterator::Next(std::shared_ptr<Array>* next_left,
+ std::shared_ptr<Array>* next_right) {
+ if (pos_ == length_) return false;
+
+ // Find non-empty chunk
+ std::shared_ptr<Array> chunk_left, chunk_right;
+ while (true) {
+ chunk_left = left_.chunk(chunk_idx_left_);
+ chunk_right = right_.chunk(chunk_idx_right_);
+ if (chunk_pos_left_ == chunk_left->length()) {
+ chunk_pos_left_ = 0;
+ ++chunk_idx_left_;
+ continue;
+ }
+ if (chunk_pos_right_ == chunk_right->length()) {
+ chunk_pos_right_ = 0;
+ ++chunk_idx_right_;
+ continue;
+ }
+ break;
+ }
+ // Determine how big of a section to return
+ int64_t iteration_size = std::min(chunk_left->length() - chunk_pos_left_,
+ chunk_right->length() - chunk_pos_right_);
+
+ *next_left = chunk_left->Slice(chunk_pos_left_, iteration_size);
+ *next_right = chunk_right->Slice(chunk_pos_right_, iteration_size);
+
+ pos_ += iteration_size;
+ chunk_pos_left_ += iteration_size;
+ chunk_pos_right_ += iteration_size;
+ return true;
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h
new file mode 100644
index 00000000000..2ace045c2bf
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h
@@ -0,0 +1,252 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class DataType;
+class MemoryPool;
+
+/// \class ChunkedArray
+/// \brief A data structure managing a list of primitive Arrow arrays logically
+/// as one large array
+///
+/// Data chunking is treated throughout this project largely as an
+/// implementation detail for performance and memory use optimization.
+/// ChunkedArray allows Array objects to be collected and interpreted
+/// as a single logical array without requiring an expensive concatenation
+/// step.
+///
+/// In some cases, data produced by a function may exceed the capacity of an
+/// Array (like BinaryArray or StringArray) and so returning multiple Arrays is
+/// the only possibility. In these cases, we recommend returning a ChunkedArray
+/// instead of a vector of Arrays or some other alternative.
+///
+/// When data is processed in parallel, it may not be practical or possible to
+/// create large contiguous memory allocations and write output into them. With
+/// some data types, like binary and string types, it is not possible at all to
+/// produce non-chunked array outputs without requiring a concatenation step at
+/// the end of processing.
+///
+/// Application developers may tune chunk sizes based on analysis of
+/// performance profiles but many developer-users will not need to be
+/// especially concerned with the chunking details.
+///
+/// Preserving the chunk layout/sizes in processing steps is generally not
+/// considered to be a contract in APIs. A function may decide to alter the
+/// chunking of its result. Similarly, APIs accepting multiple ChunkedArray
+/// inputs should not expect the chunk layout to be the same in each input.
+class ARROW_EXPORT ChunkedArray {
+ public:
+ /// \brief Construct a chunked array from a vector of arrays
+ ///
+ /// The vector must be non-empty and all its elements must have the same
+ /// data type.
+ explicit ChunkedArray(ArrayVector chunks);
+
+ ChunkedArray(ChunkedArray&&) = default;
+ ChunkedArray& operator=(ChunkedArray&&) = default;
+
+ /// \brief Construct a chunked array from a single Array
+ explicit ChunkedArray(std::shared_ptr<Array> chunk)
+ : ChunkedArray(ArrayVector{std::move(chunk)}) {}
+
+ /// \brief Construct a chunked array from a vector of arrays and a data type
+ ///
+ /// As the data type is passed explicitly, the vector may be empty.
+ ChunkedArray(ArrayVector chunks, std::shared_ptr<DataType> type);
+
+  /// \brief Constructor with basic input validation.
+ static Result<std::shared_ptr<ChunkedArray>> Make(
+ ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR);
+
+ /// \return the total length of the chunked array; computed on construction
+ int64_t length() const { return length_; }
+
+ /// \return the total number of nulls among all chunks
+ int64_t null_count() const { return null_count_; }
+
+ int num_chunks() const { return static_cast<int>(chunks_.size()); }
+
+  /// \return a particular chunk from the chunked array
+ std::shared_ptr<Array> chunk(int i) const { return chunks_[i]; }
+
+ const ArrayVector& chunks() const { return chunks_; }
+
+ /// \brief Construct a zero-copy slice of the chunked array with the
+ /// indicated offset and length
+ ///
+ /// \param[in] offset the position of the first element in the constructed
+ /// slice
+ /// \param[in] length the length of the slice. If there are not enough
+ /// elements in the chunked array, the length will be adjusted accordingly
+ ///
+ /// \return a new object wrapped in std::shared_ptr<ChunkedArray>
+ std::shared_ptr<ChunkedArray> Slice(int64_t offset, int64_t length) const;
+
+ /// \brief Slice from offset until end of the chunked array
+ std::shared_ptr<ChunkedArray> Slice(int64_t offset) const;
+
+ /// \brief Flatten this chunked array as a vector of chunked arrays, one
+ /// for each struct field
+ ///
+ /// \param[in] pool The pool for buffer allocations, if any
+ Result<std::vector<std::shared_ptr<ChunkedArray>>> Flatten(
+ MemoryPool* pool = default_memory_pool()) const;
+
+ /// Construct a zero-copy view of this chunked array with the given
+ /// type. Calls Array::View on each constituent chunk. Always succeeds if
+  /// there are zero chunks.
+ Result<std::shared_ptr<ChunkedArray>> View(const std::shared_ptr<DataType>& type) const;
+
+ std::shared_ptr<DataType> type() const { return type_; }
+
+ /// \brief Determine if two chunked arrays are equal.
+ ///
+ /// Two chunked arrays can be equal only if they have equal datatypes.
+ /// However, they may be equal even if they have different chunkings.
+ bool Equals(const ChunkedArray& other) const;
+ /// \brief Determine if two chunked arrays are equal.
+ bool Equals(const std::shared_ptr<ChunkedArray>& other) const;
+  /// \brief Determine if two chunked arrays are approximately equal
+ bool ApproxEquals(const ChunkedArray& other,
+ const EqualOptions& = EqualOptions::Defaults()) const;
+
+ /// \return PrettyPrint representation suitable for debugging
+ std::string ToString() const;
+
+ /// \brief Perform cheap validation checks to determine obvious inconsistencies
+  /// within the chunked array's internal data.
+  ///
+  /// This is O(k*m) where k is the number of array descendants,
+ /// and m is the number of chunks.
+ ///
+ /// \return Status
+ Status Validate() const;
+
+ /// \brief Perform extensive validation checks to determine inconsistencies
+  /// within the chunked array's internal data.
+  ///
+  /// This is O(k*n) where k is the number of array descendants,
+ /// and n is the length in elements.
+ ///
+ /// \return Status
+ Status ValidateFull() const;
+
+ protected:
+ ArrayVector chunks_;
+ int64_t length_;
+ int64_t null_count_;
+ std::shared_ptr<DataType> type_;
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray);
+};
+
+namespace internal {
+
+/// \brief EXPERIMENTAL: Utility for incremental iteration over contiguous
+/// pieces of potentially differently-chunked ChunkedArray objects
+class ARROW_EXPORT MultipleChunkIterator {
+ public:
+ MultipleChunkIterator(const ChunkedArray& left, const ChunkedArray& right)
+ : left_(left),
+ right_(right),
+ pos_(0),
+ length_(left.length()),
+ chunk_idx_left_(0),
+ chunk_idx_right_(0),
+ chunk_pos_left_(0),
+ chunk_pos_right_(0) {}
+
+ bool Next(std::shared_ptr<Array>* next_left, std::shared_ptr<Array>* next_right);
+
+ int64_t position() const { return pos_; }
+
+ private:
+ const ChunkedArray& left_;
+ const ChunkedArray& right_;
+
+ // The amount of the entire ChunkedArray consumed
+ int64_t pos_;
+
+ // Length of the chunked array(s)
+ int64_t length_;
+
+ // Current left chunk
+ int chunk_idx_left_;
+
+ // Current right chunk
+ int chunk_idx_right_;
+
+ // Offset into the current left chunk
+ int64_t chunk_pos_left_;
+
+ // Offset into the current right chunk
+ int64_t chunk_pos_right_;
+};
+
+/// \brief Evaluate binary function on two ChunkedArray objects having possibly
+/// different chunk layouts. The passed binary function / functor should have
+/// the following signature.
+///
+/// Status(const Array&, const Array&, int64_t)
+///
+/// The third argument is the absolute position relative to the start of each
+/// ChunkedArray. The function is executed against each contiguous pair of
+/// array segments, slicing if necessary.
+///
+/// For example, if two arrays have chunk sizes
+///
+/// left: [10, 10, 20]
+/// right: [15, 10, 15]
+///
+/// Then the following invocations take place (pseudocode)
+///
+/// func(left.chunk[0][0:10], right.chunk[0][0:10], 0)
+/// func(left.chunk[1][0:5], right.chunk[0][10:15], 10)
+/// func(left.chunk[1][5:10], right.chunk[1][0:5], 15)
+/// func(left.chunk[2][0:5], right.chunk[1][5:10], 20)
+/// func(left.chunk[2][5:20], right.chunk[2][:], 25)
+template <typename Action>
+Status ApplyBinaryChunked(const ChunkedArray& left, const ChunkedArray& right,
+ Action&& action) {
+ MultipleChunkIterator iterator(left, right);
+ std::shared_ptr<Array> left_piece, right_piece;
+ while (iterator.Next(&left_piece, &right_piece)) {
+ ARROW_RETURN_NOT_OK(action(*left_piece, *right_piece, iterator.position()));
+ }
+ return Status::OK();
+}
+
+} // namespace internal
+} // namespace arrow
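
A usage sketch for the internal ApplyBinaryChunked helper documented above; PiecewiseEquals is a hypothetical caller and assumes the two inputs have equal length:

```cpp
#include <arrow/api.h>
#include <arrow/chunked_array.h>

arrow::Status PiecewiseEquals(const arrow::ChunkedArray& left,
                              const arrow::ChunkedArray& right) {
  return arrow::internal::ApplyBinaryChunked(
      left, right,
      [](const arrow::Array& l, const arrow::Array& r, int64_t position) {
        // `position` is the absolute offset of this pair of slices within
        // the chunked arrays, as described in the doc comment above.
        if (!l.Equals(r)) {
          return arrow::Status::Invalid("pieces differ at position ", position);
        }
        return arrow::Status::OK();
      });
}
```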
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc
new file mode 100644
index 00000000000..4c6f97faf95
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc
@@ -0,0 +1,1304 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for comparing Arrow data structures
+
+#include "arrow/compare.h"
+
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/diff.h"
+#include "arrow/buffer.h"
+#include "arrow/scalar.h"
+#include "arrow/sparse_tensor.h"
+#include "arrow/status.h"
+#include "arrow/tensor.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/memory.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
+using internal::checked_cast;
+using internal::OptionalBitmapEquals;
+
+// ----------------------------------------------------------------------
+// Public method implementations
+
+namespace {
+
+// TODO also handle HALF_FLOAT NaNs
+
+enum FloatingEqualityFlags : int8_t { Approximate = 1, NansEqual = 2 };
+
+template <typename T, int8_t Flags>
+struct FloatingEquality {
+ bool operator()(T x, T y) { return x == y; }
+};
+
+template <typename T>
+struct FloatingEquality<T, NansEqual> {
+ bool operator()(T x, T y) { return (x == y) || (std::isnan(x) && std::isnan(y)); }
+};
+
+template <typename T>
+struct FloatingEquality<T, Approximate> {
+ explicit FloatingEquality(const EqualOptions& options)
+ : epsilon(static_cast<T>(options.atol())) {}
+
+ bool operator()(T x, T y) { return (fabs(x - y) <= epsilon) || (x == y); }
+
+ const T epsilon;
+};
+
+template <typename T>
+struct FloatingEquality<T, Approximate | NansEqual> {
+ explicit FloatingEquality(const EqualOptions& options)
+ : epsilon(static_cast<T>(options.atol())) {}
+
+ bool operator()(T x, T y) {
+ return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
+ }
+
+ const T epsilon;
+};
+
+template <typename T, typename Visitor>
+void VisitFloatingEquality(const EqualOptions& options, bool floating_approximate,
+ Visitor&& visit) {
+ if (options.nans_equal()) {
+ if (floating_approximate) {
+ visit(FloatingEquality<T, NansEqual | Approximate>{options});
+ } else {
+ visit(FloatingEquality<T, NansEqual>{});
+ }
+ } else {
+ if (floating_approximate) {
+ visit(FloatingEquality<T, Approximate>{options});
+ } else {
+ visit(FloatingEquality<T, 0>{});
+ }
+ }
+}
+
+inline bool IdentityImpliesEqualityNansNotEqual(const DataType& type) {
+ if (type.id() == Type::FLOAT || type.id() == Type::DOUBLE) {
+ return false;
+ }
+ for (const auto& child : type.fields()) {
+ if (!IdentityImpliesEqualityNansNotEqual(*child->type())) {
+ return false;
+ }
+ }
+ return true;
+}
+
+inline bool IdentityImpliesEquality(const DataType& type, const EqualOptions& options) {
+ if (options.nans_equal()) {
+ return true;
+ }
+ return IdentityImpliesEqualityNansNotEqual(type);
+}
+
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
+
+class RangeDataEqualsImpl {
+ public:
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
+ left_start_idx_(left_start_idx),
+ right_start_idx_(right_start_idx),
+ range_length_(range_length),
+ result_(false) {}
+
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
+ }
+ }
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
+ }
+
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
+ }
+ return result_;
+ }
+
+ Status Visit(const NullType&) { return Status::OK(); }
+
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
+ }
+
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
+ }
+
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
+ }
+ return true;
+ } else if (length <= 1024) {
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
+ }
+ DCHECK_EQ(right_reader.position(), length);
+ } else {
+ // BitmapEquals is the fastest method on large runs
+ return BitmapEquals(left_bits, left_start_idx_ + left_.offset + i, right_bits,
+ right_start_idx_ + right_.offset + i, length);
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
+ return Status::OK();
+ }
+
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
+
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
+
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
+
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
+
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
+
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
+ }
+ return Status::OK();
+ }
+
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
+
+ Status Visit(const LargeListType& type) { return CompareList(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
+
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
+
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
+ return Status::OK();
+ }
+
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+
+ // Unions don't have a null bitmap
+ for (int64_t i = 0; i < range_length_; ++i) {
+ const auto type_id = left_codes[left_start_idx_ + i];
+ if (type_id != right_codes[right_start_idx_ + i]) {
+ result_ = false;
+ break;
+ }
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, 1);
+ if (!impl.Compare()) {
+ result_ = false;
+ break;
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DenseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+ const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+ for (int64_t i = 0; i < range_length_; ++i) {
+ const auto type_id = left_codes[left_start_idx_ + i];
+ if (type_id != right_codes[right_start_idx_ + i]) {
+ result_ = false;
+ break;
+ }
+ const auto child_num = child_ids[type_id];
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_offsets[left_start_idx_ + i],
+ right_offsets[right_start_idx_ + i], 1);
+ if (!impl.Compare()) {
+ result_ = false;
+ break;
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& type) {
+ // Compare dictionaries
+ result_ &= CompareArrayRanges(
+ *left_.dictionary, *right_.dictionary,
+ /*left_start_idx=*/0,
+ /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
+ /*right_start_idx=*/0, options_, floating_approximate_);
+ if (result_) {
+ // Compare indices
+ result_ &= CompareWithType(*type.index_type());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionType& type) {
+ // Compare storages
+ result_ &= CompareWithType(*type.storage_type());
+ return Status::OK();
+ }
+
+ protected:
+ // For CompareFloating (templated local classes or lambdas not supported in C++11)
+ template <typename CType>
+ struct ComparatorVisitor {
+ RangeDataEqualsImpl* impl;
+ const CType* left_values;
+ const CType* right_values;
+
+ template <typename CompareFunction>
+ void operator()(CompareFunction&& compare) {
+ impl->VisitValues([&](int64_t i) {
+ const CType x = left_values[i + impl->left_start_idx_];
+ const CType y = right_values[i + impl->right_start_idx_];
+ return compare(x, y);
+ });
+ }
+ };
+
+ template <typename CType>
+ friend struct ComparatorVisitor;
+
+ template <typename TypeClass, typename CType = typename TypeClass::c_type>
+ Status ComparePrimitive(const TypeClass&) {
+ const CType* left_values = left_.GetValues<CType>(1);
+ const CType* right_values = right_.GetValues<CType>(1);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ return memcmp(left_values + left_start_idx_ + i,
+ right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
+ });
+ return Status::OK();
+ }
+
+ template <typename TypeClass>
+ Status CompareFloating(const TypeClass&) {
+ using CType = typename TypeClass::c_type;
+ const CType* left_values = left_.GetValues<CType>(1);
+ const CType* right_values = right_.GetValues<CType>(1);
+
+ ComparatorVisitor<CType> visitor{this, left_values, right_values};
+ VisitFloatingEquality<CType>(options_, floating_approximate_, visitor);
+ return Status::OK();
+ }
+
+ template <typename TypeClass>
+ Status CompareBinary(const TypeClass&) {
+ const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
+
+ if (left_data != nullptr && right_data != nullptr) {
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool {
+ return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
+ };
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
+ } else {
+ // One of the arrays is an array of empty strings and nulls.
+ // We just need to compare the offsets.
+ // (note we must not call memcmp() with null data pointers)
+ CompareWithOffsets<typename TypeClass::offset_type>(1, [](...) { return true; });
+ }
+ return Status::OK();
+ }
+
+ template <typename TypeClass>
+ Status CompareList(const TypeClass&) {
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
+
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ left_offset, right_offset, length);
+ return impl.Compare();
+ };
+
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
+ return Status::OK();
+ }
+
+ template <typename offset_type, typename CompareRanges>
+ void CompareWithOffsets(int offsets_buffer_index, CompareRanges&& compare_ranges) {
+ const offset_type* left_offsets =
+ left_.GetValues<offset_type>(offsets_buffer_index) + left_start_idx_;
+ const offset_type* right_offsets =
+ right_.GetValues<offset_type>(offsets_buffer_index) + right_start_idx_;
+
+ const auto compare_runs = [&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ if (left_offsets[j + 1] - left_offsets[j] !=
+ right_offsets[j + 1] - right_offsets[j]) {
+ return false;
+ }
+ }
+ if (!compare_ranges(left_offsets[i], right_offsets[i],
+ left_offsets[i + length] - left_offsets[i])) {
+ return false;
+ }
+ return true;
+ };
+
+ VisitValidRuns(compare_runs);
+ }
+
+ template <typename CompareValues>
+ void VisitValues(CompareValues&& compare_values) {
+ internal::VisitSetBitRunsVoid(left_.buffers[0], left_.offset + left_start_idx_,
+ range_length_, [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; ++i) {
+ result_ &= compare_values(position + i);
+ }
+ });
+ }
+
+ // Visit and compare runs of non-null values
+ template <typename CompareRuns>
+ void VisitValidRuns(CompareRuns&& compare_runs) {
+ const uint8_t* left_null_bitmap = left_.GetValues<uint8_t>(0, 0);
+ if (left_null_bitmap == nullptr) {
+ result_ = compare_runs(0, range_length_);
+ return;
+ }
+ internal::SetBitRunReader reader(left_null_bitmap, left_.offset + left_start_idx_,
+ range_length_);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ return;
+ }
+ if (!compare_runs(run.position, run.length)) {
+ result_ = false;
+ return;
+ }
+ }
+ }
+
+ const EqualOptions& options_;
+ const bool floating_approximate_;
+ const ArrayData& left_;
+ const ArrayData& right_;
+ const int64_t left_start_idx_;
+ const int64_t right_start_idx_;
+ const int64_t range_length_;
+
+ bool result_;
+};
+
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate) {
+ if (left.type->id() != right.type->id() ||
+ !TypeEquals(*left.type, *right.type, false /* check_metadata */)) {
+ return false;
+ }
+
+ const int64_t range_length = left_end_idx - left_start_idx;
+ DCHECK_GE(range_length, 0);
+ if (left_start_idx + range_length > left.length) {
+ // Left range too small
+ return false;
+ }
+ if (right_start_idx + range_length > right.length) {
+ // Right range too small
+ return false;
+ }
+ if (&left == &right && left_start_idx == right_start_idx &&
+ IdentityImpliesEquality(*left.type, options)) {
+ return true;
+ }
+ // Compare values
+ RangeDataEqualsImpl impl(options, floating_approximate, left, right, left_start_idx,
+ right_start_idx, range_length);
+ return impl.Compare();
+}
+
+class TypeEqualsVisitor {
+ public:
+ explicit TypeEqualsVisitor(const DataType& right, bool check_metadata)
+ : right_(right), check_metadata_(check_metadata), result_(false) {}
+
+ Status VisitChildren(const DataType& left) {
+ if (left.num_fields() != right_.num_fields()) {
+ result_ = false;
+ return Status::OK();
+ }
+
+ for (int i = 0; i < left.num_fields(); ++i) {
+ if (!left.field(i)->Equals(right_.field(i), check_metadata_)) {
+ result_ = false;
+ return Status::OK();
+ }
+ }
+ result_ = true;
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<is_null_type<T>::value || is_primitive_ctype<T>::value ||
+ is_base_binary_type<T>::value,
+ Status>
+ Visit(const T&) {
+ result_ = true;
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_interval<T, Status> Visit(const T& left) {
+ const auto& right = checked_cast<const IntervalType&>(right_);
+ result_ = right.interval_type() == left.interval_type();
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<is_time_type<T>::value || is_date_type<T>::value ||
+ is_duration_type<T>::value,
+ Status>
+ Visit(const T& left) {
+ const auto& right = checked_cast<const T&>(right_);
+ result_ = left.unit() == right.unit();
+ return Status::OK();
+ }
+
+ Status Visit(const TimestampType& left) {
+ const auto& right = checked_cast<const TimestampType&>(right_);
+ result_ = left.unit() == right.unit() && left.timezone() == right.timezone();
+ return Status::OK();
+ }
+
+ Status Visit(const FixedSizeBinaryType& left) {
+ const auto& right = checked_cast<const FixedSizeBinaryType&>(right_);
+ result_ = left.byte_width() == right.byte_width();
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal128Type& left) {
+ const auto& right = checked_cast<const Decimal128Type&>(right_);
+ result_ = left.precision() == right.precision() && left.scale() == right.scale();
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal256Type& left) {
+ const auto& right = checked_cast<const Decimal256Type&>(right_);
+ result_ = left.precision() == right.precision() && left.scale() == right.scale();
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<is_list_like_type<T>::value || is_struct_type<T>::value, Status> Visit(
+ const T& left) {
+ return VisitChildren(left);
+ }
+
+ Status Visit(const MapType& left) {
+ const auto& right = checked_cast<const MapType&>(right_);
+ if (left.keys_sorted() != right.keys_sorted()) {
+ result_ = false;
+ return Status::OK();
+ }
+ result_ = left.key_type()->Equals(*right.key_type(), check_metadata_) &&
+ left.item_type()->Equals(*right.item_type(), check_metadata_);
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& left) {
+ const auto& right = checked_cast<const UnionType&>(right_);
+
+ if (left.mode() != right.mode() || left.type_codes() != right.type_codes()) {
+ result_ = false;
+ return Status::OK();
+ }
+
+ result_ = std::equal(
+ left.fields().begin(), left.fields().end(), right.fields().begin(),
+ [this](const std::shared_ptr<Field>& l, const std::shared_ptr<Field>& r) {
+ return l->Equals(r, check_metadata_);
+ });
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& left) {
+ const auto& right = checked_cast<const DictionaryType&>(right_);
+ result_ = left.index_type()->Equals(right.index_type()) &&
+ left.value_type()->Equals(right.value_type()) &&
+ (left.ordered() == right.ordered());
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionType& left) {
+ result_ = left.ExtensionEquals(static_cast<const ExtensionType&>(right_));
+ return Status::OK();
+ }
+
+ bool result() const { return result_; }
+
+ protected:
+ const DataType& right_;
+ bool check_metadata_;
+ bool result_;
+};
+
+bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts,
+ bool floating_approximate);
+bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options,
+ bool floating_approximate);
+
+class ScalarEqualsVisitor {
+ public:
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the scalars are non-null
+ explicit ScalarEqualsVisitor(const Scalar& right, const EqualOptions& opts,
+ bool floating_approximate)
+ : right_(right),
+ options_(opts),
+ floating_approximate_(floating_approximate),
+ result_(false) {}
+
+ Status Visit(const NullScalar& left) {
+ result_ = true;
+ return Status::OK();
+ }
+
+ Status Visit(const BooleanScalar& left) {
+ const auto& right = checked_cast<const BooleanScalar&>(right_);
+ result_ = left.value == right.value;
+ return Status::OK();
+ }
+
+ template <typename T>
+ typename std::enable_if<(is_primitive_ctype<typename T::TypeClass>::value ||
+ is_temporal_type<typename T::TypeClass>::value),
+ Status>::type
+ Visit(const T& left_) {
+ const auto& right = checked_cast<const T&>(right_);
+ result_ = right.value == left_.value;
+ return Status::OK();
+ }
+
+ Status Visit(const FloatScalar& left) { return CompareFloating(left); }
+
+ Status Visit(const DoubleScalar& left) { return CompareFloating(left); }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<BaseBinaryScalar, T>::value, Status>::type
+ Visit(const T& left) {
+ const auto& right = checked_cast<const BaseBinaryScalar&>(right_);
+ result_ = internal::SharedPtrEquals(left.value, right.value);
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal128Scalar& left) {
+ const auto& right = checked_cast<const Decimal128Scalar&>(right_);
+ result_ = left.value == right.value;
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal256Scalar& left) {
+ const auto& right = checked_cast<const Decimal256Scalar&>(right_);
+ result_ = left.value == right.value;
+ return Status::OK();
+ }
+
+ Status Visit(const ListScalar& left) {
+ const auto& right = checked_cast<const ListScalar&>(right_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ return Status::OK();
+ }
+
+ Status Visit(const LargeListScalar& left) {
+ const auto& right = checked_cast<const LargeListScalar&>(right_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ return Status::OK();
+ }
+
+ Status Visit(const MapScalar& left) {
+ const auto& right = checked_cast<const MapScalar&>(right_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ return Status::OK();
+ }
+
+ Status Visit(const FixedSizeListScalar& left) {
+ const auto& right = checked_cast<const FixedSizeListScalar&>(right_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ return Status::OK();
+ }
+
+ Status Visit(const StructScalar& left) {
+ const auto& right = checked_cast<const StructScalar&>(right_);
+
+ if (right.value.size() != left.value.size()) {
+ result_ = false;
+ } else {
+ bool all_equals = true;
+ for (size_t i = 0; i < left.value.size() && all_equals; i++) {
+ all_equals &= ScalarEquals(*left.value[i], *right.value[i], options_,
+ floating_approximate_);
+ }
+ result_ = all_equals;
+ }
+
+ return Status::OK();
+ }
+
+ Status Visit(const UnionScalar& left) {
+ const auto& right = checked_cast<const UnionScalar&>(right_);
+ if (left.is_valid && right.is_valid) {
+ result_ = ScalarEquals(*left.value, *right.value, options_, floating_approximate_);
+ } else if (!left.is_valid && !right.is_valid) {
+ result_ = true;
+ } else {
+ result_ = false;
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryScalar& left) {
+ const auto& right = checked_cast<const DictionaryScalar&>(right_);
+ result_ = ScalarEquals(*left.value.index, *right.value.index, options_,
+ floating_approximate_) &&
+ ArrayEquals(*left.value.dictionary, *right.value.dictionary, options_,
+ floating_approximate_);
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionScalar& left) {
+ return Status::NotImplemented("extension");
+ }
+
+ bool result() const { return result_; }
+
+ protected:
+ // For CompareFloating (templated local classes or lambdas not supported in C++11)
+ template <typename ScalarType>
+ struct ComparatorVisitor {
+ const ScalarType& left;
+ const ScalarType& right;
+ bool* result;
+
+ template <typename CompareFunction>
+ void operator()(CompareFunction&& compare) {
+ *result = compare(left.value, right.value);
+ }
+ };
+
+ template <typename ScalarType>
+ Status CompareFloating(const ScalarType& left) {
+ using CType = decltype(left.value);
+
+ ComparatorVisitor<ScalarType> visitor{left, checked_cast<const ScalarType&>(right_),
+ &result_};
+ VisitFloatingEquality<CType>(options_, floating_approximate_, visitor);
+ return Status::OK();
+ }
+
+ const Scalar& right_;
+ const EqualOptions options_;
+ const bool floating_approximate_;
+ bool result_;
+};
+
+Status PrintDiff(const Array& left, const Array& right, std::ostream* os);
+
+Status PrintDiff(const Array& left, const Array& right, int64_t left_offset,
+ int64_t left_length, int64_t right_offset, int64_t right_length,
+ std::ostream* os) {
+ if (os == nullptr) {
+ return Status::OK();
+ }
+
+ if (!left.type()->Equals(right.type())) {
+ *os << "# Array types differed: " << *left.type() << " vs " << *right.type()
+ << std::endl;
+ return Status::OK();
+ }
+
+ if (left.type()->id() == Type::DICTIONARY) {
+ *os << "# Dictionary arrays differed" << std::endl;
+
+ const auto& left_dict = checked_cast<const DictionaryArray&>(left);
+ const auto& right_dict = checked_cast<const DictionaryArray&>(right);
+
+ *os << "## dictionary diff";
+ auto pos = os->tellp();
+ RETURN_NOT_OK(PrintDiff(*left_dict.dictionary(), *right_dict.dictionary(), os));
+ if (os->tellp() == pos) {
+ *os << std::endl;
+ }
+
+ *os << "## indices diff";
+ pos = os->tellp();
+ RETURN_NOT_OK(PrintDiff(*left_dict.indices(), *right_dict.indices(), os));
+ if (os->tellp() == pos) {
+ *os << std::endl;
+ }
+ return Status::OK();
+ }
+
+ const auto left_slice = left.Slice(left_offset, left_length);
+ const auto right_slice = right.Slice(right_offset, right_length);
+ ARROW_ASSIGN_OR_RAISE(auto edits,
+ Diff(*left_slice, *right_slice, default_memory_pool()));
+ ARROW_ASSIGN_OR_RAISE(auto formatter, MakeUnifiedDiffFormatter(*left.type(), os));
+ return formatter(*edits, *left_slice, *right_slice);
+}
+
+Status PrintDiff(const Array& left, const Array& right, std::ostream* os) {
+ return PrintDiff(left, right, 0, left.length(), 0, right.length(), os);
+}
+
+bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
+ int64_t left_end_idx, int64_t right_start_idx,
+ const EqualOptions& options, bool floating_approximate) {
+ bool are_equal =
+ CompareArrayRanges(*left.data(), *right.data(), left_start_idx, left_end_idx,
+ right_start_idx, options, floating_approximate);
+ if (!are_equal) {
+ ARROW_IGNORE_EXPR(PrintDiff(
+ left, right, left_start_idx, left_end_idx, right_start_idx,
+ right_start_idx + (left_end_idx - left_start_idx), options.diff_sink()));
+ }
+ return are_equal;
+}
+
+bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts,
+ bool floating_approximate) {
+ if (left.length() != right.length()) {
+ ARROW_IGNORE_EXPR(PrintDiff(left, right, opts.diff_sink()));
+ return false;
+ }
+ return ArrayRangeEquals(left, right, 0, left.length(), 0, opts, floating_approximate);
+}
+
+bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options,
+ bool floating_approximate) {
+ if (&left == &right && IdentityImpliesEquality(*left.type, options)) {
+ return true;
+ }
+ if (!left.type->Equals(right.type)) {
+ return false;
+ }
+ if (left.is_valid != right.is_valid) {
+ return false;
+ }
+ if (!left.is_valid) {
+ return true;
+ }
+ ScalarEqualsVisitor visitor(right, options, floating_approximate);
+ auto error = VisitScalarInline(left, &visitor);
+ DCHECK_OK(error);
+ return visitor.result();
+}
+
+} // namespace
+
+bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
+ int64_t left_end_idx, int64_t right_start_idx,
+ const EqualOptions& options) {
+ const bool floating_approximate = false;
+ return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx,
+ options, floating_approximate);
+}
+
+bool ArrayRangeApproxEquals(const Array& left, const Array& right, int64_t left_start_idx,
+ int64_t left_end_idx, int64_t right_start_idx,
+ const EqualOptions& options) {
+ const bool floating_approximate = true;
+ return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx,
+ options, floating_approximate);
+}
+
+bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts) {
+ const bool floating_approximate = false;
+ return ArrayEquals(left, right, opts, floating_approximate);
+}
+
+bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions& opts) {
+ const bool floating_approximate = true;
+ return ArrayEquals(left, right, opts, floating_approximate);
+}
+
+bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options) {
+ const bool floating_approximate = false;
+ return ScalarEquals(left, right, options, floating_approximate);
+}
+
+bool ScalarApproxEquals(const Scalar& left, const Scalar& right,
+ const EqualOptions& options) {
+ const bool floating_approximate = true;
+ return ScalarEquals(left, right, options, floating_approximate);
+}
+
+namespace {
+
+bool StridedIntegerTensorContentEquals(const int dim_index, int64_t left_offset,
+ int64_t right_offset, int elem_size,
+ const Tensor& left, const Tensor& right) {
+ const auto n = left.shape()[dim_index];
+ const auto left_stride = left.strides()[dim_index];
+ const auto right_stride = right.strides()[dim_index];
+ if (dim_index == left.ndim() - 1) {
+ for (int64_t i = 0; i < n; ++i) {
+ if (memcmp(left.raw_data() + left_offset + i * left_stride,
+ right.raw_data() + right_offset + i * right_stride, elem_size) != 0) {
+ return false;
+ }
+ }
+ return true;
+ }
+ for (int64_t i = 0; i < n; ++i) {
+ if (!StridedIntegerTensorContentEquals(dim_index + 1, left_offset, right_offset,
+ elem_size, left, right)) {
+ return false;
+ }
+ left_offset += left_stride;
+ right_offset += right_stride;
+ }
+ return true;
+}
+
+bool IntegerTensorEquals(const Tensor& left, const Tensor& right) {
+ bool are_equal;
+  // The tensors are the same object
+ if (&left == &right) {
+ are_equal = true;
+ } else {
+ const bool left_row_major_p = left.is_row_major();
+ const bool left_column_major_p = left.is_column_major();
+ const bool right_row_major_p = right.is_row_major();
+ const bool right_column_major_p = right.is_column_major();
+
+ if (!(left_row_major_p && right_row_major_p) &&
+ !(left_column_major_p && right_column_major_p)) {
+ const auto& type = checked_cast<const FixedWidthType&>(*left.type());
+ are_equal = StridedIntegerTensorContentEquals(0, 0, 0, internal::GetByteWidth(type),
+ left, right);
+ } else {
+ const int byte_width = internal::GetByteWidth(*left.type());
+ DCHECK_GT(byte_width, 0);
+
+ const uint8_t* left_data = left.data()->data();
+ const uint8_t* right_data = right.data()->data();
+
+ are_equal = memcmp(left_data, right_data,
+ static_cast<size_t>(byte_width * left.size())) == 0;
+ }
+ }
+ return are_equal;
+}
+
+template <typename DataType>
+bool StridedFloatTensorContentEquals(const int dim_index, int64_t left_offset,
+ int64_t right_offset, const Tensor& left,
+ const Tensor& right, const EqualOptions& opts) {
+ using c_type = typename DataType::c_type;
+ static_assert(std::is_floating_point<c_type>::value,
+ "DataType must be a floating point type");
+
+ const auto n = left.shape()[dim_index];
+ const auto left_stride = left.strides()[dim_index];
+ const auto right_stride = right.strides()[dim_index];
+ if (dim_index == left.ndim() - 1) {
+ auto left_data = left.raw_data();
+ auto right_data = right.raw_data();
+ if (opts.nans_equal()) {
+ for (int64_t i = 0; i < n; ++i) {
+ c_type left_value =
+ *reinterpret_cast<const c_type*>(left_data + left_offset + i * left_stride);
+ c_type right_value = *reinterpret_cast<const c_type*>(right_data + right_offset +
+ i * right_stride);
+ if (left_value != right_value &&
+ !(std::isnan(left_value) && std::isnan(right_value))) {
+ return false;
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < n; ++i) {
+ c_type left_value =
+ *reinterpret_cast<const c_type*>(left_data + left_offset + i * left_stride);
+ c_type right_value = *reinterpret_cast<const c_type*>(right_data + right_offset +
+ i * right_stride);
+ if (left_value != right_value) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+ for (int64_t i = 0; i < n; ++i) {
+ if (!StridedFloatTensorContentEquals<DataType>(dim_index + 1, left_offset,
+ right_offset, left, right, opts)) {
+ return false;
+ }
+ left_offset += left_stride;
+ right_offset += right_stride;
+ }
+ return true;
+}
+
+template <typename DataType>
+bool FloatTensorEquals(const Tensor& left, const Tensor& right,
+ const EqualOptions& opts) {
+ return StridedFloatTensorContentEquals<DataType>(0, 0, 0, left, right, opts);
+}
+
+} // namespace
+
+bool TensorEquals(const Tensor& left, const Tensor& right, const EqualOptions& opts) {
+ if (left.type_id() != right.type_id()) {
+ return false;
+ } else if (left.size() == 0 && right.size() == 0) {
+ return true;
+ } else if (left.shape() != right.shape()) {
+ return false;
+ }
+
+ switch (left.type_id()) {
+ // TODO: Support half-float tensors
+ // case Type::HALF_FLOAT:
+ case Type::FLOAT:
+ return FloatTensorEquals<FloatType>(left, right, opts);
+
+ case Type::DOUBLE:
+ return FloatTensorEquals<DoubleType>(left, right, opts);
+
+ default:
+ return IntegerTensorEquals(left, right);
+ }
+}
+
+namespace {
+
+template <typename LeftSparseIndexType, typename RightSparseIndexType>
+struct SparseTensorEqualsImpl {
+ static bool Compare(const SparseTensorImpl<LeftSparseIndexType>& left,
+ const SparseTensorImpl<RightSparseIndexType>& right,
+ const EqualOptions&) {
+ // TODO(mrkn): should we support the equality among different formats?
+ return false;
+ }
+};
+
+bool IntegerSparseTensorDataEquals(const uint8_t* left_data, const uint8_t* right_data,
+ const int byte_width, const int64_t length) {
+ if (left_data == right_data) {
+ return true;
+ }
+ return memcmp(left_data, right_data, static_cast<size_t>(byte_width * length)) == 0;
+}
+
+template <typename DataType>
+bool FloatSparseTensorDataEquals(const typename DataType::c_type* left_data,
+ const typename DataType::c_type* right_data,
+ const int64_t length, const EqualOptions& opts) {
+ using c_type = typename DataType::c_type;
+ static_assert(std::is_floating_point<c_type>::value,
+ "DataType must be a floating point type");
+ if (opts.nans_equal()) {
+ if (left_data == right_data) {
+ return true;
+ }
+
+ for (int64_t i = 0; i < length; ++i) {
+ const auto left = left_data[i];
+ const auto right = right_data[i];
+ if (left != right && !(std::isnan(left) && std::isnan(right))) {
+ return false;
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < length; ++i) {
+ if (left_data[i] != right_data[i]) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+template <typename SparseIndexType>
+struct SparseTensorEqualsImpl<SparseIndexType, SparseIndexType> {
+ static bool Compare(const SparseTensorImpl<SparseIndexType>& left,
+ const SparseTensorImpl<SparseIndexType>& right,
+ const EqualOptions& opts) {
+ DCHECK(left.type()->id() == right.type()->id());
+ DCHECK(left.shape() == right.shape());
+
+ const auto length = left.non_zero_length();
+ DCHECK(length == right.non_zero_length());
+
+ const auto& left_index = checked_cast<const SparseIndexType&>(*left.sparse_index());
+ const auto& right_index = checked_cast<const SparseIndexType&>(*right.sparse_index());
+
+ if (!left_index.Equals(right_index)) {
+ return false;
+ }
+
+ const int byte_width = internal::GetByteWidth(*left.type());
+ DCHECK_GT(byte_width, 0);
+
+ const uint8_t* left_data = left.data()->data();
+ const uint8_t* right_data = right.data()->data();
+ switch (left.type()->id()) {
+ // TODO: Support half-float tensors
+ // case Type::HALF_FLOAT:
+ case Type::FLOAT:
+ return FloatSparseTensorDataEquals<FloatType>(
+ reinterpret_cast<const float*>(left_data),
+ reinterpret_cast<const float*>(right_data), length, opts);
+
+ case Type::DOUBLE:
+ return FloatSparseTensorDataEquals<DoubleType>(
+ reinterpret_cast<const double*>(left_data),
+ reinterpret_cast<const double*>(right_data), length, opts);
+
+ default: // Integer cases
+ return IntegerSparseTensorDataEquals(left_data, right_data, byte_width, length);
+ }
+ }
+};
+
+template <typename SparseIndexType>
+inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl<SparseIndexType>& left,
+ const SparseTensor& right,
+ const EqualOptions& opts) {
+ switch (right.format_id()) {
+ case SparseTensorFormat::COO: {
+ const auto& right_coo =
+ checked_cast<const SparseTensorImpl<SparseCOOIndex>&>(right);
+ return SparseTensorEqualsImpl<SparseIndexType, SparseCOOIndex>::Compare(
+ left, right_coo, opts);
+ }
+
+ case SparseTensorFormat::CSR: {
+ const auto& right_csr =
+ checked_cast<const SparseTensorImpl<SparseCSRIndex>&>(right);
+ return SparseTensorEqualsImpl<SparseIndexType, SparseCSRIndex>::Compare(
+ left, right_csr, opts);
+ }
+
+ case SparseTensorFormat::CSC: {
+ const auto& right_csc =
+ checked_cast<const SparseTensorImpl<SparseCSCIndex>&>(right);
+ return SparseTensorEqualsImpl<SparseIndexType, SparseCSCIndex>::Compare(
+ left, right_csc, opts);
+ }
+
+ case SparseTensorFormat::CSF: {
+ const auto& right_csf =
+ checked_cast<const SparseTensorImpl<SparseCSFIndex>&>(right);
+ return SparseTensorEqualsImpl<SparseIndexType, SparseCSFIndex>::Compare(
+ left, right_csf, opts);
+ }
+
+ default:
+ return false;
+ }
+}
+
+} // namespace
+
+bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right,
+ const EqualOptions& opts) {
+ if (left.type()->id() != right.type()->id()) {
+ return false;
+ } else if (left.size() == 0 && right.size() == 0) {
+ return true;
+ } else if (left.shape() != right.shape()) {
+ return false;
+ } else if (left.non_zero_length() != right.non_zero_length()) {
+ return false;
+ }
+
+ switch (left.format_id()) {
+ case SparseTensorFormat::COO: {
+ const auto& left_coo = checked_cast<const SparseTensorImpl<SparseCOOIndex>&>(left);
+ return SparseTensorEqualsImplDispatch(left_coo, right, opts);
+ }
+
+ case SparseTensorFormat::CSR: {
+ const auto& left_csr = checked_cast<const SparseTensorImpl<SparseCSRIndex>&>(left);
+ return SparseTensorEqualsImplDispatch(left_csr, right, opts);
+ }
+
+ case SparseTensorFormat::CSC: {
+ const auto& left_csc = checked_cast<const SparseTensorImpl<SparseCSCIndex>&>(left);
+ return SparseTensorEqualsImplDispatch(left_csc, right, opts);
+ }
+
+ case SparseTensorFormat::CSF: {
+ const auto& left_csf = checked_cast<const SparseTensorImpl<SparseCSFIndex>&>(left);
+ return SparseTensorEqualsImplDispatch(left_csf, right, opts);
+ }
+
+ default:
+ return false;
+ }
+}
+
+bool TypeEquals(const DataType& left, const DataType& right, bool check_metadata) {
+  // The types are the same object
+ if (&left == &right) {
+ return true;
+ } else if (left.id() != right.id()) {
+ return false;
+ } else {
+ // First try to compute fingerprints
+ if (check_metadata) {
+ const auto& left_metadata_fp = left.metadata_fingerprint();
+ const auto& right_metadata_fp = right.metadata_fingerprint();
+ if (left_metadata_fp != right_metadata_fp) {
+ return false;
+ }
+ }
+
+ const auto& left_fp = left.fingerprint();
+ const auto& right_fp = right.fingerprint();
+ if (!left_fp.empty() && !right_fp.empty()) {
+ return left_fp == right_fp;
+ }
+
+ // TODO remove check_metadata here?
+ TypeEqualsVisitor visitor(right, check_metadata);
+ auto error = VisitTypeInline(left, &visitor);
+ if (!error.ok()) {
+ DCHECK(false) << "Types are not comparable: " << error.ToString();
+ }
+ return visitor.result();
+ }
+}
+
+} // namespace arrow
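
A small sketch of the TypeEquals() entry point defined above; parameterized types compare their parameters, and precomputed fingerprints short-circuit the visitor when available:

```cpp
#include <arrow/api.h>

void TypeEqualsExamples() {
  // Same type id, same parameters: equal.
  const bool same = arrow::TypeEquals(*arrow::timestamp(arrow::TimeUnit::MILLI),
                                      *arrow::timestamp(arrow::TimeUnit::MILLI));
  // Same type id, different parameters: not equal.
  const bool diff = arrow::TypeEquals(*arrow::timestamp(arrow::TimeUnit::MILLI),
                                      *arrow::timestamp(arrow::TimeUnit::NANO));
  (void)same;  // true
  (void)diff;  // false
}
```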
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compare.h b/contrib/libs/apache/arrow/cpp/src/arrow/compare.h
new file mode 100644
index 00000000000..6769b23867b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compare.h
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for comparing Arrow data structures
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class DataType;
+class Tensor;
+class SparseTensor;
+struct Scalar;
+
+static constexpr double kDefaultAbsoluteTolerance = 1E-5;
+
+/// A container of options for equality comparisons
+class EqualOptions {
+ public:
+ /// Whether or not NaNs are considered equal.
+ bool nans_equal() const { return nans_equal_; }
+
+ /// Return a new EqualOptions object with the "nans_equal" property changed.
+ EqualOptions nans_equal(bool v) const {
+ auto res = EqualOptions(*this);
+ res.nans_equal_ = v;
+ return res;
+ }
+
+ /// The absolute tolerance for approximate comparisons of floating-point values.
+ double atol() const { return atol_; }
+
+ /// Return a new EqualOptions object with the "atol" property changed.
+ EqualOptions atol(double v) const {
+ auto res = EqualOptions(*this);
+ res.atol_ = v;
+ return res;
+ }
+
+ /// The ostream to which a diff will be formatted if arrays disagree.
+ /// If this is null (the default) no diff will be formatted.
+ std::ostream* diff_sink() const { return diff_sink_; }
+
+ /// Return a new EqualOptions object with the "diff_sink" property changed.
+  /// This option is ignored if diff formatting is not supported for the types
+  /// of the compared arrays.
+ EqualOptions diff_sink(std::ostream* diff_sink) const {
+ auto res = EqualOptions(*this);
+ res.diff_sink_ = diff_sink;
+ return res;
+ }
+
+ static EqualOptions Defaults() { return {}; }
+
+ protected:
+ double atol_ = kDefaultAbsoluteTolerance;
+ bool nans_equal_ = false;
+ std::ostream* diff_sink_ = NULLPTR;
+};
+
+/// Returns true if the arrays are exactly equal
+bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right,
+ const EqualOptions& = EqualOptions::Defaults());
+
+/// Returns true if the arrays are approximately equal. For non-floating point
+/// types, this is equivalent to ArrayEquals(left, right)
+bool ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right,
+ const EqualOptions& = EqualOptions::Defaults());
+
+/// Returns true if the indicated equal-length segments of the arrays are exactly equal
+bool ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right,
+ int64_t start_idx, int64_t end_idx,
+ int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults());
+
+/// Returns true if the indicated equal-length segments of the arrays are approximately equal
+bool ARROW_EXPORT ArrayRangeApproxEquals(const Array& left, const Array& right,
+ int64_t start_idx, int64_t end_idx,
+ int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults());
+
+bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right,
+ const EqualOptions& = EqualOptions::Defaults());
+
+/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal
+bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right,
+ const EqualOptions& = EqualOptions::Defaults());
+
+/// Returns true if the type metadata are exactly equal
+/// \param[in] left a DataType
+/// \param[in] right a DataType
+/// \param[in] check_metadata whether to compare KeyValueMetadata for child
+/// fields
+bool ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right,
+ bool check_metadata = true);
+
+/// Returns true if scalars are equal
+/// \param[in] left a Scalar
+/// \param[in] right a Scalar
+/// \param[in] options comparison options
+bool ARROW_EXPORT ScalarEquals(const Scalar& left, const Scalar& right,
+ const EqualOptions& options = EqualOptions::Defaults());
+
+/// Returns true if scalars are approximately equal
+/// \param[in] left a Scalar
+/// \param[in] right a Scalar
+/// \param[in] options comparison options
+bool ARROW_EXPORT
+ScalarApproxEquals(const Scalar& left, const Scalar& right,
+ const EqualOptions& options = EqualOptions::Defaults());
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/README.md b/contrib/libs/apache/arrow/cpp/src/arrow/compute/README.md
new file mode 100644
index 00000000000..80d8918e3d9
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/README.md
@@ -0,0 +1,58 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+## Apache Arrow C++ Compute Functions
+
+This submodule contains analytical functions that process primarily Arrow
+columnar data; some functions can process scalar or Arrow-based array
+inputs. These are intended for use inside query engines, data frame libraries,
+etc.
+
+Many functions have SQL-like semantics in that they perform elementwise or
+scalar operations on whole arrays at a time. Other functions are not SQL-like
+and compute results that may be of a different length, or that depend on the
+order of the values.
+
+Some basic terminology:
+
+* We use the term "function" to refer to a particular general operation that
+  may have many different implementations corresponding to different
+  combinations of types or function behavior options.
+* We call a specific implementation of a function a "kernel". When executing a
+  function on inputs, we must first select a suitable kernel (kernel selection
+  is called "dispatching") corresponding to the value types of the inputs.
+* Functions along with their kernel implementations are collected in a
+  "function registry". Given a function name and argument types, we can look up
+  that function and dispatch to a compatible kernel, as in the sketch below.
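+
+As a minimal sketch of that flow (the helper name `SumOf` and variable `arr`
+are illustrative; "sum" is one of the function names registered by default):
+
+```cpp
+#include "arrow/compute/api.h"
+
+arrow::Result<arrow::Datum> SumOf(const std::shared_ptr<arrow::Array>& arr) {
+  // Look up "sum" in the default function registry, dispatch to a kernel
+  // compatible with arr's value type, and eagerly compute the result.
+  return arrow::compute::CallFunction("sum", {arrow::Datum(arr)});
+}
+```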
+
+Types of functions:
+
+* Scalar functions: elementwise functions that perform scalar operations in a
+  vectorized manner. These functions are generally valid in SQL-like contexts.
+  They are called "scalar" in that they consider each value in an array
+  independently, and the output array or arrays have the same length as the
+  input arrays. The result for each array cell is generally independent of its
+  position in the array.
+* Vector functions, which produce results that generally depend on the entire
+  contents of the input arrays. These functions **are generally not valid**
+  for SQL-like processing because the output size may differ from the input
+  size, and the result may change based on the order of the values in the
+  array. This includes things like array subselection, sorting, hashing, and
+  more.
+* Scalar aggregate functions, which can be used in a SQL-like context; a short
+  example contrasting the first two kinds follows.
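+
+For instance (a hedged sketch: `Demo`, `a`, and `b` are illustrative, and
+"add" and "unique" are assumed to be registered under those names; "add" is a
+scalar function with same-length output, while the output length of the
+vector function "unique" depends on the array's contents):
+
+```cpp
+#include <utility>
+
+#include "arrow/compute/api.h"
+
+using arrow::Datum;
+using arrow::compute::CallFunction;
+
+arrow::Result<std::pair<Datum, Datum>> Demo(const Datum& a, const Datum& b) {
+  // Scalar function: elementwise, output has the same length as the inputs.
+  ARROW_ASSIGN_OR_RAISE(Datum sum, CallFunction("add", {a, b}));
+  // Vector function: result depends on the whole contents of the input.
+  ARROW_ASSIGN_OR_RAISE(Datum uniq, CallFunction("unique", {a}));
+  return std::make_pair(std::move(sum), std::move(uniq));
+}
+```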
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api.h
new file mode 100644
index 00000000000..a890cd362f8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle
+
+#pragma once
+
+/// \defgroup compute-concrete-options Concrete option classes for compute functions
+/// @{
+/// @}
+
+#include "arrow/compute/api_aggregate.h" // IWYU pragma: export
+#include "arrow/compute/api_scalar.h" // IWYU pragma: export
+#include "arrow/compute/api_vector.h" // IWYU pragma: export
+#include "arrow/compute/cast.h" // IWYU pragma: export
+#include "arrow/compute/exec.h" // IWYU pragma: export
+#include "arrow/compute/function.h" // IWYU pragma: export
+#include "arrow/compute/kernel.h" // IWYU pragma: export
+#include "arrow/compute/registry.h" // IWYU pragma: export
+#include "arrow/datum.h" // IWYU pragma: export
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc
new file mode 100644
index 00000000000..1b00c366bfd
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc
@@ -0,0 +1,197 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_aggregate.h"
+
+#include "arrow/compute/exec.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/compute/util_internal.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+namespace internal {
+template <>
+struct EnumTraits<compute::QuantileOptions::Interpolation>
+ : BasicEnumTraits<compute::QuantileOptions::Interpolation,
+ compute::QuantileOptions::LINEAR, compute::QuantileOptions::LOWER,
+ compute::QuantileOptions::HIGHER, compute::QuantileOptions::NEAREST,
+ compute::QuantileOptions::MIDPOINT> {
+ static std::string name() { return "QuantileOptions::Interpolation"; }
+ static std::string value_name(compute::QuantileOptions::Interpolation value) {
+ switch (value) {
+ case compute::QuantileOptions::LINEAR:
+ return "LINEAR";
+ case compute::QuantileOptions::LOWER:
+ return "LOWER";
+ case compute::QuantileOptions::HIGHER:
+ return "HIGHER";
+ case compute::QuantileOptions::NEAREST:
+ return "NEAREST";
+ case compute::QuantileOptions::MIDPOINT:
+ return "MIDPOINT";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
+namespace compute {
+
+// ----------------------------------------------------------------------
+// Function options
+
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+namespace {
+using ::arrow::internal::DataMember;
+static auto kScalarAggregateOptionsType = GetFunctionOptionsType<ScalarAggregateOptions>(
+ DataMember("skip_nulls", &ScalarAggregateOptions::skip_nulls),
+ DataMember("min_count", &ScalarAggregateOptions::min_count));
+static auto kModeOptionsType =
+ GetFunctionOptionsType<ModeOptions>(DataMember("n", &ModeOptions::n));
+static auto kVarianceOptionsType =
+ GetFunctionOptionsType<VarianceOptions>(DataMember("ddof", &VarianceOptions::ddof));
+static auto kQuantileOptionsType = GetFunctionOptionsType<QuantileOptions>(
+ DataMember("q", &QuantileOptions::q),
+ DataMember("interpolation", &QuantileOptions::interpolation));
+static auto kTDigestOptionsType = GetFunctionOptionsType<TDigestOptions>(
+ DataMember("q", &TDigestOptions::q), DataMember("delta", &TDigestOptions::delta),
+ DataMember("buffer_size", &TDigestOptions::buffer_size));
+static auto kIndexOptionsType =
+ GetFunctionOptionsType<IndexOptions>(DataMember("value", &IndexOptions::value));
+} // namespace
+} // namespace internal
+
+ScalarAggregateOptions::ScalarAggregateOptions(bool skip_nulls, uint32_t min_count)
+ : FunctionOptions(internal::kScalarAggregateOptionsType),
+ skip_nulls(skip_nulls),
+ min_count(min_count) {}
+constexpr char ScalarAggregateOptions::kTypeName[];
+
+ModeOptions::ModeOptions(int64_t n) : FunctionOptions(internal::kModeOptionsType), n(n) {}
+constexpr char ModeOptions::kTypeName[];
+
+VarianceOptions::VarianceOptions(int ddof)
+ : FunctionOptions(internal::kVarianceOptionsType), ddof(ddof) {}
+constexpr char VarianceOptions::kTypeName[];
+
+QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation)
+ : FunctionOptions(internal::kQuantileOptionsType),
+ q{q},
+ interpolation{interpolation} {}
+QuantileOptions::QuantileOptions(std::vector<double> q, enum Interpolation interpolation)
+ : FunctionOptions(internal::kQuantileOptionsType),
+ q{std::move(q)},
+ interpolation{interpolation} {}
+constexpr char QuantileOptions::kTypeName[];
+
+TDigestOptions::TDigestOptions(double q, uint32_t delta, uint32_t buffer_size)
+ : FunctionOptions(internal::kTDigestOptionsType),
+ q{q},
+ delta{delta},
+ buffer_size{buffer_size} {}
+TDigestOptions::TDigestOptions(std::vector<double> q, uint32_t delta,
+ uint32_t buffer_size)
+ : FunctionOptions(internal::kTDigestOptionsType),
+ q{std::move(q)},
+ delta{delta},
+ buffer_size{buffer_size} {}
+constexpr char TDigestOptions::kTypeName[];
+
+IndexOptions::IndexOptions(std::shared_ptr<Scalar> value)
+ : FunctionOptions(internal::kIndexOptionsType), value{std::move(value)} {}
+IndexOptions::IndexOptions() : IndexOptions(std::make_shared<NullScalar>()) {}
+constexpr char IndexOptions::kTypeName[];
+
+namespace internal {
+void RegisterAggregateOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kScalarAggregateOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kModeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kVarianceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kIndexOptionsType));
+}
+} // namespace internal
+
+// ----------------------------------------------------------------------
+// Scalar aggregates
+
+Result<Datum> Count(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("count", {value}, &options, ctx);
+}
+
+Result<Datum> Mean(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("mean", {value}, &options, ctx);
+}
+
+Result<Datum> Sum(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("sum", {value}, &options, ctx);
+}
+
+Result<Datum> MinMax(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("min_max", {value}, &options, ctx);
+}
+
+Result<Datum> Any(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("any", {value}, &options, ctx);
+}
+
+Result<Datum> All(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("all", {value}, &options, ctx);
+}
+
+Result<Datum> Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) {
+ return CallFunction("mode", {value}, &options, ctx);
+}
+
+Result<Datum> Stddev(const Datum& value, const VarianceOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("stddev", {value}, &options, ctx);
+}
+
+Result<Datum> Variance(const Datum& value, const VarianceOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("variance", {value}, &options, ctx);
+}
+
+Result<Datum> Quantile(const Datum& value, const QuantileOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("quantile", {value}, &options, ctx);
+}
+
+Result<Datum> TDigest(const Datum& value, const TDigestOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("tdigest", {value}, &options, ctx);
+}
+
+Result<Datum> Index(const Datum& value, const IndexOptions& options, ExecContext* ctx) {
+ return CallFunction("index", {value}, &options, ctx);
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h
new file mode 100644
index 00000000000..7a6c44bd923
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h
@@ -0,0 +1,433 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Eager evaluation convenience APIs for invoking common functions, including
+// necessary memory allocations
+
+#pragma once
+
+#include "arrow/compute/function.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+
+namespace compute {
+
+class ExecContext;
+
+// ----------------------------------------------------------------------
+// Aggregate functions
+
+/// \addtogroup compute-concrete-options
+/// @{
+
+/// \brief Control general scalar aggregate kernel behavior
+///
+/// By default, null values are ignored
+class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
+ public:
+ explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
+ constexpr static char const kTypeName[] = "ScalarAggregateOptions";
+ static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
+
+ bool skip_nulls;
+ uint32_t min_count;
+};
+
+/// \brief Control Mode kernel behavior
+///
+/// Returns top-n common values and counts.
+/// By default, returns the most common value and count.
+class ARROW_EXPORT ModeOptions : public FunctionOptions {
+ public:
+ explicit ModeOptions(int64_t n = 1);
+ constexpr static char const kTypeName[] = "ModeOptions";
+ static ModeOptions Defaults() { return ModeOptions{}; }
+
+ int64_t n = 1;
+};
+
+/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
+///
+/// The divisor used in calculations is N - ddof, where N is the number of elements.
+/// By default, ddof is zero, and population variance or stddev is returned.
+class ARROW_EXPORT VarianceOptions : public FunctionOptions {
+ public:
+ explicit VarianceOptions(int ddof = 0);
+ constexpr static char const kTypeName[] = "VarianceOptions";
+ static VarianceOptions Defaults() { return VarianceOptions{}; }
+
+ int ddof = 0;
+};
+
+/// \brief Control Quantile kernel behavior
+///
+/// By default, returns the median value.
+class ARROW_EXPORT QuantileOptions : public FunctionOptions {
+ public:
+ /// Interpolation method to use when quantile lies between two data points
+ enum Interpolation {
+ LINEAR = 0,
+ LOWER,
+ HIGHER,
+ NEAREST,
+ MIDPOINT,
+ };
+
+ explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR);
+
+ explicit QuantileOptions(std::vector<double> q,
+ enum Interpolation interpolation = LINEAR);
+
+ constexpr static char const kTypeName[] = "QuantileOptions";
+ static QuantileOptions Defaults() { return QuantileOptions{}; }
+
+ /// quantile must be between 0 and 1 inclusive
+ std::vector<double> q;
+ enum Interpolation interpolation;
+};
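+//
+// A minimal usage sketch (the values are illustrative): request several
+// quantiles at once with midpoint interpolation:
+//
+//   QuantileOptions opts({0.25, 0.5, 0.75}, QuantileOptions::MIDPOINT);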
+
+/// \brief Control TDigest approximate quantile kernel behavior
+///
+/// By default, returns the median value.
+class ARROW_EXPORT TDigestOptions : public FunctionOptions {
+ public:
+ explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
+ uint32_t buffer_size = 500);
+ explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
+ uint32_t buffer_size = 500);
+ constexpr static char const kTypeName[] = "TDigestOptions";
+ static TDigestOptions Defaults() { return TDigestOptions{}; }
+
+ /// quantile must be between 0 and 1 inclusive
+ std::vector<double> q;
+ /// compression parameter, default 100
+ uint32_t delta;
+ /// input buffer size, default 500
+ uint32_t buffer_size;
+};
+
+/// \brief Control Index kernel behavior
+class ARROW_EXPORT IndexOptions : public FunctionOptions {
+ public:
+ explicit IndexOptions(std::shared_ptr<Scalar> value);
+ // Default constructor for serialization
+ IndexOptions();
+ constexpr static char const kTypeName[] = "IndexOptions";
+
+ std::shared_ptr<Scalar> value;
+};
+
+/// @}
+
+/// \brief Count non-null (or null) values in an array.
+///
+/// \param[in] datum the datum to count
+/// \param[in] options counting options, see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Count(
+ const Datum& datum,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
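+
+// A minimal usage sketch (`arr` is an illustrative pre-built array); with the
+// default options this counts the non-null values:
+//
+//   ARROW_ASSIGN_OR_RAISE(Datum n, Count(arr));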
+
+/// \brief Compute the mean of a numeric array.
+///
+/// \param[in] value datum to compute the mean of, expecting Array
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed mean as a DoubleScalar
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Mean(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Sum values of a numeric array.
+///
+/// \param[in] value datum to sum, expecting Array or ChunkedArray
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed sum as a Scalar
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Sum(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the min / max of a numeric array
+///
+/// This function returns both the min and max as a struct scalar, with type
+/// struct<min: T, max: T>, where T is the input type
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as a struct<min: T, max: T> scalar
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> MinMax(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Test whether any element in a boolean array evaluates to true.
+///
+/// This function returns true if any of the elements in the array evaluates
+/// to true, and false otherwise. Null values are ignored by default. If null
+/// values are taken into account (by setting the ScalarAggregateOptions
+/// parameter skip_nulls = false), then Kleene logic is used.
+/// See KleeneOr for more details on Kleene logic.
+///
+/// \param[in] value input datum, expecting a boolean array
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as a BooleanScalar
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Any(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
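+
+// A hedged sketch (`flags` is an illustrative boolean array): with
+// skip_nulls = false, nulls participate via Kleene logic, so e.g. an array
+// holding only {null, false} yields null rather than false:
+//
+//   auto opts = ScalarAggregateOptions(/*skip_nulls=*/false);
+//   ARROW_ASSIGN_OR_RAISE(Datum any, Any(flags, opts));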
+
+/// \brief Test whether all elements in a boolean array evaluate to true.
+///
+/// This function returns true if all of the elements in the array evaluate
+/// to true, and false otherwise. Null values are ignored by default. If null
+/// values are taken into account (by setting the ScalarAggregateOptions
+/// parameter skip_nulls = false), then Kleene logic is used.
+/// See KleeneAnd for more details on Kleene logic.
+///
+/// \param[in] value input datum, expecting a boolean array
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as a BooleanScalar
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> All(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the modal (most common) value of a numeric array
+///
+/// This function returns the top-n most common values and the number of times
+/// they occur, as an array of `struct<mode: T, count: int64>`, where T is the
+/// input type. Values with larger counts are returned before smaller ones.
+/// If more than one value has the same count, the smaller value is returned
+/// first.
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see ModeOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as an array of struct<mode: T, count: int64>
+///
+/// \since 2.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Mode(const Datum& value,
+ const ModeOptions& options = ModeOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the standard deviation of a numeric array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see VarianceOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed standard deviation as a DoubleScalar
+///
+/// \since 2.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Stddev(const Datum& value,
+ const VarianceOptions& options = VarianceOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the variance of a numeric array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see VarianceOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed variance as a DoubleScalar
+///
+/// \since 2.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Variance(const Datum& value,
+ const VarianceOptions& options = VarianceOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the quantiles of a numeric array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see QuantileOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as an array
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Quantile(const Datum& value,
+ const QuantileOptions& options = QuantileOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the approximate quantiles of a numeric array with the T-Digest algorithm
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see TDigestOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as an array
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> TDigest(const Datum& value,
+ const TDigestOptions& options = TDigestOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Find the first index of a value in an array.
+///
+/// \param[in] value The array to search.
+/// \param[in] options The value to search for. See IndexOptions.
+/// \param[in] ctx the function execution context, optional
+/// \return a Scalar containing the index (or -1 if not found).
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Index(const Datum& value, const IndexOptions& options,
+ ExecContext* ctx = NULLPTR);
+
+namespace internal {
+
+/// Internal use only: streaming group identifier.
+/// Consumes batches of keys and yields batches of the group ids.
+class ARROW_EXPORT Grouper {
+ public:
+ virtual ~Grouper() = default;
+
+ /// Construct a Grouper which receives the specified key types
+ static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
+ ExecContext* ctx = default_exec_context());
+
+ /// Consume a batch of keys, producing the corresponding group ids as an integer array.
+  /// Currently only uint32 indices will be produced; eventually the bit width
+  /// will only be as wide as necessary.
+ virtual Result<Datum> Consume(const ExecBatch& batch) = 0;
+
+ /// Get current unique keys. May be called multiple times.
+ virtual Result<ExecBatch> GetUniques() = 0;
+
+ /// Get the current number of groups.
+ virtual uint32_t num_groups() const = 0;
+
+ /// \brief Assemble lists of indices of identical elements.
+ ///
+ /// \param[in] ids An unsigned, all-valid integral array which will be
+ /// used as grouping criteria.
+  /// \param[in] num_groups An exclusive upper bound for the elements of ids
+ /// \return A num_groups-long ListArray where the slot at i contains a
+ /// list of indices where i appears in ids.
+ ///
+ /// MakeGroupings([
+ /// 2,
+ /// 2,
+ /// 5,
+ /// 5,
+ /// 2,
+ /// 3
+ /// ], 8) == [
+ /// [],
+ /// [],
+ /// [0, 1, 4],
+ /// [5],
+ /// [],
+ /// [2, 3],
+ /// [],
+ /// []
+ /// ]
+ static Result<std::shared_ptr<ListArray>> MakeGroupings(
+ const UInt32Array& ids, uint32_t num_groups,
+ ExecContext* ctx = default_exec_context());
+
+ /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
+ /// the provided groupings.
+ ///
+ /// For example,
+ /// ApplyGroupings([
+ /// [],
+ /// [],
+ /// [0, 1, 4],
+ /// [5],
+ /// [],
+ /// [2, 3],
+ /// [],
+ /// []
+ /// ], [2, 2, 5, 5, 2, 3]) == [
+ /// [],
+ /// [],
+ /// [2, 2, 2],
+ /// [3],
+ /// [],
+ /// [5, 5],
+ /// [],
+ /// []
+ /// ]
+ static Result<std::shared_ptr<ListArray>> ApplyGroupings(
+ const ListArray& groupings, const Array& array,
+ ExecContext* ctx = default_exec_context());
+};
+
+/// \brief Configure a grouped aggregation
+struct ARROW_EXPORT Aggregate {
+ /// the name of the aggregation function
+ std::string function;
+
+ /// options for the aggregation function
+ const FunctionOptions* options;
+};
+
+/// Internal use only: helper function for testing HashAggregateKernels.
+/// This will be replaced by streaming execution operators.
+ARROW_EXPORT
+Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
+ const std::vector<Aggregate>& aggregates,
+ ExecContext* ctx = default_exec_context());
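+//
+// A hedged sketch for tests ("hash_sum" assumes the hash-aggregate kernel
+// naming convention; `values` and `keys` are illustrative):
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       Datum grouped, GroupBy({values}, {keys}, {{"hash_sum", nullptr}}));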
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc
new file mode 100644
index 00000000000..1feb4e7eee0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc
@@ -0,0 +1,498 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_scalar.h"
+
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/array/array_base.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/compute/util_internal.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+namespace internal {
+template <>
+struct EnumTraits<compute::JoinOptions::NullHandlingBehavior>
+ : BasicEnumTraits<compute::JoinOptions::NullHandlingBehavior,
+ compute::JoinOptions::NullHandlingBehavior::EMIT_NULL,
+ compute::JoinOptions::NullHandlingBehavior::SKIP,
+ compute::JoinOptions::NullHandlingBehavior::REPLACE> {
+ static std::string name() { return "JoinOptions::NullHandlingBehavior"; }
+ static std::string value_name(compute::JoinOptions::NullHandlingBehavior value) {
+ switch (value) {
+ case compute::JoinOptions::NullHandlingBehavior::EMIT_NULL:
+ return "EMIT_NULL";
+ case compute::JoinOptions::NullHandlingBehavior::SKIP:
+ return "SKIP";
+ case compute::JoinOptions::NullHandlingBehavior::REPLACE:
+ return "REPLACE";
+ }
+ return "<INVALID>";
+ }
+};
+template <>
+struct EnumTraits<TimeUnit::type>
+ : BasicEnumTraits<TimeUnit::type, TimeUnit::type::SECOND, TimeUnit::type::MILLI,
+ TimeUnit::type::MICRO, TimeUnit::type::NANO> {
+ static std::string name() { return "TimeUnit::type"; }
+ static std::string value_name(TimeUnit::type value) {
+ switch (value) {
+ case TimeUnit::type::SECOND:
+ return "SECOND";
+ case TimeUnit::type::MILLI:
+ return "MILLI";
+ case TimeUnit::type::MICRO:
+ return "MICRO";
+ case TimeUnit::type::NANO:
+ return "NANO";
+ }
+ return "<INVALID>";
+ }
+};
+template <>
+struct EnumTraits<compute::CompareOperator>
+ : BasicEnumTraits<
+ compute::CompareOperator, compute::CompareOperator::EQUAL,
+ compute::CompareOperator::NOT_EQUAL, compute::CompareOperator::GREATER,
+ compute::CompareOperator::GREATER_EQUAL, compute::CompareOperator::LESS,
+ compute::CompareOperator::LESS_EQUAL> {
+ static std::string name() { return "compute::CompareOperator"; }
+ static std::string value_name(compute::CompareOperator value) {
+ switch (value) {
+ case compute::CompareOperator::EQUAL:
+ return "EQUAL";
+ case compute::CompareOperator::NOT_EQUAL:
+ return "NOT_EQUAL";
+ case compute::CompareOperator::GREATER:
+ return "GREATER";
+ case compute::CompareOperator::GREATER_EQUAL:
+ return "GREATER_EQUAL";
+ case compute::CompareOperator::LESS:
+ return "LESS";
+ case compute::CompareOperator::LESS_EQUAL:
+ return "LESS_EQUAL";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
+namespace compute {
+
+// ----------------------------------------------------------------------
+// Function options
+
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+namespace {
+using ::arrow::internal::DataMember;
+static auto kArithmeticOptionsType = GetFunctionOptionsType<ArithmeticOptions>(
+ DataMember("check_overflow", &ArithmeticOptions::check_overflow));
+static auto kElementWiseAggregateOptionsType =
+ GetFunctionOptionsType<ElementWiseAggregateOptions>(
+ DataMember("skip_nulls", &ElementWiseAggregateOptions::skip_nulls));
+static auto kJoinOptionsType = GetFunctionOptionsType<JoinOptions>(
+ DataMember("null_handling", &JoinOptions::null_handling),
+ DataMember("null_replacement", &JoinOptions::null_replacement));
+static auto kMatchSubstringOptionsType = GetFunctionOptionsType<MatchSubstringOptions>(
+ DataMember("pattern", &MatchSubstringOptions::pattern),
+ DataMember("ignore_case", &MatchSubstringOptions::ignore_case));
+static auto kSplitOptionsType = GetFunctionOptionsType<SplitOptions>(
+ DataMember("max_splits", &SplitOptions::max_splits),
+ DataMember("reverse", &SplitOptions::reverse));
+static auto kSplitPatternOptionsType = GetFunctionOptionsType<SplitPatternOptions>(
+ DataMember("pattern", &SplitPatternOptions::pattern),
+ DataMember("max_splits", &SplitPatternOptions::max_splits),
+ DataMember("reverse", &SplitPatternOptions::reverse));
+static auto kReplaceSliceOptionsType = GetFunctionOptionsType<ReplaceSliceOptions>(
+ DataMember("start", &ReplaceSliceOptions::start),
+ DataMember("stop", &ReplaceSliceOptions::stop),
+ DataMember("replacement", &ReplaceSliceOptions::replacement));
+static auto kReplaceSubstringOptionsType =
+ GetFunctionOptionsType<ReplaceSubstringOptions>(
+ DataMember("pattern", &ReplaceSubstringOptions::pattern),
+ DataMember("replacement", &ReplaceSubstringOptions::replacement),
+ DataMember("max_replacements", &ReplaceSubstringOptions::max_replacements));
+static auto kExtractRegexOptionsType = GetFunctionOptionsType<ExtractRegexOptions>(
+ DataMember("pattern", &ExtractRegexOptions::pattern));
+static auto kSetLookupOptionsType = GetFunctionOptionsType<SetLookupOptions>(
+ DataMember("value_set", &SetLookupOptions::value_set),
+ DataMember("skip_nulls", &SetLookupOptions::skip_nulls));
+static auto kStrptimeOptionsType = GetFunctionOptionsType<StrptimeOptions>(
+ DataMember("format", &StrptimeOptions::format),
+ DataMember("unit", &StrptimeOptions::unit));
+static auto kPadOptionsType = GetFunctionOptionsType<PadOptions>(
+ DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding));
+static auto kTrimOptionsType = GetFunctionOptionsType<TrimOptions>(
+ DataMember("characters", &TrimOptions::characters));
+static auto kSliceOptionsType = GetFunctionOptionsType<SliceOptions>(
+ DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop),
+ DataMember("step", &SliceOptions::step));
+static auto kMakeStructOptionsType = GetFunctionOptionsType<MakeStructOptions>(
+ DataMember("field_names", &MakeStructOptions::field_names),
+ DataMember("field_nullability", &MakeStructOptions::field_nullability),
+ DataMember("field_metadata", &MakeStructOptions::field_metadata));
+static auto kDayOfWeekOptionsType = GetFunctionOptionsType<DayOfWeekOptions>(
+ DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering),
+ DataMember("week_start", &DayOfWeekOptions::week_start));
+} // namespace
+} // namespace internal
+
+ArithmeticOptions::ArithmeticOptions(bool check_overflow)
+ : FunctionOptions(internal::kArithmeticOptionsType), check_overflow(check_overflow) {}
+constexpr char ArithmeticOptions::kTypeName[];
+
+ElementWiseAggregateOptions::ElementWiseAggregateOptions(bool skip_nulls)
+ : FunctionOptions(internal::kElementWiseAggregateOptionsType),
+ skip_nulls(skip_nulls) {}
+constexpr char ElementWiseAggregateOptions::kTypeName[];
+
+JoinOptions::JoinOptions(NullHandlingBehavior null_handling, std::string null_replacement)
+ : FunctionOptions(internal::kJoinOptionsType),
+ null_handling(null_handling),
+ null_replacement(std::move(null_replacement)) {}
+constexpr char JoinOptions::kTypeName[];
+
+MatchSubstringOptions::MatchSubstringOptions(std::string pattern, bool ignore_case)
+ : FunctionOptions(internal::kMatchSubstringOptionsType),
+ pattern(std::move(pattern)),
+ ignore_case(ignore_case) {}
+MatchSubstringOptions::MatchSubstringOptions() : MatchSubstringOptions("", false) {}
+constexpr char MatchSubstringOptions::kTypeName[];
+
+SplitOptions::SplitOptions(int64_t max_splits, bool reverse)
+ : FunctionOptions(internal::kSplitOptionsType),
+ max_splits(max_splits),
+ reverse(reverse) {}
+constexpr char SplitOptions::kTypeName[];
+
+SplitPatternOptions::SplitPatternOptions(std::string pattern, int64_t max_splits,
+ bool reverse)
+ : FunctionOptions(internal::kSplitPatternOptionsType),
+ pattern(std::move(pattern)),
+ max_splits(max_splits),
+ reverse(reverse) {}
+SplitPatternOptions::SplitPatternOptions() : SplitPatternOptions("", -1, false) {}
+constexpr char SplitPatternOptions::kTypeName[];
+
+ReplaceSliceOptions::ReplaceSliceOptions(int64_t start, int64_t stop,
+ std::string replacement)
+ : FunctionOptions(internal::kReplaceSliceOptionsType),
+ start(start),
+ stop(stop),
+ replacement(std::move(replacement)) {}
+ReplaceSliceOptions::ReplaceSliceOptions() : ReplaceSliceOptions(0, 0, "") {}
+constexpr char ReplaceSliceOptions::kTypeName[];
+
+ReplaceSubstringOptions::ReplaceSubstringOptions(std::string pattern,
+ std::string replacement,
+ int64_t max_replacements)
+ : FunctionOptions(internal::kReplaceSubstringOptionsType),
+ pattern(std::move(pattern)),
+ replacement(std::move(replacement)),
+ max_replacements(max_replacements) {}
+ReplaceSubstringOptions::ReplaceSubstringOptions()
+ : ReplaceSubstringOptions("", "", -1) {}
+constexpr char ReplaceSubstringOptions::kTypeName[];
+
+ExtractRegexOptions::ExtractRegexOptions(std::string pattern)
+ : FunctionOptions(internal::kExtractRegexOptionsType), pattern(std::move(pattern)) {}
+ExtractRegexOptions::ExtractRegexOptions() : ExtractRegexOptions("") {}
+constexpr char ExtractRegexOptions::kTypeName[];
+
+SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls)
+ : FunctionOptions(internal::kSetLookupOptionsType),
+ value_set(std::move(value_set)),
+ skip_nulls(skip_nulls) {}
+SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {}
+constexpr char SetLookupOptions::kTypeName[];
+
+StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit)
+ : FunctionOptions(internal::kStrptimeOptionsType),
+ format(std::move(format)),
+ unit(unit) {}
+StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {}
+constexpr char StrptimeOptions::kTypeName[];
+
+PadOptions::PadOptions(int64_t width, std::string padding)
+ : FunctionOptions(internal::kPadOptionsType),
+ width(width),
+ padding(std::move(padding)) {}
+PadOptions::PadOptions() : PadOptions(0, " ") {}
+constexpr char PadOptions::kTypeName[];
+
+TrimOptions::TrimOptions(std::string characters)
+ : FunctionOptions(internal::kTrimOptionsType), characters(std::move(characters)) {}
+TrimOptions::TrimOptions() : TrimOptions("") {}
+constexpr char TrimOptions::kTypeName[];
+
+SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step)
+ : FunctionOptions(internal::kSliceOptionsType),
+ start(start),
+ stop(stop),
+ step(step) {}
+SliceOptions::SliceOptions() : SliceOptions(0, 0, 1) {}
+constexpr char SliceOptions::kTypeName[];
+
+MakeStructOptions::MakeStructOptions(
+ std::vector<std::string> n, std::vector<bool> r,
+ std::vector<std::shared_ptr<const KeyValueMetadata>> m)
+ : FunctionOptions(internal::kMakeStructOptionsType),
+ field_names(std::move(n)),
+ field_nullability(std::move(r)),
+ field_metadata(std::move(m)) {}
+
+MakeStructOptions::MakeStructOptions(std::vector<std::string> n)
+ : FunctionOptions(internal::kMakeStructOptionsType),
+ field_names(std::move(n)),
+ field_nullability(field_names.size(), true),
+ field_metadata(field_names.size(), NULLPTR) {}
+
+MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector<std::string>()) {}
+constexpr char MakeStructOptions::kTypeName[];
+
+DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start)
+ : FunctionOptions(internal::kDayOfWeekOptionsType),
+ one_based_numbering(one_based_numbering),
+ week_start(week_start) {}
+constexpr char DayOfWeekOptions::kTypeName[];
+
+namespace internal {
+void RegisterScalarOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kElementWiseAggregateOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSplitOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSplitPatternOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSliceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSubstringOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType));
+}
+} // namespace internal
+
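+// Each of the macros below expands to an eager convenience wrapper that
+// forwards its argument(s) to CallFunction under the given registry name.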
+#define SCALAR_EAGER_UNARY(NAME, REGISTRY_NAME) \
+ Result<Datum> NAME(const Datum& value, ExecContext* ctx) { \
+ return CallFunction(REGISTRY_NAME, {value}, ctx); \
+ }
+
+#define SCALAR_EAGER_BINARY(NAME, REGISTRY_NAME) \
+ Result<Datum> NAME(const Datum& left, const Datum& right, ExecContext* ctx) { \
+ return CallFunction(REGISTRY_NAME, {left, right}, ctx); \
+ }
+
+// ----------------------------------------------------------------------
+// Arithmetic
+
+#define SCALAR_ARITHMETIC_UNARY(NAME, REGISTRY_NAME, REGISTRY_CHECKED_NAME) \
+ Result<Datum> NAME(const Datum& arg, ArithmeticOptions options, ExecContext* ctx) { \
+ auto func_name = (options.check_overflow) ? REGISTRY_CHECKED_NAME : REGISTRY_NAME; \
+ return CallFunction(func_name, {arg}, ctx); \
+ }
+
+SCALAR_ARITHMETIC_UNARY(AbsoluteValue, "abs", "abs_checked")
+SCALAR_ARITHMETIC_UNARY(Negate, "negate", "negate_checked")
+SCALAR_EAGER_UNARY(Sign, "sign")
+SCALAR_ARITHMETIC_UNARY(Sin, "sin", "sin_checked")
+SCALAR_ARITHMETIC_UNARY(Cos, "cos", "cos_checked")
+SCALAR_ARITHMETIC_UNARY(Asin, "asin", "asin_checked")
+SCALAR_ARITHMETIC_UNARY(Acos, "acos", "acos_checked")
+SCALAR_ARITHMETIC_UNARY(Tan, "tan", "tan_checked")
+SCALAR_EAGER_UNARY(Atan, "atan")
+SCALAR_ARITHMETIC_UNARY(Ln, "ln", "ln_checked")
+SCALAR_ARITHMETIC_UNARY(Log10, "log10", "log10_checked")
+SCALAR_ARITHMETIC_UNARY(Log2, "log2", "log2_checked")
+SCALAR_ARITHMETIC_UNARY(Log1p, "log1p", "log1p_checked")
+
+#define SCALAR_ARITHMETIC_BINARY(NAME, REGISTRY_NAME, REGISTRY_CHECKED_NAME) \
+ Result<Datum> NAME(const Datum& left, const Datum& right, ArithmeticOptions options, \
+ ExecContext* ctx) { \
+ auto func_name = (options.check_overflow) ? REGISTRY_CHECKED_NAME : REGISTRY_NAME; \
+ return CallFunction(func_name, {left, right}, ctx); \
+ }
+
+SCALAR_ARITHMETIC_BINARY(Add, "add", "add_checked")
+SCALAR_ARITHMETIC_BINARY(Subtract, "subtract", "subtract_checked")
+SCALAR_ARITHMETIC_BINARY(Multiply, "multiply", "multiply_checked")
+SCALAR_ARITHMETIC_BINARY(Divide, "divide", "divide_checked")
+SCALAR_ARITHMETIC_BINARY(Power, "power", "power_checked")
+SCALAR_ARITHMETIC_BINARY(ShiftLeft, "shift_left", "shift_left_checked")
+SCALAR_ARITHMETIC_BINARY(ShiftRight, "shift_right", "shift_right_checked")
+SCALAR_EAGER_BINARY(Atan2, "atan2")
+SCALAR_EAGER_UNARY(Floor, "floor")
+SCALAR_EAGER_UNARY(Ceil, "ceil")
+SCALAR_EAGER_UNARY(Trunc, "trunc")
+
+Result<Datum> MaxElementWise(const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options, ExecContext* ctx) {
+ return CallFunction("max_element_wise", args, &options, ctx);
+}
+
+Result<Datum> MinElementWise(const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options, ExecContext* ctx) {
+ return CallFunction("min_element_wise", args, &options, ctx);
+}
+
+// ----------------------------------------------------------------------
+// Set-related operations
+
+static Result<Datum> ExecSetLookup(const std::string& func_name, const Datum& data,
+ const SetLookupOptions& options, ExecContext* ctx) {
+ if (!options.value_set.is_arraylike()) {
+ return Status::Invalid("Set lookup value set must be Array or ChunkedArray");
+ }
+ std::shared_ptr<DataType> data_type;
+ if (data.type()->id() == Type::DICTIONARY) {
+ data_type =
+ arrow::internal::checked_pointer_cast<DictionaryType>(data.type())->value_type();
+ } else {
+ data_type = data.type();
+ }
+
+ if (options.value_set.length() > 0 && !data_type->Equals(options.value_set.type())) {
+ std::stringstream ss;
+ ss << "Array type didn't match type of values set: " << data_type->ToString()
+ << " vs " << options.value_set.type()->ToString();
+ return Status::Invalid(ss.str());
+ }
+ return CallFunction(func_name, {data}, &options, ctx);
+}
+
+Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx) {
+ return ExecSetLookup("is_in", values, options, ctx);
+}
+
+Result<Datum> IsIn(const Datum& values, const Datum& value_set, ExecContext* ctx) {
+ return ExecSetLookup("is_in", values, SetLookupOptions{value_set}, ctx);
+}
+
+Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx) {
+ return ExecSetLookup("index_in", values, options, ctx);
+}
+
+Result<Datum> IndexIn(const Datum& values, const Datum& value_set, ExecContext* ctx) {
+ return ExecSetLookup("index_in", values, SetLookupOptions{value_set}, ctx);
+}
+
+// ----------------------------------------------------------------------
+// Boolean functions
+
+SCALAR_EAGER_UNARY(Invert, "invert")
+SCALAR_EAGER_BINARY(And, "and")
+SCALAR_EAGER_BINARY(KleeneAnd, "and_kleene")
+SCALAR_EAGER_BINARY(Or, "or")
+SCALAR_EAGER_BINARY(KleeneOr, "or_kleene")
+SCALAR_EAGER_BINARY(Xor, "xor")
+SCALAR_EAGER_BINARY(AndNot, "and_not")
+SCALAR_EAGER_BINARY(KleeneAndNot, "and_not_kleene")
+
+// ----------------------------------------------------------------------
+
+Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions options,
+ ExecContext* ctx) {
+ std::string func_name;
+ switch (options.op) {
+ case CompareOperator::EQUAL:
+ func_name = "equal";
+ break;
+ case CompareOperator::NOT_EQUAL:
+ func_name = "not_equal";
+ break;
+ case CompareOperator::GREATER:
+ func_name = "greater";
+ break;
+ case CompareOperator::GREATER_EQUAL:
+ func_name = "greater_equal";
+ break;
+ case CompareOperator::LESS:
+ func_name = "less";
+ break;
+ case CompareOperator::LESS_EQUAL:
+ func_name = "less_equal";
+ break;
+ }
+ return CallFunction(func_name, {left, right}, nullptr, ctx);
+}
+
+// ----------------------------------------------------------------------
+// Validity functions
+
+SCALAR_EAGER_UNARY(IsValid, "is_valid")
+SCALAR_EAGER_UNARY(IsNull, "is_null")
+SCALAR_EAGER_UNARY(IsNan, "is_nan")
+
+Result<Datum> FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx) {
+ return CallFunction("fill_null", {values, fill_value}, ctx);
+}
+
+Result<Datum> IfElse(const Datum& cond, const Datum& if_true, const Datum& if_false,
+ ExecContext* ctx) {
+ return CallFunction("if_else", {cond, if_true, if_false}, ctx);
+}
+
+Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
+ ExecContext* ctx) {
+ std::vector<Datum> args = {cond};
+ args.reserve(cases.size() + 1);
+ args.insert(args.end(), cases.begin(), cases.end());
+ return CallFunction("case_when", args, ctx);
+}
+
+// ----------------------------------------------------------------------
+// Temporal functions
+
+SCALAR_EAGER_UNARY(Year, "year")
+SCALAR_EAGER_UNARY(Month, "month")
+SCALAR_EAGER_UNARY(Day, "day")
+SCALAR_EAGER_UNARY(DayOfYear, "day_of_year")
+SCALAR_EAGER_UNARY(ISOYear, "iso_year")
+SCALAR_EAGER_UNARY(ISOWeek, "iso_week")
+SCALAR_EAGER_UNARY(ISOCalendar, "iso_calendar")
+SCALAR_EAGER_UNARY(Quarter, "quarter")
+SCALAR_EAGER_UNARY(Hour, "hour")
+SCALAR_EAGER_UNARY(Minute, "minute")
+SCALAR_EAGER_UNARY(Second, "second")
+SCALAR_EAGER_UNARY(Millisecond, "millisecond")
+SCALAR_EAGER_UNARY(Microsecond, "microsecond")
+SCALAR_EAGER_UNARY(Nanosecond, "nanosecond")
+SCALAR_EAGER_UNARY(Subsecond, "subsecond")
+
+Result<Datum> DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext* ctx) {
+ return CallFunction("day_of_week", {arg}, &options, ctx);
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h
new file mode 100644
index 00000000000..e07e41569a1
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h
@@ -0,0 +1,989 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Eager evaluation convenience APIs for invoking common functions, including
+// necessary memory allocations
+
+#pragma once
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "arrow/compute/exec.h" // IWYU pragma: keep
+#include "arrow/compute/function.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+/// \addtogroup compute-concrete-options
+///
+/// @{
+
+class ARROW_EXPORT ArithmeticOptions : public FunctionOptions {
+ public:
+ explicit ArithmeticOptions(bool check_overflow = false);
+ constexpr static char const kTypeName[] = "ArithmeticOptions";
+ bool check_overflow;
+};
+
+class ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions {
+ public:
+ explicit ElementWiseAggregateOptions(bool skip_nulls = true);
+ constexpr static char const kTypeName[] = "ElementWiseAggregateOptions";
+ static ElementWiseAggregateOptions Defaults() { return ElementWiseAggregateOptions{}; }
+
+ bool skip_nulls;
+};
+
+/// Options for var_args_join.
+class ARROW_EXPORT JoinOptions : public FunctionOptions {
+ public:
+ /// How to handle null values. (A null separator always results in a null output.)
+ enum NullHandlingBehavior {
+ /// A null in any input results in a null in the output.
+ EMIT_NULL,
+ /// Nulls in inputs are skipped.
+ SKIP,
+ /// Nulls in inputs are replaced with the replacement string.
+ REPLACE,
+ };
+ explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL,
+ std::string null_replacement = "");
+ constexpr static char const kTypeName[] = "JoinOptions";
+ static JoinOptions Defaults() { return JoinOptions(); }
+ NullHandlingBehavior null_handling;
+ std::string null_replacement;
+};
+
+class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
+ public:
+ explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false);
+ MatchSubstringOptions();
+ constexpr static char const kTypeName[] = "MatchSubstringOptions";
+
+ /// The exact substring (or regex, depending on kernel) to look for inside input values.
+ std::string pattern;
+ /// Whether to perform a case-insensitive match.
+ bool ignore_case = false;
+};
+
+class ARROW_EXPORT SplitOptions : public FunctionOptions {
+ public:
+ explicit SplitOptions(int64_t max_splits = -1, bool reverse = false);
+ constexpr static char const kTypeName[] = "SplitOptions";
+
+ /// Maximum number of splits allowed, or unlimited when -1
+ int64_t max_splits;
+ /// Start splitting from the end of the string (only relevant when max_splits != -1)
+ bool reverse;
+};
+
+class ARROW_EXPORT SplitPatternOptions : public FunctionOptions {
+ public:
+ explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1,
+ bool reverse = false);
+ SplitPatternOptions();
+ constexpr static char const kTypeName[] = "SplitPatternOptions";
+
+ /// The exact substring to split on.
+ std::string pattern;
+ /// Maximum number of splits allowed, or unlimited when -1
+ int64_t max_splits;
+ /// Start splitting from the end of the string (only relevant when max_splits != -1)
+ bool reverse;
+};
+
+class ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions {
+ public:
+ explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement);
+ ReplaceSliceOptions();
+ constexpr static char const kTypeName[] = "ReplaceSliceOptions";
+
+ /// Index to start slicing at
+ int64_t start;
+ /// Index to stop slicing at
+ int64_t stop;
+ /// String to replace the slice with
+ std::string replacement;
+};
+
+class ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions {
+ public:
+ explicit ReplaceSubstringOptions(std::string pattern, std::string replacement,
+ int64_t max_replacements = -1);
+ ReplaceSubstringOptions();
+ constexpr static char const kTypeName[] = "ReplaceSubstringOptions";
+
+ /// Pattern to match, literal, or regular expression depending on which kernel is used
+ std::string pattern;
+ /// String to replace the pattern with
+ std::string replacement;
+ /// Max number of substrings to replace (-1 means unbounded)
+ int64_t max_replacements;
+};
+
+class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions {
+ public:
+ explicit ExtractRegexOptions(std::string pattern);
+ ExtractRegexOptions();
+ constexpr static char const kTypeName[] = "ExtractRegexOptions";
+
+ /// Regular expression with named capture fields
+ std::string pattern;
+};
+
+/// Options for IsIn and IndexIn functions
+class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
+ public:
+ explicit SetLookupOptions(Datum value_set, bool skip_nulls = false);
+ SetLookupOptions();
+ constexpr static char const kTypeName[] = "SetLookupOptions";
+
+ /// The set of values to look up input values into.
+ Datum value_set;
+ /// Whether nulls in `value_set` count for lookup.
+ ///
+ /// If true, any null in `value_set` is ignored and nulls in the input
+ /// produce null (IndexIn) or false (IsIn) values in the output.
+ /// If false, any null in `value_set` is successfully matched in
+ /// the input.
+ bool skip_nulls;
+};
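+//
+// A minimal usage sketch (`values` and `value_set` are illustrative datums):
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       Datum mask, IsIn(values, SetLookupOptions(value_set, /*skip_nulls=*/true)));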
+
+class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
+ public:
+ explicit StrptimeOptions(std::string format, TimeUnit::type unit);
+ StrptimeOptions();
+ constexpr static char const kTypeName[] = "StrptimeOptions";
+
+ std::string format;
+ TimeUnit::type unit;
+};
+
+class ARROW_EXPORT PadOptions : public FunctionOptions {
+ public:
+ explicit PadOptions(int64_t width, std::string padding = " ");
+ PadOptions();
+ constexpr static char const kTypeName[] = "PadOptions";
+
+ /// The desired string length.
+ int64_t width;
+ /// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII).
+ std::string padding;
+};
+
+class ARROW_EXPORT TrimOptions : public FunctionOptions {
+ public:
+ explicit TrimOptions(std::string characters);
+ TrimOptions();
+ constexpr static char const kTypeName[] = "TrimOptions";
+
+ /// The individual characters that can be trimmed from the string.
+ std::string characters;
+};
+
+class ARROW_EXPORT SliceOptions : public FunctionOptions {
+ public:
+ explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits<int64_t>::max(),
+ int64_t step = 1);
+ SliceOptions();
+ constexpr static char const kTypeName[] = "SliceOptions";
+ int64_t start, stop, step;
+};
+
+enum CompareOperator : int8_t {
+ EQUAL,
+ NOT_EQUAL,
+ GREATER,
+ GREATER_EQUAL,
+ LESS,
+ LESS_EQUAL,
+};
+
+struct ARROW_EXPORT CompareOptions {
+ explicit CompareOptions(CompareOperator op) : op(op) {}
+ CompareOptions() : CompareOptions(CompareOperator::EQUAL) {}
+ enum CompareOperator op;
+};
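+//
+// A minimal usage sketch (`a` and `b` are illustrative datums); this
+// dispatches to the "less" compare kernel:
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       Datum lt, Compare(a, b, CompareOptions(CompareOperator::LESS)));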
+
+class ARROW_EXPORT MakeStructOptions : public FunctionOptions {
+ public:
+ MakeStructOptions(std::vector<std::string> n, std::vector<bool> r,
+ std::vector<std::shared_ptr<const KeyValueMetadata>> m);
+ explicit MakeStructOptions(std::vector<std::string> n);
+ MakeStructOptions();
+ constexpr static char const kTypeName[] = "MakeStructOptions";
+
+ /// Names for wrapped columns
+ std::vector<std::string> field_names;
+
+ /// Nullability bits for wrapped columns
+ std::vector<bool> field_nullability;
+
+ /// Metadata attached to wrapped columns
+ std::vector<std::shared_ptr<const KeyValueMetadata>> field_metadata;
+};
+
+struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions {
+ public:
+ explicit DayOfWeekOptions(bool one_based_numbering = false, uint32_t week_start = 1);
+ constexpr static char const kTypeName[] = "DayOfWeekOptions";
+ static DayOfWeekOptions Defaults() { return DayOfWeekOptions{}; }
+
+ /// Number days from 1 if true and from 0 if false
+ bool one_based_numbering;
+ /// What day does the week start with (Monday=1, Sunday=7)
+ uint32_t week_start;
+};
+
+/// @}
+
+/// \brief Get the absolute value of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value transformed
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise absolute value
+ARROW_EXPORT
+Result<Datum> AbsoluteValue(const Datum& arg,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Add two values together. Array values must be the same length. If
+/// either addend is null the result will be null.
+///
+/// \param[in] left the first addend
+/// \param[in] right the second addend
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise sum
+ARROW_EXPORT
+Result<Datum> Add(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
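+
+// Illustrative sketch (not part of the upstream sources; error handling
+// elided; `a` and `b` are assumed to be equal-length
+// std::shared_ptr<arrow::Array> built elsewhere):
+//
+//   arrow::Datum sum =
+//       arrow::compute::Add(arrow::Datum(a), arrow::Datum(b)).ValueOrDie();
+//
+//   // With overflow checking enabled (assumes the check_overflow flag of
+//   // upstream ArithmeticOptions):
+//   arrow::compute::ArithmeticOptions checked(/*check_overflow=*/true);
+//   arrow::Datum checked_sum =
+//       arrow::compute::Add(arrow::Datum(a), arrow::Datum(b), checked)
+//           .ValueOrDie();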
+
+/// \brief Subtract two values. Array values must be the same length. If the
+/// minuend or subtrahend is null the result will be null.
+///
+/// \param[in] left the value subtracted from (minuend)
+/// \param[in] right the value by which the minuend is reduced (subtrahend)
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise difference
+ARROW_EXPORT
+Result<Datum> Subtract(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Multiply two values. Array values must be the same length. If either
+/// factor is null the result will be null.
+///
+/// \param[in] left the first factor
+/// \param[in] right the second factor
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise product
+ARROW_EXPORT
+Result<Datum> Multiply(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Divide two values. Array values must be the same length. If either
+/// argument is null the result will be null. For integer types, if there is
+/// a zero divisor, an error will be raised.
+///
+/// \param[in] left the dividend
+/// \param[in] right the divisor
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise quotient
+ARROW_EXPORT
+Result<Datum> Divide(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Negate values.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value negated
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise negation
+ARROW_EXPORT
+Result<Datum> Negate(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Raise the values of base array to the power of the exponent array values.
+/// Array values must be the same length. If either base or exponent is null the result
+/// will be null.
+///
+/// \param[in] left the base
+/// \param[in] right the exponent
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise base value raised to the power of exponent
+ARROW_EXPORT
+Result<Datum> Power(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Left shift the left array by the right array. Array values must be the
+/// same length. If either operand is null, the result will be null.
+///
+/// \param[in] left the value to shift
+/// \param[in] right the value to shift by
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise left value shifted left by the right value
+ARROW_EXPORT
+Result<Datum> ShiftLeft(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Right shift the left array by the right array. Array values must be the
+/// same length. If either operand is null, the result will be null. Performs a
+/// logical shift for unsigned values, and an arithmetic shift for signed values.
+///
+/// \param[in] left the value to shift
+/// \param[in] right the value to shift by
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise left value shifted right by the right value
+ARROW_EXPORT
+Result<Datum> ShiftRight(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the sine of the array values.
+/// \param[in] arg The values to compute the sine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise sine of the values
+ARROW_EXPORT
+Result<Datum> Sin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the cosine of the array values.
+/// \param[in] arg The values to compute the cosine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise cosine of the values
+ARROW_EXPORT
+Result<Datum> Cos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse sine (arcsine) of the array values.
+/// \param[in] arg The values to compute the inverse sine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse sine of the values
+ARROW_EXPORT
+Result<Datum> Asin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse cosine (arccosine) of the array values.
+/// \param[in] arg The values to compute the inverse cosine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse cosine of the values
+ARROW_EXPORT
+Result<Datum> Acos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the tangent of the array values.
+/// \param[in] arg The values to compute the tangent for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise tangent of the values
+ARROW_EXPORT
+Result<Datum> Tan(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse tangent (arctangent) of the array values.
+/// \param[in] arg The values to compute the inverse tangent for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse tangent of the values
+ARROW_EXPORT
+Result<Datum> Atan(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse tangent (arctangent) of y/x, using the
+/// argument signs to determine the correct quadrant.
+/// \param[in] y The y-values to compute the inverse tangent for.
+/// \param[in] x The x-values to compute the inverse tangent for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse tangent of the values
+ARROW_EXPORT
+Result<Datum> Atan2(const Datum& y, const Datum& x, ExecContext* ctx = NULLPTR);
+
+/// \brief Get the natural log of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise natural log
+ARROW_EXPORT
+Result<Datum> Ln(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Get the log base 10 of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise log base 10
+ARROW_EXPORT
+Result<Datum> Log10(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Get the log base 2 of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise log base 2
+ARROW_EXPORT
+Result<Datum> Log2(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Get the natural log of (1 + value).
+///
+/// If argument is null the result will be null.
+/// This function may be more accurate than Ln(1 + value) for values close to zero.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise natural log
+ARROW_EXPORT
+Result<Datum> Log1p(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Round to the nearest integer less than or equal to the argument
+/// (i.e. round toward negative infinity). Array values can be of arbitrary
+/// length. If argument is null the result will be null.
+///
+/// \param[in] arg the value to round
+/// \param[in] ctx the function execution context, optional
+/// \return the rounded value
+ARROW_EXPORT
+Result<Datum> Floor(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Round to the nearest integer greater than or equal to the argument
+/// (i.e. round toward positive infinity). Array values can be of arbitrary
+/// length. If argument is null the result will be null.
+///
+/// \param[in] arg the value to round
+/// \param[in] ctx the function execution context, optional
+/// \return the rounded value
+ARROW_EXPORT
+Result<Datum> Ceil(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Get the integral part without fractional digits. Array values can be
+/// of arbitrary length. If argument is null the result will be null.
+///
+/// \param[in] arg the value to truncate
+/// \param[in] ctx the function execution context, optional
+/// \return the truncated value
+ARROW_EXPORT
+Result<Datum> Trunc(const Datum& arg, ExecContext* ctx = NULLPTR);
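+
+// Worked example contrasting the three rounding functions above
+// (illustrative values only):
+//
+//   Floor(-2.5) -> -3.0   // toward negative infinity
+//   Ceil(-2.5)  -> -2.0   // toward positive infinity
+//   Trunc(-2.5) -> -2.0   // toward zero, fraction discarded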
+
+/// \brief Find the element-wise maximum of any number of arrays or scalars.
+/// Array values must be the same length.
+///
+/// \param[in] args arrays or scalars to operate on.
+/// \param[in] options options for handling nulls, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise maximum
+ARROW_EXPORT
+Result<Datum> MaxElementWise(
+ const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Find the element-wise minimum of any number of arrays or scalars.
+/// Array values must be the same length.
+///
+/// \param[in] args arrays or scalars to operate on.
+/// \param[in] options options for handling nulls, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise minimum
+ARROW_EXPORT
+Result<Datum> MinElementWise(
+ const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
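+
+// Illustrative sketch (not part of the upstream sources; error handling
+// elided; `a`, `b`, `c` are std::shared_ptr<arrow::Array> of a common type):
+//
+//   arrow::Datum m = arrow::compute::MaxElementWise(
+//       {arrow::Datum(a), arrow::Datum(b), arrow::Datum(c)}).ValueOrDie();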
+
+/// \brief Get the sign of a value. Array values can be of arbitrary length. If argument
+/// is null the result will be null.
+///
+/// \param[in] arg the value to extract sign from
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise sign function
+ARROW_EXPORT
+Result<Datum> Sign(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Compare a numeric array with a scalar.
+///
+/// \param[in] left datum to compare, must be an Array
+/// \param[in] right datum to compare, must be a Scalar of the same type as
+/// the left Datum.
+/// \param[in] options compare options
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum
+///
+/// Note: on floating point arrays, this uses IEEE 754 compare semantics.
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_DEPRECATED("Deprecated in 5.0.0. Use each compare function directly")
+ARROW_EXPORT
+Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions options,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Invert the values of a boolean datum
+/// \param[in] value datum to invert
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Invert(const Datum& value, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise AND of two boolean datums which always propagates nulls
+/// (null and false is null).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> And(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise AND of two boolean datums with a Kleene truth table
+/// (null and false is false).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> KleeneAnd(const Datum& left, const Datum& right,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise OR of two boolean datums which always propagates nulls
+/// (null or true is null).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Or(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise OR of two boolean datums with a Kleene truth table
+/// (null or true is true).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> KleeneOr(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise XOR of two boolean datums
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Xor(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise AND NOT of two boolean datums which always propagates nulls
+/// (null and not true is null).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> AndNot(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise AND NOT of two boolean datums with a Kleene truth table
+/// (false and not null is false, null and not true is false).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> KleeneAndNot(const Datum& left, const Datum& right,
+ ExecContext* ctx = NULLPTR);
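+
+// Truth-table sketch contrasting the null-propagating and Kleene variants
+// declared above (illustrative only):
+//
+//   And(null, false)       -> null    // nulls always propagate
+//   KleeneAnd(null, false) -> false   // false is absorbing
+//   Or(null, true)         -> null
+//   KleeneOr(null, true)   -> true    // true is absorbing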
+
+/// \brief IsIn returns true for each element of `values` that is contained in
+/// `value_set`.
+///
+/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
+///
+/// \param[in] values array-like input to look up in value_set
+/// \param[in] options SetLookupOptions
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx = NULLPTR);
+ARROW_EXPORT
+Result<Datum> IsIn(const Datum& values, const Datum& value_set,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief IndexIn examines each slot in the values against a value_set array.
+/// If the value is not found in value_set, null will be output.
+/// If found, the index of occurrence within value_set (ignoring duplicates)
+/// will be output.
+///
+/// For example given values = [99, 42, 3, null] and
+/// value_set = [3, 3, 99], the output will be [1, null, 0, null]
+///
+/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
+///
+/// \param[in] values array-like input
+/// \param[in] options SetLookupOptions
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx = NULLPTR);
+ARROW_EXPORT
+Result<Datum> IndexIn(const Datum& values, const Datum& value_set,
+ ExecContext* ctx = NULLPTR);
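+
+// Illustrative sketch (not part of the upstream sources; error handling
+// elided; `input` and `value_set` are std::shared_ptr<arrow::Array>):
+//
+//   arrow::compute::SetLookupOptions opts(arrow::Datum(value_set),
+//                                         /*skip_nulls=*/true);
+//   arrow::Datum mask =
+//       arrow::compute::IsIn(arrow::Datum(input), opts).ValueOrDie();
+//   arrow::Datum positions =
+//       arrow::compute::IndexIn(arrow::Datum(input), opts).ValueOrDie();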
+
+/// \brief IsValid returns true for each element of `values` that is not null,
+/// false otherwise
+///
+/// \param[in] values input to examine for validity
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsValid(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief IsNull returns true for each element of `values` that is null,
+/// false otherwise
+///
+/// \param[in] values input to examine for nullity
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsNull(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief IsNan returns true for each element of `values` that is NaN,
+/// false otherwise
+///
+/// \param[in] values input to look for NaN
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsNan(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief FillNull replaces each null element in `values`
+/// with `fill_value`
+///
+/// \param[in] values input to examine for nullity
+/// \param[in] fill_value scalar
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> FillNull(const Datum& values, const Datum& fill_value,
+ ExecContext* ctx = NULLPTR);
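+
+// Illustrative sketch (not part of the upstream sources; error handling
+// elided): replace nulls in an int64 array with zero.
+//
+//   arrow::Datum fill(std::make_shared<arrow::Int64Scalar>(0));
+//   arrow::Datum filled =
+//       arrow::compute::FillNull(arrow::Datum(values), fill).ValueOrDie();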
+
+/// \brief IfElse returns elements chosen from `left` or `right`
+/// depending on `cond`. `null` values in `cond` will be promoted to the result
+///
+/// \param[in] cond `Boolean` condition Scalar/ Array
+/// \param[in] left Scalar/ Array
+/// \param[in] right Scalar/ Array
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right,
+ ExecContext* ctx = NULLPTR);
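+
+// Illustrative sketch (not part of the upstream sources; error handling
+// elided): select from `left` where `cond` is true, else from `right`.
+//
+//   arrow::Datum out = arrow::compute::IfElse(
+//       arrow::Datum(cond), arrow::Datum(left), arrow::Datum(right))
+//       .ValueOrDie();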
+
+/// \brief CaseWhen behaves like a switch/case or if-else if-else statement: for
+/// each row, select the first value for which the corresponding condition is
+/// true, or (if given) select the 'else' value, else emit null. Note that a
+/// null condition is the same as false.
+///
+/// \param[in] cond Conditions (Boolean)
+/// \param[in] cases Values (any type), along with an optional 'else' value.
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Year returns year for each element of `values`
+///
+/// \param[in] values input to extract year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Month returns month for each element of `values`.
+/// Month is encoded as January=1, December=12
+///
+/// \param[in] values input to extract month from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Day returns day number for each element of `values`
+///
+/// \param[in] values input to extract day from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief DayOfWeek returns the day-of-week number for each element of
+/// `values`.
+///
+/// By default week starts on Monday denoted by 0 and ends on Sunday denoted
+/// by 6. Start day of the week (Monday=1, Sunday=7) and numbering base (0 or 1) can be
+/// set using DayOfWeekOptions.
+///
+/// \param[in] values input to extract the day-of-week number from
+/// \param[in] options settings for the week start day and numbering base
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values,
+ DayOfWeekOptions options = DayOfWeekOptions(),
+ ExecContext* ctx = NULLPTR);
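+
+// Illustrative sketch (not part of the upstream sources; error handling
+// elided): number days 1-7 with the week starting on Sunday.
+//
+//   arrow::compute::DayOfWeekOptions opts(/*one_based_numbering=*/true,
+//                                         /*week_start=*/7);  // Sunday
+//   arrow::Datum dow =
+//       arrow::compute::DayOfWeek(arrow::Datum(timestamps), opts).ValueOrDie();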
+
+/// \brief DayOfYear returns the day-of-year number for each element of
+/// `values`. January 1st maps to day number 1, February 1st to 32, etc.
+///
+/// \param[in] values input to extract the day-of-year number from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOYear returns ISO year number for each element of `values`.
+/// First week of an ISO year has the majority (4 or more) of its days in January.
+///
+/// \param[in] values input to extract ISO year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOWeek returns ISO week of year number for each element of `values`.
+/// First ISO week has the majority (4 or more) of its days in January.
+/// Week of the year starts with 1 and can run up to 53.
+///
+/// \param[in] values input to extract ISO week of year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOCalendar returns a (ISO year, ISO week, ISO day of week) struct for
+/// each element of `values`.
+/// ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.
+///
+/// \param[in] values input to extract the ISO calendar struct from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Quarter returns the quarter-of-year number for each element of
+/// `values`. First quarter maps to 1 and fourth quarter maps to 4.
+///
+/// \param[in] values input to extract quarter of year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Quarter(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Hour returns hour value for each element of `values`
+///
+/// \param[in] values input to extract hour from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Hour(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Minute returns minutes value for each element of `values`
+///
+/// \param[in] values input to extract minutes from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Minute(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Second returns seconds value for each element of `values`
+///
+/// \param[in] values input to extract seconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Second(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Millisecond returns number of milliseconds since the last full second
+/// for each element of `values`
+///
+/// \param[in] values input to extract milliseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Millisecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Microsecond returns number of microseconds since the last full millisecond
+/// for each element of `values`
+///
+/// \param[in] values input to extract microseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Microsecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Nanosecond returns number of nanoseconds since the last full
+/// microsecond for each element of `values`
+///
+/// \param[in] values input to extract nanoseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Subsecond returns the fraction of a second elapsed since the last
+/// full second as a float for each element of `values`
+///
+/// \param[in] values input to extract subsecond from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Subsecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc
new file mode 100644
index 00000000000..a68969b2ee5
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc
@@ -0,0 +1,283 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_vector.h"
+
+#include <memory>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/datum.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+namespace internal {
+using compute::DictionaryEncodeOptions;
+using compute::FilterOptions;
+template <>
+struct EnumTraits<FilterOptions::NullSelectionBehavior>
+ : BasicEnumTraits<FilterOptions::NullSelectionBehavior, FilterOptions::DROP,
+ FilterOptions::EMIT_NULL> {
+ static std::string name() { return "FilterOptions::NullSelectionBehavior"; }
+ static std::string value_name(FilterOptions::NullSelectionBehavior value) {
+ switch (value) {
+ case FilterOptions::DROP:
+ return "DROP";
+ case FilterOptions::EMIT_NULL:
+ return "EMIT_NULL";
+ }
+ return "<INVALID>";
+ }
+};
+template <>
+struct EnumTraits<DictionaryEncodeOptions::NullEncodingBehavior>
+ : BasicEnumTraits<DictionaryEncodeOptions::NullEncodingBehavior,
+ DictionaryEncodeOptions::ENCODE, DictionaryEncodeOptions::MASK> {
+ static std::string name() { return "DictionaryEncodeOptions::NullEncodingBehavior"; }
+ static std::string value_name(DictionaryEncodeOptions::NullEncodingBehavior value) {
+ switch (value) {
+ case DictionaryEncodeOptions::ENCODE:
+ return "ENCODE";
+ case DictionaryEncodeOptions::MASK:
+ return "MASK";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
+namespace compute {
+
+// ----------------------------------------------------------------------
+// Function options
+
+bool SortKey::Equals(const SortKey& other) const {
+ return name == other.name && order == other.order;
+}
+std::string SortKey::ToString() const {
+ std::stringstream ss;
+ ss << name << ' ';
+ switch (order) {
+ case SortOrder::Ascending:
+ ss << "ASC";
+ break;
+ case SortOrder::Descending:
+ ss << "DESC";
+ break;
+ }
+ return ss.str();
+}
+
+namespace internal {
+namespace {
+using ::arrow::internal::DataMember;
+static auto kFilterOptionsType = GetFunctionOptionsType<FilterOptions>(
+ DataMember("null_selection_behavior", &FilterOptions::null_selection_behavior));
+static auto kTakeOptionsType = GetFunctionOptionsType<TakeOptions>(
+ DataMember("boundscheck", &TakeOptions::boundscheck));
+static auto kDictionaryEncodeOptionsType =
+ GetFunctionOptionsType<DictionaryEncodeOptions>(DataMember(
+ "null_encoding_behavior", &DictionaryEncodeOptions::null_encoding_behavior));
+static auto kArraySortOptionsType = GetFunctionOptionsType<ArraySortOptions>(
+ DataMember("order", &ArraySortOptions::order));
+static auto kSortOptionsType =
+ GetFunctionOptionsType<SortOptions>(DataMember("sort_keys", &SortOptions::sort_keys));
+static auto kPartitionNthOptionsType = GetFunctionOptionsType<PartitionNthOptions>(
+ DataMember("pivot", &PartitionNthOptions::pivot));
+} // namespace
+} // namespace internal
+
+FilterOptions::FilterOptions(NullSelectionBehavior null_selection)
+ : FunctionOptions(internal::kFilterOptionsType),
+ null_selection_behavior(null_selection) {}
+constexpr char FilterOptions::kTypeName[];
+
+TakeOptions::TakeOptions(bool boundscheck)
+ : FunctionOptions(internal::kTakeOptionsType), boundscheck(boundscheck) {}
+constexpr char TakeOptions::kTypeName[];
+
+DictionaryEncodeOptions::DictionaryEncodeOptions(NullEncodingBehavior null_encoding)
+ : FunctionOptions(internal::kDictionaryEncodeOptionsType),
+ null_encoding_behavior(null_encoding) {}
+constexpr char DictionaryEncodeOptions::kTypeName[];
+
+ArraySortOptions::ArraySortOptions(SortOrder order)
+ : FunctionOptions(internal::kArraySortOptionsType), order(order) {}
+constexpr char ArraySortOptions::kTypeName[];
+
+SortOptions::SortOptions(std::vector<SortKey> sort_keys)
+ : FunctionOptions(internal::kSortOptionsType), sort_keys(std::move(sort_keys)) {}
+constexpr char SortOptions::kTypeName[];
+
+PartitionNthOptions::PartitionNthOptions(int64_t pivot)
+ : FunctionOptions(internal::kPartitionNthOptionsType), pivot(pivot) {}
+constexpr char PartitionNthOptions::kTypeName[];
+
+namespace internal {
+void RegisterVectorOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTakeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kDictionaryEncodeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kArraySortOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSortOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kPartitionNthOptionsType));
+}
+} // namespace internal
+
+// ----------------------------------------------------------------------
+// Direct exec interface to kernels
+
+Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
+ ExecContext* ctx) {
+ PartitionNthOptions options(/*pivot=*/n);
+ ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("partition_nth_indices",
+ {Datum(values)}, &options, ctx));
+ return result.make_array();
+}
+
+Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
+ const Datum& replacements, ExecContext* ctx) {
+ return CallFunction("replace_with_mask", {values, mask, replacements}, ctx);
+}
+
+Result<std::shared_ptr<Array>> SortIndices(const Array& values, SortOrder order,
+ ExecContext* ctx) {
+ ArraySortOptions options(order);
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result, CallFunction("array_sort_indices", {Datum(values)}, &options, ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
+ SortOrder order, ExecContext* ctx) {
+ SortOptions options({SortKey("not-used", order)});
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result, CallFunction("sort_indices", {Datum(chunked_array)}, &options, ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
+ ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result,
+ CallFunction("sort_indices", {datum}, &options, ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<Array>> Unique(const Datum& value, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("unique", {value}, ctx));
+ return result.make_array();
+}
+
+Result<Datum> DictionaryEncode(const Datum& value, const DictionaryEncodeOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("dictionary_encode", {value}, &options, ctx);
+}
+
+const char kValuesFieldName[] = "values";
+const char kCountsFieldName[] = "counts";
+const int32_t kValuesFieldIndex = 0;
+const int32_t kCountsFieldIndex = 1;
+
+Result<std::shared_ptr<StructArray>> ValueCounts(const Datum& value, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("value_counts", {value}, ctx));
+ return checked_pointer_cast<StructArray>(result.make_array());
+}
+
+// ----------------------------------------------------------------------
+// Filter- and take-related selection functions
+
+Result<Datum> Filter(const Datum& values, const Datum& filter,
+ const FilterOptions& options, ExecContext* ctx) {
+ // Invoke metafunction which deals with Datum kinds other than just Array,
+ // ChunkedArray.
+ return CallFunction("filter", {values, filter}, &options, ctx);
+}
+
+Result<Datum> Take(const Datum& values, const Datum& indices,
+                   const TakeOptions& options, ExecContext* ctx) {
+  // Invoke metafunction which deals with Datum kinds other than just Array,
+  // ChunkedArray.
+  return CallFunction("take", {values, indices}, &options, ctx);
+}
+
+Result<std::shared_ptr<Array>> Take(const Array& values, const Array& indices,
+ const TakeOptions& options, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum out, Take(Datum(values), Datum(indices), options, ctx));
+ return out.make_array();
+}
+
+// ----------------------------------------------------------------------
+// Deprecated functions
+
+Result<std::shared_ptr<ChunkedArray>> Take(const ChunkedArray& values,
+ const Array& indices,
+ const TakeOptions& options, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(values), Datum(indices), options, ctx));
+ return result.chunked_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> Take(const ChunkedArray& values,
+ const ChunkedArray& indices,
+ const TakeOptions& options, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(values), Datum(indices), options, ctx));
+ return result.chunked_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> Take(const Array& values,
+ const ChunkedArray& indices,
+ const TakeOptions& options, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(values), Datum(indices), options, ctx));
+ return result.chunked_array();
+}
+
+Result<std::shared_ptr<RecordBatch>> Take(const RecordBatch& batch, const Array& indices,
+ const TakeOptions& options, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(batch), Datum(indices), options, ctx));
+ return result.record_batch();
+}
+
+Result<std::shared_ptr<Table>> Take(const Table& table, const Array& indices,
+ const TakeOptions& options, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(table), Datum(indices), options, ctx));
+ return result.table();
+}
+
+Result<std::shared_ptr<Table>> Take(const Table& table, const ChunkedArray& indices,
+ const TakeOptions& options, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(table), Datum(indices), options, ctx));
+ return result.table();
+}
+
+Result<std::shared_ptr<Array>> SortToIndices(const Array& values, ExecContext* ctx) {
+ return SortIndices(values, SortOrder::Ascending, ctx);
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h
new file mode 100644
index 00000000000..9d8d4271db8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h
@@ -0,0 +1,410 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/compute/function.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace compute {
+
+class ExecContext;
+
+/// \addtogroup compute-concrete-options
+/// @{
+
+class ARROW_EXPORT FilterOptions : public FunctionOptions {
+ public:
+ /// Configure the action taken when a slot of the selection mask is null
+ enum NullSelectionBehavior {
+ /// the corresponding filtered value will be removed in the output
+ DROP,
+ /// the corresponding filtered value will be null in the output
+ EMIT_NULL,
+ };
+
+ explicit FilterOptions(NullSelectionBehavior null_selection = DROP);
+ constexpr static char const kTypeName[] = "FilterOptions";
+ static FilterOptions Defaults() { return FilterOptions(); }
+
+ NullSelectionBehavior null_selection_behavior = DROP;
+};
+
+class ARROW_EXPORT TakeOptions : public FunctionOptions {
+ public:
+ explicit TakeOptions(bool boundscheck = true);
+ constexpr static char const kTypeName[] = "TakeOptions";
+ static TakeOptions BoundsCheck() { return TakeOptions(true); }
+ static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
+ static TakeOptions Defaults() { return BoundsCheck(); }
+
+ bool boundscheck = true;
+};
+
+/// \brief Options for the dictionary encode function
+class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions {
+ public:
+ /// Configure how null values will be encoded
+ enum NullEncodingBehavior {
+ /// the null value will be added to the dictionary with a proper index
+ ENCODE,
+ /// the null value will be masked in the indices array
+ MASK
+ };
+
+ explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK);
+ constexpr static char const kTypeName[] = "DictionaryEncodeOptions";
+ static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); }
+
+ NullEncodingBehavior null_encoding_behavior = MASK;
+};
+
+enum class SortOrder {
+ Ascending,
+ Descending,
+};
+
+/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
+class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
+ public:
+ explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending)
+ : name(name), order(order) {}
+
+ using util::EqualityComparable<SortKey>::Equals;
+ using util::EqualityComparable<SortKey>::operator==;
+ using util::EqualityComparable<SortKey>::operator!=;
+ bool Equals(const SortKey& other) const;
+ std::string ToString() const;
+
+ /// The name of the sort column.
+ std::string name;
+ /// How to order by this sort key.
+ SortOrder order;
+};
+
+class ARROW_EXPORT ArraySortOptions : public FunctionOptions {
+ public:
+ explicit ArraySortOptions(SortOrder order = SortOrder::Ascending);
+ constexpr static char const kTypeName[] = "ArraySortOptions";
+ static ArraySortOptions Defaults() { return ArraySortOptions{}; }
+
+ SortOrder order;
+};
+
+class ARROW_EXPORT SortOptions : public FunctionOptions {
+ public:
+ explicit SortOptions(std::vector<SortKey> sort_keys = {});
+ constexpr static char const kTypeName[] = "SortOptions";
+ static SortOptions Defaults() { return SortOptions{}; }
+
+ std::vector<SortKey> sort_keys;
+};
+
+/// \brief Partitioning options for NthToIndices
+class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
+ public:
+ explicit PartitionNthOptions(int64_t pivot);
+ PartitionNthOptions() : PartitionNthOptions(0) {}
+ constexpr static char const kTypeName[] = "PartitionNthOptions";
+
+ /// The index into the equivalent sorted array of the partition pivot element.
+ int64_t pivot;
+};
+
+/// @}
+
+/// \brief Filter with a boolean selection filter
+///
+/// The output will be populated with values from the input at positions
+/// where the selection filter is not 0. Nulls in the filter will be handled
+/// based on options.null_selection_behavior.
+///
+/// For example given values = ["a", "b", "c", null, "e", "f"] and
+/// filter = [0, 1, 1, 0, null, 1], the output will be
+/// (null_selection_behavior == DROP) = ["b", "c", "f"]
+/// (null_selection_behavior == EMIT_NULL) = ["b", "c", null, "f"]
+///
+/// \param[in] values array to filter
+/// \param[in] filter indicates which values should be filtered out
+/// \param[in] options configures null_selection_behavior
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+ARROW_EXPORT
+Result<Datum> Filter(const Datum& values, const Datum& filter,
+ const FilterOptions& options = FilterOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
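+
+// Illustrative sketch, mirroring the example above (not part of the
+// upstream sources; error handling elided):
+//
+//   arrow::Datum out = arrow::compute::Filter(
+//       arrow::Datum(values), arrow::Datum(filter),
+//       arrow::compute::FilterOptions(
+//           arrow::compute::FilterOptions::EMIT_NULL)).ValueOrDie();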
+
+namespace internal {
+
+// These internal functions are implemented in kernels/vector_selection.cc
+
+/// \brief Return the number of selected indices in the boolean filter
+ARROW_EXPORT
+int64_t GetFilterOutputSize(const ArrayData& filter,
+ FilterOptions::NullSelectionBehavior null_selection);
+
+/// \brief Compute uint64 selection indices for use with Take given a boolean
+/// filter
+ARROW_EXPORT
+Result<std::shared_ptr<ArrayData>> GetTakeIndices(
+ const ArrayData& filter, FilterOptions::NullSelectionBehavior null_selection,
+ MemoryPool* memory_pool = default_memory_pool());
+
+} // namespace internal
+
+/// \brief ReplaceWithMask replaces each value in the array corresponding
+/// to a true value in the mask with the next element from `replacements`.
+///
+/// \param[in] values Array input to replace
+/// \param[in] mask Array or Scalar of Boolean mask values
+/// \param[in] replacements The replacement values to draw from. There must
+/// be as many replacement values as true values in the mask.
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
+ const Datum& replacements, ExecContext* ctx = NULLPTR);
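+
+// Illustrative sketch (not part of the upstream sources; error handling
+// elided; `replacements` must contain one value per true slot in `mask`):
+//
+//   arrow::Datum out = arrow::compute::ReplaceWithMask(
+//       arrow::Datum(values), arrow::Datum(mask),
+//       arrow::Datum(replacements)).ValueOrDie();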
+
+/// \brief Take from an array of values at indices in another array
+///
+/// The output array will be of the same type as the input values
+/// array, with elements taken from the values array at the given
+/// indices. If an index is null then the taken element will be null.
+///
+/// For example given values = ["a", "b", "c", null, "e", "f"] and
+/// indices = [2, 1, null, 3], the output will be
+/// = [values[2], values[1], null, values[3]]
+/// = ["c", "b", null, null]
+///
+/// \param[in] values datum from which to take
+/// \param[in] indices which values to take
+/// \param[in] options options
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+ARROW_EXPORT
+Result<Datum> Take(const Datum& values, const Datum& indices,
+ const TakeOptions& options = TakeOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Take with Array inputs and output
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Take(const Array& values, const Array& indices,
+ const TakeOptions& options = TakeOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
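+
+// Illustrative sketch using the Array overload (not part of the upstream
+// sources; error handling elided; `values` and `indices` are
+// std::shared_ptr<arrow::Array>):
+//
+//   std::shared_ptr<arrow::Array> taken =
+//       arrow::compute::Take(*values, *indices).ValueOrDie();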
+
+/// \brief Returns indices that partition an array around n-th
+/// sorted element.
+///
+/// Find the index of the n-th (0-based) smallest value and perform an
+/// indirect partition of the array around that element. Output indices
+/// [0 ~ n-1] hold values no greater than the n-th element, and indices
+/// [n+1 ~ end] hold values no less than the n-th element. Elements in each
+/// partition are not sorted. Nulls will be partitioned to the end of the
+/// output. Output is not guaranteed to be stable.
+///
+/// \param[in] values array to be partitioned
+/// \param[in] n index of the sorted element to pivot the partition around
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would partition an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Returns the indices that would sort an array in the
+/// specified order.
+///
+/// Perform an indirect sort of array. The output array will contain
+/// indices that would sort an array, which would be the same length
+/// as input. Nulls will be stably partitioned to the end of the output
+/// regardless of order.
+///
+/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order
+/// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0,
+/// 3].
+///
+/// \param[in] array array to sort
+/// \param[in] order ascending or descending
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would sort an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const Array& array,
+ SortOrder order = SortOrder::Ascending,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Returns the indices that would sort a chunked array in the
+/// specified order.
+///
+/// Perform an indirect sort of chunked array. The output array will
+/// contain indices that would sort a chunked array, which would be
+/// the same length as input. Nulls will be stably partitioned to the
+/// end of the output regardless of order.
+///
+/// For example given chunked_array = [[null, 1], [3.3], [null, 2,
+/// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2,
+/// 4, 1, 0, 3].
+///
+/// \param[in] chunked_array chunked array to sort
+/// \param[in] order ascending or descending
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would sort an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
+ SortOrder order = SortOrder::Ascending,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Returns the indices that would sort an input in the
+/// specified order. Input is one of array, chunked array, record batch,
+/// or table.
+///
+/// Perform an indirect sort of input. The output array will contain
+/// indices that would sort an input, which would be the same length
+/// as input. Nulls will be stably partitioned to the end of the
+/// output regardless of order.
+///
+/// For example given input (table) = {
+/// "column1": [[null, 1], [ 3, null, 2, 1]],
+/// "column2": [[ 5], [3, null, null, 5, 5]],
+/// } and options = {
+/// {"column1", SortOrder::Ascending},
+/// {"column2", SortOrder::Descending},
+/// }, the output will be [5, 1, 4, 2, 0, 3].
+///
+/// \param[in] datum array, chunked array, record batch or table to sort
+/// \param[in] options options
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would sort a table
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
+ ExecContext* ctx = NULLPTR);
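+
+// Illustrative sketch of the multi-key case above (not part of the upstream
+// sources; error handling elided; `table` is a std::shared_ptr<arrow::Table>):
+//
+//   arrow::compute::SortOptions opts(
+//       {arrow::compute::SortKey("column1", arrow::compute::SortOrder::Ascending),
+//        arrow::compute::SortKey("column2", arrow::compute::SortOrder::Descending)});
+//   std::shared_ptr<arrow::Array> indices =
+//       arrow::compute::SortIndices(arrow::Datum(table), opts).ValueOrDie();
+//   // `indices` can then be passed to Take() to materialize the sorted rows.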
+
+/// \brief Compute unique elements from an array-like object
+///
+/// Note if a null occurs in the input it will NOT be included in the output.
+///
+/// \param[in] datum array-like input
+/// \param[in] ctx the function execution context, optional
+/// \return result as Array
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Unique(const Datum& datum, ExecContext* ctx = NULLPTR);
+
+// Constants for accessing the output of ValueCounts
+ARROW_EXPORT extern const char kValuesFieldName[];
+ARROW_EXPORT extern const char kCountsFieldName[];
+ARROW_EXPORT extern const int32_t kValuesFieldIndex;
+ARROW_EXPORT extern const int32_t kCountsFieldIndex;
+
+/// \brief Return counts of unique elements from an array-like object.
+///
+/// Note that the counts do not include counts for nulls in the array. These can be
+/// obtained separately from metadata.
+///
+/// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values
+/// which can lead to unexpected results if the input Array has these values.
+///
+/// \param[in] value array-like input
+/// \param[in] ctx the function execution context, optional
+/// \return counts An array of <input type "Values", int64_t "Counts"> structs.
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> ValueCounts(const Datum& value,
+ ExecContext* ctx = NULLPTR);
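+
+// Illustrative sketch (not part of the upstream sources; error handling
+// elided): unpack the struct result via the exported field names.
+//
+//   std::shared_ptr<arrow::StructArray> counts =
+//       arrow::compute::ValueCounts(arrow::Datum(input)).ValueOrDie();
+//   auto uniques = counts->GetFieldByName(arrow::compute::kValuesFieldName);
+//   auto tallies = counts->GetFieldByName(arrow::compute::kCountsFieldName);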
+
+/// \brief Dictionary-encode values in an array-like object
+///
+/// Any nulls encountered in the dictionary will be handled according to the
+/// specified null encoding behavior.
+///
+/// For example, given values ["a", "b", null, "a", null] the output will be
+/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null]
+/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"]
+///
+/// If the input is already dictionary encoded this function is a no-op unless
+/// it needs to modify the null_encoding (TODO)
+///
+/// \param[in] data array-like input
+/// \param[in] options configures null encoding behavior
+/// \param[in] ctx the function execution context, optional
+/// \return result with same shape and type as input
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> DictionaryEncode(
+ const Datum& data,
+ const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
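+
+// Illustrative sketch, matching the ENCODE example above (not part of the
+// upstream sources; error handling elided):
+//
+//   arrow::Datum encoded = arrow::compute::DictionaryEncode(
+//       arrow::Datum(strings),
+//       arrow::compute::DictionaryEncodeOptions(
+//           arrow::compute::DictionaryEncodeOptions::ENCODE)).ValueOrDie();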
+
+// ----------------------------------------------------------------------
+// Deprecated functions
+
+ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
+ARROW_EXPORT
+Result<std::shared_ptr<ChunkedArray>> Take(
+ const ChunkedArray& values, const Array& indices,
+ const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR);
+
+ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
+ARROW_EXPORT
+Result<std::shared_ptr<ChunkedArray>> Take(
+ const ChunkedArray& values, const ChunkedArray& indices,
+ const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR);
+
+ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
+ARROW_EXPORT
+Result<std::shared_ptr<ChunkedArray>> Take(
+ const Array& values, const ChunkedArray& indices,
+ const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR);
+
+ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> Take(
+ const RecordBatch& batch, const Array& indices,
+ const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR);
+
+ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
+ARROW_EXPORT
+Result<std::shared_ptr<Table>> Take(const Table& table, const Array& indices,
+ const TakeOptions& options = TakeOptions::Defaults(),
+ ExecContext* context = NULLPTR);
+
+ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
+ARROW_EXPORT
+Result<std::shared_ptr<Table>> Take(const Table& table, const ChunkedArray& indices,
+ const TakeOptions& options = TakeOptions::Defaults(),
+ ExecContext* context = NULLPTR);
+
+ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()")
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortToIndices(const Array& values,
+ ExecContext* ctx = NULLPTR);
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc
new file mode 100644
index 00000000000..4de68ba8d90
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc
@@ -0,0 +1,273 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/cast.h"
+
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "arrow/compute/cast_internal.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/reflection_internal.h"
+
+namespace arrow {
+
+using internal::ToTypeName;
+
+namespace compute {
+namespace internal {
+
+// ----------------------------------------------------------------------
+// Function options
+
+namespace {
+
+std::unordered_map<int, std::shared_ptr<CastFunction>> g_cast_table;
+std::once_flag cast_table_initialized;
+
+void AddCastFunctions(const std::vector<std::shared_ptr<CastFunction>>& funcs) {
+ for (const auto& func : funcs) {
+ g_cast_table[static_cast<int>(func->out_type_id())] = func;
+ }
+}
+
+void InitCastTable() {
+ AddCastFunctions(GetBooleanCasts());
+ AddCastFunctions(GetBinaryLikeCasts());
+ AddCastFunctions(GetNestedCasts());
+ AddCastFunctions(GetNumericCasts());
+ AddCastFunctions(GetTemporalCasts());
+ AddCastFunctions(GetDictionaryCasts());
+}
+
+void EnsureInitCastTable() { std::call_once(cast_table_initialized, InitCastTable); }
+
+// Private version of GetCastFunction with better error reporting
+// if the input type is known.
+Result<std::shared_ptr<CastFunction>> GetCastFunctionInternal(
+ const std::shared_ptr<DataType>& to_type, const DataType* from_type = nullptr) {
+ internal::EnsureInitCastTable();
+ auto it = internal::g_cast_table.find(static_cast<int>(to_type->id()));
+ if (it == internal::g_cast_table.end()) {
+ if (from_type != nullptr) {
+ return Status::NotImplemented("Unsupported cast from ", *from_type, " to ",
+ *to_type,
+ " (no available cast function for target type)");
+ } else {
+ return Status::NotImplemented("Unsupported cast to ", *to_type,
+ " (no available cast function for target type)");
+ }
+ }
+ return it->second;
+}
+
+const FunctionDoc cast_doc{"Cast values to another data type",
+ ("Behavior when values wouldn't fit in the target type\n"
+ "can be controlled through CastOptions."),
+ {"input"},
+ "CastOptions"};
+
+// Metafunction for dispatching to the appropriate CastFunction. This
+// corresponds to the standard SQL CAST(expr AS target_type)
+class CastMetaFunction : public MetaFunction {
+ public:
+ CastMetaFunction() : MetaFunction("cast", Arity::Unary(), &cast_doc) {}
+
+ Result<const CastOptions*> ValidateOptions(const FunctionOptions* options) const {
+ auto cast_options = static_cast<const CastOptions*>(options);
+
+ if (cast_options == nullptr || cast_options->to_type == nullptr) {
+ return Status::Invalid(
+ "Cast requires that options be passed with "
+ "the to_type populated");
+ }
+
+ return cast_options;
+ }
+
+ Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const override {
+ ARROW_ASSIGN_OR_RAISE(auto cast_options, ValidateOptions(options));
+ if (args[0].type()->Equals(*cast_options->to_type)) {
+ return args[0];
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<CastFunction> cast_func,
+ GetCastFunctionInternal(cast_options->to_type, args[0].type().get()));
+ return cast_func->Execute(args, options, ctx);
+ }
+};
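+
+// A dispatch sketch (illustrative, not original code); `input` and `ctx` are
+// placeholders. Calling "cast" through the registry routes into this
+// metafunction, equivalent to the Cast() wrappers declared in cast.h:
+//
+//   CastOptions options = CastOptions::Safe(int64());
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//                         CallFunction("cast", {input}, &options, ctx));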
+
+static auto kCastOptionsType = GetFunctionOptionsType<CastOptions>(
+ arrow::internal::DataMember("to_type", &CastOptions::to_type),
+ arrow::internal::DataMember("allow_int_overflow", &CastOptions::allow_int_overflow),
+ arrow::internal::DataMember("allow_time_truncate", &CastOptions::allow_time_truncate),
+ arrow::internal::DataMember("allow_time_overflow", &CastOptions::allow_time_overflow),
+ arrow::internal::DataMember("allow_decimal_truncate",
+ &CastOptions::allow_decimal_truncate),
+ arrow::internal::DataMember("allow_float_truncate",
+ &CastOptions::allow_float_truncate),
+ arrow::internal::DataMember("allow_invalid_utf8", &CastOptions::allow_invalid_utf8));
+} // namespace
+
+void RegisterScalarCast(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunction(std::make_shared<CastMetaFunction>()));
+ DCHECK_OK(registry->AddFunctionOptionsType(kCastOptionsType));
+}
+} // namespace internal
+
+CastOptions::CastOptions(bool safe)
+ : FunctionOptions(internal::kCastOptionsType),
+ allow_int_overflow(!safe),
+ allow_time_truncate(!safe),
+ allow_time_overflow(!safe),
+ allow_decimal_truncate(!safe),
+ allow_float_truncate(!safe),
+ allow_invalid_utf8(!safe) {}
+
+constexpr char CastOptions::kTypeName[];
+
+CastFunction::CastFunction(std::string name, Type::type out_type_id)
+ : ScalarFunction(std::move(name), Arity::Unary(), /*doc=*/nullptr),
+ out_type_id_(out_type_id) {}
+
+Status CastFunction::AddKernel(Type::type in_type_id, ScalarKernel kernel) {
+ // We use the same KernelInit for every cast
+ kernel.init = internal::CastState::Init;
+ RETURN_NOT_OK(ScalarFunction::AddKernel(kernel));
+ in_type_ids_.push_back(in_type_id);
+ return Status::OK();
+}
+
+Status CastFunction::AddKernel(Type::type in_type_id, std::vector<InputType> in_types,
+ OutputType out_type, ArrayKernelExec exec,
+ NullHandling::type null_handling,
+ MemAllocation::type mem_allocation) {
+ ScalarKernel kernel;
+ kernel.signature = KernelSignature::Make(std::move(in_types), std::move(out_type));
+ kernel.exec = exec;
+ kernel.null_handling = null_handling;
+ kernel.mem_allocation = mem_allocation;
+ return AddKernel(in_type_id, std::move(kernel));
+}
+
+Result<const Kernel*> CastFunction::DispatchExact(
+ const std::vector<ValueDescr>& values) const {
+ RETURN_NOT_OK(CheckArity(values));
+
+ std::vector<const ScalarKernel*> candidate_kernels;
+ for (const auto& kernel : kernels_) {
+ if (kernel.signature->MatchesInputs(values)) {
+ candidate_kernels.push_back(&kernel);
+ }
+ }
+
+ if (candidate_kernels.size() == 0) {
+ return Status::NotImplemented("Unsupported cast from ", values[0].type->ToString(),
+ " to ", ToTypeName(out_type_id_), " using function ",
+ this->name());
+ }
+
+ if (candidate_kernels.size() == 1) {
+ // One match, return it
+ return candidate_kernels[0];
+ }
+
+  // Now we are in a casting scenario where we may have both an EXACT_TYPE and
+  // a SAME_TYPE_ID match. So we check for an exact match among the candidate
+  // kernels and, if there is none, just return the first one
+ for (auto kernel : candidate_kernels) {
+ const InputType& arg0 = kernel->signature->in_types()[0];
+ if (arg0.kind() == InputType::EXACT_TYPE) {
+ // Bingo. Return it
+ return kernel;
+ }
+ }
+
+ // We didn't find an exact match. So just return some kernel that matches
+ return candidate_kernels[0];
+}
+
+Result<Datum> Cast(const Datum& value, const CastOptions& options, ExecContext* ctx) {
+ return CallFunction("cast", {value}, &options, ctx);
+}
+
+Result<Datum> Cast(const Datum& value, std::shared_ptr<DataType> to_type,
+ const CastOptions& options, ExecContext* ctx) {
+ CastOptions options_with_to_type = options;
+ options_with_to_type.to_type = to_type;
+ return Cast(value, options_with_to_type, ctx);
+}
+
+Result<std::shared_ptr<Array>> Cast(const Array& value, std::shared_ptr<DataType> to_type,
+ const CastOptions& options, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result, Cast(Datum(value), to_type, options, ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<CastFunction>> GetCastFunction(
+ const std::shared_ptr<DataType>& to_type) {
+ return internal::GetCastFunctionInternal(to_type);
+}
+
+bool CanCast(const DataType& from_type, const DataType& to_type) {
+ internal::EnsureInitCastTable();
+ auto it = internal::g_cast_table.find(static_cast<int>(to_type.id()));
+ if (it == internal::g_cast_table.end()) {
+ return false;
+ }
+
+ const CastFunction* function = it->second.get();
+ DCHECK_EQ(function->out_type_id(), to_type.id());
+
+ for (auto from_id : function->in_type_ids()) {
+ // XXX should probably check the output type as well
+ if (from_type.id() == from_id) return true;
+ }
+
+ return false;
+}
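+
+// A usage sketch (illustrative): probe for cast availability before calling
+// Cast.
+//
+//   if (CanCast(*utf8(), *int64())) {
+//     // a cast function targeting int64 accepts utf8 input
+//   }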
+
+Result<std::vector<Datum>> Cast(std::vector<Datum> datums, std::vector<ValueDescr> descrs,
+ ExecContext* ctx) {
+ for (size_t i = 0; i != datums.size(); ++i) {
+ if (descrs[i] != datums[i].descr()) {
+ if (descrs[i].shape != datums[i].shape()) {
+ return Status::NotImplemented("casting between Datum shapes");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(datums[i],
+ Cast(datums[i], CastOptions::Safe(descrs[i].type), ctx));
+ }
+ }
+
+ return datums;
+}
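+
+// A usage sketch (illustrative); `a` and `b` are placeholder Datums to be
+// aligned to int64 arrays:
+//
+//   std::vector<ValueDescr> descrs = {ValueDescr::Array(int64()),
+//                                     ValueDescr::Array(int64())};
+//   ARROW_ASSIGN_OR_RAISE(std::vector<Datum> casted, Cast({a, b}, descrs));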
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h
new file mode 100644
index 00000000000..131f57f892f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h
@@ -0,0 +1,167 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/compute/function.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+
+namespace compute {
+
+class ExecContext;
+
+/// \addtogroup compute-concrete-options
+/// @{
+
+class ARROW_EXPORT CastOptions : public FunctionOptions {
+ public:
+ explicit CastOptions(bool safe = true);
+
+ constexpr static char const kTypeName[] = "CastOptions";
+ static CastOptions Safe(std::shared_ptr<DataType> to_type = NULLPTR) {
+ CastOptions safe(true);
+ safe.to_type = std::move(to_type);
+ return safe;
+ }
+
+ static CastOptions Unsafe(std::shared_ptr<DataType> to_type = NULLPTR) {
+ CastOptions unsafe(false);
+ unsafe.to_type = std::move(to_type);
+ return unsafe;
+ }
+
+  // Type being cast to. May be passed separately to the eager function
+  // compute::Cast
+ std::shared_ptr<DataType> to_type;
+
+ bool allow_int_overflow;
+ bool allow_time_truncate;
+ bool allow_time_overflow;
+ bool allow_decimal_truncate;
+ bool allow_float_truncate;
+  // Indicates whether conversions from Binary/FixedSizeBinary to string must
+  // validate the utf8 payload.
+ bool allow_invalid_utf8;
+};
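+
+// A construction sketch (illustrative, not part of the original header):
+//
+//   auto opts = CastOptions::Unsafe(timestamp(TimeUnit::SECOND));
+//   // every allow_* flag above is true for Unsafe() and false for Safe()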
+
+/// @}
+
+// Cast functions are _not_ registered in the FunctionRegistry, though they use
+// the same execution machinery
+class CastFunction : public ScalarFunction {
+ public:
+ CastFunction(std::string name, Type::type out_type_id);
+
+ Type::type out_type_id() const { return out_type_id_; }
+ const std::vector<Type::type>& in_type_ids() const { return in_type_ids_; }
+
+ Status AddKernel(Type::type in_type_id, std::vector<InputType> in_types,
+ OutputType out_type, ArrayKernelExec exec,
+ NullHandling::type = NullHandling::INTERSECTION,
+ MemAllocation::type = MemAllocation::PREALLOCATE);
+
+  // Note: this overload sets the kernel's init function to CastState::Init,
+  // the shared options-wrapping state used by every cast
+ Status AddKernel(Type::type in_type_id, ScalarKernel kernel);
+
+ Result<const Kernel*> DispatchExact(
+ const std::vector<ValueDescr>& values) const override;
+
+ private:
+ std::vector<Type::type> in_type_ids_;
+ const Type::type out_type_id_;
+};
+
+ARROW_EXPORT
+Result<std::shared_ptr<CastFunction>> GetCastFunction(
+ const std::shared_ptr<DataType>& to_type);
+
+/// \brief Return true if a cast function is defined
+ARROW_EXPORT
+bool CanCast(const DataType& from_type, const DataType& to_type);
+
+// ----------------------------------------------------------------------
+// Convenience invocation APIs for a number of kernels
+
+/// \brief Cast from one array type to another
+/// \param[in] value array to cast
+/// \param[in] to_type type to cast to
+/// \param[in] options casting options
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting array
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Cast(const Array& value, std::shared_ptr<DataType> to_type,
+ const CastOptions& options = CastOptions::Safe(),
+ ExecContext* ctx = NULLPTR);
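+
+// A usage sketch (illustrative); `int32_array` is a placeholder for an Array
+// of int32 values, widened here with the default safe options:
+//
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> out,
+//                         Cast(*int32_array, int64()));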
+
+/// \brief Cast from one array type to another
+/// \param[in] value array to cast
+/// \param[in] options casting options. The "to_type" field must be populated
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting array
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Cast(const Datum& value, const CastOptions& options,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Cast from one value to another
+/// \param[in] value datum to cast
+/// \param[in] to_type type to cast to
+/// \param[in] options casting options
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Cast(const Datum& value, std::shared_ptr<DataType> to_type,
+ const CastOptions& options = CastOptions::Safe(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Cast several values simultaneously. Safe cast options are used.
+/// \param[in] values datums to cast
+/// \param[in] descrs ValueDescrs to cast to
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datums
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<std::vector<Datum>> Cast(std::vector<Datum> values, std::vector<ValueDescr> descrs,
+ ExecContext* ctx = NULLPTR);
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h
new file mode 100644
index 00000000000..0105d08a573
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "arrow/compute/cast.h" // IWYU pragma: keep
+#include "arrow/compute/kernel.h" // IWYU pragma: keep
+#include "arrow/compute/kernels/codegen_internal.h" // IWYU pragma: keep
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+using CastState = OptionsWrapper<CastOptions>;
+
+// See kernels/scalar_cast_*.cc for these
+std::vector<std::shared_ptr<CastFunction>> GetBooleanCasts();
+std::vector<std::shared_ptr<CastFunction>> GetNumericCasts();
+std::vector<std::shared_ptr<CastFunction>> GetTemporalCasts();
+std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts();
+std::vector<std::shared_ptr<CastFunction>> GetNestedCasts();
+std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts();
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc
new file mode 100644
index 00000000000..63f8d39f551
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc
@@ -0,0 +1,1050 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_primitive.h"
+#include "arrow/array/data.h"
+#include "arrow/array/util.h"
+#include "arrow/buffer.h"
+#include "arrow/chunked_array.h"
+#include "arrow/compute/exec_internal.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/registry.h"
+#include "arrow/compute/util_internal.h"
+#include "arrow/datum.h"
+#include "arrow/pretty_print.h"
+#include "arrow/record_batch.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/vector.h"
+
+namespace arrow {
+
+using internal::BitmapAnd;
+using internal::checked_cast;
+using internal::CopyBitmap;
+using internal::CpuInfo;
+
+namespace compute {
+
+ExecContext* default_exec_context() {
+ static ExecContext default_ctx;
+ return &default_ctx;
+}
+
+ExecBatch::ExecBatch(const RecordBatch& batch)
+ : values(batch.num_columns()), length(batch.num_rows()) {
+ auto columns = batch.column_data();
+ std::move(columns.begin(), columns.end(), values.begin());
+}
+
+bool ExecBatch::Equals(const ExecBatch& other) const {
+ return guarantee == other.guarantee && values == other.values;
+}
+
+void PrintTo(const ExecBatch& batch, std::ostream* os) {
+ *os << "ExecBatch\n";
+
+ static const std::string indent = " ";
+
+ *os << indent << "# Rows: " << batch.length << "\n";
+ if (batch.guarantee != literal(true)) {
+ *os << indent << "Guarantee: " << batch.guarantee.ToString() << "\n";
+ }
+
+ int i = 0;
+ for (const Datum& value : batch.values) {
+ *os << indent << "" << i++ << ": ";
+
+ if (value.is_scalar()) {
+ *os << "Scalar[" << value.scalar()->ToString() << "]\n";
+ continue;
+ }
+
+ auto array = value.make_array();
+ PrettyPrintOptions options;
+ options.skip_new_lines = true;
+ *os << "Array";
+ ARROW_CHECK_OK(PrettyPrint(*array, options, os));
+ *os << "\n";
+ }
+}
+
+ExecBatch ExecBatch::Slice(int64_t offset, int64_t length) const {
+ ExecBatch out = *this;
+ for (auto& value : out.values) {
+ if (value.is_scalar()) continue;
+ value = value.array()->Slice(offset, length);
+ }
+ out.length = length;
+ return out;
+}
+
+Result<ExecBatch> ExecBatch::Make(std::vector<Datum> values) {
+ if (values.empty()) {
+ return Status::Invalid("Cannot infer ExecBatch length without at least one value");
+ }
+
+ int64_t length = -1;
+ for (const auto& value : values) {
+ if (value.is_scalar()) {
+ continue;
+ }
+
+ if (length == -1) {
+ length = value.length();
+ continue;
+ }
+
+ if (length != value.length()) {
+ return Status::Invalid(
+ "Arrays used to construct an ExecBatch must have equal length");
+ }
+ }
+
+ if (length == -1) {
+ length = 1;
+ }
+
+ return ExecBatch(std::move(values), length);
+}
+
+Result<std::shared_ptr<RecordBatch>> ExecBatch::ToRecordBatch(
+ std::shared_ptr<Schema> schema, MemoryPool* pool) const {
+ ArrayVector columns(schema->num_fields());
+
+ for (size_t i = 0; i < columns.size(); ++i) {
+ const Datum& value = values[i];
+ if (value.is_array()) {
+ columns[i] = value.make_array();
+ continue;
+ }
+ ARROW_ASSIGN_OR_RAISE(columns[i], MakeArrayFromScalar(*value.scalar(), length, pool));
+ }
+
+ return RecordBatch::Make(std::move(schema), length, std::move(columns));
+}
+
+namespace {
+
+Result<std::shared_ptr<Buffer>> AllocateDataBuffer(KernelContext* ctx, int64_t length,
+ int bit_width) {
+ if (bit_width == 1) {
+ return ctx->AllocateBitmap(length);
+ } else {
+ int64_t buffer_size = BitUtil::BytesForBits(length * bit_width);
+ return ctx->Allocate(buffer_size);
+ }
+}
+
+struct BufferPreallocation {
+ explicit BufferPreallocation(int bit_width = -1, int added_length = 0)
+ : bit_width(bit_width), added_length(added_length) {}
+
+ int bit_width;
+ int added_length;
+};
+
+void ComputeDataPreallocate(const DataType& type,
+ std::vector<BufferPreallocation>* widths) {
+ if (is_fixed_width(type.id()) && type.id() != Type::NA) {
+ widths->emplace_back(checked_cast<const FixedWidthType&>(type).bit_width());
+ return;
+ }
+ // Preallocate binary and list offsets
+ switch (type.id()) {
+ case Type::BINARY:
+ case Type::STRING:
+ case Type::LIST:
+ case Type::MAP:
+ widths->emplace_back(32, /*added_length=*/1);
+ return;
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ case Type::LARGE_LIST:
+ widths->emplace_back(64, /*added_length=*/1);
+ return;
+ default:
+ break;
+ }
+}
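+
+// Illustrative note (not original): for an int64 output the branch above
+// records a single {bit_width=64} entry, while STRING records
+// {bit_width=32, added_length=1} so the offsets buffer gets one extra slot.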
+
+} // namespace
+
+namespace detail {
+
+Status CheckAllValues(const std::vector<Datum>& values) {
+ for (const auto& value : values) {
+ if (!value.is_value()) {
+ return Status::Invalid("Tried executing function with non-value type: ",
+ value.ToString());
+ }
+ }
+ return Status::OK();
+}
+
+ExecBatchIterator::ExecBatchIterator(std::vector<Datum> args, int64_t length,
+ int64_t max_chunksize)
+ : args_(std::move(args)),
+ position_(0),
+ length_(length),
+ max_chunksize_(max_chunksize) {
+ chunk_indexes_.resize(args_.size(), 0);
+ chunk_positions_.resize(args_.size(), 0);
+}
+
+Result<std::unique_ptr<ExecBatchIterator>> ExecBatchIterator::Make(
+ std::vector<Datum> args, int64_t max_chunksize) {
+ for (const auto& arg : args) {
+ if (!(arg.is_arraylike() || arg.is_scalar())) {
+ return Status::Invalid(
+ "ExecBatchIterator only works with Scalar, Array, and "
+ "ChunkedArray arguments");
+ }
+ }
+
+ // If the arguments are all scalars, then the length is 1
+ int64_t length = 1;
+
+ bool length_set = false;
+ for (auto& arg : args) {
+ if (arg.is_scalar()) {
+ continue;
+ }
+ if (!length_set) {
+ length = arg.length();
+ length_set = true;
+ } else {
+ if (arg.length() != length) {
+ return Status::Invalid("Array arguments must all be the same length");
+ }
+ }
+ }
+
+ max_chunksize = std::min(length, max_chunksize);
+
+ return std::unique_ptr<ExecBatchIterator>(
+ new ExecBatchIterator(std::move(args), length, max_chunksize));
+}
+
+bool ExecBatchIterator::Next(ExecBatch* batch) {
+ if (position_ == length_) {
+ return false;
+ }
+
+ // Determine how large the common contiguous "slice" of all the arguments is
+ int64_t iteration_size = std::min(length_ - position_, max_chunksize_);
+
+ // If length_ is 0, then this loop will never execute
+ for (size_t i = 0; i < args_.size() && iteration_size > 0; ++i) {
+ // If the argument is not a chunked array, it's either a Scalar or Array,
+ // in which case it doesn't influence the size of this batch. Note that if
+ // the args are all scalars the batch length is 1
+ if (args_[i].kind() != Datum::CHUNKED_ARRAY) {
+ continue;
+ }
+ const ChunkedArray& arg = *args_[i].chunked_array();
+ std::shared_ptr<Array> current_chunk;
+ while (true) {
+ current_chunk = arg.chunk(chunk_indexes_[i]);
+ if (chunk_positions_[i] == current_chunk->length()) {
+ // Chunk is zero-length, or was exhausted in the previous iteration
+ chunk_positions_[i] = 0;
+ ++chunk_indexes_[i];
+ continue;
+ }
+ break;
+ }
+ iteration_size =
+ std::min(current_chunk->length() - chunk_positions_[i], iteration_size);
+ }
+
+ // Now, fill the batch
+ batch->values.resize(args_.size());
+ batch->length = iteration_size;
+ for (size_t i = 0; i < args_.size(); ++i) {
+ if (args_[i].is_scalar()) {
+ batch->values[i] = args_[i].scalar();
+ } else if (args_[i].is_array()) {
+ batch->values[i] = args_[i].array()->Slice(position_, iteration_size);
+ } else {
+ const ChunkedArray& carr = *args_[i].chunked_array();
+ const auto& chunk = carr.chunk(chunk_indexes_[i]);
+ batch->values[i] = chunk->data()->Slice(chunk_positions_[i], iteration_size);
+ chunk_positions_[i] += iteration_size;
+ }
+ }
+ position_ += iteration_size;
+ DCHECK_LE(position_, length_);
+ return true;
+}
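+
+// A usage sketch (illustrative, not original code): drive chunked execution
+// over a set of kernel arguments `args`:
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       std::unique_ptr<ExecBatchIterator> it,
+//       ExecBatchIterator::Make(args, /*max_chunksize=*/1 << 16));
+//   ExecBatch batch;
+//   while (it->Next(&batch)) {
+//     // each iteration sees a contiguous slice of every argument
+//   }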
+
+namespace {
+
+struct NullGeneralization {
+ enum type { PERHAPS_NULL, ALL_VALID, ALL_NULL };
+
+ static type Get(const Datum& datum) {
+ if (datum.type()->id() == Type::NA) {
+ return ALL_NULL;
+ }
+
+ if (datum.is_scalar()) {
+ return datum.scalar()->is_valid ? ALL_VALID : ALL_NULL;
+ }
+
+ const auto& arr = *datum.array();
+
+ // Do not count the bits if they haven't been counted already
+ const int64_t known_null_count = arr.null_count.load();
+ if ((known_null_count == 0) || (arr.buffers[0] == NULLPTR)) {
+ return ALL_VALID;
+ }
+
+ if (known_null_count == arr.length) {
+ return ALL_NULL;
+ }
+
+ return PERHAPS_NULL;
+ }
+};
+
+// Null propagation implementation that deals with both preallocated bitmaps
+// and bitmaps that may still have to be allocated
+//
+// If the bitmap is preallocated, it MUST be populated (since it might be a
+// view of a much larger bitmap). If it isn't preallocated, then we have
+// more flexibility.
+//
+// * If the batch has no nulls, then we do nothing
+// * If only a single array has nulls, and its offset is a multiple of 8,
+// then we can zero-copy the bitmap into the output
+// * Otherwise, we allocate the bitmap and populate it
+class NullPropagator {
+ public:
+ NullPropagator(KernelContext* ctx, const ExecBatch& batch, ArrayData* output)
+ : ctx_(ctx), batch_(batch), output_(output) {
+ for (const Datum& datum : batch_.values) {
+ auto null_generalization = NullGeneralization::Get(datum);
+
+ if (null_generalization == NullGeneralization::ALL_NULL) {
+ is_all_null_ = true;
+ }
+
+ if (null_generalization != NullGeneralization::ALL_VALID &&
+ datum.kind() == Datum::ARRAY) {
+ arrays_with_nulls_.push_back(datum.array().get());
+ }
+ }
+
+ if (output->buffers[0] != nullptr) {
+ bitmap_preallocated_ = true;
+ SetBitmap(output_->buffers[0].get());
+ }
+ }
+
+ void SetBitmap(Buffer* bitmap) { bitmap_ = bitmap->mutable_data(); }
+
+ Status EnsureAllocated() {
+ if (bitmap_preallocated_) {
+ return Status::OK();
+ }
+ ARROW_ASSIGN_OR_RAISE(output_->buffers[0], ctx_->AllocateBitmap(output_->length));
+ SetBitmap(output_->buffers[0].get());
+ return Status::OK();
+ }
+
+ Status AllNullShortCircuit() {
+ // OK, the output should be all null
+ output_->null_count = output_->length;
+
+ if (bitmap_preallocated_) {
+ BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
+ return Status::OK();
+ }
+
+ // Walk all the values with nulls instead of breaking on the first in case
+ // we find a bitmap that can be reused in the non-preallocated case
+ for (const ArrayData* arr : arrays_with_nulls_) {
+ if (arr->null_count.load() == arr->length && arr->buffers[0] != nullptr) {
+ // Reuse this all null bitmap
+ output_->buffers[0] = arr->buffers[0];
+ return Status::OK();
+ }
+ }
+
+ RETURN_NOT_OK(EnsureAllocated());
+ BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
+ return Status::OK();
+ }
+
+ Status PropagateSingle() {
+ // One array
+ const ArrayData& arr = *arrays_with_nulls_[0];
+ const std::shared_ptr<Buffer>& arr_bitmap = arr.buffers[0];
+
+ // Reuse the null count if it's known
+ output_->null_count = arr.null_count.load();
+
+ if (bitmap_preallocated_) {
+ CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, output_->offset);
+ return Status::OK();
+ }
+
+    // Three cases when memory was not pre-allocated:
+ //
+ // * Offset is zero: we reuse the bitmap as is
+ // * Offset is nonzero but a multiple of 8: we can slice the bitmap
+ // * Offset is not a multiple of 8: we must allocate and use CopyBitmap
+ //
+ // Keep in mind that output_->offset is not permitted to be nonzero when
+ // the bitmap is not preallocated, and that precondition is asserted
+ // higher in the call stack.
+ if (arr.offset == 0) {
+ output_->buffers[0] = arr_bitmap;
+ } else if (arr.offset % 8 == 0) {
+ output_->buffers[0] =
+ SliceBuffer(arr_bitmap, arr.offset / 8, BitUtil::BytesForBits(arr.length));
+ } else {
+ RETURN_NOT_OK(EnsureAllocated());
+ CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_,
+ /*dst_offset=*/0);
+ }
+ return Status::OK();
+ }
+
+ Status PropagateMultiple() {
+ // More than one array. We use BitmapAnd to intersect their bitmaps
+
+ // Do not compute the intersection null count until it's needed
+ RETURN_NOT_OK(EnsureAllocated());
+
+ auto Accumulate = [&](const ArrayData& left, const ArrayData& right) {
+ DCHECK(left.buffers[0]);
+ DCHECK(right.buffers[0]);
+ BitmapAnd(left.buffers[0]->data(), left.offset, right.buffers[0]->data(),
+ right.offset, output_->length, output_->offset,
+ output_->buffers[0]->mutable_data());
+ };
+
+ DCHECK_GT(arrays_with_nulls_.size(), 1);
+
+ // Seed the output bitmap with the & of the first two bitmaps
+ Accumulate(*arrays_with_nulls_[0], *arrays_with_nulls_[1]);
+
+ // Accumulate the rest
+ for (size_t i = 2; i < arrays_with_nulls_.size(); ++i) {
+ Accumulate(*output_, *arrays_with_nulls_[i]);
+ }
+ return Status::OK();
+ }
+
+ Status Execute() {
+ if (is_all_null_) {
+ // An all-null value (scalar null or all-null array) gives us a short
+ // circuit opportunity
+ return AllNullShortCircuit();
+ }
+
+ // At this point, by construction we know that all of the values in
+ // arrays_with_nulls_ are arrays that are not all null. So there are a
+ // few cases:
+ //
+ // * No arrays. This is a no-op w/o preallocation but when the bitmap is
+ // pre-allocated we have to fill it with 1's
+ // * One array, whose bitmap can be zero-copied (w/o preallocation, and
+ // when no byte is split) or copied (split byte or w/ preallocation)
+ // * More than one array, we must compute the intersection of all the
+ // bitmaps
+ //
+ // BUT, if the output offset is nonzero for some reason, we copy into the
+ // output unconditionally
+
+ output_->null_count = kUnknownNullCount;
+
+ if (arrays_with_nulls_.empty()) {
+ // No arrays with nulls case
+ output_->null_count = 0;
+ if (bitmap_preallocated_) {
+ BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, true);
+ }
+ return Status::OK();
+ }
+
+ if (arrays_with_nulls_.size() == 1) {
+ return PropagateSingle();
+ }
+
+ return PropagateMultiple();
+ }
+
+ private:
+ KernelContext* ctx_;
+ const ExecBatch& batch_;
+ std::vector<const ArrayData*> arrays_with_nulls_;
+ bool is_all_null_ = false;
+ ArrayData* output_;
+ uint8_t* bitmap_;
+ bool bitmap_preallocated_ = false;
+};
+
+std::shared_ptr<ChunkedArray> ToChunkedArray(const std::vector<Datum>& values,
+ const std::shared_ptr<DataType>& type) {
+ std::vector<std::shared_ptr<Array>> arrays;
+ arrays.reserve(values.size());
+ for (const Datum& val : values) {
+ if (val.length() == 0) {
+ // Skip empty chunks
+ continue;
+ }
+ arrays.emplace_back(val.make_array());
+ }
+ return std::make_shared<ChunkedArray>(std::move(arrays), type);
+}
+
+bool HaveChunkedArray(const std::vector<Datum>& values) {
+ for (const auto& value : values) {
+ if (value.kind() == Datum::CHUNKED_ARRAY) {
+ return true;
+ }
+ }
+ return false;
+}
+
+template <typename KernelType>
+class KernelExecutorImpl : public KernelExecutor {
+ public:
+ Status Init(KernelContext* kernel_ctx, KernelInitArgs args) override {
+ kernel_ctx_ = kernel_ctx;
+ kernel_ = static_cast<const KernelType*>(args.kernel);
+
+ // Resolve the output descriptor for this kernel
+ ARROW_ASSIGN_OR_RAISE(
+ output_descr_, kernel_->signature->out_type().Resolve(kernel_ctx_, args.inputs));
+
+ return Status::OK();
+ }
+
+ protected:
+ // This is overridden by the VectorExecutor
+ virtual Status SetupArgIteration(const std::vector<Datum>& args) {
+ ARROW_ASSIGN_OR_RAISE(
+ batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize()));
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> PrepareOutput(int64_t length) {
+ auto out = std::make_shared<ArrayData>(output_descr_.type, length);
+ out->buffers.resize(output_num_buffers_);
+
+ if (validity_preallocated_) {
+ ARROW_ASSIGN_OR_RAISE(out->buffers[0], kernel_ctx_->AllocateBitmap(length));
+ }
+ if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) {
+ out->null_count = 0;
+ }
+ for (size_t i = 0; i < data_preallocated_.size(); ++i) {
+ const auto& prealloc = data_preallocated_[i];
+ if (prealloc.bit_width >= 0) {
+ ARROW_ASSIGN_OR_RAISE(
+ out->buffers[i + 1],
+ AllocateDataBuffer(kernel_ctx_, length + prealloc.added_length,
+ prealloc.bit_width));
+ }
+ }
+ return out;
+ }
+
+ ExecContext* exec_context() { return kernel_ctx_->exec_context(); }
+ KernelState* state() { return kernel_ctx_->state(); }
+
+ // Not all of these members are used for every executor type
+
+ KernelContext* kernel_ctx_;
+ const KernelType* kernel_;
+ std::unique_ptr<ExecBatchIterator> batch_iterator_;
+ ValueDescr output_descr_;
+
+ int output_num_buffers_;
+
+ // If true, then memory is preallocated for the validity bitmap with the same
+ // strategy as the data buffer(s).
+ bool validity_preallocated_ = false;
+
+  // The kernel writes into data buffers preallocated for these bit widths
+  // (a negative bit width indicates no preallocation).
+ std::vector<BufferPreallocation> data_preallocated_;
+};
+
+class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
+ public:
+ Status Execute(const std::vector<Datum>& args, ExecListener* listener) override {
+ RETURN_NOT_OK(PrepareExecute(args));
+ ExecBatch batch;
+ while (batch_iterator_->Next(&batch)) {
+ RETURN_NOT_OK(ExecuteBatch(batch, listener));
+ }
+ if (preallocate_contiguous_) {
+ // If we preallocated one big chunk, since the kernel execution is
+ // completed, we can now emit it
+ RETURN_NOT_OK(listener->OnResult(std::move(preallocated_)));
+ }
+ return Status::OK();
+ }
+
+ Datum WrapResults(const std::vector<Datum>& inputs,
+ const std::vector<Datum>& outputs) override {
+ if (output_descr_.shape == ValueDescr::SCALAR) {
+ DCHECK_GT(outputs.size(), 0);
+ if (outputs.size() == 1) {
+ // Return as SCALAR
+ return outputs[0];
+ } else {
+ // Return as COLLECTION
+ return outputs;
+ }
+ } else {
+      // If execution yielded multiple chunks (because large arrays were split
+      // based on the ExecContext parameters), then the result is a ChunkedArray
+ if (HaveChunkedArray(inputs) || outputs.size() > 1) {
+ return ToChunkedArray(outputs, output_descr_.type);
+ } else if (outputs.size() == 1) {
+ // Outputs have just one element
+ return outputs[0];
+ } else {
+        // XXX: In the case where no outputs are emitted, is returning a 0-length
+        // array always the correct move?
+ return MakeArrayOfNull(output_descr_.type, /*length=*/0,
+ exec_context()->memory_pool())
+ .ValueOrDie();
+ }
+ }
+ }
+
+ protected:
+ Status ExecuteBatch(const ExecBatch& batch, ExecListener* listener) {
+ Datum out;
+ RETURN_NOT_OK(PrepareNextOutput(batch, &out));
+
+ if (output_descr_.shape == ValueDescr::ARRAY) {
+ ArrayData* out_arr = out.mutable_array();
+ if (kernel_->null_handling == NullHandling::INTERSECTION) {
+ RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out_arr));
+ } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) {
+ out_arr->null_count = 0;
+ }
+ } else {
+ if (kernel_->null_handling == NullHandling::INTERSECTION) {
+ // set scalar validity
+ out.scalar()->is_valid =
+ std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& input) { return input.scalar()->is_valid; });
+ } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) {
+ out.scalar()->is_valid = true;
+ }
+ }
+
+ RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out));
+ if (!preallocate_contiguous_) {
+ // If we are producing chunked output rather than one big array, then
+ // emit each chunk as soon as it's available
+ RETURN_NOT_OK(listener->OnResult(std::move(out)));
+ }
+ return Status::OK();
+ }
+
+ Status PrepareExecute(const std::vector<Datum>& args) {
+ RETURN_NOT_OK(this->SetupArgIteration(args));
+
+ if (output_descr_.shape == ValueDescr::ARRAY) {
+ // If the executor is configured to produce a single large Array output for
+ // kernels supporting preallocation, then we do so up front and then
+ // iterate over slices of that large array. Otherwise, we preallocate prior
+ // to processing each batch emitted from the ExecBatchIterator
+ RETURN_NOT_OK(SetupPreallocation(batch_iterator_->length()));
+ }
+ return Status::OK();
+ }
+
+  // We must accommodate two different modes of preallocated execution
+ //
+ // * A single large ("contiguous") allocation that we populate with results
+ // on a chunkwise basis according to the ExecBatchIterator. This permits
+ // parallelization even if the objective is to obtain a single Array or
+ // ChunkedArray at the end
+ // * A standalone buffer preallocation for each chunk emitted from the
+ // ExecBatchIterator
+ //
+ // When data buffer preallocation is not possible (e.g. with BINARY / STRING
+ // outputs), then contiguous results are only possible if the input is
+ // contiguous.
+
+ Status PrepareNextOutput(const ExecBatch& batch, Datum* out) {
+ if (output_descr_.shape == ValueDescr::ARRAY) {
+ if (preallocate_contiguous_) {
+ // The output is already fully preallocated
+ const int64_t batch_start_position = batch_iterator_->position() - batch.length;
+
+ if (batch.length < batch_iterator_->length()) {
+ // If this is a partial execution, then we write into a slice of
+ // preallocated_
+ out->value = preallocated_->Slice(batch_start_position, batch.length);
+ } else {
+ // Otherwise write directly into preallocated_. The main difference
+ // computationally (versus the Slice approach) is that the null_count
+ // may not need to be recomputed in the result
+ out->value = preallocated_;
+ }
+ } else {
+ // We preallocate (maybe) only for the output of processing the current
+ // batch
+ ARROW_ASSIGN_OR_RAISE(out->value, PrepareOutput(batch.length));
+ }
+ } else {
+ // For scalar outputs, we set a null scalar of the correct type to
+ // communicate the output type to the kernel if needed
+ //
+ // XXX: Is there some way to avoid this step?
+ out->value = MakeNullScalar(output_descr_.type);
+ }
+ return Status::OK();
+ }
+
+ Status SetupPreallocation(int64_t total_length) {
+ output_num_buffers_ = static_cast<int>(output_descr_.type->layout().buffers.size());
+
+ // Decide if we need to preallocate memory for this kernel
+ validity_preallocated_ =
+ (kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE &&
+ kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL &&
+ output_descr_.type->id() != Type::NA);
+ if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) {
+ ComputeDataPreallocate(*output_descr_.type, &data_preallocated_);
+ }
+
+    // Contiguous preallocation is only possible for non-nested types if all
+    // buffers are preallocated. Otherwise, we must go chunk-by-chunk.
+ //
+ // Some kernels are also unable to write into sliced outputs, so we respect the
+ // kernel's attributes.
+ preallocate_contiguous_ =
+ (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices &&
+ validity_preallocated_ && !is_nested(output_descr_.type->id()) &&
+ !is_dictionary(output_descr_.type->id()) &&
+ data_preallocated_.size() == static_cast<size_t>(output_num_buffers_ - 1) &&
+ std::all_of(data_preallocated_.begin(), data_preallocated_.end(),
+ [](const BufferPreallocation& prealloc) {
+ return prealloc.bit_width >= 0;
+ }));
+ if (preallocate_contiguous_) {
+ ARROW_ASSIGN_OR_RAISE(preallocated_, PrepareOutput(total_length));
+ }
+ return Status::OK();
+ }
+
+  // If true, and the kernel and output type support preallocation (for both
+  // the validity and data buffers), then we allocate one big array and then
+  // iterate through it while executing the kernel in chunks
+ bool preallocate_contiguous_ = false;
+
+ // For storing a contiguous preallocation per above. Unused otherwise
+ std::shared_ptr<ArrayData> preallocated_;
+};
+
+Status PackBatchNoChunks(const std::vector<Datum>& args, ExecBatch* out) {
+ int64_t length = 0;
+ for (const auto& arg : args) {
+ switch (arg.kind()) {
+ case Datum::SCALAR:
+ case Datum::ARRAY:
+ case Datum::CHUNKED_ARRAY:
+ length = std::max(arg.length(), length);
+ break;
+ default:
+ DCHECK(false);
+ break;
+ }
+ }
+ out->length = length;
+ out->values = args;
+ return Status::OK();
+}
+
+class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
+ public:
+ Status Execute(const std::vector<Datum>& args, ExecListener* listener) override {
+ RETURN_NOT_OK(PrepareExecute(args));
+ ExecBatch batch;
+ if (kernel_->can_execute_chunkwise) {
+ while (batch_iterator_->Next(&batch)) {
+ RETURN_NOT_OK(ExecuteBatch(batch, listener));
+ }
+ } else {
+ RETURN_NOT_OK(PackBatchNoChunks(args, &batch));
+ RETURN_NOT_OK(ExecuteBatch(batch, listener));
+ }
+ return Finalize(listener);
+ }
+
+ Datum WrapResults(const std::vector<Datum>& inputs,
+ const std::vector<Datum>& outputs) override {
+    // If execution yielded multiple chunks (because large arrays were split
+    // based on the ExecContext parameters), then the result is a ChunkedArray
+ if (kernel_->output_chunked && (HaveChunkedArray(inputs) || outputs.size() > 1)) {
+ return ToChunkedArray(outputs, output_descr_.type);
+ } else if (outputs.size() == 1) {
+ // Outputs have just one element
+ return outputs[0];
+ } else {
+      // XXX: In the case where no outputs are emitted, is returning a 0-length
+      // array always the correct move?
+ return MakeArrayOfNull(output_descr_.type, /*length=*/0).ValueOrDie();
+ }
+ }
+
+ protected:
+ Status ExecuteBatch(const ExecBatch& batch, ExecListener* listener) {
+ if (batch.length == 0) {
+ // Skip empty batches. This may only happen when not using
+ // ExecBatchIterator
+ return Status::OK();
+ }
+ Datum out;
+ if (output_descr_.shape == ValueDescr::ARRAY) {
+ // We preallocate (maybe) only for the output of processing the current
+ // batch
+ ARROW_ASSIGN_OR_RAISE(out.value, PrepareOutput(batch.length));
+ }
+
+ if (kernel_->null_handling == NullHandling::INTERSECTION &&
+ output_descr_.shape == ValueDescr::ARRAY) {
+ RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out.mutable_array()));
+ }
+ RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out));
+ if (!kernel_->finalize) {
+      // If there is no result finalizer (e.g. for hash-based functions), we can
+      // emit the processed batch right away rather than waiting for finalization
+ RETURN_NOT_OK(listener->OnResult(std::move(out)));
+ } else {
+ results_.emplace_back(std::move(out));
+ }
+ return Status::OK();
+ }
+
+ Status Finalize(ExecListener* listener) {
+ if (kernel_->finalize) {
+ // Intermediate results require post-processing after the execution is
+ // completed (possibly involving some accumulated state)
+ RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &results_));
+ for (const auto& result : results_) {
+ RETURN_NOT_OK(listener->OnResult(result));
+ }
+ }
+ return Status::OK();
+ }
+
+ Status SetupArgIteration(const std::vector<Datum>& args) override {
+ if (kernel_->can_execute_chunkwise) {
+ ARROW_ASSIGN_OR_RAISE(batch_iterator_, ExecBatchIterator::Make(
+ args, exec_context()->exec_chunksize()));
+ }
+ return Status::OK();
+ }
+
+ Status PrepareExecute(const std::vector<Datum>& args) {
+ RETURN_NOT_OK(this->SetupArgIteration(args));
+ output_num_buffers_ = static_cast<int>(output_descr_.type->layout().buffers.size());
+
+ // Decide if we need to preallocate memory for this kernel
+ validity_preallocated_ =
+ (kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE &&
+ kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL);
+ if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) {
+ ComputeDataPreallocate(*output_descr_.type, &data_preallocated_);
+ }
+ return Status::OK();
+ }
+
+ std::vector<Datum> results_;
+};
+
+class ScalarAggExecutor : public KernelExecutorImpl<ScalarAggregateKernel> {
+ public:
+ Status Init(KernelContext* ctx, KernelInitArgs args) override {
+ input_descrs_ = &args.inputs;
+ options_ = args.options;
+ return KernelExecutorImpl<ScalarAggregateKernel>::Init(ctx, args);
+ }
+
+ Status Execute(const std::vector<Datum>& args, ExecListener* listener) override {
+ RETURN_NOT_OK(this->SetupArgIteration(args));
+
+ ExecBatch batch;
+ while (batch_iterator_->Next(&batch)) {
+ // TODO: implement parallelism
+ if (batch.length > 0) {
+ RETURN_NOT_OK(Consume(batch));
+ }
+ }
+
+ Datum out;
+ RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &out));
+ RETURN_NOT_OK(listener->OnResult(std::move(out)));
+ return Status::OK();
+ }
+
+ Datum WrapResults(const std::vector<Datum>&,
+ const std::vector<Datum>& outputs) override {
+ DCHECK_EQ(1, outputs.size());
+ return outputs[0];
+ }
+
+ private:
+ Status Consume(const ExecBatch& batch) {
+    // FIXME(ARROW-11840) don't merge *any* aggregates for every batch
+ ARROW_ASSIGN_OR_RAISE(
+ auto batch_state,
+ kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_}));
+
+ if (batch_state == nullptr) {
+ return Status::Invalid("ScalarAggregation requires non-null kernel state");
+ }
+
+ KernelContext batch_ctx(exec_context());
+ batch_ctx.SetState(batch_state.get());
+
+ RETURN_NOT_OK(kernel_->consume(&batch_ctx, batch));
+ RETURN_NOT_OK(kernel_->merge(kernel_ctx_, std::move(*batch_state), state()));
+ return Status::OK();
+ }
+
+ const std::vector<ValueDescr>* input_descrs_;
+ const FunctionOptions* options_;
+};
+
+template <typename ExecutorType,
+ typename FunctionType = typename ExecutorType::FunctionType>
+Result<std::unique_ptr<KernelExecutor>> MakeExecutor(ExecContext* ctx,
+ const Function* func,
+ const FunctionOptions* options) {
+ DCHECK_EQ(ExecutorType::function_kind, func->kind());
+ auto typed_func = checked_cast<const FunctionType*>(func);
+ return std::unique_ptr<KernelExecutor>(new ExecutorType(ctx, typed_func, options));
+}
+
+} // namespace
+
+Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) {
+ DCHECK_NE(nullptr, output);
+ DCHECK_GT(output->buffers.size(), 0);
+
+ if (output->type->id() == Type::NA) {
+    // A null output type is a no-op (it would rarely happen, but we at least
+    // test for it)
+ return Status::OK();
+ }
+
+  // This function is ONLY able to write into an output with a non-zero offset
+  // when the bitmap is preallocated. This could be a DCHECK, but we return an
+  // error Status for now for emphasis
+ if (output->offset != 0 && output->buffers[0] == nullptr) {
+ return Status::Invalid(
+ "Can only propagate nulls into pre-allocated memory "
+ "when the output offset is non-zero");
+ }
+ NullPropagator propagator(ctx, batch, output);
+ return propagator.Execute();
+}
+
+std::unique_ptr<KernelExecutor> KernelExecutor::MakeScalar() {
+ return ::arrow::internal::make_unique<detail::ScalarExecutor>();
+}
+
+std::unique_ptr<KernelExecutor> KernelExecutor::MakeVector() {
+ return ::arrow::internal::make_unique<detail::VectorExecutor>();
+}
+
+std::unique_ptr<KernelExecutor> KernelExecutor::MakeScalarAggregate() {
+ return ::arrow::internal::make_unique<detail::ScalarAggExecutor>();
+}
+
+} // namespace detail
+
+ExecContext::ExecContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
+ FunctionRegistry* func_registry)
+ : pool_(pool), executor_(executor) {
+ this->func_registry_ = func_registry == nullptr ? GetFunctionRegistry() : func_registry;
+}
+
+CpuInfo* ExecContext::cpu_info() const { return CpuInfo::GetInstance(); }
+
+// ----------------------------------------------------------------------
+// SelectionVector
+
+SelectionVector::SelectionVector(std::shared_ptr<ArrayData> data)
+ : data_(std::move(data)) {
+ DCHECK_EQ(Type::INT32, data_->type->id());
+ DCHECK_EQ(0, data_->GetNullCount());
+ indices_ = data_->GetValues<int32_t>(1);
+}
+
+SelectionVector::SelectionVector(const Array& arr) : SelectionVector(arr.data()) {}
+
+int32_t SelectionVector::length() const { return static_cast<int32_t>(data_->length); }
+
+Result<std::shared_ptr<SelectionVector>> SelectionVector::FromMask(
+ const BooleanArray& arr) {
+ return Status::NotImplemented("FromMask");
+}
+
+Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
+ const FunctionOptions* options, ExecContext* ctx) {
+ if (ctx == nullptr) {
+ ExecContext default_ctx;
+ return CallFunction(func_name, args, options, &default_ctx);
+ }
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<const Function> func,
+ ctx->func_registry()->GetFunction(func_name));
+ return func->Execute(args, options, ctx);
+}
+
+Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
+ ExecContext* ctx) {
+ return CallFunction(func_name, args, /*options=*/nullptr, ctx);
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h
new file mode 100644
index 00000000000..de1b695de48
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h
@@ -0,0 +1,264 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/datum.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+class CpuInfo;
+
+} // namespace internal
+
+namespace compute {
+
+class FunctionOptions;
+class FunctionRegistry;
+
+// It seems like 64K might be a good default chunksize to use for execution
+// based on the experience of other query processing systems. The current
+// default, though, is not to chunk contiguous arrays; this may change in the
+// future once parallel execution is implemented
+static constexpr int64_t kDefaultExecChunksize = UINT16_MAX;
+
+/// \brief Context for expression-global variables and options used by
+/// function evaluation
+class ARROW_EXPORT ExecContext {
+ public:
+ // If no function registry passed, the default is used.
+ explicit ExecContext(MemoryPool* pool = default_memory_pool(),
+ ::arrow::internal::Executor* executor = NULLPTR,
+ FunctionRegistry* func_registry = NULLPTR);
+
+ /// \brief The MemoryPool used for allocations, default is
+ /// default_memory_pool().
+ MemoryPool* memory_pool() const { return pool_; }
+
+ ::arrow::internal::CpuInfo* cpu_info() const;
+
+ /// \brief An Executor which may be used to parallelize execution.
+ ::arrow::internal::Executor* executor() const { return executor_; }
+
+ /// \brief The FunctionRegistry for looking up functions by name and
+ /// selecting kernels for execution. Defaults to the library-global function
+ /// registry provided by GetFunctionRegistry.
+ FunctionRegistry* func_registry() const { return func_registry_; }
+
+  /// \brief Set the maximum length of a unit of work for kernel execution.
+  /// Larger contiguous array inputs will be split into smaller chunks, and, if
+  /// possible and enabled, processed in parallel. The default chunksize is
+  /// INT64_MAX, so contiguous arrays are not split.
+ void set_exec_chunksize(int64_t chunksize) { exec_chunksize_ = chunksize; }
+
+  /// \brief Maximum length for ExecBatch data chunks processed by
+  /// kernels. Contiguous array inputs with longer length will be split into
+  /// smaller chunks.
+ int64_t exec_chunksize() const { return exec_chunksize_; }
+
+ /// \brief Set whether to use multiple threads for function execution. This
+ /// is not yet used.
+ void set_use_threads(bool use_threads = true) { use_threads_ = use_threads; }
+
+ /// \brief If true, then utilize multiple threads where relevant for function
+ /// execution. This is not yet used.
+ bool use_threads() const { return use_threads_; }
+
+  /// \brief Set the preallocation strategy for kernel execution as it relates
+  /// to chunked execution. For chunked execution, whether via ChunkedArray
+  /// inputs or splitting larger Array arguments into smaller pieces, contiguous
+  /// allocation (if permitted by the kernel) will allocate one large array to
+  /// write output into, yielding it to the caller at the end. If this option
+  /// is disabled, preallocations will be performed independently for each
+  /// chunk of execution.
+  ///
+  /// TODO: At some point we might want to limit the size of contiguous
+  /// preallocations. For example, even if the exec_chunksize is 64K or less, we
+  /// might limit contiguous allocations to 1M records, say.
+ void set_preallocate_contiguous(bool preallocate) {
+ preallocate_contiguous_ = preallocate;
+ }
+
+ /// \brief If contiguous preallocations should be used when doing chunked
+ /// execution as specified by exec_chunksize(). See
+ /// set_preallocate_contiguous() for more information.
+ bool preallocate_contiguous() const { return preallocate_contiguous_; }
+
+ private:
+ MemoryPool* pool_;
+ ::arrow::internal::Executor* executor_;
+ FunctionRegistry* func_registry_;
+ int64_t exec_chunksize_ = std::numeric_limits<int64_t>::max();
+ bool preallocate_contiguous_ = true;
+ bool use_threads_ = true;
+};
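+
+// A configuration sketch (illustrative, not part of the original header):
+//
+//   ExecContext ctx;
+//   ctx.set_exec_chunksize(kDefaultExecChunksize);  // opt in to 64K chunking
+//   ctx.set_preallocate_contiguous(true);  // one large output allocation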
+
+ARROW_EXPORT ExecContext* default_exec_context();
+
+// TODO: Consider standardizing on uint16 selection vectors and only using them
+// when we can ensure that each value is 64K in length or smaller
+
+/// \brief Container for an array of value selection indices that were
+/// materialized from a filter.
+///
+/// Columnar query engines (see e.g. [1]) have found that rather than
+/// materializing filtered data, the filter can instead be converted to an
+/// array of the "on" indices and then "fusing" these indices in operator
+/// implementations. This is especially relevant for aggregations but also
+/// applies to scalar operations.
+///
+/// We are not yet using this, so it is mostly a placeholder for now.
+///
+/// [1]: http://cidrdb.org/cidr2005/papers/P19.pdf
+class ARROW_EXPORT SelectionVector {
+ public:
+ explicit SelectionVector(std::shared_ptr<ArrayData> data);
+
+ explicit SelectionVector(const Array& arr);
+
+ /// \brief Create SelectionVector from boolean mask
+ static Result<std::shared_ptr<SelectionVector>> FromMask(const BooleanArray& arr);
+
+ const int32_t* indices() const { return indices_; }
+ int32_t length() const;
+
+ private:
+ std::shared_ptr<ArrayData> data_;
+ const int32_t* indices_;
+};
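+
+// A sketch of the intended usage (the class is currently a placeholder, per
+// the note above): materialize the "on" indices of a boolean filter mask.
+//
+//   // given a BooleanArray `mask` of [true, true, false, true]
+//   ARROW_ASSIGN_OR_RAISE(auto selection, SelectionVector::FromMask(mask));
+//   // selection->indices() would then view [0, 1, 3]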
+
+/// \brief A unit of work for kernel execution. It contains a collection of
+/// Array and Scalar values and an optional SelectionVector indicating that
+/// there is an unmaterialized filter that either must be materialized, or (if
+/// the kernel supports it) pushed down into the kernel implementation.
+///
+/// ExecBatch is semantically similar to RecordBatch in that in a SQL context
+/// it represents a collection of records, but constant "columns" are
+/// represented by Scalar values rather than having to be converted into arrays
+/// with repeated values.
+///
+/// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight
+/// than is desirable for this class. Microbenchmarks would help determine for
+/// sure. See ARROW-8928.
+struct ARROW_EXPORT ExecBatch {
+ ExecBatch() = default;
+ ExecBatch(std::vector<Datum> values, int64_t length)
+ : values(std::move(values)), length(length) {}
+
+ explicit ExecBatch(const RecordBatch& batch);
+
+ static Result<ExecBatch> Make(std::vector<Datum> values);
+
+ Result<std::shared_ptr<RecordBatch>> ToRecordBatch(
+ std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool()) const;
+
+ /// The values representing positional arguments to be passed to a kernel's
+ /// exec function for processing.
+ std::vector<Datum> values;
+
+ /// A deferred filter represented as an array of indices into the values.
+ ///
+ /// For example, the filter [true, true, false, true] would be represented as
+ /// the selection vector [0, 1, 3]. When the selection vector is set,
+ /// ExecBatch::length is equal to the length of this array.
+ std::shared_ptr<SelectionVector> selection_vector;
+
+ /// A predicate Expression guaranteed to evaluate to true for all rows in this batch.
+ Expression guarantee = literal(true);
+
+  /// The semantic length of the ExecBatch. When the values are all scalars,
+  /// the length should be set to 1; otherwise the length is taken from the
+  /// array values. When a selection vector is set, the length of the batch is
+  /// the length of the selection.
+  ///
+  /// If the array values are of length 0 then the length is 0 regardless of
+  /// whether any values are Scalar. In general, ExecBatch objects are produced
+  /// by ExecBatchIterator, which by design does not yield length-0 batches.
+ int64_t length;
+
+ /// \brief Return the value at the i-th index
+ template <typename index_type>
+ inline const Datum& operator[](index_type i) const {
+ return values[i];
+ }
+
+ bool Equals(const ExecBatch& other) const;
+
+ /// \brief A convenience for the number of values / arguments.
+ int num_values() const { return static_cast<int>(values.size()); }
+
+ ExecBatch Slice(int64_t offset, int64_t length) const;
+
+ /// \brief A convenience for returning the ValueDescr objects (types and
+ /// shapes) from the batch.
+ std::vector<ValueDescr> GetDescriptors() const {
+ std::vector<ValueDescr> result;
+ for (const auto& value : this->values) {
+ result.emplace_back(value.descr());
+ }
+ return result;
+ }
+
+ ARROW_EXPORT friend void PrintTo(const ExecBatch&, std::ostream*);
+};
+
+inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); }
+inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); }
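+
+// A construction sketch: a batch mixing an array column with a constant
+// "column" held as a Scalar, assuming an int32 Array `arr` of length 4.
+//
+//   std::vector<Datum> values{Datum(arr), Datum(std::make_shared<Int64Scalar>(7))};
+//   ExecBatch batch(std::move(values), /*length=*/4);
+//   // batch.num_values() == 2; batch[1] is the scalar argument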
+
+/// \defgroup compute-call-function One-shot calls to compute functions
+///
+/// @{
+
+/// \brief One-shot invoker for all types of functions.
+///
+/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs,
+/// and wrapping of outputs.
+ARROW_EXPORT
+Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
+ const FunctionOptions* options, ExecContext* ctx = NULLPTR);
+
+/// \brief Variant of CallFunction which uses a function's default options.
+///
+/// NB: Some functions require FunctionOptions to be provided.
+ARROW_EXPORT
+Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
+ ExecContext* ctx = NULLPTR);
+
+/// @}
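+
+// A one-shot invocation sketch using the default-options overload; "add" is a
+// scalar arithmetic function registered in the library-global registry.
+//
+//   ARROW_ASSIGN_OR_RAISE(Datum sum, CallFunction("add", {Datum(a), Datum(b)}));
+//   // or, with explicit options and a custom context:
+//   //   CallFunction("add", {Datum(a), Datum(b)}, &options, &ctx);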
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc
new file mode 100644
index 00000000000..433e895c243
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc
@@ -0,0 +1,823 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/exec_plan.h"
+
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "arrow/array/util.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/compute/registry.h"
+#include "arrow/datum.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+namespace compute {
+
+namespace {
+
+struct ExecPlanImpl : public ExecPlan {
+ explicit ExecPlanImpl(ExecContext* exec_context) : ExecPlan(exec_context) {}
+
+ ~ExecPlanImpl() override {
+ if (started_ && !finished_.is_finished()) {
+ ARROW_LOG(WARNING) << "Plan was destroyed before finishing";
+ StopProducing();
+ finished().Wait();
+ }
+ }
+
+ ExecNode* AddNode(std::unique_ptr<ExecNode> node) {
+ if (node->num_inputs() == 0) {
+ sources_.push_back(node.get());
+ }
+ if (node->num_outputs() == 0) {
+ sinks_.push_back(node.get());
+ }
+ nodes_.push_back(std::move(node));
+ return nodes_.back().get();
+ }
+
+ Status Validate() const {
+ if (nodes_.empty()) {
+      return Status::Invalid("ExecPlan has no nodes");
+ }
+ for (const auto& node : nodes_) {
+ RETURN_NOT_OK(node->Validate());
+ }
+ return Status::OK();
+ }
+
+ Status StartProducing() {
+ if (started_) {
+ return Status::Invalid("restarted ExecPlan");
+ }
+ started_ = true;
+
+ // producers precede consumers
+ sorted_nodes_ = TopoSort();
+
+ std::vector<Future<>> futures;
+
+ Status st = Status::OK();
+
+ using rev_it = std::reverse_iterator<NodeVector::iterator>;
+ for (rev_it it(sorted_nodes_.end()), end(sorted_nodes_.begin()); it != end; ++it) {
+ auto node = *it;
+
+ st = node->StartProducing();
+ if (!st.ok()) {
+ // Stop nodes that successfully started, in reverse order
+ stopped_ = true;
+ StopProducingImpl(it.base(), sorted_nodes_.end());
+ break;
+ }
+
+ futures.push_back(node->finished());
+ }
+
+ finished_ = AllComplete(std::move(futures));
+ return st;
+ }
+
+ void StopProducing() {
+ DCHECK(started_) << "stopped an ExecPlan which never started";
+ stopped_ = true;
+
+ StopProducingImpl(sorted_nodes_.begin(), sorted_nodes_.end());
+ }
+
+ template <typename It>
+ void StopProducingImpl(It begin, It end) {
+ for (auto it = begin; it != end; ++it) {
+ auto node = *it;
+ node->StopProducing();
+ }
+ }
+
+ NodeVector TopoSort() {
+ struct Impl {
+ const std::vector<std::unique_ptr<ExecNode>>& nodes;
+ std::unordered_set<ExecNode*> visited;
+ NodeVector sorted;
+
+ explicit Impl(const std::vector<std::unique_ptr<ExecNode>>& nodes) : nodes(nodes) {
+ visited.reserve(nodes.size());
+ sorted.resize(nodes.size());
+
+ for (const auto& node : nodes) {
+ Visit(node.get());
+ }
+
+ DCHECK_EQ(visited.size(), nodes.size());
+ }
+
+ void Visit(ExecNode* node) {
+ if (visited.count(node) != 0) return;
+
+ for (auto input : node->inputs()) {
+ // Ensure that producers are inserted before this consumer
+ Visit(input);
+ }
+
+ sorted[visited.size()] = node;
+ visited.insert(node);
+ }
+ };
+
+ return std::move(Impl{nodes_}.sorted);
+ }
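+
+  // For a linear plan source -> filter -> sink, TopoSort() yields
+  // [source, filter, sink]; StartProducing() walks this in reverse (sink
+  // first), while StopProducing() walks it forward (source first).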
+
+ Future<> finished_ = Future<>::MakeFinished();
+ bool started_ = false, stopped_ = false;
+ std::vector<std::unique_ptr<ExecNode>> nodes_;
+ NodeVector sources_, sinks_;
+ NodeVector sorted_nodes_;
+};
+
+ExecPlanImpl* ToDerived(ExecPlan* ptr) { return checked_cast<ExecPlanImpl*>(ptr); }
+
+const ExecPlanImpl* ToDerived(const ExecPlan* ptr) {
+ return checked_cast<const ExecPlanImpl*>(ptr);
+}
+
+util::optional<int> GetNodeIndex(const std::vector<ExecNode*>& nodes,
+ const ExecNode* node) {
+ for (int i = 0; i < static_cast<int>(nodes.size()); ++i) {
+ if (nodes[i] == node) return i;
+ }
+ return util::nullopt;
+}
+
+} // namespace
+
+Result<std::shared_ptr<ExecPlan>> ExecPlan::Make(ExecContext* ctx) {
+ return std::shared_ptr<ExecPlan>(new ExecPlanImpl{ctx});
+}
+
+ExecNode* ExecPlan::AddNode(std::unique_ptr<ExecNode> node) {
+ return ToDerived(this)->AddNode(std::move(node));
+}
+
+const ExecPlan::NodeVector& ExecPlan::sources() const {
+ return ToDerived(this)->sources_;
+}
+
+const ExecPlan::NodeVector& ExecPlan::sinks() const { return ToDerived(this)->sinks_; }
+
+Status ExecPlan::Validate() { return ToDerived(this)->Validate(); }
+
+Status ExecPlan::StartProducing() { return ToDerived(this)->StartProducing(); }
+
+void ExecPlan::StopProducing() { ToDerived(this)->StopProducing(); }
+
+Future<> ExecPlan::finished() { return ToDerived(this)->finished_; }
+
+ExecNode::ExecNode(ExecPlan* plan, std::string label, NodeVector inputs,
+ std::vector<std::string> input_labels,
+ std::shared_ptr<Schema> output_schema, int num_outputs)
+ : plan_(plan),
+ label_(std::move(label)),
+ inputs_(std::move(inputs)),
+ input_labels_(std::move(input_labels)),
+ output_schema_(std::move(output_schema)),
+ num_outputs_(num_outputs) {
+ for (auto input : inputs_) {
+ input->outputs_.push_back(this);
+ }
+}
+
+Status ExecNode::Validate() const {
+ if (inputs_.size() != input_labels_.size()) {
+ return Status::Invalid("Invalid number of inputs for '", label(), "' (expected ",
+ num_inputs(), ", actual ", input_labels_.size(), ")");
+ }
+
+ if (static_cast<int>(outputs_.size()) != num_outputs_) {
+ return Status::Invalid("Invalid number of outputs for '", label(), "' (expected ",
+ num_outputs(), ", actual ", outputs_.size(), ")");
+ }
+
+ for (auto out : outputs_) {
+ auto input_index = GetNodeIndex(out->inputs(), this);
+ if (!input_index) {
+ return Status::Invalid("Node '", label(), "' outputs to node '", out->label(),
+ "' but is not listed as an input.");
+ }
+ }
+
+ return Status::OK();
+}
+
+struct SourceNode : ExecNode {
+ SourceNode(ExecPlan* plan, std::string label, std::shared_ptr<Schema> output_schema,
+ AsyncGenerator<util::optional<ExecBatch>> generator)
+ : ExecNode(plan, std::move(label), {}, {}, std::move(output_schema),
+ /*num_outputs=*/1),
+ generator_(std::move(generator)) {}
+
+ const char* kind_name() override { return "SourceNode"; }
+
+ [[noreturn]] static void NoInputs() {
+ DCHECK(false) << "no inputs; this should never be called";
+ std::abort();
+ }
+ [[noreturn]] void InputReceived(ExecNode*, int, ExecBatch) override { NoInputs(); }
+ [[noreturn]] void ErrorReceived(ExecNode*, Status) override { NoInputs(); }
+ [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); }
+
+ Status StartProducing() override {
+ DCHECK(!stop_requested_) << "Restarted SourceNode";
+
+ CallbackOptions options;
+ if (auto executor = plan()->exec_context()->executor()) {
+ // These options will transfer execution to the desired Executor if necessary.
+ // This can happen for in-memory scans where batches didn't require
+      // any CPU work to decode. Otherwise, parsing etc. should already have
+      // placed us on the desired Executor and no queues will be pushed to.
+ options.executor = executor;
+ options.should_schedule = ShouldSchedule::IfDifferentExecutor;
+ }
+
+ finished_ = Loop([this, options] {
+ std::unique_lock<std::mutex> lock(mutex_);
+ int seq = batch_count_++;
+ if (stop_requested_) {
+ return Future<ControlFlow<int>>::MakeFinished(Break(seq));
+ }
+ lock.unlock();
+
+ return generator_().Then(
+ [=](const util::optional<ExecBatch>& batch) -> ControlFlow<int> {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (IsIterationEnd(batch) || stop_requested_) {
+ stop_requested_ = true;
+ return Break(seq);
+ }
+ lock.unlock();
+
+ outputs_[0]->InputReceived(this, seq, *batch);
+ return Continue();
+ },
+ [=](const Status& error) -> ControlFlow<int> {
+ // NB: ErrorReceived is independent of InputFinished, but
+ // ErrorReceived will usually prompt StopProducing which will
+ // prompt InputFinished. ErrorReceived may still be called from a
+ // node which was requested to stop (indeed, the request to stop
+ // may prompt an error).
+ std::unique_lock<std::mutex> lock(mutex_);
+ stop_requested_ = true;
+ lock.unlock();
+ outputs_[0]->ErrorReceived(this, error);
+ return Break(seq);
+ },
+ options);
+ }).Then([&](int seq) { outputs_[0]->InputFinished(this, seq); });
+
+ return Status::OK();
+ }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override {
+ std::unique_lock<std::mutex> lock(mutex_);
+ stop_requested_ = true;
+ }
+
+ Future<> finished() override { return finished_; }
+
+ private:
+ std::mutex mutex_;
+ bool stop_requested_{false};
+ int batch_count_{0};
+ Future<> finished_ = Future<>::MakeFinished();
+ AsyncGenerator<util::optional<ExecBatch>> generator_;
+};
+
+ExecNode* MakeSourceNode(ExecPlan* plan, std::string label,
+ std::shared_ptr<Schema> output_schema,
+ AsyncGenerator<util::optional<ExecBatch>> generator) {
+ return plan->EmplaceNode<SourceNode>(plan, std::move(label), std::move(output_schema),
+ std::move(generator));
+}
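+
+// A wiring sketch: adapting an in-memory vector of batches into a source node.
+// MakeVectorGenerator is assumed from arrow/util/async_generator.h; `plan` and
+// `schema_` are assumed given.
+//
+//   std::vector<util::optional<ExecBatch>> batches = /* elided */;
+//   ExecNode* source = MakeSourceNode(plan.get(), "source", schema_,
+//                                     MakeVectorGenerator(std::move(batches)));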
+
+struct FilterNode : ExecNode {
+ FilterNode(ExecNode* input, std::string label, Expression filter)
+ : ExecNode(input->plan(), std::move(label), {input}, {"target"},
+ /*output_schema=*/input->output_schema(),
+ /*num_outputs=*/1),
+ filter_(std::move(filter)) {}
+
+ const char* kind_name() override { return "FilterNode"; }
+
+ Result<ExecBatch> DoFilter(const ExecBatch& target) {
+ ARROW_ASSIGN_OR_RAISE(Expression simplified_filter,
+ SimplifyWithGuarantee(filter_, target.guarantee));
+
+ ARROW_ASSIGN_OR_RAISE(Datum mask, ExecuteScalarExpression(simplified_filter, target,
+ plan()->exec_context()));
+
+ if (mask.is_scalar()) {
+ const auto& mask_scalar = mask.scalar_as<BooleanScalar>();
+ if (mask_scalar.is_valid && mask_scalar.value) {
+ return target;
+ }
+
+ return target.Slice(0, 0);
+ }
+
+ // if the values are all scalar then the mask must also be
+ DCHECK(!std::all_of(target.values.begin(), target.values.end(),
+ [](const Datum& value) { return value.is_scalar(); }));
+
+ auto values = target.values;
+ for (auto& value : values) {
+ if (value.is_scalar()) continue;
+ ARROW_ASSIGN_OR_RAISE(value, Filter(value, mask, FilterOptions::Defaults()));
+ }
+ return ExecBatch::Make(std::move(values));
+ }
+
+ void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ auto maybe_filtered = DoFilter(std::move(batch));
+ if (!maybe_filtered.ok()) {
+ outputs_[0]->ErrorReceived(this, maybe_filtered.status());
+ return;
+ }
+
+ maybe_filtered->guarantee = batch.guarantee;
+ outputs_[0]->InputReceived(this, seq, maybe_filtered.MoveValueUnsafe());
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->ErrorReceived(this, std::move(error));
+ }
+
+ void InputFinished(ExecNode* input, int seq) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->InputFinished(this, seq);
+ }
+
+ Status StartProducing() override { return Status::OK(); }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override { inputs_[0]->StopProducing(this); }
+
+ Future<> finished() override { return inputs_[0]->finished(); }
+
+ private:
+ Expression filter_;
+};
+
+Result<ExecNode*> MakeFilterNode(ExecNode* input, std::string label, Expression filter) {
+ if (!filter.IsBound()) {
+ ARROW_ASSIGN_OR_RAISE(filter, filter.Bind(*input->output_schema()));
+ }
+
+ if (filter.type()->id() != Type::BOOL) {
+ return Status::TypeError("Filter expression must evaluate to bool, but ",
+ filter.ToString(), " evaluates to ",
+ filter.type()->ToString());
+ }
+
+ return input->plan()->EmplaceNode<FilterNode>(input, std::move(label),
+ std::move(filter));
+}
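+
+// A usage sketch: keep only rows where field "x" is positive, assuming the
+// comparison helpers declared in expression.h. Binding against the input's
+// schema happens inside MakeFilterNode when needed.
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       ExecNode* filtered,
+//       MakeFilterNode(source, "filter", greater(field_ref("x"), literal(0))));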
+
+struct ProjectNode : ExecNode {
+ ProjectNode(ExecNode* input, std::string label, std::shared_ptr<Schema> output_schema,
+ std::vector<Expression> exprs)
+ : ExecNode(input->plan(), std::move(label), {input}, {"target"},
+ /*output_schema=*/std::move(output_schema),
+ /*num_outputs=*/1),
+ exprs_(std::move(exprs)) {}
+
+ const char* kind_name() override { return "ProjectNode"; }
+
+ Result<ExecBatch> DoProject(const ExecBatch& target) {
+ std::vector<Datum> values{exprs_.size()};
+ for (size_t i = 0; i < exprs_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(Expression simplified_expr,
+ SimplifyWithGuarantee(exprs_[i], target.guarantee));
+
+ ARROW_ASSIGN_OR_RAISE(values[i], ExecuteScalarExpression(simplified_expr, target,
+ plan()->exec_context()));
+ }
+ return ExecBatch{std::move(values), target.length};
+ }
+
+ void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ auto maybe_projected = DoProject(std::move(batch));
+ if (!maybe_projected.ok()) {
+ outputs_[0]->ErrorReceived(this, maybe_projected.status());
+ return;
+ }
+
+ maybe_projected->guarantee = batch.guarantee;
+ outputs_[0]->InputReceived(this, seq, maybe_projected.MoveValueUnsafe());
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->ErrorReceived(this, std::move(error));
+ }
+
+ void InputFinished(ExecNode* input, int seq) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->InputFinished(this, seq);
+ }
+
+ Status StartProducing() override { return Status::OK(); }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override { inputs_[0]->StopProducing(this); }
+
+ Future<> finished() override { return inputs_[0]->finished(); }
+
+ private:
+ std::vector<Expression> exprs_;
+};
+
+Result<ExecNode*> MakeProjectNode(ExecNode* input, std::string label,
+ std::vector<Expression> exprs,
+ std::vector<std::string> names) {
+ FieldVector fields(exprs.size());
+
+ if (names.size() == 0) {
+ names.resize(exprs.size());
+ for (size_t i = 0; i < exprs.size(); ++i) {
+ names[i] = exprs[i].ToString();
+ }
+ }
+
+ int i = 0;
+ for (auto& expr : exprs) {
+ if (!expr.IsBound()) {
+ ARROW_ASSIGN_OR_RAISE(expr, expr.Bind(*input->output_schema()));
+ }
+ fields[i] = field(std::move(names[i]), expr.type());
+ ++i;
+ }
+
+ return input->plan()->EmplaceNode<ProjectNode>(
+ input, std::move(label), schema(std::move(fields)), std::move(exprs));
+}
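+
+// A usage sketch: emit "x" unchanged alongside a derived column; when names
+// are omitted, the expressions' string representations are used instead.
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       ExecNode* projected,
+//       MakeProjectNode(filtered, "project",
+//                       {field_ref("x"), call("multiply", {field_ref("x"), literal(2)})},
+//                       {"x", "x_times_2"}));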
+
+struct SinkNode : ExecNode {
+ SinkNode(ExecNode* input, std::string label,
+ AsyncGenerator<util::optional<ExecBatch>>* generator)
+ : ExecNode(input->plan(), std::move(label), {input}, {"collected"}, {},
+ /*num_outputs=*/0),
+ producer_(MakeProducer(generator)) {}
+
+ static PushGenerator<util::optional<ExecBatch>>::Producer MakeProducer(
+ AsyncGenerator<util::optional<ExecBatch>>* out_gen) {
+ PushGenerator<util::optional<ExecBatch>> gen;
+ auto out = gen.producer();
+ *out_gen = std::move(gen);
+ return out;
+ }
+
+ const char* kind_name() override { return "SinkNode"; }
+
+ Status StartProducing() override {
+ finished_ = Future<>::Make();
+ return Status::OK();
+ }
+
+ // sink nodes have no outputs from which to feel backpressure
+ [[noreturn]] static void NoOutputs() {
+ DCHECK(false) << "no outputs; this should never be called";
+ std::abort();
+ }
+ [[noreturn]] void ResumeProducing(ExecNode* output) override { NoOutputs(); }
+ [[noreturn]] void PauseProducing(ExecNode* output) override { NoOutputs(); }
+ [[noreturn]] void StopProducing(ExecNode* output) override { NoOutputs(); }
+
+ void StopProducing() override {
+ Finish();
+ inputs_[0]->StopProducing(this);
+ }
+
+ Future<> finished() override { return finished_; }
+
+ void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (finished_.is_finished()) return;
+
+ ++num_received_;
+ if (num_received_ == emit_stop_) {
+ lock.unlock();
+ producer_.Push(std::move(batch));
+ Finish();
+ return;
+ }
+
+ if (emit_stop_ != -1) {
+ DCHECK_LE(seq_num, emit_stop_);
+ }
+
+ lock.unlock();
+ producer_.Push(std::move(batch));
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ producer_.Push(std::move(error));
+ Finish();
+ inputs_[0]->StopProducing(this);
+ }
+
+ void InputFinished(ExecNode* input, int seq_stop) override {
+ std::unique_lock<std::mutex> lock(mutex_);
+ emit_stop_ = seq_stop;
+ if (num_received_ == emit_stop_) {
+ lock.unlock();
+ Finish();
+ }
+ }
+
+ private:
+ void Finish() {
+ if (producer_.Close()) {
+ finished_.MarkFinished();
+ }
+ }
+
+ std::mutex mutex_;
+
+ int num_received_ = 0;
+ int emit_stop_ = -1;
+ Future<> finished_ = Future<>::MakeFinished();
+
+ PushGenerator<util::optional<ExecBatch>>::Producer producer_;
+};
+
+AsyncGenerator<util::optional<ExecBatch>> MakeSinkNode(ExecNode* input,
+ std::string label) {
+ AsyncGenerator<util::optional<ExecBatch>> out;
+ (void)input->plan()->EmplaceNode<SinkNode>(input, std::move(label), &out);
+ return out;
+}
+
+std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
+ std::shared_ptr<Schema> schema,
+ std::function<Future<util::optional<ExecBatch>>()> gen, MemoryPool* pool) {
+ struct Impl : RecordBatchReader {
+ std::shared_ptr<Schema> schema() const override { return schema_; }
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* record_batch) override {
+ ARROW_ASSIGN_OR_RAISE(auto batch, iterator_.Next());
+ if (batch) {
+ ARROW_ASSIGN_OR_RAISE(*record_batch, batch->ToRecordBatch(schema_, pool_));
+ } else {
+ *record_batch = IterationEnd<std::shared_ptr<RecordBatch>>();
+ }
+ return Status::OK();
+ }
+
+ MemoryPool* pool_;
+ std::shared_ptr<Schema> schema_;
+ Iterator<util::optional<ExecBatch>> iterator_;
+ };
+
+ auto out = std::make_shared<Impl>();
+ out->pool_ = pool;
+ out->schema_ = std::move(schema);
+ out->iterator_ = MakeGeneratorIterator(std::move(gen));
+ return out;
+}
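+
+// A draining sketch: expose a plan's output as a RecordBatchReader. The plan
+// must be started before reading; emitted batches are unordered.
+//
+//   auto sink_gen = MakeSinkNode(projected, "sink");
+//   auto reader = MakeGeneratorReader(projected->output_schema(),
+//                                     std::move(sink_gen), default_memory_pool());
+//   RETURN_NOT_OK(plan->StartProducing());
+//   std::shared_ptr<RecordBatch> batch;
+//   while (reader->ReadNext(&batch).ok() && batch != nullptr) { /* consume */ }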
+
+struct ScalarAggregateNode : ExecNode {
+ ScalarAggregateNode(ExecNode* input, std::string label,
+ std::shared_ptr<Schema> output_schema,
+ std::vector<const ScalarAggregateKernel*> kernels,
+ std::vector<std::vector<std::unique_ptr<KernelState>>> states)
+ : ExecNode(input->plan(), std::move(label), {input}, {"target"},
+ /*output_schema=*/std::move(output_schema),
+ /*num_outputs=*/1),
+ kernels_(std::move(kernels)),
+ states_(std::move(states)) {}
+
+ const char* kind_name() override { return "ScalarAggregateNode"; }
+
+ Status DoConsume(const ExecBatch& batch, size_t thread_index) {
+ for (size_t i = 0; i < kernels_.size(); ++i) {
+ KernelContext batch_ctx{plan()->exec_context()};
+ batch_ctx.SetState(states_[i][thread_index].get());
+ ExecBatch single_column_batch{{batch.values[i]}, batch.length};
+ RETURN_NOT_OK(kernels_[i]->consume(&batch_ctx, single_column_batch));
+ }
+ return Status::OK();
+ }
+
+ void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ std::unique_lock<std::mutex> lock(mutex_);
+ auto it =
+ thread_indices_.emplace(std::this_thread::get_id(), thread_indices_.size()).first;
+ auto thread_index = it->second;
+
+ lock.unlock();
+
+ Status st = DoConsume(std::move(batch), thread_index);
+ if (!st.ok()) {
+ outputs_[0]->ErrorReceived(this, std::move(st));
+ return;
+ }
+
+ lock.lock();
+ ++num_received_;
+ st = MaybeFinish(&lock);
+ if (!st.ok()) {
+ outputs_[0]->ErrorReceived(this, std::move(st));
+ }
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->ErrorReceived(this, std::move(error));
+ }
+
+ void InputFinished(ExecNode* input, int seq) override {
+ DCHECK_EQ(input, inputs_[0]);
+ std::unique_lock<std::mutex> lock(mutex_);
+ num_total_ = seq;
+ Status st = MaybeFinish(&lock);
+
+ if (!st.ok()) {
+ outputs_[0]->ErrorReceived(this, std::move(st));
+ }
+ }
+
+ Status StartProducing() override {
+ finished_ = Future<>::Make();
+ // Scalar aggregates will only output a single batch
+ outputs_[0]->InputFinished(this, 1);
+ return Status::OK();
+ }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override {
+ inputs_[0]->StopProducing(this);
+ finished_.MarkFinished();
+ }
+
+ Future<> finished() override { return finished_; }
+
+ private:
+ Status MaybeFinish(std::unique_lock<std::mutex>* lock) {
+ if (num_received_ != num_total_) return Status::OK();
+
+ if (states_.empty()) return Status::OK();
+
+ ExecBatch batch{{}, 1};
+ batch.values.resize(kernels_.size());
+
+ for (size_t i = 0; i < kernels_.size(); ++i) {
+ KernelContext ctx{plan()->exec_context()};
+ ARROW_ASSIGN_OR_RAISE(auto merged, ScalarAggregateKernel::MergeAll(
+ kernels_[i], &ctx, std::move(states_[i])));
+ RETURN_NOT_OK(kernels_[i]->finalize(&ctx, &batch.values[i]));
+ }
+ states_.clear();
+ lock->unlock();
+
+ outputs_[0]->InputReceived(this, 0, batch);
+
+ finished_.MarkFinished();
+ return Status::OK();
+ }
+
+ Future<> finished_ = Future<>::MakeFinished();
+ std::vector<const ScalarAggregateKernel*> kernels_;
+ std::vector<std::vector<std::unique_ptr<KernelState>>> states_;
+ std::unordered_map<std::thread::id, size_t> thread_indices_;
+ std::mutex mutex_;
+ int num_received_ = 0, num_total_ = -1;
+};
+
+Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
+ std::vector<internal::Aggregate> aggregates) {
+ if (input->output_schema()->num_fields() != static_cast<int>(aggregates.size())) {
+ return Status::Invalid("Provided ", aggregates.size(),
+ " aggregates, expected one for each field of ",
+ input->output_schema()->ToString());
+ }
+
+ auto exec_ctx = input->plan()->exec_context();
+
+ std::vector<const ScalarAggregateKernel*> kernels(aggregates.size());
+ std::vector<std::vector<std::unique_ptr<KernelState>>> states(kernels.size());
+ FieldVector fields(kernels.size());
+
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto function,
+ exec_ctx->func_registry()->GetFunction(aggregates[i].function));
+
+ if (function->kind() != Function::SCALAR_AGGREGATE) {
+      return Status::Invalid("Provided non-ScalarAggregateFunction ",
+ aggregates[i].function);
+ }
+
+ auto in_type = ValueDescr::Array(input->output_schema()->fields()[i]->type());
+
+ ARROW_ASSIGN_OR_RAISE(const Kernel* kernel, function->DispatchExact({in_type}));
+ kernels[i] = static_cast<const ScalarAggregateKernel*>(kernel);
+
+ if (aggregates[i].options == nullptr) {
+ aggregates[i].options = function->default_options();
+ }
+
+ KernelContext kernel_ctx{exec_ctx};
+ states[i].resize(exec_ctx->executor() ? exec_ctx->executor()->GetCapacity() : 1);
+ RETURN_NOT_OK(Kernel::InitAll(&kernel_ctx,
+ KernelInitArgs{kernels[i],
+ {
+ in_type,
+ },
+ aggregates[i].options},
+ &states[i]));
+
+ // pick one to resolve the kernel signature
+ kernel_ctx.SetState(states[i][0].get());
+ ARROW_ASSIGN_OR_RAISE(
+ auto descr, kernels[i]->signature->out_type().Resolve(&kernel_ctx, {in_type}));
+
+ fields[i] = field(aggregates[i].function, std::move(descr.type));
+ }
+
+ return input->plan()->EmplaceNode<ScalarAggregateNode>(
+ input, std::move(label), schema(std::move(fields)), std::move(kernels),
+ std::move(states));
+}
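+
+// A usage sketch: one aggregate per input field, as validated above; "sum" and
+// "mean" are scalar aggregate functions in the default registry.
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       ExecNode* agg,
+//       MakeScalarAggregateNode(projected, "aggregate",
+//                               {{"sum", nullptr}, {"mean", nullptr}}));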
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h
new file mode 100644
index 00000000000..c36c174af05
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h
@@ -0,0 +1,287 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+class ARROW_EXPORT ExecPlan : public std::enable_shared_from_this<ExecPlan> {
+ public:
+ using NodeVector = std::vector<ExecNode*>;
+
+ virtual ~ExecPlan() = default;
+
+ ExecContext* exec_context() const { return exec_context_; }
+
+ /// Make an empty exec plan
+ static Result<std::shared_ptr<ExecPlan>> Make(ExecContext* = default_exec_context());
+
+ ExecNode* AddNode(std::unique_ptr<ExecNode> node);
+
+ template <typename Node, typename... Args>
+ Node* EmplaceNode(Args&&... args) {
+ std::unique_ptr<Node> node{new Node{std::forward<Args>(args)...}};
+ auto out = node.get();
+ AddNode(std::move(node));
+ return out;
+ }
+
+ /// The initial inputs
+ const NodeVector& sources() const;
+
+ /// The final outputs
+ const NodeVector& sinks() const;
+
+ Status Validate();
+
+ /// \brief Start producing on all nodes
+ ///
+ /// Nodes are started in reverse topological order, such that any node
+ /// is started before all of its inputs.
+ Status StartProducing();
+
+ /// \brief Stop producing on all nodes
+ ///
+ /// Nodes are stopped in topological order, such that any node
+ /// is stopped before all of its outputs.
+ void StopProducing();
+
+ /// \brief A future which will be marked finished when all nodes have stopped producing.
+ Future<> finished();
+
+ protected:
+ ExecContext* exec_context_;
+ explicit ExecPlan(ExecContext* exec_context) : exec_context_(exec_context) {}
+};
+
+class ARROW_EXPORT ExecNode {
+ public:
+ using NodeVector = std::vector<ExecNode*>;
+
+ virtual ~ExecNode() = default;
+
+ virtual const char* kind_name() = 0;
+
+ // The number of inputs/outputs expected by this node
+ int num_inputs() const { return static_cast<int>(inputs_.size()); }
+ int num_outputs() const { return num_outputs_; }
+
+ /// This node's predecessors in the exec plan
+ const NodeVector& inputs() const { return inputs_; }
+
+ /// \brief Labels identifying the function of each input.
+ const std::vector<std::string>& input_labels() const { return input_labels_; }
+
+ /// This node's successors in the exec plan
+ const NodeVector& outputs() const { return outputs_; }
+
+ /// The datatypes for batches produced by this node
+ const std::shared_ptr<Schema>& output_schema() const { return output_schema_; }
+
+ /// This node's exec plan
+ ExecPlan* plan() { return plan_; }
+
+ /// \brief An optional label, for display and debugging
+ ///
+ /// There is no guarantee that this value is non-empty or unique.
+ const std::string& label() const { return label_; }
+
+ Status Validate() const;
+
+ /// Upstream API:
+ /// These functions are called by input nodes that want to inform this node
+  /// about an updated condition (a new input batch, an error, an impending
+ /// end of stream).
+ ///
+ /// Implementation rules:
+ /// - these may be called anytime after StartProducing() has succeeded
+ /// (and even during or after StopProducing())
+ /// - these may be called concurrently
+ /// - these are allowed to call back into PauseProducing(), ResumeProducing()
+ /// and StopProducing()
+
+ /// Transfer input batch to ExecNode
+ virtual void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) = 0;
+
+ /// Signal error to ExecNode
+ virtual void ErrorReceived(ExecNode* input, Status error) = 0;
+
+ /// Mark the inputs finished after the given number of batches.
+ ///
+ /// This may be called before all inputs are received. This simply fixes
+ /// the total number of incoming batches for an input, so that the ExecNode
+ /// knows when it has received all input, regardless of order.
+ virtual void InputFinished(ExecNode* input, int seq_stop) = 0;
+
+ /// Lifecycle API:
+ /// - start / stop to initiate and terminate production
+ /// - pause / resume to apply backpressure
+ ///
+ /// Implementation rules:
+ /// - StartProducing() should not recurse into the inputs, as it is
+ /// handled by ExecPlan::StartProducing()
+ /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
+ /// concurrently (but only after StartProducing() has returned successfully)
+ /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
+ /// by the downstream nodes' InputReceived(), ErrorReceived(), InputFinished()
+ /// methods
+ /// - StopProducing() should recurse into the inputs
+ /// - StopProducing() must be idempotent
+
+ // XXX What happens if StartProducing() calls an output's InputReceived()
+ // synchronously, and InputReceived() decides to call back into StopProducing()
+ // (or PauseProducing()) because it received enough data?
+ //
+ // Right now, since synchronous calls happen in both directions (input to
+ // output and then output to input), a node must be careful to be reentrant
+ // against synchronous calls from its output, *and* also concurrent calls from
+ // other threads. The most reliable solution is to update the internal state
+ // first, and notify outputs only at the end.
+ //
+ // Alternate rules:
+ // - StartProducing(), ResumeProducing() can call synchronously into
+  //   its outputs' consuming methods (InputReceived() etc.)
+ // - InputReceived(), ErrorReceived(), InputFinished() can call asynchronously
+ // into its inputs' PauseProducing(), StopProducing()
+ //
+ // Alternate API:
+ // - InputReceived(), ErrorReceived(), InputFinished() return a ProductionHint
+ // enum: either None (default), PauseProducing, ResumeProducing, StopProducing
+ // - A method allows passing a ProductionHint asynchronously from an output node
+ // (replacing PauseProducing(), ResumeProducing(), StopProducing())
+
+ /// \brief Start producing
+ ///
+ /// This must only be called once. If this fails, then other lifecycle
+ /// methods must not be called.
+ ///
+ /// This is typically called automatically by ExecPlan::StartProducing().
+ virtual Status StartProducing() = 0;
+
+ /// \brief Pause producing temporarily
+ ///
+ /// This call is a hint that an output node is currently not willing
+ /// to receive data.
+ ///
+ /// This may be called any number of times after StartProducing() succeeds.
+ /// However, the node is still free to produce data (which may be difficult
+ /// to prevent anyway if data is produced using multiple threads).
+ virtual void PauseProducing(ExecNode* output) = 0;
+
+ /// \brief Resume producing after a temporary pause
+ ///
+ /// This call is a hint that an output node is willing to receive data again.
+ ///
+ /// This may be called any number of times after StartProducing() succeeds.
+ /// This may also be called concurrently with PauseProducing(), which suggests
+ /// the implementation may use an atomic counter.
+ virtual void ResumeProducing(ExecNode* output) = 0;
+
+ /// \brief Stop producing definitively to a single output
+ ///
+ /// This call is a hint that an output node has completed and is not willing
+ /// to receive any further data.
+ virtual void StopProducing(ExecNode* output) = 0;
+
+ /// \brief Stop producing definitively to all outputs
+ virtual void StopProducing() = 0;
+
+ /// \brief A future which will be marked finished when this node has stopped producing.
+ virtual Future<> finished() = 0;
+
+ protected:
+ ExecNode(ExecPlan* plan, std::string label, NodeVector inputs,
+ std::vector<std::string> input_labels, std::shared_ptr<Schema> output_schema,
+ int num_outputs);
+
+ ExecPlan* plan_;
+ std::string label_;
+
+ NodeVector inputs_;
+ std::vector<std::string> input_labels_;
+
+ std::shared_ptr<Schema> output_schema_;
+ int num_outputs_;
+ NodeVector outputs_;
+};
+
+/// \brief Adapt an AsyncGenerator<ExecBatch> as a source node
+///
+/// plan->exec_context()->executor() is used to parallelize pushing to
+/// outputs, if provided.
+ARROW_EXPORT
+ExecNode* MakeSourceNode(ExecPlan* plan, std::string label,
+ std::shared_ptr<Schema> output_schema,
+ std::function<Future<util::optional<ExecBatch>>()>);
+
+/// \brief Add a sink node which forwards to an AsyncGenerator<ExecBatch>
+///
+/// Emitted batches will not be ordered.
+ARROW_EXPORT
+std::function<Future<util::optional<ExecBatch>>()> MakeSinkNode(ExecNode* input,
+ std::string label);
+
+/// \brief Wrap an ExecBatch generator in a RecordBatchReader.
+///
+/// The RecordBatchReader does not impose any ordering on emitted batches.
+ARROW_EXPORT
+std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
+ std::shared_ptr<Schema>, std::function<Future<util::optional<ExecBatch>>()>,
+ MemoryPool*);
+
+/// \brief Make a node which excludes some rows from batches passed through it
+///
+/// The filter Expression will be evaluated against each batch which is pushed to
+/// this node. Any rows for which the filter does not evaluate to `true` will be
+/// excluded from the batch emitted by this node.
+///
+/// If the filter is not already bound, it will be bound against the input's schema.
+ARROW_EXPORT
+Result<ExecNode*> MakeFilterNode(ExecNode* input, std::string label, Expression filter);
+
+/// \brief Make a node which executes expressions on input batches, producing new batches.
+///
+/// Each expression will be evaluated against each batch which is pushed to
+/// this node to produce a corresponding output column.
+///
+/// If exprs are not already bound, they will be bound against the input's schema.
+/// If names are not provided, the string representations of exprs will be used.
+ARROW_EXPORT
+Result<ExecNode*> MakeProjectNode(ExecNode* input, std::string label,
+ std::vector<Expression> exprs,
+ std::vector<std::string> names = {});
+
+ARROW_EXPORT
+Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
+ std::vector<internal::Aggregate> aggregates);
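+
+// A minimal end-to-end sketch combining the declarations above; `gen` and
+// `schema_` are assumed given, and error handling uses the usual macros.
+//
+//   ARROW_ASSIGN_OR_RAISE(auto plan, ExecPlan::Make());
+//   ExecNode* source = MakeSourceNode(plan.get(), "source", schema_, gen);
+//   ARROW_ASSIGN_OR_RAISE(
+//       ExecNode* filtered,
+//       MakeFilterNode(source, "filter", greater(field_ref("x"), literal(0))));
+//   auto sink_gen = MakeSinkNode(filtered, "sink");
+//   RETURN_NOT_OK(plan->Validate());
+//   RETURN_NOT_OK(plan->StartProducing());
+//   // ... drain sink_gen, e.g. via MakeGeneratorReader ...
+//   plan->finished().Wait();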
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc
new file mode 100644
index 00000000000..4aab64a46a4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc
@@ -0,0 +1,1186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/expression.h"
+
+#include <unordered_map>
+#include <unordered_set>
+
+#include "arrow/chunked_array.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec/expression_internal.h"
+#include "arrow/compute/exec_internal.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/util/hash_util.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/string.h"
+#include "arrow/util/value_parsing.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+namespace compute {
+
+void Expression::Call::ComputeHash() {
+ hash = std::hash<std::string>{}(function_name);
+ for (const auto& arg : arguments) {
+ arrow::internal::hash_combine(hash, arg.hash());
+ }
+}
+
+Expression::Expression(Call call) {
+ call.ComputeHash();
+ impl_ = std::make_shared<Impl>(std::move(call));
+}
+
+Expression::Expression(Datum literal)
+ : impl_(std::make_shared<Impl>(std::move(literal))) {}
+
+Expression::Expression(Parameter parameter)
+ : impl_(std::make_shared<Impl>(std::move(parameter))) {}
+
+Expression literal(Datum lit) { return Expression(std::move(lit)); }
+
+Expression field_ref(FieldRef ref) {
+ return Expression(Expression::Parameter{std::move(ref), ValueDescr{}, -1});
+}
+
+Expression call(std::string function, std::vector<Expression> arguments,
+ std::shared_ptr<compute::FunctionOptions> options) {
+ Expression::Call call;
+ call.function_name = std::move(function);
+ call.arguments = std::move(arguments);
+ call.options = std::move(options);
+ return Expression(std::move(call));
+}
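+
+// A construction sketch: compose an unbound expression tree from the factories
+// above. It stays unbound (no type, no kernel) until Bind() succeeds.
+//
+//   Expression expr = call("add", {field_ref("a"), literal(3)});
+//   // expr.IsBound() == false; expr.ToString() == "add(a, 3)"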
+
+const Datum* Expression::literal() const { return util::get_if<Datum>(impl_.get()); }
+
+const Expression::Parameter* Expression::parameter() const {
+ return util::get_if<Parameter>(impl_.get());
+}
+
+const FieldRef* Expression::field_ref() const {
+ if (auto parameter = this->parameter()) {
+ return &parameter->ref;
+ }
+ return nullptr;
+}
+
+const Expression::Call* Expression::call() const {
+ return util::get_if<Call>(impl_.get());
+}
+
+ValueDescr Expression::descr() const {
+ if (impl_ == nullptr) return {};
+
+ if (auto lit = literal()) {
+ return lit->descr();
+ }
+
+ if (auto parameter = this->parameter()) {
+ return parameter->descr;
+ }
+
+ return CallNotNull(*this)->descr;
+}
+
+namespace {
+
+std::string PrintDatum(const Datum& datum) {
+ if (datum.is_scalar()) {
+ if (!datum.scalar()->is_valid) return "null";
+
+ switch (datum.type()->id()) {
+ case Type::STRING:
+ case Type::LARGE_STRING:
+ return '"' +
+ Escape(util::string_view(*datum.scalar_as<BaseBinaryScalar>().value)) +
+ '"';
+
+ case Type::BINARY:
+ case Type::FIXED_SIZE_BINARY:
+ case Type::LARGE_BINARY:
+ return '"' + datum.scalar_as<BaseBinaryScalar>().value->ToHexString() + '"';
+
+ default:
+ break;
+ }
+
+ return datum.scalar()->ToString();
+ }
+ return datum.ToString();
+}
+
+} // namespace
+
+std::string Expression::ToString() const {
+ if (auto lit = literal()) {
+ return PrintDatum(*lit);
+ }
+
+ if (auto ref = field_ref()) {
+ if (auto name = ref->name()) {
+ return *name;
+ }
+ if (auto path = ref->field_path()) {
+ return path->ToString();
+ }
+ return ref->ToString();
+ }
+
+ auto call = CallNotNull(*this);
+ auto binary = [&](std::string op) {
+ return "(" + call->arguments[0].ToString() + " " + op + " " +
+ call->arguments[1].ToString() + ")";
+ };
+
+ if (auto cmp = Comparison::Get(call->function_name)) {
+ return binary(Comparison::GetOp(*cmp));
+ }
+
+ constexpr util::string_view kleene = "_kleene";
+ if (util::string_view{call->function_name}.ends_with(kleene)) {
+ auto op = call->function_name.substr(0, call->function_name.size() - kleene.size());
+ return binary(std::move(op));
+ }
+
+ if (auto options = GetMakeStructOptions(*call)) {
+ std::string out = "{";
+ auto argument = call->arguments.begin();
+ for (const auto& field_name : options->field_names) {
+ out += field_name + "=" + argument++->ToString() + ", ";
+ }
+ out.resize(out.size() - 1);
+ out.back() = '}';
+ return out;
+ }
+
+ std::string out = call->function_name + "(";
+ for (const auto& arg : call->arguments) {
+ out += arg.ToString() + ", ";
+ }
+
+ if (call->options) {
+ out += call->options->ToString();
+ out.resize(out.size() + 1);
+ } else {
+ out.resize(out.size() - 1);
+ }
+ out.back() = ')';
+ return out;
+}
+
+void PrintTo(const Expression& expr, std::ostream* os) {
+ *os << expr.ToString();
+ if (expr.IsBound()) {
+ *os << "[bound]";
+ }
+}
+
+bool Expression::Equals(const Expression& other) const {
+ if (Identical(*this, other)) return true;
+
+ if (impl_->index() != other.impl_->index()) {
+ return false;
+ }
+
+ if (auto lit = literal()) {
+ return lit->Equals(*other.literal());
+ }
+
+ if (auto ref = field_ref()) {
+ return ref->Equals(*other.field_ref());
+ }
+
+ auto call = CallNotNull(*this);
+ auto other_call = CallNotNull(other);
+
+ if (call->function_name != other_call->function_name ||
+ call->kernel != other_call->kernel) {
+ return false;
+ }
+
+ for (size_t i = 0; i < call->arguments.size(); ++i) {
+ if (!call->arguments[i].Equals(other_call->arguments[i])) {
+ return false;
+ }
+ }
+
+ if (call->options == other_call->options) return true;
+ if (call->options && other_call->options) {
+ return call->options->Equals(other_call->options);
+ }
+ return false;
+}
+
+bool Identical(const Expression& l, const Expression& r) { return l.impl_ == r.impl_; }
+
+size_t Expression::hash() const {
+ if (auto lit = literal()) {
+ if (lit->is_scalar()) {
+ return lit->scalar()->hash();
+ }
+ return 0;
+ }
+
+ if (auto ref = field_ref()) {
+ return ref->hash();
+ }
+
+ return CallNotNull(*this)->hash;
+}
+
+bool Expression::IsBound() const {
+ if (type() == nullptr) return false;
+
+ if (auto call = this->call()) {
+ if (call->kernel == nullptr) return false;
+
+ for (const Expression& arg : call->arguments) {
+ if (!arg.IsBound()) return false;
+ }
+ }
+
+ return true;
+}
+
+bool Expression::IsScalarExpression() const {
+ if (auto lit = literal()) {
+ return lit->is_scalar();
+ }
+
+ if (field_ref()) return true;
+
+ auto call = CallNotNull(*this);
+
+ for (const Expression& arg : call->arguments) {
+ if (!arg.IsScalarExpression()) return false;
+ }
+
+ if (call->function) {
+ return call->function->kind() == compute::Function::SCALAR;
+ }
+
+ // this expression is not bound; make a best guess based on
+ // the default function registry
+ if (auto function = compute::GetFunctionRegistry()
+ ->GetFunction(call->function_name)
+ .ValueOr(nullptr)) {
+ return function->kind() == compute::Function::SCALAR;
+ }
+
+ // unknown function or other error; conservatively return false
+ return false;
+}
+
+bool Expression::IsNullLiteral() const {
+ if (auto lit = literal()) {
+ if (lit->null_count() == lit->length()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool Expression::IsSatisfiable() const {
+ if (type() && type()->id() == Type::NA) {
+ return false;
+ }
+
+ if (auto lit = literal()) {
+ if (lit->null_count() == lit->length()) {
+ return false;
+ }
+
+ if (lit->is_scalar() && lit->type()->id() == Type::BOOL) {
+ return lit->scalar_as<BooleanScalar>().value;
+ }
+ }
+
+ return true;
+}
+
+namespace {
+
+// Produce a bound Expression from unbound Call and bound arguments.
+Result<Expression> BindNonRecursive(Expression::Call call, bool insert_implicit_casts,
+ compute::ExecContext* exec_context) {
+ DCHECK(std::all_of(call.arguments.begin(), call.arguments.end(),
+ [](const Expression& argument) { return argument.IsBound(); }));
+
+ auto descrs = GetDescriptors(call.arguments);
+ ARROW_ASSIGN_OR_RAISE(call.function, GetFunction(call, exec_context));
+
+ if (!insert_implicit_casts) {
+ ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchExact(descrs));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchBest(&descrs));
+
+ for (size_t i = 0; i < descrs.size(); ++i) {
+ if (descrs[i] == call.arguments[i].descr()) continue;
+
+ if (descrs[i].shape != call.arguments[i].descr().shape) {
+ return Status::NotImplemented(
+            "Automatic broadcasting of scalar arguments to arrays in ",
+ Expression(std::move(call)).ToString());
+ }
+
+ if (auto lit = call.arguments[i].literal()) {
+ ARROW_ASSIGN_OR_RAISE(Datum new_lit, compute::Cast(*lit, descrs[i].type));
+ call.arguments[i] = literal(std::move(new_lit));
+ continue;
+ }
+
+ // construct an implicit cast Expression with which to replace this argument
+ Expression::Call implicit_cast;
+ implicit_cast.function_name = "cast";
+ implicit_cast.arguments = {std::move(call.arguments[i])};
+ implicit_cast.options = std::make_shared<compute::CastOptions>(
+ compute::CastOptions::Safe(descrs[i].type));
+
+ ARROW_ASSIGN_OR_RAISE(
+ call.arguments[i],
+ BindNonRecursive(std::move(implicit_cast),
+ /*insert_implicit_casts=*/false, exec_context));
+ }
+ }
+
+ compute::KernelContext kernel_context(exec_context);
+ if (call.kernel->init) {
+ ARROW_ASSIGN_OR_RAISE(
+ call.kernel_state,
+ call.kernel->init(&kernel_context, {call.kernel, descrs, call.options.get()}));
+
+ kernel_context.SetState(call.kernel_state.get());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ call.descr, call.kernel->signature->out_type().Resolve(&kernel_context, descrs));
+
+ return Expression(std::move(call));
+}
+
+template <typename TypeOrSchema>
+Result<Expression> BindImpl(Expression expr, const TypeOrSchema& in,
+ ValueDescr::Shape shape, compute::ExecContext* exec_context) {
+ if (exec_context == nullptr) {
+ compute::ExecContext exec_context;
+ return BindImpl(std::move(expr), in, shape, &exec_context);
+ }
+
+ if (expr.literal()) return expr;
+
+ if (auto ref = expr.field_ref()) {
+ if (ref->IsNested()) {
+ return Status::NotImplemented("nested field references");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto path, ref->FindOne(in));
+
+ auto bound = *expr.parameter();
+ bound.index = path[0];
+ ARROW_ASSIGN_OR_RAISE(auto field, path.Get(in));
+ bound.descr.type = field->type();
+ bound.descr.shape = shape;
+ return Expression{std::move(bound)};
+ }
+
+ auto call = *CallNotNull(expr);
+ for (auto& argument : call.arguments) {
+ ARROW_ASSIGN_OR_RAISE(argument,
+ BindImpl(std::move(argument), in, shape, exec_context));
+ }
+ return BindNonRecursive(std::move(call),
+ /*insert_implicit_casts=*/true, exec_context);
+}
+
+} // namespace
+
+Result<Expression> Expression::Bind(const ValueDescr& in,
+ compute::ExecContext* exec_context) const {
+ return BindImpl(*this, *in.type, in.shape, exec_context);
+}
+
+Result<Expression> Expression::Bind(const Schema& in_schema,
+ compute::ExecContext* exec_context) const {
+ return BindImpl(*this, in_schema, ValueDescr::ARRAY, exec_context);
+}
+
+Result<ExecBatch> MakeExecBatch(const Schema& full_schema, const Datum& partial) {
+ ExecBatch out;
+
+ if (partial.kind() == Datum::RECORD_BATCH) {
+ const auto& partial_batch = *partial.record_batch();
+ out.length = partial_batch.num_rows();
+
+ for (const auto& field : full_schema.fields()) {
+ ARROW_ASSIGN_OR_RAISE(auto column,
+ FieldRef(field->name()).GetOneOrNone(partial_batch));
+
+ if (column) {
+ if (!column->type()->Equals(field->type())) {
+ // Referenced field was present but didn't have the expected type.
+ // This *should* be handled by readers, and will just be an error in the future.
+ ARROW_ASSIGN_OR_RAISE(
+ auto converted,
+ compute::Cast(column, field->type(), compute::CastOptions::Safe()));
+ column = converted.make_array();
+ }
+ out.values.emplace_back(std::move(column));
+ } else {
+ out.values.emplace_back(MakeNullScalar(field->type()));
+ }
+ }
+ return out;
+ }
+
+ // wasteful but useful for testing:
+ if (partial.type()->id() == Type::STRUCT) {
+ if (partial.is_array()) {
+ ARROW_ASSIGN_OR_RAISE(auto partial_batch,
+ RecordBatch::FromStructArray(partial.make_array()));
+
+ return MakeExecBatch(full_schema, partial_batch);
+ }
+
+ if (partial.is_scalar()) {
+ ARROW_ASSIGN_OR_RAISE(auto partial_array,
+ MakeArrayFromScalar(*partial.scalar(), 1));
+ ARROW_ASSIGN_OR_RAISE(auto out, MakeExecBatch(full_schema, partial_array));
+
+ for (Datum& value : out.values) {
+ if (value.is_scalar()) continue;
+ ARROW_ASSIGN_OR_RAISE(value, value.make_array()->GetScalar(0));
+ }
+ return out;
+ }
+ }
+
+ return Status::NotImplemented("MakeExecBatch from ", PrintDatum(partial));
+}
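+
+// A shape sketch: fields of full_schema that are missing from the partial
+// input surface as null Scalars, so kernels see every declared column.
+//
+//   // full_schema = {a: int32, b: utf8}; `rb` is a RecordBatch with only "a"
+//   ARROW_ASSIGN_OR_RAISE(auto batch, MakeExecBatch(*full_schema, Datum(rb)));
+//   // batch.values = {array "a", null utf8 scalar}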
+
+Result<Datum> ExecuteScalarExpression(const Expression& expr, const Schema& full_schema,
+ const Datum& partial_input,
+ compute::ExecContext* exec_context) {
+ ARROW_ASSIGN_OR_RAISE(auto input, MakeExecBatch(full_schema, partial_input));
+ return ExecuteScalarExpression(expr, input, exec_context);
+}
+
+Result<Datum> ExecuteScalarExpression(const Expression& expr, const ExecBatch& input,
+ compute::ExecContext* exec_context) {
+ if (exec_context == nullptr) {
+ compute::ExecContext exec_context;
+ return ExecuteScalarExpression(expr, input, &exec_context);
+ }
+
+ if (!expr.IsBound()) {
+ return Status::Invalid("Cannot Execute unbound expression.");
+ }
+
+ if (!expr.IsScalarExpression()) {
+ return Status::Invalid(
+ "ExecuteScalarExpression cannot Execute non-scalar expression ", expr.ToString());
+ }
+
+ if (auto lit = expr.literal()) return *lit;
+
+ if (auto param = expr.parameter()) {
+ if (param->descr.type->id() == Type::NA) {
+ return MakeNullScalar(null());
+ }
+
+ const Datum& field = input[param->index];
+ if (!field.type()->Equals(param->descr.type)) {
+ return Status::Invalid("Referenced field ", expr.ToString(), " was ",
+ field.type()->ToString(), " but should have been ",
+ param->descr.type->ToString());
+ }
+
+ return field;
+ }
+
+ auto call = CallNotNull(expr);
+
+ std::vector<Datum> arguments(call->arguments.size());
+ for (size_t i = 0; i < arguments.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ arguments[i], ExecuteScalarExpression(call->arguments[i], input, exec_context));
+ }
+
+ auto executor = compute::detail::KernelExecutor::MakeScalar();
+
+ compute::KernelContext kernel_context(exec_context);
+ kernel_context.SetState(call->kernel_state.get());
+
+ auto kernel = call->kernel;
+ auto descrs = GetDescriptors(arguments);
+ auto options = call->options.get();
+ RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, descrs, options}));
+
+ auto listener = std::make_shared<compute::detail::DatumAccumulator>();
+ RETURN_NOT_OK(executor->Execute(arguments, listener.get()));
+ return executor->WrapResults(arguments, listener->values());
+}
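+
+// An execution sketch: bind first, then evaluate against a (schema, partial
+// input) pair via the overload above. Assumes `schm` has int32 fields "a", "b".
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto expr, call("add", {field_ref("a"), field_ref("b")}).Bind(*schm));
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//                         ExecuteScalarExpression(expr, *schm, Datum(rb)));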
+
+namespace {
+
+std::array<std::pair<const Expression&, const Expression&>, 2>
+ArgumentsAndFlippedArguments(const Expression::Call& call) {
+ DCHECK_EQ(call.arguments.size(), 2);
+ return {std::pair<const Expression&, const Expression&>{call.arguments[0],
+ call.arguments[1]},
+ std::pair<const Expression&, const Expression&>{call.arguments[1],
+ call.arguments[0]}};
+}
+
+template <typename BinOp, typename It,
+ typename Out = typename std::iterator_traits<It>::value_type>
+util::optional<Out> FoldLeft(It begin, It end, const BinOp& bin_op) {
+ if (begin == end) return util::nullopt;
+
+ Out folded = std::move(*begin++);
+ while (begin != end) {
+ folded = bin_op(std::move(folded), std::move(*begin++));
+ }
+ return folded;
+}
+
+util::optional<compute::NullHandling::type> GetNullHandling(
+ const Expression::Call& call) {
+ if (call.function && call.function->kind() == compute::Function::SCALAR) {
+ return static_cast<const compute::ScalarKernel*>(call.kernel)->null_handling;
+ }
+ return util::nullopt;
+}
+
+} // namespace
+
+std::vector<FieldRef> FieldsInExpression(const Expression& expr) {
+ if (expr.literal()) return {};
+
+ if (auto ref = expr.field_ref()) {
+ return {*ref};
+ }
+
+ std::vector<FieldRef> fields;
+ for (const Expression& arg : CallNotNull(expr)->arguments) {
+ auto argument_fields = FieldsInExpression(arg);
+ std::move(argument_fields.begin(), argument_fields.end(), std::back_inserter(fields));
+ }
+ return fields;
+}
+
+bool ExpressionHasFieldRefs(const Expression& expr) {
+ if (expr.literal()) return false;
+
+ if (expr.field_ref()) return true;
+
+ for (const Expression& arg : CallNotNull(expr)->arguments) {
+ if (ExpressionHasFieldRefs(arg)) return true;
+ }
+ return false;
+}
+
+Result<Expression> FoldConstants(Expression expr) {
+ return Modify(
+ std::move(expr), [](Expression expr) { return expr; },
+ [](Expression expr, ...) -> Result<Expression> {
+ auto call = CallNotNull(expr);
+ if (std::all_of(call->arguments.begin(), call->arguments.end(),
+ [](const Expression& argument) { return argument.literal(); })) {
+ // all arguments are literal; we can evaluate this subexpression *now*
+ static const ExecBatch ignored_input = ExecBatch{};
+ ARROW_ASSIGN_OR_RAISE(Datum constant,
+ ExecuteScalarExpression(expr, ignored_input));
+
+ return literal(std::move(constant));
+ }
+
+ // XXX the following should probably be in a registry of passes instead
+ // of inline
+
+ if (GetNullHandling(*call) == compute::NullHandling::INTERSECTION) {
+ // kernels which always produce intersected validity can be resolved
+ // to null *now* if any of their inputs is a null literal
+ for (const auto& argument : call->arguments) {
+ if (argument.IsNullLiteral()) {
+ return argument;
+ }
+ }
+ }
+
+ if (call->function_name == "and_kleene") {
+ for (auto args : ArgumentsAndFlippedArguments(*call)) {
+ // true and x == x
+ if (args.first == literal(true)) return args.second;
+
+ // false and x == false
+ if (args.first == literal(false)) return args.first;
+
+ // x and x == x
+ if (args.first == args.second) return args.first;
+ }
+ return expr;
+ }
+
+ if (call->function_name == "or_kleene") {
+ for (auto args : ArgumentsAndFlippedArguments(*call)) {
+ // false or x == x
+ if (args.first == literal(false)) return args.second;
+
+ // true or x == true
+ if (args.first == literal(true)) return args.first;
+
+ // x or x == x
+ if (args.first == args.second) return args.first;
+ }
+ return expr;
+ }
+
+ return expr;
+ });
+}
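+
+// Illustrative, assuming the default registry: after binding,
+//   call("add", {literal(1), literal(2)})
+// folds to literal(3), while and_kleene(literal(false), x) short-circuits to
+// literal(false) without evaluating x.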
+
+namespace {
+
+std::vector<Expression> GuaranteeConjunctionMembers(
+ const Expression& guaranteed_true_predicate) {
+ auto guarantee = guaranteed_true_predicate.call();
+ if (!guarantee || guarantee->function_name != "and_kleene") {
+ return {guaranteed_true_predicate};
+ }
+ return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
+}
+
+// Conjunction members which are represented in known_values are erased from
+// conjunction_members
+Status ExtractKnownFieldValuesImpl(
+ std::vector<Expression>* conjunction_members,
+ std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
+ auto unconsumed_end =
+ std::partition(conjunction_members->begin(), conjunction_members->end(),
+ [](const Expression& expr) {
+                       // search for an equality condition between a field and a literal
+ auto call = expr.call();
+ if (!call) return true;
+
+ if (call->function_name == "equal") {
+ auto ref = call->arguments[0].field_ref();
+ auto lit = call->arguments[1].literal();
+ return !(ref && lit);
+ }
+
+ if (call->function_name == "is_null") {
+ auto ref = call->arguments[0].field_ref();
+ return !ref;
+ }
+
+ return true;
+ });
+
+ for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
+ auto call = CallNotNull(*it);
+
+ if (call->function_name == "equal") {
+ auto ref = call->arguments[0].field_ref();
+ auto lit = call->arguments[1].literal();
+ known_values->emplace(*ref, *lit);
+ } else if (call->function_name == "is_null") {
+ auto ref = call->arguments[0].field_ref();
+ known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
+ }
+ }
+
+ conjunction_members->erase(unconsumed_end, conjunction_members->end());
+
+ return Status::OK();
+}
+
+} // namespace
+
+Result<KnownFieldValues> ExtractKnownFieldValues(
+ const Expression& guaranteed_true_predicate) {
+ auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate);
+ KnownFieldValues known_values;
+ RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map));
+ return known_values;
+}
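+
+// Illustrative: given the guarantee
+//   and_(equal(field_ref("a"), literal(3)), is_null(field_ref("b")))
+// the returned map holds {FieldRef("a"): Datum(3), FieldRef("b"): null scalar};
+// conjunction members of any other shape are ignored.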
+
+Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
+ Expression expr) {
+ if (!expr.IsBound()) {
+ return Status::Invalid(
+ "ReplaceFieldsWithKnownValues called on an unbound Expression");
+ }
+
+ return Modify(
+ std::move(expr),
+ [&known_values](Expression expr) -> Result<Expression> {
+ if (auto ref = expr.field_ref()) {
+ auto it = known_values.map.find(*ref);
+ if (it != known_values.map.end()) {
+ Datum lit = it->second;
+ if (lit.descr() == expr.descr()) return literal(std::move(lit));
+ // type mismatch, try casting the known value to the correct type
+
+ if (expr.type()->id() == Type::DICTIONARY &&
+ lit.type()->id() != Type::DICTIONARY) {
+ // the known value must be dictionary encoded
+
+ const auto& dict_type = checked_cast<const DictionaryType&>(*expr.type());
+ if (!lit.type()->Equals(dict_type.value_type())) {
+ ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, dict_type.value_type()));
+ }
+
+ if (lit.is_scalar()) {
+ ARROW_ASSIGN_OR_RAISE(auto dictionary,
+ MakeArrayFromScalar(*lit.scalar(), 1));
+
+ lit = Datum{DictionaryScalar::Make(MakeScalar<int32_t>(0),
+ std::move(dictionary))};
+ }
+ }
+
+ ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, expr.type()));
+ return literal(std::move(lit));
+ }
+ }
+ return expr;
+ },
+ [](Expression expr, ...) { return expr; });
+}
+
+namespace {
+
+bool IsBinaryAssociativeCommutative(const Expression::Call& call) {
+ static std::unordered_set<std::string> binary_associative_commutative{
+ "and", "or", "and_kleene", "or_kleene", "xor",
+ "multiply", "add", "multiply_checked", "add_checked"};
+
+ auto it = binary_associative_commutative.find(call.function_name);
+ return it != binary_associative_commutative.end();
+}
+
+} // namespace
+
+Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_context) {
+ if (exec_context == nullptr) {
+ compute::ExecContext exec_context;
+ return Canonicalize(std::move(expr), &exec_context);
+ }
+
+ // If potentially reconstructing more deeply than a call's immediate arguments
+ // (for example, when reorganizing an associative chain), add expressions to this set to
+ // avoid unnecessary work
+ struct {
+ std::unordered_set<Expression, Expression::Hash> set_;
+
+ bool operator()(const Expression& expr) const {
+ return set_.find(expr) != set_.end();
+ }
+
+ void Add(std::vector<Expression> exprs) {
+ std::move(exprs.begin(), exprs.end(), std::inserter(set_, set_.end()));
+ }
+ } AlreadyCanonicalized;
+
+ return Modify(
+ std::move(expr),
+ [&AlreadyCanonicalized, exec_context](Expression expr) -> Result<Expression> {
+ auto call = expr.call();
+ if (!call) return expr;
+
+ if (AlreadyCanonicalized(expr)) return expr;
+
+ if (IsBinaryAssociativeCommutative(*call)) {
+ struct {
+ int Priority(const Expression& operand) const {
+ // order literals first, starting with nulls
+ if (operand.IsNullLiteral()) return 0;
+ if (operand.literal()) return 1;
+ return 2;
+ }
+ bool operator()(const Expression& l, const Expression& r) const {
+ return Priority(l) < Priority(r);
+ }
+ } CanonicalOrdering;
+
+ FlattenedAssociativeChain chain(expr);
+ if (chain.was_left_folded &&
+ std::is_sorted(chain.fringe.begin(), chain.fringe.end(),
+ CanonicalOrdering)) {
+ AlreadyCanonicalized.Add(std::move(chain.exprs));
+ return expr;
+ }
+
+ std::stable_sort(chain.fringe.begin(), chain.fringe.end(), CanonicalOrdering);
+
+ // fold the chain back up
+ auto folded =
+ FoldLeft(chain.fringe.begin(), chain.fringe.end(),
+ [call, &AlreadyCanonicalized](Expression l, Expression r) {
+ auto canonicalized_call = *call;
+ canonicalized_call.arguments = {std::move(l), std::move(r)};
+ Expression expr(std::move(canonicalized_call));
+ AlreadyCanonicalized.Add({expr});
+ return expr;
+ });
+ return std::move(*folded);
+ }
+
+ if (auto cmp = Comparison::Get(call->function_name)) {
+ if (call->arguments[0].literal() && !call->arguments[1].literal()) {
+ // ensure that literals are on comparisons' RHS
+ auto flipped_call = *call;
+
+ std::swap(flipped_call.arguments[0], flipped_call.arguments[1]);
+ flipped_call.function_name =
+ Comparison::GetName(Comparison::GetFlipped(*cmp));
+
+ return BindNonRecursive(flipped_call,
+ /*insert_implicit_casts=*/false, exec_context);
+ }
+ }
+
+ return expr;
+ },
+ [](Expression expr, ...) { return expr; });
+}
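+
+// Illustrative: Canonicalize reorders associative chains so that literals come
+// first (null literals before other literals) and flips comparisons so that
+// literals land on the RHS; e.g. less(literal(3), field_ref("x")) becomes
+// greater(field_ref("x"), literal(3)).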
+
+namespace {
+
+Result<Expression> DirectComparisonSimplification(Expression expr,
+ const Expression::Call& guarantee) {
+ return Modify(
+ std::move(expr), [](Expression expr) { return expr; },
+ [&guarantee](Expression expr, ...) -> Result<Expression> {
+ auto call = expr.call();
+ if (!call) return expr;
+
+ // Ensure both calls are comparisons with equal LHS and scalar RHS
+ auto cmp = Comparison::Get(expr);
+ auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+
+ if (!cmp) return expr;
+ if (!cmp_guarantee) return expr;
+
+ const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+ const auto& guarantee_lhs = guarantee.arguments[0];
+ if (lhs != guarantee_lhs) return expr;
+
+ auto rhs = call->arguments[1].literal();
+ auto guarantee_rhs = guarantee.arguments[1].literal();
+
+ if (!rhs) return expr;
+ if (!rhs->is_scalar()) return expr;
+
+ if (!guarantee_rhs) return expr;
+ if (!guarantee_rhs->is_scalar()) return expr;
+
+ ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
+ Comparison::Execute(*rhs, *guarantee_rhs));
+ DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+
+ if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
+ // RHS of filter is equal to RHS of guarantee
+
+ if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
+ // guarantee is a subset of filter, so all data will be included
+ // x > 1, x >= 1, x != 1 guaranteed by x > 1
+ return literal(true);
+ }
+
+ if ((*cmp & *cmp_guarantee) == 0) {
+ // guarantee disjoint with filter, so all data will be excluded
+ // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+ return literal(false);
+ }
+
+ return expr;
+ }
+
+ if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
+ // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
+ return expr;
+ }
+
+ if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
+ // x > 1, x >= 1, x != 1 guaranteed by x >= 3
+ return literal(true);
+ } else {
+ // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
+ return literal(false);
+ }
+ });
+}
+
+} // namespace
+
+Result<Expression> SimplifyWithGuarantee(Expression expr,
+ const Expression& guaranteed_true_predicate) {
+ auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate);
+
+ KnownFieldValues known_values;
+ RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map));
+
+ ARROW_ASSIGN_OR_RAISE(expr,
+ ReplaceFieldsWithKnownValues(known_values, std::move(expr)));
+
+ auto CanonicalizeAndFoldConstants = [&expr] {
+ ARROW_ASSIGN_OR_RAISE(expr, Canonicalize(std::move(expr)));
+ ARROW_ASSIGN_OR_RAISE(expr, FoldConstants(std::move(expr)));
+ return Status::OK();
+ };
+ RETURN_NOT_OK(CanonicalizeAndFoldConstants());
+
+ for (const auto& guarantee : conjunction_members) {
+ if (Comparison::Get(guarantee) && guarantee.call()->arguments[1].literal()) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto simplified, DirectComparisonSimplification(expr, *CallNotNull(guarantee)));
+
+ if (Identical(simplified, expr)) continue;
+
+ expr = std::move(simplified);
+ RETURN_NOT_OK(CanonicalizeAndFoldConstants());
+ }
+ }
+
+ return expr;
+}
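+
+// Illustrative: simplifying the filter greater(field_ref("x"), literal(0))
+// under the guarantee equal(field_ref("x"), literal(3)) first replaces the
+// field with its known value, then folds greater(3, 0) to literal(true).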
+
+// Serialization is accomplished by converting expressions to KeyValueMetadata and storing
+// this in the schema of a RecordBatch. Embedded arrays and scalars are stored in its
+// columns. Finally, the RecordBatch is written to an IPC file.
+Result<std::shared_ptr<Buffer>> Serialize(const Expression& expr) {
+ struct {
+ std::shared_ptr<KeyValueMetadata> metadata_ = std::make_shared<KeyValueMetadata>();
+ ArrayVector columns_;
+
+ Result<std::string> AddScalar(const Scalar& scalar) {
+ auto ret = columns_.size();
+ ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(scalar, 1));
+ columns_.push_back(std::move(array));
+ return std::to_string(ret);
+ }
+
+ Status Visit(const Expression& expr) {
+ if (auto lit = expr.literal()) {
+ if (!lit->is_scalar()) {
+ return Status::NotImplemented("Serialization of non-scalar literals");
+ }
+ ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*lit->scalar()));
+ metadata_->Append("literal", std::move(value));
+ return Status::OK();
+ }
+
+ if (auto ref = expr.field_ref()) {
+ if (!ref->name()) {
+ return Status::NotImplemented("Serialization of non-name field_refs");
+ }
+ metadata_->Append("field_ref", *ref->name());
+ return Status::OK();
+ }
+
+ auto call = CallNotNull(expr);
+ metadata_->Append("call", call->function_name);
+
+ for (const auto& argument : call->arguments) {
+ RETURN_NOT_OK(Visit(argument));
+ }
+
+ if (call->options) {
+ ARROW_ASSIGN_OR_RAISE(auto options_scalar,
+ internal::FunctionOptionsToStructScalar(*call->options));
+ ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*options_scalar));
+ metadata_->Append("options", std::move(value));
+ }
+
+ metadata_->Append("end", call->function_name);
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<RecordBatch>> operator()(const Expression& expr) {
+ RETURN_NOT_OK(Visit(expr));
+ FieldVector fields(columns_.size());
+ for (size_t i = 0; i < fields.size(); ++i) {
+ fields[i] = field("", columns_[i]->type());
+ }
+ return RecordBatch::Make(schema(std::move(fields), std::move(metadata_)), 1,
+ std::move(columns_));
+ }
+ } ToRecordBatch;
+
+ ARROW_ASSIGN_OR_RAISE(auto batch, ToRecordBatch(expr));
+ ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
+ ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema()));
+ RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
+ RETURN_NOT_OK(writer->Close());
+ return stream->Finish();
+}
+
+Result<Expression> Deserialize(std::shared_ptr<Buffer> buffer) {
+ io::BufferReader stream(std::move(buffer));
+ ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream));
+ ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
+ if (batch->schema()->metadata() == nullptr) {
+ return Status::Invalid("serialized Expression's batch repr had null metadata");
+ }
+ if (batch->num_rows() != 1) {
+ return Status::Invalid(
+ "serialized Expression's batch repr was not a single row - had ",
+ batch->num_rows());
+ }
+
+ struct FromRecordBatch {
+ const RecordBatch& batch_;
+ int index_;
+
+ const KeyValueMetadata& metadata() { return *batch_.schema()->metadata(); }
+
+ Result<std::shared_ptr<Scalar>> GetScalar(const std::string& i) {
+ int32_t column_index;
+ if (!::arrow::internal::ParseValue<Int32Type>(i.data(), i.length(),
+ &column_index)) {
+ return Status::Invalid("Couldn't parse column_index");
+ }
+ if (column_index >= batch_.num_columns()) {
+ return Status::Invalid("column_index out of bounds");
+ }
+ return batch_.column(column_index)->GetScalar(0);
+ }
+
+ Result<Expression> GetOne() {
+ if (index_ >= metadata().size()) {
+ return Status::Invalid("unterminated serialized Expression");
+ }
+
+ const std::string& key = metadata().key(index_);
+ const std::string& value = metadata().value(index_);
+ ++index_;
+
+ if (key == "literal") {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, GetScalar(value));
+ return literal(std::move(scalar));
+ }
+
+ if (key == "field_ref") {
+ return field_ref(value);
+ }
+
+ if (key != "call") {
+ return Status::Invalid("Unrecognized serialized Expression key ", key);
+ }
+
+ std::vector<Expression> arguments;
+ while (metadata().key(index_) != "end") {
+ if (metadata().key(index_) == "options") {
+ ARROW_ASSIGN_OR_RAISE(auto options_scalar, GetScalar(metadata().value(index_)));
+ std::shared_ptr<compute::FunctionOptions> options;
+ if (options_scalar) {
+ ARROW_ASSIGN_OR_RAISE(
+ options, internal::FunctionOptionsFromStructScalar(
+ checked_cast<const StructScalar&>(*options_scalar)));
+ }
+ auto expr = call(value, std::move(arguments), std::move(options));
+ index_ += 2;
+ return expr;
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto argument, GetOne());
+ arguments.push_back(std::move(argument));
+ }
+
+ ++index_;
+ return call(value, std::move(arguments));
+ }
+ };
+
+ return FromRecordBatch{*batch, 0}.GetOne();
+}
+
+Expression project(std::vector<Expression> values, std::vector<std::string> names) {
+ return call("make_struct", std::move(values),
+ compute::MakeStructOptions{std::move(names)});
+}
+
+Expression equal(Expression lhs, Expression rhs) {
+ return call("equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression not_equal(Expression lhs, Expression rhs) {
+ return call("not_equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression less(Expression lhs, Expression rhs) {
+ return call("less", {std::move(lhs), std::move(rhs)});
+}
+
+Expression less_equal(Expression lhs, Expression rhs) {
+ return call("less_equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression greater(Expression lhs, Expression rhs) {
+ return call("greater", {std::move(lhs), std::move(rhs)});
+}
+
+Expression greater_equal(Expression lhs, Expression rhs) {
+ return call("greater_equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); }
+
+Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); }
+
+Expression and_(Expression lhs, Expression rhs) {
+ return call("and_kleene", {std::move(lhs), std::move(rhs)});
+}
+
+Expression and_(const std::vector<Expression>& operands) {
+ auto folded = FoldLeft<Expression(Expression, Expression)>(operands.begin(),
+ operands.end(), and_);
+ if (folded) {
+ return std::move(*folded);
+ }
+ return literal(true);
+}
+
+Expression or_(Expression lhs, Expression rhs) {
+ return call("or_kleene", {std::move(lhs), std::move(rhs)});
+}
+
+Expression or_(const std::vector<Expression>& operands) {
+ auto folded =
+ FoldLeft<Expression(Expression, Expression)>(operands.begin(), operands.end(), or_);
+ if (folded) {
+ return std::move(*folded);
+ }
+ return literal(false);
+}
+
+Expression not_(Expression operand) { return call("invert", {std::move(operand)}); }
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h
new file mode 100644
index 00000000000..3810accf70a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h
@@ -0,0 +1,269 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/compute/type_fwd.h"
+#include "arrow/datum.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/variant.h"
+
+namespace arrow {
+namespace compute {
+
+/// An unbound expression which maps a single Datum to another Datum.
+/// An expression is one of
+/// - A literal Datum.
+/// - A reference to a single (potentially nested) field of the input Datum.
+/// - A call to a compute function, with arguments specified by other Expressions.
+class ARROW_EXPORT Expression {
+ public:
+ struct Call {
+ std::string function_name;
+ std::vector<Expression> arguments;
+ std::shared_ptr<FunctionOptions> options;
+ // Cached hash value
+ size_t hash;
+
+ // post-Bind properties:
+ std::shared_ptr<Function> function;
+ const Kernel* kernel = NULLPTR;
+ std::shared_ptr<KernelState> kernel_state;
+ ValueDescr descr;
+
+ void ComputeHash();
+ };
+
+ std::string ToString() const;
+ bool Equals(const Expression& other) const;
+ size_t hash() const;
+ struct Hash {
+ size_t operator()(const Expression& expr) const { return expr.hash(); }
+ };
+
+ /// Bind this expression to the given input type, looking up Kernels and field types.
+ /// Some expression simplification may be performed and implicit casts will be inserted.
+ /// Any state necessary for execution will be initialized and returned.
+ Result<Expression> Bind(const ValueDescr& in, ExecContext* = NULLPTR) const;
+ Result<Expression> Bind(const Schema& in_schema, ExecContext* = NULLPTR) const;
+
+ // XXX someday
+ // Clone all KernelState in this bound expression. If any function referenced by this
+ // expression has mutable KernelState, it is not safe to execute or apply simplification
+ // passes to it (or copies of it!) from multiple threads. Cloning state produces new
+ // KernelStates where necessary to ensure that Expressions may be manipulated safely
+ // on multiple threads.
+ // Result<ExpressionState> CloneState() const;
+ // Status SetState(ExpressionState);
+
+  /// Return true if all of this expression's field references have an explicit
+  /// ValueDescr and all of its functions' kernels have been looked up.
+ bool IsBound() const;
+
+ /// Return true if this expression is composed only of Scalar literals, field
+ /// references, and calls to ScalarFunctions.
+ bool IsScalarExpression() const;
+
+ /// Return true if this expression is literal and entirely null.
+ bool IsNullLiteral() const;
+
+ /// Return true if this expression could evaluate to true.
+ bool IsSatisfiable() const;
+
+ // XXX someday
+ // Result<PipelineGraph> GetPipelines();
+
+ /// Access a Call or return nullptr if this expression is not a call
+ const Call* call() const;
+ /// Access a Datum or return nullptr if this expression is not a literal
+ const Datum* literal() const;
+ /// Access a FieldRef or return nullptr if this expression is not a field_ref
+ const FieldRef* field_ref() const;
+
+ /// The type and shape to which this expression will evaluate
+ ValueDescr descr() const;
+ std::shared_ptr<DataType> type() const { return descr().type; }
+ // XXX someday
+ // NullGeneralization::type nullable() const;
+
+ struct Parameter {
+ FieldRef ref;
+
+ // post-bind properties
+ ValueDescr descr;
+ int index;
+ };
+ const Parameter* parameter() const;
+
+ Expression() = default;
+ explicit Expression(Call call);
+ explicit Expression(Datum literal);
+ explicit Expression(Parameter parameter);
+
+ private:
+ using Impl = util::Variant<Datum, Parameter, Call>;
+ std::shared_ptr<Impl> impl_;
+
+ ARROW_EXPORT friend bool Identical(const Expression& l, const Expression& r);
+
+ ARROW_EXPORT friend void PrintTo(const Expression&, std::ostream*);
+};
+
+inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); }
+inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); }
+
+// Factories
+
+ARROW_EXPORT
+Expression literal(Datum lit);
+
+template <typename Arg>
+Expression literal(Arg&& arg) {
+ return literal(Datum(std::forward<Arg>(arg)));
+}
+
+ARROW_EXPORT
+Expression field_ref(FieldRef ref);
+
+ARROW_EXPORT
+Expression call(std::string function, std::vector<Expression> arguments,
+ std::shared_ptr<FunctionOptions> options = NULLPTR);
+
+template <typename Options, typename = typename std::enable_if<
+ std::is_base_of<FunctionOptions, Options>::value>::type>
+Expression call(std::string function, std::vector<Expression> arguments,
+ Options options) {
+ return call(std::move(function), std::move(arguments),
+ std::make_shared<Options>(std::move(options)));
+}
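+
+// Example expression assembly (illustrative; assumes the standard registry):
+//
+//   Expression filter = and_(greater(field_ref("x"), literal(0)),
+//                            is_valid(field_ref("y")));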
+
+/// Assemble a list of all fields referenced by an Expression at any depth.
+ARROW_EXPORT
+std::vector<FieldRef> FieldsInExpression(const Expression&);
+
+/// Check if the expression references any fields.
+ARROW_EXPORT
+bool ExpressionHasFieldRefs(const Expression&);
+
+/// Assemble a mapping from field references to known values.
+struct ARROW_EXPORT KnownFieldValues;
+ARROW_EXPORT
+Result<KnownFieldValues> ExtractKnownFieldValues(
+ const Expression& guaranteed_true_predicate);
+
+/// \defgroup expression-passes Functions for modification of Expressions
+///
+/// @{
+///
+/// These transform bound expressions. Some transforms utilize a guarantee, which is
+/// provided as an Expression that is guaranteed to evaluate to true. The
+/// guaranteed_true_predicate need not be bound, but canonicalization is currently
+/// deferred to producers of guarantees. For example, in order to be recognized as a
+/// guarantee on a field value, an Expression must be a call to "equal" with field_ref LHS
+/// and literal RHS. Flipped arguments, "is_in" with a one-element value_set, ... or
+/// other semantically identical Expressions will not be recognized.
+
+/// Weak canonicalization which establishes guarantees for subsequent passes. Even
+/// semantically equivalent Expressions may canonicalize to different expressions.
+/// TODO this could be a strong canonicalization
+ARROW_EXPORT
+Result<Expression> Canonicalize(Expression, ExecContext* = NULLPTR);
+
+/// Simplify Expressions based on literal arguments (for example, add(null, x) will always
+/// be null so replace the call with a null literal). Includes early evaluation of all
+/// calls whose arguments are entirely literal.
+ARROW_EXPORT
+Result<Expression> FoldConstants(Expression);
+
+/// Simplify Expressions by replacing with known values of the fields which it references.
+ARROW_EXPORT
+Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
+ Expression);
+
+/// Simplify an expression by replacing subexpressions based on a guarantee:
+/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is
+/// used to remove redundant function calls from a filter expression or to replace a
+/// reference to a constant-value field with a literal.
+ARROW_EXPORT
+Result<Expression> SimplifyWithGuarantee(Expression,
+ const Expression& guaranteed_true_predicate);
+
+/// @}
+
+// Execution
+
+/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a
+/// RecordBatch which may have missing or incorrectly ordered columns.
+/// Missing fields will be replaced with null scalars.
+ARROW_EXPORT Result<ExecBatch> MakeExecBatch(const Schema& full_schema,
+ const Datum& partial);
+
+/// Execute a scalar expression against the provided state and input ExecBatch. This
+/// expression must be bound.
+ARROW_EXPORT
+Result<Datum> ExecuteScalarExpression(const Expression&, const ExecBatch& input,
+ ExecContext* = NULLPTR);
+
+/// Convenience function for invoking against a RecordBatch
+ARROW_EXPORT
+Result<Datum> ExecuteScalarExpression(const Expression&, const Schema& full_schema,
+ const Datum& partial_input, ExecContext* = NULLPTR);
+
+// Serialization
+
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> Serialize(const Expression&);
+
+ARROW_EXPORT
+Result<Expression> Deserialize(std::shared_ptr<Buffer>);
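+
+// Round-trip sketch (illustrative): serialization preserves expressions
+// composed of scalar literals, named field refs and calls:
+//
+//   ARROW_ASSIGN_OR_RAISE(auto buf, Serialize(expr));
+//   ARROW_ASSIGN_OR_RAISE(Expression roundtripped, Deserialize(buf));
+//   // roundtripped.Equals(expr)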
+
+// Convenience aliases for factories
+
+ARROW_EXPORT Expression project(std::vector<Expression> values,
+ std::vector<std::string> names);
+
+ARROW_EXPORT Expression equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression less(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression is_null(Expression lhs);
+
+ARROW_EXPORT Expression is_valid(Expression lhs);
+
+ARROW_EXPORT Expression and_(Expression lhs, Expression rhs);
+ARROW_EXPORT Expression and_(const std::vector<Expression>&);
+ARROW_EXPORT Expression or_(Expression lhs, Expression rhs);
+ARROW_EXPORT Expression or_(const std::vector<Expression>&);
+ARROW_EXPORT Expression not_(Expression operand);
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h
new file mode 100644
index 00000000000..dc38924d932
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h
@@ -0,0 +1,336 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/expression.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/cast.h"
+#include "arrow/compute/registry.h"
+#include "arrow/record_batch.h"
+#include "arrow/table.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace compute {
+
+struct KnownFieldValues {
+ std::unordered_map<FieldRef, Datum, FieldRef::Hash> map;
+};
+
+inline const Expression::Call* CallNotNull(const Expression& expr) {
+ auto call = expr.call();
+ DCHECK_NE(call, nullptr);
+ return call;
+}
+
+inline std::vector<ValueDescr> GetDescriptors(const std::vector<Expression>& exprs) {
+ std::vector<ValueDescr> descrs(exprs.size());
+ for (size_t i = 0; i < exprs.size(); ++i) {
+ DCHECK(exprs[i].IsBound());
+ descrs[i] = exprs[i].descr();
+ }
+ return descrs;
+}
+
+inline std::vector<ValueDescr> GetDescriptors(const std::vector<Datum>& values) {
+ std::vector<ValueDescr> descrs(values.size());
+ for (size_t i = 0; i < values.size(); ++i) {
+ descrs[i] = values[i].descr();
+ }
+ return descrs;
+}
+
+struct Comparison {
+ enum type {
+ NA = 0,
+ EQUAL = 1,
+ LESS = 2,
+ GREATER = 4,
+ NOT_EQUAL = LESS | GREATER,
+ LESS_EQUAL = LESS | EQUAL,
+ GREATER_EQUAL = GREATER | EQUAL,
+ };
+
+ static const type* Get(const std::string& function) {
+ static std::unordered_map<std::string, type> map{
+ {"equal", EQUAL}, {"not_equal", NOT_EQUAL},
+ {"less", LESS}, {"less_equal", LESS_EQUAL},
+ {"greater", GREATER}, {"greater_equal", GREATER_EQUAL},
+ };
+
+ auto it = map.find(function);
+ return it != map.end() ? &it->second : nullptr;
+ }
+
+ static const type* Get(const Expression& expr) {
+ if (auto call = expr.call()) {
+ return Comparison::Get(call->function_name);
+ }
+ return nullptr;
+ }
+
+ // Execute a simple Comparison between scalars
+ static Result<type> Execute(Datum l, Datum r) {
+ if (!l.is_scalar() || !r.is_scalar()) {
+ return Status::Invalid("Cannot Execute Comparison on non-scalars");
+ }
+
+ std::vector<Datum> arguments{std::move(l), std::move(r)};
+
+ ARROW_ASSIGN_OR_RAISE(auto equal, compute::CallFunction("equal", arguments));
+
+ if (!equal.scalar()->is_valid) return NA;
+ if (equal.scalar_as<BooleanScalar>().value) return EQUAL;
+
+ ARROW_ASSIGN_OR_RAISE(auto less, compute::CallFunction("less", arguments));
+
+ if (!less.scalar()->is_valid) return NA;
+ return less.scalar_as<BooleanScalar>().value ? LESS : GREATER;
+ }
+
+ // Given an Expression wrapped in casts which preserve ordering
+ // (for example, cast(field_ref("i16"), to_type=int32())), unwrap the inner Expression.
+ // This is used to destructure implicitly cast field_refs during Expression
+ // simplification.
+ static const Expression& StripOrderPreservingCasts(const Expression& expr) {
+ auto call = expr.call();
+ if (!call) return expr;
+ if (call->function_name != "cast") return expr;
+
+ const Expression& from = call->arguments[0];
+
+ auto from_id = from.type()->id();
+ auto to_id = expr.type()->id();
+
+ if (is_floating(to_id)) {
+ if (is_integer(from_id) || is_floating(from_id)) {
+ return StripOrderPreservingCasts(from);
+ }
+ return expr;
+ }
+
+ if (is_unsigned_integer(to_id)) {
+ if (is_unsigned_integer(from_id) && bit_width(to_id) >= bit_width(from_id)) {
+ return StripOrderPreservingCasts(from);
+ }
+ return expr;
+ }
+
+ if (is_signed_integer(to_id)) {
+ if (is_integer(from_id) && bit_width(to_id) >= bit_width(from_id)) {
+ return StripOrderPreservingCasts(from);
+ }
+ return expr;
+ }
+
+ return expr;
+ }
+
+ static type GetFlipped(type op) {
+ switch (op) {
+ case NA:
+ return NA;
+ case EQUAL:
+ return EQUAL;
+ case LESS:
+ return GREATER;
+ case GREATER:
+ return LESS;
+ case NOT_EQUAL:
+ return NOT_EQUAL;
+ case LESS_EQUAL:
+ return GREATER_EQUAL;
+ case GREATER_EQUAL:
+ return LESS_EQUAL;
+ }
+ DCHECK(false);
+ return NA;
+ }
+
+ static std::string GetName(type op) {
+ switch (op) {
+ case NA:
+ break;
+ case EQUAL:
+ return "equal";
+ case LESS:
+ return "less";
+ case GREATER:
+ return "greater";
+ case NOT_EQUAL:
+ return "not_equal";
+ case LESS_EQUAL:
+ return "less_equal";
+ case GREATER_EQUAL:
+ return "greater_equal";
+ }
+ return "na";
+ }
+
+ static std::string GetOp(type op) {
+ switch (op) {
+ case NA:
+ DCHECK(false) << "unreachable";
+ break;
+ case EQUAL:
+ return "==";
+ case LESS:
+ return "<";
+ case GREATER:
+ return ">";
+ case NOT_EQUAL:
+ return "!=";
+ case LESS_EQUAL:
+ return "<=";
+ case GREATER_EQUAL:
+ return ">=";
+ }
+ DCHECK(false);
+ return "";
+ }
+};
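+
+// Illustrative: because comparisons are bitmasks over {EQUAL, LESS, GREATER},
+// implication and disjointness reduce to bitwise tests, e.g.
+//   (LESS_EQUAL & LESS) == LESS   // "x < c" implies "x <= c"
+//   (LESS & GREATER) == 0         // "x < c" and "x > c" are disjoint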
+
+inline const compute::CastOptions* GetCastOptions(const Expression::Call& call) {
+ if (call.function_name != "cast") return nullptr;
+ return checked_cast<const compute::CastOptions*>(call.options.get());
+}
+
+inline bool IsSetLookup(const std::string& function) {
+ return function == "is_in" || function == "index_in";
+}
+
+inline const compute::MakeStructOptions* GetMakeStructOptions(
+ const Expression::Call& call) {
+ if (call.function_name != "make_struct") return nullptr;
+ return checked_cast<const compute::MakeStructOptions*>(call.options.get());
+}
+
+/// A helper for unboxing an Expression composed of associative function calls.
+/// Such expressions can frequently be rearranged to a semantically equivalent
+/// expression for more optimal execution or more straightforward manipulation.
+/// For example, (a + ((b + 3) + 4)) is equivalent to (((4 + 3) + a) + b) and the latter
+/// can be trivially constant-folded to ((7 + a) + b).
+struct FlattenedAssociativeChain {
+ /// True if a chain was already a left fold.
+ bool was_left_folded = true;
+
+  /// All "branch" expressions in a flattened chain. For example, given
+  /// (a + ((b + 3) + 4)), exprs would be [(a + ((b + 3) + 4)), ((b + 3) + 4), (b + 3)]
+ std::vector<Expression> exprs;
+
+  /// All "leaf" expressions in a flattened chain. For example, given
+  /// (a + ((b + 3) + 4)), the fringe would be [a, b, 3, 4]
+ std::vector<Expression> fringe;
+
+ explicit FlattenedAssociativeChain(Expression expr) : exprs{std::move(expr)} {
+ auto call = CallNotNull(exprs.back());
+ fringe = call->arguments;
+
+ auto it = fringe.begin();
+
+ while (it != fringe.end()) {
+ auto sub_call = it->call();
+ if (!sub_call || sub_call->function_name != call->function_name) {
+ ++it;
+ continue;
+ }
+
+ if (it != fringe.begin()) {
+ was_left_folded = false;
+ }
+
+ exprs.push_back(std::move(*it));
+ it = fringe.erase(it);
+
+ auto index = it - fringe.begin();
+ fringe.insert(it, sub_call->arguments.begin(), sub_call->arguments.end());
+ it = fringe.begin() + index;
+ // NB: no increment so we hit sub_call's first argument next iteration
+ }
+
+ DCHECK(std::all_of(exprs.begin(), exprs.end(), [](const Expression& expr) {
+ return CallNotNull(expr)->options == nullptr;
+ }));
+ }
+};
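+
+// Illustrative: flattening add(a, add(add(b, 3), 4)) yields
+// fringe == [a, b, 3, 4] with was_left_folded == false, since the nested
+// calls hang off the right-hand argument.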
+
+inline Result<std::shared_ptr<compute::Function>> GetFunction(
+ const Expression::Call& call, compute::ExecContext* exec_context) {
+ if (call.function_name != "cast") {
+ return exec_context->func_registry()->GetFunction(call.function_name);
+ }
+ // XXX this special case is strange; why not make "cast" a ScalarFunction?
+ const auto& to_type = checked_cast<const compute::CastOptions&>(*call.options).to_type;
+ return compute::GetCastFunction(to_type);
+}
+
+/// Modify an Expression with pre-order and post-order visitation.
+/// `pre` is invoked on each Expression, visiting Calls before their arguments;
+/// `post_call` visits Calls (and no other Expressions) after their arguments.
+/// Visitors should return the identical Expression to indicate no change; this
+/// prevents unnecessary reconstruction in the common case where no modification
+/// is possible or necessary.
+///
+/// If an argument was modified, `post_call` visits a reconstructed Call with the modified
+/// arguments but also receives a pointer to the unmodified Expression as a second
+/// argument. If no arguments were modified the unmodified Expression* will be nullptr.
+template <typename PreVisit, typename PostVisitCall>
+Result<Expression> Modify(Expression expr, const PreVisit& pre,
+ const PostVisitCall& post_call) {
+ ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr))));
+
+ auto call = expr.call();
+ if (!call) return expr;
+
+ bool at_least_one_modified = false;
+ std::vector<Expression> modified_arguments;
+
+ for (size_t i = 0; i < call->arguments.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto modified_argument,
+ Modify(call->arguments[i], pre, post_call));
+
+ if (Identical(modified_argument, call->arguments[i])) {
+ continue;
+ }
+
+ if (!at_least_one_modified) {
+ modified_arguments = call->arguments;
+ at_least_one_modified = true;
+ }
+
+ modified_arguments[i] = std::move(modified_argument);
+ }
+
+ if (at_least_one_modified) {
+ // reconstruct the call expression with the modified arguments
+ auto modified_call = *call;
+ modified_call.arguments = std::move(modified_arguments);
+ return post_call(Expression(std::move(modified_call)), &expr);
+ }
+
+ return post_call(std::move(expr), nullptr);
+}
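+
+// Usage sketch (illustrative): a post-order pass that inspects every call to
+// "add" while leaving other expressions untouched:
+//
+//   Modify(
+//       std::move(expr), [](Expression e) { return e; },
+//       [](Expression e, ...) -> Result<Expression> {
+//         if (auto call = e.call()) {
+//           if (call->function_name == "add") {
+//             // ... rewrite or analyze e here ...
+//           }
+//         }
+//         return e;
+//       });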
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc
new file mode 100644
index 00000000000..7a5b0be9990
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc
@@ -0,0 +1,268 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_compare.h"
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace compute {
+
+void KeyCompare::CompareRows(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows,
+ uint16_t* out_sel_left_maybe_same,
+ const KeyEncoder::KeyRowArray& rows_left,
+ const KeyEncoder::KeyRowArray& rows_right) {
+ ARROW_DCHECK(rows_left.metadata().is_compatible(rows_right.metadata()));
+
+ if (num_rows_to_compare == 0) {
+ *out_num_rows = 0;
+ return;
+ }
+
+ // Allocate temporary byte and bit vectors
+ auto bytevector_holder =
+ util::TempVectorHolder<uint8_t>(ctx->stack, num_rows_to_compare);
+ auto bitvector_holder =
+ util::TempVectorHolder<uint8_t>(ctx->stack, num_rows_to_compare);
+
+ uint8_t* match_bytevector = bytevector_holder.mutable_data();
+ uint8_t* match_bitvector = bitvector_holder.mutable_data();
+
+  // All comparison functions called here update the match byte vector
+  // (ANDing it with their comparison result) instead of overwriting it.
+ memset(match_bytevector, 0xff, num_rows_to_compare);
+
+ if (rows_left.metadata().is_fixed_length) {
+ CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
+ match_bytevector, ctx, rows_left.metadata().fixed_length,
+ rows_left.data(1), rows_right.data(1));
+ } else {
+ CompareVaryingLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
+ match_bytevector, ctx, rows_left.data(2), rows_right.data(2),
+ rows_left.offsets(), rows_right.offsets());
+ }
+
+ // CompareFixedLength can be used to compare nulls as well
+ bool nulls_present = rows_left.has_any_nulls(ctx) || rows_right.has_any_nulls(ctx);
+ if (nulls_present) {
+ CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
+ match_bytevector, ctx,
+ rows_left.metadata().null_masks_bytes_per_row,
+ rows_left.null_masks(), rows_right.null_masks());
+ }
+
+ util::BitUtil::bytes_to_bits(ctx->hardware_flags, num_rows_to_compare, match_bytevector,
+ match_bitvector);
+ if (sel_left_maybe_null) {
+ int out_num_rows_int;
+ util::BitUtil::bits_filter_indexes(0, ctx->hardware_flags, num_rows_to_compare,
+ match_bitvector, sel_left_maybe_null,
+ &out_num_rows_int, out_sel_left_maybe_same);
+ *out_num_rows = out_num_rows_int;
+ } else {
+ int out_num_rows_int;
+ util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows_to_compare,
+ match_bitvector, &out_num_rows_int,
+ out_sel_left_maybe_same);
+ *out_num_rows = out_num_rows_int;
+ }
+}
+
+void KeyCompare::CompareFixedLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ uint32_t fixed_length, const uint8_t* rows_left,
+ const uint8_t* rows_right) {
+ bool use_selection = (sel_left_maybe_null != nullptr);
+
+ uint32_t num_rows_already_processed = 0;
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && !use_selection) {
+ // Choose between up-to-8B length, up-to-16B length and any size versions
+ if (fixed_length <= 8) {
+ num_rows_already_processed = CompareFixedLength_UpTo8B_avx2(
+ num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length,
+ rows_left, rows_right);
+ } else if (fixed_length <= 16) {
+ num_rows_already_processed = CompareFixedLength_UpTo16B_avx2(
+ num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length,
+ rows_left, rows_right);
+ } else {
+ num_rows_already_processed =
+ CompareFixedLength_avx2(num_rows_to_compare, left_to_right_map,
+ match_bytevector, fixed_length, rows_left, rows_right);
+ }
+ }
+#endif
+
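+  // Scalar fallback for the rows not handled by the SIMD path above. The table
+  // below is indexed as use_selection * 3 + size_class, where size_class is 0
+  // for keys of at most 8 bytes, 1 for at most 16 bytes and 2 otherwise.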
+ typedef void (*CompareFixedLengthImp_t)(uint32_t, uint32_t, const uint16_t*,
+ const uint32_t*, uint8_t*, uint32_t,
+ const uint8_t*, const uint8_t*);
+ static const CompareFixedLengthImp_t CompareFixedLengthImp_fn[] = {
+ CompareFixedLengthImp<false, 1>, CompareFixedLengthImp<false, 2>,
+ CompareFixedLengthImp<false, 0>, CompareFixedLengthImp<true, 1>,
+ CompareFixedLengthImp<true, 2>, CompareFixedLengthImp<true, 0>};
+ int dispatch_const = (use_selection ? 3 : 0) +
+ ((fixed_length <= 8) ? 0 : ((fixed_length <= 16) ? 1 : 2));
+ CompareFixedLengthImp_fn[dispatch_const](
+ num_rows_already_processed, num_rows_to_compare, sel_left_maybe_null,
+ left_to_right_map, match_bytevector, fixed_length, rows_left, rows_right);
+}
+
+template <bool use_selection, int num_64bit_words>
+void KeyCompare::CompareFixedLengthImp(uint32_t num_rows_already_processed,
+ uint32_t num_rows,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, uint32_t length,
+ const uint8_t* rows_left,
+ const uint8_t* rows_right) {
+ // Key length (for encoded key) has to be non-zero
+ ARROW_DCHECK(length > 0);
+
+ // Non-zero length guarantees no underflow
+ int32_t num_loops_less_one = (static_cast<int32_t>(length) + 7) / 8 - 1;
+
+ // Length remaining in last loop can only be zero for input length equal to zero
+ uint32_t length_remaining_last_loop = length - num_loops_less_one * 8;
+ uint64_t tail_mask = (~0ULL) >> (8 * (8 - length_remaining_last_loop));
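+  // For example, length == 11 gives one full 8-byte stripe plus a 3-byte tail,
+  // so length_remaining_last_loop == 3 and tail_mask == 0x0000000000ffffff.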
+
+ for (uint32_t id_input = num_rows_already_processed; id_input < num_rows; ++id_input) {
+ uint32_t irow_left = use_selection ? sel_left_maybe_null[id_input] : id_input;
+ uint32_t irow_right = left_to_right_map[irow_left];
+ uint32_t begin_left = length * irow_left;
+ uint32_t begin_right = length * irow_right;
+ const uint64_t* key_left_ptr =
+ reinterpret_cast<const uint64_t*>(rows_left + begin_left);
+ const uint64_t* key_right_ptr =
+ reinterpret_cast<const uint64_t*>(rows_right + begin_right);
+ uint64_t result_or = 0ULL;
+ int32_t istripe = 0;
+
+    // Specializations for keys of up to 8 bytes and of between 9 and 16 bytes
+    // avoid the internal loop over 64-bit words of the value for short keys.
+ //
+ // Template argument 0 means arbitrarily many 64-bit words,
+ // 1 means up to 1 and 2 means up to 2.
+ //
+ if (num_64bit_words == 0) {
+ for (; istripe < num_loops_less_one; ++istripe) {
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (key_left ^ key_right);
+ }
+ } else if (num_64bit_words == 2) {
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (key_left ^ key_right);
+ ++istripe;
+ }
+
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (tail_mask & (key_left ^ key_right));
+
+ int result = (result_or == 0 ? 0xff : 0);
+ match_bytevector[id_input] &= result;
+ }
+}
+
+void KeyCompare::CompareVaryingLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ const uint8_t* rows_left, const uint8_t* rows_right,
+ const uint32_t* offsets_left,
+ const uint32_t* offsets_right) {
+ bool use_selection = (sel_left_maybe_null != nullptr);
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && !use_selection) {
+ CompareVaryingLength_avx2(num_rows_to_compare, left_to_right_map, match_bytevector,
+ rows_left, rows_right, offsets_left, offsets_right);
+ } else {
+#endif
+ if (use_selection) {
+ CompareVaryingLengthImp<true>(num_rows_to_compare, sel_left_maybe_null,
+ left_to_right_map, match_bytevector, rows_left,
+ rows_right, offsets_left, offsets_right);
+ } else {
+ CompareVaryingLengthImp<false>(num_rows_to_compare, sel_left_maybe_null,
+ left_to_right_map, match_bytevector, rows_left,
+ rows_right, offsets_left, offsets_right);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+template <bool use_selection>
+void KeyCompare::CompareVaryingLengthImp(
+ uint32_t num_rows, const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left,
+ const uint32_t* offsets_right) {
+ static const uint64_t tail_masks[] = {
+ 0x0000000000000000ULL, 0x00000000000000ffULL, 0x000000000000ffffULL,
+ 0x0000000000ffffffULL, 0x00000000ffffffffULL, 0x000000ffffffffffULL,
+ 0x0000ffffffffffffULL, 0x00ffffffffffffffULL, 0xffffffffffffffffULL};
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i;
+ uint32_t irow_right = left_to_right_map[irow_left];
+ uint32_t begin_left = offsets_left[irow_left];
+ uint32_t begin_right = offsets_right[irow_right];
+ uint32_t length_left = offsets_left[irow_left + 1] - begin_left;
+ uint32_t length_right = offsets_right[irow_right + 1] - begin_right;
+ uint32_t length = std::min(length_left, length_right);
+ const uint64_t* key_left_ptr =
+ reinterpret_cast<const uint64_t*>(rows_left + begin_left);
+ const uint64_t* key_right_ptr =
+ reinterpret_cast<const uint64_t*>(rows_right + begin_right);
+ uint64_t result_or = 0;
+ int32_t istripe;
+ // length can be zero
+ for (istripe = 0; istripe < (static_cast<int32_t>(length) + 7) / 8 - 1; ++istripe) {
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (key_left ^ key_right);
+ }
+
+ uint32_t length_remaining = length - static_cast<uint32_t>(istripe) * 8;
+ uint64_t tail_mask = tail_masks[length_remaining];
+
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (tail_mask & (key_left ^ key_right));
+
+ int result = (result_or == 0 ? 0xff : 0);
+ match_bytevector[i] &= result;
+ }
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h
new file mode 100644
index 00000000000..1dffabb884b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h
@@ -0,0 +1,101 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/compute/exec/key_encode.h"
+#include "arrow/compute/exec/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace compute {
+
+class KeyCompare {
+ public:
+  // Returns a single 16-bit selection vector of rows that failed comparison.
+  // If there is an input selection on the left, the resulting selection is a
+  // filtered image of the input selection.
+ static void CompareRows(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows,
+ uint16_t* out_sel_left_maybe_same,
+ const KeyEncoder::KeyRowArray& rows_left,
+ const KeyEncoder::KeyRowArray& rows_right);
+
+ private:
+ static void CompareFixedLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ uint32_t fixed_length, const uint8_t* rows_left,
+ const uint8_t* rows_right);
+ static void CompareVaryingLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ const uint8_t* rows_left, const uint8_t* rows_right,
+ const uint32_t* offsets_left,
+ const uint32_t* offsets_right);
+
+ // Second template argument is 0, 1 or 2.
+ // 0 means arbitrarily many 64-bit words, 1 means up to 1 and 2 means up to 2.
+ template <bool use_selection, int num_64bit_words>
+ static void CompareFixedLengthImp(uint32_t num_rows_already_processed,
+ uint32_t num_rows,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, uint32_t length,
+ const uint8_t* rows_left, const uint8_t* rows_right);
+ template <bool use_selection>
+ static void CompareVaryingLengthImp(uint32_t num_rows,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, const uint8_t* rows_left,
+ const uint8_t* rows_right,
+ const uint32_t* offsets_left,
+ const uint32_t* offsets_right);
+
+#if defined(ARROW_HAVE_AVX2)
+
+ static uint32_t CompareFixedLength_UpTo8B_avx2(
+ uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right);
+ static uint32_t CompareFixedLength_UpTo16B_avx2(
+ uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right);
+ static uint32_t CompareFixedLength_avx2(uint32_t num_rows,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, uint32_t length,
+ const uint8_t* rows_left,
+ const uint8_t* rows_right);
+ static void CompareVaryingLength_avx2(
+ uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left,
+ const uint32_t* offsets_right);
+
+#endif
+};
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc
new file mode 100644
index 00000000000..de79558f2c2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc
@@ -0,0 +1,1649 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_encode.h"
+
+#include <memory.h>
+
+#include <algorithm>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace compute {
+
+KeyEncoder::KeyRowArray::KeyRowArray()
+ : pool_(nullptr), rows_capacity_(0), bytes_capacity_(0) {}
+
+Status KeyEncoder::KeyRowArray::Init(MemoryPool* pool, const KeyRowMetadata& metadata) {
+ pool_ = pool;
+ metadata_ = metadata;
+
+ DCHECK(!null_masks_ && !offsets_ && !rows_);
+
+ constexpr int64_t rows_capacity = 8;
+ constexpr int64_t bytes_capacity = 1024;
+
+ // Null masks
+ ARROW_ASSIGN_OR_RAISE(auto null_masks,
+ AllocateResizableBuffer(size_null_masks(rows_capacity), pool_));
+ null_masks_ = std::move(null_masks);
+ memset(null_masks_->mutable_data(), 0, size_null_masks(rows_capacity));
+
+ // Offsets and rows
+ if (!metadata.is_fixed_length) {
+ ARROW_ASSIGN_OR_RAISE(auto offsets,
+ AllocateResizableBuffer(size_offsets(rows_capacity), pool_));
+ offsets_ = std::move(offsets);
+ memset(offsets_->mutable_data(), 0, size_offsets(rows_capacity));
+ reinterpret_cast<uint32_t*>(offsets_->mutable_data())[0] = 0;
+
+ ARROW_ASSIGN_OR_RAISE(
+ auto rows,
+ AllocateResizableBuffer(size_rows_varying_length(bytes_capacity), pool_));
+ rows_ = std::move(rows);
+ memset(rows_->mutable_data(), 0, size_rows_varying_length(bytes_capacity));
+ bytes_capacity_ = size_rows_varying_length(bytes_capacity) - padding_for_vectors;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ auto rows, AllocateResizableBuffer(size_rows_fixed_length(rows_capacity), pool_));
+ rows_ = std::move(rows);
+ memset(rows_->mutable_data(), 0, size_rows_fixed_length(rows_capacity));
+ bytes_capacity_ = size_rows_fixed_length(rows_capacity) - padding_for_vectors;
+ }
+
+ update_buffer_pointers();
+
+ rows_capacity_ = rows_capacity;
+
+ num_rows_ = 0;
+ num_rows_for_has_any_nulls_ = 0;
+ has_any_nulls_ = false;
+
+ return Status::OK();
+}
+
+void KeyEncoder::KeyRowArray::Clean() {
+ num_rows_ = 0;
+ num_rows_for_has_any_nulls_ = 0;
+ has_any_nulls_ = false;
+
+ if (!metadata_.is_fixed_length) {
+ reinterpret_cast<uint32_t*>(offsets_->mutable_data())[0] = 0;
+ }
+}
+
+int64_t KeyEncoder::KeyRowArray::size_null_masks(int64_t num_rows) {
+ return num_rows * metadata_.null_masks_bytes_per_row + padding_for_vectors;
+}
+
+int64_t KeyEncoder::KeyRowArray::size_offsets(int64_t num_rows) {
+ return (num_rows + 1) * sizeof(uint32_t) + padding_for_vectors;
+}
+
+int64_t KeyEncoder::KeyRowArray::size_rows_fixed_length(int64_t num_rows) {
+ return num_rows * metadata_.fixed_length + padding_for_vectors;
+}
+
+int64_t KeyEncoder::KeyRowArray::size_rows_varying_length(int64_t num_bytes) {
+ return num_bytes + padding_for_vectors;
+}
+
+void KeyEncoder::KeyRowArray::update_buffer_pointers() {
+ buffers_[0] = mutable_buffers_[0] = null_masks_->mutable_data();
+ if (metadata_.is_fixed_length) {
+ buffers_[1] = mutable_buffers_[1] = rows_->mutable_data();
+ buffers_[2] = mutable_buffers_[2] = nullptr;
+ } else {
+ buffers_[1] = mutable_buffers_[1] = offsets_->mutable_data();
+ buffers_[2] = mutable_buffers_[2] = rows_->mutable_data();
+ }
+}
+
+Status KeyEncoder::KeyRowArray::ResizeFixedLengthBuffers(int64_t num_extra_rows) {
+ if (rows_capacity_ >= num_rows_ + num_extra_rows) {
+ return Status::OK();
+ }
+
+ int64_t rows_capacity_new = std::max(static_cast<int64_t>(1), 2 * rows_capacity_);
+ while (rows_capacity_new < num_rows_ + num_extra_rows) {
+ rows_capacity_new *= 2;
+ }
+
+ // Null masks
+ RETURN_NOT_OK(null_masks_->Resize(size_null_masks(rows_capacity_new), false));
+ memset(null_masks_->mutable_data() + size_null_masks(rows_capacity_), 0,
+ size_null_masks(rows_capacity_new) - size_null_masks(rows_capacity_));
+
+ // Either offsets or rows
+ if (!metadata_.is_fixed_length) {
+ RETURN_NOT_OK(offsets_->Resize(size_offsets(rows_capacity_new), false));
+ memset(offsets_->mutable_data() + size_offsets(rows_capacity_), 0,
+ size_offsets(rows_capacity_new) - size_offsets(rows_capacity_));
+ } else {
+ RETURN_NOT_OK(rows_->Resize(size_rows_fixed_length(rows_capacity_new), false));
+ memset(rows_->mutable_data() + size_rows_fixed_length(rows_capacity_), 0,
+ size_rows_fixed_length(rows_capacity_new) -
+ size_rows_fixed_length(rows_capacity_));
+ bytes_capacity_ = size_rows_fixed_length(rows_capacity_new) - padding_for_vectors;
+ }
+
+ update_buffer_pointers();
+
+ rows_capacity_ = rows_capacity_new;
+
+ return Status::OK();
+}
+
+Status KeyEncoder::KeyRowArray::ResizeOptionalVaryingLengthBuffer(
+ int64_t num_extra_bytes) {
+ int64_t num_bytes = offsets()[num_rows_];
+ if (bytes_capacity_ >= num_bytes + num_extra_bytes || metadata_.is_fixed_length) {
+ return Status::OK();
+ }
+
+ int64_t bytes_capacity_new = std::max(static_cast<int64_t>(1), 2 * bytes_capacity_);
+ while (bytes_capacity_new < num_bytes + num_extra_bytes) {
+ bytes_capacity_new *= 2;
+ }
+
+ RETURN_NOT_OK(rows_->Resize(size_rows_varying_length(bytes_capacity_new), false));
+ memset(rows_->mutable_data() + size_rows_varying_length(bytes_capacity_), 0,
+ size_rows_varying_length(bytes_capacity_new) -
+ size_rows_varying_length(bytes_capacity_));
+
+ update_buffer_pointers();
+
+ bytes_capacity_ = bytes_capacity_new;
+
+ return Status::OK();
+}
+
+Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from,
+ uint32_t num_rows_to_append,
+ const uint16_t* source_row_ids) {
+ DCHECK(metadata_.is_compatible(from.metadata()));
+
+ RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append));
+
+ if (!metadata_.is_fixed_length) {
+ // Varying-length rows
+ auto from_offsets = reinterpret_cast<const uint32_t*>(from.offsets_->data());
+ auto to_offsets = reinterpret_cast<uint32_t*>(offsets_->mutable_data());
+ uint32_t total_length = to_offsets[num_rows_];
+ uint32_t total_length_to_append = 0;
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint16_t row_id = source_row_ids[i];
+ uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
+ total_length_to_append += length;
+ to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append;
+ }
+
+ RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append));
+
+ const uint8_t* src = from.rows_->data();
+ uint8_t* dst = rows_->mutable_data() + total_length;
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint16_t row_id = source_row_ids[i];
+ uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
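+ // Copy in 8-byte stripes, rounding the stripe count up. Writing up to
+ // 7 bytes past the end of a row is safe here: rows are appended in
+ // order, and the buffer is over-allocated by padding_for_vectors bytes.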
+ auto src64 = reinterpret_cast<const uint64_t*>(src + from_offsets[row_id]);
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ for (uint32_t j = 0; j < (length + 7) / 8; ++j) {
+ dst64[j] = src64[j];
+ }
+ dst += length;
+ }
+ } else {
+ // Fixed-length rows
+ const uint8_t* src = from.rows_->data();
+ uint8_t* dst = rows_->mutable_data() + num_rows_ * metadata_.fixed_length;
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint16_t row_id = source_row_ids[i];
+ uint32_t length = metadata_.fixed_length;
+ auto src64 = reinterpret_cast<const uint64_t*>(src + length * row_id);
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ for (uint32_t j = 0; j < (length + 7) / 8; ++j) {
+ dst64[j] = src64[j];
+ }
+ dst += length;
+ }
+ }
+
+ // Null masks
+ uint32_t byte_length = metadata_.null_masks_bytes_per_row;
+ uint64_t dst_byte_offset = num_rows_ * byte_length;
+ const uint8_t* src_base = from.null_masks_->data();
+ uint8_t* dst_base = null_masks_->mutable_data();
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint32_t row_id = source_row_ids[i];
+ int64_t src_byte_offset = row_id * byte_length;
+ const uint8_t* src = src_base + src_byte_offset;
+ uint8_t* dst = dst_base + dst_byte_offset;
+ for (uint32_t ibyte = 0; ibyte < byte_length; ++ibyte) {
+ dst[ibyte] = src[ibyte];
+ }
+ dst_byte_offset += byte_length;
+ }
+
+ num_rows_ += num_rows_to_append;
+
+ return Status::OK();
+}
+
+Status KeyEncoder::KeyRowArray::AppendEmpty(uint32_t num_rows_to_append,
+ uint32_t num_extra_bytes_to_append) {
+ RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append));
+ RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append));
+ num_rows_ += num_rows_to_append;
+ if (metadata_.row_alignment > 1 || metadata_.string_alignment > 1) {
+ memset(rows_->mutable_data(), 0, bytes_capacity_);
+ }
+ return Status::OK();
+}
+
+bool KeyEncoder::KeyRowArray::has_any_nulls(const KeyEncoderContext* ctx) const {
+ if (has_any_nulls_) {
+ return true;
+ }
+ if (num_rows_for_has_any_nulls_ < num_rows_) {
+ auto size_per_row = metadata().null_masks_bytes_per_row;
+ has_any_nulls_ = !util::BitUtil::are_all_bytes_zero(
+ ctx->hardware_flags, null_masks() + size_per_row * num_rows_for_has_any_nulls_,
+ static_cast<uint32_t>(size_per_row * (num_rows_ - num_rows_for_has_any_nulls_)));
+ num_rows_for_has_any_nulls_ = num_rows_;
+ }
+ return has_any_nulls_;
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
+ const KeyColumnArray& left,
+ const KeyColumnArray& right,
+ int buffer_id_to_replace) {
+ metadata_ = metadata;
+ length_ = left.length();
+ for (int i = 0; i < max_buffers_; ++i) {
+ buffers_[i] = left.buffers_[i];
+ mutable_buffers_[i] = left.mutable_buffers_[i];
+ }
+ buffers_[buffer_id_to_replace] = right.buffers_[buffer_id_to_replace];
+ mutable_buffers_[buffer_id_to_replace] = right.mutable_buffers_[buffer_id_to_replace];
+ bit_offset_[0] = left.bit_offset_[0];
+ bit_offset_[1] = left.bit_offset_[1];
+ if (buffer_id_to_replace < max_buffers_ - 1) {
+ bit_offset_[buffer_id_to_replace] = right.bit_offset_[buffer_id_to_replace];
+ }
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
+ int64_t length, const uint8_t* buffer0,
+ const uint8_t* buffer1, const uint8_t* buffer2,
+ int bit_offset0, int bit_offset1) {
+ metadata_ = metadata;
+ length_ = length;
+ buffers_[0] = buffer0;
+ buffers_[1] = buffer1;
+ buffers_[2] = buffer2;
+ mutable_buffers_[0] = mutable_buffers_[1] = mutable_buffers_[2] = nullptr;
+ bit_offset_[0] = bit_offset0;
+ bit_offset_[1] = bit_offset1;
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
+ int64_t length, uint8_t* buffer0,
+ uint8_t* buffer1, uint8_t* buffer2,
+ int bit_offset0, int bit_offset1) {
+ metadata_ = metadata;
+ length_ = length;
+ buffers_[0] = mutable_buffers_[0] = buffer0;
+ buffers_[1] = mutable_buffers_[1] = buffer1;
+ buffers_[2] = mutable_buffers_[2] = buffer2;
+ bit_offset_[0] = bit_offset0;
+ bit_offset_[1] = bit_offset1;
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnArray& from, int64_t start,
+ int64_t length) {
+ metadata_ = from.metadata_;
+ length_ = length;
+ uint32_t fixed_size =
+ !metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length;
+
+ buffers_[0] =
+ from.buffers_[0] ? from.buffers_[0] + (from.bit_offset_[0] + start) / 8 : nullptr;
+ mutable_buffers_[0] = from.mutable_buffers_[0]
+ ? from.mutable_buffers_[0] + (from.bit_offset_[0] + start) / 8
+ : nullptr;
+ bit_offset_[0] = (from.bit_offset_[0] + start) % 8;
+
+ if (fixed_size == 0) {
+ buffers_[1] =
+ from.buffers_[1] ? from.buffers_[1] + (from.bit_offset_[1] + start) / 8 : nullptr;
+ mutable_buffers_[1] = from.mutable_buffers_[1] ? from.mutable_buffers_[1] +
+ (from.bit_offset_[1] + start) / 8
+ : nullptr;
+ bit_offset_[1] = (from.bit_offset_[1] + start) % 8;
+ } else {
+ buffers_[1] = from.buffers_[1] ? from.buffers_[1] + start * fixed_size : nullptr;
+ mutable_buffers_[1] = from.mutable_buffers_[1]
+ ? from.mutable_buffers_[1] + start * fixed_size
+ : nullptr;
+ bit_offset_[1] = 0;
+ }
+
+ buffers_[2] = from.buffers_[2];
+ mutable_buffers_[2] = from.mutable_buffers_[2];
+}
+
+KeyEncoder::KeyColumnArray KeyEncoder::TransformBoolean::ArrayReplace(
+ const KeyColumnArray& column, const KeyColumnArray& temp) {
+ // Make sure that the temp buffer is large enough
+ DCHECK(temp.length() >= column.length() && temp.metadata().is_fixed_length &&
+ temp.metadata().fixed_length >= sizeof(uint8_t));
+ KeyColumnMetadata metadata;
+ metadata.is_fixed_length = true;
+ metadata.fixed_length = sizeof(uint8_t);
+ constexpr int buffer_index = 1;
+ KeyColumnArray result = KeyColumnArray(metadata, column, temp, buffer_index);
+ return result;
+}
+
+void KeyEncoder::TransformBoolean::PreEncode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ // Make sure that metadata and lengths are compatible.
+ DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length);
+ DCHECK(output->metadata().fixed_length == 1 && input.metadata().fixed_length == 0);
+ DCHECK(output->length() == input.length());
+ constexpr int buffer_index = 1;
+ DCHECK(input.data(buffer_index) != nullptr);
+ DCHECK(output->mutable_data(buffer_index) != nullptr);
+ util::BitUtil::bits_to_bytes(
+ ctx->hardware_flags, static_cast<int>(input.length()), input.data(buffer_index),
+ output->mutable_data(buffer_index), input.bit_offset(buffer_index));
+}
+
+void KeyEncoder::TransformBoolean::PostDecode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ // Make sure that metadata and lengths are compatible.
+ DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length);
+ DCHECK(output->metadata().fixed_length == 0 && input.metadata().fixed_length == 1);
+ DCHECK(output->length() == input.length());
+ constexpr int buffer_index = 1;
+ DCHECK(input.data(buffer_index) != nullptr);
+ DCHECK(output->mutable_data(buffer_index) != nullptr);
+
+ util::BitUtil::bytes_to_bits(
+ ctx->hardware_flags, static_cast<int>(input.length()), input.data(buffer_index),
+ output->mutable_data(buffer_index), output->bit_offset(buffer_index));
+}
+
+bool KeyEncoder::EncoderInteger::IsBoolean(const KeyColumnMetadata& metadata) {
+ return metadata.is_fixed_length && metadata.fixed_length == 0;
+}
+
+bool KeyEncoder::EncoderInteger::UsesTransform(const KeyColumnArray& column) {
+ return IsBoolean(column.metadata());
+}
+
+KeyEncoder::KeyColumnArray KeyEncoder::EncoderInteger::ArrayReplace(
+ const KeyColumnArray& column, const KeyColumnArray& temp) {
+ if (IsBoolean(column.metadata())) {
+ return TransformBoolean::ArrayReplace(column, temp);
+ }
+ return column;
+}
+
+void KeyEncoder::EncoderInteger::PreEncode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ if (IsBoolean(input.metadata())) {
+ TransformBoolean::PreEncode(input, output, ctx);
+ }
+}
+
+void KeyEncoder::EncoderInteger::PostDecode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ if (IsBoolean(output->metadata())) {
+ TransformBoolean::PostDecode(input, output, ctx);
+ }
+}
+
+void KeyEncoder::EncoderInteger::Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp) {
+ KeyColumnArray col_prep;
+ if (UsesTransform(col)) {
+ col_prep = ArrayReplace(col, *temp);
+ PreEncode(col, &col_prep, ctx);
+ } else {
+ col_prep = col;
+ }
+
+ const auto num_rows = static_cast<uint32_t>(col.length());
+
+ // With a single fixed-length column we can simply memcpy the whole buffer
+ if (rows->metadata().is_fixed_length &&
+ rows->metadata().fixed_length == col.metadata().fixed_length) {
+ DCHECK_EQ(offset_within_row, 0);
+ uint32_t row_size = col.metadata().fixed_length;
+ memcpy(rows->mutable_data(1), col.data(1), num_rows * row_size);
+ } else if (rows->metadata().is_fixed_length) {
+ uint32_t row_size = rows->metadata().fixed_length;
+ uint8_t* row_base = rows->mutable_data(1) + offset_within_row;
+ const uint8_t* col_base = col_prep.data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ row_base[i * row_size] = col_base[i];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint16_t*>(row_base + i * row_size) =
+ reinterpret_cast<const uint16_t*>(col_base)[i];
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint32_t*>(row_base + i * row_size) =
+ reinterpret_cast<const uint32_t*>(col_base)[i];
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint64_t*>(row_base + i * row_size) =
+ reinterpret_cast<const uint64_t*>(col_base)[i];
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ } else {
+ const uint32_t* row_offsets = rows->offsets();
+ uint8_t* row_base = rows->mutable_data(2) + offset_within_row;
+ const uint8_t* col_base = col_prep.data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ row_base[row_offsets[i]] = col_base[i];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint16_t*>(row_base + row_offsets[i]) =
+ reinterpret_cast<const uint16_t*>(col_base)[i];
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint32_t*>(row_base + row_offsets[i]) =
+ reinterpret_cast<const uint32_t*>(col_base)[i];
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint64_t*>(row_base + row_offsets[i]) =
+ reinterpret_cast<const uint64_t*>(col_base)[i];
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ }
+}
+
+void KeyEncoder::EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp) {
+ KeyColumnArray col_prep;
+ if (UsesTransform(*col)) {
+ col_prep = ArrayReplace(*col, *temp);
+ } else {
+ col_prep = *col;
+ }
+
+ // With a single fixed-length column we can simply memcpy the whole buffer
+ if (rows.metadata().is_fixed_length &&
+ col_prep.metadata().fixed_length == rows.metadata().fixed_length) {
+ DCHECK_EQ(offset_within_row, 0);
+ uint32_t row_size = rows.metadata().fixed_length;
+ memcpy(col_prep.mutable_data(1), rows.data(1) + start_row * row_size,
+ num_rows * row_size);
+ } else if (rows.metadata().is_fixed_length) {
+ uint32_t row_size = rows.metadata().fixed_length;
+ const uint8_t* row_base = rows.data(1) + start_row * row_size;
+ row_base += offset_within_row;
+ uint8_t* col_base = col_prep.mutable_data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ col_base[i] = row_base[i * row_size];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint16_t*>(col_base)[i] =
+ *reinterpret_cast<const uint16_t*>(row_base + i * row_size);
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint32_t*>(col_base)[i] =
+ *reinterpret_cast<const uint32_t*>(row_base + i * row_size);
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint64_t*>(col_base)[i] =
+ *reinterpret_cast<const uint64_t*>(row_base + i * row_size);
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ } else {
+ const uint32_t* row_offsets = rows.offsets() + start_row;
+ const uint8_t* row_base = rows.data(2);
+ row_base += offset_within_row;
+ uint8_t* col_base = col_prep.mutable_data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ col_base[i] = row_base[row_offsets[i]];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint16_t*>(col_base)[i] =
+ *reinterpret_cast<const uint16_t*>(row_base + row_offsets[i]);
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint32_t*>(col_base)[i] =
+ *reinterpret_cast<const uint32_t*>(row_base + row_offsets[i]);
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint64_t*>(col_base)[i] =
+ *reinterpret_cast<const uint64_t*>(row_base + row_offsets[i]);
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ }
+
+ if (UsesTransform(*col)) {
+ PostDecode(col_prep, col, ctx);
+ }
+}
+
+bool KeyEncoder::EncoderBinary::IsInteger(const KeyColumnMetadata& metadata) {
+ bool is_fixed_length = metadata.is_fixed_length;
+ auto size = metadata.fixed_length;
+ return is_fixed_length &&
+ (size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
+}
+
+void KeyEncoder::EncoderBinary::Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp) {
+ if (IsInteger(col.metadata())) {
+ EncoderInteger::Encode(offset_within_row, rows, col, ctx, temp);
+ } else {
+ KeyColumnArray col_prep;
+ if (EncoderInteger::UsesTransform(col)) {
+ col_prep = EncoderInteger::ArrayReplace(col, *temp);
+ EncoderInteger::PreEncode(col, &col_prep, ctx);
+ } else {
+ col_prep = col;
+ }
+
+ bool is_row_fixed_length = rows->metadata().is_fixed_length;
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ EncodeHelper_avx2(is_row_fixed_length, offset_within_row, rows, col);
+ } else {
+#endif
+ if (is_row_fixed_length) {
+ EncodeImp<true>(offset_within_row, rows, col);
+ } else {
+ EncodeImp<false>(offset_within_row, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+ }
+
+ DCHECK(temp->metadata().is_fixed_length);
+ DCHECK(temp->length() * temp->metadata().fixed_length >=
+ col.length() * static_cast<int64_t>(sizeof(uint16_t)));
+
+ KeyColumnArray temp16bit(KeyColumnMetadata(true, sizeof(uint16_t)), col.length(),
+ nullptr, temp->mutable_data(1), nullptr);
+ ColumnMemsetNulls(offset_within_row, rows, col, ctx, &temp16bit, 0xae);
+}
+
+void KeyEncoder::EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp) {
+ if (IsInteger(col->metadata())) {
+ EncoderInteger::Decode(start_row, num_rows, offset_within_row, rows, col, ctx, temp);
+ } else {
+ KeyColumnArray col_prep;
+ if (EncoderInteger::UsesTransform(*col)) {
+ col_prep = EncoderInteger::ArrayReplace(*col, *temp);
+ } else {
+ col_prep = *col;
+ }
+
+ bool is_row_fixed_length = rows.metadata().is_fixed_length;
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows, offset_within_row, rows,
+ col);
+ } else {
+#endif
+ if (is_row_fixed_length) {
+ DecodeImp<true>(start_row, num_rows, offset_within_row, rows, col);
+ } else {
+ DecodeImp<false>(start_row, num_rows, offset_within_row, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+
+ if (EncoderInteger::UsesTransform(*col)) {
+ EncoderInteger::PostDecode(col_prep, col, ctx);
+ }
+ }
+}
+
+template <bool is_row_fixed_length>
+void KeyEncoder::EncoderBinary::EncodeImp(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col) {
+ EncodeDecodeHelper<is_row_fixed_length, true>(
+ 0, static_cast<uint32_t>(col.length()), offset_within_row, rows, rows, &col,
+ nullptr, [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ uint32_t istripe;
+ for (istripe = 0; istripe < length / 8; ++istripe) {
+ dst64[istripe] = util::SafeLoad(src64 + istripe);
+ }
+ if ((length % 8) > 0) {
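+ // mask_last selects only the low (length % 8) bytes of the final
+ // 8-byte stripe, so the remaining bytes of the destination word
+ // are preserved.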
+ uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length));
+ dst64[istripe] = (dst64[istripe] & ~mask_last) |
+ (util::SafeLoad(src64 + istripe) & mask_last);
+ }
+ });
+}
+
+template <bool is_row_fixed_length>
+void KeyEncoder::EncoderBinary::DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col) {
+ EncodeDecodeHelper<is_row_fixed_length, false>(
+ start_row, num_rows, offset_within_row, &rows, nullptr, col, col,
+ [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ util::SafeStore(dst64 + istripe, src64[istripe]);
+ }
+ });
+}
+
+void KeyEncoder::EncoderBinary::ColumnMemsetNulls(
+ uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) {
+ using ColumnMemsetNullsImp_t = void (*)(uint32_t, KeyRowArray*, const KeyColumnArray&,
+ KeyEncoderContext*, KeyColumnArray*, uint8_t);
+ static const ColumnMemsetNullsImp_t ColumnMemsetNullsImp_fn[] = {
+ ColumnMemsetNullsImp<false, 1>, ColumnMemsetNullsImp<false, 2>,
+ ColumnMemsetNullsImp<false, 4>, ColumnMemsetNullsImp<false, 8>,
+ ColumnMemsetNullsImp<false, 16>, ColumnMemsetNullsImp<true, 1>,
+ ColumnMemsetNullsImp<true, 2>, ColumnMemsetNullsImp<true, 4>,
+ ColumnMemsetNullsImp<true, 8>, ColumnMemsetNullsImp<true, 16>};
+ uint32_t col_width = col.metadata().fixed_length;
+ int dispatch_const =
+ (rows->metadata().is_fixed_length ? 5 : 0) +
+ (col_width == 1 ? 0
+ : col_width == 2 ? 1 : col_width == 4 ? 2 : col_width == 8 ? 3 : 4);
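+ // For example, fixed-length rows with an 8-byte column dispatch to
+ // index 5 + 3 = 8, i.e. ColumnMemsetNullsImp<true, 8>.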
+ ColumnMemsetNullsImp_fn[dispatch_const](offset_within_row, rows, col, ctx,
+ temp_vector_16bit, byte_value);
+}
+
+template <bool is_row_fixed_length, uint32_t col_width>
+void KeyEncoder::EncoderBinary::ColumnMemsetNullsImp(
+ uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) {
+ // Nothing to do when there are no nulls
+ if (!col.data(0)) {
+ return;
+ }
+
+ const auto num_rows = static_cast<uint32_t>(col.length());
+
+ // Temp vector needs space for the required number of rows
+ DCHECK(temp_vector_16bit->length() >= num_rows);
+ DCHECK(temp_vector_16bit->metadata().is_fixed_length &&
+ temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t));
+ auto temp_vector = reinterpret_cast<uint16_t*>(temp_vector_16bit->mutable_data(1));
+
+ // Convert the non-nulls bit vector into a vector of indexes of null positions
+ int num_selected;
+ util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, static_cast<int>(col.length()),
+ col.data(0), &num_selected, temp_vector,
+ col.bit_offset(0));
+
+ for (int i = 0; i < num_selected; ++i) {
+ uint32_t row_id = temp_vector[i];
+
+ // Target binary field pointer
+ uint8_t* dst;
+ if (is_row_fixed_length) {
+ dst = rows->mutable_data(1) + rows->metadata().fixed_length * row_id;
+ } else {
+ dst = rows->mutable_data(2) + rows->offsets()[row_id];
+ }
+ dst += offset_within_row;
+
+ if (col_width == 1) {
+ *dst = byte_value;
+ } else if (col_width == 2) {
+ *reinterpret_cast<uint16_t*>(dst) =
+ (static_cast<uint16_t>(byte_value) * static_cast<uint16_t>(0x0101));
+ } else if (col_width == 4) {
+ *reinterpret_cast<uint32_t*>(dst) =
+ (static_cast<uint32_t>(byte_value) * static_cast<uint32_t>(0x01010101));
+ } else if (col_width == 8) {
+ *reinterpret_cast<uint64_t*>(dst) =
+ (static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL);
+ } else {
+ uint64_t value = (static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL);
+ uint32_t col_width_actual = col.metadata().fixed_length;
+ uint32_t j;
+ for (j = 0; j < col_width_actual / 8; ++j) {
+ reinterpret_cast<uint64_t*>(dst)[j] = value;
+ }
+ int tail = col_width_actual % 8;
+ if (tail) {
+ uint64_t mask = ~0ULL >> (8 * (8 - tail));
+ reinterpret_cast<uint64_t*>(dst)[j] =
+ (reinterpret_cast<const uint64_t*>(dst)[j] & ~mask) | (value & mask);
+ }
+ }
+ }
+}
+
+void KeyEncoder::EncoderBinaryPair::Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2,
+ KeyEncoderContext* ctx, KeyColumnArray* temp1,
+ KeyColumnArray* temp2) {
+ DCHECK(CanProcessPair(col1.metadata(), col2.metadata()));
+
+ KeyColumnArray col_prep[2];
+ if (EncoderInteger::UsesTransform(col1)) {
+ col_prep[0] = EncoderInteger::ArrayReplace(col1, *temp1);
+ EncoderInteger::PreEncode(col1, &(col_prep[0]), ctx);
+ } else {
+ col_prep[0] = col1;
+ }
+ if (EncoderInteger::UsesTransform(col2)) {
+ col_prep[1] = EncoderInteger::ArrayReplace(col2, *temp2);
+ EncoderInteger::PreEncode(col2, &(col_prep[1]), ctx);
+ } else {
+ col_prep[1] = col2;
+ }
+
+ uint32_t col_width1 = col_prep[0].metadata().fixed_length;
+ uint32_t col_width2 = col_prep[1].metadata().fixed_length;
+ int log_col_width1 =
+ col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
+ int log_col_width2 =
+ col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
+
+ bool is_row_fixed_length = rows->metadata().is_fixed_length;
+
+ const auto num_rows = static_cast<uint32_t>(col1.length());
+ uint32_t num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && col_width1 == col_width2) {
+ num_processed = EncodeHelper_avx2(is_row_fixed_length, col_width1, offset_within_row,
+ rows, col_prep[0], col_prep[1]);
+ }
+#endif
+ if (num_processed < num_rows) {
+ using EncodeImp_t = void (*)(uint32_t, uint32_t, KeyRowArray*, const KeyColumnArray&,
+ const KeyColumnArray&);
+ static const EncodeImp_t EncodeImp_fn[] = {
+ EncodeImp<false, uint8_t, uint8_t>, EncodeImp<false, uint16_t, uint8_t>,
+ EncodeImp<false, uint32_t, uint8_t>, EncodeImp<false, uint64_t, uint8_t>,
+ EncodeImp<false, uint8_t, uint16_t>, EncodeImp<false, uint16_t, uint16_t>,
+ EncodeImp<false, uint32_t, uint16_t>, EncodeImp<false, uint64_t, uint16_t>,
+ EncodeImp<false, uint8_t, uint32_t>, EncodeImp<false, uint16_t, uint32_t>,
+ EncodeImp<false, uint32_t, uint32_t>, EncodeImp<false, uint64_t, uint32_t>,
+ EncodeImp<false, uint8_t, uint64_t>, EncodeImp<false, uint16_t, uint64_t>,
+ EncodeImp<false, uint32_t, uint64_t>, EncodeImp<false, uint64_t, uint64_t>,
+ EncodeImp<true, uint8_t, uint8_t>, EncodeImp<true, uint16_t, uint8_t>,
+ EncodeImp<true, uint32_t, uint8_t>, EncodeImp<true, uint64_t, uint8_t>,
+ EncodeImp<true, uint8_t, uint16_t>, EncodeImp<true, uint16_t, uint16_t>,
+ EncodeImp<true, uint32_t, uint16_t>, EncodeImp<true, uint64_t, uint16_t>,
+ EncodeImp<true, uint8_t, uint32_t>, EncodeImp<true, uint16_t, uint32_t>,
+ EncodeImp<true, uint32_t, uint32_t>, EncodeImp<true, uint64_t, uint32_t>,
+ EncodeImp<true, uint8_t, uint64_t>, EncodeImp<true, uint16_t, uint64_t>,
+ EncodeImp<true, uint32_t, uint64_t>, EncodeImp<true, uint64_t, uint64_t>};
+ int dispatch_const = (log_col_width2 << 2) | log_col_width1;
+ dispatch_const += (is_row_fixed_length ? 16 : 0);
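+ // For example, varying-length rows with column widths (2, 4) give
+ // dispatch_const = ((2 << 2) | 1) = 9, selecting
+ // EncodeImp<false, uint16_t, uint32_t>.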
+ EncodeImp_fn[dispatch_const](num_processed, offset_within_row, rows, col_prep[0],
+ col_prep[1]);
+ }
+}
+
+template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+void KeyEncoder::EncoderBinaryPair::EncodeImp(uint32_t num_rows_to_skip,
+ uint32_t offset_within_row,
+ KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2) {
+ const uint8_t* src_A = col1.data(1);
+ const uint8_t* src_B = col2.data(1);
+
+ const auto num_rows = static_cast<uint32_t>(col1.length());
+
+ uint32_t fixed_length = rows->metadata().fixed_length;
+ const uint32_t* offsets;
+ uint8_t* dst_base;
+ if (is_row_fixed_length) {
+ dst_base = rows->mutable_data(1) + offset_within_row;
+ offsets = nullptr;
+ } else {
+ dst_base = rows->mutable_data(2) + offset_within_row;
+ offsets = rows->offsets();
+ }
+
+ using col1_type_const = typename std::add_const<col1_type>::type;
+ using col2_type_const = typename std::add_const<col2_type>::type;
+
+ if (is_row_fixed_length) {
+ uint8_t* dst = dst_base + num_rows_to_skip * fixed_length;
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ *reinterpret_cast<col1_type*>(dst) = reinterpret_cast<col1_type_const*>(src_A)[i];
+ *reinterpret_cast<col2_type*>(dst + sizeof(col1_type)) =
+ reinterpret_cast<col2_type_const*>(src_B)[i];
+ dst += fixed_length;
+ }
+ } else {
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ uint8_t* dst = dst_base + offsets[i];
+ *reinterpret_cast<col1_type*>(dst) = reinterpret_cast<col1_type_const*>(src_A)[i];
+ *reinterpret_cast<col2_type*>(dst + sizeof(col1_type)) =
+ reinterpret_cast<col2_type_const*>(src_B)[i];
+ }
+ }
+}
+
+void KeyEncoder::EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col1,
+ KeyColumnArray* col2, KeyEncoderContext* ctx,
+ KeyColumnArray* temp1, KeyColumnArray* temp2) {
+ DCHECK(CanProcessPair(col1->metadata(), col2->metadata()));
+
+ KeyColumnArray col_prep[2];
+ if (EncoderInteger::UsesTransform(*col1)) {
+ col_prep[0] = EncoderInteger::ArrayReplace(*col1, *temp1);
+ } else {
+ col_prep[0] = *col1;
+ }
+ if (EncoderInteger::UsesTransform(*col2)) {
+ col_prep[1] = EncoderInteger::ArrayReplace(*col2, *temp2);
+ } else {
+ col_prep[1] = *col2;
+ }
+
+ uint32_t col_width1 = col_prep[0].metadata().fixed_length;
+ uint32_t col_width2 = col_prep[1].metadata().fixed_length;
+ int log_col_width1 =
+ col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
+ int log_col_width2 =
+ col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
+
+ bool is_row_fixed_length = rows.metadata().is_fixed_length;
+
+ uint32_t num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && col_width1 == col_width2) {
+ num_processed =
+ DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows,
+ offset_within_row, rows, &col_prep[0], &col_prep[1]);
+ }
+#endif
+ if (num_processed < num_rows) {
+ using DecodeImp_t = void (*)(uint32_t, uint32_t, uint32_t, uint32_t,
+ const KeyRowArray&, KeyColumnArray*, KeyColumnArray*);
+ static const DecodeImp_t DecodeImp_fn[] = {
+ DecodeImp<false, uint8_t, uint8_t>, DecodeImp<false, uint16_t, uint8_t>,
+ DecodeImp<false, uint32_t, uint8_t>, DecodeImp<false, uint64_t, uint8_t>,
+ DecodeImp<false, uint8_t, uint16_t>, DecodeImp<false, uint16_t, uint16_t>,
+ DecodeImp<false, uint32_t, uint16_t>, DecodeImp<false, uint64_t, uint16_t>,
+ DecodeImp<false, uint8_t, uint32_t>, DecodeImp<false, uint16_t, uint32_t>,
+ DecodeImp<false, uint32_t, uint32_t>, DecodeImp<false, uint64_t, uint32_t>,
+ DecodeImp<false, uint8_t, uint64_t>, DecodeImp<false, uint16_t, uint64_t>,
+ DecodeImp<false, uint32_t, uint64_t>, DecodeImp<false, uint64_t, uint64_t>,
+ DecodeImp<true, uint8_t, uint8_t>, DecodeImp<true, uint16_t, uint8_t>,
+ DecodeImp<true, uint32_t, uint8_t>, DecodeImp<true, uint64_t, uint8_t>,
+ DecodeImp<true, uint8_t, uint16_t>, DecodeImp<true, uint16_t, uint16_t>,
+ DecodeImp<true, uint32_t, uint16_t>, DecodeImp<true, uint64_t, uint16_t>,
+ DecodeImp<true, uint8_t, uint32_t>, DecodeImp<true, uint16_t, uint32_t>,
+ DecodeImp<true, uint32_t, uint32_t>, DecodeImp<true, uint64_t, uint32_t>,
+ DecodeImp<true, uint8_t, uint64_t>, DecodeImp<true, uint16_t, uint64_t>,
+ DecodeImp<true, uint32_t, uint64_t>, DecodeImp<true, uint64_t, uint64_t>};
+ int dispatch_const =
+ (log_col_width2 << 2) | log_col_width1 | (is_row_fixed_length ? 16 : 0);
+ DecodeImp_fn[dispatch_const](num_processed, start_row, num_rows, offset_within_row,
+ rows, &(col_prep[0]), &(col_prep[1]));
+ }
+
+ if (EncoderInteger::UsesTransform(*col1)) {
+ EncoderInteger::PostDecode(col_prep[0], col1, ctx);
+ }
+ if (EncoderInteger::UsesTransform(*col2)) {
+ EncoderInteger::PostDecode(col_prep[1], col2, ctx);
+ }
+}
+
+template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+void KeyEncoder::EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip,
+ uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows,
+ KeyColumnArray* col1,
+ KeyColumnArray* col2) {
+ DCHECK(rows.length() >= start_row + num_rows);
+ DCHECK(col1->length() == num_rows && col2->length() == num_rows);
+
+ uint8_t* dst_A = col1->mutable_data(1);
+ uint8_t* dst_B = col2->mutable_data(1);
+
+ uint32_t fixed_length = rows.metadata().fixed_length;
+ const uint32_t* offsets;
+ const uint8_t* src_base;
+ if (is_row_fixed_length) {
+ src_base = rows.data(1) + fixed_length * start_row + offset_within_row;
+ offsets = nullptr;
+ } else {
+ src_base = rows.data(2) + offset_within_row;
+ offsets = rows.offsets() + start_row;
+ }
+
+ using col1_type_const = typename std::add_const<col1_type>::type;
+ using col2_type_const = typename std::add_const<col2_type>::type;
+
+ if (is_row_fixed_length) {
+ const uint8_t* src = src_base + num_rows_to_skip * fixed_length;
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ reinterpret_cast<col1_type*>(dst_A)[i] = *reinterpret_cast<col1_type_const*>(src);
+ reinterpret_cast<col2_type*>(dst_B)[i] =
+ *reinterpret_cast<col2_type_const*>(src + sizeof(col1_type));
+ src += fixed_length;
+ }
+ } else {
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ const uint8_t* src = src_base + offsets[i];
+ reinterpret_cast<col1_type*>(dst_A)[i] = *reinterpret_cast<col1_type_const*>(src);
+ reinterpret_cast<col2_type*>(dst_B)[i] =
+ *reinterpret_cast<col2_type_const*>(src + sizeof(col1_type));
+ }
+ }
+}
+
+void KeyEncoder::EncoderOffsets::Encode(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols,
+ KeyEncoderContext* ctx) {
+ DCHECK(!varbinary_cols.empty());
+
+ // Rows and columns must all be varying-length
+ DCHECK(!rows->metadata().is_fixed_length);
+ for (const auto& col : varbinary_cols) {
+ DCHECK(!col.metadata().is_fixed_length);
+ }
+
+ const auto num_rows = static_cast<uint32_t>(varbinary_cols[0].length());
+
+ uint32_t num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ // Whether any of the columns has a non-zero starting bit offset for its non-nulls bit vector
+ bool has_bit_offset = false;
+
+ // The row array and every column must hold exactly the same number of rows
+ DCHECK(rows->length() == num_rows);
+ for (const auto& col : varbinary_cols) {
+ DCHECK(col.length() == num_rows);
+ if (col.bit_offset(0) != 0) {
+ has_bit_offset = true;
+ }
+ }
+
+ if (ctx->has_avx2() && !has_bit_offset) {
+ // Create a temp vector sized based on the number of columns
+ auto temp_buffer_holder = util::TempVectorHolder<uint32_t>(
+ ctx->stack, static_cast<uint32_t>(varbinary_cols.size()) * 8);
+ auto temp_buffer_32B_per_col = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint32_t)), varbinary_cols.size() * 8, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder.mutable_data()), nullptr);
+
+ num_processed = EncodeImp_avx2(rows, varbinary_cols, &temp_buffer_32B_per_col);
+ }
+#endif
+ if (num_processed < num_rows) {
+ EncodeImp(num_processed, rows, varbinary_cols);
+ }
+}
+
+void KeyEncoder::EncoderOffsets::EncodeImp(
+ uint32_t num_rows_already_processed, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols) {
+ DCHECK_GT(varbinary_cols.size(), 0);
+
+ int row_alignment = rows->metadata().row_alignment;
+ int string_alignment = rows->metadata().string_alignment;
+
+ uint32_t* row_offsets = rows->mutable_offsets();
+ uint8_t* row_values = rows->mutable_data(2);
+ const auto num_rows = static_cast<uint32_t>(varbinary_cols[0].length());
+
+ if (num_rows_already_processed == 0) {
+ row_offsets[0] = 0;
+ }
+
+ uint32_t row_offset = row_offsets[num_rows_already_processed];
+ for (uint32_t i = num_rows_already_processed; i < num_rows; ++i) {
+ uint32_t* varbinary_end =
+ rows->metadata().varbinary_end_array(row_values + row_offset);
+
+ // Zero out lengths for nulls.
+ // Add lengths of all columns to get row size.
+ // Store varbinary field ends while summing their lengths.
+
+ uint32_t offset_within_row = rows->metadata().fixed_length;
+
+ for (size_t col = 0; col < varbinary_cols.size(); ++col) {
+ const uint32_t* col_offsets = varbinary_cols[col].offsets();
+ uint32_t col_length = col_offsets[i + 1] - col_offsets[i];
+
+ const int bit_offset = varbinary_cols[col].bit_offset(0);
+
+ const uint8_t* non_nulls = varbinary_cols[col].data(0);
+ if (non_nulls && BitUtil::GetBit(non_nulls, bit_offset + i) == 0) {
+ col_length = 0;
+ }
+
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment);
+ offset_within_row += col_length;
+
+ varbinary_end[col] = offset_within_row;
+ }
+
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, row_alignment);
+ row_offset += offset_within_row;
+ row_offsets[i + 1] = row_offset;
+ }
+}
+
+void KeyEncoder::EncoderOffsets::Decode(
+ uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* varbinary_cols,
+ const std::vector<uint32_t>& varbinary_cols_base_offset, KeyEncoderContext* ctx) {
+ DCHECK(!varbinary_cols->empty());
+ DCHECK(varbinary_cols->size() == varbinary_cols_base_offset.size());
+
+ DCHECK(!rows.metadata().is_fixed_length);
+ DCHECK(rows.length() >= start_row + num_rows);
+ for (const auto& col : *varbinary_cols) {
+ // Rows and columns must all be varying-length
+ DCHECK(!col.metadata().is_fixed_length);
+ // Each column must have space for exactly the number of selected rows
+ DCHECK(col.length() == num_rows);
+ }
+
+ // Offsets of varbinary columns' data within each encoded row are stored
+ // in that same row as an array of 32-bit integers, one element per
+ // varying-length column, placed immediately after the data of the
+ // fixed-length columns. The Nth element is the offset within the row of
+ // the first byte past the end of the Nth varbinary column's data, that
+ // is, the fixed-length part plus all varbinary lengths and alignment
+ // padding up to and including the Nth column.
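+ //
+ // For illustration (hypothetical values, not taken from a test): with
+ // fixed_length = 12, string_alignment = 4 and varbinary values "ab" and
+ // "cdef" in a row, the end array holds {14, 20}: "ab" occupies bytes
+ // [12, 14) and "cdef", after padding to the next multiple of 4,
+ // occupies bytes [16, 20).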
+
+ const uint32_t* row_offsets = rows.offsets() + start_row;
+
+ // Set the base offset for each column
+ for (size_t col = 0; col < varbinary_cols->size(); ++col) {
+ uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets();
+ col_offsets[0] = varbinary_cols_base_offset[col];
+ }
+
+ int string_alignment = rows.metadata().string_alignment;
+
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ // Find the beginning of the cumulative lengths array for the next row
+ const uint8_t* row = rows.data(2) + row_offsets[i];
+ const uint32_t* varbinary_ends = rows.metadata().varbinary_end_array(row);
+
+ // Update the offset of each column
+ uint32_t offset_within_row = rows.metadata().fixed_length;
+ for (size_t col = 0; col < varbinary_cols->size(); ++col) {
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment);
+ uint32_t length = varbinary_ends[col] - offset_within_row;
+ offset_within_row = varbinary_ends[col];
+ uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets();
+ col_offsets[i + 1] = col_offsets[i] + length;
+ }
+ }
+}
+
+void KeyEncoder::EncoderVarBinary::Encode(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col,
+ KeyEncoderContext* ctx) {
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ EncodeHelper_avx2(varbinary_col_id, rows, col);
+ } else {
+#endif
+ if (varbinary_col_id == 0) {
+ EncodeImp<true>(varbinary_col_id, rows, col);
+ } else {
+ EncodeImp<false>(varbinary_col_id, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+void KeyEncoder::EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx) {
+ // The output column's varbinary buffer needs an extra 32B at the end
+ // in the AVX2 version and an extra 8B otherwise.
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col);
+ } else {
+#endif
+ if (varbinary_col_id == 0) {
+ DecodeImp<true>(start_row, num_rows, varbinary_col_id, rows, col);
+ } else {
+ DecodeImp<false>(start_row, num_rows, varbinary_col_id, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+template <bool first_varbinary_col>
+void KeyEncoder::EncoderVarBinary::EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col) {
+ EncodeDecodeHelper<first_varbinary_col, true>(
+ 0, static_cast<uint32_t>(col.length()), varbinary_col_id, rows, rows, &col, nullptr,
+ [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ uint32_t istripe;
+ for (istripe = 0; istripe < length / 8; ++istripe) {
+ dst64[istripe] = util::SafeLoad(src64 + istripe);
+ }
+ if ((length % 8) > 0) {
+ uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length));
+ dst64[istripe] = (dst64[istripe] & ~mask_last) |
+ (util::SafeLoad(src64 + istripe) & mask_last);
+ }
+ });
+}
+
+template <bool first_varbinary_col>
+void KeyEncoder::EncoderVarBinary::DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id,
+ const KeyRowArray& rows,
+ KeyColumnArray* col) {
+ EncodeDecodeHelper<first_varbinary_col, false>(
+ start_row, num_rows, varbinary_col_id, &rows, nullptr, col, col,
+ [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ util::SafeStore(dst64 + istripe, src64[istripe]);
+ }
+ });
+}
+
+void KeyEncoder::EncoderNulls::Encode(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& cols,
+ KeyEncoderContext* ctx,
+ KeyColumnArray* temp_vector_16bit) {
+ DCHECK_GT(cols.size(), 0);
+ const auto num_rows = static_cast<uint32_t>(rows->length());
+
+ // All input columns should have the same number of rows.
+ // They may or may not have non-nulls bit-vectors allocated.
+ for (const auto& col : cols) {
+ DCHECK(col.length() == num_rows);
+ }
+
+ // Temp vector needs space for the required number of rows
+ DCHECK(temp_vector_16bit->length() >= num_rows);
+ DCHECK(temp_vector_16bit->metadata().is_fixed_length &&
+ temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t));
+
+ uint8_t* null_masks = rows->null_masks();
+ uint32_t null_masks_bytes_per_row = rows->metadata().null_masks_bytes_per_row;
+ memset(null_masks, 0, null_masks_bytes_per_row * num_rows);
+ for (size_t col = 0; col < cols.size(); ++col) {
+ const uint8_t* non_nulls = cols[col].data(0);
+ if (!non_nulls) {
+ continue;
+ }
+ int bit_offset = cols[col].bit_offset(0);
+ DCHECK_LT(bit_offset, 8);
+ int num_selected;
+ util::BitUtil::bits_to_indexes(
+ 0, ctx->hardware_flags, num_rows, non_nulls, &num_selected,
+ reinterpret_cast<uint16_t*>(temp_vector_16bit->mutable_data(1)), bit_offset);
+ for (int i = 0; i < num_selected; ++i) {
+ uint16_t row_id = reinterpret_cast<const uint16_t*>(temp_vector_16bit->data(1))[i];
+ int64_t null_masks_bit_id = row_id * null_masks_bytes_per_row * 8 + col;
+ BitUtil::SetBit(null_masks, null_masks_bit_id);
+ }
+ }
+}
+
+void KeyEncoder::EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows,
+ const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols) {
+ // Every output column needs space for exactly the required number of
+ // rows. It also needs its non-nulls bit-vector allocated and mutable.
+ DCHECK_GT(cols->size(), 0);
+ for (auto& col : *cols) {
+ DCHECK(col.length() == num_rows);
+ DCHECK(col.mutable_data(0));
+ }
+
+ const uint8_t* null_masks = rows.null_masks();
+ uint32_t null_masks_bytes_per_row = rows.metadata().null_masks_bytes_per_row;
+ for (size_t col = 0; col < cols->size(); ++col) {
+ uint8_t* non_nulls = (*cols)[col].mutable_data(0);
+ const int bit_offset = (*cols)[col].bit_offset(0);
+ DCHECK_LT(bit_offset, 8);
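+ // First mark every row as non-null, then clear the bit of each row
+ // whose null-mask bit is set in the encoded rows.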
+ non_nulls[0] |= 0xff << (bit_offset);
+ if (bit_offset + num_rows > 8) {
+ int bits_in_first_byte = 8 - bit_offset;
+ memset(non_nulls + 1, 0xff, BitUtil::BytesForBits(num_rows - bits_in_first_byte));
+ }
+ for (uint32_t row = 0; row < num_rows; ++row) {
+ uint32_t null_masks_bit_id =
+ (start_row + row) * null_masks_bytes_per_row * 8 + static_cast<uint32_t>(col);
+ bool is_set = BitUtil::GetBit(null_masks, null_masks_bit_id);
+ if (is_set) {
+ BitUtil::ClearBit(non_nulls, bit_offset + row);
+ }
+ }
+ }
+}
+
+uint32_t KeyEncoder::KeyRowMetadata::num_varbinary_cols() const {
+ uint32_t result = 0;
+ for (auto column_metadata : column_metadatas) {
+ if (!column_metadata.is_fixed_length) {
+ ++result;
+ }
+ }
+ return result;
+}
+
+bool KeyEncoder::KeyRowMetadata::is_compatible(const KeyRowMetadata& other) const {
+ if (other.num_cols() != num_cols()) {
+ return false;
+ }
+ if (row_alignment != other.row_alignment ||
+ string_alignment != other.string_alignment) {
+ return false;
+ }
+ for (size_t i = 0; i < column_metadatas.size(); ++i) {
+ if (column_metadatas[i].is_fixed_length !=
+ other.column_metadatas[i].is_fixed_length) {
+ return false;
+ }
+ if (column_metadatas[i].fixed_length != other.column_metadatas[i].fixed_length) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void KeyEncoder::KeyRowMetadata::FromColumnMetadataVector(
+ const std::vector<KeyColumnMetadata>& cols, int in_row_alignment,
+ int in_string_alignment) {
+ column_metadatas.resize(cols.size());
+ for (size_t i = 0; i < cols.size(); ++i) {
+ column_metadatas[i] = cols[i];
+ }
+
+ const auto num_cols = static_cast<uint32_t>(cols.size());
+
+ // Sort columns.
+ // Columns are sorted based on the size in bytes of their fixed-length part.
+ // For a varying-length column, the fixed-length part is the 32-bit field
+ // storing the cumulative length of its varying-length values.
+ // The rules are:
+ // a) A boolean column, marked with fixed-length 0, is considered to have a
+ //    fixed-length part of 1 byte.
+ // b) Columns whose fixed-length part is a power of 2 or a multiple of the
+ //    row alignment precede other columns, and are sorted among themselves
+ //    in decreasing order of the size of their fixed-length part.
+ // c) A fixed-length column precedes a varying-length column when both have
+ //    fixed-length parts of the same size.
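+ //
+ // For example (hypothetical schema): columns {bool, int64, string, int32}
+ // are ordered as {int64, int32, string, bool}: the 8-byte and 4-byte
+ // widths come first, the fixed-length int32 precedes the equally wide
+ // 4-byte offset field of the string column, and the boolean byte goes
+ // last.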
+ column_order.resize(num_cols);
+ for (uint32_t i = 0; i < num_cols; ++i) {
+ column_order[i] = i;
+ }
+ std::sort(
+ column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) {
+ bool is_left_pow2 =
+ !cols[left].is_fixed_length || ARROW_POPCOUNT64(cols[left].fixed_length) <= 1;
+ bool is_right_pow2 = !cols[right].is_fixed_length ||
+ ARROW_POPCOUNT64(cols[right].fixed_length) <= 1;
+ bool is_left_fixedlen = cols[left].is_fixed_length;
+ bool is_right_fixedlen = cols[right].is_fixed_length;
+ uint32_t width_left =
+ cols[left].is_fixed_length ? cols[left].fixed_length : sizeof(uint32_t);
+ uint32_t width_right =
+ cols[right].is_fixed_length ? cols[right].fixed_length : sizeof(uint32_t);
+ if (is_left_pow2 != is_right_pow2) {
+ return is_left_pow2;
+ }
+ if (!is_left_pow2) {
+ return left < right;
+ }
+ if (width_left != width_right) {
+ return width_left > width_right;
+ }
+ if (is_left_fixedlen != is_right_fixedlen) {
+ return is_left_fixedlen;
+ }
+ return left < right;
+ });
+
+ row_alignment = in_row_alignment;
+ string_alignment = in_string_alignment;
+ varbinary_end_array_offset = 0;
+
+ column_offsets.resize(num_cols);
+ uint32_t num_varbinary_cols = 0;
+ uint32_t offset_within_row = 0;
+ for (uint32_t i = 0; i < num_cols; ++i) {
+ const KeyColumnMetadata& col = cols[column_order[i]];
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment, col);
+ column_offsets[i] = offset_within_row;
+ if (!col.is_fixed_length) {
+ if (num_varbinary_cols == 0) {
+ varbinary_end_array_offset = offset_within_row;
+ }
+ DCHECK(column_offsets[i] - varbinary_end_array_offset ==
+ num_varbinary_cols * sizeof(uint32_t));
+ ++num_varbinary_cols;
+ offset_within_row += sizeof(uint32_t);
+ } else {
+ // A boolean column is a bit-vector, which is indicated by setting the
+ // fixed length in the column metadata to zero. It is stored as one
+ // byte per value in the output row.
+ if (col.fixed_length == 0) {
+ offset_within_row += 1;
+ } else {
+ offset_within_row += col.fixed_length;
+ }
+ }
+ }
+
+ is_fixed_length = (num_varbinary_cols == 0);
+ fixed_length =
+ offset_within_row +
+ KeyRowMetadata::padding_for_alignment(
+ offset_within_row, num_varbinary_cols == 0 ? row_alignment : string_alignment);
+
+ // We set the number of bytes per row used to store the null masks of
+ // individual key columns to a power of two. This is not required; it
+ // could instead be the minimal number of bytes needed to hold one bit
+ // per column.
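+ // For example: 1-8 columns use 1 byte per row, 9-16 use 2 bytes and
+ // 17-32 use 4 bytes.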
+ null_masks_bytes_per_row = 1;
+ while (static_cast<uint32_t>(null_masks_bytes_per_row * 8) < num_cols) {
+ null_masks_bytes_per_row *= 2;
+ }
+}
+
+void KeyEncoder::Init(const std::vector<KeyColumnMetadata>& cols, KeyEncoderContext* ctx,
+ int row_alignment, int string_alignment) {
+ ctx_ = ctx;
+ row_metadata_.FromColumnMetadataVector(cols, row_alignment, string_alignment);
+ uint32_t num_cols = row_metadata_.num_cols();
+ uint32_t num_varbinary_cols = row_metadata_.num_varbinary_cols();
+ batch_all_cols_.resize(num_cols);
+ batch_varbinary_cols_.resize(num_varbinary_cols);
+ batch_varbinary_cols_base_offsets_.resize(num_varbinary_cols);
+}
+
+void KeyEncoder::PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
+ const std::vector<KeyColumnArray>& cols_in) {
+ const auto num_cols = static_cast<uint32_t>(cols_in.size());
+ DCHECK(batch_all_cols_.size() == num_cols);
+
+ uint32_t num_varbinary_visited = 0;
+ for (uint32_t i = 0; i < num_cols; ++i) {
+ const KeyColumnArray& col = cols_in[row_metadata_.column_order[i]];
+ KeyColumnArray col_window(col, start_row, num_rows);
+ batch_all_cols_[i] = col_window;
+ if (!col.metadata().is_fixed_length) {
+ DCHECK(num_varbinary_visited < batch_varbinary_cols_.size());
+ // If the start row is zero, the base offset of the varbinary column is also zero.
+ if (start_row == 0) {
+ batch_varbinary_cols_base_offsets_[num_varbinary_visited] = 0;
+ } else {
+ batch_varbinary_cols_base_offsets_[num_varbinary_visited] =
+ col.offsets()[start_row];
+ }
+ batch_varbinary_cols_[num_varbinary_visited++] = col_window;
+ }
+ }
+}
+
+Status KeyEncoder::PrepareOutputForEncode(int64_t start_row, int64_t num_rows,
+ KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& all_cols) {
+ int64_t num_bytes_required = 0;
+
+ int64_t fixed_part = row_metadata_.fixed_length * num_rows;
+ int64_t var_part = 0;
+ for (const auto& col : all_cols) {
+ if (!col.metadata().is_fixed_length) {
+ DCHECK(col.length() >= start_row + num_rows);
+ const uint32_t* offsets = col.offsets();
+ var_part += offsets[start_row + num_rows] - offsets[start_row];
+ // Include maximum padding that can be added to align the start of varbinary fields.
+ var_part += num_rows * row_metadata_.string_alignment;
+ }
+ }
+ // Include maximum padding that can be added to align the start of the rows.
+ if (!row_metadata_.is_fixed_length) {
+ fixed_part += row_metadata_.row_alignment * num_rows;
+ }
+ num_bytes_required = fixed_part + var_part;
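+ // num_bytes_required is an upper bound that includes worst-case alignment
+ // padding; for varying-length rows the exact offsets and sizes are
+ // computed later by EncoderOffsets::Encode().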
+
+ rows->Clean();
+ RETURN_NOT_OK(rows->AppendEmpty(static_cast<uint32_t>(num_rows),
+ static_cast<uint32_t>(num_bytes_required)));
+
+ return Status::OK();
+}
+
+void KeyEncoder::Encode(int64_t start_row, int64_t num_rows, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& cols) {
+ // Prepare column array vectors
+ PrepareKeyColumnArrays(start_row, num_rows, cols);
+
+ // Create two temp vectors with 16-bit elements
+ auto temp_buffer_holder_A =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_A = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_A.mutable_data()), nullptr);
+ auto temp_buffer_holder_B =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_B = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_B.mutable_data()), nullptr);
+
+ bool is_row_fixed_length = row_metadata_.is_fixed_length;
+ if (!is_row_fixed_length) {
+ // This call generates and fills in data for both:
+ // - offsets of the encoded rows within the output row array
+ // - end offsets of individual varbinary fields within each row
+ EncoderOffsets::Encode(rows, batch_varbinary_cols_, ctx_);
+
+ for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) {
+ // Memcpy varbinary values into the positions in the output row
+ // buffer precomputed in the previous step.
+ EncoderVarBinary::Encode(static_cast<uint32_t>(i), rows, batch_varbinary_cols_[i],
+ ctx_);
+ }
+ }
+
+ // Process fixed length columns
+ const auto num_cols = static_cast<uint32_t>(batch_all_cols_.size());
+ for (uint32_t i = 0; i < num_cols;) {
+ if (!batch_all_cols_[i].metadata().is_fixed_length) {
+ i += 1;
+ continue;
+ }
+ bool can_process_pair =
+ (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length &&
+ EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(),
+ batch_all_cols_[i + 1].metadata());
+ if (!can_process_pair) {
+ EncoderBinary::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i],
+ ctx_, &temp_buffer_A);
+ i += 1;
+ } else {
+ EncoderBinaryPair::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i],
+ batch_all_cols_[i + 1], ctx_, &temp_buffer_A,
+ &temp_buffer_B);
+ i += 2;
+ }
+ }
+
+ // Process nulls
+ EncoderNulls::Encode(rows, batch_all_cols_, ctx_, &temp_buffer_A);
+}
+
+void KeyEncoder::DecodeFixedLengthBuffers(int64_t start_row_input,
+ int64_t start_row_output, int64_t num_rows,
+ const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols) {
+ // Prepare column array vectors
+ PrepareKeyColumnArrays(start_row_output, num_rows, *cols);
+
+ // Create two temp vectors with 16-bit elements
+ auto temp_buffer_holder_A =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_A = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_A.mutable_data()), nullptr);
+ auto temp_buffer_holder_B =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_B = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_B.mutable_data()), nullptr);
+
+ bool is_row_fixed_length = row_metadata_.is_fixed_length;
+ if (!is_row_fixed_length) {
+ EncoderOffsets::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows), rows, &batch_varbinary_cols_,
+ batch_varbinary_cols_base_offsets_, ctx_);
+ }
+
+ // Process fixed length columns
+ const auto num_cols = static_cast<uint32_t>(batch_all_cols_.size());
+ for (uint32_t i = 0; i < num_cols;) {
+ if (!batch_all_cols_[i].metadata().is_fixed_length) {
+ i += 1;
+ continue;
+ }
+ bool can_process_pair =
+ (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length &&
+ EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(),
+ batch_all_cols_[i + 1].metadata());
+ if (!can_process_pair) {
+ EncoderBinary::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows),
+ row_metadata_.column_offsets[i], rows, &batch_all_cols_[i],
+ ctx_, &temp_buffer_A);
+ i += 1;
+ } else {
+ EncoderBinaryPair::Decode(
+ static_cast<uint32_t>(start_row_input), static_cast<uint32_t>(num_rows),
+ row_metadata_.column_offsets[i], rows, &batch_all_cols_[i],
+ &batch_all_cols_[i + 1], ctx_, &temp_buffer_A, &temp_buffer_B);
+ i += 2;
+ }
+ }
+
+ // Process nulls
+ EncoderNulls::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows), rows, &batch_all_cols_);
+}
+
+void KeyEncoder::DecodeVaryingLengthBuffers(int64_t start_row_input,
+ int64_t start_row_output, int64_t num_rows,
+ const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols) {
+ // Prepare column array vectors
+ PrepareKeyColumnArrays(start_row_output, num_rows, *cols);
+
+ bool is_row_fixed_length = row_metadata_.is_fixed_length;
+ if (!is_row_fixed_length) {
+ for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) {
+ // Memcpy varbinary fields from the encoded rows into the output
+ // column buffers, at the positions decoded in the previous step.
+ EncoderVarBinary::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows), static_cast<uint32_t>(i),
+ rows, &batch_varbinary_cols_[i], ctx_);
+ }
+ }
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h
new file mode 100644
index 00000000000..e5397b9dfd4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h
@@ -0,0 +1,635 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+namespace compute {
+
+class KeyColumnMetadata;
+
+/// Converts between key representation as a collection of arrays for
+/// individual columns and another representation as a single array of rows
+/// combining data from all columns into one value.
+/// This conversion is reversible.
+/// Row-oriented storage is beneficial when there is a need for random access
+/// to individual rows and, at the same time, all included columns are likely
+/// to be accessed together, as in the case of a hash table key.
+class KeyEncoder {
+ public:
+ struct KeyEncoderContext {
+ bool has_avx2() const {
+ return (hardware_flags & arrow::internal::CpuInfo::AVX2) > 0;
+ }
+ int64_t hardware_flags;
+ util::TempVectorStack* stack;
+ };
+
+ /// Description of a storage format of a single key column as needed
+ /// for the purpose of row encoding.
+ struct KeyColumnMetadata {
+ KeyColumnMetadata() = default;
+ KeyColumnMetadata(bool is_fixed_length_in, uint32_t fixed_length_in)
+ : is_fixed_length(is_fixed_length_in), fixed_length(fixed_length_in) {}
+ /// True if the column stores fixed-length binary values; false if it
+ /// stores varying-length binary, using an offsets array to find the
+ /// beginning of each value.
+ bool is_fixed_length;
+ /// For a fixed-length binary column: number of bytes per value.
+ /// Zero has a special meaning, indicating a bit vector with one bit per value.
+ /// For a varying-length binary column: number of bytes per offset.
+ uint32_t fixed_length;
+ };
+
+ /// Description of a storage format for rows produced by encoder.
+ struct KeyRowMetadata {
+ /// True if each row is a fixed-length binary; false if rows are
+ /// varying-length binary, using an offsets array to find the beginning
+ /// of each row.
+ bool is_fixed_length;
+
+ /// For a fixed-length binary row: common size of rows in bytes,
+ /// rounded up to a multiple of the row alignment.
+ ///
+ /// For a varying-length binary row: size of all encoded fixed-length
+ /// key columns, including lengths of varying-length columns, rounded up
+ /// to a multiple of the string alignment.
+ uint32_t fixed_length;
+
+ /// Offset within a row to the array of 32-bit offsets within a row of
+ /// ends of varbinary fields.
+ /// Used only when the row is not fixed-length, zero for fixed-length row.
+ /// There are N elements for N varbinary fields.
+ /// Each element is the offset within a row of the first byte after
+ /// the corresponding varbinary field bytes in that row.
+ /// Since varbinary fields begin at aligned addresses, the end of the
+ /// previous varbinary field needs to be rounded up according to the
+ /// specified alignment to obtain the beginning of the next one.
+ /// The first varbinary field starts at the offset specified by
+ /// fixed_length, which should already be aligned.
+ uint32_t varbinary_end_array_offset;
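+ // As an illustrative (hypothetical) example: with fixed_length = 16,
+ // string_alignment = 4 and two varbinary fields of 5 and 7 bytes, the
+ // first field occupies bytes [16, 21) and the second, after padding 21
+ // up to 24, occupies bytes [24, 31), so varbinary_end_array is {21, 31}.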
+
+ /// Fixed number of bytes per row that are used to encode null masks.
+ /// Null masks indicate for a single row which of its key columns are null.
+ /// Nth bit in the sequence of bytes assigned to a row represents null
+ /// information for Nth field according to the order in which they are encoded.
+ int null_masks_bytes_per_row;
+
+ /// Power of 2. Every row will start at the offset aligned to that number of bytes.
+ int row_alignment;
+
+ /// Power of 2. Must be no greater than row alignment.
+ /// Every non-power-of-2 binary field and every varbinary field bytes
+ /// will start aligned to that number of bytes.
+ int string_alignment;
+
+ /// Metadata of encoded columns in their original order.
+ std::vector<KeyColumnMetadata> column_metadatas;
+
+ /// Order in which fields are encoded.
+ std::vector<uint32_t> column_order;
+
+ /// Offsets within a row to fields in their encoding order.
+ std::vector<uint32_t> column_offsets;
+
+ /// Returns the number of padding bytes needed to round offset up to the
+ /// nearest multiple of the alignment value.
+ /// Alignment must be a power of 2.
+ static inline uint32_t padding_for_alignment(uint32_t offset,
+ int required_alignment) {
+ ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
+ return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
+ (required_alignment - 1));
+ }
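+ // For example (values illustrative only): padding_for_alignment(13, 8)
+ // computes (-13) & 7 == 3, and 13 + 3 == 16 is the next multiple of 8.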
+
+ /// Returns the padding needed to advance offset to the beginning of the
+ /// next column, choosing the required alignment based on the data type
+ /// of that column.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int string_alignment,
+ const KeyColumnMetadata& col_metadata) {
+ if (!col_metadata.is_fixed_length ||
+ ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
+ return 0;
+ } else {
+ return padding_for_alignment(offset, string_alignment);
+ }
+ }
+
+ /// Returns the array of end offsets of the varbinary fields within a row.
+ inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
+ }
+ inline uint32_t* varbinary_end_array(uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// Returns the offset within the row and length of the first varbinary field.
+ inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t* offset,
+ uint32_t* length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ *offset = fixed_length;
+ *length = varbinary_end_array(row)[0] - fixed_length;
+ }
+
+ /// Returns the offset within the row and the length of the second and
+ /// subsequent varbinary fields.
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int varbinary_id,
+ uint32_t* out_offset,
+ uint32_t* out_length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ ARROW_DCHECK(varbinary_id > 0);
+ const uint32_t* varbinary_end = varbinary_end_array(row);
+ uint32_t offset = varbinary_end[varbinary_id - 1];
+ offset += padding_for_alignment(offset, string_alignment);
+ *out_offset = offset;
+ *out_length = varbinary_end[varbinary_id] - offset;
+ }
+
+ uint32_t encoded_field_order(uint32_t icol) const { return column_order[icol]; }
+
+ uint32_t encoded_field_offset(uint32_t icol) const { return column_offsets[icol]; }
+
+ uint32_t num_cols() const { return static_cast<uint32_t>(column_metadatas.size()); }
+
+ uint32_t num_varbinary_cols() const;
+
+ void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
+ int in_row_alignment, int in_string_alignment);
+
+ bool is_compatible(const KeyRowMetadata& other) const;
+ };
+
+ class KeyRowArray {
+ public:
+ KeyRowArray();
+ Status Init(MemoryPool* pool, const KeyRowMetadata& metadata);
+ void Clean();
+ Status AppendEmpty(uint32_t num_rows_to_append, uint32_t num_extra_bytes_to_append);
+ Status AppendSelectionFrom(const KeyRowArray& from, uint32_t num_rows_to_append,
+ const uint16_t* source_row_ids);
+ const KeyRowMetadata& metadata() const { return metadata_; }
+ int64_t length() const { return num_rows_; }
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i < max_buffers_);
+ return buffers_[i];
+ }
+ uint8_t* mutable_data(int i) {
+ ARROW_DCHECK(i >= 0 && i < max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint32_t* offsets() const { return reinterpret_cast<const uint32_t*>(data(1)); }
+ uint32_t* mutable_offsets() { return reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint8_t* null_masks() const { return null_masks_->data(); }
+ uint8_t* null_masks() { return null_masks_->mutable_data(); }
+
+ bool has_any_nulls(const KeyEncoderContext* ctx) const;
+
+ private:
+ Status ResizeFixedLengthBuffers(int64_t num_extra_rows);
+ Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);
+
+ int64_t size_null_masks(int64_t num_rows);
+ int64_t size_offsets(int64_t num_rows);
+ int64_t size_rows_fixed_length(int64_t num_rows);
+ int64_t size_rows_varying_length(int64_t num_bytes);
+ void update_buffer_pointers();
+
+ static constexpr int64_t padding_for_vectors = 64;
+ MemoryPool* pool_;
+ KeyRowMetadata metadata_;
+ /// Buffers can only expand during lifetime and never shrink.
+ std::unique_ptr<ResizableBuffer> null_masks_;
+ std::unique_ptr<ResizableBuffer> offsets_;
+ std::unique_ptr<ResizableBuffer> rows_;
+ static constexpr int max_buffers_ = 3;
+ const uint8_t* buffers_[max_buffers_];
+ uint8_t* mutable_buffers_[max_buffers_];
+ int64_t num_rows_;
+ int64_t rows_capacity_;
+ int64_t bytes_capacity_;
+
+ // Mutable to allow lazy evaluation
+ mutable int64_t num_rows_for_has_any_nulls_;
+ mutable bool has_any_nulls_;
+ };
+
+ /// A lightweight description of an array representing one of key columns.
+ class KeyColumnArray {
+ public:
+ KeyColumnArray() = default;
+ /// Create a column array that replicates the buffers of the left input,
+ /// except for the buffer with the given id, which is taken from the
+ /// right input. Metadata is taken from the explicit metadata argument.
+ KeyColumnArray(const KeyColumnMetadata& metadata, const KeyColumnArray& left,
+ const KeyColumnArray& right, int buffer_id_to_replace);
+ /// Create for reading
+ KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length,
+ const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* buffer2,
+ int bit_offset0 = 0, int bit_offset1 = 0);
+ /// Create for writing
+ KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length, uint8_t* buffer0,
+ uint8_t* buffer1, uint8_t* buffer2, int bit_offset0 = 0,
+ int bit_offset1 = 0);
+ /// Create a window view of the original array that is offset by a given
+ /// number of rows.
+ /// The number of rows used as the offset must be divisible by 8 in order
+ /// not to split bit vectors within a single byte.
+ KeyColumnArray(const KeyColumnArray& from, int64_t start, int64_t length);
+ uint8_t* mutable_data(int i) {
+ ARROW_DCHECK(i >= 0 && i < max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i < max_buffers_);
+ return buffers_[i];
+ }
+ uint32_t* mutable_offsets() { return reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint32_t* offsets() const { return reinterpret_cast<const uint32_t*>(data(1)); }
+ const KeyColumnMetadata& metadata() const { return metadata_; }
+ int64_t length() const { return length_; }
+ int bit_offset(int i) const {
+ ARROW_DCHECK(i >= 0 && i < max_buffers_ - 1);
+ return bit_offset_[i];
+ }
+
+ private:
+ static constexpr int max_buffers_ = 3;
+ const uint8_t* buffers_[max_buffers_];
+ uint8_t* mutable_buffers_[max_buffers_];
+ KeyColumnMetadata metadata_;
+ int64_t length_;
+ // Starting bit offset within the first byte (between 0 and 7)
+ // to be used when accessing buffers that store bit vectors.
+ int bit_offset_[max_buffers_ - 1];
+ };
+
+ void Init(const std::vector<KeyColumnMetadata>& cols, KeyEncoderContext* ctx,
+ int row_alignment, int string_alignment);
+
+ const KeyRowMetadata& row_metadata() { return row_metadata_; }
+
+ /// Find out the required sizes of all output buffers for encoding
+ /// (including varying-length buffers).
+ /// Use that information to resize the provided row array so that it can
+ /// fit the encoded data.
+ Status PrepareOutputForEncode(int64_t start_input_row, int64_t num_input_rows,
+ KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& all_cols);
+
+ /// Encode a window of column-oriented data into the entire output
+ /// row-oriented storage.
+ /// The output buffers for encoding need to be correctly sized before
+ /// encoding starts.
+ void Encode(int64_t start_input_row, int64_t num_input_rows, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& cols);
+
+ /// Decode a window of row-oriented data into a corresponding
+ /// window of column-oriented storage.
+ /// The output buffers need to be correctly allocated and sized before
+ /// calling each method.
+ /// For that reason decoding is split into two functions.
+ /// The output of the first one, which processes everything except
+ /// varying-length buffers, can be used to find out the required sizes of
+ /// the varying-length buffers.
+ void DecodeFixedLengthBuffers(int64_t start_row_input, int64_t start_row_output,
+ int64_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols);
+
+ void DecodeVaryingLengthBuffers(int64_t start_row_input, int64_t start_row_output,
+ int64_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols);
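+
+ // A minimal round-trip sketch (illustrative only; names such as
+ // column_metadatas, ctx, pool, cols and out_cols are placeholders, and
+ // allocation of the varying-length output buffers is elided):
+ //
+ //   KeyEncoder encoder;
+ //   encoder.Init(column_metadatas, &ctx, /*row_alignment=*/8,
+ //                /*string_alignment=*/8);
+ //   KeyRowArray rows;
+ //   RETURN_NOT_OK(rows.Init(pool, encoder.row_metadata()));
+ //   RETURN_NOT_OK(encoder.PrepareOutputForEncode(0, num_rows, &rows, cols));
+ //   encoder.Encode(0, num_rows, &rows, cols);
+ //   encoder.DecodeFixedLengthBuffers(0, 0, num_rows, rows, &out_cols);
+ //   // ...allocate varying-length output buffers from decoded offsets...
+ //   encoder.DecodeVaryingLengthBuffers(0, 0, num_rows, rows, &out_cols);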
+
+ private:
+ /// Prepare column array vectors.
+ /// Output column arrays represent a range of input column arrays
+ /// specified by starting row and number of rows.
+ /// Three vectors are generated:
+ /// - all columns
+ /// - fixed-length columns only
+ /// - varying-length columns only
+ void PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
+ const std::vector<KeyColumnArray>& cols_in);
+
+ class TransformBoolean {
+ public:
+ static KeyColumnArray ArrayReplace(const KeyColumnArray& column,
+ const KeyColumnArray& temp);
+ static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+ static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+ };
+
+ class EncoderInteger {
+ public:
+ static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp);
+ static bool UsesTransform(const KeyColumnArray& column);
+ static KeyColumnArray ArrayReplace(const KeyColumnArray& column,
+ const KeyColumnArray& temp);
+ static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+ static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+
+ private:
+ static bool IsBoolean(const KeyColumnMetadata& metadata);
+ };
+
+ class EncoderBinary {
+ public:
+ static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp);
+ static bool IsInteger(const KeyColumnMetadata& metadata);
+
+ private:
+ template <bool is_row_fixed_length, bool is_encoding, class COPY_FN>
+ static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray* rows_const,
+ KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const,
+ KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn);
+ template <bool is_row_fixed_length>
+ static void EncodeImp(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool is_row_fixed_length>
+ static void DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#if defined(ARROW_HAVE_AVX2)
+ static void EncodeHelper_avx2(bool is_row_fixed_length, uint32_t offset_within_row,
+ KeyRowArray* rows, const KeyColumnArray& col);
+ static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row,
+ uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col);
+ template <bool is_row_fixed_length>
+ static void EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool is_row_fixed_length>
+ static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#endif
+ static void ColumnMemsetNulls(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp_vector_16bit, uint8_t byte_value);
+ template <bool is_row_fixed_length, uint32_t col_width>
+ static void ColumnMemsetNullsImp(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp_vector_16bit,
+ uint8_t byte_value);
+ };
+
+ class EncoderBinaryPair {
+ public:
+ static bool CanProcessPair(const KeyColumnMetadata& col1,
+ const KeyColumnMetadata& col2) {
+ return EncoderBinary::IsInteger(col1) && EncoderBinary::IsInteger(col2);
+ }
+ static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1, const KeyColumnArray& col2,
+ KeyEncoderContext* ctx, KeyColumnArray* temp1,
+ KeyColumnArray* temp2);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col1,
+ KeyColumnArray* col2, KeyEncoderContext* ctx,
+ KeyColumnArray* temp1, KeyColumnArray* temp2);
+
+ private:
+ template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+ static void EncodeImp(uint32_t num_rows_to_skip, uint32_t offset_within_row,
+ KeyRowArray* rows, const KeyColumnArray& col1,
+ const KeyColumnArray& col2);
+ template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+ static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row,
+ uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col1,
+ KeyColumnArray* col2);
+#if defined(ARROW_HAVE_AVX2)
+ static uint32_t EncodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width,
+ uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2);
+ static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width,
+ uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col1, KeyColumnArray* col2);
+ template <bool is_row_fixed_length, uint32_t col_width>
+ static uint32_t EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2);
+ template <bool is_row_fixed_length, uint32_t col_width>
+ static uint32_t DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col1, KeyColumnArray* col2);
+#endif
+ };
+
+ class EncoderOffsets {
+ public:
+ // To avoid doing two passes over the data, encoding computes
+ // in a single pass both:
+ // a) row offsets for varying-length rows
+ // b) within each new row, the cumulative length array
+ // of varying-length values within that row.
+ static void Encode(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols,
+ KeyEncoderContext* ctx);
+ static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* varbinary_cols,
+ const std::vector<uint32_t>& varbinary_cols_base_offset,
+ KeyEncoderContext* ctx);
+
+ private:
+ static void EncodeImp(uint32_t num_rows_already_processed, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols);
+#if defined(ARROW_HAVE_AVX2)
+ static uint32_t EncodeImp_avx2(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols,
+ KeyColumnArray* temp_buffer_32B_per_col);
+#endif
+ };
+
+ class EncoderVarBinary {
+ public:
+ static void Encode(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx);
+
+ private:
+ template <bool first_varbinary_col, bool is_encoding, class COPY_FN>
+ static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id,
+ const KeyRowArray* rows_const,
+ KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const,
+ KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn);
+ template <bool first_varbinary_col>
+ static void EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool first_varbinary_col>
+ static void DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#if defined(ARROW_HAVE_AVX2)
+ static void EncodeHelper_avx2(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id, const KeyRowArray& rows,
+ KeyColumnArray* col);
+ template <bool first_varbinary_col>
+ static void EncodeImp_avx2(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool first_varbinary_col>
+ static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#endif
+ };
+
+ class EncoderNulls {
+ public:
+ static void Encode(KeyRowArray* rows, const std::vector<KeyColumnArray>& cols,
+ KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit);
+ static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols);
+ };
+
+ KeyEncoderContext* ctx_;
+
+ // Data initialized once, based on data types of key columns
+ KeyRowMetadata row_metadata_;
+
+ // Data initialized for each input batch.
+ // All elements are ordered according to the order of encoded fields in a row.
+ std::vector<KeyColumnArray> batch_all_cols_;
+ std::vector<KeyColumnArray> batch_varbinary_cols_;
+ std::vector<uint32_t> batch_varbinary_cols_base_offsets_;
+};
+
+template <bool is_row_fixed_length, bool is_encoding, class COPY_FN>
+inline void KeyEncoder::EncoderBinary::EncodeDecodeHelper(
+ uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn) {
+ ARROW_DCHECK(col_const && col_const->metadata().is_fixed_length);
+ uint32_t col_width = col_const->metadata().fixed_length;
+
+ if (is_row_fixed_length) {
+ uint32_t row_width = rows_const->metadata().fixed_length;
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ const uint8_t* src;
+ uint8_t* dst;
+ if (is_encoding) {
+ src = col_const->data(1) + col_width * i;
+ dst = rows_mutable_maybe_null->mutable_data(1) + row_width * (start_row + i) +
+ offset_within_row;
+ } else {
+ src = rows_const->data(1) + row_width * (start_row + i) + offset_within_row;
+ dst = col_mutable_maybe_null->mutable_data(1) + col_width * i;
+ }
+ copy_fn(dst, src, col_width);
+ }
+ } else {
+ const uint32_t* row_offsets = rows_const->offsets();
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ const uint8_t* src;
+ uint8_t* dst;
+ if (is_encoding) {
+ src = col_const->data(1) + col_width * i;
+ dst = rows_mutable_maybe_null->mutable_data(2) + row_offsets[start_row + i] +
+ offset_within_row;
+ } else {
+ src = rows_const->data(2) + row_offsets[start_row + i] + offset_within_row;
+ dst = col_mutable_maybe_null->mutable_data(1) + col_width * i;
+ }
+ copy_fn(dst, src, col_width);
+ }
+ }
+}
+
+template <bool first_varbinary_col, bool is_encoding, class COPY_FN>
+inline void KeyEncoder::EncoderVarBinary::EncodeDecodeHelper(
+ uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id,
+ const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn) {
+ // Column and rows need to be varying length
+ ARROW_DCHECK(!rows_const->metadata().is_fixed_length &&
+ !col_const->metadata().is_fixed_length);
+
+ const uint32_t* row_offsets_for_batch = rows_const->offsets() + start_row;
+ const uint32_t* col_offsets = col_const->offsets();
+
+ uint32_t col_offset_next = col_offsets[0];
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ uint32_t col_offset = col_offset_next;
+ col_offset_next = col_offsets[i + 1];
+
+ uint32_t row_offset = row_offsets_for_batch[i];
+ const uint8_t* row = rows_const->data(2) + row_offset;
+
+ uint32_t offset_within_row;
+ uint32_t length;
+ if (first_varbinary_col) {
+ rows_const->metadata().first_varbinary_offset_and_length(row, &offset_within_row,
+ &length);
+ } else {
+ rows_const->metadata().nth_varbinary_offset_and_length(row, varbinary_col_id,
+ &offset_within_row, &length);
+ }
+
+ row_offset += offset_within_row;
+
+ const uint8_t* src;
+ uint8_t* dst;
+ if (is_encoding) {
+ src = col_const->data(2) + col_offset;
+ dst = rows_mutable_maybe_null->mutable_data(2) + row_offset;
+ } else {
+ src = rows_const->data(2) + row_offset;
+ dst = col_mutable_maybe_null->mutable_data(2) + col_offset;
+ }
+ copy_fn(dst, src, length);
+ }
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc
new file mode 100644
index 00000000000..081411e708e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc
@@ -0,0 +1,238 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_hash.h"
+
+#include <cstring>
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/compute/exec/util.h"
+
+namespace arrow {
+namespace compute {
+
+inline uint32_t Hashing::avalanche_helper(uint32_t acc) {
+ acc ^= (acc >> 15);
+ acc *= PRIME32_2;
+ acc ^= (acc >> 13);
+ acc *= PRIME32_3;
+ acc ^= (acc >> 16);
+ return acc;
+}
+
+void Hashing::avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes) {
+ uint32_t processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ int tail = num_keys % 8;
+ avalanche_avx2(num_keys - tail, hashes);
+ processed = num_keys - tail;
+ }
+#endif
+ for (uint32_t i = processed; i < num_keys; ++i) {
+ hashes[i] = avalanche_helper(hashes[i]);
+ }
+}
+
+inline uint32_t Hashing::combine_accumulators(const uint32_t acc1, const uint32_t acc2,
+ const uint32_t acc3, const uint32_t acc4) {
+ return ROTL(acc1, 1) + ROTL(acc2, 7) + ROTL(acc3, 12) + ROTL(acc4, 18);
+}
+
+inline void Hashing::helper_8B(uint32_t key_length, uint32_t num_keys,
+ const uint8_t* keys, uint32_t* hashes) {
+ ARROW_DCHECK(key_length <= 8);
+ uint64_t mask = ~0ULL >> (8 * (8 - key_length));
+ constexpr uint64_t multiplier = 14029467366897019727ULL;
+ uint32_t offset = 0;
+ for (uint32_t ikey = 0; ikey < num_keys; ++ikey) {
+ uint64_t x = *reinterpret_cast<const uint64_t*>(keys + offset);
+ x &= mask;
+ hashes[ikey] = static_cast<uint32_t>(BYTESWAP(x * multiplier));
+ offset += key_length;
+ }
+}
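+// For example (illustrative): with key_length = 3 the mask is
+// ~0ULL >> 40 == 0xFFFFFF, so only the 3 key bytes of each loaded 64-bit
+// word contribute to the multiply-and-byteswap hash.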
+
+inline void Hashing::helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys,
+ uint32_t& acc1, uint32_t& acc2, uint32_t& acc3,
+ uint32_t& acc4) {
+ uint64_t v1 = reinterpret_cast<const uint64_t*>(keys + offset)[0];
+ // We do not need to mask v1, because we will not process a stripe
+ // unless at least 9 bytes of it are part of the key.
+ uint64_t v2 = reinterpret_cast<const uint64_t*>(keys + offset)[1];
+ v2 &= mask_hi;
+ uint32_t x1 = static_cast<uint32_t>(v1);
+ uint32_t x2 = static_cast<uint32_t>(v1 >> 32);
+ uint32_t x3 = static_cast<uint32_t>(v2);
+ uint32_t x4 = static_cast<uint32_t>(v2 >> 32);
+ acc1 += x1 * PRIME32_2;
+ acc1 = ROTL(acc1, 13) * PRIME32_1;
+ acc2 += x2 * PRIME32_2;
+ acc2 = ROTL(acc2, 13) * PRIME32_1;
+ acc3 += x3 * PRIME32_2;
+ acc3 = ROTL(acc3, 13) * PRIME32_1;
+ acc4 += x4 * PRIME32_2;
+ acc4 = ROTL(acc4, 13) * PRIME32_1;
+}
+
+void Hashing::helper_stripes(int64_t hardware_flags, uint32_t num_keys,
+ uint32_t key_length, const uint8_t* keys, uint32_t* hash) {
+ uint32_t processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ int tail = num_keys % 2;
+ helper_stripes_avx2(num_keys - tail, key_length, keys, hash);
+ processed = num_keys - tail;
+ }
+#endif
+
+ // If the key length modulo the 16B stripe length is less than or equal
+ // to 8, round the stripe count down (the remaining 8B tail will be
+ // processed in a separate function), otherwise round it up.
+ const uint32_t num_stripes = (key_length + 7) / 16;
+ uint64_t mask_hi =
+ ~0ULL >>
+ (8 * ((num_stripes * 16 > key_length) ? num_stripes * 16 - key_length : 0));
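+ // For example (illustrative): key_length = 25 gives num_stripes = 2; the
+ // first stripe consumes bytes [0, 16) unmasked and the last consumes
+ // bytes [16, 25), with mask_hi = ~0ULL >> 56 keeping only the single
+ // valid byte of its upper 64-bit half.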
+
+ for (uint32_t i = processed; i < num_keys; ++i) {
+ uint32_t acc1, acc2, acc3, acc4;
+ acc1 = static_cast<uint32_t>(
+ (static_cast<uint64_t>(PRIME32_1) + static_cast<uint64_t>(PRIME32_2)) &
+ 0xffffffff);
+ acc2 = PRIME32_2;
+ acc3 = 0;
+ acc4 = static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1));
+ uint32_t offset = i * key_length;
+ for (uint32_t stripe = 0; stripe < num_stripes - 1; ++stripe) {
+ helper_stripe(offset, ~0ULL, keys, acc1, acc2, acc3, acc4);
+ offset += 16;
+ }
+ helper_stripe(offset, mask_hi, keys, acc1, acc2, acc3, acc4);
+ hash[i] = combine_accumulators(acc1, acc2, acc3, acc4);
+ }
+}
+
+inline uint32_t Hashing::helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys,
+ uint32_t acc) {
+ uint64_t v = reinterpret_cast<const uint64_t*>(keys + offset)[0];
+ v &= mask;
+ uint32_t x1 = static_cast<uint32_t>(v);
+ uint32_t x2 = static_cast<uint32_t>(v >> 32);
+ acc += x1 * PRIME32_3;
+ acc = ROTL(acc, 17) * PRIME32_4;
+ acc += x2 * PRIME32_3;
+ acc = ROTL(acc, 17) * PRIME32_4;
+ return acc;
+}
+
+void Hashing::helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash) {
+ uint32_t processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ int tail = num_keys % 8;
+ helper_tails_avx2(num_keys - tail, key_length, keys, hash);
+ processed = num_keys - tail;
+ }
+#endif
+ uint64_t mask = ~0ULL >> (8 * (((key_length % 8) == 0) ? 0 : 8 - (key_length % 8)));
+ uint32_t offset = key_length / 16 * 16;
+ offset += processed * key_length;
+ for (uint32_t i = processed; i < num_keys; ++i) {
+ hash[i] = helper_tail(offset, mask, keys, hash[i]);
+ offset += key_length;
+ }
+}
+
+void Hashing::hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key,
+ const uint8_t* keys, uint32_t* hashes) {
+ ARROW_DCHECK(length_key > 0);
+
+ if (length_key <= 8) {
+ helper_8B(length_key, num_keys, keys, hashes);
+ return;
+ }
+ helper_stripes(hardware_flags, num_keys, length_key, keys, hashes);
+ if ((length_key % 16) > 0 && (length_key % 16) <= 8) {
+ helper_tails(hardware_flags, num_keys, length_key, keys, hashes);
+ }
+ avalanche(hardware_flags, num_keys, hashes);
+}
+
+void Hashing::hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc) {
+ for (uint32_t i = 0; i < length / 16; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ uint32_t lane = reinterpret_cast<const uint32_t*>(key)[i * 4 + j];
+ acc[j] += (lane * PRIME32_2);
+ acc[j] = ROTL(acc[j], 13);
+ acc[j] *= PRIME32_1;
+ }
+ }
+
+ int tail = length % 16;
+ if (tail) {
+ uint64_t last_stripe[2];
+ const uint64_t* last_stripe_base =
+ reinterpret_cast<const uint64_t*>(key + length - (length % 16));
+ last_stripe[0] = last_stripe_base[0];
+ uint64_t mask = ~0ULL >> (8 * ((length + 7) / 8 * 8 - length));
+ if (tail <= 8) {
+ last_stripe[1] = 0;
+ last_stripe[0] &= mask;
+ } else {
+ last_stripe[1] = last_stripe_base[1];
+ last_stripe[1] &= mask;
+ }
+ for (int j = 0; j < 4; ++j) {
+ uint32_t lane = reinterpret_cast<const uint32_t*>(last_stripe)[j];
+ acc[j] += (lane * PRIME32_2);
+ acc[j] = ROTL(acc[j], 13);
+ acc[j] *= PRIME32_1;
+ }
+ }
+}
+
+void Hashing::hash_varlen(int64_t hardware_flags, uint32_t num_rows,
+ const uint32_t* offsets, const uint8_t* concatenated_keys,
+ uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
+ uint32_t* hashes) {
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ hash_varlen_avx2(num_rows, offsets, concatenated_keys, temp_buffer, hashes);
+ } else {
+#endif
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ uint32_t acc[4];
+ acc[0] = static_cast<uint32_t>(
+ (static_cast<uint64_t>(PRIME32_1) + static_cast<uint64_t>(PRIME32_2)) &
+ 0xffffffff);
+ acc[1] = PRIME32_2;
+ acc[2] = 0;
+ acc[3] = static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1));
+ uint32_t length = offsets[i + 1] - offsets[i];
+ hash_varlen_helper(length, concatenated_keys + offsets[i], acc);
+ hashes[i] = combine_accumulators(acc[0], acc[1], acc[2], acc[3]);
+ }
+ avalanche(hardware_flags, num_rows, hashes);
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h
new file mode 100644
index 00000000000..7f8ab5185cc
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(ARROW_HAVE_AVX2)
+#include <immintrin.h>
+#endif
+
+#include <cstdint>
+
+#include "arrow/compute/exec/util.h"
+
+namespace arrow {
+namespace compute {
+
+// Implementations are based on xxh3 32-bit algorithm description from:
+// https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md
+//
+class Hashing {
+ public:
+ static void hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key,
+ const uint8_t* keys, uint32_t* hashes);
+
+ static void hash_varlen(int64_t hardware_flags, uint32_t num_rows,
+ const uint32_t* offsets, const uint8_t* concatenated_keys,
+ uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
+ uint32_t* hashes);
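+
+ // A minimal usage sketch (illustrative only; a real caller would take
+ // hardware_flags from arrow::internal::CpuInfo and make sure the key
+ // buffer has a few bytes of padding at the end, since full 8B/16B words
+ // are loaded even for shorter keys):
+ //
+ //   std::vector<uint8_t> keys = ...;        // num_keys keys, key_length each
+ //   std::vector<uint32_t> hashes(num_keys);
+ //   Hashing::hash_fixed(/*hardware_flags=*/0, num_keys, key_length,
+ //                       keys.data(), hashes.data());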
+
+ private:
+ static const uint32_t PRIME32_1 = 0x9E3779B1; // 0b10011110001101110111100110110001
+ static const uint32_t PRIME32_2 = 0x85EBCA77; // 0b10000101111010111100101001110111
+ static const uint32_t PRIME32_3 = 0xC2B2AE3D; // 0b11000010101100101010111000111101
+ static const uint32_t PRIME32_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
+ static const uint32_t PRIME32_5 = 0x165667B1; // 0b00010110010101100110011110110001
+
+ // Avalanche
+ static inline uint32_t avalanche_helper(uint32_t acc);
+#if defined(ARROW_HAVE_AVX2)
+ static void avalanche_avx2(uint32_t num_keys, uint32_t* hashes);
+#endif
+ static void avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes);
+
+ // Accumulator combine
+ static inline uint32_t combine_accumulators(const uint32_t acc1, const uint32_t acc2,
+ const uint32_t acc3, const uint32_t acc4);
+#if defined(ARROW_HAVE_AVX2)
+ static inline uint64_t combine_accumulators_avx2(__m256i acc);
+#endif
+
+ // Helpers
+ static inline void helper_8B(uint32_t key_length, uint32_t num_keys,
+ const uint8_t* keys, uint32_t* hashes);
+ static inline void helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys,
+ uint32_t& acc1, uint32_t& acc2, uint32_t& acc3,
+ uint32_t& acc4);
+ static inline uint32_t helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys,
+ uint32_t acc);
+#if defined(ARROW_HAVE_AVX2)
+ static void helper_stripes_avx2(uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash);
+ static void helper_tails_avx2(uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash);
+#endif
+ static void helper_stripes(int64_t hardware_flags, uint32_t num_keys,
+ uint32_t key_length, const uint8_t* keys, uint32_t* hash);
+ static void helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash);
+
+ static void hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc);
+#if defined(ARROW_HAVE_AVX2)
+ static void hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets,
+ const uint8_t* concatenated_keys,
+ uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
+ uint32_t* hashes);
+#endif
+};
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc
new file mode 100644
index 00000000000..ac47c04403c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc
@@ -0,0 +1,610 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_map.h"
+
+#include <cstring>
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+
+using BitUtil::CountLeadingZeros;
+
+namespace compute {
+
+constexpr uint64_t kHighBitOfEachByte = 0x8080808080808080ULL;
+
+// Search status bytes inside a block of 8 slots (64-bit word).
+// Try to find a slot that contains a 7-bit stamp matching the one provided.
+// There are three possible outcomes:
+// 1. A matching slot is found.
+// -> Return its index between 0 and 7 and set match found flag.
+// 2. A matching slot is not found and there is an empty slot in the block.
+// -> Return the index of the first empty slot and clear match found flag.
+// 3. A matching slot is not found and there are no empty slots in the block.
+// -> Return 8 as the output slot index and clear match found flag.
+//
+// Optionally an index of the first slot to start the search from can be specified.
+// In this case slots before it will be ignored.
+//
+template <bool use_start_slot>
+inline void SwissTable::search_block(uint64_t block, int stamp, int start_slot,
+ int* out_slot, int* out_match_found) {
+ // Filled slot bytes have the highest bit set to 0 and empty slots are equal to 0x80.
+ uint64_t block_high_bits = block & kHighBitOfEachByte;
+
+ // Replicate 7-bit stamp to all non-empty slots, leaving zeroes for empty slots.
+ uint64_t stamp_pattern = stamp * ((block_high_bits ^ kHighBitOfEachByte) >> 7);
+
+ // If we xor this pattern with block status bytes we get in individual bytes:
+ // a) 0x00, for filled slots matching the stamp,
+ // b) 0x00 < x < 0x80, for filled slots not matching the stamp,
+ // c) 0x80, for empty slots.
+ uint64_t block_xor_pattern = block ^ stamp_pattern;
+
+ // If we then add 0x7f to every byte, we get:
+ // a) 0x7F
+ // b) 0x80 <= x < 0xFF
+ // c) 0xFF
+ uint64_t match_base = block_xor_pattern + ~kHighBitOfEachByte;
+
+ // The highest bit now tells us if we have a match (0) or not (1).
+ // We will negate the bits so that match is represented by a set bit.
+ uint64_t matches = ~match_base;
+
+ // Clear 7 non-relevant bits in each byte.
+ // Also clear bytes that correspond to slots that we were supposed to
+ // skip due to provided start slot index.
+ // Note: the highest byte corresponds to the first slot.
+ if (use_start_slot) {
+ matches &= kHighBitOfEachByte >> (8 * start_slot);
+ } else {
+ matches &= kHighBitOfEachByte;
+ }
+
+ // We get 0 if there are no matches
+ *out_match_found = (matches == 0 ? 0 : 1);
+
+ // Now, if we OR with the highest bits of the block and count leading zero
+ // bits, we get 8x the slot index that we were looking for.
+ // This formula works in all three cases a), b) and c).
+ *out_slot = static_cast<int>(CountLeadingZeros(matches | block_high_bits) >> 3);
+}
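+// Worked example (illustrative): for stamp 0x2A and a block whose status
+// bytes are {0x11, 0x2A, 0x80, ..., 0x80} (slot 0 stored in the highest
+// byte), only slot 1 yields 0x7F after the xor-and-add steps, so only its
+// high bit survives negation; the result is *out_slot = 1 with the match
+// found flag set.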
+
+// This call follows the call to search_block.
+// The input slot index is the output returned by it, which is a value from 0 to 8,
+// with 8 indicating both that no match was found and that there were no empty slots.
+//
+// If the index corresponds to a non-empty slot, return the group id associated
+// with it. Otherwise return an arbitrary group id from one of the slots, or
+// zero, which is the default value stored in empty slots.
+//
+inline uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
+ uint64_t group_id_mask) {
+ // Input slot can be equal to 8, in which case we need to output any valid group id
+ // value, so we take the one from slot 0 in the block.
+ int clamped_slot = slot & 7;
+
+ // Group id values for all 8 slots in the block are bit-packed and follow the status
+ // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
+ // that case we can extract group id using aligned 64-bit word access.
+ int num_groupid_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
+ ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
+ num_groupid_bits == 32 || num_groupid_bits == 64);
+
+ int bit_offset = clamped_slot * num_groupid_bits;
+ const uint64_t* group_id_bytes =
+ reinterpret_cast<const uint64_t*>(block_ptr) + 1 + (bit_offset >> 6);
+ uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask;
+
+ return group_id;
+}
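+// For example (illustrative): with 16-bit group ids and slot = 3, the bit
+// offset is 48, so the group id is the fourth 16-bit field of the first
+// 64-bit word following the 8 status bytes of the block.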
+
+// Return global slot id (the index including the information about the block)
+// where the search should continue if the first comparison fails.
+// This function always follows search_block and receives the slot id returned by it.
+//
+inline uint64_t SwissTable::next_slot_to_visit(uint64_t block_index, int slot,
+ int match_found) {
+ // The result should be taken modulo the number of all slots in all blocks,
+ // but here we allow it to take a value one above the last slot index.
+ // Modulo operation is postponed to later.
+ return block_index * 8 + slot + match_found;
+}
+
+// Implements the first (fast-path, optimistic) lookup.
+// Searches for a match only within the start block,
+// trying only the first slot with a matching stamp.
+//
+// The comparison callback needed for match verification is invoked outside of
+// this function. The match bit vector filled in here only indicates that a
+// matching stamp was found in a slot.
+//
+template <bool use_selection>
+void SwissTable::lookup_1(const uint16_t* selection, const int num_keys,
+ const uint32_t* hashes, uint8_t* out_match_bitvector,
+ uint32_t* out_groupids, uint32_t* out_slot_ids) {
+ // Clear the output bit vector
+ memset(out_match_bitvector, 0, (num_keys + 7) / 8);
+
+ // Based on the size of the table, prepare bit number constants.
+ uint32_t stamp_mask = (1 << bits_stamp_) - 1;
+ int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ uint32_t groupid_mask = (1 << num_groupid_bits) - 1;
+
+ for (int i = 0; i < num_keys; ++i) {
+ int id;
+ if (use_selection) {
+ id = util::SafeLoad(&selection[i]);
+ } else {
+ id = i;
+ }
+
+ // Extract from hash: block index and stamp
+ //
+ uint32_t hash = hashes[id];
+ uint32_t iblock = hash >> (bits_hash_ - bits_stamp_ - log_blocks_);
+ uint32_t stamp = iblock & stamp_mask;
+ iblock >>= bits_stamp_;
+
+ uint32_t num_block_bytes = num_groupid_bits + 8;
+ const uint8_t* blockbase = reinterpret_cast<const uint8_t*>(blocks_) +
+ static_cast<uint64_t>(iblock) * num_block_bytes;
+ uint64_t block = util::SafeLoadAs<uint64_t>(blockbase);
+
+ // Call helper functions to obtain the output triplet:
+ // - match (of a stamp) found flag
+ // - group id for key comparison
+ // - slot to resume search from in case of no match or false positive
+ int match_found;
+ int islot_in_block;
+ search_block<false>(block, stamp, 0, &islot_in_block, &match_found);
+ uint64_t groupid = extract_group_id(blockbase, islot_in_block, groupid_mask);
+ ARROW_DCHECK(groupid < num_inserted_ || num_inserted_ == 0);
+ uint64_t islot = next_slot_to_visit(iblock, islot_in_block, match_found);
+
+ out_match_bitvector[id / 8] |= match_found << (id & 7);
+ util::SafeStore(&out_groupids[id], static_cast<uint32_t>(groupid));
+ util::SafeStore(&out_slot_ids[id], static_cast<uint32_t>(islot));
+ }
+}
+
+// How many groups we can keep in the hash table without the need for resizing.
+// When we reach this limit, we need to break processing of any further rows and resize.
+//
+uint64_t SwissTable::num_groups_for_resize() const {
+ // Resize small hash tables when 50% full (up to 12KB).
+ // Resize large hash tables when 75% full.
+ constexpr int log_blocks_small_ = 9;
+ uint64_t num_slots = 1ULL << (log_blocks_ + 3);
+ if (log_blocks_ <= log_blocks_small_) {
+ return num_slots / 2;
+ } else {
+ return num_slots * 3 / 4;
+ }
+}
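+// For example (illustrative): log_blocks_ = 9 gives 1 << 12 == 4096 slots
+// and a resize threshold of 2048 groups, while log_blocks_ = 10 gives 8192
+// slots and a threshold of 6144 groups.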
+
+uint64_t SwissTable::wrap_global_slot_id(uint64_t global_slot_id) {
+ uint64_t global_slot_id_mask = (uint64_t(1) << (log_blocks_ + 3)) - 1;
+ return global_slot_id & global_slot_id_mask;
+}
+
+// Run a single round of slot search - comparison / insert - filter unprocessed.
+// Update selection vector to reflect which items have been processed.
+// Ids in selection vector do not have to be sorted.
+//
+Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected,
+ uint16_t* inout_selection, bool* out_need_resize,
+ uint32_t* out_group_ids, uint32_t* inout_next_slot_ids) {
+ auto num_groups_limit = num_groups_for_resize();
+ ARROW_DCHECK(num_inserted_ < num_groups_limit);
+
+ // Temporary arrays are of limited size.
+ // The input needs to be split into smaller portions if it exceeds that limit.
+ //
+ ARROW_DCHECK(*inout_num_selected <= static_cast<uint32_t>(1 << log_minibatch_));
+
+ // We will split input row ids into three categories:
+ // - needing to visit next block [0]
+ // - needing comparison [1]
+ // - inserted [2]
+ //
+ auto ids_inserted_buf =
+ util::TempVectorHolder<uint16_t>(temp_stack_, *inout_num_selected);
+ auto ids_for_comparison_buf =
+ util::TempVectorHolder<uint16_t>(temp_stack_, *inout_num_selected);
+ constexpr int category_nomatch = 0;
+ constexpr int category_cmp = 1;
+ constexpr int category_inserted = 2;
+ int num_ids[3];
+ num_ids[0] = num_ids[1] = num_ids[2] = 0;
+ uint16_t* ids[3]{inout_selection, ids_for_comparison_buf.mutable_data(),
+ ids_inserted_buf.mutable_data()};
+ auto push_id = [&num_ids, &ids](int category, int id) {
+ util::SafeStore(&ids[category][num_ids[category]++], static_cast<uint16_t>(id));
+ };
+
+ uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ uint64_t groupid_mask = (1ULL << num_groupid_bits) - 1;
+ constexpr uint64_t stamp_mask = 0x7f;
+ uint64_t num_block_bytes = (8 + num_groupid_bits);
+
+ uint32_t num_processed;
+ for (num_processed = 0;
+ // Second condition in for loop:
+ // We need to break processing and have the caller of this function
+ // resize hash table if we reach the limit of the number of groups present.
+ num_processed < *inout_num_selected &&
+ num_inserted_ + num_ids[category_inserted] < num_groups_limit;
+ ++num_processed) {
+ // row id in original batch
+ int id = util::SafeLoad(&inout_selection[num_processed]);
+
+ uint64_t slot_id = wrap_global_slot_id(util::SafeLoad(&inout_next_slot_ids[id]));
+ uint64_t block_id = slot_id >> 3;
+ uint32_t hash = hashes[id];
+ uint8_t* blockbase = blocks_ + num_block_bytes * block_id;
+ uint64_t block = *reinterpret_cast<uint64_t*>(blockbase);
+ uint64_t stamp = (hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask;
+ int start_slot = (slot_id & 7);
+
+ bool isempty = (blockbase[7 - start_slot] == 0x80);
+ if (isempty) {
+ // If we reach an empty slot, we insert the key for a new group.
+
+ blockbase[7 - start_slot] = static_cast<uint8_t>(stamp);
+ uint32_t group_id = num_inserted_ + num_ids[category_inserted];
+ int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
+
+ // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
+ // In that case we can insert group id value using aligned 64-bit word access.
+ ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
+ num_groupid_bits == 32 || num_groupid_bits == 64);
+ uint64_t* ptr =
+ &reinterpret_cast<uint64_t*>(blockbase + 8)[groupid_bit_offset >> 6];
+ util::SafeStore(ptr, util::SafeLoad(ptr) | (static_cast<uint64_t>(group_id)
+ << (groupid_bit_offset & 63)));
+
+ hashes_[slot_id] = hash;
+ util::SafeStore(&out_group_ids[id], group_id);
+ push_id(category_inserted, id);
+ } else {
+ // We search for a slot with a matching stamp within a single block.
+ // We append row id to the appropriate sequence of ids based on
+ // whether the match has been found or not.
+
+ int new_match_found;
+ int new_slot;
+ search_block<true>(block, static_cast<int>(stamp), start_slot, &new_slot,
+ &new_match_found);
+ auto new_groupid =
+ static_cast<uint32_t>(extract_group_id(blockbase, new_slot, groupid_mask));
+ ARROW_DCHECK(new_groupid < num_inserted_ + num_ids[category_inserted]);
+ new_slot =
+ static_cast<int>(next_slot_to_visit(block_id, new_slot, new_match_found));
+ util::SafeStore(&inout_next_slot_ids[id], new_slot);
+ util::SafeStore(&out_group_ids[id], new_groupid);
+ push_id(new_match_found, id);
+ }
+ }
+
+ // Copy keys for newly inserted rows using callback
+ RETURN_NOT_OK(append_impl_(num_ids[category_inserted], ids[category_inserted]));
+ num_inserted_ += num_ids[category_inserted];
+
+ // Evaluate comparisons and append ids of rows that failed it to the non-match set.
+ uint32_t num_not_equal;
+ equal_impl_(num_ids[category_cmp], ids[category_cmp], out_group_ids, &num_not_equal,
+ ids[category_nomatch] + num_ids[category_nomatch]);
+ num_ids[category_nomatch] += num_not_equal;
+
+ // Append ids of any unprocessed entries if we aborted processing due to the need
+ // to resize.
+ if (num_processed < *inout_num_selected) {
+ memmove(ids[category_nomatch] + num_ids[category_nomatch],
+ inout_selection + num_processed,
+ sizeof(uint16_t) * (*inout_num_selected - num_processed));
+ num_ids[category_nomatch] += (*inout_num_selected - num_processed);
+ }
+
+ *out_need_resize = (num_inserted_ == num_groups_limit);
+ *inout_num_selected = num_ids[category_nomatch];
+ return Status::OK();
+}
+
+// Use hashes and callbacks to find group ids for already existing keys and
+// to insert and report newly assigned group ids for new keys.
+//
+Status SwissTable::map(const int num_keys, const uint32_t* hashes,
+ uint32_t* out_groupids) {
+ // Temporary buffers have limited size.
+ // Caller is responsible for splitting larger input arrays into smaller chunks.
+ ARROW_DCHECK(num_keys <= (1 << log_minibatch_));
+
+ // Allocate temporary buffers with a lifetime of this function
+ auto match_bitvector_buf = util::TempVectorHolder<uint8_t>(temp_stack_, num_keys);
+ uint8_t* match_bitvector = match_bitvector_buf.mutable_data();
+ auto slot_ids_buf = util::TempVectorHolder<uint32_t>(temp_stack_, num_keys);
+ uint32_t* slot_ids = slot_ids_buf.mutable_data();
+ auto ids_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys);
+ uint16_t* ids = ids_buf.mutable_data();
+ uint32_t num_ids;
+
+ // First-pass processing.
+ // Optimistically use simplified lookup involving only a start block to find
+ // a single group id candidate for every input.
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) {
+ if (log_blocks_ <= 4) {
+ int tail = num_keys % 32;
+ int delta = num_keys - tail;
+ lookup_1_avx2_x32(num_keys - tail, hashes, match_bitvector, out_groupids, slot_ids);
+ lookup_1_avx2_x8(tail, hashes + delta, match_bitvector + delta / 8,
+ out_groupids + delta, slot_ids + delta);
+ } else {
+ lookup_1_avx2_x8(num_keys, hashes, match_bitvector, out_groupids, slot_ids);
+ }
+ } else {
+#endif
+ lookup_1<false>(nullptr, num_keys, hashes, match_bitvector, out_groupids, slot_ids);
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+
+ int64_t num_matches =
+ arrow::internal::CountSetBits(match_bitvector, /*offset=*/0, num_keys);
+
+ // After the first-pass processing count rows with matches (based on stamp comparison)
+ // and decide based on their percentage whether to call dense or sparse comparison
+ // function. Dense comparison means evaluating it for all inputs, even if the matching
+ // stamp was not found. It may be cheaper to evaluate comparison for all inputs if the
+ // extra cost of filtering is higher than the wasted processing of rows with no match.
+ //
+ // Dense comparison can only be used if there is at least one inserted key,
+ // because otherwise there is no key to compare to.
+ //
+ if (num_inserted_ > 0 && num_matches > 0 && num_matches > 3 * num_keys / 4) {
+ // Dense comparisons
+ equal_impl_(num_keys, nullptr, out_groupids, &num_ids, ids);
+ } else {
+ // Sparse comparisons that involve filtering the input set of keys
+ auto ids_cmp_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys);
+ uint16_t* ids_cmp = ids_cmp_buf.mutable_data();
+ int num_ids_result;
+ util::BitUtil::bits_split_indexes(hardware_flags_, num_keys, match_bitvector,
+ &num_ids_result, ids, ids_cmp);
+ num_ids = num_ids_result;
+ uint32_t num_not_equal;
+ equal_impl_(num_keys - num_ids, ids_cmp, out_groupids, &num_not_equal, ids + num_ids);
+ num_ids += num_not_equal;
+ }
+
+ do {
+ // A single round of slow-pass (robust) lookup or insert.
+ // A single round ends with either a single comparison verifying the match candidate
+ // or inserting a new key. A single round of slow-pass may return early if we reach
+    // the limit of the number of groups due to inserts of new keys. In that case we
+    // need to resize and recalculate the starting global slot ids for the new, bigger
+    // hash table.
+ bool out_of_capacity;
+ RETURN_NOT_OK(
+ lookup_2(hashes, &num_ids, ids, &out_of_capacity, out_groupids, slot_ids));
+ if (out_of_capacity) {
+ RETURN_NOT_OK(grow_double());
+ // Reset start slot ids for still unprocessed input keys.
+ //
+ for (uint32_t i = 0; i < num_ids; ++i) {
+ // First slot in the new starting block
+ const int16_t id = util::SafeLoad(&ids[i]);
+ util::SafeStore(&slot_ids[id], (hashes[id] >> (bits_hash_ - log_blocks_)) * 8);
+ }
+ }
+ } while (num_ids > 0);
+
+ return Status::OK();
+}
+
+Status SwissTable::grow_double() {
+ // Before and after metadata
+ int num_group_id_bits_before = num_groupid_bits_from_log_blocks(log_blocks_);
+ int num_group_id_bits_after = num_groupid_bits_from_log_blocks(log_blocks_ + 1);
+ uint64_t group_id_mask_before = ~0ULL >> (64 - num_group_id_bits_before);
+ int log_blocks_before = log_blocks_;
+ int log_blocks_after = log_blocks_ + 1;
+ uint64_t block_size_before = (8 + num_group_id_bits_before);
+ uint64_t block_size_after = (8 + num_group_id_bits_after);
+ uint64_t block_size_total_before = (block_size_before << log_blocks_before) + padding_;
+ uint64_t block_size_total_after = (block_size_after << log_blocks_after) + padding_;
+ uint64_t hashes_size_total_before =
+ (bits_hash_ / 8 * (1 << (log_blocks_before + 3))) + padding_;
+ uint64_t hashes_size_total_after =
+ (bits_hash_ / 8 * (1 << (log_blocks_after + 3))) + padding_;
+ constexpr uint32_t stamp_mask = (1 << bits_stamp_) - 1;
+
+ // Allocate new buffers
+ uint8_t* blocks_new;
+ RETURN_NOT_OK(pool_->Allocate(block_size_total_after, &blocks_new));
+ memset(blocks_new, 0, block_size_total_after);
+ uint8_t* hashes_new_8B;
+ uint32_t* hashes_new;
+ RETURN_NOT_OK(pool_->Allocate(hashes_size_total_after, &hashes_new_8B));
+ hashes_new = reinterpret_cast<uint32_t*>(hashes_new_8B);
+
+ // First pass over all old blocks.
+  // Reinsert entries that were not overflow entries (an overflow entry is stored in a
+  // block other than the one selected by the hash bits of the entry).
+ for (int i = 0; i < (1 << log_blocks_); ++i) {
+ // How many full slots in this block
+ uint8_t* block_base = blocks_ + i * block_size_before;
+ uint8_t* double_block_base_new = blocks_new + 2 * i * block_size_after;
+ uint64_t block = *reinterpret_cast<const uint64_t*>(block_base);
+
+ auto full_slots =
+ static_cast<int>(CountLeadingZeros(block & kHighBitOfEachByte) >> 3);
+ int full_slots_new[2];
+ full_slots_new[0] = full_slots_new[1] = 0;
+ util::SafeStore(double_block_base_new, kHighBitOfEachByte);
+ util::SafeStore(double_block_base_new + block_size_after, kHighBitOfEachByte);
+
+ for (int j = 0; j < full_slots; ++j) {
+ uint64_t slot_id = i * 8 + j;
+ uint32_t hash = hashes_[slot_id];
+ uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after);
+ bool is_overflow_entry = ((block_id_new >> 1) != static_cast<uint64_t>(i));
+ if (is_overflow_entry) {
+ continue;
+ }
+
+ int ihalf = block_id_new & 1;
+ uint8_t stamp_new =
+ hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask;
+ uint64_t group_id_bit_offs = j * num_group_id_bits_before;
+ uint64_t group_id =
+ (util::SafeLoadAs<uint64_t>(block_base + 8 + (group_id_bit_offs >> 3)) >>
+ (group_id_bit_offs & 7)) &
+ group_id_mask_before;
+
+ uint64_t slot_id_new = i * 16 + ihalf * 8 + full_slots_new[ihalf];
+ hashes_new[slot_id_new] = hash;
+ uint8_t* block_base_new = double_block_base_new + ihalf * block_size_after;
+ block_base_new[7 - full_slots_new[ihalf]] = stamp_new;
+ int group_id_bit_offs_new = full_slots_new[ihalf] * num_group_id_bits_after;
+ uint64_t* ptr =
+ reinterpret_cast<uint64_t*>(block_base_new + 8 + (group_id_bit_offs_new >> 3));
+ util::SafeStore(ptr,
+ util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7)));
+ full_slots_new[ihalf]++;
+ }
+ }
+
+ // Second pass over all old blocks.
+ // Reinsert entries that were in an overflow block.
+ for (int i = 0; i < (1 << log_blocks_); ++i) {
+ // How many full slots in this block
+ uint8_t* block_base = blocks_ + i * block_size_before;
+ uint64_t block = util::SafeLoadAs<uint64_t>(block_base);
+ int full_slots = static_cast<int>(CountLeadingZeros(block & kHighBitOfEachByte) >> 3);
+
+ for (int j = 0; j < full_slots; ++j) {
+ uint64_t slot_id = i * 8 + j;
+ uint32_t hash = hashes_[slot_id];
+ uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after);
+ bool is_overflow_entry = ((block_id_new >> 1) != static_cast<uint64_t>(i));
+ if (!is_overflow_entry) {
+ continue;
+ }
+
+ uint64_t group_id_bit_offs = j * num_group_id_bits_before;
+ uint64_t group_id =
+ (util::SafeLoadAs<uint64_t>(block_base + 8 + (group_id_bit_offs >> 3)) >>
+ (group_id_bit_offs & 7)) &
+ group_id_mask_before;
+ uint8_t stamp_new =
+ hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask;
+
+ uint8_t* block_base_new = blocks_new + block_id_new * block_size_after;
+ uint64_t block_new = util::SafeLoadAs<uint64_t>(block_base_new);
+ int full_slots_new =
+ static_cast<int>(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3);
+ while (full_slots_new == 8) {
+ block_id_new = (block_id_new + 1) & ((1 << log_blocks_after) - 1);
+ block_base_new = blocks_new + block_id_new * block_size_after;
+ block_new = util::SafeLoadAs<uint64_t>(block_base_new);
+ full_slots_new =
+ static_cast<int>(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3);
+ }
+
+ hashes_new[block_id_new * 8 + full_slots_new] = hash;
+ block_base_new[7 - full_slots_new] = stamp_new;
+ int group_id_bit_offs_new = full_slots_new * num_group_id_bits_after;
+ uint64_t* ptr =
+ reinterpret_cast<uint64_t*>(block_base_new + 8 + (group_id_bit_offs_new >> 3));
+ util::SafeStore(ptr,
+ util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7)));
+ }
+ }
+
+ pool_->Free(blocks_, block_size_total_before);
+ pool_->Free(reinterpret_cast<uint8_t*>(hashes_), hashes_size_total_before);
+ log_blocks_ = log_blocks_after;
+ blocks_ = blocks_new;
+ hashes_ = hashes_new;
+
+ return Status::OK();
+}
+
+Status SwissTable::init(int64_t hardware_flags, MemoryPool* pool,
+ util::TempVectorStack* temp_stack, int log_minibatch,
+ EqualImpl equal_impl, AppendImpl append_impl) {
+ hardware_flags_ = hardware_flags;
+ pool_ = pool;
+ temp_stack_ = temp_stack;
+ log_minibatch_ = log_minibatch;
+ equal_impl_ = equal_impl;
+ append_impl_ = append_impl;
+
+ log_blocks_ = 0;
+ int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ num_inserted_ = 0;
+
+ const uint64_t block_bytes = 8 + num_groupid_bits;
+ const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_;
+ RETURN_NOT_OK(pool_->Allocate(slot_bytes, &blocks_));
+
+ // Make sure group ids are initially set to zero for all slots.
+ memset(blocks_, 0, slot_bytes);
+
+ // Initialize all status bytes to represent an empty slot.
+ for (uint64_t i = 0; i < (static_cast<uint64_t>(1) << log_blocks_); ++i) {
+ util::SafeStore(blocks_ + i * block_bytes, kHighBitOfEachByte);
+ }
+
+ uint64_t num_slots = 1ULL << (log_blocks_ + 3);
+ const uint64_t hash_size = sizeof(uint32_t);
+ const uint64_t hash_bytes = hash_size * num_slots + padding_;
+ uint8_t* hashes8;
+ RETURN_NOT_OK(pool_->Allocate(hash_bytes, &hashes8));
+ hashes_ = reinterpret_cast<uint32_t*>(hashes8);
+
+ return Status::OK();
+}
+
+void SwissTable::cleanup() {
+ if (blocks_) {
+ int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ const uint64_t block_bytes = 8 + num_groupid_bits;
+ const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_;
+ pool_->Free(blocks_, slot_bytes);
+ blocks_ = nullptr;
+ }
+ if (hashes_) {
+ uint64_t num_slots = 1ULL << (log_blocks_ + 3);
+ const uint64_t hash_size = sizeof(uint32_t);
+ const uint64_t hash_bytes = hash_size * num_slots + padding_;
+ pool_->Free(reinterpret_cast<uint8_t*>(hashes_), hash_bytes);
+ hashes_ = nullptr;
+ }
+ log_blocks_ = 0;
+ num_inserted_ = 0;
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h
new file mode 100644
index 00000000000..8c472736ec4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h
@@ -0,0 +1,172 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace compute {
+
+class SwissTable {
+ public:
+ SwissTable() = default;
+ ~SwissTable() { cleanup(); }
+
+ using EqualImpl =
+ std::function<void(int num_keys, const uint16_t* selection /* may be null */,
+ const uint32_t* group_ids, uint32_t* out_num_keys_mismatch,
+ uint16_t* out_selection_mismatch)>;
+ using AppendImpl = std::function<Status(int num_keys, const uint16_t* selection)>;
+
+ Status init(int64_t hardware_flags, MemoryPool* pool, util::TempVectorStack* temp_stack,
+ int log_minibatch, EqualImpl equal_impl, AppendImpl append_impl);
+ void cleanup();
+
+  Status map(const int num_keys, const uint32_t* hashes, uint32_t* out_groupids);
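+
+  // A minimal usage sketch (hypothetical caller-provided callbacks and buffers;
+  // keys must be mapped in chunks of at most (1 << log_minibatch) rows):
+  //
+  //   SwissTable table;
+  //   RETURN_NOT_OK(table.init(hardware_flags, pool, &stack,
+  //                            /*log_minibatch=*/10, equal_fn, append_fn));
+  //   RETURN_NOT_OK(table.map(num_keys, hashes, out_groupids));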
+
+ private:
+ // Lookup helpers
+
+ /// \brief Scan bytes in block in reverse and stop as soon
+ /// as a position of interest is found.
+ ///
+ /// Positions of interest:
+ /// a) slot with a matching stamp is encountered,
+ /// b) first empty slot is encountered,
+ /// c) we reach the end of the block.
+ ///
+ /// \param[in] block 8 byte block of hash table
+ /// \param[in] stamp 7 bits of hash used as a stamp
+ /// \param[in] start_slot Index of the first slot in the block to start search from. We
+ /// assume that this index always points to a non-empty slot, equivalently
+ /// that it comes before any empty slots. (Used only by one template
+ /// variant.)
+ /// \param[out] out_slot index corresponding to the discovered position of interest (8
+ /// represents end of block).
+ /// \param[out] out_match_found an integer flag (0 or 1) indicating if we found a
+ /// matching stamp.
+ template <bool use_start_slot>
+ inline void search_block(uint64_t block, int stamp, int start_slot, int* out_slot,
+ int* out_match_found);
+
+ /// \brief Extract group id for a given slot in a given block.
+ ///
+ /// Group ids follow in memory after 64-bit block data.
+ /// Maximum number of groups inserted is equal to the number
+ /// of all slots in all blocks, which is 8 * the number of blocks.
+ /// Group ids are bit packed using that maximum to determine the necessary number of
+ /// bits.
+ inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot,
+ uint64_t group_id_mask);
+
+ inline uint64_t next_slot_to_visit(uint64_t block_index, int slot, int match_found);
+
+ inline void insert(uint8_t* block_base, uint64_t slot_id, uint32_t hash, uint8_t stamp,
+ uint32_t group_id);
+
+ inline uint64_t num_groups_for_resize() const;
+
+ inline uint64_t wrap_global_slot_id(uint64_t global_slot_id);
+
+ // First hash table access
+ // Find first match in the start block if exists.
+ // Possible cases:
+ // 1. Stamp match in a block
+ // 2. No stamp match in a block, no empty buckets in a block
+ // 3. No stamp match in a block, empty buckets in a block
+ //
+ template <bool use_selection>
+ void lookup_1(const uint16_t* selection, const int num_keys, const uint32_t* hashes,
+ uint8_t* out_match_bitvector, uint32_t* out_group_ids,
+ uint32_t* out_slot_ids);
+#if defined(ARROW_HAVE_AVX2)
+ void lookup_1_avx2_x8(const int num_hashes, const uint32_t* hashes,
+ uint8_t* out_match_bitvector, uint32_t* out_group_ids,
+ uint32_t* out_next_slot_ids);
+ void lookup_1_avx2_x32(const int num_hashes, const uint32_t* hashes,
+ uint8_t* out_match_bitvector, uint32_t* out_group_ids,
+ uint32_t* out_next_slot_ids);
+#endif
+
+ // Completing hash table lookup post first access
+ Status lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected,
+ uint16_t* inout_selection, bool* out_need_resize,
+ uint32_t* out_group_ids, uint32_t* out_next_slot_ids);
+
+ // Resize small hash tables when 50% full (up to 8KB).
+ // Resize large hash tables when 75% full.
+ Status grow_double();
+
+ static int num_groupid_bits_from_log_blocks(int log_blocks) {
+ int required_bits = log_blocks + 3;
+ return required_bits <= 8 ? 8
+ : required_bits <= 16 ? 16 : required_bits <= 32 ? 32 : 64;
+ }
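+
+  // For example: log_blocks = 5 needs 8 bits, so 8-bit group ids are used;
+  // log_blocks = 6 needs 9 bits, which are rounded up to 16-bit group ids.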
+
+ // Use 32-bit hash for now
+ static constexpr int bits_hash_ = 32;
+
+ // Number of hash bits stored in slots in a block.
+ // The highest bits of hash determine block id.
+ // The next set of highest bits is a "stamp" stored in a slot in a block.
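+  // For example, with bits_hash_ = 32 and log_blocks_ = 4, hash bits [31:28]
+  // select the block and bits [27:21] form the stamp.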
+ static constexpr int bits_stamp_ = 7;
+
+ // Padding bytes added at the end of buffers for ease of SIMD access
+ static constexpr int padding_ = 64;
+
+ int log_minibatch_;
+ // Base 2 log of the number of blocks
+ int log_blocks_ = 0;
+ // Number of keys inserted into hash table
+ uint32_t num_inserted_ = 0;
+
+ // Data for blocks.
+ // Each block has 8 status bytes for 8 slots, followed by 8 bit packed group ids for
+ // these slots. In 8B status word, the order of bytes is reversed. Group ids are in
+ // normal order. There is 64B padding at the end.
+ //
+  // byte 0 - slot 7 | byte 1 - slot 6 | ...
+ // ---------------------------------------------------
+ // | Empty bit* | Empty bit |
+ // ---------------------------------------------------
+ // | 7-bit hash | 7-bit hash |
+ // ---------------------------------------------------
+  // * An empty slot has status byte 0x80; a non-empty slot has its highest bit set to 0.
+ //
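+  // For example, with log_blocks_ = 0 there is a single 16-byte block:
+  // 8 status bytes plus 8 slots * 8-bit group ids (followed by padding).
+  //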
+ uint8_t* blocks_;
+
+ // Array of hashes of values inserted into slots.
+ // Undefined if the corresponding slot is empty.
+ // There is 64B padding at the end.
+ uint32_t* hashes_;
+
+ int64_t hardware_flags_;
+ MemoryPool* pool_;
+ util::TempVectorStack* temp_stack_;
+
+ EqualImpl equal_impl_;
+ AppendImpl append_impl_;
+};
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc
new file mode 100644
index 00000000000..a44676c2f0d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc
@@ -0,0 +1,278 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/util.h"
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+
+using BitUtil::CountTrailingZeros;
+
+namespace util {
+
+inline void BitUtil::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+ int* num_indexes, uint16_t* indexes) {
+ int n = *num_indexes;
+ while (word) {
+ indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
+ word &= word - 1;
+ }
+ *num_indexes = n;
+}
+
+inline void BitUtil::bits_filter_indexes_helper(uint64_t word,
+ const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes) {
+ int n = *num_indexes;
+ while (word) {
+ indexes[n++] = input_indexes[CountTrailingZeros(word)];
+ word &= word - 1;
+ }
+ *num_indexes = n;
+}
+
+template <int bit_to_search, bool filter_input_indexes>
+void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes) {
+ // 64 bits at a time
+ constexpr int unroll = 64;
+ int tail = num_bits % unroll;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ if (filter_input_indexes) {
+ bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
+ num_indexes, indexes);
+ } else {
+ bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes);
+ }
+ } else {
+#endif
+ *num_indexes = 0;
+ for (int i = 0; i < num_bits / unroll; ++i) {
+ uint64_t word = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[i]);
+ if (bit_to_search == 0) {
+ word = ~word;
+ }
+ if (filter_input_indexes) {
+ bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes);
+ } else {
+ bits_to_indexes_helper(word, i * 64, num_indexes, indexes);
+ }
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+ // Optionally process the last partial word with masking out bits outside range
+ if (tail) {
+ uint64_t word =
+ util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[num_bits / unroll]);
+ if (bit_to_search == 0) {
+ word = ~word;
+ }
+ word &= ~0ULL >> (64 - tail);
+ if (filter_input_indexes) {
+ bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes,
+ indexes);
+ } else {
+ bits_to_indexes_helper(word, num_bits - tail, num_indexes, indexes);
+ }
+ }
+}
+
+void BitUtil::bits_to_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits, int* num_indexes,
+ uint16_t* indexes, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ int num_indexes_head = 0;
+ uint64_t bits_head =
+ util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
+ reinterpret_cast<const uint8_t*>(&bits_head), &num_indexes_head,
+ indexes);
+ int num_indexes_tail = 0;
+ if (num_bits > bits_in_first_byte) {
+ bits_to_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
+ bits + 1, &num_indexes_tail, indexes + num_indexes_head);
+ }
+ *num_indexes = num_indexes_head + num_indexes_tail;
+ return;
+ }
+
+ if (bit_to_search == 0) {
+ bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr,
+ num_indexes, indexes);
+ } else {
+ ARROW_DCHECK(bit_to_search == 1);
+ bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr,
+ num_indexes, indexes);
+ }
+}
+
+void BitUtil::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits,
+ const uint16_t* input_indexes, int* num_indexes,
+ uint16_t* indexes, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ int num_indexes_head = 0;
+ uint64_t bits_head =
+ util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
+ reinterpret_cast<const uint8_t*>(&bits_head), input_indexes,
+ &num_indexes_head, indexes);
+ int num_indexes_tail = 0;
+ if (num_bits > bits_in_first_byte) {
+ bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
+ bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail,
+ indexes + num_indexes_head);
+ }
+ *num_indexes = num_indexes_head + num_indexes_tail;
+ return;
+ }
+
+ if (bit_to_search == 0) {
+ bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes,
+ num_indexes, indexes);
+ } else {
+ ARROW_DCHECK(bit_to_search == 1);
+ bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes,
+ num_indexes, indexes);
+ }
+}
+
+void BitUtil::bits_split_indexes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, int* num_indexes_bit0,
+ uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+ int bit_offset) {
+ bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
+ bit_offset);
+ int num_indexes_bit1;
+ bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1,
+ bit_offset);
+}
+
+void BitUtil::bits_to_bytes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, uint8_t* bytes, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ uint64_t bits_head =
+ util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bits_to_bytes(hardware_flags, bits_in_first_byte,
+ reinterpret_cast<const uint8_t*>(&bits_head), bytes);
+ if (num_bits > bits_in_first_byte) {
+ bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1,
+ bytes + bits_in_first_byte);
+ }
+ return;
+ }
+
+ int num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+    // The function call below processes whole 32-bit chunks together.
+ num_processed = num_bits - (num_bits % 32);
+ bits_to_bytes_avx2(num_processed, bits, bytes);
+ }
+#endif
+ // Processing 8 bits at a time
+ constexpr int unroll = 8;
+ for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) {
+ uint8_t bits_next = bits[i];
+    // Clear the lowest bit, then make 8 copies of the remaining 7 bits, each copy
+    // shifted 7 bits further than the previous one.
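+    // For example, bits_next = 0xA6 (0b10100110) expands to the little-endian
+    // byte sequence {0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0xFF}.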
+ uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
+ ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
+ (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
+ unpacked |= (bits_next & 1);
+ unpacked &= 0x0101010101010101ULL;
+ unpacked *= 255;
+ util::SafeStore(&reinterpret_cast<uint64_t*>(bytes)[i], unpacked);
+ }
+}
+
+void BitUtil::bytes_to_bits(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bytes, uint8_t* bits, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ uint64_t bits_head;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bytes_to_bits(hardware_flags, bits_in_first_byte, bytes,
+ reinterpret_cast<uint8_t*>(&bits_head));
+ uint8_t mask = (1 << bit_offset) - 1;
+ *bits = static_cast<uint8_t>((*bits & mask) | (bits_head << bit_offset));
+
+ if (num_bits > bits_in_first_byte) {
+ bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte,
+ bytes + bits_in_first_byte, bits + 1);
+ }
+ return;
+ }
+
+ int num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+    // The function call below processes whole 32-bit chunks together.
+ num_processed = num_bits - (num_bits % 32);
+ bytes_to_bits_avx2(num_processed, bytes, bits);
+ }
+#endif
+ // Process 8 bits at a time
+ constexpr int unroll = 8;
+ for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) {
+ uint64_t bytes_next = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
+ bytes_next &= 0x0101010101010101ULL;
+ bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes
+ bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes
+ bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte
+ bits[i] = static_cast<uint8_t>(bytes_next & 0xff);
+ }
+}
+
+bool BitUtil::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+ uint32_t num_bytes) {
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ return are_all_bytes_zero_avx2(bytes, num_bytes);
+ }
+#endif
+ uint64_t result_or = 0;
+ uint32_t i;
+ for (i = 0; i < num_bytes / 8; ++i) {
+ uint64_t x = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
+ result_or |= x;
+ }
+ if (num_bytes % 8 > 0) {
+ uint64_t tail = 0;
+ result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8);
+ }
+ return result_or == 0;
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h
new file mode 100644
index 00000000000..471cc332220
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h
@@ -0,0 +1,171 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"
+
+#if defined(__clang__) || defined(__GNUC__)
+#define BYTESWAP(x) __builtin_bswap64(x)
+#define ROTL(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#define BYTESWAP(x) _byteswap_uint64(x)
+#define ROTL(x, n) _rotl((x), (n))
+#endif
+
+namespace arrow {
+namespace util {
+
+// Some platforms typedef int64_t as long int instead of long long int, which
+// breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics,
+// which require long long. We cast to the type below in these intrinsics so
+// that the code compiles in all cases.
+//
+using int64_for_gather_t = const long long int; // NOLINT runtime-int
+
+/// Storage used to allocate temporary vectors of a batch size.
+/// Temporary vectors should resemble allocating temporary variables on the stack
+/// but in the context of vectorized processing where we need to store a vector of
+/// temporaries instead of a single value.
+class TempVectorStack {
+ template <typename>
+ friend class TempVectorHolder;
+
+ public:
+ Status Init(MemoryPool* pool, int64_t size) {
+ num_vectors_ = 0;
+ top_ = 0;
+ buffer_size_ = size;
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
+ buffer_ = std::move(buffer);
+ return Status::OK();
+ }
+
+ private:
+ void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
+ int64_t old_top = top_;
+ top_ += num_bytes + padding;
+ // Stack overflow check
+ ARROW_DCHECK(top_ <= buffer_size_);
+ *data = buffer_->mutable_data() + old_top;
+ *id = num_vectors_++;
+ }
+ void release(int id, uint32_t num_bytes) {
+ ARROW_DCHECK(num_vectors_ == id + 1);
+ int64_t size = num_bytes + padding;
+ ARROW_DCHECK(top_ >= size);
+ top_ -= size;
+ --num_vectors_;
+ }
+ static constexpr int64_t padding = 64;
+ int num_vectors_;
+ int64_t top_;
+ std::unique_ptr<Buffer> buffer_;
+ int64_t buffer_size_;
+};
+
+template <typename T>
+class TempVectorHolder {
+ friend class TempVectorStack;
+
+ public:
+ ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
+ T* mutable_data() { return reinterpret_cast<T*>(data_); }
+ TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
+ stack_ = stack;
+ num_elements_ = num_elements;
+ stack_->alloc(num_elements * sizeof(T), &data_, &id_);
+ }
+
+ private:
+ TempVectorStack* stack_;
+ uint8_t* data_;
+ int id_;
+ uint32_t num_elements_;
+};
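+
+// A minimal usage sketch (assuming an enclosing function that returns Status):
+//
+//   TempVectorStack stack;
+//   ARROW_RETURN_NOT_OK(stack.Init(default_memory_pool(), 64 * 1024));
+//   {
+//     TempVectorHolder<uint32_t> ids(&stack, /*num_elements=*/1024);
+//     uint32_t* data = ids.mutable_data();
+//     // ... use data as scratch space ...
+//   }  // storage is returned to the stack here (allocations are LIFO)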
+
+class BitUtil {
+ public:
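+  // In the functions below, bit i of byte b is at bit position (b * 8 + i),
+  // following Arrow's little-endian bit order. For example, bits_to_indexes()
+  // with num_bits = 8, bits = {0b00100110} and bit_to_search = 1 produces
+  // indexes = {1, 2, 5} and *num_indexes = 3.
+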
+ static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits, int* num_indexes,
+ uint16_t* indexes, int bit_offset = 0);
+
+ static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits,
+ const uint16_t* input_indexes, int* num_indexes,
+ uint16_t* indexes, int bit_offset = 0);
+
+ // Input and output indexes may be pointing to the same data (in-place filtering).
+ static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, int* num_indexes_bit0,
+ uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+ int bit_offset = 0);
+
+ // Bit 1 is replaced with byte 0xFF.
+ static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
+
+ // Return highest bit of each byte.
+ static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
+
+ static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+ uint32_t num_bytes);
+
+ private:
+ inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+ int* num_indexes, uint16_t* indexes);
+ inline static void bits_filter_indexes_helper(uint64_t word,
+ const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+ template <int bit_to_search, bool filter_input_indexes>
+ static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+
+#if defined(ARROW_HAVE_AVX2)
+ static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
+ const uint8_t* bits, int* num_indexes,
+ uint16_t* indexes);
+ static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
+ const uint8_t* bits, const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+ template <int bit_to_search>
+ static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+ int* num_indexes, uint16_t* indexes);
+ template <int bit_to_search>
+ static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+ const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+ static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
+ static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
+ static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
+#endif
+};
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h
new file mode 100644
index 00000000000..55daa243cd3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h
@@ -0,0 +1,142 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+class Function;
+
+static constexpr int64_t kDefaultMaxChunksize = std::numeric_limits<int64_t>::max();
+
+namespace detail {
+
+/// \brief Break std::vector<Datum> into a sequence of ExecBatch for kernel
+/// execution
+class ARROW_EXPORT ExecBatchIterator {
+ public:
+ /// \brief Construct iterator and do basic argument validation
+ ///
+  /// \param[in] args the Datum arguments; each must be array-like or scalar
+  /// \param[in] max_chunksize the maximum length of each ExecBatch; actual
+  /// batches may be shorter, depending on the chunk layout of any ChunkedArray
+  /// arguments
+ static Result<std::unique_ptr<ExecBatchIterator>> Make(
+ std::vector<Datum> args, int64_t max_chunksize = kDefaultMaxChunksize);
+
+  /// \brief Compute the next batch. Always returns at least one batch. Returns
+  /// false if the iterator is exhausted.
+ bool Next(ExecBatch* batch);
+
+ int64_t length() const { return length_; }
+
+ int64_t position() const { return position_; }
+
+ int64_t max_chunksize() const { return max_chunksize_; }
+
+ private:
+ ExecBatchIterator(std::vector<Datum> args, int64_t length, int64_t max_chunksize);
+
+ std::vector<Datum> args_;
+ std::vector<int> chunk_indexes_;
+ std::vector<int64_t> chunk_positions_;
+ int64_t position_;
+ int64_t length_;
+ int64_t max_chunksize_;
+};
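+
+// A usage sketch (assuming `args` is a std::vector<Datum> built by the caller):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto it,
+//                         ExecBatchIterator::Make(args, /*max_chunksize=*/1024));
+//   ExecBatch batch;
+//   while (it->Next(&batch)) {
+//     // process batch; batch.length is at most 1024
+//   }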
+
+// "Push" / listener API like IPC reader so that consumers can receive
+// processed chunks as soon as they're available.
+
+class ARROW_EXPORT ExecListener {
+ public:
+ virtual ~ExecListener() = default;
+
+ virtual Status OnResult(Datum) { return Status::NotImplemented("OnResult"); }
+};
+
+class DatumAccumulator : public ExecListener {
+ public:
+ DatumAccumulator() = default;
+
+ Status OnResult(Datum value) override {
+ values_.emplace_back(value);
+ return Status::OK();
+ }
+
+ std::vector<Datum> values() { return std::move(values_); }
+
+ private:
+ std::vector<Datum> values_;
+};
+
+/// \brief Check that each Datum is of a "value" type, which means either
+/// SCALAR, ARRAY, or CHUNKED_ARRAY. If there are chunked inputs, then these
+/// inputs will be split into non-chunked ExecBatch values for execution
+Status CheckAllValues(const std::vector<Datum>& values);
+
+class ARROW_EXPORT KernelExecutor {
+ public:
+ virtual ~KernelExecutor() = default;
+
+ /// The Kernel's `init` method must be called and any KernelState set in the
+ /// KernelContext *before* KernelExecutor::Init is called. This is to facilitate
+ /// the case where init may be expensive and does not need to be called again for
+ /// each execution of the kernel, for example the same lookup table can be re-used
+ /// for all scanned batches in a dataset filter.
+ virtual Status Init(KernelContext*, KernelInitArgs) = 0;
+
+ /// XXX: Better configurability for listener
+ /// Not thread-safe
+ virtual Status Execute(const std::vector<Datum>& args, ExecListener* listener) = 0;
+
+ virtual Datum WrapResults(const std::vector<Datum>& args,
+ const std::vector<Datum>& outputs) = 0;
+
+ static std::unique_ptr<KernelExecutor> MakeScalar();
+ static std::unique_ptr<KernelExecutor> MakeVector();
+ static std::unique_ptr<KernelExecutor> MakeScalarAggregate();
+};
+
+/// \brief Populate the validity bitmap with the intersection (AND) of the
+/// validity bitmaps of the arguments. If a preallocated bitmap is not
+/// provided, then one will be
+/// allocated if needed (in some cases a bitmap can be zero-copied from the
+/// arguments). If any Scalar value is null, then the entire validity bitmap
+/// will be set to null.
+///
+/// \param[in] ctx kernel execution context, for memory allocation etc.
+/// \param[in] batch the data batch
+/// \param[in] out the output ArrayData, must not be null
+ARROW_EXPORT
+Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* out);
+
+} // namespace detail
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc
new file mode 100644
index 00000000000..05d14d03b16
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc
@@ -0,0 +1,330 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/function.h"
+
+#include <cstddef>
+#include <memory>
+#include <sstream>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/cast.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/exec_internal.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/registry.h"
+#include "arrow/datum.h"
+#include "arrow/util/cpu_info.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace compute {
+Result<std::shared_ptr<Buffer>> FunctionOptionsType::Serialize(
+ const FunctionOptions&) const {
+ return Status::NotImplemented("Serialize for ", type_name());
+}
+
+Result<std::unique_ptr<FunctionOptions>> FunctionOptionsType::Deserialize(
+ const Buffer& buffer) const {
+ return Status::NotImplemented("Deserialize for ", type_name());
+}
+
+std::string FunctionOptions::ToString() const { return options_type()->Stringify(*this); }
+
+bool FunctionOptions::Equals(const FunctionOptions& other) const {
+ if (this == &other) return true;
+ if (options_type() != other.options_type()) return false;
+ return options_type()->Compare(*this, other);
+}
+
+Result<std::shared_ptr<Buffer>> FunctionOptions::Serialize() const {
+ return options_type()->Serialize(*this);
+}
+
+Result<std::unique_ptr<FunctionOptions>> FunctionOptions::Deserialize(
+ const std::string& type_name, const Buffer& buffer) {
+ ARROW_ASSIGN_OR_RAISE(auto options,
+ GetFunctionRegistry()->GetFunctionOptionsType(type_name));
+ return options->Deserialize(buffer);
+}
+
+void PrintTo(const FunctionOptions& options, std::ostream* os) {
+ *os << options.ToString();
+}
+
+static const FunctionDoc kEmptyFunctionDoc{};
+
+const FunctionDoc& FunctionDoc::Empty() { return kEmptyFunctionDoc; }
+
+static Status CheckArityImpl(const Function* function, int passed_num_args,
+ const char* passed_num_args_label) {
+ if (function->arity().is_varargs && passed_num_args < function->arity().num_args) {
+ return Status::Invalid("VarArgs function ", function->name(), " needs at least ",
+ function->arity().num_args, " arguments but ",
+ passed_num_args_label, " only ", passed_num_args);
+ }
+
+ if (!function->arity().is_varargs && passed_num_args != function->arity().num_args) {
+ return Status::Invalid("Function ", function->name(), " accepts ",
+ function->arity().num_args, " arguments but ",
+ passed_num_args_label, " ", passed_num_args);
+ }
+
+ return Status::OK();
+}
+
+Status Function::CheckArity(const std::vector<InputType>& in_types) const {
+ return CheckArityImpl(this, static_cast<int>(in_types.size()), "kernel accepts");
+}
+
+Status Function::CheckArity(const std::vector<ValueDescr>& descrs) const {
+ return CheckArityImpl(this, static_cast<int>(descrs.size()),
+ "attempted to look up kernel(s) with");
+}
+
+namespace detail {
+
+Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>& descrs) {
+ return Status::NotImplemented("Function ", func->name(),
+ " has no kernel matching input types ",
+ ValueDescr::ToString(descrs));
+}
+
+template <typename KernelType>
+const KernelType* DispatchExactImpl(const std::vector<KernelType*>& kernels,
+ const std::vector<ValueDescr>& values) {
+ const KernelType* kernel_matches[SimdLevel::MAX] = {nullptr};
+
+  // Collect the matching kernel, if any, for each SIMD level
+ for (const auto& kernel : kernels) {
+ if (kernel->signature->MatchesInputs(values)) {
+ kernel_matches[kernel->simd_level] = kernel;
+ }
+ }
+
+  // Dispatch to the best kernel supported by the runtime-detected CPU features
+#if defined(ARROW_HAVE_RUNTIME_AVX512) || defined(ARROW_HAVE_RUNTIME_AVX2)
+ auto cpu_info = arrow::internal::CpuInfo::GetInstance();
+#endif
+#if defined(ARROW_HAVE_RUNTIME_AVX512)
+ if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX512)) {
+ if (kernel_matches[SimdLevel::AVX512]) {
+ return kernel_matches[SimdLevel::AVX512];
+ }
+ }
+#endif
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) {
+ if (kernel_matches[SimdLevel::AVX2]) {
+ return kernel_matches[SimdLevel::AVX2];
+ }
+ }
+#endif
+ if (kernel_matches[SimdLevel::NONE]) {
+ return kernel_matches[SimdLevel::NONE];
+ }
+
+ return nullptr;
+}
+
+const Kernel* DispatchExactImpl(const Function* func,
+ const std::vector<ValueDescr>& values) {
+ if (func->kind() == Function::SCALAR) {
+ return DispatchExactImpl(checked_cast<const ScalarFunction*>(func)->kernels(),
+ values);
+ }
+
+ if (func->kind() == Function::VECTOR) {
+ return DispatchExactImpl(checked_cast<const VectorFunction*>(func)->kernels(),
+ values);
+ }
+
+ if (func->kind() == Function::SCALAR_AGGREGATE) {
+ return DispatchExactImpl(
+ checked_cast<const ScalarAggregateFunction*>(func)->kernels(), values);
+ }
+
+ if (func->kind() == Function::HASH_AGGREGATE) {
+ return DispatchExactImpl(checked_cast<const HashAggregateFunction*>(func)->kernels(),
+ values);
+ }
+
+ return nullptr;
+}
+
+} // namespace detail
+
+Result<const Kernel*> Function::DispatchExact(
+ const std::vector<ValueDescr>& values) const {
+ if (kind_ == Function::META) {
+ return Status::NotImplemented("Dispatch for a MetaFunction's Kernels");
+ }
+ RETURN_NOT_OK(CheckArity(values));
+
+ if (auto kernel = detail::DispatchExactImpl(this, values)) {
+ return kernel;
+ }
+ return detail::NoMatchingKernel(this, values);
+}
+
+Result<const Kernel*> Function::DispatchBest(std::vector<ValueDescr>* values) const {
+ // TODO(ARROW-11508) permit generic conversions here
+ return DispatchExact(*values);
+}
+
+Result<Datum> Function::Execute(const std::vector<Datum>& args,
+ const FunctionOptions* options, ExecContext* ctx) const {
+ if (options == nullptr) {
+ options = default_options();
+ }
+ if (ctx == nullptr) {
+ ExecContext default_ctx;
+ return Execute(args, options, &default_ctx);
+ }
+
+  // Type-check the Datum arguments here. Ideally we would avoid this as much
+  // as possible.
+ RETURN_NOT_OK(detail::CheckAllValues(args));
+ std::vector<ValueDescr> inputs(args.size());
+ for (size_t i = 0; i != args.size(); ++i) {
+ inputs[i] = args[i].descr();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchBest(&inputs));
+ ARROW_ASSIGN_OR_RAISE(auto implicitly_cast_args, Cast(args, inputs, ctx));
+
+ std::unique_ptr<KernelState> state;
+
+ KernelContext kernel_ctx{ctx};
+ if (kernel->init) {
+ ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, inputs, options}));
+ kernel_ctx.SetState(state.get());
+ }
+
+ std::unique_ptr<detail::KernelExecutor> executor;
+ if (kind() == Function::SCALAR) {
+ executor = detail::KernelExecutor::MakeScalar();
+ } else if (kind() == Function::VECTOR) {
+ executor = detail::KernelExecutor::MakeVector();
+ } else if (kind() == Function::SCALAR_AGGREGATE) {
+ executor = detail::KernelExecutor::MakeScalarAggregate();
+ } else {
+ return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions");
+ }
+ RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options}));
+
+ auto listener = std::make_shared<detail::DatumAccumulator>();
+ RETURN_NOT_OK(executor->Execute(implicitly_cast_args, listener.get()));
+ return executor->WrapResults(implicitly_cast_args, listener->values());
+}
+
+Status Function::Validate() const {
+ if (!doc_->summary.empty()) {
+ // Documentation given, check its contents
+ int arg_count = static_cast<int>(doc_->arg_names.size());
+ if (arg_count == arity_.num_args) {
+ return Status::OK();
+ }
+ if (arity_.is_varargs && arg_count == arity_.num_args + 1) {
+ return Status::OK();
+ }
+ return Status::Invalid(
+ "In function '", name_,
+ "': ", "number of argument names for function documentation != function arity");
+ }
+ return Status::OK();
+}
+
+Status ScalarFunction::AddKernel(std::vector<InputType> in_types, OutputType out_type,
+ ArrayKernelExec exec, KernelInit init) {
+ RETURN_NOT_OK(CheckArity(in_types));
+
+ if (arity_.is_varargs && in_types.size() != 1) {
+ return Status::Invalid("VarArgs signatures must have exactly one input type");
+ }
+ auto sig =
+ KernelSignature::Make(std::move(in_types), std::move(out_type), arity_.is_varargs);
+ kernels_.emplace_back(std::move(sig), exec, init);
+ return Status::OK();
+}
+
+Status ScalarFunction::AddKernel(ScalarKernel kernel) {
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ if (arity_.is_varargs && !kernel.signature->is_varargs()) {
+ return Status::Invalid("Function accepts varargs but kernel signature does not");
+ }
+ kernels_.emplace_back(std::move(kernel));
+ return Status::OK();
+}
+
+Status VectorFunction::AddKernel(std::vector<InputType> in_types, OutputType out_type,
+ ArrayKernelExec exec, KernelInit init) {
+ RETURN_NOT_OK(CheckArity(in_types));
+
+ if (arity_.is_varargs && in_types.size() != 1) {
+ return Status::Invalid("VarArgs signatures must have exactly one input type");
+ }
+ auto sig =
+ KernelSignature::Make(std::move(in_types), std::move(out_type), arity_.is_varargs);
+ kernels_.emplace_back(std::move(sig), exec, init);
+ return Status::OK();
+}
+
+Status VectorFunction::AddKernel(VectorKernel kernel) {
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ if (arity_.is_varargs && !kernel.signature->is_varargs()) {
+ return Status::Invalid("Function accepts varargs but kernel signature does not");
+ }
+ kernels_.emplace_back(std::move(kernel));
+ return Status::OK();
+}
+
+Status ScalarAggregateFunction::AddKernel(ScalarAggregateKernel kernel) {
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ if (arity_.is_varargs && !kernel.signature->is_varargs()) {
+ return Status::Invalid("Function accepts varargs but kernel signature does not");
+ }
+ kernels_.emplace_back(std::move(kernel));
+ return Status::OK();
+}
+
+Status HashAggregateFunction::AddKernel(HashAggregateKernel kernel) {
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ if (arity_.is_varargs && !kernel.signature->is_varargs()) {
+ return Status::Invalid("Function accepts varargs but kernel signature does not");
+ }
+ kernels_.emplace_back(std::move(kernel));
+ return Status::OK();
+}
+
+Result<Datum> MetaFunction::Execute(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const {
+ RETURN_NOT_OK(
+ CheckArityImpl(this, static_cast<int>(args.size()), "attempted to Execute with"));
+
+ if (options == nullptr) {
+ options = default_options();
+ }
+ return ExecuteImpl(args, options, ctx);
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h
new file mode 100644
index 00000000000..bd854bbb28e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h
@@ -0,0 +1,393 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle.
+
+#pragma once
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+/// \defgroup compute-functions Abstract compute function API
+///
+/// @{
+
+/// \brief Extension point for defining options outside libarrow (but
+/// still within this project).
+class ARROW_EXPORT FunctionOptionsType {
+ public:
+ virtual ~FunctionOptionsType() = default;
+
+ virtual const char* type_name() const = 0;
+ virtual std::string Stringify(const FunctionOptions&) const = 0;
+ virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0;
+ virtual Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const;
+ virtual Result<std::unique_ptr<FunctionOptions>> Deserialize(
+ const Buffer& buffer) const;
+};
+
+/// \brief Base class for specifying options configuring a function's behavior,
+/// such as error handling.
+class ARROW_EXPORT FunctionOptions : public util::EqualityComparable<FunctionOptions> {
+ public:
+ virtual ~FunctionOptions() = default;
+
+ const FunctionOptionsType* options_type() const { return options_type_; }
+ const char* type_name() const { return options_type()->type_name(); }
+
+ bool Equals(const FunctionOptions& other) const;
+ using util::EqualityComparable<FunctionOptions>::Equals;
+ using util::EqualityComparable<FunctionOptions>::operator==;
+ using util::EqualityComparable<FunctionOptions>::operator!=;
+ std::string ToString() const;
+ /// \brief Serialize an options struct to a buffer.
+ Result<std::shared_ptr<Buffer>> Serialize() const;
+ /// \brief Deserialize an options struct from a buffer.
+ /// Note: this will only look for `type_name` in the default FunctionRegistry;
+ /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then
+ /// call FunctionOptionsType::Deserialize().
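+  ///
+  /// A round-trip sketch (assuming the options type implements serialization):
+  ///
+  ///   ARROW_ASSIGN_OR_RAISE(auto buf, options.Serialize());
+  ///   ARROW_ASSIGN_OR_RAISE(auto copy, FunctionOptions::Deserialize(
+  ///                                        options.type_name(), *buf));
+  ///   DCHECK(copy->Equals(options));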
+ static Result<std::unique_ptr<FunctionOptions>> Deserialize(
+ const std::string& type_name, const Buffer& buffer);
+
+ protected:
+ explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {}
+ const FunctionOptionsType* options_type_;
+};
+
+ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*);
+
+/// \brief Contains the number of required arguments for the function.
+///
+/// Naming conventions taken from https://en.wikipedia.org/wiki/Arity.
+struct ARROW_EXPORT Arity {
+ /// \brief A function taking no arguments
+ static Arity Nullary() { return Arity(0, false); }
+
+ /// \brief A function taking 1 argument
+ static Arity Unary() { return Arity(1, false); }
+
+ /// \brief A function taking 2 arguments
+ static Arity Binary() { return Arity(2, false); }
+
+ /// \brief A function taking 3 arguments
+ static Arity Ternary() { return Arity(3, false); }
+
+ /// \brief A function taking a variable number of arguments
+ ///
+ /// \param[in] min_args the minimum number of arguments required when
+ /// invoking the function
+ static Arity VarArgs(int min_args = 0) { return Arity(min_args, true); }
+
+ // NOTE: the 0-argument form (default constructor) is required for Cython
+ explicit Arity(int num_args = 0, bool is_varargs = false)
+ : num_args(num_args), is_varargs(is_varargs) {}
+
+ /// The number of required arguments (or the minimum number for varargs
+ /// functions).
+ int num_args;
+
+ /// If true, then the num_args is the minimum number of required arguments.
+ bool is_varargs = false;
+};
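+
+// For example, Arity::Binary() accepts exactly two arguments, while
+// Arity::VarArgs(/*min_args=*/1) accepts one or more arguments.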
+
+struct ARROW_EXPORT FunctionDoc {
+ /// \brief A one-line summary of the function, using a verb.
+ ///
+ /// For example, "Add two numeric arrays or scalars".
+ std::string summary;
+
+ /// \brief A detailed description of the function, meant to follow the summary.
+ std::string description;
+
+ /// \brief Symbolic names (identifiers) for the function arguments.
+ ///
+ /// Some bindings may use this to generate nicer function signatures.
+ std::vector<std::string> arg_names;
+
+ // TODO add argument descriptions?
+
+ /// \brief Name of the options class, if any.
+ std::string options_class;
+
+ FunctionDoc() = default;
+
+ FunctionDoc(std::string summary, std::string description,
+ std::vector<std::string> arg_names, std::string options_class = "")
+ : summary(std::move(summary)),
+ description(std::move(description)),
+ arg_names(std::move(arg_names)),
+ options_class(std::move(options_class)) {}
+
+ static const FunctionDoc& Empty();
+};
+
+/// \brief Base class for compute functions. Function implementations contain a
+/// collection of "kernels" which are implementations of the function for
+/// specific argument types. Selecting a viable kernel for executing a function
+/// is referred to as "dispatching".
+class ARROW_EXPORT Function {
+ public:
+ /// \brief The kind of function, which indicates in what contexts it is
+ /// valid for use.
+ enum Kind {
+ /// A function that performs scalar data operations on whole arrays of
+ /// data. Can generally process Array or Scalar values. The size of the
+ /// output will be the same as the size (or broadcasted size, in the case
+ /// of mixing Array and Scalar inputs) of the input.
+ SCALAR,
+
+ /// A function with array input and output whose behavior depends on the
+ /// values of the entire arrays passed, rather than the value of each scalar
+ /// value.
+ VECTOR,
+
+ /// A function that computes scalar summary statistics from array input.
+ SCALAR_AGGREGATE,
+
+ /// A function that computes grouped summary statistics from array input
+ /// and an array of group identifiers.
+ HASH_AGGREGATE,
+
+ /// A function that dispatches to other functions and does not contain its
+ /// own kernels.
+ META
+ };
+
+ virtual ~Function() = default;
+
+  /// \brief The name of the function. The registry enforces uniqueness of names.
+ const std::string& name() const { return name_; }
+
+  /// \brief The kind of function, which indicates in what contexts it is
+  /// valid for use.
+ Function::Kind kind() const { return kind_; }
+
+ /// \brief Contains the number of arguments the function requires, or if the
+ /// function accepts variable numbers of arguments.
+ const Arity& arity() const { return arity_; }
+
+ /// \brief Return the function documentation
+ const FunctionDoc& doc() const { return *doc_; }
+
+ /// \brief Returns the number of registered kernels for this function.
+ virtual int num_kernels() const = 0;
+
+ /// \brief Return a kernel that can execute the function given the exact
+ /// argument types (without implicit type casts or scalar->array promotions).
+ ///
+ /// NB: This function is overridden in CastFunction.
+ virtual Result<const Kernel*> DispatchExact(
+ const std::vector<ValueDescr>& values) const;
+
+ /// \brief Return a best-match kernel that can execute the function given the argument
+ /// types, after implicit casts are applied.
+ ///
+ /// \param[in,out] values Argument types. An element may be modified to indicate that
+ /// the returned kernel only approximately matches the input value descriptors; callers
+ /// are responsible for casting inputs to the type and shape required by the kernel.
+ virtual Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const;
+
+ /// \brief Execute the function eagerly with the passed input arguments with
+ /// kernel dispatch, batch iteration, and memory allocation details taken
+ /// care of.
+ ///
+ /// If the `options` pointer is null, then `default_options()` will be used.
+ ///
+ /// This function can be overridden in subclasses.
+ virtual Result<Datum> Execute(const std::vector<Datum>& args,
+ const FunctionOptions* options, ExecContext* ctx) const;
+
+ /// \brief Returns the default options for this function.
+ ///
+ /// Whatever option semantics a Function has, implementations must guarantee
+ /// that default_options() is valid to pass to Execute as options.
+ const FunctionOptions* default_options() const { return default_options_; }
+
+ virtual Status Validate() const;
+
+ protected:
+ Function(std::string name, Function::Kind kind, const Arity& arity,
+ const FunctionDoc* doc, const FunctionOptions* default_options)
+ : name_(std::move(name)),
+ kind_(kind),
+ arity_(arity),
+ doc_(doc ? doc : &FunctionDoc::Empty()),
+ default_options_(default_options) {}
+
+ Status CheckArity(const std::vector<InputType>&) const;
+ Status CheckArity(const std::vector<ValueDescr>&) const;
+
+ std::string name_;
+ Function::Kind kind_;
+ Arity arity_;
+ const FunctionDoc* doc_;
+ const FunctionOptions* default_options_ = NULLPTR;
+};
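+
+// Illustrative sketch: eager execution through a function obtained from the
+// registry (assumes a registered function named "add"; `lhs`, `rhs`, and
+// `exec_ctx` are placeholders, and error handling is elided):
+//
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Function> func,
+//                         GetFunctionRegistry()->GetFunction("add"));
+//   ARROW_ASSIGN_OR_RAISE(Datum result,
+//                         func->Execute({Datum(lhs), Datum(rhs)},
+//                                       /*options=*/NULLPTR, &exec_ctx));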
+
+namespace detail {
+
+template <typename KernelType>
+class FunctionImpl : public Function {
+ public:
+ /// \brief Return pointers to the currently available kernels for inspection
+ std::vector<const KernelType*> kernels() const {
+ std::vector<const KernelType*> result;
+ for (const auto& kernel : kernels_) {
+ result.push_back(&kernel);
+ }
+ return result;
+ }
+
+ int num_kernels() const override { return static_cast<int>(kernels_.size()); }
+
+ protected:
+ FunctionImpl(std::string name, Function::Kind kind, const Arity& arity,
+ const FunctionDoc* doc, const FunctionOptions* default_options)
+ : Function(std::move(name), kind, arity, doc, default_options) {}
+
+ std::vector<KernelType> kernels_;
+};
+
+/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
+ARROW_EXPORT
+const Kernel* DispatchExactImpl(const Function* func, const std::vector<ValueDescr>&);
+
+/// \brief Return an error message if no Kernel is found.
+ARROW_EXPORT
+Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>&);
+
+} // namespace detail
+
+/// \brief A function that executes elementwise operations on arrays or
+/// scalars, and therefore whose results generally do not depend on the order
+/// of the values in the arguments. Accepts and returns arrays that are all of
+/// the same size. These functions roughly correspond to the functions used in
+/// SQL expressions.
+class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl<ScalarKernel> {
+ public:
+ using KernelType = ScalarKernel;
+
+ ScalarFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ const FunctionOptions* default_options = NULLPTR)
+ : detail::FunctionImpl<ScalarKernel>(std::move(name), Function::SCALAR, arity, doc,
+ default_options) {}
+
+ /// \brief Add a kernel with given input/output types, no required state
+ /// initialization, preallocation for fixed-width types, and default null
+ /// handling (intersect validity bitmaps of inputs).
+ Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
+ ArrayKernelExec exec, KernelInit init = NULLPTR);
+
+ /// \brief Add a kernel (function implementation). Returns error if the
+ /// kernel's signature does not match the function's arity.
+ Status AddKernel(ScalarKernel kernel);
+};
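+
+// Illustrative sketch: constructing a ScalarFunction and adding one kernel.
+// The function name, `func_doc`, and `HypotheticalExec` are assumptions for
+// illustration only:
+//
+//   auto func = std::make_shared<ScalarFunction>("hypothetical_identity",
+//                                                Arity::Unary(), &func_doc);
+//   ARROW_RETURN_NOT_OK(func->AddKernel({InputType::Array(Type::INT32)},
+//                                       int32(), HypotheticalExec));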
+
+/// \brief A function that executes general array operations that may yield
+/// outputs of different sizes or have results that depend on the whole array
+/// contents. These functions roughly correspond to the functions found in
+/// non-SQL array languages like APL and its derivatives.
+class ARROW_EXPORT VectorFunction : public detail::FunctionImpl<VectorKernel> {
+ public:
+ using KernelType = VectorKernel;
+
+ VectorFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ const FunctionOptions* default_options = NULLPTR)
+ : detail::FunctionImpl<VectorKernel>(std::move(name), Function::VECTOR, arity, doc,
+ default_options) {}
+
+ /// \brief Add a simple kernel with given input/output types, no required
+ /// state initialization, no data preallocation, and no preallocation of the
+ /// validity bitmap.
+ Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
+ ArrayKernelExec exec, KernelInit init = NULLPTR);
+
+ /// \brief Add a kernel (function implementation). Returns error if the
+ /// kernel's signature does not match the function's arity.
+ Status AddKernel(VectorKernel kernel);
+};
+
+class ARROW_EXPORT ScalarAggregateFunction
+ : public detail::FunctionImpl<ScalarAggregateKernel> {
+ public:
+ using KernelType = ScalarAggregateKernel;
+
+ ScalarAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ const FunctionOptions* default_options = NULLPTR)
+ : detail::FunctionImpl<ScalarAggregateKernel>(
+ std::move(name), Function::SCALAR_AGGREGATE, arity, doc, default_options) {}
+
+ /// \brief Add a kernel (function implementation). Returns error if the
+ /// kernel's signature does not match the function's arity.
+ Status AddKernel(ScalarAggregateKernel kernel);
+};
+
+class ARROW_EXPORT HashAggregateFunction
+ : public detail::FunctionImpl<HashAggregateKernel> {
+ public:
+ using KernelType = HashAggregateKernel;
+
+ HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ const FunctionOptions* default_options = NULLPTR)
+ : detail::FunctionImpl<HashAggregateKernel>(
+ std::move(name), Function::HASH_AGGREGATE, arity, doc, default_options) {}
+
+ /// \brief Add a kernel (function implementation). Returns error if the
+ /// kernel's signature does not match the function's arity.
+ Status AddKernel(HashAggregateKernel kernel);
+};
+
+/// \brief A function that dispatches to other functions. Must implement
+/// MetaFunction::ExecuteImpl.
+///
+/// For Array, ChunkedArray, and Scalar Datum kinds, may rely on the execution
+/// of concrete Function types, but must handle other Datum kinds on its own.
+class ARROW_EXPORT MetaFunction : public Function {
+ public:
+ int num_kernels() const override { return 0; }
+
+ Result<Datum> Execute(const std::vector<Datum>& args, const FunctionOptions* options,
+ ExecContext* ctx) const override;
+
+ protected:
+ virtual Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const = 0;
+
+ MetaFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ const FunctionOptions* default_options = NULLPTR)
+ : Function(std::move(name), Function::META, arity, doc, default_options) {}
+};
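+
+// Illustrative sketch: a minimal MetaFunction subclass that forwards to
+// another registered function (all names here are hypothetical):
+//
+//   class HypotheticalAliasFunction : public MetaFunction {
+//    public:
+//     HypotheticalAliasFunction()
+//         : MetaFunction("hypothetical_alias", Arity::Unary(), NULLPTR) {}
+//
+//    protected:
+//     Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+//                               const FunctionOptions* options,
+//                               ExecContext* ctx) const override {
+//       return CallFunction("some_target_function", args, options, ctx);
+//     }
+//   };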
+
+/// @}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc
new file mode 100644
index 00000000000..0a926e0a39c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/function_internal.h"
+
+#include "arrow/array/util.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/registry.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/record_batch.h"
+#include "arrow/scalar.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+using ::arrow::internal::checked_cast;
+
+constexpr char kTypeNameField[] = "_type_name";
+
+Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar(
+ const FunctionOptions& options) {
+ std::vector<std::string> field_names;
+ std::vector<std::shared_ptr<Scalar>> values;
+ const auto* options_type =
+ dynamic_cast<const GenericOptionsType*>(options.options_type());
+ if (!options_type) {
+ return Status::NotImplemented("serializing ", options.type_name(),
+ " to StructScalar");
+ }
+ RETURN_NOT_OK(options_type->ToStructScalar(options, &field_names, &values));
+ field_names.push_back(kTypeNameField);
+ const char* options_name = options.type_name();
+ values.emplace_back(
+ new BinaryScalar(Buffer::Wrap(options_name, std::strlen(options_name))));
+ return StructScalar::Make(std::move(values), std::move(field_names));
+}
+
+Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar(
+ const StructScalar& scalar) {
+ ARROW_ASSIGN_OR_RAISE(auto type_name_holder, scalar.field(kTypeNameField));
+ const std::string type_name =
+ checked_cast<const BinaryScalar&>(*type_name_holder).value->ToString();
+ ARROW_ASSIGN_OR_RAISE(auto raw_options_type,
+ GetFunctionRegistry()->GetFunctionOptionsType(type_name));
+ const auto* options_type = checked_cast<const GenericOptionsType*>(raw_options_type);
+ return options_type->FromStructScalar(scalar);
+}
+
+Result<std::shared_ptr<Buffer>> GenericOptionsType::Serialize(
+ const FunctionOptions& options) const {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, FunctionOptionsToStructScalar(options));
+ ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*scalar, 1));
+ auto batch =
+ RecordBatch::Make(schema({field("", array->type())}), /*num_rows=*/1, {array});
+ ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
+ ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema()));
+ RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
+ RETURN_NOT_OK(writer->Close());
+ return stream->Finish();
+}
+
+Result<std::unique_ptr<FunctionOptions>> GenericOptionsType::Deserialize(
+ const Buffer& buffer) const {
+ return DeserializeFunctionOptions(buffer);
+}
+
+Result<std::unique_ptr<FunctionOptions>> DeserializeFunctionOptions(
+ const Buffer& buffer) {
+ io::BufferReader stream(buffer);
+ ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream));
+ ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
+ if (batch->num_rows() != 1) {
+ return Status::Invalid(
+ "serialized FunctionOptions's batch repr was not a single row - had ",
+ batch->num_rows());
+ }
+ if (batch->num_columns() != 1) {
+ return Status::Invalid(
+ "serialized FunctionOptions's batch repr was not a single column - had ",
+ batch->num_columns());
+ }
+ auto column = batch->column(0);
+ if (column->type()->id() != Type::STRUCT) {
+ return Status::Invalid(
+ "serialized FunctionOptions's batch repr was not a struct column - was ",
+ column->type()->ToString());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto raw_scalar,
+ checked_cast<const StructArray&>(*column).GetScalar(0));
+ const auto& scalar = checked_cast<const StructScalar&>(*raw_scalar);
+ return FunctionOptionsFromStructScalar(scalar);
+}
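+
+// Illustrative sketch of the round trip implemented above, assuming `opts` is
+// a FunctionOptions whose options_type() derives from GenericOptionsType
+// (error handling elided):
+//
+//   const auto* ty =
+//       checked_cast<const GenericOptionsType*>(opts.options_type());
+//   ARROW_ASSIGN_OR_RAISE(auto buf, ty->Serialize(opts));
+//   ARROW_ASSIGN_OR_RAISE(std::unique_ptr<FunctionOptions> back,
+//                         DeserializeFunctionOptions(*buf));
+//   // ty->Compare(opts, *back) should now return true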
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h
new file mode 100644
index 00000000000..fdd7f09ba1f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h
@@ -0,0 +1,626 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/reflection_internal.h"
+#include "arrow/util/string.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+struct Scalar;
+struct StructScalar;
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+template <>
+struct EnumTraits<compute::SortOrder>
+ : BasicEnumTraits<compute::SortOrder, compute::SortOrder::Ascending,
+ compute::SortOrder::Descending> {
+ static std::string name() { return "SortOrder"; }
+ static std::string value_name(compute::SortOrder value) {
+ switch (value) {
+ case compute::SortOrder::Ascending:
+ return "Ascending";
+ case compute::SortOrder::Descending:
+ return "Descending";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
+namespace compute {
+namespace internal {
+
+using arrow::internal::EnumTraits;
+using arrow::internal::has_enum_traits;
+
+template <typename Enum, typename CType = typename std::underlying_type<Enum>::type>
+Result<Enum> ValidateEnumValue(CType raw) {
+ for (auto valid : EnumTraits<Enum>::values()) {
+ if (raw == static_cast<CType>(valid)) {
+ return static_cast<Enum>(raw);
+ }
+ }
+ return Status::Invalid("Invalid value for ", EnumTraits<Enum>::name(), ": ", raw);
+}
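+
+// Illustrative sketch: a raw (e.g. deserialized) integer is checked against
+// the EnumTraits value list before being cast back to the enum:
+//
+//   Result<SortOrder> order = ValidateEnumValue<SortOrder>(0);
+//   // -> SortOrder::Ascending; out-of-range values yield Status::Invalid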
+
+class GenericOptionsType : public FunctionOptionsType {
+ public:
+ Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const override;
+ Result<std::unique_ptr<FunctionOptions>> Deserialize(
+ const Buffer& buffer) const override;
+ virtual Status ToStructScalar(const FunctionOptions& options,
+ std::vector<std::string>* field_names,
+ std::vector<std::shared_ptr<Scalar>>* values) const = 0;
+ virtual Result<std::unique_ptr<FunctionOptions>> FromStructScalar(
+ const StructScalar& scalar) const = 0;
+};
+
+ARROW_EXPORT
+Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar(
+ const FunctionOptions&);
+ARROW_EXPORT
+Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar(
+ const StructScalar&);
+ARROW_EXPORT
+Result<std::unique_ptr<FunctionOptions>> DeserializeFunctionOptions(const Buffer& buffer);
+
+template <typename T>
+static inline enable_if_t<!has_enum_traits<T>::value, std::string> GenericToString(
+ const T& value) {
+ std::stringstream ss;
+ ss << value;
+ return ss.str();
+}
+
+static inline std::string GenericToString(bool value) { return value ? "true" : "false"; }
+
+static inline std::string GenericToString(const std::string& value) {
+ std::stringstream ss;
+ ss << '"' << value << '"';
+ return ss.str();
+}
+
+template <typename T>
+static inline enable_if_t<has_enum_traits<T>::value, std::string> GenericToString(
+ const T value) {
+ return EnumTraits<T>::value_name(value);
+}
+
+template <typename T>
+static inline std::string GenericToString(const std::shared_ptr<T>& value) {
+ return value ? value->ToString() : "<NULLPTR>";
+}
+
+static inline std::string GenericToString(const std::shared_ptr<Scalar>& value) {
+ std::stringstream ss;
+ ss << value->type->ToString() << ":" << value->ToString();
+ return ss.str();
+}
+
+static inline std::string GenericToString(
+ const std::shared_ptr<const KeyValueMetadata>& value) {
+ std::stringstream ss;
+ ss << "KeyValueMetadata{";
+ if (value) {
+ bool first = true;
+ for (const auto& pair : value->sorted_pairs()) {
+ if (!first) ss << ", ";
+ first = false;
+ ss << pair.first << ':' << pair.second;
+ }
+ }
+ ss << '}';
+ return ss.str();
+}
+
+static inline std::string GenericToString(const Datum& value) {
+ switch (value.kind()) {
+ case Datum::NONE:
+ return "<NULL DATUM>";
+ case Datum::SCALAR:
+ return GenericToString(value.scalar());
+ case Datum::ARRAY: {
+ std::stringstream ss;
+ ss << value.type()->ToString() << ':' << value.make_array()->ToString();
+ return ss.str();
+ }
+ case Datum::CHUNKED_ARRAY:
+ case Datum::RECORD_BATCH:
+ case Datum::TABLE:
+ case Datum::COLLECTION:
+ return value.ToString();
+ }
+ return value.ToString();
+}
+
+template <typename T>
+static inline std::string GenericToString(const std::vector<T>& value) {
+ std::stringstream ss;
+ ss << "[";
+ bool first = true;
+ // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis
+ for (auto it = value.begin(); it != value.end(); it++) {
+ if (!first) ss << ", ";
+ first = false;
+ ss << GenericToString(*it);
+ }
+ ss << ']';
+ return ss.str();
+}
+
+static inline std::string GenericToString(SortOrder value) {
+ switch (value) {
+ case SortOrder::Ascending:
+ return "Ascending";
+ case SortOrder::Descending:
+ return "Descending";
+ }
+ return "<INVALID SORT ORDER>";
+}
+
+static inline std::string GenericToString(const std::vector<SortKey>& value) {
+ std::stringstream ss;
+ ss << '[';
+ bool first = true;
+ for (const auto& key : value) {
+ if (!first) {
+ ss << ", ";
+ }
+ first = false;
+ ss << key.ToString();
+ }
+ ss << ']';
+ return ss.str();
+}
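+
+// Illustrative sketch: overload resolution picks the most specific
+// GenericToString above, e.g. (values hypothetical):
+//
+//   GenericToString(std::vector<int64_t>{1, 2, 3});  // "[1, 2, 3]"
+//   GenericToString(std::string("abc"));             // "\"abc\""
+//   GenericToString(SortOrder::Ascending);           // "Ascending"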
+
+template <typename T>
+static inline bool GenericEquals(const T& left, const T& right) {
+ return left == right;
+}
+
+template <typename T>
+static inline bool GenericEquals(const std::shared_ptr<T>& left,
+ const std::shared_ptr<T>& right) {
+ if (left && right) {
+ return left->Equals(*right);
+ }
+ return left == right;
+}
+
+static inline bool IsEmpty(const std::shared_ptr<const KeyValueMetadata>& meta) {
+ return !meta || meta->size() == 0;
+}
+
+static inline bool GenericEquals(const std::shared_ptr<const KeyValueMetadata>& left,
+ const std::shared_ptr<const KeyValueMetadata>& right) {
+ // Special case since null metadata is considered equivalent to empty
+ if (IsEmpty(left) || IsEmpty(right)) {
+ return IsEmpty(left) && IsEmpty(right);
+ }
+ return left->Equals(*right);
+}
+
+template <typename T>
+static inline bool GenericEquals(const std::vector<T>& left,
+ const std::vector<T>& right) {
+ if (left.size() != right.size()) return false;
+ for (size_t i = 0; i < left.size(); i++) {
+ if (!GenericEquals(left[i], right[i])) return false;
+ }
+ return true;
+}
+
+template <typename T>
+static inline decltype(TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton())
+GenericTypeSingleton() {
+ return TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton();
+}
+
+template <typename T>
+static inline enable_if_same<T, std::shared_ptr<const KeyValueMetadata>,
+ std::shared_ptr<DataType>>
+GenericTypeSingleton() {
+ return map(binary(), binary());
+}
+
+template <typename T>
+static inline enable_if_t<has_enum_traits<T>::value, std::shared_ptr<DataType>>
+GenericTypeSingleton() {
+ return TypeTraits<typename EnumTraits<T>::Type>::type_singleton();
+}
+
+template <typename T>
+static inline enable_if_same<T, SortKey, std::shared_ptr<DataType>>
+GenericTypeSingleton() {
+ std::vector<std::shared_ptr<Field>> fields;
+ fields.emplace_back(new Field("name", GenericTypeSingleton<std::string>()));
+ fields.emplace_back(new Field("order", GenericTypeSingleton<SortOrder>()));
+ return std::make_shared<StructType>(std::move(fields));
+}
+
+// N.B. ordering of overloads is relatively fragile
+template <typename T>
+static inline Result<decltype(MakeScalar(std::declval<T>()))> GenericToScalar(
+ const T& value) {
+ return MakeScalar(value);
+}
+
+// For Clang/libc++: when iterating through vector<bool>, we can't
+// pass it by reference so the overload above doesn't apply
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(bool value) {
+ return MakeScalar(value);
+}
+
+template <typename T, typename Enable = enable_if_t<has_enum_traits<T>::value>>
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const T value) {
+ using CType = typename EnumTraits<T>::CType;
+ return GenericToScalar(static_cast<CType>(value));
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const SortKey& value) {
+ ARROW_ASSIGN_OR_RAISE(auto name, GenericToScalar(value.name));
+ ARROW_ASSIGN_OR_RAISE(auto order, GenericToScalar(value.order));
+ return StructScalar::Make({name, order}, {"name", "order"});
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<const KeyValueMetadata>& value) {
+ auto ty = GenericTypeSingleton<std::shared_ptr<const KeyValueMetadata>>();
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(default_memory_pool(), ty, &builder));
+ auto* map_builder = checked_cast<MapBuilder*>(builder.get());
+ auto* key_builder = checked_cast<BinaryBuilder*>(map_builder->key_builder());
+ auto* item_builder = checked_cast<BinaryBuilder*>(map_builder->item_builder());
+ RETURN_NOT_OK(map_builder->Append());
+ if (value) {
+ RETURN_NOT_OK(key_builder->AppendValues(value->keys()));
+ RETURN_NOT_OK(item_builder->AppendValues(value->values()));
+ }
+ std::shared_ptr<Array> arr;
+ RETURN_NOT_OK(map_builder->Finish(&arr));
+ return arr->GetScalar(0);
+}
+
+template <typename T>
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::vector<T>& value) {
+ std::shared_ptr<DataType> type = GenericTypeSingleton<T>();
+ std::vector<std::shared_ptr<Scalar>> scalars;
+ scalars.reserve(value.size());
+ // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis
+ for (auto it = value.begin(); it != value.end(); it++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, GenericToScalar(*it));
+ scalars.push_back(std::move(scalar));
+ }
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(
+ MakeBuilder(default_memory_pool(), type ? type : scalars[0]->type, &builder));
+ RETURN_NOT_OK(builder->AppendScalars(scalars));
+ std::shared_ptr<Array> out;
+ RETURN_NOT_OK(builder->Finish(&out));
+ return std::make_shared<ListScalar>(std::move(out));
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<DataType>& value) {
+ if (!value) {
+ return Status::Invalid("shared_ptr<DataType> is nullptr");
+ }
+ return MakeNullScalar(value);
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<Scalar>& value) {
+ return value;
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<Array>& value) {
+ return std::make_shared<ListScalar>(value);
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const Datum& value) {
+ // TODO(ARROW-9434): store in a union instead.
+ switch (value.kind()) {
+ case Datum::ARRAY:
+ return GenericToScalar(value.make_array());
+ default:
+ return Status::NotImplemented("Cannot serialize Datum kind ", value.kind());
+ }
+}
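+
+// Illustrative sketch: per the overloads above, a vector becomes a ListScalar
+// wrapping an array of its elements:
+//
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+//                         GenericToScalar(std::vector<int64_t>{1, 2, 3}));
+//   // scalar is a ListScalar over an int64 array of length 3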
+
+template <typename T>
+static inline enable_if_primitive_ctype<typename CTypeTraits<T>::ArrowType, Result<T>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ using ArrowType = typename CTypeTraits<T>::ArrowType;
+ using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
+ if (value->type->id() != ArrowType::type_id) {
+ return Status::Invalid("Expected type ", ArrowType::type_id, " but got ",
+ value->type->ToString());
+ }
+ const auto& holder = checked_cast<const ScalarType&>(*value);
+ if (!holder.is_valid) return Status::Invalid("Got null scalar");
+ return holder.value;
+}
+
+template <typename T>
+static inline enable_if_primitive_ctype<typename EnumTraits<T>::Type, Result<T>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ ARROW_ASSIGN_OR_RAISE(auto raw_val,
+ GenericFromScalar<typename EnumTraits<T>::CType>(value));
+ return ValidateEnumValue<T>(raw_val);
+}
+
+template <typename T, typename U>
+using enable_if_same_result = enable_if_same<T, U, Result<T>>;
+
+template <typename T>
+static inline enable_if_same_result<T, std::string> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ if (!is_base_binary_like(value->type->id())) {
+ return Status::Invalid("Expected binary-like type but got ", value->type->ToString());
+ }
+ const auto& holder = checked_cast<const BaseBinaryScalar&>(*value);
+ if (!holder.is_valid) return Status::Invalid("Got null scalar");
+ return holder.value->ToString();
+}
+
+template <typename T>
+static inline enable_if_same_result<T, SortKey> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ if (value->type->id() != Type::STRUCT) {
+ return Status::Invalid("Expected type STRUCT but got ", value->type->id());
+ }
+ if (!value->is_valid) return Status::Invalid("Got null scalar");
+ const auto& holder = checked_cast<const StructScalar&>(*value);
+ ARROW_ASSIGN_OR_RAISE(auto name_holder, holder.field("name"));
+ ARROW_ASSIGN_OR_RAISE(auto order_holder, holder.field("order"));
+ ARROW_ASSIGN_OR_RAISE(auto name, GenericFromScalar<std::string>(name_holder));
+ ARROW_ASSIGN_OR_RAISE(auto order, GenericFromScalar<SortOrder>(order_holder));
+ return SortKey{std::move(name), order};
+}
+
+template <typename T>
+static inline enable_if_same_result<T, std::shared_ptr<DataType>> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ return value->type;
+}
+
+template <typename T>
+static inline enable_if_same_result<T, std::shared_ptr<Scalar>> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ return value;
+}
+
+template <typename T>
+static inline enable_if_same_result<T, std::shared_ptr<const KeyValueMetadata>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ auto ty = GenericTypeSingleton<std::shared_ptr<const KeyValueMetadata>>();
+ if (!value->type->Equals(ty)) {
+ return Status::Invalid("Expected ", ty->ToString(), " but got ",
+ value->type->ToString());
+ }
+ const auto& holder = checked_cast<const MapScalar&>(*value);
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ const auto& list = checked_cast<const StructArray&>(*holder.value);
+ const auto& key_arr = checked_cast<const BinaryArray&>(*list.field(0));
+ const auto& value_arr = checked_cast<const BinaryArray&>(*list.field(1));
+ for (int64_t i = 0; i < list.length(); i++) {
+ keys.push_back(key_arr.GetString(i));
+ values.push_back(value_arr.GetString(i));
+ }
+ return key_value_metadata(std::move(keys), std::move(values));
+}
+
+template <typename T>
+static inline enable_if_same_result<T, Datum> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ if (value->type->id() == Type::LIST) {
+ const auto& holder = checked_cast<const BaseListScalar&>(*value);
+ return holder.value;
+ }
+ // TODO(ARROW-9434): handle other possible datum kinds by looking for a union
+ return Status::Invalid("Cannot deserialize Datum from ", value->ToString());
+}
+
+template <typename T>
+static enable_if_same<typename CTypeTraits<T>::ArrowType, ListType, Result<T>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ using ValueType = typename T::value_type;
+ if (value->type->id() != Type::LIST) {
+ return Status::Invalid("Expected type LIST but got ", value->type->ToString());
+ }
+ const auto& holder = checked_cast<const BaseListScalar&>(*value);
+ if (!holder.is_valid) return Status::Invalid("Got null scalar");
+ std::vector<ValueType> result;
+ for (int i = 0; i < holder.value->length(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, holder.value->GetScalar(i));
+ ARROW_ASSIGN_OR_RAISE(auto v, GenericFromScalar<ValueType>(scalar));
+ result.push_back(std::move(v));
+ }
+ return result;
+}
+
+template <typename Options>
+struct StringifyImpl {
+ template <typename Tuple>
+ StringifyImpl(const Options& obj, const Tuple& props)
+ : obj_(obj), members_(props.size()) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t i) {
+ std::stringstream ss;
+ ss << prop.name() << '=' << GenericToString(prop.get(obj_));
+ members_[i] = ss.str();
+ }
+
+ std::string Finish() {
+ return "{" + arrow::internal::JoinStrings(members_, ", ") + "}";
+ }
+
+ const Options& obj_;
+ std::vector<std::string> members_;
+};
+
+template <typename Options>
+struct CompareImpl {
+ template <typename Tuple>
+ CompareImpl(const Options& l, const Options& r, const Tuple& props)
+ : left_(l), right_(r) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t) {
+ equal_ &= GenericEquals(prop.get(left_), prop.get(right_));
+ }
+
+ const Options& left_;
+ const Options& right_;
+ bool equal_ = true;
+};
+
+template <typename Options>
+struct ToStructScalarImpl {
+ template <typename Tuple>
+ ToStructScalarImpl(const Options& obj, const Tuple& props,
+ std::vector<std::string>* field_names,
+ std::vector<std::shared_ptr<Scalar>>* values)
+ : obj_(obj), field_names_(field_names), values_(values) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t) {
+ if (!status_.ok()) return;
+ auto result = GenericToScalar(prop.get(obj_));
+ if (!result.ok()) {
+ status_ = result.status().WithMessage("Could not serialize field ", prop.name(),
+ " of options type ", Options::kTypeName, ": ",
+ result.status().message());
+ return;
+ }
+ field_names_->emplace_back(prop.name());
+ values_->push_back(result.MoveValueUnsafe());
+ }
+
+ const Options& obj_;
+ Status status_;
+ std::vector<std::string>* field_names_;
+ std::vector<std::shared_ptr<Scalar>>* values_;
+};
+
+template <typename Options>
+struct FromStructScalarImpl {
+ template <typename Tuple>
+ FromStructScalarImpl(Options* obj, const StructScalar& scalar, const Tuple& props)
+ : obj_(obj), scalar_(scalar) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t) {
+ if (!status_.ok()) return;
+ auto maybe_holder = scalar_.field(std::string(prop.name()));
+ if (!maybe_holder.ok()) {
+ status_ = maybe_holder.status().WithMessage(
+ "Cannot deserialize field ", prop.name(), " of options type ",
+ Options::kTypeName, ": ", maybe_holder.status().message());
+ return;
+ }
+ auto holder = maybe_holder.MoveValueUnsafe();
+ auto result = GenericFromScalar<typename Property::Type>(holder);
+ if (!result.ok()) {
+ status_ = result.status().WithMessage("Cannot deserialize field ", prop.name(),
+ " of options type ", Options::kTypeName, ": ",
+ result.status().message());
+ return;
+ }
+ prop.set(obj_, result.MoveValueUnsafe());
+ }
+
+ Options* obj_;
+ Status status_;
+ const StructScalar& scalar_;
+};
+
+template <typename Options, typename... Properties>
+const FunctionOptionsType* GetFunctionOptionsType(const Properties&... properties) {
+ static const class OptionsType : public GenericOptionsType {
+ public:
+ explicit OptionsType(const arrow::internal::PropertyTuple<Properties...> properties)
+ : properties_(properties) {}
+
+ const char* type_name() const override { return Options::kTypeName; }
+
+ std::string Stringify(const FunctionOptions& options) const override {
+ const auto& self = checked_cast<const Options&>(options);
+ return StringifyImpl<Options>(self, properties_).Finish();
+ }
+ bool Compare(const FunctionOptions& options,
+ const FunctionOptions& other) const override {
+ const auto& lhs = checked_cast<const Options&>(options);
+ const auto& rhs = checked_cast<const Options&>(other);
+ return CompareImpl<Options>(lhs, rhs, properties_).equal_;
+ }
+ Status ToStructScalar(const FunctionOptions& options,
+ std::vector<std::string>* field_names,
+ std::vector<std::shared_ptr<Scalar>>* values) const override {
+ const auto& self = checked_cast<const Options&>(options);
+ RETURN_NOT_OK(
+ ToStructScalarImpl<Options>(self, properties_, field_names, values).status_);
+ return Status::OK();
+ }
+ Result<std::unique_ptr<FunctionOptions>> FromStructScalar(
+ const StructScalar& scalar) const override {
+ auto options = std::unique_ptr<Options>(new Options());
+ RETURN_NOT_OK(
+ FromStructScalarImpl<Options>(options.get(), scalar, properties_).status_);
+ return std::move(options);
+ }
+
+ private:
+ const arrow::internal::PropertyTuple<Properties...> properties_;
+ } instance(arrow::internal::MakeProperties(properties...));
+ return &instance;
+}
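+
+// Illustrative sketch: an options class typically exposes its reflection info
+// through this helper. HypotheticalOptions and its members are assumptions
+// for illustration; real options classes follow the same pattern:
+//
+//   static const auto* kHypotheticalOptionsType =
+//       GetFunctionOptionsType<HypotheticalOptions>(
+//           arrow::internal::DataMember("limit", &HypotheticalOptions::limit),
+//           arrow::internal::DataMember("skip_nulls",
+//                                       &HypotheticalOptions::skip_nulls));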
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc
new file mode 100644
index 00000000000..f131f524d2e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc
@@ -0,0 +1,486 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/kernel.h"
+
+#include <cstddef>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/util_internal.h"
+#include "arrow/result.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hash_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::hash_combine;
+
+static constexpr size_t kHashSeed = 0;
+
+namespace compute {
+
+// ----------------------------------------------------------------------
+// KernelContext
+
+Result<std::shared_ptr<ResizableBuffer>> KernelContext::Allocate(int64_t nbytes) {
+ return AllocateResizableBuffer(nbytes, exec_ctx_->memory_pool());
+}
+
+Result<std::shared_ptr<ResizableBuffer>> KernelContext::AllocateBitmap(int64_t num_bits) {
+ const int64_t nbytes = BitUtil::BytesForBits(num_bits);
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> result,
+ AllocateResizableBuffer(nbytes, exec_ctx_->memory_pool()));
+ // Since bitmaps are typically written bit by bit, we could leak uninitialized bits.
+ // Make sure all memory is initialized (this also appeases Valgrind).
+ internal::ZeroMemory(result.get());
+ return result;
+}
+
+Status Kernel::InitAll(KernelContext* ctx, const KernelInitArgs& args,
+ std::vector<std::unique_ptr<KernelState>>* states) {
+ for (auto& state : *states) {
+ ARROW_ASSIGN_OR_RAISE(state, args.kernel->init(ctx, args));
+ }
+ return Status::OK();
+}
+
+Result<std::unique_ptr<KernelState>> ScalarAggregateKernel::MergeAll(
+ const ScalarAggregateKernel* kernel, KernelContext* ctx,
+ std::vector<std::unique_ptr<KernelState>> states) {
+ auto out = std::move(states.back());
+ states.pop_back();
+ ctx->SetState(out.get());
+ for (auto& state : states) {
+ RETURN_NOT_OK(kernel->merge(ctx, std::move(*state), out.get()));
+ }
+ return std::move(out);
+}
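+
+// Illustrative sketch: merging per-thread aggregation states into one final
+// state (`kernel`, `ctx`, and `states` assumed to come from parallel
+// execution; error handling elided):
+//
+//   ARROW_ASSIGN_OR_RAISE(std::unique_ptr<KernelState> merged,
+//                         ScalarAggregateKernel::MergeAll(kernel, ctx,
+//                                                         std::move(states)));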
+
+// ----------------------------------------------------------------------
+// Some basic TypeMatcher implementations
+
+namespace match {
+
+class SameTypeIdMatcher : public TypeMatcher {
+ public:
+ explicit SameTypeIdMatcher(Type::type accepted_id) : accepted_id_(accepted_id) {}
+
+ bool Matches(const DataType& type) const override { return type.id() == accepted_id_; }
+
+ std::string ToString() const override {
+ std::stringstream ss;
+ ss << "Type::" << ::arrow::internal::ToString(accepted_id_);
+ return ss.str();
+ }
+
+ bool Equals(const TypeMatcher& other) const override {
+ if (this == &other) {
+ return true;
+ }
+ auto casted = dynamic_cast<const SameTypeIdMatcher*>(&other);
+ if (casted == nullptr) {
+ return false;
+ }
+ return this->accepted_id_ == casted->accepted_id_;
+ }
+
+ private:
+ Type::type accepted_id_;
+};
+
+std::shared_ptr<TypeMatcher> SameTypeId(Type::type type_id) {
+ return std::make_shared<SameTypeIdMatcher>(type_id);
+}
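+
+// Illustrative sketch: a SameTypeIdMatcher matches on the type id alone, so
+// any parameterization of that type is accepted:
+//
+//   auto matcher = SameTypeId(Type::TIMESTAMP);
+//   matcher->Matches(*timestamp(TimeUnit::MILLI, "UTC"));  // true
+//   matcher->Matches(*timestamp(TimeUnit::SECOND));        // true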
+
+template <typename ArrowType>
+class TimeUnitMatcher : public TypeMatcher {
+ using ThisType = TimeUnitMatcher<ArrowType>;
+
+ public:
+ explicit TimeUnitMatcher(TimeUnit::type accepted_unit)
+ : accepted_unit_(accepted_unit) {}
+
+ bool Matches(const DataType& type) const override {
+ if (type.id() != ArrowType::type_id) {
+ return false;
+ }
+ const auto& time_type = checked_cast<const ArrowType&>(type);
+ return time_type.unit() == accepted_unit_;
+ }
+
+ bool Equals(const TypeMatcher& other) const override {
+ if (this == &other) {
+ return true;
+ }
+ auto casted = dynamic_cast<const ThisType*>(&other);
+ if (casted == nullptr) {
+ return false;
+ }
+ return this->accepted_unit_ == casted->accepted_unit_;
+ }
+
+ std::string ToString() const override {
+ std::stringstream ss;
+ ss << ArrowType::type_name() << "(" << ::arrow::internal::ToString(accepted_unit_)
+ << ")";
+ return ss.str();
+ }
+
+ private:
+ TimeUnit::type accepted_unit_;
+};
+
+using DurationTypeUnitMatcher = TimeUnitMatcher<DurationType>;
+using Time32TypeUnitMatcher = TimeUnitMatcher<Time32Type>;
+using Time64TypeUnitMatcher = TimeUnitMatcher<Time64Type>;
+using TimestampTypeUnitMatcher = TimeUnitMatcher<TimestampType>;
+
+std::shared_ptr<TypeMatcher> TimestampTypeUnit(TimeUnit::type unit) {
+ return std::make_shared<TimestampTypeUnitMatcher>(unit);
+}
+
+std::shared_ptr<TypeMatcher> Time32TypeUnit(TimeUnit::type unit) {
+ return std::make_shared<Time32TypeUnitMatcher>(unit);
+}
+
+std::shared_ptr<TypeMatcher> Time64TypeUnit(TimeUnit::type unit) {
+ return std::make_shared<Time64TypeUnitMatcher>(unit);
+}
+
+std::shared_ptr<TypeMatcher> DurationTypeUnit(TimeUnit::type unit) {
+ return std::make_shared<DurationTypeUnitMatcher>(unit);
+}
+
+class IntegerMatcher : public TypeMatcher {
+ public:
+ IntegerMatcher() {}
+
+ bool Matches(const DataType& type) const override { return is_integer(type.id()); }
+
+ bool Equals(const TypeMatcher& other) const override {
+ if (this == &other) {
+ return true;
+ }
+ auto casted = dynamic_cast<const IntegerMatcher*>(&other);
+ return casted != nullptr;
+ }
+
+ std::string ToString() const override { return "integer"; }
+};
+
+std::shared_ptr<TypeMatcher> Integer() { return std::make_shared<IntegerMatcher>(); }
+
+class PrimitiveMatcher : public TypeMatcher {
+ public:
+ PrimitiveMatcher() {}
+
+ bool Matches(const DataType& type) const override { return is_primitive(type.id()); }
+
+ bool Equals(const TypeMatcher& other) const override {
+ if (this == &other) {
+ return true;
+ }
+ auto casted = dynamic_cast<const PrimitiveMatcher*>(&other);
+ return casted != nullptr;
+ }
+
+ std::string ToString() const override { return "primitive"; }
+};
+
+std::shared_ptr<TypeMatcher> Primitive() { return std::make_shared<PrimitiveMatcher>(); }
+
+class BinaryLikeMatcher : public TypeMatcher {
+ public:
+ BinaryLikeMatcher() {}
+
+ bool Matches(const DataType& type) const override { return is_binary_like(type.id()); }
+
+ bool Equals(const TypeMatcher& other) const override {
+ if (this == &other) {
+ return true;
+ }
+ auto casted = dynamic_cast<const BinaryLikeMatcher*>(&other);
+ return casted != nullptr;
+ }
+ std::string ToString() const override { return "binary-like"; }
+};
+
+std::shared_ptr<TypeMatcher> BinaryLike() {
+ return std::make_shared<BinaryLikeMatcher>();
+}
+
+class LargeBinaryLikeMatcher : public TypeMatcher {
+ public:
+ LargeBinaryLikeMatcher() {}
+
+ bool Matches(const DataType& type) const override {
+ return is_large_binary_like(type.id());
+ }
+
+ bool Equals(const TypeMatcher& other) const override {
+ if (this == &other) {
+ return true;
+ }
+ auto casted = dynamic_cast<const LargeBinaryLikeMatcher*>(&other);
+ return casted != nullptr;
+ }
+ std::string ToString() const override { return "large-binary-like"; }
+};
+
+std::shared_ptr<TypeMatcher> LargeBinaryLike() {
+ return std::make_shared<LargeBinaryLikeMatcher>();
+}
+
+} // namespace match
+
+// ----------------------------------------------------------------------
+// InputType
+
+size_t InputType::Hash() const {
+ size_t result = kHashSeed;
+ hash_combine(result, static_cast<int>(shape_));
+ hash_combine(result, static_cast<int>(kind_));
+ switch (kind_) {
+ case InputType::EXACT_TYPE:
+ hash_combine(result, type_->Hash());
+ break;
+ default:
+ break;
+ }
+ return result;
+}
+
+std::string InputType::ToString() const {
+ std::stringstream ss;
+ switch (shape_) {
+ case ValueDescr::ANY:
+ ss << "any";
+ break;
+ case ValueDescr::ARRAY:
+ ss << "array";
+ break;
+ case ValueDescr::SCALAR:
+ ss << "scalar";
+ break;
+ default:
+ DCHECK(false);
+ break;
+ }
+ ss << "[";
+ switch (kind_) {
+ case InputType::ANY_TYPE:
+ ss << "any";
+ break;
+ case InputType::EXACT_TYPE:
+ ss << type_->ToString();
+ break;
+ case InputType::USE_TYPE_MATCHER: {
+ ss << type_matcher_->ToString();
+ } break;
+ default:
+ DCHECK(false);
+ break;
+ }
+ ss << "]";
+ return ss.str();
+}
+
+bool InputType::Equals(const InputType& other) const {
+ if (this == &other) {
+ return true;
+ }
+ if (kind_ != other.kind_ || shape_ != other.shape_) {
+ return false;
+ }
+ switch (kind_) {
+ case InputType::ANY_TYPE:
+ return true;
+ case InputType::EXACT_TYPE:
+ return type_->Equals(*other.type_);
+ case InputType::USE_TYPE_MATCHER:
+ return type_matcher_->Equals(*other.type_matcher_);
+ default:
+ return false;
+ }
+}
+
+bool InputType::Matches(const ValueDescr& descr) const {
+ if (shape_ != ValueDescr::ANY && descr.shape != shape_) {
+ return false;
+ }
+ switch (kind_) {
+ case InputType::EXACT_TYPE:
+ return type_->Equals(*descr.type);
+ case InputType::USE_TYPE_MATCHER:
+ return type_matcher_->Matches(*descr.type);
+ default:
+ // ANY_TYPE
+ return true;
+ }
+}
+
+bool InputType::Matches(const Datum& value) const { return Matches(value.descr()); }
+
+const std::shared_ptr<DataType>& InputType::type() const {
+ DCHECK_EQ(InputType::EXACT_TYPE, kind_);
+ return type_;
+}
+
+const TypeMatcher& InputType::type_matcher() const {
+ DCHECK_EQ(InputType::USE_TYPE_MATCHER, kind_);
+ return *type_matcher_;
+}
+
+// ----------------------------------------------------------------------
+// OutputType
+
+OutputType::OutputType(ValueDescr descr) : OutputType(descr.type) {
+ shape_ = descr.shape;
+}
+
+Result<ValueDescr> OutputType::Resolve(KernelContext* ctx,
+ const std::vector<ValueDescr>& args) const {
+ ValueDescr::Shape broadcasted_shape = GetBroadcastShape(args);
+ if (kind_ == OutputType::FIXED) {
+ return ValueDescr(type_, shape_ == ValueDescr::ANY ? broadcasted_shape : shape_);
+ } else {
+ ARROW_ASSIGN_OR_RAISE(ValueDescr resolved_descr, resolver_(ctx, args));
+ if (resolved_descr.shape == ValueDescr::ANY) {
+ resolved_descr.shape = broadcasted_shape;
+ }
+ return resolved_descr;
+ }
+}
+
+const std::shared_ptr<DataType>& OutputType::type() const {
+ DCHECK_EQ(FIXED, kind_);
+ return type_;
+}
+
+const OutputType::Resolver& OutputType::resolver() const {
+ DCHECK_EQ(COMPUTED, kind_);
+ return resolver_;
+}
+
+std::string OutputType::ToString() const {
+ if (kind_ == OutputType::FIXED) {
+ return type_->ToString();
+ } else {
+ return "computed";
+ }
+}
+
+// ----------------------------------------------------------------------
+// KernelSignature
+
+KernelSignature::KernelSignature(std::vector<InputType> in_types, OutputType out_type,
+ bool is_varargs)
+ : in_types_(std::move(in_types)),
+ out_type_(std::move(out_type)),
+ is_varargs_(is_varargs),
+ hash_code_(0) {
+ DCHECK(!is_varargs || in_types_.size() >= 1);
+}
+
+std::shared_ptr<KernelSignature> KernelSignature::Make(std::vector<InputType> in_types,
+ OutputType out_type,
+ bool is_varargs) {
+ return std::make_shared<KernelSignature>(std::move(in_types), std::move(out_type),
+ is_varargs);
+}
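+
+// Illustrative sketch: a signature for a binary function on int64 inputs
+// (shapes default to ANY):
+//
+//   auto sig = KernelSignature::Make({int64(), int64()}, int64(),
+//                                    /*is_varargs=*/false);
+//   // sig->ToString() -> "(any[int64], any[int64]) -> int64"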
+
+bool KernelSignature::Equals(const KernelSignature& other) const {
+ if (is_varargs_ != other.is_varargs_) {
+ return false;
+ }
+ if (in_types_.size() != other.in_types_.size()) {
+ return false;
+ }
+ for (size_t i = 0; i < in_types_.size(); ++i) {
+ if (!in_types_[i].Equals(other.in_types_[i])) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool KernelSignature::MatchesInputs(const std::vector<ValueDescr>& args) const {
+ if (is_varargs_) {
+ for (size_t i = 0; i < args.size(); ++i) {
+ if (!in_types_[std::min(i, in_types_.size() - 1)].Matches(args[i])) {
+ return false;
+ }
+ }
+ } else {
+ if (args.size() != in_types_.size()) {
+ return false;
+ }
+ for (size_t i = 0; i < in_types_.size(); ++i) {
+ if (!in_types_[i].Matches(args[i])) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+size_t KernelSignature::Hash() const {
+ if (hash_code_ != 0) {
+ return hash_code_;
+ }
+ size_t result = kHashSeed;
+ for (const auto& in_type : in_types_) {
+ hash_combine(result, in_type.Hash());
+ }
+ hash_code_ = result;
+ return result;
+}
+
+std::string KernelSignature::ToString() const {
+ std::stringstream ss;
+
+ if (is_varargs_) {
+ ss << "varargs[";
+ } else {
+ ss << "(";
+ }
+ for (size_t i = 0; i < in_types_.size(); ++i) {
+ if (i > 0) {
+ ss << ", ";
+ }
+ ss << in_types_[i].ToString();
+ }
+ if (is_varargs_) {
+ ss << "]";
+ } else {
+ ss << ")";
+ }
+ ss << " -> " << out_type_.ToString();
+ return ss.str();
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h
new file mode 100644
index 00000000000..36d20c7289e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h
@@ -0,0 +1,739 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/exec.h"
+#include "arrow/datum.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+class FunctionOptions;
+
+/// \brief Base class for opaque kernel-specific state, for example any state
+/// that a kernel's initialization step needs to set up before execution.
+struct ARROW_EXPORT KernelState {
+ virtual ~KernelState() = default;
+};
+
+/// \brief Context/state for the execution of a particular kernel.
+class ARROW_EXPORT KernelContext {
+ public:
+ explicit KernelContext(ExecContext* exec_ctx) : exec_ctx_(exec_ctx), state_() {}
+
+ /// \brief Allocate buffer from the context's memory pool. The contents are
+ /// not initialized.
+ Result<std::shared_ptr<ResizableBuffer>> Allocate(int64_t nbytes);
+
+ /// \brief Allocate a buffer for a bitmap from the context's memory pool.
+ /// Unlike Allocate, the contents of the buffer are zero-initialized, since
+ /// bitmaps are typically written bit by bit and uninitialized memory could
+ /// otherwise leak (this also appeases ASAN and Valgrind).
+ Result<std::shared_ptr<ResizableBuffer>> AllocateBitmap(int64_t num_bits);
+
+ /// \brief Assign the active KernelState to be utilized for each stage of
+ /// kernel execution. Ownership and memory lifetime of the KernelState must
+ /// be minded separately.
+ void SetState(KernelState* state) { state_ = state; }
+
+ KernelState* state() { return state_; }
+
+ /// \brief Configuration related to function execution that is to be shared
+ /// across multiple kernels.
+ ExecContext* exec_context() { return exec_ctx_; }
+
+ /// \brief The memory pool to use for allocations. For now, it uses the
+ /// MemoryPool contained in the ExecContext used to create the KernelContext.
+ MemoryPool* memory_pool() { return exec_ctx_->memory_pool(); }
+
+ private:
+ ExecContext* exec_ctx_;
+ KernelState* state_;
+};
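+
+// Illustrative sketch: allocating a zero-initialized validity bitmap inside a
+// kernel exec (`ctx` and `batch` are the arguments passed to the kernel;
+// error handling elided):
+//
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> validity,
+//                         ctx->AllocateBitmap(batch.length));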
+
+/// \brief The standard kernel execution API that must be implemented for
+/// SCALAR and VECTOR kernel types. This includes both stateless and stateful
+/// kernels. Kernels depending on some execution state access that state via
+/// subclasses of KernelState set on the KernelContext object. May be used for
+/// SCALAR and VECTOR kernel kinds. Implementations should endeavor to write
+/// into pre-allocated memory if they are able, though for some kernels
+/// (e.g. when a builder like StringBuilder must be employed) this may not be
+/// possible.
+using ArrayKernelExec = std::function<Status(KernelContext*, const ExecBatch&, Datum*)>;
+
+/// \brief A type-checking interface to permit customizable validation rules
+/// for use with InputType and KernelSignature. This is for scenarios where the
+/// acceptance is not an exact type instance, such as a TIMESTAMP type for a
+/// specific TimeUnit, but permitting any time zone.
+struct ARROW_EXPORT TypeMatcher {
+ virtual ~TypeMatcher() = default;
+
+ /// \brief Return true if this matcher accepts the data type.
+ virtual bool Matches(const DataType& type) const = 0;
+
+ /// \brief A human-interpretable string representation of what the type
+ /// matcher checks for, usable when printing KernelSignature or formatting
+ /// error messages.
+ virtual std::string ToString() const = 0;
+
+ /// \brief Return true if this TypeMatcher contains the same matching rule as
+ /// the other. Currently depends on RTTI.
+ virtual bool Equals(const TypeMatcher& other) const = 0;
+};
+
+namespace match {
+
+/// \brief Match any DataType instance having the same DataType::id.
+ARROW_EXPORT std::shared_ptr<TypeMatcher> SameTypeId(Type::type type_id);
+
+/// \brief Match any instance of the respective temporal type (timestamp,
+/// time32, time64, duration) having the given unit; for timestamps, the time
+/// zones may differ.
+ARROW_EXPORT std::shared_ptr<TypeMatcher> TimestampTypeUnit(TimeUnit::type unit);
+ARROW_EXPORT std::shared_ptr<TypeMatcher> Time32TypeUnit(TimeUnit::type unit);
+ARROW_EXPORT std::shared_ptr<TypeMatcher> Time64TypeUnit(TimeUnit::type unit);
+ARROW_EXPORT std::shared_ptr<TypeMatcher> DurationTypeUnit(TimeUnit::type unit);
+
+/// \brief Match any integer type.
+ARROW_EXPORT std::shared_ptr<TypeMatcher> Integer();
+
+/// \brief Match types using the 32-bit varbinary representation.
+ARROW_EXPORT std::shared_ptr<TypeMatcher> BinaryLike();
+
+/// \brief Match types using the 64-bit varbinary representation.
+ARROW_EXPORT std::shared_ptr<TypeMatcher> LargeBinaryLike();
+
+/// \brief Match any primitive type (boolean, or any type representable as a
+/// C type).
+ARROW_EXPORT std::shared_ptr<TypeMatcher> Primitive();
+
+} // namespace match
+
+/// \brief An object used for type- and shape-checking arguments to be passed
+/// to a kernel and stored in a KernelSignature. Distinguishes between ARRAY
+/// and SCALAR arguments using ValueDescr::Shape. The type-checking rule can be
+/// supplied either with an exact DataType instance or a custom TypeMatcher.
+class ARROW_EXPORT InputType {
+ public:
+ /// \brief The kind of type-checking rule that the InputType contains.
+ enum Kind {
+ /// \brief Accept any value type.
+ ANY_TYPE,
+
+ /// \brief A fixed arrow::DataType; only values having this exact type
+ /// (e.g. the same TimestampType unit, the same decimal scale and precision,
+ /// or the same nested child types) will match.
+ EXACT_TYPE,
+
+ /// \brief Uses a TypeMatcher implementation to check the type.
+ USE_TYPE_MATCHER
+ };
+
+ /// \brief Accept any value type but with a specific shape (e.g. any Array or
+ /// any Scalar).
+ InputType(ValueDescr::Shape shape = ValueDescr::ANY) // NOLINT implicit construction
+ : kind_(ANY_TYPE), shape_(shape) {}
+
+ /// \brief Accept an exact value type.
+ InputType(std::shared_ptr<DataType> type, // NOLINT implicit construction
+ ValueDescr::Shape shape = ValueDescr::ANY)
+ : kind_(EXACT_TYPE), shape_(shape), type_(std::move(type)) {}
+
+ /// \brief Accept an exact value type and shape provided by a ValueDescr.
+ InputType(const ValueDescr& descr) // NOLINT implicit construction
+ : InputType(descr.type, descr.shape) {}
+
+ /// \brief Use the passed TypeMatcher to type check.
+ InputType(std::shared_ptr<TypeMatcher> type_matcher, // NOLINT implicit construction
+ ValueDescr::Shape shape = ValueDescr::ANY)
+ : kind_(USE_TYPE_MATCHER), shape_(shape), type_matcher_(std::move(type_matcher)) {}
+
+ /// \brief Match any type with the given Type::type. Uses a TypeMatcher for
+ /// its implementation.
+ explicit InputType(Type::type type_id, ValueDescr::Shape shape = ValueDescr::ANY)
+ : InputType(match::SameTypeId(type_id), shape) {}
+
+ InputType(const InputType& other) { CopyInto(other); }
+
+ void operator=(const InputType& other) { CopyInto(other); }
+
+ InputType(InputType&& other) { MoveInto(std::forward<InputType>(other)); }
+
+ void operator=(InputType&& other) { MoveInto(std::forward<InputType>(other)); }
+
+ /// \brief Match an array with the given exact type. Convenience constructor.
+ static InputType Array(std::shared_ptr<DataType> type) {
+ return InputType(std::move(type), ValueDescr::ARRAY);
+ }
+
+ /// \brief Match a scalar with the given exact type. Convenience constructor.
+ static InputType Scalar(std::shared_ptr<DataType> type) {
+ return InputType(std::move(type), ValueDescr::SCALAR);
+ }
+
+ /// \brief Match an array with the given Type::type id. Convenience
+ /// constructor.
+ static InputType Array(Type::type id) { return InputType(id, ValueDescr::ARRAY); }
+
+ /// \brief Match a scalar with the given Type::type id. Convenience
+ /// constructor.
+ static InputType Scalar(Type::type id) { return InputType(id, ValueDescr::SCALAR); }
+
+ /// \brief Return true if this input type matches the same type cases as the
+ /// other.
+ bool Equals(const InputType& other) const;
+
+ bool operator==(const InputType& other) const { return this->Equals(other); }
+
+ bool operator!=(const InputType& other) const { return !(*this == other); }
+
+ /// \brief Return hash code.
+ size_t Hash() const;
+
+ /// \brief Render a human-readable string representation.
+ std::string ToString() const;
+
+ /// \brief Return true if the value matches this argument kind in type
+ /// and shape.
+ bool Matches(const Datum& value) const;
+
+ /// \brief Return true if the value descriptor matches this argument kind in
+ /// type and shape.
+ bool Matches(const ValueDescr& value) const;
+
+ /// \brief The type matching rule that this InputType uses.
+ Kind kind() const { return kind_; }
+
+ /// \brief Indicates whether this InputType matches Array (ValueDescr::ARRAY)
+ /// values, Scalar (ValueDescr::SCALAR) values, or both (ValueDescr::ANY).
+ ValueDescr::Shape shape() const { return shape_; }
+
+ /// \brief For InputType::EXACT_TYPE kind, the exact type that this InputType
+ /// must match. Otherwise this function should not be used and will assert in
+ /// debug builds.
+ const std::shared_ptr<DataType>& type() const;
+
+ /// \brief For InputType::USE_TYPE_MATCHER, the TypeMatcher to be used for
+ /// checking the type of a value. Otherwise this function should not be used
+ /// and will assert in debug builds.
+ const TypeMatcher& type_matcher() const;
+
+ private:
+ void CopyInto(const InputType& other) {
+ this->kind_ = other.kind_;
+ this->shape_ = other.shape_;
+ this->type_ = other.type_;
+ this->type_matcher_ = other.type_matcher_;
+ }
+
+ void MoveInto(InputType&& other) {
+ this->kind_ = other.kind_;
+ this->shape_ = other.shape_;
+ this->type_ = std::move(other.type_);
+ this->type_matcher_ = std::move(other.type_matcher_);
+ }
+
+ Kind kind_;
+
+ ValueDescr::Shape shape_ = ValueDescr::ANY;
+
+ // For EXACT_TYPE Kind
+ std::shared_ptr<DataType> type_;
+
+ // For USE_TYPE_MATCHER Kind
+ std::shared_ptr<TypeMatcher> type_matcher_;
+};
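+
+// A minimal usage sketch (illustrative only; the variable names below are
+// hypothetical): the four ways of constructing an argument matcher.
+//
+//   InputType exact(int32());                      // EXACT_TYPE, any shape
+//   InputType by_id(Type::TIMESTAMP);              // USE_TYPE_MATCHER via SameTypeId
+//   InputType arr = InputType::Array(utf8());      // exact type, ARRAY shape only
+//   InputType scl = InputType::Scalar(boolean());  // exact type, SCALAR shape only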
+
+/// \brief Container to capture both exact and input-dependent output types.
+///
+/// For a fixed output type, the value shape returned by Resolve is determined
+/// by broadcasting the shapes of the input arguments; otherwise the shape is
+/// handled by the user-defined resolver function:
+///
+/// * Any ARRAY shape -> output shape is ARRAY
+/// * All SCALAR shapes -> output shape is SCALAR
+class ARROW_EXPORT OutputType {
+ public:
+ /// \brief An enum indicating whether the value type is an invariant fixed
+ /// value or one that's computed by a kernel-defined resolver function.
+ enum ResolveKind { FIXED, COMPUTED };
+
+  /// Type resolution function. Given input types and shapes, returns the
+  /// output type and shape. This function SHOULD NOT be used to check arity;
+  /// arity checking is performed one or more layers above. It may make use of
+  /// kernel state to decide what type to output in some cases.
+ using Resolver =
+ std::function<Result<ValueDescr>(KernelContext*, const std::vector<ValueDescr>&)>;
+
+ /// \brief Output an exact type, but with shape determined by promoting the
+ /// shapes of the inputs (any ARRAY argument yields ARRAY).
+ OutputType(std::shared_ptr<DataType> type) // NOLINT implicit construction
+ : kind_(FIXED), type_(std::move(type)) {}
+
+ /// \brief Output the exact type and shape provided by a ValueDescr
+ OutputType(ValueDescr descr); // NOLINT implicit construction
+
+ explicit OutputType(Resolver resolver)
+ : kind_(COMPUTED), resolver_(std::move(resolver)) {}
+
+ OutputType(const OutputType& other) {
+ this->kind_ = other.kind_;
+ this->shape_ = other.shape_;
+ this->type_ = other.type_;
+ this->resolver_ = other.resolver_;
+ }
+
+ OutputType(OutputType&& other) {
+ this->kind_ = other.kind_;
+ this->type_ = std::move(other.type_);
+ this->shape_ = other.shape_;
+    this->resolver_ = std::move(other.resolver_);
+ }
+
+ OutputType& operator=(const OutputType&) = default;
+ OutputType& operator=(OutputType&&) = default;
+
+ /// \brief Return the shape and type of the expected output value of the
+ /// kernel given the value descriptors (shapes and types) of the input
+ /// arguments. The resolver may make use of state information kept in the
+ /// KernelContext.
+ Result<ValueDescr> Resolve(KernelContext* ctx,
+ const std::vector<ValueDescr>& args) const;
+
+ /// \brief The exact output value type for the FIXED kind.
+ const std::shared_ptr<DataType>& type() const;
+
+  /// \brief For the COMPUTED resolution strategy, the resolver function used
+  /// to compute the output type. It is usually more convenient to call
+  /// OutputType::Resolve than to invoke this resolver directly.
+ const Resolver& resolver() const;
+
+ /// \brief Render a human-readable string representation.
+ std::string ToString() const;
+
+ /// \brief Return the kind of type resolution of this output type, whether
+ /// fixed/invariant or computed by a resolver.
+ ResolveKind kind() const { return kind_; }
+
+ /// \brief If the shape is ANY, then Resolve will compute the shape based on
+ /// the input arguments.
+ ValueDescr::Shape shape() const { return shape_; }
+
+ private:
+ ResolveKind kind_;
+
+ // For FIXED resolution
+ std::shared_ptr<DataType> type_;
+
+  // The shape of the output type returned by Resolve. If ANY, the shape is
+  // promoted from the input shapes.
+ ValueDescr::Shape shape_ = ValueDescr::ANY;
+
+ // For COMPUTED resolution
+ Resolver resolver_;
+};
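+
+// A sketch of the two resolution strategies (illustrative only):
+//
+//   OutputType fixed(float64());  // FIXED: always yields float64
+//   OutputType computed(          // COMPUTED: echo the first argument's descr
+//       [](KernelContext*, const std::vector<ValueDescr>& args) -> Result<ValueDescr> {
+//         return args[0];
+//       });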
+
+/// \brief Holds the input types and output type of the kernel.
+///
+/// VarArgs functions with minimum N arguments should pass up to N input types to be
+/// used to validate the input types of a function invocation. The first N-1 types
+/// will be matched against the first N-1 arguments, and the last type will be
+/// matched against the remaining arguments.
+class ARROW_EXPORT KernelSignature {
+ public:
+ KernelSignature(std::vector<InputType> in_types, OutputType out_type,
+ bool is_varargs = false);
+
+ /// \brief Convenience ctor since make_shared can be awkward
+ static std::shared_ptr<KernelSignature> Make(std::vector<InputType> in_types,
+ OutputType out_type,
+ bool is_varargs = false);
+
+  /// \brief Return true if the signature is compatible with the list of input
+ /// value descriptors.
+ bool MatchesInputs(const std::vector<ValueDescr>& descriptors) const;
+
+ /// \brief Returns true if the input types of each signature are
+ /// equal. Well-formed functions should have a deterministic output type
+ /// given input types, but currently it is the responsibility of the
+ /// developer to ensure this.
+ bool Equals(const KernelSignature& other) const;
+
+ bool operator==(const KernelSignature& other) const { return this->Equals(other); }
+
+ bool operator!=(const KernelSignature& other) const { return !(*this == other); }
+
+ /// \brief Compute a hash code for the signature
+ size_t Hash() const;
+
+ /// \brief The input types for the kernel. For VarArgs functions, this should
+ /// generally contain a single validator to use for validating all of the
+ /// function arguments.
+ const std::vector<InputType>& in_types() const { return in_types_; }
+
+ /// \brief The output type for the kernel. Use Resolve to return the exact
+ /// output given input argument ValueDescrs, since many kernels' output types
+ /// depend on their input types (or their type metadata).
+ const OutputType& out_type() const { return out_type_; }
+
+ /// \brief Render a human-readable string representation
+ std::string ToString() const;
+
+ bool is_varargs() const { return is_varargs_; }
+
+ private:
+ std::vector<InputType> in_types_;
+ OutputType out_type_;
+ bool is_varargs_;
+
+ // For caching the hash code after it's computed the first time
+ mutable uint64_t hash_code_;
+};
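+
+// A sketch of building and checking a signature (illustrative only):
+//
+//   auto sig = KernelSignature::Make({InputType::Array(int64())}, float64());
+//   // sig->MatchesInputs({ValueDescr::Array(int64())}) evaluates to true,
+//   // while a scalar int64 or an array of another type would not match.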
+
+/// \brief A function may contain multiple variants of a kernel for a given
+/// type combination for different SIMD levels. Based on the active system's
+/// CPU info or the user's preferences, we can elect to use one over the other.
+struct SimdLevel {
+ enum type { NONE = 0, SSE4_2, AVX, AVX2, AVX512, NEON, MAX };
+};
+
+/// \brief The strategy to use for propagating or otherwise populating the
+/// validity bitmap of a kernel output.
+struct NullHandling {
+ enum type {
+ /// Compute the output validity bitmap by intersecting the validity bitmaps
+ /// of the arguments using bitwise-and operations. This means that values
+ /// in the output are valid/non-null only if the corresponding values in
+    /// all input arguments were valid/non-null. Kernels generally need not
+ /// touch the bitmap thereafter, but a kernel's exec function is permitted
+ /// to alter the bitmap after the null intersection is computed if it needs
+ /// to.
+ INTERSECTION,
+
+ /// Kernel expects a pre-allocated buffer to write the result bitmap
+ /// into. The preallocated memory is not zeroed (except for the last byte),
+    /// so the kernel must fully populate the bitmap.
+ COMPUTED_PREALLOCATE,
+
+ /// Kernel allocates and sets the validity bitmap of the output.
+ COMPUTED_NO_PREALLOCATE,
+
+ /// Kernel output is never null and a validity bitmap does not need to be
+ /// allocated.
+ OUTPUT_NOT_NULL
+ };
+};
+
+/// \brief The preference for memory preallocation of fixed-width type outputs
+/// in kernel execution.
+struct MemAllocation {
+ enum type {
+ // For data types that support pre-allocation (i.e. fixed-width), the
+ // kernel expects to be provided a pre-allocated data buffer to write
+ // into. Non-fixed-width types must always allocate their own data
+    // buffers. The allocation is made with the same length as the execution batch,
+ // so vector kernels yielding differently sized output should not use this.
+ //
+ // It is valid for the data to not be preallocated but the validity bitmap
+ // is (or is computed using the intersection/bitwise-and method).
+ //
+ // For variable-size output types like BinaryType or StringType, or for
+ // nested types, this option has no effect.
+ PREALLOCATE,
+
+ // The kernel is responsible for allocating its own data buffer for
+ // fixed-width type outputs.
+ NO_PREALLOCATE
+ };
+};
+
+struct Kernel;
+
+/// \brief Arguments to pass to a KernelInit function. A struct is used to help
+/// avoid API breakage should the arguments passed need to be expanded.
+struct KernelInitArgs {
+ /// \brief A pointer to the kernel being initialized. The init function may
+ /// depend on the kernel's KernelSignature or other data contained there.
+ const Kernel* kernel;
+
+ /// \brief The types and shapes of the input arguments that the kernel is
+ /// about to be executed against.
+ ///
+ /// TODO: should this be const std::vector<ValueDescr>*? const-ref is being
+ /// used to avoid the cost of copying the struct into the args struct.
+ const std::vector<ValueDescr>& inputs;
+
+ /// \brief Opaque options specific to this kernel. May be nullptr for functions
+ /// that do not require options.
+ const FunctionOptions* options;
+};
+
+/// \brief Common initializer function for all kernel types.
+using KernelInit = std::function<Result<std::unique_ptr<KernelState>>(
+ KernelContext*, const KernelInitArgs&)>;
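+
+// A sketch of a typical KernelInit implementation (MyState and MyOptions are
+// hypothetical placeholders; a concrete example is CountInit in
+// aggregate_basic.cc):
+//
+//   Result<std::unique_ptr<KernelState>> MyInit(KernelContext*,
+//                                               const KernelInitArgs& args) {
+//     return ::arrow::internal::make_unique<MyState>(
+//         static_cast<const MyOptions&>(*args.options));
+//   }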
+
+/// \brief Base type for kernels. Contains the function signature and
+/// optionally the state initialization function, along with some common
+/// attributes
+struct Kernel {
+ Kernel() = default;
+
+ Kernel(std::shared_ptr<KernelSignature> sig, KernelInit init)
+ : signature(std::move(sig)), init(std::move(init)) {}
+
+ Kernel(std::vector<InputType> in_types, OutputType out_type, KernelInit init)
+ : Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)),
+ std::move(init)) {}
+
+ /// \brief The "signature" of the kernel containing the InputType input
+ /// argument validators and OutputType output type and shape resolver.
+ std::shared_ptr<KernelSignature> signature;
+
+ /// \brief Create a new KernelState for invocations of this kernel, e.g. to
+ /// set up any options or state relevant for execution.
+ KernelInit init;
+
+  /// \brief Create a vector of new KernelStates for invocations of this kernel.
+ static Status InitAll(KernelContext*, const KernelInitArgs&,
+ std::vector<std::unique_ptr<KernelState>>*);
+
+ /// \brief Indicates whether execution can benefit from parallelization
+ /// (splitting large chunks into smaller chunks and using multiple
+ /// threads). Some kernels may not support parallel execution at
+ /// all. Synchronization and concurrency-related issues are currently the
+ /// responsibility of the Kernel's implementation.
+ bool parallelizable = true;
+
+ /// \brief Indicates the level of SIMD instruction support in the host CPU is
+ /// required to use the function. The intention is for functions to be able to
+ /// contain multiple kernels with the same signature but different levels of SIMD,
+ /// so that the most optimized kernel supported on a host's processor can be chosen.
+ SimdLevel::type simd_level = SimdLevel::NONE;
+};
+
+/// \brief Common kernel base data structure for ScalarKernel and
+/// VectorKernel. It is called "ArrayKernel" because these kernels generally
+/// output array values (as opposed to the scalar values output by aggregate
+/// functions).
+struct ArrayKernel : public Kernel {
+ ArrayKernel() = default;
+
+ ArrayKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
+ KernelInit init = NULLPTR)
+ : Kernel(std::move(sig), init), exec(std::move(exec)) {}
+
+ ArrayKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
+ KernelInit init = NULLPTR)
+ : Kernel(std::move(in_types), std::move(out_type), std::move(init)),
+ exec(std::move(exec)) {}
+
+ /// \brief Perform a single invocation of this kernel. Depending on the
+ /// implementation, it may only write into preallocated memory, while in some
+ /// cases it will allocate its own memory. Any required state is managed
+ /// through the KernelContext.
+ ArrayKernelExec exec;
+
+ /// \brief Writing execution results into larger contiguous allocations
+ /// requires that the kernel be able to write into sliced output ArrayData*,
+ /// including sliced output validity bitmaps. Some kernel implementations may
+ /// not be able to do this, so setting this to false disables this
+ /// functionality.
+ bool can_write_into_slices = true;
+};
+
+/// \brief Kernel data structure for implementations of ScalarFunction. In
+/// addition to the members found in ArrayKernel, contains the null handling
+/// and memory pre-allocation preferences.
+struct ScalarKernel : public ArrayKernel {
+ using ArrayKernel::ArrayKernel;
+
+ // For scalar functions preallocated data and intersecting arg validity
+ // bitmaps is a reasonable default
+ NullHandling::type null_handling = NullHandling::INTERSECTION;
+ MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE;
+};
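+
+// A sketch of assembling a scalar kernel (illustrative only; assumes the
+// ArrayKernelExec signature (KernelContext*, const ExecBatch&, Datum*)
+// declared earlier in this header):
+//
+//   ScalarKernel kernel(
+//       {InputType::Array(int32())}, int32(),
+//       [](KernelContext*, const ExecBatch& batch, Datum* out) {
+//         *out = batch[0];  // identity kernel
+//         return Status::OK();
+//       });
+//   kernel.null_handling = NullHandling::INTERSECTION;  // already the default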
+
+// ----------------------------------------------------------------------
+// VectorKernel (for VectorFunction)
+
+/// \brief See VectorKernel::finalize member for usage
+using VectorFinalize = std::function<Status(KernelContext*, std::vector<Datum>*)>;
+
+/// \brief Kernel data structure for implementations of VectorFunction. In
+/// addition to the members found in ArrayKernel, contains an optional
+/// finalizer function, the null handling and memory pre-allocation preferences
+/// (which have different defaults from ScalarKernel), and some other
+/// execution-related options.
+struct VectorKernel : public ArrayKernel {
+ VectorKernel() = default;
+
+ VectorKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec)
+ : ArrayKernel(std::move(sig), std::move(exec)) {}
+
+ VectorKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
+ KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR)
+ : ArrayKernel(std::move(in_types), std::move(out_type), std::move(exec),
+ std::move(init)),
+ finalize(std::move(finalize)) {}
+
+ VectorKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
+ KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR)
+ : ArrayKernel(std::move(sig), std::move(exec), std::move(init)),
+ finalize(std::move(finalize)) {}
+
+  /// \brief For VectorKernel, convert intermediate results into finalized
+  /// results. Mutates the input argument. Some kernels (for example,
+  /// hashing-related functions) accumulate state while processing chunked
+  /// inputs and then need to attach that accumulated state to the output
+  /// produced for each chunk.
+ VectorFinalize finalize;
+
+  /// Since vector kernels are generally implemented rather differently from
+  /// scalar/elementwise kernels (and may not even yield arrays of the same
+  /// size), the developer must opt in to any memory preallocation rather
+  /// than having to turn it off.
+ NullHandling::type null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ MemAllocation::type mem_allocation = MemAllocation::NO_PREALLOCATE;
+
+ /// Some vector kernels can do chunkwise execution using ExecBatchIterator,
+ /// in some cases accumulating some state. Other kernels (like Take) need to
+  /// be passed whole arrays and don't work on ChunkedArray inputs.
+ bool can_execute_chunkwise = true;
+
+ /// Some kernels (like unique and value_counts) yield non-chunked output from
+ /// chunked-array inputs. This option controls how the results are boxed when
+ /// returned from ExecVectorFunction
+ ///
+ /// true -> ChunkedArray
+ /// false -> Array
+ bool output_chunked = true;
+};
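+
+// A sketch of tuning a vector kernel's execution options (MyExec is a
+// hypothetical ArrayKernelExec):
+//
+//   VectorKernel kernel({InputType::Array(int64())}, int64(), MyExec);
+//   kernel.can_execute_chunkwise = false;  // e.g. Take-like kernels need whole arrays
+//   kernel.output_chunked = false;         // box the result as Array, not ChunkedArray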
+
+// ----------------------------------------------------------------------
+// ScalarAggregateKernel (for ScalarAggregateFunction)
+
+using ScalarAggregateConsume = std::function<Status(KernelContext*, const ExecBatch&)>;
+
+using ScalarAggregateMerge =
+ std::function<Status(KernelContext*, KernelState&&, KernelState*)>;
+
+// Finalize returns Datum to permit multiple return values
+using ScalarAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
+
+/// \brief Kernel data structure for implementations of
+/// ScalarAggregateFunction. The four necessary components of an aggregation
+/// kernel are the init, consume, merge, and finalize functions.
+///
+/// * init: creates a new KernelState for a kernel.
+/// * consume: processes an ExecBatch and updates the KernelState found in the
+/// KernelContext.
+/// * merge: combines one KernelState with another.
+/// * finalize: produces the end result of the aggregation using the
+/// KernelState in the KernelContext.
+struct ScalarAggregateKernel : public Kernel {
+ ScalarAggregateKernel() = default;
+
+ ScalarAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+ ScalarAggregateConsume consume, ScalarAggregateMerge merge,
+ ScalarAggregateFinalize finalize)
+ : Kernel(std::move(sig), std::move(init)),
+ consume(std::move(consume)),
+ merge(std::move(merge)),
+ finalize(std::move(finalize)) {}
+
+ ScalarAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
+ KernelInit init, ScalarAggregateConsume consume,
+ ScalarAggregateMerge merge, ScalarAggregateFinalize finalize)
+ : ScalarAggregateKernel(
+ KernelSignature::Make(std::move(in_types), std::move(out_type)),
+ std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {}
+
+ /// \brief Merge a vector of KernelStates into a single KernelState.
+ /// The merged state will be returned and will be set on the KernelContext.
+ static Result<std::unique_ptr<KernelState>> MergeAll(
+ const ScalarAggregateKernel* kernel, KernelContext* ctx,
+ std::vector<std::unique_ptr<KernelState>> states);
+
+ ScalarAggregateConsume consume;
+ ScalarAggregateMerge merge;
+ ScalarAggregateFinalize finalize;
+};
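+
+// The lifecycle of the four components as driven by the executor (a sketch;
+// one state is typically created per thread and partial states are merged):
+//
+//   init     -> new KernelState              (once per local state)
+//   consume  -> state updated per ExecBatch
+//   merge    -> one state folded into another
+//   finalize -> Datum holding the end result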
+
+// ----------------------------------------------------------------------
+// HashAggregateKernel (for HashAggregateFunction)
+
+using HashAggregateConsume = std::function<Status(KernelContext*, const ExecBatch&)>;
+
+using HashAggregateMerge =
+ std::function<Status(KernelContext*, KernelState&&, KernelState*)>;
+
+// Finalize returns Datum to permit multiple return values
+using HashAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
+
+/// \brief Kernel data structure for implementations of
+/// HashAggregateFunction. The four necessary components of an aggregation
+/// kernel are the init, consume, merge, and finalize functions.
+///
+/// * init: creates a new KernelState for a kernel.
+/// * consume: processes an ExecBatch (which includes the argument as well
+/// as an array of group identifiers) and updates the KernelState found in the
+/// KernelContext.
+/// * merge: combines one KernelState with another.
+/// * finalize: produces the end result of the aggregation using the
+/// KernelState in the KernelContext.
+struct HashAggregateKernel : public Kernel {
+ HashAggregateKernel() = default;
+
+ HashAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+ HashAggregateConsume consume, HashAggregateMerge merge,
+ HashAggregateFinalize finalize)
+ : Kernel(std::move(sig), std::move(init)),
+ consume(std::move(consume)),
+ merge(std::move(merge)),
+ finalize(std::move(finalize)) {}
+
+ HashAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
+                      KernelInit init, HashAggregateConsume consume,
+                      HashAggregateMerge merge, HashAggregateFinalize finalize)
+ : HashAggregateKernel(
+ KernelSignature::Make(std::move(in_types), std::move(out_type)),
+ std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {}
+
+ HashAggregateConsume consume;
+ HashAggregateMerge merge;
+ HashAggregateFinalize finalize;
+};
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc
new file mode 100644
index 00000000000..a7df66695b2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc
@@ -0,0 +1,604 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_basic_internal.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/make_unique.h"
+
+namespace arrow {
+namespace compute {
+
+namespace {
+
+Status AggregateConsume(KernelContext* ctx, const ExecBatch& batch) {
+ return checked_cast<ScalarAggregator*>(ctx->state())->Consume(ctx, batch);
+}
+
+Status AggregateMerge(KernelContext* ctx, KernelState&& src, KernelState* dst) {
+ return checked_cast<ScalarAggregator*>(dst)->MergeFrom(ctx, std::move(src));
+}
+
+Status AggregateFinalize(KernelContext* ctx, Datum* out) {
+ return checked_cast<ScalarAggregator*>(ctx->state())->Finalize(ctx, out);
+}
+
+} // namespace
+
+void AddAggKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+ ScalarAggregateFunction* func, SimdLevel::type simd_level) {
+ ScalarAggregateKernel kernel(std::move(sig), init, AggregateConsume, AggregateMerge,
+ AggregateFinalize);
+ // Set the simd level
+ kernel.simd_level = simd_level;
+ DCHECK_OK(func->AddKernel(kernel));
+}
+
+namespace aggregate {
+
+// ----------------------------------------------------------------------
+// Count implementation
+
+struct CountImpl : public ScalarAggregator {
+ explicit CountImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ const ArrayData& input = *batch[0].array();
+ const int64_t nulls = input.GetNullCount();
+ this->nulls += nulls;
+ this->non_nulls += input.length - nulls;
+ } else {
+ const Scalar& input = *batch[0].scalar();
+ this->nulls += !input.is_valid * batch.length;
+ this->non_nulls += input.is_valid * batch.length;
+ }
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other_state = checked_cast<const CountImpl&>(src);
+ this->non_nulls += other_state.non_nulls;
+ this->nulls += other_state.nulls;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext* ctx, Datum* out) override {
+ const auto& state = checked_cast<const CountImpl&>(*ctx->state());
+ if (state.options.skip_nulls) {
+ *out = Datum(state.non_nulls);
+ } else {
+ *out = Datum(state.nulls);
+ }
+ return Status::OK();
+ }
+
+ ScalarAggregateOptions options;
+ int64_t non_nulls = 0;
+ int64_t nulls = 0;
+};
+
+Result<std::unique_ptr<KernelState>> CountInit(KernelContext*,
+ const KernelInitArgs& args) {
+ return ::arrow::internal::make_unique<CountImpl>(
+ static_cast<const ScalarAggregateOptions&>(*args.options));
+}
+
+// ----------------------------------------------------------------------
+// Sum implementation
+
+template <typename ArrowType>
+struct SumImplDefault : public SumImpl<ArrowType, SimdLevel::NONE> {
+ explicit SumImplDefault(const ScalarAggregateOptions& options_) {
+ this->options = options_;
+ }
+};
+
+template <typename ArrowType>
+struct MeanImplDefault : public MeanImpl<ArrowType, SimdLevel::NONE> {
+ explicit MeanImplDefault(const ScalarAggregateOptions& options_) {
+ this->options = options_;
+ }
+};
+
+Result<std::unique_ptr<KernelState>> SumInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ SumLikeInit<SumImplDefault> visitor(
+ ctx, *args.inputs[0].type,
+ static_cast<const ScalarAggregateOptions&>(*args.options));
+ return visitor.Create();
+}
+
+Result<std::unique_ptr<KernelState>> MeanInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ SumLikeInit<MeanImplDefault> visitor(
+ ctx, *args.inputs[0].type,
+ static_cast<const ScalarAggregateOptions&>(*args.options));
+ return visitor.Create();
+}
+
+// ----------------------------------------------------------------------
+// MinMax implementation
+
+Result<std::unique_ptr<KernelState>> MinMaxInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ MinMaxInitState<SimdLevel::NONE> visitor(
+ ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(),
+ static_cast<const ScalarAggregateOptions&>(*args.options));
+ return visitor.Create();
+}
+
+// ----------------------------------------------------------------------
+// Any implementation
+
+struct BooleanAnyImpl : public ScalarAggregator {
+ explicit BooleanAnyImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+    // short-circuit if we have already seen a true value
+    if (this->any) {
+ return Status::OK();
+ }
+ if (batch[0].is_scalar()) {
+ const auto& scalar = *batch[0].scalar();
+ this->has_nulls = !scalar.is_valid;
+ this->any = scalar.is_valid && checked_cast<const BooleanScalar&>(scalar).value;
+ return Status::OK();
+ }
+ const auto& data = *batch[0].array();
+ this->has_nulls = data.GetNullCount() > 0;
+ arrow::internal::OptionalBinaryBitBlockCounter counter(
+ data.buffers[0], data.offset, data.buffers[1], data.offset, data.length);
+ int64_t position = 0;
+ while (position < data.length) {
+ const auto block = counter.NextAndBlock();
+ if (block.popcount > 0) {
+ this->any = true;
+ break;
+ }
+ position += block.length;
+ }
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const BooleanAnyImpl&>(src);
+ this->any |= other.any;
+ this->has_nulls |= other.has_nulls;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext* ctx, Datum* out) override {
+ if (!options.skip_nulls && !this->any && this->has_nulls) {
+ out->value = std::make_shared<BooleanScalar>();
+ } else {
+ out->value = std::make_shared<BooleanScalar>(this->any);
+ }
+ return Status::OK();
+ }
+
+ bool any = false;
+ bool has_nulls = false;
+ ScalarAggregateOptions options;
+};
+
+Result<std::unique_ptr<KernelState>> AnyInit(KernelContext*, const KernelInitArgs& args) {
+ return ::arrow::internal::make_unique<BooleanAnyImpl>(
+ static_cast<const ScalarAggregateOptions&>(*args.options));
+}
+
+// ----------------------------------------------------------------------
+// All implementation
+
+struct BooleanAllImpl : public ScalarAggregator {
+ explicit BooleanAllImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+    // short-circuit if we have already seen a false value
+    if (!this->all) {
+ return Status::OK();
+ }
+    // short-circuit if we have already seen a null
+ if (!options.skip_nulls && this->has_nulls) {
+ return Status::OK();
+ }
+ if (batch[0].is_scalar()) {
+ const auto& scalar = *batch[0].scalar();
+ this->has_nulls = !scalar.is_valid;
+ this->all = !scalar.is_valid || checked_cast<const BooleanScalar&>(scalar).value;
+ return Status::OK();
+ }
+ const auto& data = *batch[0].array();
+ this->has_nulls = data.GetNullCount() > 0;
+ arrow::internal::OptionalBinaryBitBlockCounter counter(
+ data.buffers[1], data.offset, data.buffers[0], data.offset, data.length);
+ int64_t position = 0;
+ while (position < data.length) {
+ const auto block = counter.NextOrNotBlock();
+ if (!block.AllSet()) {
+ this->all = false;
+ break;
+ }
+ position += block.length;
+ }
+
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const BooleanAllImpl&>(src);
+ this->all &= other.all;
+ this->has_nulls |= other.has_nulls;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (!options.skip_nulls && this->all && this->has_nulls) {
+ out->value = std::make_shared<BooleanScalar>();
+ } else {
+ out->value = std::make_shared<BooleanScalar>(this->all);
+ }
+ return Status::OK();
+ }
+
+ bool all = true;
+ bool has_nulls = false;
+ ScalarAggregateOptions options;
+};
+
+Result<std::unique_ptr<KernelState>> AllInit(KernelContext*, const KernelInitArgs& args) {
+ return ::arrow::internal::make_unique<BooleanAllImpl>(
+ static_cast<const ScalarAggregateOptions&>(*args.options));
+}
+
+// ----------------------------------------------------------------------
+// Index implementation
+
+template <typename ArgType>
+struct IndexImpl : public ScalarAggregator {
+ using ArgValue = typename internal::GetViewType<ArgType>::T;
+
+ explicit IndexImpl(IndexOptions options, KernelState* raw_state)
+ : options(std::move(options)), seen(0), index(-1) {
+ if (auto state = static_cast<IndexImpl<ArgType>*>(raw_state)) {
+ seen = state->seen;
+ index = state->index;
+ }
+ }
+
+ Status Consume(KernelContext* ctx, const ExecBatch& batch) override {
+ // short-circuit
+ if (index >= 0 || !options.value->is_valid) {
+ return Status::OK();
+ }
+
+ auto input = batch[0].array();
+ seen = input->length;
+ const ArgValue desired = internal::UnboxScalar<ArgType>::Unbox(*options.value);
+ int64_t i = 0;
+
+ ARROW_UNUSED(internal::VisitArrayValuesInline<ArgType>(
+ *input,
+ [&](ArgValue v) -> Status {
+ if (v == desired) {
+ index = i;
+ return Status::Cancelled("Found");
+ } else {
+ ++i;
+ return Status::OK();
+ }
+ },
+ [&]() -> Status {
+ ++i;
+ return Status::OK();
+ }));
+
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const IndexImpl&>(src);
+ if (index < 0 && other.index >= 0) {
+ index = seen + other.index;
+ }
+ seen += other.seen;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ out->value = std::make_shared<Int64Scalar>(index >= 0 ? index : -1);
+ return Status::OK();
+ }
+
+ const IndexOptions options;
+ int64_t seen = 0;
+ int64_t index = -1;
+};
+
+struct IndexInit {
+ std::unique_ptr<KernelState> state;
+ KernelContext* ctx;
+ const IndexOptions& options;
+ const DataType& type;
+
+ IndexInit(KernelContext* ctx, const IndexOptions& options, const DataType& type)
+ : ctx(ctx), options(options), type(type) {}
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("Index kernel not implemented for ", type.ToString());
+ }
+
+ Status Visit(const BooleanType&) {
+ state.reset(new IndexImpl<BooleanType>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_number<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_base_binary<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_date<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_time<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_timestamp<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
+ return std::move(state);
+ }
+
+ static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ IndexInit visitor(ctx, static_cast<const IndexOptions&>(*args.options),
+ *args.inputs[0].type);
+ return visitor.Create();
+ }
+};
+
+void AddBasicAggKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ std::shared_ptr<DataType> out_ty, ScalarAggregateFunction* func,
+ SimdLevel::type simd_level) {
+ for (const auto& ty : types) {
+ // array[InT] -> scalar[OutT]
+ auto sig = KernelSignature::Make({InputType::Array(ty)}, ValueDescr::Scalar(out_ty));
+ AddAggKernel(std::move(sig), init, func, simd_level);
+ }
+}
+
+void AddScalarAggKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ std::shared_ptr<DataType> out_ty,
+ ScalarAggregateFunction* func) {
+ for (const auto& ty : types) {
+ // scalar[InT] -> scalar[OutT]
+ auto sig = KernelSignature::Make({InputType::Scalar(ty)}, ValueDescr::Scalar(out_ty));
+ AddAggKernel(std::move(sig), init, func, SimdLevel::NONE);
+ }
+}
+
+void AddArrayScalarAggKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ std::shared_ptr<DataType> out_ty,
+ ScalarAggregateFunction* func,
+ SimdLevel::type simd_level = SimdLevel::NONE) {
+ AddBasicAggKernels(init, types, out_ty, func, simd_level);
+ AddScalarAggKernels(init, types, out_ty, func);
+}
+
+void AddMinMaxKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ ScalarAggregateFunction* func, SimdLevel::type simd_level) {
+ for (const auto& ty : types) {
+ // any[T] -> scalar[struct<min: T, max: T>]
+ auto out_ty = struct_({field("min", ty), field("max", ty)});
+ auto sig = KernelSignature::Make({InputType(ty)}, ValueDescr::Scalar(out_ty));
+ AddAggKernel(std::move(sig), init, func, simd_level);
+ }
+}
+
+} // namespace aggregate
+
+namespace internal {
+namespace {
+
+const FunctionDoc count_doc{"Count the number of null / non-null values",
+ ("By default, only non-null values are counted.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc sum_doc{
+ "Compute the sum of a numeric array",
+ ("Null values are ignored by default. Minimum count of non-null\n"
+ "values can be set and null is returned if too few are present.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc mean_doc{
+ "Compute the mean of a numeric array",
+ ("Null values are ignored by default. Minimum count of non-null\n"
+ "values can be set and null is returned if too few are "
+ "present.\nThis can be changed through ScalarAggregateOptions.\n"
+ "The result is always computed as a double, regardless of the input types."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numeric array",
+ ("Null values are ignored by default.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true",
+ ("Null values are ignored by default.\n"
+ "If null values are taken into account by setting "
+ "ScalarAggregateOptions parameter skip_nulls = false then "
+ "Kleene logic is used.\n"
+ "See KleeneOr for more details on Kleene logic."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true",
+ ("Null values are ignored by default.\n"
+ "If null values are taken into account by setting "
+ "ScalarAggregateOptions parameter skip_nulls = false then "
+ "Kleene logic is used.\n"
+ "See KleeneAnd for more details on Kleene logic."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc index_doc{"Find the index of the first occurrence of a given value",
+ ("The result is always computed as an int64_t, regardless\n"
+ "of the offset type of the input array."),
+ {"array"},
+ "IndexOptions"};
+
+} // namespace
+
+void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
+ static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "count", Arity::Unary(), &count_doc, &default_scalar_aggregate_options);
+
+ // Takes any array input, outputs int64 scalar
+ InputType any_array(ValueDescr::ARRAY);
+ AddAggKernel(KernelSignature::Make({any_array}, ValueDescr::Scalar(int64())),
+ aggregate::CountInit, func.get());
+ AddAggKernel(
+ KernelSignature::Make({InputType(ValueDescr::SCALAR)}, ValueDescr::Scalar(int64())),
+ aggregate::CountInit, func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ func = std::make_shared<ScalarAggregateFunction>("sum", Arity::Unary(), &sum_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, {boolean()}, int64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, SignedIntTypes(), int64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, UnsignedIntTypes(), uint64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, FloatingPointTypes(), float64(),
+ func.get());
+ // Add the SIMD variants for sum
+#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512)
+ auto cpu_info = arrow::internal::CpuInfo::GetInstance();
+#endif
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) {
+ aggregate::AddSumAvx2AggKernels(func.get());
+ }
+#endif
+#if defined(ARROW_HAVE_RUNTIME_AVX512)
+ if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX512)) {
+ aggregate::AddSumAvx512AggKernels(func.get());
+ }
+#endif
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ func = std::make_shared<ScalarAggregateFunction>("mean", Arity::Unary(), &mean_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, {boolean()}, float64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, NumericTypes(), float64(),
+ func.get());
+ // Add the SIMD variants for mean
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) {
+ aggregate::AddMeanAvx2AggKernels(func.get());
+ }
+#endif
+#if defined(ARROW_HAVE_RUNTIME_AVX512)
+ if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX512)) {
+ aggregate::AddMeanAvx512AggKernels(func.get());
+ }
+#endif
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ func = std::make_shared<ScalarAggregateFunction>(
+ "min_max", Arity::Unary(), &min_max_doc, &default_scalar_aggregate_options);
+ aggregate::AddMinMaxKernels(aggregate::MinMaxInit, {boolean()}, func.get());
+ aggregate::AddMinMaxKernels(aggregate::MinMaxInit, NumericTypes(), func.get());
+ // Add the SIMD variants for min max
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) {
+ aggregate::AddMinMaxAvx2AggKernels(func.get());
+ }
+#endif
+#if defined(ARROW_HAVE_RUNTIME_AVX512)
+ if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX512)) {
+ aggregate::AddMinMaxAvx512AggKernels(func.get());
+ }
+#endif
+
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ // any
+ func = std::make_shared<ScalarAggregateFunction>("any", Arity::Unary(), &any_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::AnyInit, {boolean()}, boolean(),
+ func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ // all
+ func = std::make_shared<ScalarAggregateFunction>("all", Arity::Unary(), &all_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::AllInit, {boolean()}, boolean(),
+ func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ // index
+ func = std::make_shared<ScalarAggregateFunction>("index", Arity::Unary(), &index_doc);
+ aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, BaseBinaryTypes(), int64(),
+ func.get());
+ aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, PrimitiveTypes(), int64(),
+ func.get());
+ aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, TemporalTypes(), int64(),
+ func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
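+
+// Once registered, these functions are reachable by name through the generic
+// compute entry point (a sketch; assumes arrow::compute::CallFunction and a
+// hypothetical `array_datum` wrapping a numeric array):
+//
+//   ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("sum", {array_datum}));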
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
new file mode 100644
index 00000000000..5163d3fd03d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
@@ -0,0 +1,463 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/align_util.h"
+#include "arrow/util/bit_block_counter.h"
+
+namespace arrow {
+namespace compute {
+namespace aggregate {
+
+void AddBasicAggKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ std::shared_ptr<DataType> out_ty, ScalarAggregateFunction* func,
+ SimdLevel::type simd_level = SimdLevel::NONE);
+
+void AddMinMaxKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ ScalarAggregateFunction* func,
+ SimdLevel::type simd_level = SimdLevel::NONE);
+
+// SIMD variants for kernels
+void AddSumAvx2AggKernels(ScalarAggregateFunction* func);
+void AddMeanAvx2AggKernels(ScalarAggregateFunction* func);
+void AddMinMaxAvx2AggKernels(ScalarAggregateFunction* func);
+
+void AddSumAvx512AggKernels(ScalarAggregateFunction* func);
+void AddMeanAvx512AggKernels(ScalarAggregateFunction* func);
+void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func);
+
+// ----------------------------------------------------------------------
+// Sum implementation
+
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct SumImpl : public ScalarAggregator {
+ using ThisType = SumImpl<ArrowType, SimdLevel>;
+ using CType = typename ArrowType::c_type;
+ using SumType = typename FindAccumulatorType<ArrowType>::Type;
+ using OutputType = typename TypeTraits<SumType>::ScalarType;
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ const auto& data = batch[0].array();
+ this->count += data->length - data->GetNullCount();
+ if (is_boolean_type<ArrowType>::value) {
+ this->sum +=
+ static_cast<typename SumType::c_type>(BooleanArray(data).true_count());
+ } else {
+ this->sum +=
+ arrow::compute::detail::SumArray<CType, typename SumType::c_type, SimdLevel>(
+ *data);
+ }
+ } else {
+ const auto& data = *batch[0].scalar();
+ this->count += data.is_valid * batch.length;
+ if (data.is_valid) {
+ this->sum += internal::UnboxScalar<ArrowType>::Unbox(data) * batch.length;
+ }
+ }
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const ThisType&>(src);
+ this->count += other.count;
+ this->sum += other.sum;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (this->count < options.min_count) {
+ out->value = std::make_shared<OutputType>();
+ } else {
+ out->value = MakeScalar(this->sum);
+ }
+ return Status::OK();
+ }
+
+ size_t count = 0;
+ typename SumType::c_type sum = 0;
+ ScalarAggregateOptions options;
+};
+
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct MeanImpl : public SumImpl<ArrowType, SimdLevel> {
+ Status Finalize(KernelContext*, Datum* out) override {
+    if (this->count < this->options.min_count) {
+ out->value = std::make_shared<DoubleScalar>();
+ } else {
+ const double mean = static_cast<double>(this->sum) / this->count;
+ out->value = std::make_shared<DoubleScalar>(mean);
+ }
+ return Status::OK();
+ }
+  // Note: the `options` member is inherited from SumImpl; redeclaring it here
+  // would shadow the base member and be error-prone.
+};
+
+template <template <typename> class KernelClass>
+struct SumLikeInit {
+ std::unique_ptr<KernelState> state;
+ KernelContext* ctx;
+ const DataType& type;
+ const ScalarAggregateOptions& options;
+
+ SumLikeInit(KernelContext* ctx, const DataType& type,
+ const ScalarAggregateOptions& options)
+ : ctx(ctx), type(type), options(options) {}
+
+ Status Visit(const DataType&) { return Status::NotImplemented("No sum implemented"); }
+
+ Status Visit(const HalfFloatType&) {
+ return Status::NotImplemented("No sum implemented");
+ }
+
+ Status Visit(const BooleanType&) {
+ state.reset(new KernelClass<BooleanType>(options));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_number<Type, Status> Visit(const Type&) {
+ state.reset(new KernelClass<Type>(options));
+ return Status::OK();
+ }
+
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
+ return std::move(state);
+ }
+};
+
+// ----------------------------------------------------------------------
+// MinMax implementation
+
+template <typename ArrowType, SimdLevel::type SimdLevel, typename Enable = void>
+struct MinMaxState {};
+
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct MinMaxState<ArrowType, SimdLevel, enable_if_boolean<ArrowType>> {
+ using ThisType = MinMaxState<ArrowType, SimdLevel>;
+ using T = typename ArrowType::c_type;
+
+ ThisType& operator+=(const ThisType& rhs) {
+ this->has_nulls |= rhs.has_nulls;
+ this->has_values |= rhs.has_values;
+ this->min = this->min && rhs.min;
+ this->max = this->max || rhs.max;
+ return *this;
+ }
+
+ void MergeOne(T value) {
+ this->min = this->min && value;
+ this->max = this->max || value;
+ }
+
+ T min = true;
+ T max = false;
+ bool has_nulls = false;
+ bool has_values = false;
+};
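+
+// Note: for booleans the running minimum is a conjunction and the running
+// maximum a disjunction, so a single false value pins min and a single true
+// value pins max.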
+
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct MinMaxState<ArrowType, SimdLevel, enable_if_integer<ArrowType>> {
+ using ThisType = MinMaxState<ArrowType, SimdLevel>;
+ using T = typename ArrowType::c_type;
+
+ ThisType& operator+=(const ThisType& rhs) {
+ this->has_nulls |= rhs.has_nulls;
+ this->has_values |= rhs.has_values;
+ this->min = std::min(this->min, rhs.min);
+ this->max = std::max(this->max, rhs.max);
+ return *this;
+ }
+
+ void MergeOne(T value) {
+ this->min = std::min(this->min, value);
+ this->max = std::max(this->max, value);
+ }
+
+ T min = std::numeric_limits<T>::max();
+ T max = std::numeric_limits<T>::min();
+ bool has_nulls = false;
+ bool has_values = false;
+};
+
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct MinMaxState<ArrowType, SimdLevel, enable_if_floating_point<ArrowType>> {
+ using ThisType = MinMaxState<ArrowType, SimdLevel>;
+ using T = typename ArrowType::c_type;
+
+ ThisType& operator+=(const ThisType& rhs) {
+ this->has_nulls |= rhs.has_nulls;
+ this->has_values |= rhs.has_values;
+ this->min = std::fmin(this->min, rhs.min);
+ this->max = std::fmax(this->max, rhs.max);
+ return *this;
+ }
+
+ void MergeOne(T value) {
+ this->min = std::fmin(this->min, value);
+ this->max = std::fmax(this->max, value);
+ }
+
+ T min = std::numeric_limits<T>::infinity();
+ T max = -std::numeric_limits<T>::infinity();
+ bool has_nulls = false;
+ bool has_values = false;
+};
+
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct MinMaxImpl : public ScalarAggregator {
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+ using ThisType = MinMaxImpl<ArrowType, SimdLevel>;
+ using StateType = MinMaxState<ArrowType, SimdLevel>;
+
+ MinMaxImpl(const std::shared_ptr<DataType>& out_type,
+ const ScalarAggregateOptions& options)
+ : out_type(out_type), options(options) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ return ConsumeArray(ArrayType(batch[0].array()));
+ }
+ return ConsumeScalar(*batch[0].scalar());
+ }
+
+ Status ConsumeScalar(const Scalar& scalar) {
+ StateType local;
+ local.has_nulls = !scalar.is_valid;
+ local.has_values = scalar.is_valid;
+
+ if (local.has_nulls && !options.skip_nulls) {
+ this->state = local;
+ return Status::OK();
+ }
+
+ local.MergeOne(internal::UnboxScalar<ArrowType>::Unbox(scalar));
+ this->state = local;
+ return Status::OK();
+ }
+
+ Status ConsumeArray(const ArrayType& arr) {
+ StateType local;
+
+ const auto null_count = arr.null_count();
+ local.has_nulls = null_count > 0;
+ local.has_values = (arr.length() - null_count) > 0;
+
+ if (local.has_nulls && !options.skip_nulls) {
+ this->state = local;
+ return Status::OK();
+ }
+
+ if (local.has_nulls) {
+ local += ConsumeWithNulls(arr);
+    } else {  // all values are valid
+ for (int64_t i = 0; i < arr.length(); i++) {
+ local.MergeOne(arr.Value(i));
+ }
+ }
+ this->state = local;
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const ThisType&>(src);
+ this->state += other.state;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
+
+ std::vector<std::shared_ptr<Scalar>> values;
+ if (!state.has_values || (state.has_nulls && !options.skip_nulls)) {
+ // (null, null)
+ values = {std::make_shared<ScalarType>(), std::make_shared<ScalarType>()};
+ } else {
+ values = {std::make_shared<ScalarType>(state.min),
+ std::make_shared<ScalarType>(state.max)};
+ }
+ out->value = std::make_shared<StructScalar>(std::move(values), this->out_type);
+ return Status::OK();
+ }
+
+ std::shared_ptr<DataType> out_type;
+ ScalarAggregateOptions options;
+ MinMaxState<ArrowType, SimdLevel> state;
+
+ private:
+ StateType ConsumeWithNulls(const ArrayType& arr) const {
+ StateType local;
+ const int64_t length = arr.length();
+ int64_t offset = arr.offset();
+ const uint8_t* bitmap = arr.null_bitmap_data();
+ int64_t idx = 0;
+
+ const auto p = arrow::internal::BitmapWordAlign<1>(bitmap, offset, length);
+ // First handle the leading bits
+ const int64_t leading_bits = p.leading_bits;
+ while (idx < leading_bits) {
+ if (BitUtil::GetBit(bitmap, offset)) {
+ local.MergeOne(arr.Value(idx));
+ }
+ idx++;
+ offset++;
+ }
+
+ // The aligned parts scanned with BitBlockCounter
+ arrow::internal::BitBlockCounter data_counter(bitmap, offset, length - leading_bits);
+ auto current_block = data_counter.NextWord();
+ while (idx < length) {
+      if (current_block.AllSet()) {  // all values in the block are valid
+ int run_length = 0;
+        // Scan forward until a block that has some null values (or the end)
+ while (current_block.length > 0 && current_block.AllSet()) {
+ run_length += current_block.length;
+ current_block = data_counter.NextWord();
+ }
+ for (int64_t i = 0; i < run_length; i++) {
+ local.MergeOne(arr.Value(idx + i));
+ }
+ idx += run_length;
+ offset += run_length;
+        // current_block was already advanced by the scan above, so skip NextWord()
+ continue;
+ } else if (!current_block.NoneSet()) { // Some values are null
+ BitmapReader reader(arr.null_bitmap_data(), offset, current_block.length);
+ for (int64_t i = 0; i < current_block.length; i++) {
+ if (reader.IsSet()) {
+ local.MergeOne(arr.Value(idx + i));
+ }
+ reader.Next();
+ }
+
+ idx += current_block.length;
+ offset += current_block.length;
+ } else { // All null values
+ idx += current_block.length;
+ offset += current_block.length;
+ }
+ current_block = data_counter.NextWord();
+ }
+
+ return local;
+ }
+};
+
+template <SimdLevel::type SimdLevel>
+struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
+ using StateType = MinMaxState<BooleanType, SimdLevel>;
+ using ArrayType = typename TypeTraits<BooleanType>::ArrayType;
+ using MinMaxImpl<BooleanType, SimdLevel>::MinMaxImpl;
+ using MinMaxImpl<BooleanType, SimdLevel>::options;
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (ARROW_PREDICT_FALSE(batch[0].is_scalar())) {
+ return ConsumeScalar(checked_cast<const BooleanScalar&>(*batch[0].scalar()));
+ }
+ StateType local;
+ ArrayType arr(batch[0].array());
+
+ const auto arr_length = arr.length();
+ const auto null_count = arr.null_count();
+ const auto valid_count = arr_length - null_count;
+
+ local.has_nulls = null_count > 0;
+ local.has_values = valid_count > 0;
+ if (local.has_nulls && !options.skip_nulls) {
+ this->state = local;
+ return Status::OK();
+ }
+
+ const auto true_count = arr.true_count();
+ const auto false_count = valid_count - true_count;
+ local.max = true_count > 0;
+ local.min = false_count == 0;
+
+ this->state = local;
+ return Status::OK();
+ }
+
+ Status ConsumeScalar(const BooleanScalar& scalar) {
+ StateType local;
+
+ local.has_nulls = !scalar.is_valid;
+ local.has_values = scalar.is_valid;
+ if (local.has_nulls && !options.skip_nulls) {
+ this->state = local;
+ return Status::OK();
+ }
+
+ const int true_count = scalar.is_valid && scalar.value;
+ const int false_count = scalar.is_valid && !scalar.value;
+ local.max = true_count > 0;
+ local.min = false_count == 0;
+
+ this->state = local;
+ return Status::OK();
+ }
+};
+
+template <SimdLevel::type SimdLevel>
+struct MinMaxInitState {
+ std::unique_ptr<KernelState> state;
+ KernelContext* ctx;
+ const DataType& in_type;
+ const std::shared_ptr<DataType>& out_type;
+ const ScalarAggregateOptions& options;
+
+ MinMaxInitState(KernelContext* ctx, const DataType& in_type,
+ const std::shared_ptr<DataType>& out_type,
+ const ScalarAggregateOptions& options)
+ : ctx(ctx), in_type(in_type), out_type(out_type), options(options) {}
+
+ Status Visit(const DataType&) {
+ return Status::NotImplemented("No min/max implemented");
+ }
+
+ Status Visit(const HalfFloatType&) {
+ return Status::NotImplemented("No min/max implemented");
+ }
+
+ Status Visit(const BooleanType&) {
+ state.reset(new BooleanMinMaxImpl<SimdLevel>(out_type, options));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_number<Type, Status> Visit(const Type&) {
+ state.reset(new MinMaxImpl<Type, SimdLevel>(out_type, options));
+ return Status::OK();
+ }
+
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(in_type, this));
+ return std::move(state);
+ }
+};
+
+} // namespace aggregate
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h
new file mode 100644
index 00000000000..ed29f26f2c3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h
@@ -0,0 +1,172 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace compute {
+
+// Find the widest compatible accumulator type for a primitive type.
+template <typename I, typename Enable = void>
+struct FindAccumulatorType {};
+
+template <typename I>
+struct FindAccumulatorType<I, enable_if_boolean<I>> {
+ using Type = UInt64Type;
+};
+
+template <typename I>
+struct FindAccumulatorType<I, enable_if_signed_integer<I>> {
+ using Type = Int64Type;
+};
+
+template <typename I>
+struct FindAccumulatorType<I, enable_if_unsigned_integer<I>> {
+ using Type = UInt64Type;
+};
+
+template <typename I>
+struct FindAccumulatorType<I, enable_if_floating_point<I>> {
+ using Type = DoubleType;
+};
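+
+// A quick compile-time check of the mapping (illustrative sketch; assumes
+// <type_traits> is available):
+//
+//   static_assert(std::is_same<FindAccumulatorType<Int16Type>::Type,
+//                              Int64Type>::value, "");
+//   static_assert(std::is_same<FindAccumulatorType<FloatType>::Type,
+//                              DoubleType>::value, "");
+//
+// i.e. integer partial sums are widened to 64 bits so they cannot overflow,
+// and float partial sums accumulate in double precision.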
+
+struct ScalarAggregator : public KernelState {
+ virtual Status Consume(KernelContext* ctx, const ExecBatch& batch) = 0;
+ virtual Status MergeFrom(KernelContext* ctx, KernelState&& src) = 0;
+ virtual Status Finalize(KernelContext* ctx, Datum* out) = 0;
+};
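+
+// A minimal sketch of how an executor might drive this interface (hypothetical
+// driver code; `states` holds one kernel state per thread or chunk):
+//
+//   Status DriveAggregation(KernelContext* ctx,
+//                           std::vector<std::unique_ptr<ScalarAggregator>> states,
+//                           const std::vector<ExecBatch>& batches, Datum* out) {
+//     for (size_t i = 0; i < states.size(); ++i) {
+//       RETURN_NOT_OK(states[i]->Consume(ctx, batches[i]));
+//     }
+//     for (size_t i = 1; i < states.size(); ++i) {
+//       RETURN_NOT_OK(states[0]->MergeFrom(ctx, std::move(*states[i])));
+//     }
+//     return states[0]->Finalize(ctx, out);
+//   }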
+
+void AddAggKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+ ScalarAggregateFunction* func,
+ SimdLevel::type simd_level = SimdLevel::NONE);
+
+namespace detail {
+
+using arrow::internal::VisitSetBitRunsVoid;
+
+// SumArray must be parameterized with the SIMD level since it's called from
+// translation units both with and without vectorization. Normally it gets
+// inlined, but if not, then without the parameter we would end up with
+// multiple definitions of the same symbol and get unexpected results.
+
+// non-recursive pairwise summation for floating points
+// https://en.wikipedia.org/wiki/Pairwise_summation
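+//
+// Illustrative float32 example of why this matters: summing {16777216.0f,
+// 1.0f, 1.0f, ..., 1.0f} left to right loses every 1.0f, since
+// 16777216.0f + 1.0f == 16777216.0f (24-bit mantissa), whereas pairwise
+// summation first combines the small values with each other so their total
+// survives the final addition. The `mask` below acts as a binary counter over
+// levels: adding a block at level 0 "carries" upward exactly like a binary
+// increment whenever two partial sums meet at the same level.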
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
+ typename ValueFunc>
+enable_if_t<std::is_floating_point<SumType>::value, SumType> SumArray(
+ const ArrayData& data, ValueFunc&& func) {
+ const int64_t data_size = data.length - data.GetNullCount();
+ if (data_size == 0) {
+ return 0;
+ }
+
+ // number of inputs to accumulate before merging with another block
+ constexpr int kBlockSize = 16; // same as numpy
+ // levels (tree depth) = ceil(log2(len)) + 1, a bit larger than necessary
+ const int levels = BitUtil::Log2(static_cast<uint64_t>(data_size)) + 1;
+ // temporary summation per level
+ std::vector<SumType> sum(levels);
+ // whether two summations are ready and should be reduced to upper level
+ // one bit for each level, bit0 -> level0, ...
+ uint64_t mask = 0;
+ // level of root node holding the final summation
+ int root_level = 0;
+
+  // reduce the summation of one block (which may be smaller than kBlockSize)
+  // starting at the leaf level, and keep merging into upper levels while two
+  // partial sums are ready at a non-leaf level
+ auto reduce = [&](SumType block_sum) {
+ int cur_level = 0;
+ uint64_t cur_level_mask = 1ULL;
+ sum[cur_level] += block_sum;
+ mask ^= cur_level_mask;
+ while ((mask & cur_level_mask) == 0) {
+ block_sum = sum[cur_level];
+ sum[cur_level] = 0;
+ ++cur_level;
+ DCHECK_LT(cur_level, levels);
+ cur_level_mask <<= 1;
+ sum[cur_level] += block_sum;
+ mask ^= cur_level_mask;
+ }
+ root_level = std::max(root_level, cur_level);
+ };
+
+ const ValueType* values = data.GetValues<ValueType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ const ValueType* v = &values[pos];
+                        // unsigned division by a constant is cheaper than signed division
+ const uint64_t blocks = static_cast<uint64_t>(len) / kBlockSize;
+ const uint64_t remains = static_cast<uint64_t>(len) % kBlockSize;
+
+ for (uint64_t i = 0; i < blocks; ++i) {
+ SumType block_sum = 0;
+ for (int j = 0; j < kBlockSize; ++j) {
+ block_sum += func(v[j]);
+ }
+ reduce(block_sum);
+ v += kBlockSize;
+ }
+
+ if (remains > 0) {
+ SumType block_sum = 0;
+ for (uint64_t i = 0; i < remains; ++i) {
+ block_sum += func(v[i]);
+ }
+ reduce(block_sum);
+ }
+ });
+
+ // reduce intermediate summations from all non-leaf nodes
+ for (int i = 1; i <= root_level; ++i) {
+ sum[i] += sum[i - 1];
+ }
+
+ return sum[root_level];
+}
+
+// naive summation for integers
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
+ typename ValueFunc>
+enable_if_t<!std::is_floating_point<SumType>::value, SumType> SumArray(
+ const ArrayData& data, ValueFunc&& func) {
+ SumType sum = 0;
+ const ValueType* values = data.GetValues<ValueType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ sum += func(values[pos + i]);
+ }
+ });
+ return sum;
+}
+
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel>
+SumType SumArray(const ArrayData& data) {
+ return SumArray<ValueType, SumType, SimdLevel>(
+ data, [](ValueType v) { return static_cast<SumType>(v); });
+}
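+
+// Usage sketch (illustrative; called from within arrow::compute::detail, with
+// `data` being the ArrayData of a float64 array):
+//
+//   double total = SumArray<double, double, SimdLevel::NONE>(data);
+//   double sum_of_squares = SumArray<double, double, SimdLevel::NONE>(
+//       data, [](double v) { return v * v; });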
+
+} // namespace detail
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc
new file mode 100644
index 00000000000..6ad0eeb6456
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc
@@ -0,0 +1,392 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cmath>
+#include <queue>
+#include <utility>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/result.h"
+#include "arrow/stl_allocator.h"
+#include "arrow/type_traits.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+using ModeState = OptionsWrapper<ModeOptions>;
+
+constexpr char kModeFieldName[] = "mode";
+constexpr char kCountFieldName[] = "count";
+
+constexpr uint64_t kCountEOF = ~0ULL;
+
+template <typename InType, typename CType = typename InType::c_type>
+Result<std::pair<CType*, int64_t*>> PrepareOutput(int64_t n, KernelContext* ctx,
+ Datum* out) {
+ const auto& mode_type = TypeTraits<InType>::type_singleton();
+ const auto& count_type = int64();
+
+ auto mode_data = ArrayData::Make(mode_type, /*length=*/n, /*null_count=*/0);
+ mode_data->buffers.resize(2, nullptr);
+ auto count_data = ArrayData::Make(count_type, n, 0);
+ count_data->buffers.resize(2, nullptr);
+
+ CType* mode_buffer = nullptr;
+ int64_t* count_buffer = nullptr;
+
+ if (n > 0) {
+ ARROW_ASSIGN_OR_RAISE(mode_data->buffers[1], ctx->Allocate(n * sizeof(CType)));
+ ARROW_ASSIGN_OR_RAISE(count_data->buffers[1], ctx->Allocate(n * sizeof(int64_t)));
+ mode_buffer = mode_data->template GetMutableValues<CType>(1);
+ count_buffer = count_data->template GetMutableValues<int64_t>(1);
+ }
+
+ const auto& out_type =
+ struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
+ *out = Datum(ArrayData::Make(out_type, n, {nullptr}, {mode_data, count_data}, 0));
+
+ return std::make_pair(mode_buffer, count_buffer);
+}
+
+// find top-n value:count pairs with a min-heap
+// suboptimal for tiny or large n, but acceptable since this is not a hot path
+template <typename InType, typename Generator>
+Status Finalize(KernelContext* ctx, Datum* out, Generator&& gen) {
+ using CType = typename InType::c_type;
+
+ using ValueCountPair = std::pair<CType, uint64_t>;
+ auto gt = [](const ValueCountPair& lhs, const ValueCountPair& rhs) {
+ const bool rhs_is_nan = rhs.first != rhs.first; // nan as largest value
+ return lhs.second > rhs.second ||
+ (lhs.second == rhs.second && (lhs.first < rhs.first || rhs_is_nan));
+ };
+
+ std::priority_queue<ValueCountPair, std::vector<ValueCountPair>, decltype(gt)> min_heap(
+ std::move(gt));
+
+ const ModeOptions& options = ModeState::Get(ctx);
+ while (true) {
+ const ValueCountPair& value_count = gen();
+ DCHECK_NE(value_count.second, 0);
+ if (value_count.second == kCountEOF) break;
+ if (static_cast<int64_t>(min_heap.size()) < options.n) {
+ min_heap.push(value_count);
+ } else if (gt(value_count, min_heap.top())) {
+ min_heap.pop();
+ min_heap.push(value_count);
+ }
+ }
+ const int64_t n = min_heap.size();
+
+ CType* mode_buffer;
+ int64_t* count_buffer;
+ ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
+ PrepareOutput<InType>(n, ctx, out));
+
+ for (int64_t i = n - 1; i >= 0; --i) {
+ std::tie(mode_buffer[i], count_buffer[i]) = min_heap.top();
+ min_heap.pop();
+ }
+
+ return Status::OK();
+}
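+
+// Worked example of the heap logic above (illustrative): with n == 2 and
+// candidate pairs {5:3, 2:3, 7:1}, the heap retains {2:3, 5:3}; `gt` ranks
+// larger counts first and, on equal counts, smaller values first, so draining
+// the min-heap back-to-front yields mode == [2, 5], count == [3, 3].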
+
+// count value occurrences for integers with a narrow value range
+// O(1) space, O(n) time
+template <typename T>
+struct CountModer {
+ using CType = typename T::c_type;
+
+ CType min;
+ std::vector<uint64_t> counts;
+
+ CountModer(CType min, CType max) {
+ uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+ DCHECK_LT(value_range, 1 << 20);
+ this->min = min;
+ this->counts.resize(value_range, 0);
+ }
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // count values in all chunks, ignore nulls
+ const Datum& datum = batch[0];
+ CountValues<CType>(this->counts.data(), datum, this->min);
+
+ // generator to emit next value:count pair
+ int index = 0;
+ auto gen = [&]() {
+ for (; index < static_cast<int>(counts.size()); ++index) {
+ if (counts[index] != 0) {
+ auto value_count =
+ std::make_pair(static_cast<CType>(index + this->min), counts[index]);
+ ++index;
+ return value_count;
+ }
+ }
+ return std::pair<CType, uint64_t>(0, kCountEOF);
+ };
+
+ return Finalize<T>(ctx, out, std::move(gen));
+ }
+};
+
+// booleans can be handled more straightforwardly
+template <>
+struct CountModer<BooleanType> {
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ int64_t counts[2]{};
+
+ const Datum& datum = batch[0];
+ for (const auto& array : datum.chunks()) {
+ if (array->length() > array->null_count()) {
+ const int64_t true_count =
+ arrow::internal::checked_pointer_cast<BooleanArray>(array)->true_count();
+ const int64_t false_count = array->length() - array->null_count() - true_count;
+ counts[true] += true_count;
+ counts[false] += false_count;
+ }
+ }
+
+ const ModeOptions& options = ModeState::Get(ctx);
+ const int64_t distinct_values = (counts[0] != 0) + (counts[1] != 0);
+ const int64_t n = std::min(options.n, distinct_values);
+
+ bool* mode_buffer;
+ int64_t* count_buffer;
+ ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
+ PrepareOutput<BooleanType>(n, ctx, out));
+
+ if (n >= 1) {
+ const bool index = counts[1] > counts[0];
+ mode_buffer[0] = index;
+ count_buffer[0] = counts[index];
+ if (n == 2) {
+ mode_buffer[1] = !index;
+ count_buffer[1] = counts[!index];
+ }
+ }
+
+ return Status::OK();
+ }
+};
+
+// copy-and-sort approach for floating point values or integers with a wide value range
+// O(n) space, O(n log n) time
+template <typename T>
+struct SortModer {
+ using CType = typename T::c_type;
+ using Allocator = arrow::stl::allocator<CType>;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // copy all chunks to a buffer, ignoring nulls and NaNs
+ std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
+
+ uint64_t nan_count = 0;
+ const Datum& datum = batch[0];
+ const int64_t in_length = datum.length() - datum.null_count();
+ if (in_length > 0) {
+ in_buffer.resize(in_length);
+ CopyNonNullValues(datum, in_buffer.data());
+
+      // drop NaNs
+ if (is_floating_type<T>::value) {
+ const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
+ [](CType v) { return v != v; });
+ nan_count = in_buffer.end() - it;
+ in_buffer.resize(it - in_buffer.begin());
+ }
+ }
+
+ // sort the input data to count same values
+ std::sort(in_buffer.begin(), in_buffer.end());
+
+ // generator to emit next value:count pair
+ auto it = in_buffer.cbegin();
+ auto gen = [&]() {
+ if (ARROW_PREDICT_FALSE(it == in_buffer.cend())) {
+        // handle NaNs at the end
+ if (nan_count > 0) {
+ auto value_count = std::make_pair(static_cast<CType>(NAN), nan_count);
+ nan_count = 0;
+ return value_count;
+ }
+ return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
+ }
+ // count same values
+ const CType value = *it;
+ uint64_t count = 0;
+ do {
+ ++it;
+ ++count;
+ } while (it != in_buffer.cend() && *it == value);
+ return std::make_pair(value, count);
+ };
+
+ return Finalize<T>(ctx, out, std::move(gen));
+ }
+};
+
+// pick the counting or sorting approach based on the integer value range
+template <typename T>
+struct CountOrSortModer {
+ using CType = typename T::c_type;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // crossover point at which the counting approach wins
+    // roughly 2x improvement for int32/64, from micro-benchmarks
+ static constexpr int kMinArraySize = 8192;
+ static constexpr int kMaxValueRange = 32768;
+
+ const Datum& datum = batch[0];
+ if (datum.length() - datum.null_count() >= kMinArraySize) {
+ CType min, max;
+ std::tie(min, max) = GetMinMax<CType>(datum);
+
+ if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
+ return CountModer<T>(min, max).Exec(ctx, batch, out);
+ }
+ }
+
+ return SortModer<T>().Exec(ctx, batch, out);
+ }
+};
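+
+// Illustrative consequence of the thresholds above: an int32 array of 10000
+// elements spanning values 0..1000 takes the O(value_range) counting path,
+// while the same array spanning 0..10^9 falls back to the O(n log n) sort path.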
+
+template <typename InType, typename Enable = void>
+struct Moder;
+
+template <>
+struct Moder<Int8Type> {
+ CountModer<Int8Type> impl;
+ Moder() : impl(-128, 127) {}
+};
+
+template <>
+struct Moder<UInt8Type> {
+ CountModer<UInt8Type> impl;
+ Moder() : impl(0, 255) {}
+};
+
+template <>
+struct Moder<BooleanType> {
+ CountModer<BooleanType> impl;
+};
+
+template <typename InType>
+struct Moder<InType, enable_if_t<(is_integer_type<InType>::value &&
+ (sizeof(typename InType::c_type) > 1))>> {
+ CountOrSortModer<InType> impl;
+};
+
+template <typename InType>
+struct Moder<InType, enable_if_t<is_floating_type<InType>::value>> {
+ SortModer<InType> impl;
+};
+
+template <typename T>
+Status ScalarMode(KernelContext* ctx, const Scalar& scalar, Datum* out) {
+ using CType = typename T::c_type;
+ if (scalar.is_valid) {
+ bool called = false;
+ return Finalize<T>(ctx, out, [&]() {
+ if (!called) {
+ called = true;
+ return std::pair<CType, uint64_t>(UnboxScalar<T>::Unbox(scalar), 1);
+ }
+ return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
+ });
+ }
+ return Finalize<T>(ctx, out, []() {
+ return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
+ });
+}
+
+template <typename _, typename InType>
+struct ModeExecutor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (ctx->state() == nullptr) {
+ return Status::Invalid("Mode requires ModeOptions");
+ }
+ const ModeOptions& options = ModeState::Get(ctx);
+ if (options.n <= 0) {
+      return Status::Invalid("ModeOptions::n must be strictly positive");
+ }
+
+ if (batch[0].is_scalar()) {
+ return ScalarMode<InType>(ctx, *batch[0].scalar(), out);
+ }
+
+ return Moder<InType>().impl.Exec(ctx, batch, out);
+ }
+};
+
+VectorKernel NewModeKernel(const std::shared_ptr<DataType>& in_type) {
+ VectorKernel kernel;
+ kernel.init = ModeState::Init;
+ kernel.can_execute_chunkwise = false;
+ kernel.output_chunked = false;
+ auto out_type =
+ struct_({field(kModeFieldName, in_type), field(kCountFieldName, int64())});
+ kernel.signature =
+ KernelSignature::Make({InputType(in_type)}, ValueDescr::Array(out_type));
+ return kernel;
+}
+
+void AddBooleanModeKernel(VectorFunction* func) {
+ VectorKernel kernel = NewModeKernel(boolean());
+ kernel.exec = ModeExecutor<StructType, BooleanType>::Exec;
+ DCHECK_OK(func->AddKernel(kernel));
+}
+
+void AddNumericModeKernels(VectorFunction* func) {
+ for (const auto& type : NumericTypes()) {
+ VectorKernel kernel = NewModeKernel(type);
+ kernel.exec = GenerateNumeric<ModeExecutor, StructType>(*type);
+ DCHECK_OK(func->AddKernel(kernel));
+ }
+}
+
+const FunctionDoc mode_doc{
+ "Calculate the modal (most common) values of a numeric array",
+    ("Returns the top-n most common values and the number of times they occur\n"
+     "in an array.\n"
+     "The result is an array of `struct<mode: T, count: int64>`, where T is\n"
+     "the input type.\n"
+     "Values with larger counts are returned before values with smaller\n"
+     "counts. If several values have the same count, the smaller value is\n"
+     "returned first.\n"
+     "Nulls are ignored. If there are no non-null values in the array,\n"
+     "an empty array is returned."),
+ {"array"},
+ "ModeOptions"};
+
+} // namespace
+
+void RegisterScalarAggregateMode(FunctionRegistry* registry) {
+ static auto default_options = ModeOptions::Defaults();
+ auto func = std::make_shared<VectorFunction>("mode", Arity::Unary(), &mode_doc,
+ &default_options);
+ AddBooleanModeKernel(func.get());
+ AddNumericModeKernels(func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
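+
+// Usage sketch for the registered function (hypothetical caller; assumes
+// ModeOptions takes `n` as a constructor argument, matching the `options.n`
+// used above):
+//
+//   arrow::compute::ModeOptions options(/*n=*/2);
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("mode", {array}, &options));
+//   // `out` is a struct array of the 2 most common values and their counts.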
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
new file mode 100644
index 00000000000..7d2ffe0770c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
@@ -0,0 +1,493 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cmath>
+#include <vector>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/stl_allocator.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+using QuantileState = internal::OptionsWrapper<QuantileOptions>;
+
+// output is at some input data point, not interpolated
+bool IsDataPoint(const QuantileOptions& options) {
+ // some interpolation methods return exact data point
+ return options.interpolation == QuantileOptions::LOWER ||
+ options.interpolation == QuantileOptions::HIGHER ||
+ options.interpolation == QuantileOptions::NEAREST;
+}
+
+// quantile to exact datapoint index (IsDataPoint == true)
+uint64_t QuantileToDataPoint(size_t length, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (length - 1) * q;
+ uint64_t datapoint_index = static_cast<uint64_t>(index);
+ const double fraction = index - datapoint_index;
+
+ if (interpolation == QuantileOptions::LINEAR ||
+ interpolation == QuantileOptions::MIDPOINT) {
+ DCHECK_EQ(fraction, 0);
+ }
+
+ // convert NEAREST interpolation method to LOWER or HIGHER
+ if (interpolation == QuantileOptions::NEAREST) {
+ if (fraction < 0.5) {
+ interpolation = QuantileOptions::LOWER;
+ } else if (fraction > 0.5) {
+ interpolation = QuantileOptions::HIGHER;
+ } else {
+ // round 0.5 to nearest even number, similar to numpy.around
+ interpolation =
+ (datapoint_index & 1) ? QuantileOptions::HIGHER : QuantileOptions::LOWER;
+ }
+ }
+
+ if (interpolation == QuantileOptions::HIGHER && fraction != 0) {
+ ++datapoint_index;
+ }
+
+ return datapoint_index;
+}
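+
+// Worked example (illustrative): length == 5 and q == 0.375 give
+// index == 1.5, so datapoint_index == 1 and fraction == 0.5; the tie is
+// broken toward the even index, and since 1 is odd, HIGHER is chosen and the
+// function returns 2, matching numpy.around's round-half-to-even behavior.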
+
+// copy and nth_element approach, large memory footprint
+template <typename InType>
+struct SortQuantiler {
+ using CType = typename InType::c_type;
+ using Allocator = arrow::stl::allocator<CType>;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+
+ // copy all chunks to a buffer, ignore nulls and nans
+ std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
+
+ const Datum& datum = batch[0];
+ const int64_t in_length = datum.length() - datum.null_count();
+ if (in_length > 0) {
+ in_buffer.resize(in_length);
+ CopyNonNullValues(datum, in_buffer.data());
+
+ // drop nan
+ if (is_floating_type<InType>::value) {
+ const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
+ [](CType v) { return v != v; });
+ in_buffer.resize(it - in_buffer.begin());
+ }
+ }
+
+ // prepare out array
+ int64_t out_length = options.q.size();
+ if (in_buffer.empty()) {
+      out_length = 0;  // input is empty or contains only nulls and NaNs; return an empty array
+ }
+ // out type depends on options
+ const bool is_datapoint = IsDataPoint(options);
+ const std::shared_ptr<DataType> out_type =
+ is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+ auto out_data = ArrayData::Make(out_type, out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ // calculate quantiles
+ if (out_length > 0) {
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
+
+ // find quantiles in descending order
+ std::vector<int64_t> q_indices(out_length);
+ std::iota(q_indices.begin(), q_indices.end(), 0);
+ std::sort(q_indices.begin(), q_indices.end(),
+ [&options](int64_t left_index, int64_t right_index) {
+ return options.q[right_index] < options.q[left_index];
+ });
+
+ // input array is partitioned around data point at `last_index` (pivot)
+    // for the next (smaller) quantile, we only consider inputs left of the pivot
+ uint64_t last_index = in_buffer.size();
+ if (is_datapoint) {
+ CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileAtDataPoint(
+ in_buffer, &last_index, options.q[q_index], options.interpolation);
+ }
+ } else {
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileByInterp(
+ in_buffer, &last_index, options.q[q_index], options.interpolation);
+ }
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ return Status::OK();
+ }
+
+ // return quantile located exactly at some input data point
+ CType GetQuantileAtDataPoint(std::vector<CType, Allocator>& in, uint64_t* last_index,
+ double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const uint64_t datapoint_index = QuantileToDataPoint(in.size(), q, interpolation);
+
+ if (datapoint_index != *last_index) {
+ DCHECK_LT(datapoint_index, *last_index);
+ std::nth_element(in.begin(), in.begin() + datapoint_index,
+ in.begin() + *last_index);
+ *last_index = datapoint_index;
+ }
+
+ return in[datapoint_index];
+ }
+
+ // return quantile interpolated from adjacent input data points
+ double GetQuantileByInterp(std::vector<CType, Allocator>& in, uint64_t* last_index,
+ double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (in.size() - 1) * q;
+ const uint64_t lower_index = static_cast<uint64_t>(index);
+ const double fraction = index - lower_index;
+
+ if (lower_index != *last_index) {
+ DCHECK_LT(lower_index, *last_index);
+ std::nth_element(in.begin(), in.begin() + lower_index, in.begin() + *last_index);
+ }
+
+ const double lower_value = static_cast<double>(in[lower_index]);
+ if (fraction == 0) {
+ *last_index = lower_index;
+ return lower_value;
+ }
+
+ const uint64_t higher_index = lower_index + 1;
+ DCHECK_LT(higher_index, in.size());
+ if (lower_index != *last_index && higher_index != *last_index) {
+ DCHECK_LT(higher_index, *last_index);
+ // higher value must be the minimal value after lower_index
+ auto min = std::min_element(in.begin() + higher_index, in.begin() + *last_index);
+ std::iter_swap(in.begin() + higher_index, min);
+ }
+ *last_index = lower_index;
+
+ const double higher_value = static_cast<double>(in[higher_index]);
+
+ if (interpolation == QuantileOptions::LINEAR) {
+ // more stable than naive linear interpolation
+ return fraction * higher_value + (1 - fraction) * lower_value;
+ } else if (interpolation == QuantileOptions::MIDPOINT) {
+ return lower_value / 2 + higher_value / 2;
+ } else {
+ DCHECK(false);
+ return NAN;
+ }
+ }
+};
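+
+// On the LINEAR formula above (illustrative rationale): the naive form
+// lower + fraction * (higher - lower) computes the difference of two possibly
+// large, close values, which cancels significant digits; the weighted form
+// fraction * higher + (1 - fraction) * lower avoids forming that difference.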
+
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+ using CType = typename InType::c_type;
+
+ CType min;
+  std::vector<uint64_t> counts;  // counts[i]: # of values equal to i + min
+
+ // indices to adjacent non-empty bins covering current quantile
+ struct AdjacentBins {
+ int left_index;
+ int right_index;
+    uint64_t total_count;  // accumulated count up to left_index (inclusive)
+ };
+
+ CountQuantiler(CType min, CType max) {
+ uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+ DCHECK_LT(value_range, 1 << 30);
+ this->min = min;
+ this->counts.resize(value_range, 0);
+ }
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+
+ // count values in all chunks, ignore nulls
+ const Datum& datum = batch[0];
+ int64_t in_length = CountValues<CType>(this->counts.data(), datum, this->min);
+
+ // prepare out array
+ int64_t out_length = options.q.size();
+ if (in_length == 0) {
+ out_length = 0; // input is empty or only contains null, return empty array
+ }
+ // out type depends on options
+ const bool is_datapoint = IsDataPoint(options);
+ const std::shared_ptr<DataType> out_type =
+ is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+ auto out_data = ArrayData::Make(out_type, out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ // calculate quantiles
+ if (out_length > 0) {
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
+
+ // find quantiles in ascending order
+ std::vector<int64_t> q_indices(out_length);
+ std::iota(q_indices.begin(), q_indices.end(), 0);
+ std::sort(q_indices.begin(), q_indices.end(),
+ [&options](int64_t left_index, int64_t right_index) {
+ return options.q[left_index] < options.q[right_index];
+ });
+
+ AdjacentBins bins{0, 0, this->counts[0]};
+ if (is_datapoint) {
+ CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileAtDataPoint(
+ in_length, &bins, options.q[q_index], options.interpolation);
+ }
+ } else {
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+ options.interpolation);
+ }
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ return Status::OK();
+ }
+
+ // return quantile located exactly at some input data point
+ CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+ while (datapoint_index >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(datapoint_index, bins->total_count);
+ return static_cast<CType>(bins->left_index + this->min);
+ }
+
+ // return quantile interpolated from adjacent input data points
+ double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (in_length - 1) * q;
+ const uint64_t index_floor = static_cast<uint64_t>(index);
+ const double fraction = index - index_floor;
+
+ while (index_floor >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(index_floor, bins->total_count);
+ const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+ // quantile lies in this bin, no interpolation needed
+ if (index <= bins->total_count - 1) {
+ return lower_value;
+ }
+
+ // quantile lies across two bins, locate next bin if not already done
+ DCHECK_EQ(index_floor, bins->total_count - 1);
+ if (bins->right_index <= bins->left_index) {
+ bins->right_index = bins->left_index + 1;
+ while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+ this->counts[bins->right_index] == 0) {
+ ++bins->right_index;
+ }
+ }
+ DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+ DCHECK_GT(this->counts[bins->right_index], 0);
+ const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+ if (interpolation == QuantileOptions::LINEAR) {
+ return fraction * higher_value + (1 - fraction) * lower_value;
+ } else if (interpolation == QuantileOptions::MIDPOINT) {
+ return lower_value / 2 + higher_value / 2;
+ } else {
+ DCHECK(false);
+ return NAN;
+ }
+ }
+};
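+
+// Worked example (illustrative): min == 10 and counts == {2, 0, 3} encode the
+// values {10, 10, 12, 12, 12}; for q == 0.5, index == 2.0, the cursor walks to
+// left_index == 2 (total_count == 5) and both methods return the data point 12.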
+
+// histogram or 'copy & nth_element' approach, chosen by value range and size; integers only
+template <typename InType>
+struct CountOrSortQuantiler {
+ using CType = typename InType::c_type;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // crossover point at which the histogram approach wins
+    // parameters estimated manually from ad-hoc benchmarks
+ static constexpr int kMinArraySize = 65536;
+ static constexpr int kMaxValueRange = 65536;
+
+ const Datum& datum = batch[0];
+ if (datum.length() - datum.null_count() >= kMinArraySize) {
+ CType min, max;
+ std::tie(min, max) = GetMinMax<CType>(datum);
+
+ if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
+ return CountQuantiler<InType>(min, max).Exec(ctx, batch, out);
+ }
+ }
+
+ return SortQuantiler<InType>().Exec(ctx, batch, out);
+ }
+};
+
+template <typename InType, typename Enable = void>
+struct ExactQuantiler;
+
+template <>
+struct ExactQuantiler<UInt8Type> {
+ CountQuantiler<UInt8Type> impl;
+ ExactQuantiler() : impl(0, 255) {}
+};
+
+template <>
+struct ExactQuantiler<Int8Type> {
+ CountQuantiler<Int8Type> impl;
+ ExactQuantiler() : impl(-128, 127) {}
+};
+
+template <typename InType>
+struct ExactQuantiler<InType, enable_if_t<(is_integer_type<InType>::value &&
+ (sizeof(typename InType::c_type) > 1))>> {
+ CountOrSortQuantiler<InType> impl;
+};
+
+template <typename InType>
+struct ExactQuantiler<InType, enable_if_t<is_floating_type<InType>::value>> {
+ SortQuantiler<InType> impl;
+};
+
+template <typename T>
+Status ScalarQuantile(KernelContext* ctx, const QuantileOptions& options,
+ const Scalar& scalar, Datum* out) {
+ using CType = typename T::c_type;
+ ArrayData* output = out->mutable_array();
+ if (!scalar.is_valid) {
+ output->length = 0;
+ output->null_count = 0;
+ return Status::OK();
+ }
+ auto out_type = IsDataPoint(options) ? scalar.type : float64();
+ output->length = options.q.size();
+ output->null_count = 0;
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[1],
+ ctx->Allocate(output->length * BitUtil::BytesForBits(GetBitWidth(*out_type))));
+ if (IsDataPoint(options)) {
+ CType* out_buffer = output->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < output->length; i++) {
+ out_buffer[i] = UnboxScalar<T>::Unbox(scalar);
+ }
+ } else {
+ double* out_buffer = output->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < output->length; i++) {
+ out_buffer[i] = static_cast<double>(UnboxScalar<T>::Unbox(scalar));
+ }
+ }
+ return Status::OK();
+}
+
+template <typename _, typename InType>
+struct QuantileExecutor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (ctx->state() == nullptr) {
+ return Status::Invalid("Quantile requires QuantileOptions");
+ }
+
+ const QuantileOptions& options = QuantileState::Get(ctx);
+ if (options.q.empty()) {
+ return Status::Invalid("Requires quantile argument");
+ }
+ for (double q : options.q) {
+ if (q < 0 || q > 1) {
+ return Status::Invalid("Quantile must be between 0 and 1");
+ }
+ }
+
+ if (batch[0].is_scalar()) {
+ return ScalarQuantile<InType>(ctx, options, *batch[0].scalar(), out);
+ }
+
+ return ExactQuantiler<InType>().impl.Exec(ctx, batch, out);
+ }
+};
+
+Result<ValueDescr> ResolveOutput(KernelContext* ctx,
+ const std::vector<ValueDescr>& args) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+ if (IsDataPoint(options)) {
+ return ValueDescr::Array(args[0].type);
+ } else {
+ return ValueDescr::Array(float64());
+ }
+}
+
+void AddQuantileKernels(VectorFunction* func) {
+ VectorKernel base;
+ base.init = QuantileState::Init;
+ base.can_execute_chunkwise = false;
+ base.output_chunked = false;
+
+ for (const auto& ty : NumericTypes()) {
+ base.signature = KernelSignature::Make({InputType(ty)}, OutputType(ResolveOutput));
+ // output type is determined at runtime, set template argument to nulltype
+ base.exec = GenerateNumeric<QuantileExecutor, NullType>(*ty);
+ DCHECK_OK(func->AddKernel(base));
+ }
+}
+
+const FunctionDoc quantile_doc{
+ "Compute an array of quantiles of a numeric array or chunked array",
+    ("By default, the 0.5 quantile (median) is returned.\n"
+     "If a quantile lies between two data points, an interpolated value is\n"
+     "returned based on the selected interpolation method.\n"
+ "Nulls and NaNs are ignored.\n"
+ "An empty array is returned if there is no valid data point."),
+ {"array"},
+ "QuantileOptions"};
+
+} // namespace
+
+void RegisterScalarAggregateQuantile(FunctionRegistry* registry) {
+ static QuantileOptions default_options;
+ auto func = std::make_shared<VectorFunction>("quantile", Arity::Unary(), &quantile_doc,
+ &default_options);
+ AddQuantileKernels(func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
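+
+// Usage sketch (hypothetical caller; assumes QuantileOptions accepts a
+// std::vector<double> of probabilities):
+//
+//   arrow::compute::QuantileOptions options(
+//       std::vector<double>{0.25, 0.5, 0.75},
+//       arrow::compute::QuantileOptions::LINEAR);
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("quantile", {array}, &options));
+//   // `out` holds three float64 values (or input-typed values for
+//   // LOWER/HIGHER/NEAREST interpolation).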
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
new file mode 100644
index 00000000000..4c261604c85
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/tdigest.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+using arrow::internal::TDigest;
+using arrow::internal::VisitSetBitRunsVoid;
+
+template <typename ArrowType>
+struct TDigestImpl : public ScalarAggregator {
+ using ThisType = TDigestImpl<ArrowType>;
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+ using CType = typename ArrowType::c_type;
+
+ explicit TDigestImpl(const TDigestOptions& options)
+ : q{options.q}, tdigest{options.delta, options.buffer_size} {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ const ArrayData& data = *batch[0].array();
+ const CType* values = data.GetValues<CType>(1);
+
+ if (data.length > data.GetNullCount()) {
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ this->tdigest.NanAdd(values[pos + i]);
+ }
+ });
+ }
+ } else {
+ const CType value = UnboxScalar<ArrowType>::Unbox(*batch[0].scalar());
+ if (batch[0].scalar()->is_valid) {
+ this->tdigest.NanAdd(value);
+ }
+ }
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ auto& other = checked_cast<ThisType&>(src);
+ std::vector<TDigest> other_tdigest;
+ other_tdigest.push_back(std::move(other.tdigest));
+ this->tdigest.Merge(&other_tdigest);
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext* ctx, Datum* out) override {
+ const int64_t out_length = this->tdigest.is_empty() ? 0 : this->q.size();
+ auto out_data = ArrayData::Make(float64(), out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ if (out_length > 0) {
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(out_length * sizeof(double)));
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ out_buffer[i] = this->tdigest.Quantile(this->q[i]);
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ return Status::OK();
+ }
+
+ const std::vector<double>& q;
+ TDigest tdigest;
+};
+
+struct TDigestInitState {
+ std::unique_ptr<KernelState> state;
+ KernelContext* ctx;
+ const DataType& in_type;
+ const TDigestOptions& options;
+
+ TDigestInitState(KernelContext* ctx, const DataType& in_type,
+ const TDigestOptions& options)
+ : ctx(ctx), in_type(in_type), options(options) {}
+
+ Status Visit(const DataType&) {
+ return Status::NotImplemented("No tdigest implemented");
+ }
+
+ Status Visit(const HalfFloatType&) {
+ return Status::NotImplemented("No tdigest implemented");
+ }
+
+ template <typename Type>
+ enable_if_t<is_number_type<Type>::value, Status> Visit(const Type&) {
+ state.reset(new TDigestImpl<Type>(options));
+ return Status::OK();
+ }
+
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(in_type, this));
+ return std::move(state);
+ }
+};
+
+Result<std::unique_ptr<KernelState>> TDigestInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ TDigestInitState visitor(ctx, *args.inputs[0].type,
+ static_cast<const TDigestOptions&>(*args.options));
+ return visitor.Create();
+}
+
+void AddTDigestKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ ScalarAggregateFunction* func) {
+ for (const auto& ty : types) {
+ auto sig = KernelSignature::Make({InputType(ty)}, float64());
+ AddAggKernel(std::move(sig), init, func);
+ }
+}
+
+const FunctionDoc tdigest_doc{
+    "Approximate quantiles of a numeric array with the T-Digest algorithm",
+    ("By default, the 0.5 quantile (median) is returned.\n"
+ "Nulls and NaNs are ignored.\n"
+ "An empty array is returned if there is no valid data point."),
+ {"array"},
+ "TDigestOptions"};
+
+std::shared_ptr<ScalarAggregateFunction> AddTDigestAggKernels() {
+ static auto default_tdigest_options = TDigestOptions::Defaults();
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "tdigest", Arity::Unary(), &tdigest_doc, &default_tdigest_options);
+ AddTDigestKernels(TDigestInit, NumericTypes(), func.get());
+ return func;
+}
+
+} // namespace
+
+void RegisterScalarAggregateTDigest(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunction(AddTDigestAggKernels()));
+}
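+
+// Usage sketch (hypothetical caller; assumes a TDigestOptions constructor
+// taking a std::vector<double> of quantiles):
+//
+//   arrow::compute::TDigestOptions options(std::vector<double>{0.99});
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("tdigest", {array}, &options));
+//   // Approximates the 0.99 quantile in bounded memory, unlike the exact
+//   // "quantile" kernel, which buffers all values.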
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc
new file mode 100644
index 00000000000..d6965fed4a3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc
@@ -0,0 +1,326 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cmath>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/int128_internal.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+using arrow::internal::int128_t;
+using arrow::internal::VisitSetBitRunsVoid;
+
+template <typename ArrowType>
+struct VarStdState {
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+ using CType = typename ArrowType::c_type;
+ using ThisType = VarStdState<ArrowType>;
+
+  // float/double/int64: calculate `m2` (sum((X-mean)^2)) with the two-pass algorithm
+ // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
+ template <typename T = ArrowType>
+ enable_if_t<is_floating_type<T>::value || (sizeof(CType) > 4)> Consume(
+ const ArrayType& array) {
+ int64_t count = array.length() - array.null_count();
+ if (count == 0) {
+ return;
+ }
+
+ using SumType =
+ typename std::conditional<is_floating_type<T>::value, double, int128_t>::type;
+ SumType sum =
+ arrow::compute::detail::SumArray<CType, SumType, SimdLevel::NONE>(*array.data());
+
+ const double mean = static_cast<double>(sum) / count;
+ const double m2 = arrow::compute::detail::SumArray<CType, double, SimdLevel::NONE>(
+ *array.data(), [mean](CType value) {
+ const double v = static_cast<double>(value);
+ return (v - mean) * (v - mean);
+ });
+
+ this->count = count;
+ this->mean = mean;
+ this->m2 = m2;
+ }
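+
+  // Why two passes matter for floats (illustrative): for {1e8, 1e8 + 1} the
+  // one-pass E[X^2] - E[X]^2 form cancels nearly all significant digits in
+  // double, while the two-pass sum((X - mean)^2) with mean == 1e8 + 0.5
+  // computes m2 == 0.5 exactly.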
+
+  // int32/16/8: textbook one-pass algorithm with integer arithmetic
+ template <typename T = ArrowType>
+ enable_if_t<is_integer_type<T>::value && (sizeof(CType) <= 4)> Consume(
+ const ArrayType& array) {
+    // max number of elements such that the sum cannot overflow int64 (2Gi int32 elements)
+ // for uint32: 0 <= sum < 2^63 (int64 >= 0)
+ // for int32: -2^62 <= sum < 2^62
+ constexpr int64_t max_length = 1ULL << (63 - sizeof(CType) * 8);
+
+ int64_t start_index = 0;
+ int64_t valid_count = array.length() - array.null_count();
+
+ while (valid_count > 0) {
+      // process in chunks so that overflow can never happen
+ const auto slice = array.Slice(start_index, max_length);
+ const int64_t count = slice->length() - slice->null_count();
+ start_index += max_length;
+ valid_count -= count;
+
+ if (count > 0) {
+ int64_t sum = 0;
+ int128_t square_sum = 0;
+ const ArrayData& data = *slice->data();
+ const CType* values = data.GetValues<CType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ const auto value = values[pos + i];
+ sum += value;
+ square_sum += static_cast<uint64_t>(value) * value;
+ }
+ });
+
+ const double mean = static_cast<double>(sum) / count;
+ // calculate m2 = square_sum - sum * sum / count
+ // decompose `sum * sum / count` into integers and fractions
+ const int128_t sum_square = static_cast<int128_t>(sum) * sum;
+ const int128_t integers = sum_square / count;
+ const double fractions = static_cast<double>(sum_square % count) / count;
+ const double m2 = static_cast<double>(square_sum - integers) - fractions;
+
+ // merge variance
+ ThisType state;
+ state.count = count;
+ state.mean = mean;
+ state.m2 = m2;
+ this->MergeFrom(state);
+ }
+ }
+ }
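+
+  // Worked check of the decomposition above (illustrative): values {1, 2, 3, 4}
+  // give sum == 10, square_sum == 30, count == 4; sum*sum/count splits into
+  // integers == 25 and fractions == 0.0, so m2 == 30 - 25 - 0.0 == 5.0,
+  // matching sum((X - 2.5)^2) == 2.25 + 0.25 + 0.25 + 2.25.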
+
+ // Combine `m2` from two chunks (m2 = n*s2)
+ // https://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
+ void MergeFrom(const ThisType& state) {
+ if (state.count == 0) {
+ return;
+ }
+ if (this->count == 0) {
+ this->count = state.count;
+ this->mean = state.mean;
+ this->m2 = state.m2;
+ return;
+ }
+ double mean = (this->mean * this->count + state.mean * state.count) /
+ (this->count + state.count);
+ this->m2 += state.m2 + this->count * (this->mean - mean) * (this->mean - mean) +
+ state.count * (state.mean - mean) * (state.mean - mean);
+ this->count += state.count;
+ this->mean = mean;
+ }
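+
+  // Worked check (illustrative): chunk A == {1, 3} (count 2, mean 2, m2 2)
+  // merged with chunk B == {5} (count 1, mean 5, m2 0) gives mean == 3 and
+  // m2 == 2 + 0 + 2*(2-3)^2 + 1*(5-3)^2 == 8, which equals
+  // sum((X - 3)^2) == 4 + 0 + 4 over the combined {1, 3, 5}.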
+
+ int64_t count = 0;
+ double mean = 0;
+ double m2 = 0; // m2 = count*s2 = sum((X-mean)^2)
+};
+
+enum class VarOrStd : bool { Var, Std };
+
+template <typename ArrowType>
+struct VarStdImpl : public ScalarAggregator {
+ using ThisType = VarStdImpl<ArrowType>;
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+
+ explicit VarStdImpl(const std::shared_ptr<DataType>& out_type,
+ const VarianceOptions& options, VarOrStd return_type)
+ : out_type(out_type), options(options), return_type(return_type) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ ArrayType array(batch[0].array());
+ this->state.Consume(array);
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const ThisType&>(src);
+ this->state.MergeFrom(other.state);
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (this->state.count <= options.ddof) {
+ out->value = std::make_shared<DoubleScalar>();
+ } else {
+ double var = this->state.m2 / (this->state.count - options.ddof);
+ out->value =
+ std::make_shared<DoubleScalar>(return_type == VarOrStd::Var ? var : sqrt(var));
+ }
+ return Status::OK();
+ }
+
+ std::shared_ptr<DataType> out_type;
+ VarStdState<ArrowType> state;
+ VarianceOptions options;
+ VarOrStd return_type;
+};
+
+struct ScalarVarStdImpl : public ScalarAggregator {
+ explicit ScalarVarStdImpl(const VarianceOptions& options)
+ : options(options), seen(false) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ seen = batch[0].scalar()->is_valid;
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const ScalarVarStdImpl&>(src);
+ seen = seen || other.seen;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (!seen || options.ddof > 0) {
+ out->value = std::make_shared<DoubleScalar>();
+ } else {
+ out->value = std::make_shared<DoubleScalar>(0.0);
+ }
+ return Status::OK();
+ }
+
+ const VarianceOptions options;
+ bool seen;
+};
+
+struct VarStdInitState {
+ std::unique_ptr<KernelState> state;
+ KernelContext* ctx;
+ const DataType& in_type;
+ const std::shared_ptr<DataType>& out_type;
+ const VarianceOptions& options;
+ VarOrStd return_type;
+
+ VarStdInitState(KernelContext* ctx, const DataType& in_type,
+ const std::shared_ptr<DataType>& out_type,
+ const VarianceOptions& options, VarOrStd return_type)
+ : ctx(ctx),
+ in_type(in_type),
+ out_type(out_type),
+ options(options),
+ return_type(return_type) {}
+
+ Status Visit(const DataType&) {
+ return Status::NotImplemented("No variance/stddev implemented");
+ }
+
+ Status Visit(const HalfFloatType&) {
+ return Status::NotImplemented("No variance/stddev implemented");
+ }
+
+ template <typename Type>
+ enable_if_t<is_number_type<Type>::value, Status> Visit(const Type&) {
+ state.reset(new VarStdImpl<Type>(out_type, options, return_type));
+ return Status::OK();
+ }
+
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(in_type, this));
+ return std::move(state);
+ }
+};
+
+Result<std::unique_ptr<KernelState>> StddevInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ VarStdInitState visitor(
+ ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(),
+ static_cast<const VarianceOptions&>(*args.options), VarOrStd::Std);
+ return visitor.Create();
+}
+
+Result<std::unique_ptr<KernelState>> VarianceInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ VarStdInitState visitor(
+ ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(),
+ static_cast<const VarianceOptions&>(*args.options), VarOrStd::Var);
+ return visitor.Create();
+}
+
+Result<std::unique_ptr<KernelState>> ScalarVarStdInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ return arrow::internal::make_unique<ScalarVarStdImpl>(
+ static_cast<const VarianceOptions&>(*args.options));
+}
+
+void AddVarStdKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ ScalarAggregateFunction* func) {
+ for (const auto& ty : types) {
+ auto sig = KernelSignature::Make({InputType::Array(ty)}, float64());
+ AddAggKernel(std::move(sig), init, func);
+
+ sig = KernelSignature::Make({InputType::Scalar(ty)}, float64());
+ AddAggKernel(std::move(sig), ScalarVarStdInit, func);
+ }
+}
+
+const FunctionDoc stddev_doc{
+ "Calculate the standard deviation of a numeric array",
+ ("The number of degrees of freedom can be controlled using VarianceOptions.\n"
+ "By default (`ddof` = 0), the population standard deviation is calculated.\n"
+ "Nulls are ignored. If there are not enough non-null values in the array\n"
+ "to satisfy `ddof`, null is returned."),
+ {"array"},
+ "VarianceOptions"};
+
+const FunctionDoc variance_doc{
+ "Calculate the variance of a numeric array",
+ ("The number of degrees of freedom can be controlled using VarianceOptions.\n"
+ "By default (`ddof` = 0), the population variance is calculated.\n"
+ "Nulls are ignored. If there are not enough non-null values in the array\n"
+ "to satisfy `ddof`, null is returned."),
+ {"array"},
+ "VarianceOptions"};
+
+std::shared_ptr<ScalarAggregateFunction> AddStddevAggKernels() {
+ static auto default_std_options = VarianceOptions::Defaults();
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "stddev", Arity::Unary(), &stddev_doc, &default_std_options);
+ AddVarStdKernels(StddevInit, NumericTypes(), func.get());
+ return func;
+}
+
+std::shared_ptr<ScalarAggregateFunction> AddVarianceAggKernels() {
+ static auto default_var_options = VarianceOptions::Defaults();
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "variance", Arity::Unary(), &variance_doc, &default_var_options);
+ AddVarStdKernels(VarianceInit, NumericTypes(), func.get());
+ return func;
+}
+
+} // namespace
+
+void RegisterScalarAggregateVariance(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunction(AddVarianceAggKernels()));
+ DCHECK_OK(registry->AddFunction(AddStddevAggKernels()));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc
new file mode 100644
index 00000000000..bab8e7000cd
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -0,0 +1,337 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/kernels/codegen_internal.h"
+
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::NotImplemented("This kernel is malformed");
+}
+
+ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec) {
+ return [exec](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ExecBatch flipped_batch = batch;
+ std::swap(flipped_batch.values[0], flipped_batch.values[1]);
+ return exec(ctx, flipped_batch, out);
+ };
+}
+
+std::vector<std::shared_ptr<DataType>> g_signed_int_types;
+std::vector<std::shared_ptr<DataType>> g_unsigned_int_types;
+std::vector<std::shared_ptr<DataType>> g_int_types;
+std::vector<std::shared_ptr<DataType>> g_floating_types;
+std::vector<std::shared_ptr<DataType>> g_numeric_types;
+std::vector<std::shared_ptr<DataType>> g_base_binary_types;
+std::vector<std::shared_ptr<DataType>> g_temporal_types;
+std::vector<std::shared_ptr<DataType>> g_primitive_types;
+std::vector<Type::type> g_decimal_type_ids;
+static std::once_flag codegen_static_initialized;
+
+template <typename T>
+void Extend(const std::vector<T>& values, std::vector<T>* out) {
+ for (const auto& t : values) {
+ out->push_back(t);
+ }
+}
+
+static void InitStaticData() {
+ // Signed int types
+ g_signed_int_types = {int8(), int16(), int32(), int64()};
+
+ // Unsigned int types
+ g_unsigned_int_types = {uint8(), uint16(), uint32(), uint64()};
+
+ // All int types
+ Extend(g_unsigned_int_types, &g_int_types);
+ Extend(g_signed_int_types, &g_int_types);
+
+ // Floating point types
+ g_floating_types = {float32(), float64()};
+
+ // Decimal types
+ g_decimal_type_ids = {Type::DECIMAL128, Type::DECIMAL256};
+
+ // Numeric types
+ Extend(g_int_types, &g_numeric_types);
+ Extend(g_floating_types, &g_numeric_types);
+
+ // Temporal types
+ g_temporal_types = {date32(),
+ date64(),
+ time32(TimeUnit::SECOND),
+ time32(TimeUnit::MILLI),
+ time64(TimeUnit::MICRO),
+ time64(TimeUnit::NANO),
+ timestamp(TimeUnit::SECOND),
+ timestamp(TimeUnit::MILLI),
+ timestamp(TimeUnit::MICRO),
+ timestamp(TimeUnit::NANO)};
+
+ // Base binary types (without FixedSizeBinary)
+ g_base_binary_types = {binary(), utf8(), large_binary(), large_utf8()};
+
+ // Non-parametric, non-nested types. This also DOES NOT include
+ //
+ // * Decimal
+ // * Fixed Size Binary
+ // * Time32
+ // * Time64
+ // * Timestamp
+ g_primitive_types = {null(), boolean(), date32(), date64()};
+ Extend(g_numeric_types, &g_primitive_types);
+ Extend(g_base_binary_types, &g_primitive_types);
+}
+
+const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_base_binary_types;
+}
+
+const std::vector<std::shared_ptr<DataType>>& StringTypes() {
+ static DataTypeVector types = {utf8(), large_utf8()};
+ return types;
+}
+
+const std::vector<std::shared_ptr<DataType>>& SignedIntTypes() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_signed_int_types;
+}
+
+const std::vector<std::shared_ptr<DataType>>& UnsignedIntTypes() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_unsigned_int_types;
+}
+
+const std::vector<std::shared_ptr<DataType>>& IntTypes() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_int_types;
+}
+
+const std::vector<std::shared_ptr<DataType>>& FloatingPointTypes() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_floating_types;
+}
+
+const std::vector<Type::type>& DecimalTypeIds() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_decimal_type_ids;
+}
+
+const std::vector<TimeUnit::type>& AllTimeUnits() {
+ static std::vector<TimeUnit::type> units = {TimeUnit::SECOND, TimeUnit::MILLI,
+ TimeUnit::MICRO, TimeUnit::NANO};
+ return units;
+}
+
+const std::vector<std::shared_ptr<DataType>>& NumericTypes() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_numeric_types;
+}
+
+const std::vector<std::shared_ptr<DataType>>& TemporalTypes() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_temporal_types;
+}
+
+const std::vector<std::shared_ptr<DataType>>& PrimitiveTypes() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_primitive_types;
+}
+
+const std::vector<std::shared_ptr<DataType>>& ExampleParametricTypes() {
+ static DataTypeVector example_parametric_types = {
+ decimal128(12, 2),
+ duration(TimeUnit::SECOND),
+ timestamp(TimeUnit::SECOND),
+ time32(TimeUnit::SECOND),
+ time64(TimeUnit::MICRO),
+ fixed_size_binary(0),
+ list(null()),
+ large_list(null()),
+ fixed_size_list(field("dummy", null()), 0),
+ struct_({}),
+ sparse_union(FieldVector{}),
+ dense_union(FieldVector{}),
+ dictionary(int32(), null()),
+ map(null(), null())};
+ return example_parametric_types;
+}
+
+// The dummy parametric types above are constructed so that VisitTypeInline
+// can be made to work with them.
+
+Result<ValueDescr> FirstType(KernelContext*, const std::vector<ValueDescr>& descrs) {
+ ValueDescr result = descrs.front();
+ result.shape = GetBroadcastShape(descrs);
+ return result;
+}
+
+void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs) {
+ for (ValueDescr& descr : *descrs) {
+ if (descr.type->id() == Type::DICTIONARY) {
+ descr.type = checked_cast<const DictionaryType&>(*descr.type).value_type();
+ }
+ }
+}
+
+void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs) {
+ DCHECK_EQ(descrs->size(), 2);
+
+ if (descrs->at(0).type->id() == Type::NA) {
+ descrs->at(0).type = descrs->at(1).type;
+ return;
+ }
+
+ if (descrs->at(1).type->id() == Type::NA) {
+ descrs->at(1).type = descrs->at(0).type;
+ return;
+ }
+}
+
+void ReplaceTypes(const std::shared_ptr<DataType>& type,
+ std::vector<ValueDescr>* descrs) {
+ for (auto& descr : *descrs) {
+ descr.type = type;
+ }
+}
+
+std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs) {
+ return CommonNumeric(descrs.data(), descrs.size());
+}
+
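+// Promotion sketch (illustrative, written with bare types for brevity; the
+// real signature takes ValueDescrs):
+//
+//   CommonNumeric({int32(), float64()}) -> float64()  // any double wins
+//   CommonNumeric({int16(), uint8()})   -> int16()    // signed side already wide enough
+//   CommonNumeric({int8(), uint8()})    -> int16()    // widen to hold both
+//   CommonNumeric({int32(), uint32()})  -> int64()    // widen to hold both
+//   CommonNumeric({utf8(), int32()})    -> nullptr    // not all numeric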
+std::shared_ptr<DataType> CommonNumeric(const ValueDescr* begin, size_t count) {
+ DCHECK_GT(count, 0) << "tried to find CommonNumeric type of an empty set";
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ auto id = descr.type->id();
+ if (!is_floating(id) && !is_integer(id)) {
+ // a common numeric type is only possible if all types are numeric
+ return nullptr;
+ }
+ if (id == Type::HALF_FLOAT) {
+ // float16 arithmetic is not currently supported
+ return nullptr;
+ }
+ }
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ if (descr.type->id() == Type::DOUBLE) return float64();
+ }
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ if (descr.type->id() == Type::FLOAT) return float32();
+ }
+
+ int max_width_signed = 0, max_width_unsigned = 0;
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ auto id = descr.type->id();
+ auto max_width = &(is_signed_integer(id) ? max_width_signed : max_width_unsigned);
+ *max_width = std::max(bit_width(id), *max_width);
+ }
+
+ if (max_width_signed == 0) {
+ if (max_width_unsigned >= 64) return uint64();
+ if (max_width_unsigned == 32) return uint32();
+ if (max_width_unsigned == 16) return uint16();
+ DCHECK_EQ(max_width_unsigned, 8);
+ return uint8();
+ }
+
+ if (max_width_signed <= max_width_unsigned) {
+ max_width_signed = static_cast<int>(BitUtil::NextPower2(max_width_unsigned + 1));
+ }
+
+ if (max_width_signed >= 64) return int64();
+ if (max_width_signed == 32) return int32();
+ if (max_width_signed == 16) return int16();
+ DCHECK_EQ(max_width_signed, 8);
+ return int8();
+}
+
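+// Resolution sketch (illustrative; bare types shown for brevity):
+//
+//   CommonTimestamp({date32(), timestamp(SECOND)})       -> timestamp(SECOND)
+//   CommonTimestamp({timestamp(MILLI), timestamp(NANO)}) -> timestamp(NANO)
+//   CommonTimestamp({timestamp(SECOND), int32()})        -> nullptr  // not timestamp-like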
+std::shared_ptr<DataType> CommonTimestamp(const std::vector<ValueDescr>& descrs) {
+ TimeUnit::type finest_unit = TimeUnit::SECOND;
+
+ for (const auto& descr : descrs) {
+ auto id = descr.type->id();
+ // a common timestamp is only possible if all types are timestamp-like
+ switch (id) {
+ case Type::DATE32:
+ case Type::DATE64:
+ continue;
+ case Type::TIMESTAMP:
+ finest_unit =
+ std::max(finest_unit, checked_cast<const TimestampType&>(*descr.type).unit());
+ continue;
+ default:
+ return nullptr;
+ }
+ }
+
+ return timestamp(finest_unit);
+}
+
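+// Resolution sketch (illustrative; bare types shown for brevity):
+//
+//   CommonBinary({utf8(), utf8()})         -> utf8()
+//   CommonBinary({utf8(), binary()})       -> binary()       // not all utf8
+//   CommonBinary({utf8(), large_utf8()})   -> large_utf8()   // need 64-bit offsets
+//   CommonBinary({binary(), large_utf8()}) -> large_binary()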
+std::shared_ptr<DataType> CommonBinary(const std::vector<ValueDescr>& descrs) {
+ bool all_utf8 = true, all_offset32 = true;
+
+ for (const auto& descr : descrs) {
+ auto id = descr.type->id();
+ // a common varbinary type is only possible if all types are binary-like
+ switch (id) {
+ case Type::STRING:
+ continue;
+ case Type::BINARY:
+ all_utf8 = false;
+ continue;
+ case Type::LARGE_STRING:
+ all_offset32 = false;
+ continue;
+ case Type::LARGE_BINARY:
+ all_offset32 = false;
+ all_utf8 = false;
+ continue;
+ default:
+ return nullptr;
+ }
+ }
+
+ if (all_utf8) {
+ if (all_offset32) return utf8();
+ return large_utf8();
+ }
+
+ if (all_offset32) return binary();
+ return large_binary();
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h
new file mode 100644
index 00000000000..cb9b13bb3d7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -0,0 +1,1381 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_generate.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/string_view.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::BinaryBitBlockCounter;
+using internal::BitBlockCount;
+using internal::BitmapReader;
+using internal::checked_cast;
+using internal::FirstTimeBitmapWriter;
+using internal::GenerateBitsUnrolled;
+using internal::VisitBitBlocksVoid;
+using internal::VisitTwoBitBlocksVoid;
+
+namespace compute {
+namespace internal {
+
+/// KernelState adapter for the common case of kernels whose only
+/// state is an instance of a subclass of FunctionOptions.
+/// Default FunctionOptions are *not* handled here.
+template <typename OptionsType>
+struct OptionsWrapper : public KernelState {
+ explicit OptionsWrapper(OptionsType options) : options(std::move(options)) {}
+
+ static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ if (auto options = static_cast<const OptionsType*>(args.options)) {
+ return ::arrow::internal::make_unique<OptionsWrapper>(*options);
+ }
+
+ return Status::Invalid(
+ "Attempted to initialize KernelState from null FunctionOptions");
+ }
+
+ static const OptionsType& Get(const KernelState& state) {
+ return ::arrow::internal::checked_cast<const OptionsWrapper&>(state).options;
+ }
+
+ static const OptionsType& Get(KernelContext* ctx) { return Get(*ctx->state()); }
+
+ OptionsType options;
+};
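+
+// For example, a kernel whose state is just a hypothetical RoundOptions could
+// register OptionsWrapper<RoundOptions>::Init as its KernelInit and read the
+// options from inside its exec function. A sketch (RoundOptions is
+// illustrative, not a type defined here):
+//
+//   using State = OptionsWrapper<RoundOptions>;
+//   Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+//     const RoundOptions& options = State::Get(ctx);
+//     // ... use options ...
+//     return Status::OK();
+//   }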
+
+/// KernelState adapter for when the state is an instance constructed with the
+/// KernelContext and the FunctionOptions as argument
+template <typename StateType, typename OptionsType>
+struct KernelStateFromFunctionOptions : public KernelState {
+ explicit KernelStateFromFunctionOptions(KernelContext* ctx, OptionsType state)
+ : state(StateType(ctx, std::move(state))) {}
+
+ static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ if (auto options = static_cast<const OptionsType*>(args.options)) {
+ return ::arrow::internal::make_unique<KernelStateFromFunctionOptions>(ctx,
+ *options);
+ }
+
+ return Status::Invalid(
+ "Attempted to initialize KernelState from null FunctionOptions");
+ }
+
+ static const StateType& Get(const KernelState& state) {
+ return ::arrow::internal::checked_cast<const KernelStateFromFunctionOptions&>(state)
+ .state;
+ }
+
+ static const StateType& Get(KernelContext* ctx) { return Get(*ctx->state()); }
+
+ StateType state;
+};
+
+// ----------------------------------------------------------------------
+// Input and output value type definitions
+
+template <typename Type, typename Enable = void>
+struct GetViewType;
+
+template <typename Type>
+struct GetViewType<Type, enable_if_has_c_type<Type>> {
+ using T = typename Type::c_type;
+ using PhysicalType = T;
+
+ static T LogicalValue(PhysicalType value) { return value; }
+};
+
+template <typename Type>
+struct GetViewType<Type, enable_if_t<is_base_binary_type<Type>::value ||
+ is_fixed_size_binary_type<Type>::value>> {
+ using T = util::string_view;
+ using PhysicalType = T;
+
+ static T LogicalValue(PhysicalType value) { return value; }
+};
+
+template <>
+struct GetViewType<Decimal128Type> {
+ using T = Decimal128;
+ using PhysicalType = util::string_view;
+
+ static T LogicalValue(PhysicalType value) {
+ return Decimal128(reinterpret_cast<const uint8_t*>(value.data()));
+ }
+
+ static T LogicalValue(T value) { return value; }
+};
+
+template <>
+struct GetViewType<Decimal256Type> {
+ using T = Decimal256;
+ using PhysicalType = util::string_view;
+
+ static T LogicalValue(PhysicalType value) {
+ return Decimal256(reinterpret_cast<const uint8_t*>(value.data()));
+ }
+
+ static T LogicalValue(T value) { return value; }
+};
+
+template <typename Type, typename Enable = void>
+struct GetOutputType;
+
+template <typename Type>
+struct GetOutputType<Type, enable_if_has_c_type<Type>> {
+ using T = typename Type::c_type;
+};
+
+template <typename Type>
+struct GetOutputType<Type, enable_if_t<is_string_like_type<Type>::value>> {
+ using T = std::string;
+};
+
+template <>
+struct GetOutputType<Decimal128Type> {
+ using T = Decimal128;
+};
+
+template <>
+struct GetOutputType<Decimal256Type> {
+ using T = Decimal256;
+};
+
+// ----------------------------------------------------------------------
+// Iteration / value access utilities
+
+template <typename T, typename R = void>
+using enable_if_has_c_type_not_boolean =
+ enable_if_t<has_c_type<T>::value && !is_boolean_type<T>::value, R>;
+
+// Iterator over various input array types, yielding a GetViewType<Type>
+
+template <typename Type, typename Enable = void>
+struct ArrayIterator;
+
+template <typename Type>
+struct ArrayIterator<Type, enable_if_has_c_type_not_boolean<Type>> {
+ using T = typename Type::c_type;
+ const T* values;
+
+ explicit ArrayIterator(const ArrayData& data) : values(data.GetValues<T>(1)) {}
+ T operator()() { return *values++; }
+};
+
+template <typename Type>
+struct ArrayIterator<Type, enable_if_boolean<Type>> {
+ BitmapReader reader;
+
+ explicit ArrayIterator(const ArrayData& data)
+ : reader(data.buffers[1]->data(), data.offset, data.length) {}
+ bool operator()() {
+ bool out = reader.IsSet();
+ reader.Next();
+ return out;
+ }
+};
+
+template <typename Type>
+struct ArrayIterator<Type, enable_if_base_binary<Type>> {
+ using offset_type = typename Type::offset_type;
+ const ArrayData& arr;
+ const offset_type* offsets;
+ offset_type cur_offset;
+ const char* data;
+ int64_t position;
+
+ explicit ArrayIterator(const ArrayData& arr)
+ : arr(arr),
+ offsets(reinterpret_cast<const offset_type*>(arr.buffers[1]->data()) +
+ arr.offset),
+ cur_offset(offsets[0]),
+ data(reinterpret_cast<const char*>(arr.buffers[2]->data())),
+ position(0) {}
+
+ util::string_view operator()() {
+ offset_type next_offset = offsets[++position];
+ auto result = util::string_view(data + cur_offset, next_offset - cur_offset);
+ cur_offset = next_offset;
+ return result;
+ }
+};
+
+template <typename Type>
+struct ArrayIterator<Type, enable_if_decimal<Type>> {
+ using T = typename TypeTraits<Type>::ScalarType::ValueType;
+ using endian_agnostic = std::array<uint8_t, sizeof(T)>;
+ const endian_agnostic* values;
+
+ explicit ArrayIterator(const ArrayData& data)
+ : values(data.GetValues<endian_agnostic>(1)) {}
+
+ T operator()() { return T{values++->data()}; }
+};
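+
+// Typical usage is to pull one value per slot, nulls included. A sketch,
+// assuming `arr` is the ArrayData of an int32 array:
+//
+//   ArrayIterator<Int32Type> it(arr);
+//   for (int64_t i = 0; i < arr.length; ++i) {
+//     int32_t value = it();  // advances one slot per call
+//   }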
+
+// Iterator over various output array types, taking a GetOutputType<Type>
+
+template <typename Type, typename Enable = void>
+struct OutputArrayWriter;
+
+template <typename Type>
+struct OutputArrayWriter<Type, enable_if_has_c_type_not_boolean<Type>> {
+ using T = typename Type::c_type;
+ T* values;
+
+ explicit OutputArrayWriter(ArrayData* data) : values(data->GetMutableValues<T>(1)) {}
+
+ void Write(T value) { *values++ = value; }
+
+ // Note that this does not write the null bitmap; callers must keep the
+ // bitmap consistent with their Write / WriteNull calls
+ void WriteNull() { *values++ = T{}; }
+
+ void WriteAllNull(int64_t length) { std::memset(values, 0, sizeof(T) * length); }
+};
+
+template <typename Type>
+struct OutputArrayWriter<Type, enable_if_decimal<Type>> {
+ using T = typename TypeTraits<Type>::ScalarType::ValueType;
+ using endian_agnostic = std::array<uint8_t, sizeof(T)>;
+ endian_agnostic* values;
+
+ explicit OutputArrayWriter(ArrayData* data)
+ : values(data->GetMutableValues<endian_agnostic>(1)) {}
+
+ void Write(T value) { value.ToBytes(values++->data()); }
+
+ void WriteNull() { T{}.ToBytes(values++->data()); }
+
+ void WriteAllNull(int64_t length) { std::memset(values, 0, sizeof(T) * length); }
+};
+
+// (Un)box Scalar to / from C++ value
+
+template <typename Type, typename Enable = void>
+struct UnboxScalar;
+
+template <typename Type>
+struct UnboxScalar<Type, enable_if_has_c_type<Type>> {
+ using T = typename Type::c_type;
+ static T Unbox(const Scalar& val) {
+ return *reinterpret_cast<const T*>(
+ checked_cast<const ::arrow::internal::PrimitiveScalarBase&>(val).data());
+ }
+};
+
+template <typename Type>
+struct UnboxScalar<Type, enable_if_has_string_view<Type>> {
+ static util::string_view Unbox(const Scalar& val) {
+ if (!val.is_valid) return util::string_view();
+ return util::string_view(*checked_cast<const BaseBinaryScalar&>(val).value);
+ }
+};
+
+template <>
+struct UnboxScalar<Decimal128Type> {
+ static Decimal128 Unbox(const Scalar& val) {
+ return checked_cast<const Decimal128Scalar&>(val).value;
+ }
+};
+
+template <>
+struct UnboxScalar<Decimal256Type> {
+ static Decimal256 Unbox(const Scalar& val) {
+ return checked_cast<const Decimal256Scalar&>(val).value;
+ }
+};
+
+template <typename Type, typename Enable = void>
+struct BoxScalar;
+
+template <typename Type>
+struct BoxScalar<Type, enable_if_has_c_type<Type>> {
+ using T = typename GetOutputType<Type>::T;
+ static void Box(T val, Scalar* out) {
+ // Enables BoxScalar<Int64Type> to work on a (for example) Time64Scalar
+ T* mutable_data = reinterpret_cast<T*>(
+ checked_cast<::arrow::internal::PrimitiveScalarBase*>(out)->mutable_data());
+ *mutable_data = val;
+ }
+};
+
+template <typename Type>
+struct BoxScalar<Type, enable_if_base_binary<Type>> {
+ using T = typename GetOutputType<Type>::T;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ static void Box(T val, Scalar* out) {
+ checked_cast<ScalarType*>(out)->value = std::make_shared<Buffer>(val);
+ }
+};
+
+template <>
+struct BoxScalar<Decimal128Type> {
+ using T = Decimal128;
+ using ScalarType = Decimal128Scalar;
+ static void Box(T val, Scalar* out) { checked_cast<ScalarType*>(out)->value = val; }
+};
+
+template <>
+struct BoxScalar<Decimal256Type> {
+ using T = Decimal256;
+ using ScalarType = Decimal256Scalar;
+ static void Box(T val, Scalar* out) { checked_cast<ScalarType*>(out)->value = val; }
+};
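+
+// Round-trip sketch (Int32Scalar is the concrete Scalar subclass for
+// Int32Type):
+//
+//   Int32Scalar in(42);
+//   int32_t unboxed = UnboxScalar<Int32Type>::Unbox(in);  // 42
+//   Int32Scalar out(0);
+//   BoxScalar<Int32Type>::Box(unboxed + 1, &out);         // out now holds 43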
+
+// A VisitArrayDataInline variant that calls its visitor function with logical
+// values, such as Decimal128 rather than util::string_view.
+
+template <typename T, typename VisitFunc, typename NullFunc>
+static typename arrow::internal::call_traits::enable_if_return<VisitFunc, void>::type
+VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
+ NullFunc&& null_func) {
+ VisitArrayDataInline<T>(
+ arr,
+ [&](typename GetViewType<T>::PhysicalType v) {
+ valid_func(GetViewType<T>::LogicalValue(std::move(v)));
+ },
+ std::forward<NullFunc>(null_func));
+}
+
+template <typename T, typename VisitFunc, typename NullFunc>
+static typename arrow::internal::call_traits::enable_if_return<VisitFunc, Status>::type
+VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
+ NullFunc&& null_func) {
+ return VisitArrayDataInline<T>(
+ arr,
+ [&](typename GetViewType<T>::PhysicalType v) {
+ return valid_func(GetViewType<T>::LogicalValue(std::move(v)));
+ },
+ std::forward<NullFunc>(null_func));
+}
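+
+// For example, summing the non-null values of a Decimal128 array: the visitor
+// receives logical Decimal128 values rather than raw string_views. A sketch,
+// assuming `arr` is the ArrayData of a decimal128 array:
+//
+//   Decimal128 sum(0);
+//   VisitArrayValuesInline<Decimal128Type>(
+//       arr, [&](Decimal128 v) { sum += v; }, [] { /* skip nulls */ });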
+
+// Like VisitArrayValuesInline, but for binary functions.
+
+template <typename Arg0Type, typename Arg1Type, typename VisitFunc, typename NullFunc>
+static void VisitTwoArrayValuesInline(const ArrayData& arr0, const ArrayData& arr1,
+ VisitFunc&& valid_func, NullFunc&& null_func) {
+ ArrayIterator<Arg0Type> arr0_it(arr0);
+ ArrayIterator<Arg1Type> arr1_it(arr1);
+
+ auto visit_valid = [&](int64_t i) {
+ valid_func(GetViewType<Arg0Type>::LogicalValue(arr0_it()),
+ GetViewType<Arg1Type>::LogicalValue(arr1_it()));
+ };
+ auto visit_null = [&]() {
+ arr0_it();
+ arr1_it();
+ null_func();
+ };
+ VisitTwoBitBlocksVoid(arr0.buffers[0], arr0.offset, arr1.buffers[0], arr1.offset,
+ arr0.length, std::move(visit_valid), std::move(visit_null));
+}
+
+// ----------------------------------------------------------------------
+// Reusable type resolvers
+
+Result<ValueDescr> FirstType(KernelContext*, const std::vector<ValueDescr>& descrs);
+
+// ----------------------------------------------------------------------
+// Generate an array kernel given template classes
+
+Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+
+ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec);
+
+// ----------------------------------------------------------------------
+// Helpers for iterating over common DataType instances for adding kernels to
+// functions
+
+const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes();
+const std::vector<std::shared_ptr<DataType>>& StringTypes();
+const std::vector<std::shared_ptr<DataType>>& SignedIntTypes();
+const std::vector<std::shared_ptr<DataType>>& UnsignedIntTypes();
+const std::vector<std::shared_ptr<DataType>>& IntTypes();
+const std::vector<std::shared_ptr<DataType>>& FloatingPointTypes();
+const std::vector<Type::type>& DecimalTypeIds();
+
+ARROW_EXPORT
+const std::vector<TimeUnit::type>& AllTimeUnits();
+
+// Returns a vector of example instances of parametric types such as
+//
+// * Decimal
+// * Timestamp (requiring unit)
+// * Time32 (requiring unit)
+// * Time64 (requiring unit)
+// * Duration (requiring unit)
+// * List, LargeList, FixedSizeList
+// * Struct
+// * Union
+// * Dictionary
+// * Map
+//
+// Generally kernels will use the "FirstType" OutputType::Resolver above for
+// the OutputType of the kernel's signature and match::SameTypeId for the
+// corresponding InputType
+const std::vector<std::shared_ptr<DataType>>& ExampleParametricTypes();
+
+// Numeric types, excluding boolean
+const std::vector<std::shared_ptr<DataType>>& NumericTypes();
+
+// Temporal types including time and timestamps for each unit
+const std::vector<std::shared_ptr<DataType>>& TemporalTypes();
+
+// Integer, floating point, base binary, and temporal
+const std::vector<std::shared_ptr<DataType>>& PrimitiveTypes();
+
+// ----------------------------------------------------------------------
+// "Applicators" take an operator definition (which may be scalar-valued or
+// array-valued) and creates an ArrayKernelExec which can be used to add an
+// ArrayKernel to a Function.
+
+namespace applicator {
+
+// Generate an ArrayKernelExec given a functor that handles all of its own
+// iteration, etc.
+//
+// Operator must implement
+//
+// static Status Call(KernelContext*, const ArrayData& in, ArrayData* out)
+// static Status Call(KernelContext*, const Scalar& in, Scalar* out)
+template <typename Operator>
+static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::SCALAR) {
+ return Operator::Call(ctx, *batch[0].scalar(), out->scalar().get());
+ } else if (batch.length > 0) {
+ return Operator::Call(ctx, *batch[0].array(), out->mutable_array());
+ }
+ return Status::OK();
+}
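+
+// A hypothetical Operator for SimpleUnary might look like this (CopyOp is
+// illustrative, not an operator defined in this header):
+//
+//   struct CopyOp {
+//     static Status Call(KernelContext*, const ArrayData& in, ArrayData* out) {
+//       // ... array implementation ...
+//       return Status::OK();
+//     }
+//     static Status Call(KernelContext*, const Scalar& in, Scalar* out) {
+//       // ... scalar implementation ...
+//       return Status::OK();
+//     }
+//   };
+//   ArrayKernelExec exec = SimpleUnary<CopyOp>;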
+
+// Generate an ArrayKernelExec given a functor that handles all of its own
+// iteration, etc.
+//
+// Operator must implement
+//
+// static Status Call(KernelContext*, const ArrayData& arg0, const ArrayData& arg1,
+// ArrayData* out)
+// static Status Call(KernelContext*, const ArrayData& arg0, const Scalar& arg1,
+// ArrayData* out)
+// static Status Call(KernelContext*, const Scalar& arg0, const ArrayData& arg1,
+// ArrayData* out)
+// static Status Call(KernelContext*, const Scalar& arg0, const Scalar& arg1,
+// Scalar* out)
+template <typename Operator>
+static Status SimpleBinary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch.length == 0) return Status::OK();
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ if (batch[1].kind() == Datum::ARRAY) {
+ return Operator::Call(ctx, *batch[0].array(), *batch[1].array(),
+ out->mutable_array());
+ } else {
+ return Operator::Call(ctx, *batch[0].array(), *batch[1].scalar(),
+ out->mutable_array());
+ }
+ } else {
+ if (batch[1].kind() == Datum::ARRAY) {
+ return Operator::Call(ctx, *batch[0].scalar(), *batch[1].array(),
+ out->mutable_array());
+ } else {
+ return Operator::Call(ctx, *batch[0].scalar(), *batch[1].scalar(),
+ out->scalar().get());
+ }
+ }
+}
+
+// OutputAdapter allows passing an inlineable lambda that provides a sequence
+// of output values to write into output memory. Boolean and primitive outputs
+// are currently implemented, and the validity bitmap is presumed to be handled
+// at a higher level, so this writes into every output slot, null or not.
+template <typename Type, typename Enable = void>
+struct OutputAdapter;
+
+template <typename Type>
+struct OutputAdapter<Type, enable_if_boolean<Type>> {
+ template <typename Generator>
+ static Status Write(KernelContext*, Datum* out, Generator&& generator) {
+ ArrayData* out_arr = out->mutable_array();
+ auto out_bitmap = out_arr->buffers[1]->mutable_data();
+ GenerateBitsUnrolled(out_bitmap, out_arr->offset, out_arr->length,
+ std::forward<Generator>(generator));
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct OutputAdapter<Type, enable_if_has_c_type_not_boolean<Type>> {
+ template <typename Generator>
+ static Status Write(KernelContext*, Datum* out, Generator&& generator) {
+ ArrayData* out_arr = out->mutable_array();
+ auto out_data = out_arr->GetMutableValues<typename Type::c_type>(1);
+ // TODO: Is this as fast as a more explicitly inlined function?
+ for (int64_t i = 0; i < out_arr->length; ++i) {
+ *out_data++ = generator();
+ }
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct OutputAdapter<Type, enable_if_base_binary<Type>> {
+ template <typename Generator>
+ static Status Write(KernelContext* ctx, Datum* out, Generator&& generator) {
+ return Status::NotImplemented("NYI");
+ }
+};
+
+template <typename Type>
+struct OutputAdapter<Type, enable_if_decimal<Type>> {
+ using T = typename TypeTraits<Type>::ScalarType::ValueType;
+ using endian_agnostic = std::array<uint8_t, sizeof(T)>;
+
+ template <typename Generator>
+ static Status Write(KernelContext*, Datum* out, Generator&& generator) {
+ ArrayData* out_arr = out->mutable_array();
+ auto out_data = out_arr->GetMutableValues<endian_agnostic>(1);
+ for (int64_t i = 0; i < out_arr->length; ++i) {
+ generator().ToBytes(out_data++->data());
+ }
+ return Status::OK();
+ }
+};
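+
+// Usage sketch, assuming the execution layer has already allocated the output
+// buffers in `out`:
+//
+//   int32_t counter = 0;
+//   RETURN_NOT_OK(OutputAdapter<Int32Type>::Write(
+//       ctx, out, [&]() -> int32_t { return counter++; }));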
+
+// A kernel exec generator for unary functions that addresses both array and
+// scalar inputs and dispatches input iteration and output writing to other
+// templates
+//
+// This template executes the operator even on the data behind null values,
+// therefore it is generally only suitable for operators that are safe to apply
+// even on the null slot values.
+//
+// The "Op" functor should have the form
+//
+// struct Op {
+// template <typename OutValue, typename Arg0Value>
+// static OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) {
+// // implementation
+// // NOTE: "st" should only be populated with errors,
+// // leave it unmodified to indicate Status::OK()
+// }
+// };
+template <typename OutType, typename Arg0Type, typename Op>
+struct ScalarUnary {
+ using OutValue = typename GetOutputType<OutType>::T;
+ using Arg0Value = typename GetViewType<Arg0Type>::T;
+
+ static Status ExecArray(KernelContext* ctx, const ArrayData& arg0, Datum* out) {
+ Status st = Status::OK();
+ ArrayIterator<Arg0Type> arg0_it(arg0);
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value>(ctx, arg0_it(), &st);
+ }));
+ return st;
+ }
+
+ static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, Datum* out) {
+ Status st = Status::OK();
+ Scalar* out_scalar = out->scalar().get();
+ if (arg0.is_valid) {
+ Arg0Value arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
+ out_scalar->is_valid = true;
+ BoxScalar<OutType>::Box(Op::template Call<OutValue, Arg0Value>(ctx, arg0_val, &st),
+ out_scalar);
+ } else {
+ out_scalar->is_valid = false;
+ }
+ return st;
+ }
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::ARRAY) {
+ return ExecArray(ctx, *batch[0].array(), out);
+ } else {
+ return ExecScalar(ctx, *batch[0].scalar(), out);
+ }
+ }
+};
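+
+// For instance, an integer negation kernel could be expressed as follows
+// (Negate is illustrative, not a kernel defined in this header):
+//
+//   struct Negate {
+//     template <typename OutValue, typename Arg0Value>
+//     static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+//       return -val;
+//     }
+//   };
+//   ArrayKernelExec exec = ScalarUnary<Int32Type, Int32Type, Negate>::Exec;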
+
+// An alternative to ScalarUnary that applies a scalar operation with state on
+// only the non-null values of a single array.
+template <typename OutType, typename Arg0Type, typename Op>
+struct ScalarUnaryNotNullStateful {
+ using ThisType = ScalarUnaryNotNullStateful<OutType, Arg0Type, Op>;
+ using OutValue = typename GetOutputType<OutType>::T;
+ using Arg0Value = typename GetViewType<Arg0Type>::T;
+
+ Op op;
+ explicit ScalarUnaryNotNullStateful(Op op) : op(std::move(op)) {}
+
+ // NOTE: In ArrayExec<Type>, Type is really OutputType
+
+ template <typename Type, typename Enable = void>
+ struct ArrayExec {
+ static Status Exec(const ThisType& functor, KernelContext* ctx,
+ const ExecBatch& batch, Datum* out) {
+ ARROW_LOG(FATAL) << "Missing ArrayExec specialization for output type "
+ << out->type();
+ return Status::NotImplemented("NYI");
+ }
+ };
+
+ template <typename Type>
+ struct ArrayExec<
+ Type, enable_if_t<has_c_type<Type>::value && !is_boolean_type<Type>::value>> {
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
+ Status st = Status::OK();
+ ArrayData* out_arr = out->mutable_array();
+ auto out_data = out_arr->GetMutableValues<OutValue>(1);
+ VisitArrayValuesInline<Arg0Type>(
+ arg0,
+ [&](Arg0Value v) {
+ *out_data++ = functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st);
+ },
+ [&]() {
+ // null
+ *out_data++ = OutValue{};
+ });
+ return st;
+ }
+ };
+
+ template <typename Type>
+ struct ArrayExec<Type, enable_if_base_binary<Type>> {
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
+ // NOTE: This code is not currently used by any kernels and has
+ // suboptimal performance because it's recomputing the validity bitmap
+ // that is already computed by the kernel execution layer. Consider
+ // writing a lower-level "output adapter" for base binary types.
+ typename TypeTraits<Type>::BuilderType builder;
+ Status st = Status::OK();
+ RETURN_NOT_OK(VisitArrayValuesInline<Arg0Type>(
+ arg0, [&](Arg0Value v) { return builder.Append(functor.op.Call(ctx, v, &st)); },
+ [&]() { return builder.AppendNull(); }));
+ if (st.ok()) {
+ std::shared_ptr<ArrayData> result;
+ RETURN_NOT_OK(builder.FinishInternal(&result));
+ out->value = std::move(result);
+ }
+ return st;
+ }
+ };
+
+ template <typename Type>
+ struct ArrayExec<Type, enable_if_t<is_boolean_type<Type>::value>> {
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
+ Status st = Status::OK();
+ ArrayData* out_arr = out->mutable_array();
+ FirstTimeBitmapWriter out_writer(out_arr->buffers[1]->mutable_data(),
+ out_arr->offset, out_arr->length);
+ VisitArrayValuesInline<Arg0Type>(
+ arg0,
+ [&](Arg0Value v) {
+ if (functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st)) {
+ out_writer.Set();
+ }
+ out_writer.Next();
+ },
+ [&]() {
+ // null
+ out_writer.Clear();
+ out_writer.Next();
+ });
+ out_writer.Finish();
+ return st;
+ }
+ };
+
+ template <typename Type>
+ struct ArrayExec<Type, enable_if_decimal<Type>> {
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
+ Status st = Status::OK();
+ ArrayData* out_arr = out->mutable_array();
+ // Decimal128 data buffers are not safely reinterpret_cast-able on big-endian
+ using endian_agnostic =
+ std::array<uint8_t, sizeof(typename TypeTraits<Type>::ScalarType::ValueType)>;
+ auto out_data = out_arr->GetMutableValues<endian_agnostic>(1);
+ VisitArrayValuesInline<Arg0Type>(
+ arg0,
+ [&](Arg0Value v) {
+ functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st)
+ .ToBytes(out_data++->data());
+ },
+ [&]() {
+ // null
+ std::memset(out_data, 0, sizeof(*out_data));
+ ++out_data;
+ });
+ return st;
+ }
+ };
+
+ Status Scalar(KernelContext* ctx, const Scalar& arg0, Datum* out) {
+ Status st = Status::OK();
+ if (arg0.is_valid) {
+ Arg0Value arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
+ BoxScalar<OutType>::Box(
+ this->op.template Call<OutValue, Arg0Value>(ctx, arg0_val, &st),
+ out->scalar().get());
+ }
+ return st;
+ }
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::ARRAY) {
+ return ArrayExec<OutType>::Exec(*this, ctx, *batch[0].array(), out);
+ } else {
+ return Scalar(ctx, *batch[0].scalar(), out);
+ }
+ }
+};
+
+// An alternative to ScalarUnary that applies a scalar operation on only the
+// non-null values of a single array. The operator is not stateful; if the
+// operator requires some initialization use ScalarUnaryNotNullStateful.
+template <typename OutType, typename Arg0Type, typename Op>
+struct ScalarUnaryNotNull {
+ using OutValue = typename GetOutputType<OutType>::T;
+ using Arg0Value = typename GetViewType<Arg0Type>::T;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // Seed kernel with dummy state
+ ScalarUnaryNotNullStateful<OutType, Arg0Type, Op> kernel({});
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+// A kernel exec generator for binary functions that addresses both array and
+// scalar inputs and dispatches input iteration and output writing to other
+// templates
+//
+// This template executes the operator even on the data behind null values,
+// therefore it is generally only suitable for operators that are safe to apply
+// even on the null slot values.
+//
+// The "Op" functor should have the form
+//
+// struct Op {
+// template <typename OutValue, typename Arg0Value, typename Arg1Value>
+// static OutValue Call(KernelContext* ctx, Arg0Value arg0, Arg1Value arg1, Status* st)
+// {
+// // implementation
+// // NOTE: "st" should only be populated with errors,
+// // leave it unmodified to indicate Status::OK()
+// }
+// };
+template <typename OutType, typename Arg0Type, typename Arg1Type, typename Op>
+struct ScalarBinary {
+ using OutValue = typename GetOutputType<OutType>::T;
+ using Arg0Value = typename GetViewType<Arg0Type>::T;
+ using Arg1Value = typename GetViewType<Arg1Type>::T;
+
+ static Status ArrayArray(KernelContext* ctx, const ArrayData& arg0,
+ const ArrayData& arg1, Datum* out) {
+ Status st = Status::OK();
+ ArrayIterator<Arg0Type> arg0_it(arg0);
+ ArrayIterator<Arg1Type> arg1_it(arg1);
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_it(), arg1_it(),
+ &st);
+ }));
+ return st;
+ }
+
+ static Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
+ ArrayIterator<Arg0Type> arg0_it(arg0);
+ auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_it(), arg1_val,
+ &st);
+ }));
+ return st;
+ }
+
+ static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1,
+ Datum* out) {
+ Status st = Status::OK();
+ auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
+ ArrayIterator<Arg1Type> arg1_it(arg1);
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_it(),
+ &st);
+ }));
+ return st;
+ }
+
+ static Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
+ if (out->scalar()->is_valid) {
+ auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
+ auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
+ BoxScalar<OutType>::Box(
+ Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_val, &st),
+ out->scalar().get());
+ }
+ return st;
+ }
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::ARRAY) {
+ if (batch[1].kind() == Datum::ARRAY) {
+ return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out);
+ } else {
+ return ArrayScalar(ctx, *batch[0].array(), *batch[1].scalar(), out);
+ }
+ } else {
+ if (batch[1].kind() == Datum::ARRAY) {
+ return ScalarArray(ctx, *batch[0].scalar(), *batch[1].array(), out);
+ } else {
+ return ScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out);
+ }
+ }
+ }
+};
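+
+// For instance, a wrapping addition kernel could be expressed as follows
+// (AddWrapping is illustrative, not a kernel defined in this header):
+//
+//   struct AddWrapping {
+//     template <typename OutValue, typename Arg0Value, typename Arg1Value>
+//     static OutValue Call(KernelContext*, Arg0Value lhs, Arg1Value rhs, Status*) {
+//       return lhs + rhs;
+//     }
+//   };
+//   ArrayKernelExec exec =
+//       ScalarBinary<Int64Type, Int64Type, Int64Type, AddWrapping>::Exec;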
+
+// An alternative to ScalarBinary that applies a scalar operation with state on
+// only the value pairs that are non-null in both arrays.
+template <typename OutType, typename Arg0Type, typename Arg1Type, typename Op>
+struct ScalarBinaryNotNullStateful {
+ using ThisType = ScalarBinaryNotNullStateful<OutType, Arg0Type, Arg1Type, Op>;
+ using OutValue = typename GetOutputType<OutType>::T;
+ using Arg0Value = typename GetViewType<Arg0Type>::T;
+ using Arg1Value = typename GetViewType<Arg1Type>::T;
+
+ Op op;
+ explicit ScalarBinaryNotNullStateful(Op op) : op(std::move(op)) {}
+
+
+ Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, const ArrayData& arg1,
+ Datum* out) {
+ Status st = Status::OK();
+ OutputArrayWriter<OutType> writer(out->mutable_array());
+ VisitTwoArrayValuesInline<Arg0Type, Arg1Type>(
+ arg0, arg1,
+ [&](Arg0Value u, Arg1Value v) {
+ writer.Write(op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, u, v, &st));
+ },
+ [&]() { writer.WriteNull(); });
+ return st;
+ }
+
+ Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
+ OutputArrayWriter<OutType> writer(out->mutable_array());
+ if (arg1.is_valid) {
+ const auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
+ VisitArrayValuesInline<Arg0Type>(
+ arg0,
+ [&](Arg0Value u) {
+ writer.Write(
+ op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, u, arg1_val, &st));
+ },
+ [&]() { writer.WriteNull(); });
+ } else {
+ writer.WriteAllNull(out->mutable_array()->length);
+ }
+ return st;
+ }
+
+ Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1,
+ Datum* out) {
+ Status st = Status::OK();
+ OutputArrayWriter<OutType> writer(out->mutable_array());
+ if (arg0.is_valid) {
+ const auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
+ VisitArrayValuesInline<Arg1Type>(
+ arg1,
+ [&](Arg1Value v) {
+ writer.Write(
+ op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, v, &st));
+ },
+ [&]() { writer.WriteNull(); });
+ } else {
+ writer.WriteAllNull(out->mutable_array()->length);
+ }
+ return st;
+ }
+
+ Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
+ if (arg0.is_valid && arg1.is_valid) {
+ const auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
+ const auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
+ BoxScalar<OutType>::Box(
+ op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_val, &st),
+ out->scalar().get());
+ }
+ return st;
+ }
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::ARRAY) {
+ if (batch[1].kind() == Datum::ARRAY) {
+ return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out);
+ } else {
+ return ArrayScalar(ctx, *batch[0].array(), *batch[1].scalar(), out);
+ }
+ } else {
+ if (batch[1].kind() == Datum::ARRAY) {
+ return ScalarArray(ctx, *batch[0].scalar(), *batch[1].array(), out);
+ } else {
+ return ScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out);
+ }
+ }
+ }
+};
+
+// An alternative to ScalarBinary that applies a scalar operation on only
+// the value pairs that are non-null in both arrays.
+// The operator is not stateful; if the operator requires some initialization
+// use ScalarBinaryNotNullStateful.
+template <typename OutType, typename Arg0Type, typename Arg1Type, typename Op>
+struct ScalarBinaryNotNull {
+ using OutValue = typename GetOutputType<OutType>::T;
+ using Arg0Value = typename GetViewType<Arg0Type>::T;
+ using Arg1Value = typename GetViewType<Arg1Type>::T;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // Seed kernel with dummy state
+ ScalarBinaryNotNullStateful<OutType, Arg0Type, Arg1Type, Op> kernel({});
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+// A kernel exec generator for binary kernels where both input types are the
+// same
+template <typename OutType, typename ArgType, typename Op>
+using ScalarBinaryEqualTypes = ScalarBinary<OutType, ArgType, ArgType, Op>;
+
+// A kernel exec generator for non-null binary kernels where both input types are the
+// same
+template <typename OutType, typename ArgType, typename Op>
+using ScalarBinaryNotNullEqualTypes = ScalarBinaryNotNull<OutType, ArgType, ArgType, Op>;
+
+} // namespace applicator
+
+// ----------------------------------------------------------------------
+// BEGIN of kernel generator-dispatchers ("GD")
+//
+// These GD functions instantiate kernel functor templates and select one of
+// the instantiated kernels dynamically based on the data type or Type::type id
+// that is passed. This enables functions to be populated with kernels by
+// looping over vectors of data types rather than using macros or other
+// approaches.
+//
+// The kernel functor must be of the form:
+//
+// template <typename Type0, typename Type1, Args...>
+// struct FUNCTOR {
+// static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+// // IMPLEMENTATION
+// }
+// };
+//
+// When you pass FUNCTOR to a GD function, you must pass at least one static
+// type along with the functor -- this is often the fixed return type of the
+// functor. This Type0 argument is passed as the first template argument to the
+// functor during instantiation. The second template argument is the DataType
+// subclass corresponding to the runtime type (the function argument, not a
+// template argument) that was passed to the GD function.
+//
+// For example, GenerateNumeric<FUNCTOR, Type0>(int32()) will select a kernel
+// instantiated like FUNCTOR<Type0, Int32Type>. Any additional variadic
+// template arguments will be passed as additional template arguments to the
+// kernel template.
+
+namespace detail {
+
+// Convenience so we can pass DataType or Type::type to the GDs
+struct GetTypeId {
+ Type::type id;
+ GetTypeId(const std::shared_ptr<DataType>& type) // NOLINT implicit construction
+ : id(type->id()) {}
+ GetTypeId(const DataType& type) // NOLINT implicit construction
+ : id(type.id()) {}
+ GetTypeId(Type::type id) // NOLINT implicit construction
+ : id(id) {}
+};
+
+} // namespace detail
+
+// GD for numeric types (integer and floating point)
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GenerateNumeric(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return Generator<Type0, Int8Type, Args...>::Exec;
+ case Type::UINT8:
+ return Generator<Type0, UInt8Type, Args...>::Exec;
+ case Type::INT16:
+ return Generator<Type0, Int16Type, Args...>::Exec;
+ case Type::UINT16:
+ return Generator<Type0, UInt16Type, Args...>::Exec;
+ case Type::INT32:
+ return Generator<Type0, Int32Type, Args...>::Exec;
+ case Type::UINT32:
+ return Generator<Type0, UInt32Type, Args...>::Exec;
+ case Type::INT64:
+ return Generator<Type0, Int64Type, Args...>::Exec;
+ case Type::UINT64:
+ return Generator<Type0, UInt64Type, Args...>::Exec;
+ case Type::FLOAT:
+ return Generator<Type0, FloatType, Args...>::Exec;
+ case Type::DOUBLE:
+ return Generator<Type0, DoubleType, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
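+
+// Usage sketch: populate a function with one kernel per numeric type by
+// looping over NumericTypes() (MyFunctor is illustrative; wrapping `exec` in a
+// ScalarKernel and registering it is elided):
+//
+//   template <typename Type0, typename ArgType>
+//   struct MyFunctor {
+//     static Status Exec(KernelContext*, const ExecBatch&, Datum*);
+//   };
+//
+//   for (const auto& ty : NumericTypes()) {
+//     ArrayKernelExec exec = GenerateNumeric<MyFunctor, Int64Type>(ty);
+//     // ... add a kernel accepting `ty` to the function using `exec` ...
+//   }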
+
+// Generate a kernel given a templated functor for floating point types
+//
+// See "Numeric" above for description of the generator functor
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GenerateFloatingPoint(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::FLOAT:
+ return Generator<Type0, FloatType, Args...>::Exec;
+ case Type::DOUBLE:
+ return Generator<Type0, DoubleType, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// Generate a kernel given a templated functor for integer types
+//
+// See "Numeric" above for description of the generator functor
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GenerateInteger(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return Generator<Type0, Int8Type, Args...>::Exec;
+ case Type::INT16:
+ return Generator<Type0, Int16Type, Args...>::Exec;
+ case Type::INT32:
+ return Generator<Type0, Int32Type, Args...>::Exec;
+ case Type::INT64:
+ return Generator<Type0, Int64Type, Args...>::Exec;
+ case Type::UINT8:
+ return Generator<Type0, UInt8Type, Args...>::Exec;
+ case Type::UINT16:
+ return Generator<Type0, UInt16Type, Args...>::Exec;
+ case Type::UINT32:
+ return Generator<Type0, UInt32Type, Args...>::Exec;
+ case Type::UINT64:
+ return Generator<Type0, UInt64Type, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GeneratePhysicalInteger(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return Generator<Type0, Int8Type, Args...>::Exec;
+ case Type::INT16:
+ return Generator<Type0, Int16Type, Args...>::Exec;
+ case Type::INT32:
+ case Type::DATE32:
+ case Type::TIME32:
+ return Generator<Type0, Int32Type, Args...>::Exec;
+ case Type::INT64:
+ case Type::DATE64:
+ case Type::TIMESTAMP:
+ case Type::TIME64:
+ case Type::DURATION:
+ return Generator<Type0, Int64Type, Args...>::Exec;
+ case Type::UINT8:
+ return Generator<Type0, UInt8Type, Args...>::Exec;
+ case Type::UINT16:
+ return Generator<Type0, UInt16Type, Args...>::Exec;
+ case Type::UINT32:
+ return Generator<Type0, UInt32Type, Args...>::Exec;
+ case Type::UINT64:
+ return Generator<Type0, UInt64Type, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+template <template <typename... Args> class Generator, typename... Args>
+ArrayKernelExec GeneratePhysicalNumeric(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return Generator<Int8Type, Args...>::Exec;
+ case Type::INT16:
+ return Generator<Int16Type, Args...>::Exec;
+ case Type::INT32:
+ case Type::DATE32:
+ case Type::TIME32:
+ return Generator<Int32Type, Args...>::Exec;
+ case Type::INT64:
+ case Type::DATE64:
+ case Type::TIMESTAMP:
+ case Type::TIME64:
+ case Type::DURATION:
+ return Generator<Int64Type, Args...>::Exec;
+ case Type::UINT8:
+ return Generator<UInt8Type, Args...>::Exec;
+ case Type::UINT16:
+ return Generator<UInt16Type, Args...>::Exec;
+ case Type::UINT32:
+ return Generator<UInt32Type, Args...>::Exec;
+ case Type::UINT64:
+ return Generator<UInt64Type, Args...>::Exec;
+ case Type::FLOAT:
+ return Generator<FloatType, Args...>::Exec;
+ case Type::DOUBLE:
+ return Generator<DoubleType, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// Generate a kernel given a templated functor for integer types
+//
+// See "Numeric" above for description of the generator functor
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GenerateSignedInteger(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return Generator<Type0, Int8Type, Args...>::Exec;
+ case Type::INT16:
+ return Generator<Type0, Int16Type, Args...>::Exec;
+ case Type::INT32:
+ return Generator<Type0, Int32Type, Args...>::Exec;
+ case Type::INT64:
+ return Generator<Type0, Int64Type, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// Generate a kernel given a templated functor. Only a single template is
+// instantiated for each bit width, and the functor is expected to treat types
+// of the same bit width identically, without utilizing any type-specific
+// behavior (e.g. int64 should be handled equivalently to uint64 or double --
+// all 64 bits).
+//
+// See "Numeric" above for description of the generator functor
+template <template <typename...> class Generator, typename... Args>
+ArrayKernelExec GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::NA:
+ return Generator<NullType, Args...>::Exec;
+ case Type::BOOL:
+ return Generator<BooleanType, Args...>::Exec;
+ case Type::UINT8:
+ case Type::INT8:
+ return Generator<UInt8Type, Args...>::Exec;
+ case Type::UINT16:
+ case Type::INT16:
+ return Generator<UInt16Type, Args...>::Exec;
+ case Type::UINT32:
+ case Type::INT32:
+ case Type::FLOAT:
+ case Type::DATE32:
+ case Type::TIME32:
+ case Type::INTERVAL_MONTHS:
+ return Generator<UInt32Type, Args...>::Exec;
+ case Type::UINT64:
+ case Type::INT64:
+ case Type::DOUBLE:
+ case Type::DATE64:
+ case Type::TIMESTAMP:
+ case Type::TIME64:
+ case Type::DURATION:
+ case Type::INTERVAL_DAY_TIME:
+ return Generator<UInt64Type, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// Similar to GenerateTypeAgnosticPrimitive, but for variable-width binary types
+template <template <typename...> class Generator, typename... Args>
+ArrayKernelExec GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::BINARY:
+ case Type::STRING:
+ return Generator<BinaryType, Args...>::Exec;
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ return Generator<LargeBinaryType, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// Generate a kernel given a templated functor for base binary types. Generates
+// a single kernel for binary/string and large binary/large string. If your
+// kernel implementation needs access to the specific type at compile time,
+// use GenerateVarBinary below instead.
+//
+// See "Numeric" above for description of the generator functor
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GenerateVarBinaryBase(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::BINARY:
+ case Type::STRING:
+ return Generator<Type0, BinaryType, Args...>::Exec;
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ return Generator<Type0, LargeBinaryType, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// See the GenerateVarBinaryBase documentation above.
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GenerateVarBinary(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::BINARY:
+ return Generator<Type0, BinaryType, Args...>::Exec;
+ case Type::STRING:
+ return Generator<Type0, StringType, Args...>::Exec;
+ case Type::LARGE_BINARY:
+ return Generator<Type0, LargeBinaryType, Args...>::Exec;
+ case Type::LARGE_STRING:
+ return Generator<Type0, LargeStringType, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// Generate a kernel given a templated functor for temporal types
+//
+// See "Numeric" above for description of the generator functor
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GenerateTemporal(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::DATE32:
+ return Generator<Type0, Date32Type, Args...>::Exec;
+ case Type::DATE64:
+ return Generator<Type0, Date64Type, Args...>::Exec;
+ case Type::DURATION:
+ return Generator<Type0, DurationType, Args...>::Exec;
+ case Type::TIME32:
+ return Generator<Type0, Time32Type, Args...>::Exec;
+ case Type::TIME64:
+ return Generator<Type0, Time64Type, Args...>::Exec;
+ case Type::TIMESTAMP:
+ return Generator<Type0, TimestampType, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// Generate a kernel given a templated functor for decimal types
+//
+// See "Numeric" above for description of the generator functor
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GenerateDecimal(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::DECIMAL128:
+ return Generator<Type0, Decimal128Type, Args...>::Exec;
+ case Type::DECIMAL256:
+ return Generator<Type0, Decimal256Type, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// END of kernel generator-dispatchers
+// ----------------------------------------------------------------------
+
+ARROW_EXPORT
+void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs);
+
+ARROW_EXPORT
+void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs);
+
+ARROW_EXPORT
+void ReplaceTypes(const std::shared_ptr<DataType>&, std::vector<ValueDescr>* descrs);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonNumeric(const ValueDescr* begin, size_t count);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonTimestamp(const std::vector<ValueDescr>& descrs);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonBinary(const std::vector<ValueDescr>& descrs);
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/common.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/common.h
new file mode 100644
index 00000000000..21244320f38
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/common.h
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+// IWYU pragma: begin_exports
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/chunked_array.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/datum.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
+
+// IWYU pragma: end_exports
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc
new file mode 100644
index 00000000000..ed40a6b1b8c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -0,0 +1,1379 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/buffer_builder.h"
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec/key_compare.h"
+#include "arrow/compute/exec/key_encode.h"
+#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/exec/key_map.h"
+#include "arrow/compute/exec/util.h"
+#include "arrow/compute/exec_internal.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::FirstTimeBitmapWriter;
+
+namespace compute {
+namespace internal {
+namespace {
+
+struct KeyEncoder {
+ // the first byte of an encoded key is used to indicate nullity
+ static constexpr bool kExtraByteForNull = true;
+
+ static constexpr uint8_t kNullByte = 1;
+ static constexpr uint8_t kValidByte = 0;
+
+ virtual ~KeyEncoder() = default;
+
+ virtual void AddLength(const ArrayData&, int32_t* lengths) = 0;
+
+ virtual Status Encode(const ArrayData&, uint8_t** encoded_bytes) = 0;
+
+ virtual Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes,
+ int32_t length, MemoryPool*) = 0;
+
+ // extract the null bitmap from the leading nullity bytes of encoded keys
+ static Status DecodeNulls(MemoryPool* pool, int32_t length, uint8_t** encoded_bytes,
+ std::shared_ptr<Buffer>* null_bitmap, int32_t* null_count) {
+ // first count nulls to determine if a null bitmap is necessary
+ *null_count = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ *null_count += (encoded_bytes[i][0] == kNullByte);
+ }
+
+ if (*null_count > 0) {
+ ARROW_ASSIGN_OR_RAISE(*null_bitmap, AllocateBitmap(length, pool));
+ uint8_t* validity = (*null_bitmap)->mutable_data();
+
+ FirstTimeBitmapWriter writer(validity, 0, length);
+ for (int32_t i = 0; i < length; ++i) {
+ if (encoded_bytes[i][0] == kValidByte) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
+ writer.Next();
+ encoded_bytes[i] += 1;
+ }
+ writer.Finish();
+ } else {
+ for (int32_t i = 0; i < length; ++i) {
+ encoded_bytes[i] += 1;
+ }
+ }
+ return Status::OK();
+ }
+};
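+
+// Wire-format sketch: each row's key is the concatenation, per key column, of
+// a nullity byte followed by that column's fixed- or variable-width payload.
+// For a single int32 key column, a valid value 5 would encode as
+// [kValidByte, <4 bytes of 5>] and a null as [kNullByte, <4 zero bytes>]
+// (byte order shown is the native in-memory order; this is an illustration,
+// not a stable format).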
+
+struct BooleanKeyEncoder : KeyEncoder {
+ static constexpr int kByteWidth = 1;
+
+ void AddLength(const ArrayData& data, int32_t* lengths) override {
+ for (int64_t i = 0; i < data.length; ++i) {
+ lengths[i] += kByteWidth + kExtraByteForNull;
+ }
+ }
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ VisitArrayDataInline<BooleanType>(
+ data,
+ [&](bool value) {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kValidByte;
+ *encoded_ptr++ = value;
+ },
+ [&] {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kNullByte;
+ *encoded_ptr++ = 0;
+ });
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ std::shared_ptr<Buffer> null_buf;
+ int32_t null_count;
+ RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
+
+ ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBitmap(length, pool));
+
+ uint8_t* raw_output = key_buf->mutable_data();
+ for (int32_t i = 0; i < length; ++i) {
+ auto& encoded_ptr = encoded_bytes[i];
+ BitUtil::SetBitTo(raw_output, i, encoded_ptr[0] != 0);
+ encoded_ptr += 1;
+ }
+
+ return ArrayData::Make(boolean(), length, {std::move(null_buf), std::move(key_buf)},
+ null_count);
+ }
+};
+
+struct FixedWidthKeyEncoder : KeyEncoder {
+ explicit FixedWidthKeyEncoder(std::shared_ptr<DataType> type)
+ : type_(std::move(type)),
+ byte_width_(checked_cast<const FixedWidthType&>(*type_).bit_width() / 8) {}
+
+ void AddLength(const ArrayData& data, int32_t* lengths) override {
+ for (int64_t i = 0; i < data.length; ++i) {
+ lengths[i] += byte_width_ + kExtraByteForNull;
+ }
+ }
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ ArrayData viewed(fixed_size_binary(byte_width_), data.length, data.buffers,
+ data.null_count, data.offset);
+
+ VisitArrayDataInline<FixedSizeBinaryType>(
+ viewed,
+ [&](util::string_view bytes) {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kValidByte;
+ memcpy(encoded_ptr, bytes.data(), byte_width_);
+ encoded_ptr += byte_width_;
+ },
+ [&] {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kNullByte;
+ memset(encoded_ptr, 0, byte_width_);
+ encoded_ptr += byte_width_;
+ });
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ std::shared_ptr<Buffer> null_buf;
+ int32_t null_count;
+ RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
+
+ ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length * byte_width_, pool));
+
+ uint8_t* raw_output = key_buf->mutable_data();
+ for (int32_t i = 0; i < length; ++i) {
+ auto& encoded_ptr = encoded_bytes[i];
+ std::memcpy(raw_output, encoded_ptr, byte_width_);
+ encoded_ptr += byte_width_;
+ raw_output += byte_width_;
+ }
+
+ return ArrayData::Make(type_, length, {std::move(null_buf), std::move(key_buf)},
+ null_count);
+ }
+
+ std::shared_ptr<DataType> type_;
+ int byte_width_;
+};
+
+struct DictionaryKeyEncoder : FixedWidthKeyEncoder {
+ DictionaryKeyEncoder(std::shared_ptr<DataType> type, MemoryPool* pool)
+ : FixedWidthKeyEncoder(std::move(type)), pool_(pool) {}
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ auto dict = MakeArray(data.dictionary);
+ if (dictionary_) {
+ if (!dictionary_->Equals(dict)) {
+ // TODO(bkietz) unify if necessary. For now, just error if any batch's dictionary
+ // differs from the first we saw for this key
+ return Status::NotImplemented("Unifying differing dictionaries");
+ }
+ } else {
+ dictionary_ = std::move(dict);
+ }
+ return FixedWidthKeyEncoder::Encode(data, encoded_bytes);
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ FixedWidthKeyEncoder::Decode(encoded_bytes, length, pool));
+
+ if (dictionary_) {
+ data->dictionary = dictionary_->data();
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(type_, 0));
+ data->dictionary = dict->data();
+ }
+
+ data->type = type_;
+ return data;
+ }
+
+ MemoryPool* pool_;
+ std::shared_ptr<Array> dictionary_;
+};
+
+template <typename T>
+struct VarLengthKeyEncoder : KeyEncoder {
+ using Offset = typename T::offset_type;
+
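+  // Encoded layout (illustrative): a valid value becomes
+  // {kValidByte, <Offset byte length>, <raw bytes>} and a null becomes
+  // {kNullByte, <Offset zero>}; e.g. with 32-bit offsets (little-endian) the
+  // utf8 string "hi" encodes to {0x00, 0x02, 0x00, 0x00, 0x00, 'h', 'i'}.
+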
+ void AddLength(const ArrayData& data, int32_t* lengths) override {
+ int64_t i = 0;
+ VisitArrayDataInline<T>(
+ data,
+ [&](util::string_view bytes) {
+ lengths[i++] +=
+ kExtraByteForNull + sizeof(Offset) + static_cast<int32_t>(bytes.size());
+ },
+ [&] { lengths[i++] += kExtraByteForNull + sizeof(Offset); });
+ }
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ VisitArrayDataInline<T>(
+ data,
+ [&](util::string_view bytes) {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kValidByte;
+ util::SafeStore(encoded_ptr, static_cast<Offset>(bytes.size()));
+ encoded_ptr += sizeof(Offset);
+ memcpy(encoded_ptr, bytes.data(), bytes.size());
+ encoded_ptr += bytes.size();
+ },
+ [&] {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kNullByte;
+ util::SafeStore(encoded_ptr, static_cast<Offset>(0));
+ encoded_ptr += sizeof(Offset);
+ });
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ std::shared_ptr<Buffer> null_buf;
+ int32_t null_count;
+ RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
+
+ Offset length_sum = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ length_sum += util::SafeLoadAs<Offset>(encoded_bytes[i]);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto offset_buf,
+ AllocateBuffer(sizeof(Offset) * (1 + length), pool));
+    ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length_sum, pool));
+
+ auto raw_offsets = reinterpret_cast<Offset*>(offset_buf->mutable_data());
+ auto raw_keys = key_buf->mutable_data();
+
+ Offset current_offset = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ raw_offsets[i] = current_offset;
+
+ auto key_length = util::SafeLoadAs<Offset>(encoded_bytes[i]);
+ encoded_bytes[i] += sizeof(Offset);
+
+ memcpy(raw_keys + current_offset, encoded_bytes[i], key_length);
+ encoded_bytes[i] += key_length;
+
+ current_offset += key_length;
+ }
+ raw_offsets[length] = current_offset;
+
+ return ArrayData::Make(
+ type_, length, {std::move(null_buf), std::move(offset_buf), std::move(key_buf)},
+ null_count);
+ }
+
+ explicit VarLengthKeyEncoder(std::shared_ptr<DataType> type) : type_(std::move(type)) {}
+
+ std::shared_ptr<DataType> type_;
+};
+
+struct GrouperImpl : Grouper {
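+  // Maps each row's tuple of key values to a dense group id by serializing
+  // the keys into one contiguous byte string per row and interning that
+  // string in a std::unordered_map. GrouperFastImpl below swaps the map for
+  // a SIMD-friendly SwissTable when the key types allow it.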
+ static Result<std::unique_ptr<GrouperImpl>> Make(const std::vector<ValueDescr>& keys,
+ ExecContext* ctx) {
+ auto impl = ::arrow::internal::make_unique<GrouperImpl>();
+
+ impl->encoders_.resize(keys.size());
+ impl->ctx_ = ctx;
+
+ for (size_t i = 0; i < keys.size(); ++i) {
+ const auto& key = keys[i].type;
+
+ if (key->id() == Type::BOOL) {
+ impl->encoders_[i] = ::arrow::internal::make_unique<BooleanKeyEncoder>();
+ continue;
+ }
+
+ if (key->id() == Type::DICTIONARY) {
+ impl->encoders_[i] =
+ ::arrow::internal::make_unique<DictionaryKeyEncoder>(key, ctx->memory_pool());
+ continue;
+ }
+
+ if (is_fixed_width(key->id())) {
+ impl->encoders_[i] = ::arrow::internal::make_unique<FixedWidthKeyEncoder>(key);
+ continue;
+ }
+
+ if (is_binary_like(key->id())) {
+ impl->encoders_[i] =
+ ::arrow::internal::make_unique<VarLengthKeyEncoder<BinaryType>>(key);
+ continue;
+ }
+
+ if (is_large_binary_like(key->id())) {
+ impl->encoders_[i] =
+ ::arrow::internal::make_unique<VarLengthKeyEncoder<LargeBinaryType>>(key);
+ continue;
+ }
+
+ return Status::NotImplemented("Keys of type ", *key);
+ }
+
+ return std::move(impl);
+ }
+
+ Result<Datum> Consume(const ExecBatch& batch) override {
+ std::vector<int32_t> offsets_batch(batch.length + 1);
+ for (int i = 0; i < batch.num_values(); ++i) {
+ encoders_[i]->AddLength(*batch[i].array(), offsets_batch.data());
+ }
+
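+    // Convert per-row lengths into offsets with an in-place exclusive prefix
+    // sum; e.g. lengths {3, 2, 4} become offsets {0, 3, 5} and total_length
+    // ends up as 9, stored as the final offset.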
+ int32_t total_length = 0;
+ for (int64_t i = 0; i < batch.length; ++i) {
+ auto total_length_before = total_length;
+ total_length += offsets_batch[i];
+ offsets_batch[i] = total_length_before;
+ }
+ offsets_batch[batch.length] = total_length;
+
+ std::vector<uint8_t> key_bytes_batch(total_length);
+ std::vector<uint8_t*> key_buf_ptrs(batch.length);
+ for (int64_t i = 0; i < batch.length; ++i) {
+ key_buf_ptrs[i] = key_bytes_batch.data() + offsets_batch[i];
+ }
+
+ for (int i = 0; i < batch.num_values(); ++i) {
+ RETURN_NOT_OK(encoders_[i]->Encode(*batch[i].array(), key_buf_ptrs.data()));
+ }
+
+ TypedBufferBuilder<uint32_t> group_ids_batch(ctx_->memory_pool());
+ RETURN_NOT_OK(group_ids_batch.Resize(batch.length));
+
+ for (int64_t i = 0; i < batch.length; ++i) {
+ int32_t key_length = offsets_batch[i + 1] - offsets_batch[i];
+ std::string key(
+ reinterpret_cast<const char*>(key_bytes_batch.data() + offsets_batch[i]),
+ key_length);
+
+ auto it_success = map_.emplace(key, num_groups_);
+ auto group_id = it_success.first->second;
+
+ if (it_success.second) {
+ // new key; update offsets and key_bytes
+ ++num_groups_;
+ auto next_key_offset = static_cast<int32_t>(key_bytes_.size());
+ key_bytes_.resize(next_key_offset + key_length);
+ offsets_.push_back(next_key_offset + key_length);
+ memcpy(key_bytes_.data() + next_key_offset, key.c_str(), key_length);
+ }
+
+ group_ids_batch.UnsafeAppend(group_id);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto group_ids, group_ids_batch.Finish());
+ return Datum(UInt32Array(batch.length, std::move(group_ids)));
+ }
+
+ uint32_t num_groups() const override { return num_groups_; }
+
+ Result<ExecBatch> GetUniques() override {
+ ExecBatch out({}, num_groups_);
+
+ std::vector<uint8_t*> key_buf_ptrs(num_groups_);
+ for (int64_t i = 0; i < num_groups_; ++i) {
+ key_buf_ptrs[i] = key_bytes_.data() + offsets_[i];
+ }
+
+ out.values.resize(encoders_.size());
+ for (size_t i = 0; i < encoders_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ out.values[i],
+ encoders_[i]->Decode(key_buf_ptrs.data(), static_cast<int32_t>(num_groups_),
+ ctx_->memory_pool()));
+ }
+
+ return out;
+ }
+
+ ExecContext* ctx_;
+ std::unordered_map<std::string, uint32_t> map_;
+ std::vector<int32_t> offsets_ = {0};
+ std::vector<uint8_t> key_bytes_;
+ uint32_t num_groups_ = 0;
+ std::vector<std::unique_ptr<KeyEncoder>> encoders_;
+};
+
+struct GrouperFastImpl : Grouper {
+ static constexpr int kBitmapPaddingForSIMD = 64; // bits
+ static constexpr int kPaddingForSIMD = 32; // bytes
+
+ static bool CanUse(const std::vector<ValueDescr>& keys) {
+#if ARROW_LITTLE_ENDIAN
+ for (size_t i = 0; i < keys.size(); ++i) {
+ const auto& key = keys[i].type;
+ if (is_large_binary_like(key->id())) {
+ return false;
+ }
+ }
+ return true;
+#else
+ return false;
+#endif
+ }
+
+ static Result<std::unique_ptr<GrouperFastImpl>> Make(
+ const std::vector<ValueDescr>& keys, ExecContext* ctx) {
+ auto impl = ::arrow::internal::make_unique<GrouperFastImpl>();
+ impl->ctx_ = ctx;
+
+ RETURN_NOT_OK(impl->temp_stack_.Init(ctx->memory_pool(), 64 * minibatch_size_max_));
+ impl->encode_ctx_.hardware_flags =
+ arrow::internal::CpuInfo::GetInstance()->hardware_flags();
+ impl->encode_ctx_.stack = &impl->temp_stack_;
+
+ auto num_columns = keys.size();
+ impl->col_metadata_.resize(num_columns);
+ impl->key_types_.resize(num_columns);
+ impl->dictionaries_.resize(num_columns);
+ for (size_t icol = 0; icol < num_columns; ++icol) {
+ const auto& key = keys[icol].type;
+ if (key->id() == Type::DICTIONARY) {
+ auto bit_width = checked_cast<const FixedWidthType&>(*key).bit_width();
+ ARROW_DCHECK(bit_width % 8 == 0);
+ impl->col_metadata_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnMetadata(true, bit_width / 8);
+ } else if (key->id() == Type::BOOL) {
+ impl->col_metadata_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnMetadata(true, 0);
+ } else if (is_fixed_width(key->id())) {
+ impl->col_metadata_[icol] = arrow::compute::KeyEncoder::KeyColumnMetadata(
+ true, checked_cast<const FixedWidthType&>(*key).bit_width() / 8);
+ } else if (is_binary_like(key->id())) {
+ impl->col_metadata_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnMetadata(false, sizeof(uint32_t));
+ } else {
+ return Status::NotImplemented("Keys of type ", *key);
+ }
+ impl->key_types_[icol] = key;
+ }
+
+ impl->encoder_.Init(impl->col_metadata_, &impl->encode_ctx_,
+ /* row_alignment = */ sizeof(uint64_t),
+ /* string_alignment = */ sizeof(uint64_t));
+ RETURN_NOT_OK(impl->rows_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
+ RETURN_NOT_OK(
+ impl->rows_minibatch_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
+ impl->minibatch_size_ = impl->minibatch_size_min_;
+ GrouperFastImpl* impl_ptr = impl.get();
+ auto equal_func = [impl_ptr](
+ int num_keys_to_compare, const uint16_t* selection_may_be_null,
+ const uint32_t* group_ids, uint32_t* out_num_keys_mismatch,
+ uint16_t* out_selection_mismatch) {
+ arrow::compute::KeyCompare::CompareRows(
+ num_keys_to_compare, selection_may_be_null, group_ids, &impl_ptr->encode_ctx_,
+ out_num_keys_mismatch, out_selection_mismatch, impl_ptr->rows_minibatch_,
+ impl_ptr->rows_);
+ };
+ auto append_func = [impl_ptr](int num_keys, const uint16_t* selection) {
+ return impl_ptr->rows_.AppendSelectionFrom(impl_ptr->rows_minibatch_, num_keys,
+ selection);
+ };
+ RETURN_NOT_OK(impl->map_.init(impl->encode_ctx_.hardware_flags, ctx->memory_pool(),
+ impl->encode_ctx_.stack, impl->log_minibatch_max_,
+ equal_func, append_func));
+ impl->cols_.resize(num_columns);
+ impl->minibatch_hashes_.resize(impl->minibatch_size_max_ +
+ kPaddingForSIMD / sizeof(uint32_t));
+
+ return std::move(impl);
+ }
+
+ ~GrouperFastImpl() { map_.cleanup(); }
+
+ Result<Datum> Consume(const ExecBatch& batch) override {
+ int64_t num_rows = batch.length;
+ int num_columns = batch.num_values();
+
+ // Process dictionaries
+ for (int icol = 0; icol < num_columns; ++icol) {
+ if (key_types_[icol]->id() == Type::DICTIONARY) {
+ auto data = batch[icol].array();
+ auto dict = MakeArray(data->dictionary);
+ if (dictionaries_[icol]) {
+ if (!dictionaries_[icol]->Equals(dict)) {
+ // TODO(bkietz) unify if necessary. For now, just error if any batch's
+ // dictionary differs from the first we saw for this key
+ return Status::NotImplemented("Unifying differing dictionaries");
+ }
+ } else {
+ dictionaries_[icol] = std::move(dict);
+ }
+ }
+ }
+
+ std::shared_ptr<arrow::Buffer> group_ids;
+ ARROW_ASSIGN_OR_RAISE(
+ group_ids, AllocateBuffer(sizeof(uint32_t) * num_rows, ctx_->memory_pool()));
+
+ for (int icol = 0; icol < num_columns; ++icol) {
+ const uint8_t* non_nulls = nullptr;
+ if (batch[icol].array()->buffers[0] != NULLPTR) {
+ non_nulls = batch[icol].array()->buffers[0]->data();
+ }
+ const uint8_t* fixedlen = batch[icol].array()->buffers[1]->data();
+ const uint8_t* varlen = nullptr;
+ if (!col_metadata_[icol].is_fixed_length) {
+ varlen = batch[icol].array()->buffers[2]->data();
+ }
+
+ int64_t offset = batch[icol].array()->offset;
+
+ auto col_base = arrow::compute::KeyEncoder::KeyColumnArray(
+ col_metadata_[icol], offset + num_rows, non_nulls, fixedlen, varlen);
+
+ cols_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnArray(col_base, offset, num_rows);
+ }
+
+    // Split into smaller mini-batches
+ for (uint32_t start_row = 0; start_row < num_rows;) {
+ uint32_t batch_size_next = std::min(static_cast<uint32_t>(minibatch_size_),
+ static_cast<uint32_t>(num_rows) - start_row);
+
+ // Encode
+ rows_minibatch_.Clean();
+ RETURN_NOT_OK(encoder_.PrepareOutputForEncode(start_row, batch_size_next,
+ &rows_minibatch_, cols_));
+ encoder_.Encode(start_row, batch_size_next, &rows_minibatch_, cols_);
+
+ // Compute hash
+ if (encoder_.row_metadata().is_fixed_length) {
+ Hashing::hash_fixed(encode_ctx_.hardware_flags, batch_size_next,
+ encoder_.row_metadata().fixed_length, rows_minibatch_.data(1),
+ minibatch_hashes_.data());
+ } else {
+ auto hash_temp_buf =
+ util::TempVectorHolder<uint32_t>(&temp_stack_, 4 * batch_size_next);
+ Hashing::hash_varlen(encode_ctx_.hardware_flags, batch_size_next,
+ rows_minibatch_.offsets(), rows_minibatch_.data(2),
+ hash_temp_buf.mutable_data(), minibatch_hashes_.data());
+ }
+
+ // Map
+ RETURN_NOT_OK(
+ map_.map(batch_size_next, minibatch_hashes_.data(),
+ reinterpret_cast<uint32_t*>(group_ids->mutable_data()) + start_row));
+
+ start_row += batch_size_next;
+
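+      // Grow the mini-batch size geometrically (128 -> 256 -> 512 -> 1024) so
+      // small inputs stay cheap while large inputs amortize per-batch setup.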
+ if (minibatch_size_ * 2 <= minibatch_size_max_) {
+ minibatch_size_ *= 2;
+ }
+ }
+
+ return Datum(UInt32Array(batch.length, std::move(group_ids)));
+ }
+
+ uint32_t num_groups() const override { return static_cast<uint32_t>(rows_.length()); }
+
+ // Make sure padded buffers end up with the right logical size
+
+ Result<std::shared_ptr<Buffer>> AllocatePaddedBitmap(int64_t length) {
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Buffer> buf,
+ AllocateBitmap(length + kBitmapPaddingForSIMD, ctx_->memory_pool()));
+ return SliceMutableBuffer(buf, 0, BitUtil::BytesForBits(length));
+ }
+
+ Result<std::shared_ptr<Buffer>> AllocatePaddedBuffer(int64_t size) {
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Buffer> buf,
+ AllocateBuffer(size + kBitmapPaddingForSIMD, ctx_->memory_pool()));
+ return SliceMutableBuffer(buf, 0, size);
+ }
+
+ Result<ExecBatch> GetUniques() override {
+ auto num_columns = static_cast<uint32_t>(col_metadata_.size());
+ int64_t num_groups = rows_.length();
+
+ std::vector<std::shared_ptr<Buffer>> non_null_bufs(num_columns);
+ std::vector<std::shared_ptr<Buffer>> fixedlen_bufs(num_columns);
+ std::vector<std::shared_ptr<Buffer>> varlen_bufs(num_columns);
+
+ for (size_t i = 0; i < num_columns; ++i) {
+ ARROW_ASSIGN_OR_RAISE(non_null_bufs[i], AllocatePaddedBitmap(num_groups));
+ if (col_metadata_[i].is_fixed_length) {
+ if (col_metadata_[i].fixed_length == 0) {
+ ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i], AllocatePaddedBitmap(num_groups));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ fixedlen_bufs[i],
+ AllocatePaddedBuffer(num_groups * col_metadata_[i].fixed_length));
+ }
+ } else {
+ ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i],
+ AllocatePaddedBuffer((num_groups + 1) * sizeof(uint32_t)));
+ }
+ cols_[i] = arrow::compute::KeyEncoder::KeyColumnArray(
+ col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(),
+ fixedlen_bufs[i]->mutable_data(), nullptr);
+ }
+
+ for (int64_t start_row = 0; start_row < num_groups;) {
+ int64_t batch_size_next =
+ std::min(num_groups - start_row, static_cast<int64_t>(minibatch_size_max_));
+ encoder_.DecodeFixedLengthBuffers(start_row, start_row, batch_size_next, rows_,
+ &cols_);
+ start_row += batch_size_next;
+ }
+
+ if (!rows_.metadata().is_fixed_length) {
+ for (size_t i = 0; i < num_columns; ++i) {
+ if (!col_metadata_[i].is_fixed_length) {
+ auto varlen_size =
+ reinterpret_cast<const uint32_t*>(fixedlen_bufs[i]->data())[num_groups];
+ ARROW_ASSIGN_OR_RAISE(varlen_bufs[i], AllocatePaddedBuffer(varlen_size));
+ cols_[i] = arrow::compute::KeyEncoder::KeyColumnArray(
+ col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(),
+ fixedlen_bufs[i]->mutable_data(), varlen_bufs[i]->mutable_data());
+ }
+ }
+
+ for (int64_t start_row = 0; start_row < num_groups;) {
+ int64_t batch_size_next =
+ std::min(num_groups - start_row, static_cast<int64_t>(minibatch_size_max_));
+ encoder_.DecodeVaryingLengthBuffers(start_row, start_row, batch_size_next, rows_,
+ &cols_);
+ start_row += batch_size_next;
+ }
+ }
+
+ ExecBatch out({}, num_groups);
+ out.values.resize(num_columns);
+ for (size_t i = 0; i < num_columns; ++i) {
+ auto valid_count = arrow::internal::CountSetBits(
+ non_null_bufs[i]->data(), /*offset=*/0, static_cast<int64_t>(num_groups));
+ int null_count = static_cast<int>(num_groups) - static_cast<int>(valid_count);
+
+ if (col_metadata_[i].is_fixed_length) {
+ out.values[i] = ArrayData::Make(
+ key_types_[i], num_groups,
+ {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i])}, null_count);
+ } else {
+ out.values[i] =
+ ArrayData::Make(key_types_[i], num_groups,
+ {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i]),
+ std::move(varlen_bufs[i])},
+ null_count);
+ }
+ }
+
+ // Process dictionaries
+ for (size_t icol = 0; icol < num_columns; ++icol) {
+ if (key_types_[icol]->id() == Type::DICTIONARY) {
+ if (dictionaries_[icol]) {
+ out.values[icol].array()->dictionary = dictionaries_[icol]->data();
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(key_types_[icol], 0));
+ out.values[icol].array()->dictionary = dict->data();
+ }
+ }
+ }
+
+ return out;
+ }
+
+ static constexpr int log_minibatch_max_ = 10;
+ static constexpr int minibatch_size_max_ = 1 << log_minibatch_max_;
+ static constexpr int minibatch_size_min_ = 128;
+ int minibatch_size_;
+
+ ExecContext* ctx_;
+ arrow::util::TempVectorStack temp_stack_;
+ arrow::compute::KeyEncoder::KeyEncoderContext encode_ctx_;
+
+ std::vector<std::shared_ptr<arrow::DataType>> key_types_;
+ std::vector<arrow::compute::KeyEncoder::KeyColumnMetadata> col_metadata_;
+ std::vector<arrow::compute::KeyEncoder::KeyColumnArray> cols_;
+ std::vector<uint32_t> minibatch_hashes_;
+
+ std::vector<std::shared_ptr<Array>> dictionaries_;
+
+ arrow::compute::KeyEncoder::KeyRowArray rows_;
+ arrow::compute::KeyEncoder::KeyRowArray rows_minibatch_;
+ arrow::compute::KeyEncoder encoder_;
+ arrow::compute::SwissTable map_;
+};
+
+/// C++ abstract base class for the HashAggregateKernel interface.
+/// Implementations should be default constructible and perform initialization in
+/// Init().
+struct GroupedAggregator : KernelState {
+ virtual Status Init(ExecContext*, const FunctionOptions*,
+ const std::shared_ptr<DataType>&) = 0;
+
+ virtual Status Consume(const ExecBatch& batch) = 0;
+
+ virtual Result<Datum> Finalize() = 0;
+
+ template <typename Reserve>
+ Status MaybeReserve(int64_t old_num_groups, const ExecBatch& batch,
+ const Reserve& reserve) {
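+    // batch is laid out as {argument, group_id_array, group_count_scalar}
+    // (see the kernel signature built in MakeKernel below), so batch[2]
+    // carries the total number of groups seen so far.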
+ int64_t new_num_groups = batch[2].scalar_as<UInt32Scalar>().value;
+ if (new_num_groups <= old_num_groups) {
+ return Status::OK();
+ }
+ return reserve(new_num_groups - old_num_groups);
+ }
+
+ virtual std::shared_ptr<DataType> out_type() const = 0;
+};
+
+// ----------------------------------------------------------------------
+// Count implementation
+
+struct GroupedCountImpl : public GroupedAggregator {
+ Status Init(ExecContext* ctx, const FunctionOptions* options,
+ const std::shared_ptr<DataType>&) override {
+ options_ = checked_cast<const ScalarAggregateOptions&>(*options);
+ counts_ = BufferBuilder(ctx->memory_pool());
+ return Status::OK();
+ }
+
+ Status Consume(const ExecBatch& batch) override {
+ RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
+ num_groups_ += added_groups;
+ return counts_.Append(added_groups * sizeof(int64_t), 0);
+ }));
+
+ auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
+ auto raw_counts = reinterpret_cast<int64_t*>(counts_.mutable_data());
+
+ const auto& input = batch[0].array();
+
+ if (!options_.skip_nulls) {
+ if (input->GetNullCount() != 0) {
+ for (int64_t i = 0, input_i = input->offset; i < input->length; ++i, ++input_i) {
+ auto g = group_ids[i];
+ raw_counts[g] += !BitUtil::GetBit(input->buffers[0]->data(), input_i);
+ }
+ }
+ return Status::OK();
+ }
+
+ arrow::internal::VisitSetBitRunsVoid(
+ input->buffers[0], input->offset, input->length,
+ [&](int64_t begin, int64_t length) {
+ for (int64_t input_i = begin, i = begin - input->offset;
+ input_i < begin + length; ++input_i, ++i) {
+ auto g = group_ids[i];
+ raw_counts[g] += 1;
+ }
+ });
+ return Status::OK();
+ }
+
+ Result<Datum> Finalize() override {
+ ARROW_ASSIGN_OR_RAISE(auto counts, counts_.Finish());
+ return std::make_shared<Int64Array>(num_groups_, std::move(counts));
+ }
+
+ std::shared_ptr<DataType> out_type() const override { return int64(); }
+
+ int64_t num_groups_ = 0;
+ ScalarAggregateOptions options_;
+ BufferBuilder counts_;
+};
+
+// ----------------------------------------------------------------------
+// Sum implementation
+
+struct GroupedSumImpl : public GroupedAggregator {
+ // NB: whether we are accumulating into double, int64_t, or uint64_t
+ // we always have 64 bits per group in the sums buffer.
+ static constexpr size_t kSumSize = sizeof(int64_t);
+
+ using ConsumeImpl = std::function<void(const std::shared_ptr<ArrayData>&,
+ const uint32_t*, void*, int64_t*)>;
+
+ struct GetConsumeImpl {
+ template <typename T, typename AccType = typename FindAccumulatorType<T>::Type>
+ Status Visit(const T&) {
+ consume_impl = [](const std::shared_ptr<ArrayData>& input, const uint32_t* group,
+ void* boxed_sums, int64_t* counts) {
+ auto sums = reinterpret_cast<typename TypeTraits<AccType>::CType*>(boxed_sums);
+
+ VisitArrayDataInline<T>(
+ *input,
+ [&](typename TypeTraits<T>::CType value) {
+ sums[*group] += value;
+ counts[*group] += 1;
+ ++group;
+ },
+ [&] { ++group; });
+ };
+ out_type = TypeTraits<AccType>::type_singleton();
+ return Status::OK();
+ }
+
+ Status Visit(const HalfFloatType& type) {
+ return Status::NotImplemented("Summing data of type ", type);
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("Summing data of type ", type);
+ }
+
+ ConsumeImpl consume_impl;
+ std::shared_ptr<DataType> out_type;
+ };
+
+ Status Init(ExecContext* ctx, const FunctionOptions*,
+ const std::shared_ptr<DataType>& input_type) override {
+ pool_ = ctx->memory_pool();
+ sums_ = BufferBuilder(pool_);
+ counts_ = BufferBuilder(pool_);
+
+ GetConsumeImpl get_consume_impl;
+ RETURN_NOT_OK(VisitTypeInline(*input_type, &get_consume_impl));
+
+ consume_impl_ = std::move(get_consume_impl.consume_impl);
+ out_type_ = std::move(get_consume_impl.out_type);
+
+ return Status::OK();
+ }
+
+ Status Consume(const ExecBatch& batch) override {
+ RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
+ num_groups_ += added_groups;
+ RETURN_NOT_OK(sums_.Append(added_groups * kSumSize, 0));
+ RETURN_NOT_OK(counts_.Append(added_groups * sizeof(int64_t), 0));
+ return Status::OK();
+ }));
+
+ auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
+ consume_impl_(batch[0].array(), group_ids, sums_.mutable_data(),
+ reinterpret_cast<int64_t*>(counts_.mutable_data()));
+ return Status::OK();
+ }
+
+ Result<Datum> Finalize() override {
+ std::shared_ptr<Buffer> null_bitmap;
+ int64_t null_count = 0;
+
+ for (int64_t i = 0; i < num_groups_; ++i) {
+ if (reinterpret_cast<const int64_t*>(counts_.data())[i] > 0) continue;
+
+ if (null_bitmap == nullptr) {
+ ARROW_ASSIGN_OR_RAISE(null_bitmap, AllocateBitmap(num_groups_, pool_));
+ BitUtil::SetBitsTo(null_bitmap->mutable_data(), 0, num_groups_, true);
+ }
+
+ null_count += 1;
+ BitUtil::SetBitTo(null_bitmap->mutable_data(), i, false);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto sums, sums_.Finish());
+
+ return ArrayData::Make(std::move(out_type_), num_groups_,
+ {std::move(null_bitmap), std::move(sums)}, null_count);
+ }
+
+ std::shared_ptr<DataType> out_type() const override { return out_type_; }
+
+ // NB: counts are used here instead of a simple "has_values_" bitmap since
+ // we expect to reuse this kernel to handle Mean
+ int64_t num_groups_ = 0;
+ BufferBuilder sums_, counts_;
+ std::shared_ptr<DataType> out_type_;
+ ConsumeImpl consume_impl_;
+ MemoryPool* pool_;
+};
+
+// ----------------------------------------------------------------------
+// MinMax implementation
+
+template <typename CType>
+struct Extrema : std::numeric_limits<CType> {};
+
+template <>
+struct Extrema<float> {
+ static constexpr float min() { return -std::numeric_limits<float>::infinity(); }
+ static constexpr float max() { return std::numeric_limits<float>::infinity(); }
+};
+
+template <>
+struct Extrema<double> {
+ static constexpr double min() { return -std::numeric_limits<double>::infinity(); }
+ static constexpr double max() { return std::numeric_limits<double>::infinity(); }
+};
+
+struct GroupedMinMaxImpl : public GroupedAggregator {
+ using ConsumeImpl =
+ std::function<void(const std::shared_ptr<ArrayData>&, const uint32_t*, void*, void*,
+ uint8_t*, uint8_t*)>;
+
+ using ResizeImpl = std::function<Status(BufferBuilder*, int64_t)>;
+
+ template <typename CType>
+ static ResizeImpl MakeResizeImpl(CType anti_extreme) {
+    // resize a min or max buffer, seeding new slots with the correct anti-extreme
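+    // e.g. a min buffer is seeded with Extrema<CType>::max() and a max buffer
+    // with Extrema<CType>::min() (+/-infinity for floats, see Extrema above),
+    // so the first value observed for a group always replaces the seed.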
+ return [anti_extreme](BufferBuilder* builder, int64_t added_groups) {
+ TypedBufferBuilder<CType> typed_builder(std::move(*builder));
+ RETURN_NOT_OK(typed_builder.Append(added_groups, anti_extreme));
+ *builder = std::move(*typed_builder.bytes_builder());
+ return Status::OK();
+ };
+ }
+
+ struct GetImpl {
+ template <typename T, typename CType = typename TypeTraits<T>::CType>
+ enable_if_number<T, Status> Visit(const T&) {
+ consume_impl = [](const std::shared_ptr<ArrayData>& input, const uint32_t* group,
+ void* mins, void* maxes, uint8_t* has_values,
+ uint8_t* has_nulls) {
+ auto raw_mins = reinterpret_cast<CType*>(mins);
+ auto raw_maxes = reinterpret_cast<CType*>(maxes);
+
+ VisitArrayDataInline<T>(
+ *input,
+ [&](CType val) {
+ raw_maxes[*group] = std::max(raw_maxes[*group], val);
+ raw_mins[*group] = std::min(raw_mins[*group], val);
+ BitUtil::SetBit(has_values, *group++);
+ },
+ [&] { BitUtil::SetBit(has_nulls, *group++); });
+ };
+
+ resize_min_impl = MakeResizeImpl(Extrema<CType>::max());
+ resize_max_impl = MakeResizeImpl(Extrema<CType>::min());
+ return Status::OK();
+ }
+
+ Status Visit(const BooleanType& type) {
+ return Status::NotImplemented("Grouped MinMax data of type ", type);
+ }
+
+ Status Visit(const HalfFloatType& type) {
+ return Status::NotImplemented("Grouped MinMax data of type ", type);
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("Grouped MinMax data of type ", type);
+ }
+
+ ConsumeImpl consume_impl;
+ ResizeImpl resize_min_impl, resize_max_impl;
+ };
+
+ Status Init(ExecContext* ctx, const FunctionOptions* options,
+ const std::shared_ptr<DataType>& input_type) override {
+ options_ = *checked_cast<const ScalarAggregateOptions*>(options);
+ type_ = input_type;
+
+ mins_ = BufferBuilder(ctx->memory_pool());
+ maxes_ = BufferBuilder(ctx->memory_pool());
+ has_values_ = TypedBufferBuilder<bool>(ctx->memory_pool());
+ has_nulls_ = TypedBufferBuilder<bool>(ctx->memory_pool());
+
+ GetImpl get_impl;
+ RETURN_NOT_OK(VisitTypeInline(*input_type, &get_impl));
+
+ consume_impl_ = std::move(get_impl.consume_impl);
+ resize_min_impl_ = std::move(get_impl.resize_min_impl);
+ resize_max_impl_ = std::move(get_impl.resize_max_impl);
+
+ return Status::OK();
+ }
+
+ Status Consume(const ExecBatch& batch) override {
+ RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
+ num_groups_ += added_groups;
+ RETURN_NOT_OK(resize_min_impl_(&mins_, added_groups));
+ RETURN_NOT_OK(resize_max_impl_(&maxes_, added_groups));
+ RETURN_NOT_OK(has_values_.Append(added_groups, false));
+ RETURN_NOT_OK(has_nulls_.Append(added_groups, false));
+ return Status::OK();
+ }));
+
+ auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
+ consume_impl_(batch[0].array(), group_ids, mins_.mutable_data(),
+ maxes_.mutable_data(), has_values_.mutable_data(),
+ has_nulls_.mutable_data());
+ return Status::OK();
+ }
+
+ Result<Datum> Finalize() override {
+ // aggregation for group is valid if there was at least one value in that group
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap, has_values_.Finish());
+
+ if (!options_.skip_nulls) {
+ // ... and there were no nulls in that group
+ ARROW_ASSIGN_OR_RAISE(auto has_nulls, has_nulls_.Finish());
+ arrow::internal::BitmapAndNot(null_bitmap->data(), 0, has_nulls->data(), 0,
+ num_groups_, 0, null_bitmap->mutable_data());
+ }
+
+ auto mins = ArrayData::Make(type_, num_groups_, {null_bitmap, nullptr});
+ auto maxes = ArrayData::Make(type_, num_groups_, {std::move(null_bitmap), nullptr});
+ ARROW_ASSIGN_OR_RAISE(mins->buffers[1], mins_.Finish());
+ ARROW_ASSIGN_OR_RAISE(maxes->buffers[1], maxes_.Finish());
+
+ return ArrayData::Make(out_type(), num_groups_, {nullptr},
+ {std::move(mins), std::move(maxes)});
+ }
+
+ std::shared_ptr<DataType> out_type() const override {
+ return struct_({field("min", type_), field("max", type_)});
+ }
+
+ int64_t num_groups_;
+ BufferBuilder mins_, maxes_;
+ TypedBufferBuilder<bool> has_values_, has_nulls_;
+ std::shared_ptr<DataType> type_;
+ ConsumeImpl consume_impl_;
+ ResizeImpl resize_min_impl_, resize_max_impl_;
+ ScalarAggregateOptions options_;
+};
+
+template <typename Impl>
+HashAggregateKernel MakeKernel(InputType argument_type) {
+ HashAggregateKernel kernel;
+
+ kernel.init = [](KernelContext* ctx,
+ const KernelInitArgs& args) -> Result<std::unique_ptr<KernelState>> {
+ auto impl = ::arrow::internal::make_unique<Impl>();
+ // FIXME(bkietz) Init should not take a type. That should be an unboxed template arg
+ // for the Impl. Otherwise we're not exposing dispatch as well as we should.
+ RETURN_NOT_OK(impl->Init(ctx->exec_context(), args.options, args.inputs[0].type));
+ return std::move(impl);
+ };
+
+ kernel.signature = KernelSignature::Make(
+ {std::move(argument_type), InputType::Array(Type::UINT32),
+ InputType::Scalar(Type::UINT32)},
+ OutputType(
+ [](KernelContext* ctx, const std::vector<ValueDescr>&) -> Result<ValueDescr> {
+ return checked_cast<GroupedAggregator*>(ctx->state())->out_type();
+ }));
+
+ kernel.consume = [](KernelContext* ctx, const ExecBatch& batch) {
+ return checked_cast<GroupedAggregator*>(ctx->state())->Consume(batch);
+ };
+
+ kernel.merge = [](KernelContext* ctx, KernelState&&, KernelState*) {
+ // TODO(ARROW-11840) merge two hash tables
+ return Status::NotImplemented("Merge hashed aggregations");
+ };
+
+ kernel.finalize = [](KernelContext* ctx, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(*out,
+ checked_cast<GroupedAggregator*>(ctx->state())->Finalize());
+ return Status::OK();
+ };
+
+ return kernel;
+}
+
+Result<std::vector<const HashAggregateKernel*>> GetKernels(
+ ExecContext* ctx, const std::vector<Aggregate>& aggregates,
+ const std::vector<ValueDescr>& in_descrs) {
+ if (aggregates.size() != in_descrs.size()) {
+ return Status::Invalid(aggregates.size(), " aggregate functions were specified but ",
+ in_descrs.size(), " arguments were provided.");
+ }
+
+ std::vector<const HashAggregateKernel*> kernels(in_descrs.size());
+
+ for (size_t i = 0; i < aggregates.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto function,
+ ctx->func_registry()->GetFunction(aggregates[i].function));
+ ARROW_ASSIGN_OR_RAISE(
+ const Kernel* kernel,
+ function->DispatchExact(
+ {in_descrs[i], ValueDescr::Array(uint32()), ValueDescr::Scalar(uint32())}));
+ kernels[i] = static_cast<const HashAggregateKernel*>(kernel);
+ }
+ return kernels;
+}
+
+Result<std::vector<std::unique_ptr<KernelState>>> InitKernels(
+ const std::vector<const HashAggregateKernel*>& kernels, ExecContext* ctx,
+ const std::vector<Aggregate>& aggregates, const std::vector<ValueDescr>& in_descrs) {
+ std::vector<std::unique_ptr<KernelState>> states(kernels.size());
+
+ for (size_t i = 0; i < aggregates.size(); ++i) {
+ auto options = aggregates[i].options;
+
+ if (options == nullptr) {
+ // use known default options for the named function if possible
+ auto maybe_function = ctx->func_registry()->GetFunction(aggregates[i].function);
+ if (maybe_function.ok()) {
+ options = maybe_function.ValueOrDie()->default_options();
+ }
+ }
+
+ KernelContext kernel_ctx{ctx};
+ ARROW_ASSIGN_OR_RAISE(
+ states[i], kernels[i]->init(&kernel_ctx, KernelInitArgs{kernels[i],
+ {
+ in_descrs[i].type,
+ uint32(),
+ uint32(),
+ },
+ options}));
+ }
+
+ return std::move(states);
+}
+
+Result<FieldVector> ResolveKernels(
+ const std::vector<Aggregate>& aggregates,
+ const std::vector<const HashAggregateKernel*>& kernels,
+ const std::vector<std::unique_ptr<KernelState>>& states, ExecContext* ctx,
+ const std::vector<ValueDescr>& descrs) {
+ FieldVector fields(descrs.size());
+
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ KernelContext kernel_ctx{ctx};
+ kernel_ctx.SetState(states[i].get());
+
+ ARROW_ASSIGN_OR_RAISE(auto descr, kernels[i]->signature->out_type().Resolve(
+ &kernel_ctx, {
+ descrs[i].type,
+ uint32(),
+ uint32(),
+ }));
+ fields[i] = field(aggregates[i].function, std::move(descr.type));
+ }
+ return fields;
+}
+
+} // namespace
+
+Result<std::unique_ptr<Grouper>> Grouper::Make(const std::vector<ValueDescr>& descrs,
+ ExecContext* ctx) {
+ if (GrouperFastImpl::CanUse(descrs)) {
+ return GrouperFastImpl::Make(descrs, ctx);
+ }
+ return GrouperImpl::Make(descrs, ctx);
+}
+
+Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
+ const std::vector<Aggregate>& aggregates, ExecContext* ctx) {
+ // Construct and initialize HashAggregateKernels
+ ARROW_ASSIGN_OR_RAISE(auto argument_descrs,
+ ExecBatch::Make(arguments).Map(
+ [](ExecBatch batch) { return batch.GetDescriptors(); }));
+
+ ARROW_ASSIGN_OR_RAISE(auto kernels, GetKernels(ctx, aggregates, argument_descrs));
+
+ ARROW_ASSIGN_OR_RAISE(auto states,
+ InitKernels(kernels, ctx, aggregates, argument_descrs));
+
+ ARROW_ASSIGN_OR_RAISE(
+ FieldVector out_fields,
+ ResolveKernels(aggregates, kernels, states, ctx, argument_descrs));
+
+ using arrow::compute::detail::ExecBatchIterator;
+
+ ARROW_ASSIGN_OR_RAISE(auto argument_batch_iterator,
+ ExecBatchIterator::Make(arguments, ctx->exec_chunksize()));
+
+ // Construct Grouper
+ ARROW_ASSIGN_OR_RAISE(auto key_descrs, ExecBatch::Make(keys).Map([](ExecBatch batch) {
+ return batch.GetDescriptors();
+ }));
+
+ ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_descrs, ctx));
+
+ int i = 0;
+ for (ValueDescr& key_descr : key_descrs) {
+ out_fields.push_back(field("key_" + std::to_string(i++), std::move(key_descr.type)));
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto key_batch_iterator,
+ ExecBatchIterator::Make(keys, ctx->exec_chunksize()));
+
+ // start "streaming" execution
+ ExecBatch key_batch, argument_batch;
+ while (argument_batch_iterator->Next(&argument_batch) &&
+ key_batch_iterator->Next(&key_batch)) {
+ if (key_batch.length == 0) continue;
+
+ // compute a batch of group ids
+ ARROW_ASSIGN_OR_RAISE(Datum id_batch, grouper->Consume(key_batch));
+
+ // consume group ids with HashAggregateKernels
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ KernelContext batch_ctx{ctx};
+ batch_ctx.SetState(states[i].get());
+ ARROW_ASSIGN_OR_RAISE(auto batch, ExecBatch::Make({argument_batch[i], id_batch,
+ Datum(grouper->num_groups())}));
+ RETURN_NOT_OK(kernels[i]->consume(&batch_ctx, batch));
+ }
+ }
+
+ // Finalize output
+ ArrayDataVector out_data(arguments.size() + keys.size());
+ auto it = out_data.begin();
+
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ KernelContext batch_ctx{ctx};
+ batch_ctx.SetState(states[i].get());
+ Datum out;
+ RETURN_NOT_OK(kernels[i]->finalize(&batch_ctx, &out));
+ *it++ = out.array();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(ExecBatch out_keys, grouper->GetUniques());
+ for (const auto& key : out_keys.values) {
+ *it++ = key.array();
+ }
+
+ int64_t length = out_data[0]->length;
+ return ArrayData::Make(struct_(std::move(out_fields)), length,
+ {/*null_bitmap=*/nullptr}, std::move(out_data),
+ /*null_count=*/0);
+}
+
+Result<std::shared_ptr<ListArray>> Grouper::ApplyGroupings(const ListArray& groupings,
+ const Array& array,
+ ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum sorted,
+ compute::Take(array, groupings.data()->child_data[0],
+ TakeOptions::NoBoundsCheck(), ctx));
+
+ return std::make_shared<ListArray>(list(array.type()), groupings.length(),
+ groupings.value_offsets(), sorted.make_array());
+}
+
+Result<std::shared_ptr<ListArray>> Grouper::MakeGroupings(const UInt32Array& ids,
+ uint32_t num_groups,
+ ExecContext* ctx) {
+ if (ids.null_count() != 0) {
+ return Status::Invalid("MakeGroupings with null ids");
+ }
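+
+  // The groupings are built with a counting sort; e.g. for
+  // ids = {0, 1, 0, 2, 1} and num_groups = 3, the per-group counts {2, 2, 1}
+  // become offsets {0, 2, 4, 5} and the scatter pass below yields the list
+  // array [[0, 2], [1, 4], [3]] of row indices belonging to each group.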
+
+ ARROW_ASSIGN_OR_RAISE(auto offsets, AllocateBuffer(sizeof(int32_t) * (num_groups + 1),
+ ctx->memory_pool()));
+ auto raw_offsets = reinterpret_cast<int32_t*>(offsets->mutable_data());
+
+ std::memset(raw_offsets, 0, offsets->size());
+ for (int i = 0; i < ids.length(); ++i) {
+ DCHECK_LT(ids.Value(i), num_groups);
+ raw_offsets[ids.Value(i)] += 1;
+ }
+ int32_t length = 0;
+ for (uint32_t id = 0; id < num_groups; ++id) {
+ auto offset = raw_offsets[id];
+ raw_offsets[id] = length;
+ length += offset;
+ }
+ raw_offsets[num_groups] = length;
+ DCHECK_EQ(ids.length(), length);
+
+ ARROW_ASSIGN_OR_RAISE(auto offsets_copy,
+ offsets->CopySlice(0, offsets->size(), ctx->memory_pool()));
+ raw_offsets = reinterpret_cast<int32_t*>(offsets_copy->mutable_data());
+
+ ARROW_ASSIGN_OR_RAISE(auto sort_indices, AllocateBuffer(sizeof(int32_t) * ids.length(),
+ ctx->memory_pool()));
+ auto raw_sort_indices = reinterpret_cast<int32_t*>(sort_indices->mutable_data());
+ for (int i = 0; i < ids.length(); ++i) {
+ raw_sort_indices[raw_offsets[ids.Value(i)]++] = i;
+ }
+
+ return std::make_shared<ListArray>(
+ list(int32()), num_groups, std::move(offsets),
+ std::make_shared<Int32Array>(ids.length(), std::move(sort_indices)));
+}
+
+namespace {
+const FunctionDoc hash_count_doc{"Count the number of null / non-null values",
+ ("By default, non-null values are counted.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array", "group_id_array", "group_count"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc hash_sum_doc{"Sum values of a numeric array",
+ ("Null values are ignored."),
+ {"array", "group_id_array", "group_count"}};
+
+const FunctionDoc hash_min_max_doc{
+ "Compute the minimum and maximum values of a numeric array",
+ ("Null values are ignored by default.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array", "group_id_array", "group_count"},
+ "ScalarAggregateOptions"};
+} // namespace
+
+void RegisterHashAggregateBasic(FunctionRegistry* registry) {
+ {
+ static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+ auto func = std::make_shared<HashAggregateFunction>(
+ "hash_count", Arity::Ternary(), &hash_count_doc,
+ &default_scalar_aggregate_options);
+ DCHECK_OK(func->AddKernel(MakeKernel<GroupedCountImpl>(ValueDescr::ARRAY)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+
+ {
+ auto func = std::make_shared<HashAggregateFunction>("hash_sum", Arity::Ternary(),
+ &hash_sum_doc);
+ DCHECK_OK(func->AddKernel(MakeKernel<GroupedSumImpl>(ValueDescr::ARRAY)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+
+ {
+ static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+ auto func = std::make_shared<HashAggregateFunction>(
+ "hash_min_max", Arity::Ternary(), &hash_min_max_doc,
+ &default_scalar_aggregate_options);
+ DCHECK_OK(func->AddKernel(MakeKernel<GroupedMinMaxImpl>(ValueDescr::ARRAY)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
new file mode 100644
index 00000000000..a5d4a557740
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
@@ -0,0 +1,1823 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <utility>
+
+#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+using internal::AddWithOverflow;
+using internal::DivideWithOverflow;
+using internal::MultiplyWithOverflow;
+using internal::NegateWithOverflow;
+using internal::SubtractWithOverflow;
+
+namespace compute {
+namespace internal {
+
+using applicator::ScalarBinaryEqualTypes;
+using applicator::ScalarBinaryNotNullEqualTypes;
+using applicator::ScalarUnary;
+using applicator::ScalarUnaryNotNull;
+
+namespace {
+
+template <typename T>
+using is_unsigned_integer = std::integral_constant<bool, std::is_integral<T>::value &&
+ std::is_unsigned<T>::value>;
+
+template <typename T>
+using is_signed_integer =
+ std::integral_constant<bool, std::is_integral<T>::value && std::is_signed<T>::value>;
+
+template <typename T, typename R = T>
+using enable_if_signed_integer = enable_if_t<is_signed_integer<T>::value, R>;
+
+template <typename T, typename R = T>
+using enable_if_unsigned_integer = enable_if_t<is_unsigned_integer<T>::value, R>;
+
+template <typename T, typename R = T>
+using enable_if_integer =
+ enable_if_t<is_signed_integer<T>::value || is_unsigned_integer<T>::value, R>;
+
+template <typename T, typename R = T>
+using enable_if_floating_point = enable_if_t<std::is_floating_point<T>::value, R>;
+
+template <typename T>
+using enable_if_decimal =
+ enable_if_t<std::is_same<Decimal128, T>::value || std::is_same<Decimal256, T>::value,
+ T>;
+
+template <typename T, typename Unsigned = typename std::make_unsigned<T>::type>
+constexpr Unsigned to_unsigned(T signed_) {
+ return static_cast<Unsigned>(signed_);
+}
+
+struct AbsoluteValue {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, T arg, Status*) {
+ return std::fabs(arg);
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, T arg, Status*) {
+ return arg;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, T arg, Status* st) {
+ return (arg < 0) ? arrow::internal::SafeSignedNegate(arg) : arg;
+ }
+};
+
+struct AbsoluteValueChecked {
+ template <typename T, typename Arg>
+ static enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == std::numeric_limits<Arg>::min()) {
+ *st = Status::Invalid("overflow");
+ return arg;
+ }
+ return std::abs(arg);
+ }
+
+ template <typename T, typename Arg>
+ static enable_if_unsigned_integer<T> Call(KernelContext* ctx, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ return arg;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ return std::fabs(arg);
+ }
+};
+
+struct Add {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ return left + right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 left,
+ Arg1 right, Status*) {
+ return left + right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ return arrow::internal::SafeSignedAdd(left, right);
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + right;
+ }
+};
+
+struct AddChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ T result = 0;
+ if (ARROW_PREDICT_FALSE(AddWithOverflow(left, right, &result))) {
+ *st = Status::Invalid("overflow");
+ }
+ return result;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ return left + right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + right;
+ }
+};
+
+struct Subtract {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ return left - right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 left,
+ Arg1 right, Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ return left - right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ return arrow::internal::SafeSignedSubtract(left, right);
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + (-right);
+ }
+};
+
+struct SubtractChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ T result = 0;
+ if (ARROW_PREDICT_FALSE(SubtractWithOverflow(left, right, &result))) {
+ *st = Status::Invalid("overflow");
+ }
+ return result;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ return left - right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + (-right);
+ }
+};
+
+struct Multiply {
+ static_assert(std::is_same<decltype(int8_t() * int8_t()), int32_t>::value, "");
+ static_assert(std::is_same<decltype(uint8_t() * uint8_t()), int32_t>::value, "");
+ static_assert(std::is_same<decltype(int16_t() * int16_t()), int32_t>::value, "");
+ static_assert(std::is_same<decltype(uint16_t() * uint16_t()), int32_t>::value, "");
+ static_assert(std::is_same<decltype(int32_t() * int32_t()), int32_t>::value, "");
+ static_assert(std::is_same<decltype(uint32_t() * uint32_t()), uint32_t>::value, "");
+ static_assert(std::is_same<decltype(int64_t() * int64_t()), int64_t>::value, "");
+ static_assert(std::is_same<decltype(uint64_t() * uint64_t()), uint64_t>::value, "");
+
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, T left, T right,
+ Status*) {
+ return left * right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_t<
+ is_unsigned_integer<T>::value && !std::is_same<T, uint16_t>::value, T>
+ Call(KernelContext*, T left, T right, Status*) {
+ return left * right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_t<
+ is_signed_integer<T>::value && !std::is_same<T, int16_t>::value, T>
+ Call(KernelContext*, T left, T right, Status*) {
+ return to_unsigned(left) * to_unsigned(right);
+ }
+
+  // Multiplication of 16-bit integer types implicitly promotes to signed
+  // 32-bit integer. However, some inputs may nevertheless overflow (which
+  // triggers undefined behaviour). Therefore we first cast to 32-bit unsigned
+  // integers where overflow is well defined.
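+  // e.g. uint16_t{65535} * uint16_t{65535} promotes to int, and the product
+  // 4294836225 exceeds INT32_MAX, which would be undefined behaviour; on
+  // uint32_t the multiplication is well defined and converting the result
+  // back to uint16_t wraps as expected.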
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_same<T, int16_t, T> Call(KernelContext*, int16_t left,
+ int16_t right, Status*) {
+ return static_cast<uint32_t>(left) * static_cast<uint32_t>(right);
+ }
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_same<T, uint16_t, T> Call(KernelContext*, uint16_t left,
+ uint16_t right, Status*) {
+ return static_cast<uint32_t>(left) * static_cast<uint32_t>(right);
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left * right;
+ }
+};
+
+struct MultiplyChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ T result = 0;
+ if (ARROW_PREDICT_FALSE(MultiplyWithOverflow(left, right, &result))) {
+ *st = Status::Invalid("overflow");
+ }
+ return result;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ return left * right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left * right;
+ }
+};
+
+struct Divide {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ return left / right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ T result;
+ if (ARROW_PREDICT_FALSE(DivideWithOverflow(left, right, &result))) {
+ if (right == 0) {
+ *st = Status::Invalid("divide by zero");
+ } else {
+ result = 0;
+ }
+ }
+ return result;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ if (right == Arg1()) {
+ *st = Status::Invalid("Divide by zero");
+ return T();
+ } else {
+ return left / right;
+ }
+ }
+};
+
+struct DivideChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ T result;
+ if (ARROW_PREDICT_FALSE(DivideWithOverflow(left, right, &result))) {
+ if (right == 0) {
+ *st = Status::Invalid("divide by zero");
+ } else {
+ *st = Status::Invalid("overflow");
+ }
+ }
+ return result;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status* st) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ if (ARROW_PREDICT_FALSE(right == 0)) {
+ *st = Status::Invalid("divide by zero");
+ return 0;
+ }
+ return left / right;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext* ctx, Arg0 left, Arg1 right,
+ Status* st) {
+ return Divide::Call<T>(ctx, left, right, st);
+ }
+};
+
+struct Negate {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return -arg;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return ~arg + 1;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return arrow::internal::SafeSignedNegate(arg);
+ }
+};
+
+struct NegateChecked {
+ template <typename T, typename Arg>
+ static enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ T result = 0;
+ if (ARROW_PREDICT_FALSE(NegateWithOverflow(arg, &result))) {
+ *st = Status::Invalid("overflow");
+ }
+ return result;
+ }
+
+ template <typename T, typename Arg>
+ static enable_if_unsigned_integer<T> Call(KernelContext* ctx, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ DCHECK(false) << "This is included only for the purposes of instantiability from the "
+ "arithmetic kernel generator";
+ return 0;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ return -arg;
+ }
+};
+
+struct Power {
+ ARROW_NOINLINE
+ static uint64_t IntegerPower(uint64_t base, uint64_t exp) {
+    // right-to-left O(log n) power
+ uint64_t pow = 1;
+ while (exp) {
+ pow *= (exp & 1) ? base : 1;
+ base *= base;
+ exp >>= 1;
+ }
+ return pow;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, T base, T exp, Status* st) {
+ if (exp < 0) {
+ *st = Status::Invalid("integers to negative integer powers are not allowed");
+ return 0;
+ }
+ return static_cast<T>(IntegerPower(base, exp));
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, T base, T exp, Status*) {
+ return std::pow(base, exp);
+ }
+};
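+
+// Worked example of IntegerPower (right-to-left binary exponentiation) with
+// base = 3, exp = 5 (binary 101):
+//   bit 0 (set):   pow = 1 * 3  = 3,   base = 3 * 3   = 9
+//   bit 1 (clear): pow stays 3,        base = 9 * 9   = 81
+//   bit 2 (set):   pow = 3 * 81 = 243, base = 81 * 81 = 6561
+// giving 3^5 = 243 in O(log exp) multiplications; products wrap on overflow,
+// as documented for "power".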
+
+struct PowerChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, Arg0 base, Arg1 exp, Status* st) {
+ if (exp < 0) {
+ *st = Status::Invalid("integers to negative integer powers are not allowed");
+ return 0;
+ } else if (exp == 0) {
+ return 1;
+ }
+    // left-to-right O(log n) power with overflow checks
+ bool overflow = false;
+ uint64_t bitmask =
+ 1ULL << (63 - BitUtil::CountLeadingZeros(static_cast<uint64_t>(exp)));
+ T pow = 1;
+ while (bitmask) {
+ overflow |= MultiplyWithOverflow(pow, pow, &pow);
+ if (exp & bitmask) {
+ overflow |= MultiplyWithOverflow(pow, base, &pow);
+ }
+ bitmask >>= 1;
+ }
+ if (overflow) {
+ *st = Status::Invalid("overflow");
+ }
+ return pow;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 base, Arg1 exp, Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ return std::pow(base, exp);
+ }
+};
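+
+// PowerChecked instead scans the exponent bits from the most significant bit
+// downward, squaring the accumulator before each conditional multiply; since
+// every intermediate product goes through MultiplyWithOverflow, overflow is
+// reported rather than silently wrapped.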
+
+struct Sign {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::isnan(arg) ? arg : ((arg == 0) ? 0 : (std::signbit(arg) ? -1 : 1));
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return arg > 0;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return (arg > 0) ? 1 : ((arg == 0) ? 0 : -1);
+ }
+};
+
+// Bitwise operations
+
+struct BitWiseNot {
+ template <typename T, typename Arg>
+ static T Call(KernelContext*, Arg arg, Status*) {
+ return ~arg;
+ }
+};
+
+struct BitWiseAnd {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ return lhs & rhs;
+ }
+};
+
+struct BitWiseOr {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ return lhs | rhs;
+ }
+};
+
+struct BitWiseXor {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ return lhs ^ rhs;
+ }
+};
+
+struct ShiftLeft {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ using Unsigned = typename std::make_unsigned<Arg0>::type;
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ return lhs;
+ }
+ return static_cast<T>(static_cast<Unsigned>(lhs) << static_cast<Unsigned>(rhs));
+ }
+};
+
+// See SEI CERT C Coding Standard rule INT34-C
+struct ShiftLeftChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 lhs, Arg1 rhs,
+ Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
+ return lhs;
+ }
+ return lhs << rhs;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_signed_integer<T> Call(KernelContext*, Arg0 lhs, Arg1 rhs,
+ Status* st) {
+ using Unsigned = typename std::make_unsigned<Arg0>::type;
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
+ return lhs;
+ }
+ // In C/C++ left shift of a negative number is undefined (C++11 standard 5.8.2)
+ // Mimic Java/etc. and treat left shift as based on two's complement representation
+ // Assumes two's complement machine
+ return static_cast<T>(static_cast<Unsigned>(lhs) << static_cast<Unsigned>(rhs));
+ }
+};
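+
+// Illustration of the signed overload above: int8_t(-1) << 1 is undefined
+// behavior in C++, but routing through uint8_t computes 0xFF << 1, whose low
+// byte 0xFE converts back to int8_t(-2) on two's-complement machines,
+// matching Java's semantics.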
+
+struct ShiftRight {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ // Logical right shift when Arg0 is unsigned
+ // Arithmetic otherwise (this is implementation-defined but GCC and MSVC document this
+ // as arithmetic right shift)
+ // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
+ // https://docs.microsoft.com/en-us/cpp/cpp/left-shift-and-right-shift-operators-input-and-output?view=msvc-160
+  // Clang doesn't document its behavior.
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ return lhs;
+ }
+ return lhs >> rhs;
+ }
+};
+
+struct ShiftRightChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
+ return lhs;
+ }
+ return lhs >> rhs;
+ }
+};
+
+struct Sin {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::sin(val);
+ }
+};
+
+struct SinChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(std::isinf(val))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::sin(val);
+ }
+};
+
+struct Cos {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::cos(val);
+ }
+};
+
+struct CosChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(std::isinf(val))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::cos(val);
+ }
+};
+
+struct Tan {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::tan(val);
+ }
+};
+
+struct TanChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(std::isinf(val))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ // Cannot raise range errors (overflow) since PI/2 is not exactly representable
+ return std::tan(val);
+ }
+};
+
+struct Asin {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::asin(val);
+ }
+};
+
+struct AsinChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::asin(val);
+ }
+};
+
+struct Acos {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+    if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::acos(val);
+ }
+};
+
+struct AcosChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+    if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::acos(val);
+ }
+};
+
+struct Atan {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::atan(val);
+ }
+};
+
+struct Atan2 {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 y, Arg1 x, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ static_assert(std::is_same<Arg0, Arg1>::value, "");
+ return std::atan2(y, x);
+ }
+};
+
+struct LogNatural {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < 0.0) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log(arg);
+ }
+};
+
+struct LogNaturalChecked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < 0.0) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log(arg);
+ }
+};
+
+struct Log10 {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < 0.0) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log10(arg);
+ }
+};
+
+struct Log10Checked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+    if (arg == 0.0) {
+      *st = Status::Invalid("logarithm of zero");
+      return arg;
+    } else if (arg < 0.0) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log10(arg);
+ }
+};
+
+struct Log2 {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < 0.0) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log2(arg);
+ }
+};
+
+struct Log2Checked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < 0.0) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log2(arg);
+ }
+};
+
+struct Log1p {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == -1) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < -1) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log1p(arg);
+ }
+};
+
+struct Log1pChecked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == -1) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < -1) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log1p(arg);
+ }
+};
+
+struct Floor {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::floor(arg);
+ }
+};
+
+struct Ceil {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::ceil(arg);
+ }
+};
+
+struct Trunc {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::trunc(arg);
+ }
+};
+
+// Generate a kernel given an arithmetic functor
+template <template <typename... Args> class KernelGenerator, typename Op>
+ArrayKernelExec ArithmeticExecFromOp(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return KernelGenerator<Int8Type, Int8Type, Op>::Exec;
+ case Type::UINT8:
+ return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
+ case Type::INT16:
+ return KernelGenerator<Int16Type, Int16Type, Op>::Exec;
+ case Type::UINT16:
+ return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
+ case Type::INT32:
+ return KernelGenerator<Int32Type, Int32Type, Op>::Exec;
+ case Type::UINT32:
+ return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
+ case Type::INT64:
+ case Type::TIMESTAMP:
+ return KernelGenerator<Int64Type, Int64Type, Op>::Exec;
+ case Type::UINT64:
+ return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
+ case Type::FLOAT:
+ return KernelGenerator<FloatType, FloatType, Op>::Exec;
+ case Type::DOUBLE:
+ return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// Generate a kernel given a bitwise arithmetic functor. Assumes the
+// functor treats all integer types of equal width identically
+template <template <typename... Args> class KernelGenerator, typename Op>
+ArrayKernelExec TypeAgnosticBitWiseExecFromOp(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ case Type::UINT8:
+ return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
+ case Type::INT16:
+ case Type::UINT16:
+ return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
+ case Type::INT32:
+ case Type::UINT32:
+ return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
+ case Type::INT64:
+ case Type::UINT64:
+ return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
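+
+// E.g. bit_wise_and on INT16 and on UINT16 inputs both dispatch to the
+// UInt16Type kernel: the operation acts purely on the 16-bit pattern, so
+// signedness is irrelevant and one instantiation per width suffices.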
+
+template <template <typename... Args> class KernelGenerator, typename Op>
+ArrayKernelExec ShiftExecFromOp(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return KernelGenerator<Int8Type, Int8Type, Op>::Exec;
+ case Type::UINT8:
+ return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
+ case Type::INT16:
+ return KernelGenerator<Int16Type, Int16Type, Op>::Exec;
+ case Type::UINT16:
+ return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
+ case Type::INT32:
+ return KernelGenerator<Int32Type, Int32Type, Op>::Exec;
+ case Type::UINT32:
+ return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
+ case Type::INT64:
+ return KernelGenerator<Int64Type, Int64Type, Op>::Exec;
+ case Type::UINT64:
+ return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+template <template <typename... Args> class KernelGenerator, typename Op>
+ArrayKernelExec GenerateArithmeticFloatingPoint(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::FLOAT:
+ return KernelGenerator<FloatType, FloatType, Op>::Exec;
+ case Type::DOUBLE:
+ return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+Status CastBinaryDecimalArgs(const std::string& func_name,
+ std::vector<ValueDescr>* values) {
+ auto& left_type = (*values)[0].type;
+ auto& right_type = (*values)[1].type;
+ DCHECK(is_decimal(left_type->id()) || is_decimal(right_type->id()));
+
+ // decimal + float = float
+ if (is_floating(left_type->id())) {
+ right_type = left_type;
+ return Status::OK();
+ } else if (is_floating(right_type->id())) {
+ left_type = right_type;
+ return Status::OK();
+ }
+
+ // precision, scale of left and right args
+ int32_t p1, s1, p2, s2;
+
+ // decimal + integer = decimal
+ if (is_decimal(left_type->id())) {
+ auto decimal = checked_cast<const DecimalType*>(left_type.get());
+ p1 = decimal->precision();
+ s1 = decimal->scale();
+ } else {
+ DCHECK(is_integer(left_type->id()));
+    // decimal digits needed to hold any value of the integer type
+    p1 = static_cast<int32_t>(std::ceil(bit_width(left_type->id()) * std::log10(2.0)));
+ s1 = 0;
+ }
+ if (is_decimal(right_type->id())) {
+ auto decimal = checked_cast<const DecimalType*>(right_type.get());
+ p2 = decimal->precision();
+ s2 = decimal->scale();
+ } else {
+ DCHECK(is_integer(right_type->id()));
+    // decimal digits needed to hold any value of the integer type
+    p2 = static_cast<int32_t>(std::ceil(bit_width(right_type->id()) * std::log10(2.0)));
+ s2 = 0;
+ }
+ if (s1 < 0 || s2 < 0) {
+ return Status::NotImplemented("Decimals with negative scales not supported");
+ }
+
+ // decimal128 + decimal256 = decimal256
+ Type::type casted_type_id = Type::DECIMAL128;
+ if (left_type->id() == Type::DECIMAL256 || right_type->id() == Type::DECIMAL256) {
+ casted_type_id = Type::DECIMAL256;
+ }
+
+  // decimal promotion rules compatible with Amazon Redshift
+ // https://docs.aws.amazon.com/redshift/latest/dg/r_numeric_computations201.html
+ int32_t left_scaleup, right_scaleup;
+
+ // "add_checked" -> "add"
+ const std::string op = func_name.substr(0, func_name.find("_"));
+ if (op == "add" || op == "subtract") {
+ left_scaleup = std::max(s1, s2) - s1;
+ right_scaleup = std::max(s1, s2) - s2;
+ } else if (op == "multiply") {
+ left_scaleup = right_scaleup = 0;
+ } else if (op == "divide") {
+ left_scaleup = std::max(4, s1 + p2 - s2 + 1) + s2 - s1;
+ right_scaleup = 0;
+ } else {
+ return Status::Invalid("Invalid decimal function: ", func_name);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ left_type, DecimalType::Make(casted_type_id, p1 + left_scaleup, s1 + left_scaleup));
+ ARROW_ASSIGN_OR_RAISE(right_type, DecimalType::Make(casted_type_id, p2 + right_scaleup,
+ s2 + right_scaleup));
+ return Status::OK();
+}
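+
+// Worked example for "add": decimal128(5, 2) + decimal128(7, 4) rescales the
+// left operand by max(2, 4) - 2 = 2, casting it to decimal128(7, 4); the
+// right operand is unchanged. Both sides then share scale 4, as the kernels
+// require.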
+
+// resolve decimal binary operation output type per *casted* args
+template <typename OutputGetter>
+Result<ValueDescr> ResolveDecimalBinaryOperationOutput(
+ const std::vector<ValueDescr>& args, OutputGetter&& getter) {
+ // casted args should be same size decimals
+ auto left_type = checked_cast<const DecimalType*>(args[0].type.get());
+ auto right_type = checked_cast<const DecimalType*>(args[1].type.get());
+ DCHECK_EQ(left_type->id(), right_type->id());
+
+ int32_t precision, scale;
+ std::tie(precision, scale) = getter(left_type->precision(), left_type->scale(),
+ right_type->precision(), right_type->scale());
+ ARROW_ASSIGN_OR_RAISE(auto type, DecimalType::Make(left_type->id(), precision, scale));
+ return ValueDescr(std::move(type), GetBroadcastShape(args));
+}
+
+Result<ValueDescr> ResolveDecimalAdditionOrSubtractionOutput(
+ KernelContext*, const std::vector<ValueDescr>& args) {
+ return ResolveDecimalBinaryOperationOutput(
+ args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
+ DCHECK_EQ(s1, s2);
+ const int32_t scale = s1;
+ const int32_t precision = std::max(p1 - s1, p2 - s2) + scale + 1;
+ return std::make_pair(precision, scale);
+ });
+}
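+
+// Continuing the example: with both inputs cast to decimal128(7, 4), the
+// output type is decimal128(max(7 - 4, 7 - 4) + 4 + 1, 4) = decimal128(8, 4),
+// the extra integer digit absorbing a possible carry.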
+
+Result<ValueDescr> ResolveDecimalMultiplicationOutput(
+ KernelContext*, const std::vector<ValueDescr>& args) {
+ return ResolveDecimalBinaryOperationOutput(
+ args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
+ const int32_t scale = s1 + s2;
+ const int32_t precision = p1 + p2 + 1;
+ return std::make_pair(precision, scale);
+ });
+}
+
+Result<ValueDescr> ResolveDecimalDivisionOutput(KernelContext*,
+ const std::vector<ValueDescr>& args) {
+ return ResolveDecimalBinaryOperationOutput(
+ args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
+ DCHECK_GE(s1, s2);
+ const int32_t scale = s1 - s2;
+ const int32_t precision = p1;
+ return std::make_pair(precision, scale);
+ });
+}
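+
+// For division the cast step above already scaled the dividend up: e.g. for
+// decimal128(5, 2) / decimal128(7, 4) the dividend was cast to
+// decimal128(13, 10), so this resolver yields decimal128(13, 10 - 4), i.e.
+// decimal128(13, 6).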
+
+template <typename Op>
+void AddDecimalBinaryKernels(const std::string& name,
+ std::shared_ptr<ScalarFunction>* func) {
+ OutputType out_type(null());
+ const std::string op = name.substr(0, name.find("_"));
+ if (op == "add" || op == "subtract") {
+ out_type = OutputType(ResolveDecimalAdditionOrSubtractionOutput);
+ } else if (op == "multiply") {
+ out_type = OutputType(ResolveDecimalMultiplicationOutput);
+ } else if (op == "divide") {
+ out_type = OutputType(ResolveDecimalDivisionOutput);
+ } else {
+ DCHECK(false);
+ }
+
+ auto in_type128 = InputType(Type::DECIMAL128);
+ auto in_type256 = InputType(Type::DECIMAL256);
+ auto exec128 = ScalarBinaryNotNullEqualTypes<Decimal128Type, Decimal128Type, Op>::Exec;
+ auto exec256 = ScalarBinaryNotNullEqualTypes<Decimal256Type, Decimal256Type, Op>::Exec;
+ DCHECK_OK((*func)->AddKernel({in_type128, in_type128}, out_type, exec128));
+ DCHECK_OK((*func)->AddKernel({in_type256, in_type256}, out_type, exec256));
+}
+
+// Generate a kernel given an arithmetic functor
+template <template <typename...> class KernelGenerator, typename OutType, typename Op>
+ArrayKernelExec GenerateArithmeticWithFixedIntOutType(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return KernelGenerator<OutType, Int8Type, Op>::Exec;
+ case Type::UINT8:
+ return KernelGenerator<OutType, UInt8Type, Op>::Exec;
+ case Type::INT16:
+ return KernelGenerator<OutType, Int16Type, Op>::Exec;
+ case Type::UINT16:
+ return KernelGenerator<OutType, UInt16Type, Op>::Exec;
+ case Type::INT32:
+ return KernelGenerator<OutType, Int32Type, Op>::Exec;
+ case Type::UINT32:
+ return KernelGenerator<OutType, UInt32Type, Op>::Exec;
+ case Type::INT64:
+ case Type::TIMESTAMP:
+ return KernelGenerator<OutType, Int64Type, Op>::Exec;
+ case Type::UINT64:
+ return KernelGenerator<OutType, UInt64Type, Op>::Exec;
+ case Type::FLOAT:
+ return KernelGenerator<FloatType, FloatType, Op>::Exec;
+ case Type::DOUBLE:
+ return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+struct ArithmeticFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ RETURN_NOT_OK(CheckDecimals(values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+
+ // Only promote types for binary functions
+ if (values->size() == 2) {
+ ReplaceNullWithOtherType(values);
+
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ }
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+
+ Status CheckDecimals(std::vector<ValueDescr>* values) const {
+ bool has_decimal = false;
+ for (const auto& value : *values) {
+ if (is_decimal(value.type->id())) {
+ has_decimal = true;
+ break;
+ }
+ }
+ if (!has_decimal) return Status::OK();
+
+ if (values->size() == 2) {
+ return CastBinaryDecimalArgs(name(), values);
+ }
+ return Status::OK();
+ }
+};
+
+/// An ArithmeticFunction that promotes integer arguments to double.
+struct ArithmeticFloatingPointFunction : public ArithmeticFunction {
+ using ArithmeticFunction::ArithmeticFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+ RETURN_NOT_OK(CheckDecimals(values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+
+ if (values->size() == 2) {
+ ReplaceNullWithOtherType(values);
+ }
+
+ for (auto& descr : *values) {
+ if (is_integer(descr.type->id())) {
+ descr.type = float64();
+ }
+ }
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
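+
+// Dispatch illustration: under ArithmeticFunction, add(int8, int32) promotes
+// both arguments to the common numeric type int32; under
+// ArithmeticFloatingPointFunction, atan2(int32, float) first widens the
+// integer argument to double, so the call dispatches as (double, double).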
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeArithmeticFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = ArithmeticExecFromOp<ScalarBinaryEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
+ }
+ return func;
+}
+
+// Like MakeArithmeticFunction, but for arithmetic ops that must only be
+// evaluated on non-null (valid) elements.
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeArithmeticFunctionNotNull(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = ArithmeticExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = ArithmeticExecFromOp<ScalarUnary, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, ty, exec));
+ }
+ return func;
+}
+
+// Like MakeUnaryArithmeticFunction, but for unary arithmetic ops with a fixed
+// output type for integral inputs.
+template <typename Op, typename IntOutType>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionWithFixedIntOutType(
+ std::string name, const FunctionDoc* doc) {
+ auto int_out_ty = TypeTraits<IntOutType>::type_singleton();
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto out_ty = arrow::is_floating(ty->id()) ? ty : int_out_ty;
+ auto exec = GenerateArithmeticWithFixedIntOutType<ScalarUnary, IntOutType, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, out_ty, exec));
+ }
+ return func;
+}
+
+// Like MakeUnaryArithmeticFunction, but for arithmetic ops that must only be
+// evaluated on non-null (valid) elements.
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionNotNull(
+ std::string name, const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = ArithmeticExecFromOp<ScalarUnaryNotNull, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, ty, exec));
+ }
+ return func;
+}
+
+// Like MakeUnaryArithmeticFunction, but for signed arithmetic ops that must
+// only be evaluated on non-null (valid) elements.
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnarySignedArithmeticFunctionNotNull(
+ std::string name, const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ if (!arrow::is_unsigned_integer(ty->id())) {
+ auto exec = ArithmeticExecFromOp<ScalarUnaryNotNull, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, ty, exec));
+ }
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeBitWiseFunctionNotNull(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : IntTypes()) {
+ auto exec = TypeAgnosticBitWiseExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeShiftFunctionNotNull(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : IntTypes()) {
+ auto exec = ShiftExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionFloatingPoint(
+ std::string name, const FunctionDoc* doc) {
+ auto func =
+ std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : FloatingPointTypes()) {
+ auto output = is_integer(ty->id()) ? float64() : ty;
+ auto exec = GenerateArithmeticFloatingPoint<ScalarUnary, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, output, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionFloatingPointNotNull(
+ std::string name, const FunctionDoc* doc) {
+ auto func =
+ std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : FloatingPointTypes()) {
+ auto output = is_integer(ty->id()) ? float64() : ty;
+ auto exec = GenerateArithmeticFloatingPoint<ScalarUnaryNotNull, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, output, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeArithmeticFunctionFloatingPoint(
+ std::string name, const FunctionDoc* doc) {
+ auto func =
+ std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : FloatingPointTypes()) {
+ auto output = is_integer(ty->id()) ? float64() : ty;
+ auto exec = GenerateArithmeticFloatingPoint<ScalarBinaryEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, output, exec));
+ }
+ return func;
+}
+
+const FunctionDoc absolute_value_doc{
+ "Calculate the absolute value of the argument element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"abs_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x"}};
+
+const FunctionDoc absolute_value_checked_doc{
+ "Calculate the absolute value of the argument element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"abs\"."),
+ {"x"}};
+
+const FunctionDoc add_doc{"Add the arguments element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"add_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x", "y"}};
+
+const FunctionDoc add_checked_doc{
+ "Add the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"add\"."),
+ {"x", "y"}};
+
+const FunctionDoc sub_doc{"Subtract the arguments element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"subtract_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x", "y"}};
+
+const FunctionDoc sub_checked_doc{
+ "Subtract the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"subtract\"."),
+ {"x", "y"}};
+
+const FunctionDoc mul_doc{"Multiply the arguments element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"multiply_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x", "y"}};
+
+const FunctionDoc mul_checked_doc{
+ "Multiply the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"multiply\"."),
+ {"x", "y"}};
+
+const FunctionDoc div_doc{
+ "Divide the arguments element-wise",
+ ("Integer division by zero returns an error. However, integer overflow\n"
+ "wraps around, and floating-point division by zero returns an infinite.\n"
+ "Use function \"divide_checked\" if you want to get an error\n"
+ "in all the aforementioned cases."),
+ {"dividend", "divisor"}};
+
+const FunctionDoc div_checked_doc{
+ "Divide the arguments element-wise",
+ ("An error is returned when trying to divide by zero, or when\n"
+ "integer overflow is encountered."),
+ {"dividend", "divisor"}};
+
+const FunctionDoc negate_doc{"Negate the argument element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"negate_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x"}};
+
+const FunctionDoc negate_checked_doc{
+ "Negate the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"negate\"."),
+ {"x"}};
+
+const FunctionDoc pow_doc{
+ "Raise arguments to power element-wise",
+ ("Integer to negative integer power returns an error. However, integer overflow\n"
+ "wraps around. If either base or exponent is null the result will be null."),
+ {"base", "exponent"}};
+
+const FunctionDoc pow_checked_doc{
+ "Raise arguments to power element-wise",
+ ("An error is returned when integer to negative integer power is encountered,\n"
+ "or integer overflow is encountered."),
+ {"base", "exponent"}};
+
+const FunctionDoc sign_doc{
+ "Get the signedness of the arguments element-wise",
+ ("Output is any of (-1,1) for nonzero inputs and 0 for zero input.\n"
+ "NaN values return NaN. Integral values return signedness as Int8 and\n"
+ "floating-point values return it with the same type as the input values."),
+ {"x"}};
+
+const FunctionDoc bit_wise_not_doc{
+ "Bit-wise negate the arguments element-wise", "Null values return null.", {"x"}};
+
+const FunctionDoc bit_wise_and_doc{
+ "Bit-wise AND the arguments element-wise", "Null values return null.", {"x", "y"}};
+
+const FunctionDoc bit_wise_or_doc{
+ "Bit-wise OR the arguments element-wise", "Null values return null.", {"x", "y"}};
+
+const FunctionDoc bit_wise_xor_doc{
+ "Bit-wise XOR the arguments element-wise", "Null values return null.", {"x", "y"}};
+
+const FunctionDoc shift_left_doc{
+ "Left shift `x` by `y`",
+ ("This function will return `x` if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`.\n"
+ "The shift operates as if on the two's complement representation of the number. "
+ "In other words, this is equivalent to multiplying `x` by 2 to the power `y`, "
+ "even if overflow occurs.\n"
+ "Use function \"shift_left_checked\" if you want an invalid shift amount to "
+ "return an error."),
+ {"x", "y"}};
+
+const FunctionDoc shift_left_checked_doc{
+ "Left shift `x` by `y` with invalid shift check",
+ ("This function will raise an error if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`. "
+ "The shift operates as if on the two's complement representation of the number. "
+ "In other words, this is equivalent to multiplying `x` by 2 to the power `y`, "
+ "even if overflow occurs.\n"
+ "See \"shift_left\" for a variant that doesn't fail for an invalid shift amount."),
+ {"x", "y"}};
+
+const FunctionDoc shift_right_doc{
+ "Right shift `x` by `y`",
+ ("Perform a logical shift for unsigned `x` and an arithmetic shift for signed `x`.\n"
+ "This function will return `x` if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`.\n"
+ "Use function \"shift_right_checked\" if you want an invalid shift amount to return "
+ "an error."),
+ {"x", "y"}};
+
+const FunctionDoc shift_right_checked_doc{
+ "Right shift `x` by `y` with invalid shift check",
+ ("Perform a logical shift for unsigned `x` and an arithmetic shift for signed `x`.\n"
+ "This function will raise an error if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`.\n"
+ "See \"shift_right\" for a variant that doesn't fail for an invalid shift amount"),
+ {"x", "y"}};
+
+const FunctionDoc sin_doc{"Compute the sine of the arguments element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"sin_checked\"."),
+ {"x"}};
+
+const FunctionDoc sin_checked_doc{
+ "Compute the sine of the elements argument-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"sin\"."),
+ {"x"}};
+
+const FunctionDoc cos_doc{"Compute the cosine of the arguments element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"cos_checked\"."),
+ {"x"}};
+
+const FunctionDoc cos_checked_doc{
+ "Compute the cosine of the elements argument-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"cos\"."),
+ {"x"}};
+
+const FunctionDoc tan_doc{"Compute the tangent of the arguments element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"tan_checked\"."),
+ {"x"}};
+
+const FunctionDoc tan_checked_doc{
+ "Compute the tangent of the elements argument-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"tan\"."),
+ {"x"}};
+
+const FunctionDoc asin_doc{"Compute the inverse sine of the arguments element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"asin_checked\"."),
+ {"x"}};
+
+const FunctionDoc asin_checked_doc{
+ "Compute the inverse sine of the elements argument-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"asin\"."),
+ {"x"}};
+
+const FunctionDoc acos_doc{"Compute the inverse cosine of the arguments element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"acos_checked\"."),
+ {"x"}};
+
+const FunctionDoc acos_checked_doc{
+ "Compute the inverse cosine of the elements argument-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"acos\"."),
+ {"x"}};
+
+const FunctionDoc atan_doc{"Compute the principal value of the inverse tangent",
+ "Integer arguments return double values.",
+ {"x"}};
+
+const FunctionDoc atan2_doc{
+ "Compute the inverse tangent using argument signs to determine the quadrant",
+ "Integer arguments return double values.",
+ {"y", "x"}};
+
+const FunctionDoc ln_doc{
+ "Compute natural log of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"ln_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc ln_checked_doc{
+ "Compute natural log of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"ln\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc log10_doc{
+ "Compute log base 10 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log10_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc log10_checked_doc{
+ "Compute log base 10 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log10\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc log2_doc{
+ "Compute log base 2 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log2_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc log2_checked_doc{
+ "Compute log base 2 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log2\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc log1p_doc{
+ "Compute natural log of (1+x) element-wise",
+ ("Values <= -1 return -inf or NaN. Null values return null.\n"
+ "This function may be more precise than log(1 + x) for x close to zero."
+ "Use function \"log1p_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc log1p_checked_doc{
+ "Compute natural log of (1+x) element-wise",
+ ("Values <= -1 return -inf or NaN. Null values return null.\n"
+ "This function may be more precise than log(1 + x) for x close to zero."
+ "Use function \"log1p\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc floor_doc{
+ "Round down to the nearest integer",
+ ("Calculate the nearest integer less than or equal in magnitude to the "
+ "argument element-wise"),
+ {"x"}};
+
+const FunctionDoc ceil_doc{
+ "Round up to the nearest integer",
+ ("Calculate the nearest integer greater than or equal in magnitude to the "
+ "argument element-wise"),
+ {"x"}};
+
+const FunctionDoc trunc_doc{
+ "Get the integral part without fractional digits",
+ ("Calculate the nearest integer not greater in magnitude than to the "
+ "argument element-wise."),
+ {"x"}};
+} // namespace
+
+void RegisterScalarArithmetic(FunctionRegistry* registry) {
+ // ----------------------------------------------------------------------
+ auto absolute_value =
+ MakeUnaryArithmeticFunction<AbsoluteValue>("abs", &absolute_value_doc);
+ DCHECK_OK(registry->AddFunction(std::move(absolute_value)));
+
+ // ----------------------------------------------------------------------
+ auto absolute_value_checked = MakeUnaryArithmeticFunctionNotNull<AbsoluteValueChecked>(
+ "abs_checked", &absolute_value_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(absolute_value_checked)));
+
+ // ----------------------------------------------------------------------
+ auto add = MakeArithmeticFunction<Add>("add", &add_doc);
+ AddDecimalBinaryKernels<Add>("add", &add);
+ DCHECK_OK(registry->AddFunction(std::move(add)));
+
+ // ----------------------------------------------------------------------
+ auto add_checked =
+ MakeArithmeticFunctionNotNull<AddChecked>("add_checked", &add_checked_doc);
+ AddDecimalBinaryKernels<AddChecked>("add_checked", &add_checked);
+ DCHECK_OK(registry->AddFunction(std::move(add_checked)));
+
+ // ----------------------------------------------------------------------
+ auto subtract = MakeArithmeticFunction<Subtract>("subtract", &sub_doc);
+ AddDecimalBinaryKernels<Subtract>("subtract", &subtract);
+
+ // Add subtract(timestamp, timestamp) -> duration
+ for (auto unit : AllTimeUnits()) {
+ InputType in_type(match::TimestampTypeUnit(unit));
+ auto exec = ArithmeticExecFromOp<ScalarBinaryEqualTypes, Subtract>(Type::TIMESTAMP);
+ DCHECK_OK(subtract->AddKernel({in_type, in_type}, duration(unit), std::move(exec)));
+ }
+
+ DCHECK_OK(registry->AddFunction(std::move(subtract)));
+
+ // ----------------------------------------------------------------------
+ auto subtract_checked = MakeArithmeticFunctionNotNull<SubtractChecked>(
+ "subtract_checked", &sub_checked_doc);
+ AddDecimalBinaryKernels<SubtractChecked>("subtract_checked", &subtract_checked);
+ DCHECK_OK(registry->AddFunction(std::move(subtract_checked)));
+
+ // ----------------------------------------------------------------------
+ auto multiply = MakeArithmeticFunction<Multiply>("multiply", &mul_doc);
+ AddDecimalBinaryKernels<Multiply>("multiply", &multiply);
+ DCHECK_OK(registry->AddFunction(std::move(multiply)));
+
+ // ----------------------------------------------------------------------
+ auto multiply_checked = MakeArithmeticFunctionNotNull<MultiplyChecked>(
+ "multiply_checked", &mul_checked_doc);
+ AddDecimalBinaryKernels<MultiplyChecked>("multiply_checked", &multiply_checked);
+ DCHECK_OK(registry->AddFunction(std::move(multiply_checked)));
+
+ // ----------------------------------------------------------------------
+ auto divide = MakeArithmeticFunctionNotNull<Divide>("divide", &div_doc);
+ AddDecimalBinaryKernels<Divide>("divide", &divide);
+ DCHECK_OK(registry->AddFunction(std::move(divide)));
+
+ // ----------------------------------------------------------------------
+ auto divide_checked =
+ MakeArithmeticFunctionNotNull<DivideChecked>("divide_checked", &div_checked_doc);
+ AddDecimalBinaryKernels<DivideChecked>("divide_checked", &divide_checked);
+ DCHECK_OK(registry->AddFunction(std::move(divide_checked)));
+
+ // ----------------------------------------------------------------------
+ auto negate = MakeUnaryArithmeticFunction<Negate>("negate", &negate_doc);
+ DCHECK_OK(registry->AddFunction(std::move(negate)));
+
+ // ----------------------------------------------------------------------
+ auto negate_checked = MakeUnarySignedArithmeticFunctionNotNull<NegateChecked>(
+ "negate_checked", &negate_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(negate_checked)));
+
+ // ----------------------------------------------------------------------
+ auto power = MakeArithmeticFunction<Power>("power", &pow_doc);
+ DCHECK_OK(registry->AddFunction(std::move(power)));
+
+ // ----------------------------------------------------------------------
+ auto power_checked =
+ MakeArithmeticFunctionNotNull<PowerChecked>("power_checked", &pow_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(power_checked)));
+
+ // ----------------------------------------------------------------------
+ auto sign =
+ MakeUnaryArithmeticFunctionWithFixedIntOutType<Sign, Int8Type>("sign", &sign_doc);
+ DCHECK_OK(registry->AddFunction(std::move(sign)));
+
+ // ----------------------------------------------------------------------
+ // Bitwise functions
+ {
+ auto bit_wise_not = std::make_shared<ArithmeticFunction>(
+ "bit_wise_not", Arity::Unary(), &bit_wise_not_doc);
+ for (const auto& ty : IntTypes()) {
+ auto exec = TypeAgnosticBitWiseExecFromOp<ScalarUnaryNotNull, BitWiseNot>(ty);
+ DCHECK_OK(bit_wise_not->AddKernel({ty}, ty, exec));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_not)));
+ }
+
+ auto bit_wise_and =
+ MakeBitWiseFunctionNotNull<BitWiseAnd>("bit_wise_and", &bit_wise_and_doc);
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_and)));
+
+ auto bit_wise_or =
+ MakeBitWiseFunctionNotNull<BitWiseOr>("bit_wise_or", &bit_wise_or_doc);
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_or)));
+
+ auto bit_wise_xor =
+ MakeBitWiseFunctionNotNull<BitWiseXor>("bit_wise_xor", &bit_wise_xor_doc);
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_xor)));
+
+ auto shift_left = MakeShiftFunctionNotNull<ShiftLeft>("shift_left", &shift_left_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_left)));
+
+ auto shift_left_checked = MakeShiftFunctionNotNull<ShiftLeftChecked>(
+ "shift_left_checked", &shift_left_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_left_checked)));
+
+ auto shift_right =
+ MakeShiftFunctionNotNull<ShiftRight>("shift_right", &shift_right_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_right)));
+
+ auto shift_right_checked = MakeShiftFunctionNotNull<ShiftRightChecked>(
+ "shift_right_checked", &shift_right_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_right_checked)));
+
+ // ----------------------------------------------------------------------
+ // Trig functions
+ auto sin = MakeUnaryArithmeticFunctionFloatingPoint<Sin>("sin", &sin_doc);
+ DCHECK_OK(registry->AddFunction(std::move(sin)));
+
+ auto sin_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<SinChecked>(
+ "sin_checked", &sin_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(sin_checked)));
+
+ auto cos = MakeUnaryArithmeticFunctionFloatingPoint<Cos>("cos", &cos_doc);
+ DCHECK_OK(registry->AddFunction(std::move(cos)));
+
+ auto cos_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<CosChecked>(
+ "cos_checked", &cos_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(cos_checked)));
+
+ auto tan = MakeUnaryArithmeticFunctionFloatingPoint<Tan>("tan", &tan_doc);
+ DCHECK_OK(registry->AddFunction(std::move(tan)));
+
+ auto tan_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<TanChecked>(
+ "tan_checked", &tan_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(tan_checked)));
+
+ auto asin = MakeUnaryArithmeticFunctionFloatingPoint<Asin>("asin", &asin_doc);
+ DCHECK_OK(registry->AddFunction(std::move(asin)));
+
+ auto asin_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<AsinChecked>(
+ "asin_checked", &asin_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(asin_checked)));
+
+ auto acos = MakeUnaryArithmeticFunctionFloatingPoint<Acos>("acos", &acos_doc);
+ DCHECK_OK(registry->AddFunction(std::move(acos)));
+
+ auto acos_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<AcosChecked>(
+ "acos_checked", &acos_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(acos_checked)));
+
+ auto atan = MakeUnaryArithmeticFunctionFloatingPoint<Atan>("atan", &atan_doc);
+ DCHECK_OK(registry->AddFunction(std::move(atan)));
+
+ auto atan2 = MakeArithmeticFunctionFloatingPoint<Atan2>("atan2", &atan2_doc);
+ DCHECK_OK(registry->AddFunction(std::move(atan2)));
+
+ // ----------------------------------------------------------------------
+ // Logarithms
+ auto ln = MakeUnaryArithmeticFunctionFloatingPoint<LogNatural>("ln", &ln_doc);
+ DCHECK_OK(registry->AddFunction(std::move(ln)));
+
+ auto ln_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<LogNaturalChecked>(
+ "ln_checked", &ln_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(ln_checked)));
+
+ auto log10 = MakeUnaryArithmeticFunctionFloatingPoint<Log10>("log10", &log10_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log10)));
+
+ auto log10_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log10Checked>(
+ "log10_checked", &log10_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log10_checked)));
+
+ auto log2 = MakeUnaryArithmeticFunctionFloatingPoint<Log2>("log2", &log2_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log2)));
+
+ auto log2_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log2Checked>(
+ "log2_checked", &log2_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log2_checked)));
+
+ auto log1p = MakeUnaryArithmeticFunctionFloatingPoint<Log1p>("log1p", &log1p_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log1p)));
+
+ auto log1p_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log1pChecked>(
+ "log1p_checked", &log1p_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log1p_checked)));
+
+ // ----------------------------------------------------------------------
+ // Rounding functions
+ auto floor = MakeUnaryArithmeticFunctionFloatingPoint<Floor>("floor", &floor_doc);
+ DCHECK_OK(registry->AddFunction(std::move(floor)));
+
+ auto ceil = MakeUnaryArithmeticFunctionFloatingPoint<Ceil>("ceil", &ceil_doc);
+ DCHECK_OK(registry->AddFunction(std::move(ceil)));
+
+ auto trunc = MakeUnaryArithmeticFunctionFloatingPoint<Trunc>("trunc", &trunc_doc);
+ DCHECK_OK(registry->AddFunction(std::move(trunc)));
+}
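+
+// Minimal usage sketch (illustrative only): once registered, these kernels
+// are reached through the generic compute entry point, e.g.
+//   ARROW_ASSIGN_OR_RAISE(arrow::Datum sum,
+//                         arrow::compute::CallFunction("add", {lhs, rhs}));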
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc
new file mode 100644
index 00000000000..7a0e3654edb
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc
@@ -0,0 +1,563 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap.h"
+#include "arrow/util/bitmap_ops.h"
+
+namespace arrow {
+
+using internal::Bitmap;
+
+namespace compute {
+
+namespace {
+
+template <typename ComputeWord>
+void ComputeKleene(ComputeWord&& compute_word, KernelContext* ctx, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ DCHECK(left.null_count != 0 || right.null_count != 0)
+ << "ComputeKleene is unnecessarily expensive for the non-null case";
+
+ Bitmap left_valid_bm{left.buffers[0], left.offset, left.length};
+ Bitmap left_data_bm{left.buffers[1], left.offset, left.length};
+
+ Bitmap right_valid_bm{right.buffers[0], right.offset, right.length};
+ Bitmap right_data_bm{right.buffers[1], right.offset, right.length};
+
+ std::array<Bitmap, 2> out_bms{Bitmap(out->buffers[0], out->offset, out->length),
+ Bitmap(out->buffers[1], out->offset, out->length)};
+
+ auto apply = [&](uint64_t left_valid, uint64_t left_data, uint64_t right_valid,
+ uint64_t right_data, uint64_t* out_validity, uint64_t* out_data) {
+ auto left_true = left_valid & left_data;
+ auto left_false = left_valid & ~left_data;
+
+ auto right_true = right_valid & right_data;
+ auto right_false = right_valid & ~right_data;
+
+ compute_word(left_true, left_false, right_true, right_false, out_validity, out_data);
+ };
+
+ if (right.null_count == 0) {
+ std::array<Bitmap, 3> in_bms{left_valid_bm, left_data_bm, right_data_bm};
+ Bitmap::VisitWordsAndWrite(
+ in_bms, &out_bms,
+ [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+ apply(in[0], in[1], ~uint64_t(0), in[2], &(out->at(0)), &(out->at(1)));
+ });
+ return;
+ }
+
+ if (left.null_count == 0) {
+ std::array<Bitmap, 3> in_bms{left_data_bm, right_valid_bm, right_data_bm};
+ Bitmap::VisitWordsAndWrite(
+ in_bms, &out_bms,
+ [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+ apply(~uint64_t(0), in[0], in[1], in[2], &(out->at(0)), &(out->at(1)));
+ });
+ return;
+ }
+
+ DCHECK(left.null_count != 0 && right.null_count != 0);
+ std::array<Bitmap, 4> in_bms{left_valid_bm, left_data_bm, right_valid_bm,
+ right_data_bm};
+ Bitmap::VisitWordsAndWrite(
+ in_bms, &out_bms,
+ [&](const std::array<uint64_t, 4>& in, std::array<uint64_t, 2>* out) {
+ apply(in[0], in[1], in[2], in[3], &(out->at(0)), &(out->at(1)));
+ });
+}
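+
+// Recap of Kleene (three-valued) logic as used by the kernels below: null
+// means "unknown", so a slot is valid whenever the known operands alone
+// determine the result. For AND:
+//   false AND null -> false  (false forces the result)
+//   true  AND null -> null
+//   null  AND null -> null
+// The word-wise compute_word lambdas encode exactly this via the
+// left_true/left_false and right_true/right_false masks.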
+
+inline BooleanScalar InvertScalar(const Scalar& in) {
+ return in.is_valid ? BooleanScalar(!checked_cast<const BooleanScalar&>(in).value)
+ : BooleanScalar();
+}
+
+inline Bitmap GetBitmap(const ArrayData& arr, int index) {
+ return Bitmap{arr.buffers[index], arr.offset, arr.length};
+}
+
+struct InvertOp {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ *checked_cast<BooleanScalar*>(out) = InvertScalar(in);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
+ GetBitmap(*out, 1).CopyFromInverted(GetBitmap(in, 1));
+ return Status::OK();
+ }
+};
+
+template <typename Op>
+struct Commutative {
+ static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
+ ArrayData* out) {
+ return Op::Call(ctx, right, left, out);
+ }
+};
+
+struct AndOp : Commutative<AndOp> {
+ using Commutative<AndOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ if (left.is_valid && right.is_valid) {
+ checked_cast<BooleanScalar*>(out)->value =
+ checked_cast<const BooleanScalar&>(left).value &&
+ checked_cast<const BooleanScalar&>(right).value;
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ if (right.is_valid) {
+ checked_cast<const BooleanScalar&>(right).value
+ ? GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1))
+ : GetBitmap(*out, 1).SetBitsTo(false);
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ ::arrow::internal::BitmapAnd(left.buffers[1]->data(), left.offset,
+ right.buffers[1]->data(), right.offset, right.length,
+ out->offset, out->buffers[1]->mutable_data());
+ return Status::OK();
+ }
+};
+
+struct KleeneAndOp : Commutative<KleeneAndOp> {
+ using Commutative<KleeneAndOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
+ bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
+
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ checked_cast<BooleanScalar*>(out)->value = left_true && right_true;
+ out->is_valid = left_false || right_false || (left_true && right_true);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ if (right_false) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ GetBitmap(*out, 1).SetBitsTo(false); // all false case
+ return Status::OK();
+ }
+
+ if (right_true) {
+ if (left.GetNullCount() == 0) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ } else {
+ GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0));
+ }
+ GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ return Status::OK();
+ }
+
+ // scalar was null: out[i] is valid iff left[i] was false
+ if (left.GetNullCount() == 0) {
+ ::arrow::internal::InvertBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[0]->mutable_data(), out->offset);
+ } else {
+ ::arrow::internal::BitmapAndNot(left.buffers[0]->data(), left.offset,
+ left.buffers[1]->data(), left.offset, left.length,
+ out->offset, out->buffers[0]->mutable_data());
+ }
+ ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[1]->mutable_data(), out->offset);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
+ out->null_count = 0;
+      // Kleene kernels have the validity bitmap pre-allocated; set all bits to 1
+ BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
+ return AndOp::Call(ctx, left, right, out);
+ }
+ auto compute_word = [](uint64_t left_true, uint64_t left_false, uint64_t right_true,
+ uint64_t right_false, uint64_t* out_valid,
+ uint64_t* out_data) {
+ *out_data = left_true & right_true;
+ *out_valid = left_false | right_false | (left_true & right_true);
+ };
+ ComputeKleene(compute_word, ctx, left, right, out);
+ return Status::OK();
+ }
+};
+
+struct OrOp : Commutative<OrOp> {
+ using Commutative<OrOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ if (left.is_valid && right.is_valid) {
+ checked_cast<BooleanScalar*>(out)->value =
+ checked_cast<const BooleanScalar&>(left).value ||
+ checked_cast<const BooleanScalar&>(right).value;
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ if (right.is_valid) {
+ checked_cast<const BooleanScalar&>(right).value
+ ? GetBitmap(*out, 1).SetBitsTo(true)
+ : GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ ::arrow::internal::BitmapOr(left.buffers[1]->data(), left.offset,
+ right.buffers[1]->data(), right.offset, right.length,
+ out->offset, out->buffers[1]->mutable_data());
+ return Status::OK();
+ }
+};
+
+struct KleeneOrOp : Commutative<KleeneOrOp> {
+ using Commutative<KleeneOrOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
+ bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
+
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ checked_cast<BooleanScalar*>(out)->value = left_true || right_true;
+ out->is_valid = left_true || right_true || (left_false && right_false);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ if (right_true) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ GetBitmap(*out, 1).SetBitsTo(true); // all true case
+ return Status::OK();
+ }
+
+ if (right_false) {
+ if (left.GetNullCount() == 0) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ } else {
+ GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0));
+ }
+ GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ return Status::OK();
+ }
+
+ // scalar was null: out[i] is valid iff left[i] was true
+ if (left.GetNullCount() == 0) {
+ ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[0]->mutable_data(), out->offset);
+ } else {
+ ::arrow::internal::BitmapAnd(left.buffers[0]->data(), left.offset,
+ left.buffers[1]->data(), left.offset, left.length,
+ out->offset, out->buffers[0]->mutable_data());
+ }
+ ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[1]->mutable_data(), out->offset);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
+ out->null_count = 0;
+      // Kleene kernels have the validity bitmap pre-allocated; set all bits to 1
+ BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
+ return OrOp::Call(ctx, left, right, out);
+ }
+
+ static auto compute_word = [](uint64_t left_true, uint64_t left_false,
+ uint64_t right_true, uint64_t right_false,
+ uint64_t* out_valid, uint64_t* out_data) {
+ *out_data = left_true | right_true;
+ *out_valid = left_true | right_true | (left_false & right_false);
+ };
+
+ ComputeKleene(compute_word, ctx, left, right, out);
+ return Status::OK();
+ }
+};
+
+struct XorOp : Commutative<XorOp> {
+ using Commutative<XorOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ if (left.is_valid && right.is_valid) {
+ checked_cast<BooleanScalar*>(out)->value =
+ checked_cast<const BooleanScalar&>(left).value ^
+ checked_cast<const BooleanScalar&>(right).value;
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ if (right.is_valid) {
+ checked_cast<const BooleanScalar&>(right).value
+ ? GetBitmap(*out, 1).CopyFromInverted(GetBitmap(left, 1))
+ : GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ ::arrow::internal::BitmapXor(left.buffers[1]->data(), left.offset,
+ right.buffers[1]->data(), right.offset, right.length,
+ out->offset, out->buffers[1]->mutable_data());
+ return Status::OK();
+ }
+};
+
+struct AndNotOp {
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ return AndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
+ ArrayData* out) {
+ if (left.is_valid) {
+ checked_cast<const BooleanScalar&>(left).value
+ ? GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1))
+ : GetBitmap(*out, 1).SetBitsTo(false);
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ return AndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ ::arrow::internal::BitmapAndNot(left.buffers[1]->data(), left.offset,
+ right.buffers[1]->data(), right.offset, right.length,
+ out->offset, out->buffers[1]->mutable_data());
+ return Status::OK();
+ }
+};
+
+struct KleeneAndNotOp {
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ return KleeneAndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
+ ArrayData* out) {
+ bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
+ bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
+
+ if (left_false) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ GetBitmap(*out, 1).SetBitsTo(false); // all false case
+ return Status::OK();
+ }
+
+ if (left_true) {
+ if (right.GetNullCount() == 0) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ } else {
+ GetBitmap(*out, 0).CopyFrom(GetBitmap(right, 0));
+ }
+ GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1));
+ return Status::OK();
+ }
+
+ // scalar was null: out[i] is valid iff right[i] was true
+ if (right.GetNullCount() == 0) {
+ ::arrow::internal::CopyBitmap(right.buffers[1]->data(), right.offset, right.length,
+ out->buffers[0]->mutable_data(), out->offset);
+ } else {
+ ::arrow::internal::BitmapAnd(right.buffers[0]->data(), right.offset,
+ right.buffers[1]->data(), right.offset, right.length,
+ out->offset, out->buffers[0]->mutable_data());
+ }
+ ::arrow::internal::InvertBitmap(right.buffers[1]->data(), right.offset, right.length,
+ out->buffers[1]->mutable_data(), out->offset);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ return KleeneAndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
+ out->null_count = 0;
+      // Kleene kernels have the validity bitmap pre-allocated; set all bits to 1
+ BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
+ return AndNotOp::Call(ctx, left, right, out);
+ }
+
+ static auto compute_word = [](uint64_t left_true, uint64_t left_false,
+ uint64_t right_true, uint64_t right_false,
+ uint64_t* out_valid, uint64_t* out_data) {
+ *out_data = left_true & right_false;
+ *out_valid = left_false | right_true | (left_true & right_false);
+ };
+
+ ComputeKleene(compute_word, ctx, left, right, out);
+ return Status::OK();
+ }
+};
+
+void MakeFunction(const std::string& name, int arity, ArrayKernelExec exec,
+ const FunctionDoc* doc, FunctionRegistry* registry,
+ NullHandling::type null_handling = NullHandling::INTERSECTION) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity(arity), doc);
+
+ // Scalar arguments not yet supported
+ std::vector<InputType> in_types(arity, InputType(boolean()));
+ ScalarKernel kernel(std::move(in_types), boolean(), exec);
+ kernel.null_handling = null_handling;
+
+ DCHECK_OK(func->AddKernel(kernel));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+const FunctionDoc invert_doc{"Invert boolean values", "", {"values"}};
+
+const FunctionDoc and_doc{
+ "Logical 'and' boolean values",
+ ("When a null is encountered in either input, a null is output.\n"
+ "For a different null behavior, see function \"and_kleene\"."),
+ {"x", "y"}};
+
+const FunctionDoc and_not_doc{
+ "Logical 'and not' boolean values",
+ ("When a null is encountered in either input, a null is output.\n"
+ "For a different null behavior, see function \"and_not_kleene\"."),
+ {"x", "y"}};
+
+const FunctionDoc or_doc{
+ "Logical 'or' boolean values",
+ ("When a null is encountered in either input, a null is output.\n"
+ "For a different null behavior, see function \"or_kleene\"."),
+ {"x", "y"}};
+
+const FunctionDoc xor_doc{
+ "Logical 'xor' boolean values",
+ ("When a null is encountered in either input, a null is output."),
+ {"x", "y"}};
+
+const FunctionDoc and_kleene_doc{
+ "Logical 'and' boolean values (Kleene logic)",
+ ("This function behaves as follows with nulls:\n\n"
+ "- true and null = null\n"
+ "- null and true = null\n"
+ "- false and null = false\n"
+ "- null and false = false\n"
+ "- null and null = null\n"
+ "\n"
+ "In other words, in this context a null value really means \"unknown\",\n"
+ "and an unknown value 'and' false is always false.\n"
+ "For a different null behavior, see function \"and\"."),
+ {"x", "y"}};
+
+const FunctionDoc and_not_kleene_doc{
+ "Logical 'and not' boolean values (Kleene logic)",
+ ("This function behaves as follows with nulls:\n\n"
+     "- true and not null = null\n"
+     "- null and not false = null\n"
+     "- false and not null = false\n"
+     "- null and not true = false\n"
+     "- null and not null = null\n"
+ "\n"
+ "In other words, in this context a null value really means \"unknown\",\n"
+ "and an unknown value 'and not' true is always false, as is false\n"
+ "'and not' an unknown value.\n"
+ "For a different null behavior, see function \"and_not\"."),
+ {"x", "y"}};
+
+const FunctionDoc or_kleene_doc{
+ "Logical 'or' boolean values (Kleene logic)",
+ ("This function behaves as follows with nulls:\n\n"
+ "- true or null = true\n"
+     "- null or true = true\n"
+     "- false or null = null\n"
+     "- null or false = null\n"
+     "- null or null = null\n"
+ "\n"
+ "In other words, in this context a null value really means \"unknown\",\n"
+ "and an unknown value 'or' true is always true.\n"
+     "For a different null behavior, see function \"or\"."),
+ {"x", "y"}};
+
+} // namespace
+
+namespace internal {
+
+void RegisterScalarBoolean(FunctionRegistry* registry) {
+ // These functions can write into sliced output bitmaps
+ MakeFunction("invert", 1, applicator::SimpleUnary<InvertOp>, &invert_doc, registry);
+ MakeFunction("and", 2, applicator::SimpleBinary<AndOp>, &and_doc, registry);
+ MakeFunction("and_not", 2, applicator::SimpleBinary<AndNotOp>, &and_not_doc, registry);
+ MakeFunction("or", 2, applicator::SimpleBinary<OrOp>, &or_doc, registry);
+ MakeFunction("xor", 2, applicator::SimpleBinary<XorOp>, &xor_doc, registry);
+
+ MakeFunction("and_kleene", 2, applicator::SimpleBinary<KleeneAndOp>, &and_kleene_doc,
+ registry, NullHandling::COMPUTED_PREALLOCATE);
+ MakeFunction("and_not_kleene", 2, applicator::SimpleBinary<KleeneAndNotOp>,
+ &and_not_kleene_doc, registry, NullHandling::COMPUTED_PREALLOCATE);
+ MakeFunction("or_kleene", 2, applicator::SimpleBinary<KleeneOrOp>, &or_kleene_doc,
+ registry, NullHandling::COMPUTED_PREALLOCATE);
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
new file mode 100644
index 00000000000..dad94c1ace7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Cast types to boolean
+
+#include "arrow/array/builder_primitive.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/util/value_parsing.h"
+
+namespace arrow {
+
+using internal::ParseValue;
+
+namespace compute {
+namespace internal {
+
+struct IsNonZero {
+ template <typename OutValue, typename Arg0Value>
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ return val != 0;
+ }
+};
+
+struct ParseBooleanString {
+ template <typename OutValue, typename Arg0Value>
+ static OutValue Call(KernelContext*, Arg0Value val, Status* st) {
+ bool result = false;
+ if (ARROW_PREDICT_FALSE(!ParseValue<BooleanType>(val.data(), val.size(), &result))) {
+ *st = Status::Invalid("Failed to parse value: ", val);
+ }
+ return result;
+ }
+};
+
+std::vector<std::shared_ptr<CastFunction>> GetBooleanCasts() {
+ auto func = std::make_shared<CastFunction>("cast_boolean", Type::BOOL);
+ AddCommonCasts(Type::BOOL, boolean(), func.get());
+ AddZeroCopyCast(Type::BOOL, boolean(), boolean(), func.get());
+
+ for (const auto& ty : NumericTypes()) {
+ ArrayKernelExec exec =
+ GenerateNumeric<applicator::ScalarUnary, BooleanType, IsNonZero>(*ty);
+ DCHECK_OK(func->AddKernel(ty->id(), {ty}, boolean(), exec));
+ }
+ for (const auto& ty : BaseBinaryTypes()) {
+ ArrayKernelExec exec = GenerateVarBinaryBase<applicator::ScalarUnaryNotNull,
+ BooleanType, ParseBooleanString>(*ty);
+ DCHECK_OK(func->AddKernel(ty->id(), {ty}, boolean(), exec));
+ }
+ return {func};
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
new file mode 100644
index 00000000000..b1e1164fd34
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
@@ -0,0 +1,126 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implementation of casting to dictionary type
+
+#include <arrow/util/bitmap_ops.h>
+#include <arrow/util/checked_cast.h>
+
+#include "arrow/array/builder_primitive.h"
+#include "arrow/compute/cast_internal.h"
+#include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/util/int_util.h"
+
+namespace arrow {
+using internal::CopyBitmap;
+
+namespace compute {
+namespace internal {
+
+Status CastDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const CastOptions& options = CastState::Get(ctx);
+ auto out_type = std::static_pointer_cast<DictionaryType>(out->type());
+
+  // If the output type is the same as the input type, return the input as-is
+ if (out_type->Equals(batch[0].type())) {
+ *out = batch[0];
+ return Status::OK();
+ }
+
+ if (batch[0].is_scalar()) { // if input is scalar
+ auto in_scalar = checked_cast<const DictionaryScalar&>(*batch[0].scalar());
+
+ // if invalid scalar, return null scalar
+ if (!in_scalar.is_valid) {
+ *out = MakeNullScalar(out_type);
+ return Status::OK();
+ }
+
+ Datum casted_index, casted_dict;
+ if (in_scalar.value.index->type->Equals(out_type->index_type())) {
+ casted_index = in_scalar.value.index;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(casted_index,
+ Cast(in_scalar.value.index, out_type->index_type(), options,
+ ctx->exec_context()));
+ }
+
+ if (in_scalar.value.dictionary->type()->Equals(out_type->value_type())) {
+ casted_dict = in_scalar.value.dictionary;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ casted_dict, Cast(in_scalar.value.dictionary, out_type->value_type(), options,
+ ctx->exec_context()));
+ }
+
+ *out = std::static_pointer_cast<Scalar>(
+ DictionaryScalar::Make(casted_index.scalar(), casted_dict.make_array()));
+
+ return Status::OK();
+ }
+
+ // if input is array
+ const std::shared_ptr<ArrayData>& in_array = batch[0].array();
+ const auto& in_type = checked_cast<const DictionaryType&>(*in_array->type);
+
+ ArrayData* out_array = out->mutable_array();
+
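+  // Indices: when the index types match, reuse the input's index buffers
+  // zero-copy; otherwise cast them via the dummy ArrayData below.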
+ if (in_type.index_type()->Equals(out_type->index_type())) {
+ out_array->buffers[0] = in_array->buffers[0];
+ out_array->buffers[1] = in_array->buffers[1];
+ out_array->null_count = in_array->GetNullCount();
+ out_array->offset = in_array->offset;
+ } else {
+ // for indices, create a dummy ArrayData with index_type()
+ const std::shared_ptr<ArrayData>& indices_arr =
+ ArrayData::Make(in_type.index_type(), in_array->length, in_array->buffers,
+ in_array->GetNullCount(), in_array->offset);
+ ARROW_ASSIGN_OR_RAISE(auto casted_indices, Cast(indices_arr, out_type->index_type(),
+ options, ctx->exec_context()));
+ out_array->buffers[0] = std::move(casted_indices.array()->buffers[0]);
+ out_array->buffers[1] = std::move(casted_indices.array()->buffers[1]);
+ }
+
+ // data (dict)
+ if (in_type.value_type()->Equals(out_type->value_type())) {
+ out_array->dictionary = in_array->dictionary;
+ } else {
+ const std::shared_ptr<Array>& dict_arr = MakeArray(in_array->dictionary);
+ ARROW_ASSIGN_OR_RAISE(auto casted_data, Cast(dict_arr, out_type->value_type(),
+ options, ctx->exec_context()));
+ out_array->dictionary = casted_data.array();
+ }
+ return Status::OK();
+}
+
+std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts() {
+ auto func = std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
+
+ AddCommonCasts(Type::DICTIONARY, kOutputTargetType, func.get());
+ ScalarKernel kernel({InputType(Type::DICTIONARY)}, kOutputTargetType, CastDictionary);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+
+ DCHECK_OK(func->AddKernel(Type::DICTIONARY, std::move(kernel)));
+
+ return {func};
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
new file mode 100644
index 00000000000..198c82bd97e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
@@ -0,0 +1,285 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/compute/cast_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/extension_type.h"
+
+namespace arrow {
+
+using internal::PrimitiveScalarBase;
+
+namespace compute {
+namespace internal {
+
+// ----------------------------------------------------------------------
+
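+// Element-wise static_cast between primitive C types. UBSan's
+// float-cast-overflow check is disabled because potentially lossy casts are
+// checked separately (when requested) by the truncation checks in
+// scalar_cast_numeric.cc.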
+template <typename OutT, typename InT>
+ARROW_DISABLE_UBSAN("float-cast-overflow")
+void DoStaticCast(const void* in_data, int64_t in_offset, int64_t length,
+ int64_t out_offset, void* out_data) {
+ auto in = reinterpret_cast<const InT*>(in_data) + in_offset;
+ auto out = reinterpret_cast<OutT*>(out_data) + out_offset;
+ for (int64_t i = 0; i < length; ++i) {
+ *out++ = static_cast<OutT>(*in++);
+ }
+}
+
+using StaticCastFunc = std::function<void(const void*, int64_t, int64_t, int64_t, void*)>;
+
+template <typename OutType, typename InType, typename Enable = void>
+struct CastPrimitive {
+ static void Exec(const Datum& input, Datum* out) {
+ using OutT = typename OutType::c_type;
+ using InT = typename InType::c_type;
+
+ StaticCastFunc caster = DoStaticCast<OutT, InT>;
+ if (input.kind() == Datum::ARRAY) {
+ const ArrayData& arr = *input.array();
+ ArrayData* out_arr = out->mutable_array();
+ caster(arr.buffers[1]->data(), arr.offset, arr.length, out_arr->offset,
+ out_arr->buffers[1]->mutable_data());
+ } else {
+ // Scalar path. Use the caster with length 1 to place the casted value into
+ // the output
+ const auto& in_scalar = input.scalar_as<PrimitiveScalarBase>();
+ auto out_scalar = checked_cast<PrimitiveScalarBase*>(out->scalar().get());
+ caster(in_scalar.data(), /*in_offset=*/0, /*length=*/1, /*out_offset=*/0,
+ out_scalar->mutable_data());
+ }
+ }
+};
+
+template <typename OutType, typename InType>
+struct CastPrimitive<OutType, InType, enable_if_t<std::is_same<OutType, InType>::value>> {
+ // memcpy output
+ static void Exec(const Datum& input, Datum* out) {
+ using T = typename InType::c_type;
+
+ if (input.kind() == Datum::ARRAY) {
+ const ArrayData& arr = *input.array();
+ ArrayData* out_arr = out->mutable_array();
+ std::memcpy(
+ reinterpret_cast<T*>(out_arr->buffers[1]->mutable_data()) + out_arr->offset,
+ reinterpret_cast<const T*>(arr.buffers[1]->data()) + arr.offset,
+ arr.length * sizeof(T));
+ } else {
+ // Scalar path. Use the caster with length 1 to place the casted value into
+ // the output
+ const auto& in_scalar = input.scalar_as<PrimitiveScalarBase>();
+ auto out_scalar = checked_cast<PrimitiveScalarBase*>(out->scalar().get());
+ *reinterpret_cast<T*>(out_scalar->mutable_data()) =
+ *reinterpret_cast<const T*>(in_scalar.data());
+ }
+ }
+};
+
+template <typename InType>
+void CastNumberImpl(Type::type out_type, const Datum& input, Datum* out) {
+ switch (out_type) {
+ case Type::INT8:
+ return CastPrimitive<Int8Type, InType>::Exec(input, out);
+ case Type::INT16:
+ return CastPrimitive<Int16Type, InType>::Exec(input, out);
+ case Type::INT32:
+ return CastPrimitive<Int32Type, InType>::Exec(input, out);
+ case Type::INT64:
+ return CastPrimitive<Int64Type, InType>::Exec(input, out);
+ case Type::UINT8:
+ return CastPrimitive<UInt8Type, InType>::Exec(input, out);
+ case Type::UINT16:
+ return CastPrimitive<UInt16Type, InType>::Exec(input, out);
+ case Type::UINT32:
+ return CastPrimitive<UInt32Type, InType>::Exec(input, out);
+ case Type::UINT64:
+ return CastPrimitive<UInt64Type, InType>::Exec(input, out);
+ case Type::FLOAT:
+ return CastPrimitive<FloatType, InType>::Exec(input, out);
+ case Type::DOUBLE:
+ return CastPrimitive<DoubleType, InType>::Exec(input, out);
+ default:
+ break;
+ }
+}
+
+void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Datum& input,
+ Datum* out) {
+ switch (in_type) {
+ case Type::INT8:
+ return CastNumberImpl<Int8Type>(out_type, input, out);
+ case Type::INT16:
+ return CastNumberImpl<Int16Type>(out_type, input, out);
+ case Type::INT32:
+ return CastNumberImpl<Int32Type>(out_type, input, out);
+ case Type::INT64:
+ return CastNumberImpl<Int64Type>(out_type, input, out);
+ case Type::UINT8:
+ return CastNumberImpl<UInt8Type>(out_type, input, out);
+ case Type::UINT16:
+ return CastNumberImpl<UInt16Type>(out_type, input, out);
+ case Type::UINT32:
+ return CastNumberImpl<UInt32Type>(out_type, input, out);
+ case Type::UINT64:
+ return CastNumberImpl<UInt64Type>(out_type, input, out);
+ case Type::FLOAT:
+ return CastNumberImpl<FloatType>(out_type, input, out);
+ case Type::DOUBLE:
+ return CastNumberImpl<DoubleType>(out_type, input, out);
+ default:
+ DCHECK(false);
+ break;
+ }
+}
+
+// ----------------------------------------------------------------------
+
+Status UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK(out->is_array());
+
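+  // Decode by gathering the dictionary values with Take, then cast the result
+  // if the requested type differs from the dictionary's value type.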
+ DictionaryArray dict_arr(batch[0].array());
+ const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+
+ const auto& dict_type = *dict_arr.dictionary()->type();
+ if (!dict_type.Equals(options.to_type) && !CanCast(dict_type, *options.to_type)) {
+ return Status::Invalid("Cast type ", options.to_type->ToString(),
+ " incompatible with dictionary type ", dict_type.ToString());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(*out,
+ Take(Datum(dict_arr.dictionary()), Datum(dict_arr.indices()),
+ TakeOptions::Defaults(), ctx->exec_context()));
+
+ if (!dict_type.Equals(options.to_type)) {
+ ARROW_ASSIGN_OR_RAISE(*out, Cast(*out, options));
+ }
+ return Status::OK();
+}
+
+Status OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (out->is_scalar()) {
+ out->scalar()->is_valid = false;
+ } else {
+ ArrayData* output = out->mutable_array();
+ output->buffers = {nullptr};
+ output->null_count = batch.length;
+ }
+ return Status::OK();
+}
+
+Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const CastOptions& options = checked_cast<const CastState*>(ctx->state())->options;
+
+ const DataType& in_type = *batch[0].type();
+ const auto storage_type = checked_cast<const ExtensionType&>(in_type).storage_type();
+
+ ExtensionArray extension(batch[0].array());
+
+ Datum casted_storage;
+ RETURN_NOT_OK(Cast(*extension.storage(), out->type(), options, ctx->exec_context())
+ .Value(&casted_storage));
+ out->value = casted_storage.array();
+ return Status::OK();
+}
+
+Status CastFromNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (!batch[0].is_scalar()) {
+ ArrayData* output = out->mutable_array();
+ std::shared_ptr<Array> nulls;
+ RETURN_NOT_OK(MakeArrayOfNull(output->type, batch.length).Value(&nulls));
+ out->value = nulls->data();
+ }
+ return Status::OK();
+}
+
+Result<ValueDescr> ResolveOutputFromOptions(KernelContext* ctx,
+ const std::vector<ValueDescr>& args) {
+ const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+ return ValueDescr(options.to_type, args[0].shape);
+}
+
+/// You will see some kernels use
+///
+/// kOutputTargetType
+///
+/// for their output type resolution. This is somewhat of an eyesore, but the
+/// easiest initial way to get the requested cast type (including the TimeUnit,
+/// which the kernel needs to compute the output) to the kernel was through
+/// CastOptions.
+
+OutputType kOutputTargetType(ResolveOutputFromOptions);
+
+Status ZeroCopyCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+  // Zero-copy: share the input's buffers and child data with the output
+  // array; only the output type differs.
+ const ArrayData& input = *batch[0].array();
+ ArrayData* output = out->mutable_array();
+ output->length = input.length;
+ output->SetNullCount(input.null_count);
+ output->buffers = input.buffers;
+ output->offset = input.offset;
+ output->child_data = input.child_data;
+ return Status::OK();
+}
+
+void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_type,
+ CastFunction* func) {
+ auto sig = KernelSignature::Make({in_type}, out_type);
+ ScalarKernel kernel;
+ kernel.exec = TrivialScalarUnaryAsArraysExec(ZeroCopyCastExec);
+ kernel.signature = sig;
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ DCHECK_OK(func->AddKernel(in_type_id, std::move(kernel)));
+}
+
+static bool CanCastFromDictionary(Type::type type_id) {
+ return (is_primitive(type_id) || is_base_binary_like(type_id) ||
+ is_fixed_size_binary(type_id));
+}
+
+void AddCommonCasts(Type::type out_type_id, OutputType out_ty, CastFunction* func) {
+ // From null to this type
+ ScalarKernel kernel;
+ kernel.exec = CastFromNull;
+ kernel.signature = KernelSignature::Make({null()}, out_ty);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ DCHECK_OK(func->AddKernel(Type::NA, std::move(kernel)));
+
+ // From dictionary to this type
+ if (CanCastFromDictionary(out_type_id)) {
+ // Dictionary unpacking not implemented for boolean or nested types.
+ //
+ // XXX: Uses Take and does its own memory allocation for the moment. We can
+ // fix this later.
+ DCHECK_OK(func->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, out_ty,
+ TrivialScalarUnaryAsArraysExec(UnpackDictionary),
+ NullHandling::COMPUTED_NO_PREALLOCATE,
+ MemAllocation::NO_PREALLOCATE));
+ }
+
+ // From extension type to this type
+ DCHECK_OK(func->AddKernel(Type::EXTENSION, {InputType::Array(Type::EXTENSION)}, out_ty,
+ CastFromExtension, NullHandling::COMPUTED_NO_PREALLOCATE,
+ MemAllocation::NO_PREALLOCATE));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
new file mode 100644
index 00000000000..2419d898a68
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/cast.h" // IWYU pragma: export
+#include "arrow/compute/cast_internal.h" // IWYU pragma: export
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace compute {
+namespace internal {
+
+template <typename OutType, typename InType, typename Enable = void>
+struct CastFunctor {};
+
+// No-op functor for identity casts
+template <typename O, typename I>
+struct CastFunctor<
+ O, I, enable_if_t<std::is_same<O, I>::value && is_parameter_free_type<I>::value>> {
+ static Status Exec(KernelContext*, const ExecBatch&, Datum*) { return Status::OK(); }
+};
+
+Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+
+// Utility for numeric casts
+void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Datum& input,
+ Datum* out);
+
+// ----------------------------------------------------------------------
+// Dictionary to other things
+
+Status UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+
+Status OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+
+Status CastFromNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+
+// Adds a cast function where CastFunctor is specialized and the input and output
+// types are parameter free (have a type_singleton). Scalar inputs are handled by
+// wrapping with TrivialScalarUnaryAsArraysExec.
+template <typename InType, typename OutType>
+void AddSimpleCast(InputType in_ty, OutputType out_ty, CastFunction* func) {
+ DCHECK_OK(func->AddKernel(
+ InType::type_id, {in_ty}, out_ty,
+ TrivialScalarUnaryAsArraysExec(CastFunctor<OutType, InType>::Exec)));
+}
+
+Status ZeroCopyCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+
+void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_type,
+ CastFunction* func);
+
+// OutputType::Resolver that returns a descr with the shape of the input
+// argument and the type from CastOptions
+Result<ValueDescr> ResolveOutputFromOptions(KernelContext* ctx,
+ const std::vector<ValueDescr>& args);
+
+ARROW_EXPORT extern OutputType kOutputTargetType;
+
+// Add generic casts to out_ty from:
+// - the null type
+// - dictionary with out_ty as given value type
+// - extension types with a compatible storage type
+void AddCommonCasts(Type::type out_type_id, OutputType out_ty, CastFunction* func);
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
new file mode 100644
index 00000000000..ec92dbb5d60
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implementation of casting to (or between) list types
+
+#include <utility>
+#include <vector>
+
+#include "arrow/array/builder_nested.h"
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/cast.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/util/bitmap_ops.h"
+
+namespace arrow {
+
+using internal::CopyBitmap;
+
+namespace compute {
+namespace internal {
+
+template <typename Type>
+Status CastListExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ using offset_type = typename Type::offset_type;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+
+ const CastOptions& options = CastState::Get(ctx);
+
+ auto child_type = checked_cast<const Type&>(*out->type()).value_type();
+
+ if (out->kind() == Datum::SCALAR) {
+ const auto& in_scalar = checked_cast<const ScalarType&>(*batch[0].scalar());
+ auto out_scalar = checked_cast<ScalarType*>(out->scalar().get());
+
+ DCHECK(!out_scalar->is_valid);
+ if (in_scalar.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(out_scalar->value, Cast(*in_scalar.value, child_type, options,
+ ctx->exec_context()));
+
+ out_scalar->is_valid = true;
+ }
+ return Status::OK();
+ }
+
+ const ArrayData& in_array = *batch[0].array();
+ ArrayData* out_array = out->mutable_array();
+
+ // Copy from parent
+ out_array->buffers = in_array.buffers;
+ Datum values = in_array.child_data[0];
+
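+  // A sliced input's offsets point into the unsliced child array: rebuild them
+  // relative to offsets[0] and slice the child values to the covered range.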
+ if (in_array.offset != 0) {
+ if (in_array.buffers[0]) {
+ ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
+ CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(),
+ in_array.offset, in_array.length));
+ }
+ ARROW_ASSIGN_OR_RAISE(out_array->buffers[1],
+ ctx->Allocate(sizeof(offset_type) * (in_array.length + 1)));
+
+ auto offsets = in_array.GetValues<offset_type>(1);
+ auto shifted_offsets = out_array->GetMutableValues<offset_type>(1);
+
+ for (int64_t i = 0; i < in_array.length + 1; ++i) {
+ shifted_offsets[i] = offsets[i] - offsets[0];
+ }
+ values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+ Cast(values, child_type, options, ctx->exec_context()));
+
+ DCHECK_EQ(Datum::ARRAY, cast_values.kind());
+ out_array->child_data.push_back(cast_values.array());
+ return Status::OK();
+}
+
+template <typename Type>
+void AddListCast(CastFunction* func) {
+ ScalarKernel kernel;
+ kernel.exec = CastListExec<Type>;
+ kernel.signature = KernelSignature::Make({InputType(Type::type_id)}, kOutputTargetType);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ DCHECK_OK(func->AddKernel(Type::type_id, std::move(kernel)));
+}
+
+std::vector<std::shared_ptr<CastFunction>> GetNestedCasts() {
+ // We use the list<T> from the CastOptions when resolving the output type
+
+ auto cast_list = std::make_shared<CastFunction>("cast_list", Type::LIST);
+ AddCommonCasts(Type::LIST, kOutputTargetType, cast_list.get());
+ AddListCast<ListType>(cast_list.get());
+
+ auto cast_large_list =
+ std::make_shared<CastFunction>("cast_large_list", Type::LARGE_LIST);
+ AddCommonCasts(Type::LARGE_LIST, kOutputTargetType, cast_large_list.get());
+ AddListCast<LargeListType>(cast_large_list.get());
+
+  // Fixed-size list (FSL) casting is still incomplete; only the common casts
+  // are added for now.
+ auto cast_fsl =
+ std::make_shared<CastFunction>("cast_fixed_size_list", Type::FIXED_SIZE_LIST);
+ AddCommonCasts(Type::FIXED_SIZE_LIST, kOutputTargetType, cast_fsl.get());
+
+ // So is struct
+ auto cast_struct = std::make_shared<CastFunction>("cast_struct", Type::STRUCT);
+ AddCommonCasts(Type::STRUCT, kOutputTargetType, cast_struct.get());
+
+ // So is dictionary
+ auto cast_dictionary =
+ std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
+ AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dictionary.get());
+
+ return {cast_list, cast_large_list, cast_fsl, cast_struct, cast_dictionary};
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
new file mode 100644
index 00000000000..cc7b533f262
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
@@ -0,0 +1,727 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implementation of casting to integer, floating point, or decimal types
+
+#include "arrow/array/builder_primitive.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/int_util.h"
+#include "arrow/util/value_parsing.h"
+
+namespace arrow {
+
+using internal::BitBlockCount;
+using internal::CheckIntegersInRange;
+using internal::IntegersCanFit;
+using internal::OptionalBitBlockCounter;
+using internal::ParseValue;
+
+namespace compute {
+namespace internal {
+
+Status CastIntegerToInteger(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& options = checked_cast<const CastState*>(ctx->state())->options;
+ if (!options.allow_int_overflow) {
+ RETURN_NOT_OK(IntegersCanFit(batch[0], *out->type()));
+ }
+ CastNumberToNumberUnsafe(batch[0].type()->id(), out->type()->id(), batch[0], out);
+ return Status::OK();
+}
+
+Status CastFloatingToFloating(KernelContext*, const ExecBatch& batch, Datum* out) {
+ CastNumberToNumberUnsafe(batch[0].type()->id(), out->type()->id(), batch[0], out);
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Implement fast safe floating point to integer cast
+
+// InType is a floating point type we are planning to cast to integer
+template <typename InType, typename OutType, typename InT = typename InType::c_type,
+ typename OutT = typename OutType::c_type>
+ARROW_DISABLE_UBSAN("float-cast-overflow")
+Status CheckFloatTruncation(const Datum& input, const Datum& output) {
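+  // Truncation is detected by casting the integer result back to the input
+  // floating point type and comparing it with the original value.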
+ auto WasTruncated = [&](OutT out_val, InT in_val) -> bool {
+ return static_cast<InT>(out_val) != in_val;
+ };
+ auto WasTruncatedMaybeNull = [&](OutT out_val, InT in_val, bool is_valid) -> bool {
+ return is_valid && static_cast<InT>(out_val) != in_val;
+ };
+ auto GetErrorMessage = [&](InT val) {
+ return Status::Invalid("Float value ", val, " was truncated converting to ",
+ *output.type());
+ };
+
+ if (input.kind() == Datum::SCALAR) {
+ DCHECK_EQ(output.kind(), Datum::SCALAR);
+ const auto& in_scalar = input.scalar_as<typename TypeTraits<InType>::ScalarType>();
+ const auto& out_scalar = output.scalar_as<typename TypeTraits<OutType>::ScalarType>();
+ if (WasTruncatedMaybeNull(out_scalar.value, in_scalar.value, out_scalar.is_valid)) {
+ return GetErrorMessage(in_scalar.value);
+ }
+ return Status::OK();
+ }
+
+ const ArrayData& in_array = *input.array();
+ const ArrayData& out_array = *output.array();
+
+ const InT* in_data = in_array.GetValues<InT>(1);
+ const OutT* out_data = out_array.GetValues<OutT>(1);
+
+ const uint8_t* bitmap = nullptr;
+ if (in_array.buffers[0]) {
+ bitmap = in_array.buffers[0]->data();
+ }
+ OptionalBitBlockCounter bit_counter(bitmap, in_array.offset, in_array.length);
+ int64_t position = 0;
+ int64_t offset_position = in_array.offset;
+ while (position < in_array.length) {
+ BitBlockCount block = bit_counter.NextBlock();
+ bool block_out_of_bounds = false;
+ if (block.popcount == block.length) {
+ // Fast path: branchless
+ for (int64_t i = 0; i < block.length; ++i) {
+ block_out_of_bounds |= WasTruncated(out_data[i], in_data[i]);
+ }
+ } else if (block.popcount > 0) {
+ // Indices have nulls, must only boundscheck non-null values
+ for (int64_t i = 0; i < block.length; ++i) {
+ block_out_of_bounds |= WasTruncatedMaybeNull(
+ out_data[i], in_data[i], BitUtil::GetBit(bitmap, offset_position + i));
+ }
+ }
+ if (ARROW_PREDICT_FALSE(block_out_of_bounds)) {
+ if (in_array.GetNullCount() > 0) {
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (WasTruncatedMaybeNull(out_data[i], in_data[i],
+ BitUtil::GetBit(bitmap, offset_position + i))) {
+ return GetErrorMessage(in_data[i]);
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (WasTruncated(out_data[i], in_data[i])) {
+ return GetErrorMessage(in_data[i]);
+ }
+ }
+ }
+ }
+ in_data += block.length;
+ out_data += block.length;
+ position += block.length;
+ offset_position += block.length;
+ }
+ return Status::OK();
+}
+
+template <typename InType>
+Status CheckFloatToIntTruncationImpl(const Datum& input, const Datum& output) {
+ switch (output.type()->id()) {
+ case Type::INT8:
+ return CheckFloatTruncation<InType, Int8Type>(input, output);
+ case Type::INT16:
+ return CheckFloatTruncation<InType, Int16Type>(input, output);
+ case Type::INT32:
+ return CheckFloatTruncation<InType, Int32Type>(input, output);
+ case Type::INT64:
+ return CheckFloatTruncation<InType, Int64Type>(input, output);
+ case Type::UINT8:
+ return CheckFloatTruncation<InType, UInt8Type>(input, output);
+ case Type::UINT16:
+ return CheckFloatTruncation<InType, UInt16Type>(input, output);
+ case Type::UINT32:
+ return CheckFloatTruncation<InType, UInt32Type>(input, output);
+ case Type::UINT64:
+ return CheckFloatTruncation<InType, UInt64Type>(input, output);
+ default:
+ break;
+ }
+ DCHECK(false);
+ return Status::OK();
+}
+
+Status CheckFloatToIntTruncation(const Datum& input, const Datum& output) {
+ switch (input.type()->id()) {
+ case Type::FLOAT:
+ return CheckFloatToIntTruncationImpl<FloatType>(input, output);
+ case Type::DOUBLE:
+ return CheckFloatToIntTruncationImpl<DoubleType>(input, output);
+ default:
+ break;
+ }
+ DCHECK(false);
+ return Status::OK();
+}
+
+Status CastFloatingToInteger(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& options = checked_cast<const CastState*>(ctx->state())->options;
+ CastNumberToNumberUnsafe(batch[0].type()->id(), out->type()->id(), batch[0], out);
+ if (!options.allow_float_truncate) {
+ RETURN_NOT_OK(CheckFloatToIntTruncation(batch[0], *out));
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Implement fast integer to floating point cast
+
+// These are the limits for exact representation of whole numbers in floating
+// point numbers
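+// (2^24 for float's 24-bit significand, 2^53 for double's 53-bit significand).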
+template <typename T>
+struct FloatingIntegerBound {};
+
+template <>
+struct FloatingIntegerBound<float> {
+ static const int64_t value = 1LL << 24;
+};
+
+template <>
+struct FloatingIntegerBound<double> {
+ static const int64_t value = 1LL << 53;
+};
+
+template <typename InType, typename OutType, typename InT = typename InType::c_type,
+ typename OutT = typename OutType::c_type,
+ bool IsSigned = is_signed_integer_type<InType>::value>
+Status CheckIntegerFloatTruncateImpl(const Datum& input) {
+ using InScalarType = typename TypeTraits<InType>::ScalarType;
+ const int64_t limit = FloatingIntegerBound<OutT>::value;
+ InScalarType bound_lower(IsSigned ? -limit : 0);
+ InScalarType bound_upper(limit);
+ return CheckIntegersInRange(input, bound_lower, bound_upper);
+}
+
+Status CheckForIntegerToFloatingTruncation(const Datum& input, Type::type out_type) {
+ switch (input.type()->id()) {
+ // Small integers are all exactly representable as whole numbers
+ case Type::INT8:
+ case Type::INT16:
+ case Type::UINT8:
+ case Type::UINT16:
+ return Status::OK();
+ case Type::INT32: {
+ if (out_type == Type::DOUBLE) {
+ return Status::OK();
+ }
+ return CheckIntegerFloatTruncateImpl<Int32Type, FloatType>(input);
+ }
+ case Type::UINT32: {
+ if (out_type == Type::DOUBLE) {
+ return Status::OK();
+ }
+ return CheckIntegerFloatTruncateImpl<UInt32Type, FloatType>(input);
+ }
+ case Type::INT64: {
+ if (out_type == Type::FLOAT) {
+ return CheckIntegerFloatTruncateImpl<Int64Type, FloatType>(input);
+ } else {
+ return CheckIntegerFloatTruncateImpl<Int64Type, DoubleType>(input);
+ }
+ }
+ case Type::UINT64: {
+ if (out_type == Type::FLOAT) {
+ return CheckIntegerFloatTruncateImpl<UInt64Type, FloatType>(input);
+ } else {
+ return CheckIntegerFloatTruncateImpl<UInt64Type, DoubleType>(input);
+ }
+ }
+ default:
+ break;
+ }
+ DCHECK(false);
+ return Status::OK();
+}
+
+Status CastIntegerToFloating(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& options = checked_cast<const CastState*>(ctx->state())->options;
+ Type::type out_type = out->type()->id();
+ if (!options.allow_float_truncate) {
+ RETURN_NOT_OK(CheckForIntegerToFloatingTruncation(batch[0], out_type));
+ }
+ CastNumberToNumberUnsafe(batch[0].type()->id(), out_type, batch[0], out);
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Boolean to number
+
+struct BooleanToNumber {
+ template <typename OutValue, typename Arg0Value>
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ constexpr auto kOne = static_cast<OutValue>(1);
+ constexpr auto kZero = static_cast<OutValue>(0);
+ return val ? kOne : kZero;
+ }
+};
+
+template <typename O>
+struct CastFunctor<O, BooleanType, enable_if_number<O>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return applicator::ScalarUnary<O, BooleanType, BooleanToNumber>::Exec(ctx, batch,
+ out);
+ }
+};
+
+// ----------------------------------------------------------------------
+// String to number
+
+template <typename OutType>
+struct ParseString {
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ OutValue result = OutValue(0);
+ if (ARROW_PREDICT_FALSE(!ParseValue<OutType>(val.data(), val.size(), &result))) {
+ *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
+ TypeTraits<OutType>::type_singleton()->ToString());
+ }
+ return result;
+ }
+};
+
+template <typename O, typename I>
+struct CastFunctor<O, I, enable_if_base_binary<I>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return applicator::ScalarUnaryNotNull<O, I, ParseString<O>>::Exec(ctx, batch, out);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Decimal to integer
+
+struct DecimalToIntegerMixin {
+ template <typename OutValue, typename Arg0Value>
+ OutValue ToInteger(KernelContext* ctx, const Arg0Value& val, Status* st) const {
+ constexpr auto min_value = std::numeric_limits<OutValue>::min();
+ constexpr auto max_value = std::numeric_limits<OutValue>::max();
+
+ if (!allow_int_overflow_ && ARROW_PREDICT_FALSE(val < min_value || val > max_value)) {
+ *st = Status::Invalid("Integer value out of bounds");
+ return OutValue{}; // Zero
+ } else {
+ return static_cast<OutValue>(val.low_bits());
+ }
+ }
+
+ DecimalToIntegerMixin(int32_t in_scale, bool allow_int_overflow)
+ : in_scale_(in_scale), allow_int_overflow_(allow_int_overflow) {}
+
+ int32_t in_scale_;
+ bool allow_int_overflow_;
+};
+
+struct UnsafeUpscaleDecimalToInteger : public DecimalToIntegerMixin {
+ using DecimalToIntegerMixin::DecimalToIntegerMixin;
+
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ return ToInteger<OutValue>(ctx, val.IncreaseScaleBy(-in_scale_), st);
+ }
+};
+
+struct UnsafeDownscaleDecimalToInteger : public DecimalToIntegerMixin {
+ using DecimalToIntegerMixin::DecimalToIntegerMixin;
+
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ return ToInteger<OutValue>(ctx, val.ReduceScaleBy(in_scale_, false), st);
+ }
+};
+
+struct SafeRescaleDecimalToInteger : public DecimalToIntegerMixin {
+ using DecimalToIntegerMixin::DecimalToIntegerMixin;
+
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ auto result = val.Rescale(in_scale_, 0);
+ if (ARROW_PREDICT_FALSE(!result.ok())) {
+ *st = result.status();
+ return OutValue{}; // Zero
+ } else {
+ return ToInteger<OutValue>(ctx, *result, st);
+ }
+ }
+};
+
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_integer_type<O>::value && is_decimal_type<I>::value>> {
+ using out_type = typename O::c_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& options = checked_cast<const CastState*>(ctx->state())->options;
+
+ const auto& in_type_inst = checked_cast<const I&>(*batch[0].type());
+ const auto in_scale = in_type_inst.scale();
+
+ if (options.allow_decimal_truncate) {
+ if (in_scale < 0) {
+ // Unsafe upscale
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeUpscaleDecimalToInteger>
+ kernel(UnsafeUpscaleDecimalToInteger{in_scale, options.allow_int_overflow});
+ return kernel.Exec(ctx, batch, out);
+ } else {
+ // Unsafe downscale
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeDownscaleDecimalToInteger>
+ kernel(UnsafeDownscaleDecimalToInteger{in_scale, options.allow_int_overflow});
+ return kernel.Exec(ctx, batch, out);
+ }
+ } else {
+ // Safe rescale
+ applicator::ScalarUnaryNotNullStateful<O, I, SafeRescaleDecimalToInteger> kernel(
+ SafeRescaleDecimalToInteger{in_scale, options.allow_int_overflow});
+ return kernel.Exec(ctx, batch, out);
+ }
+ }
+};
+
+// ----------------------------------------------------------------------
+// Decimal to decimal
+
+// Helper that converts the input and output decimals
+// For instance, Decimal128 -> Decimal256 requires converting, then scaling
+// Decimal256 -> Decimal128 requires scaling, then truncating
+template <typename OutDecimal, typename InDecimal>
+struct DecimalConversions {};
+
+template <typename InDecimal>
+struct DecimalConversions<Decimal256, InDecimal> {
+ // Convert then scale
+ static Decimal256 ConvertInput(InDecimal&& val) { return Decimal256(val); }
+ static Decimal256 ConvertOutput(Decimal256&& val) { return val; }
+};
+
+template <>
+struct DecimalConversions<Decimal128, Decimal256> {
+ // Scale then truncate
+ static Decimal256 ConvertInput(Decimal256&& val) { return val; }
+ static Decimal128 ConvertOutput(Decimal256&& val) {
+ return Decimal128(val.little_endian_array()[1], val.little_endian_array()[0]);
+ }
+};
+
+template <>
+struct DecimalConversions<Decimal128, Decimal128> {
+ static Decimal128 ConvertInput(Decimal128&& val) { return val; }
+ static Decimal128 ConvertOutput(Decimal128&& val) { return val; }
+};
+
+struct UnsafeUpscaleDecimal {
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext*, Arg0Value val, Status*) const {
+ using Conv = DecimalConversions<OutValue, Arg0Value>;
+ return Conv::ConvertOutput(Conv::ConvertInput(std::move(val)).IncreaseScaleBy(by_));
+ }
+ int32_t by_;
+};
+
+struct UnsafeDownscaleDecimal {
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext*, Arg0Value val, Status*) const {
+ using Conv = DecimalConversions<OutValue, Arg0Value>;
+ return Conv::ConvertOutput(
+ Conv::ConvertInput(std::move(val)).ReduceScaleBy(by_, false));
+ }
+ int32_t by_;
+};
+
+struct SafeRescaleDecimal {
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
+ using Conv = DecimalConversions<OutValue, Arg0Value>;
+ auto maybe_rescaled =
+ Conv::ConvertInput(std::move(val)).Rescale(in_scale_, out_scale_);
+ if (ARROW_PREDICT_FALSE(!maybe_rescaled.ok())) {
+ *st = maybe_rescaled.status();
+ return {}; // Zero
+ }
+
+ if (ARROW_PREDICT_TRUE(maybe_rescaled->FitsInPrecision(out_precision_))) {
+ return Conv::ConvertOutput(maybe_rescaled.MoveValueUnsafe());
+ }
+
+ *st = Status::Invalid("Decimal value does not fit in precision ", out_precision_);
+ return {}; // Zero
+ }
+
+ int32_t out_scale_, out_precision_, in_scale_;
+};
+
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_decimal_type<O>::value && is_decimal_type<I>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& options = checked_cast<const CastState*>(ctx->state())->options;
+
+ const auto& in_type = checked_cast<const I&>(*batch[0].type());
+ const auto& out_type = checked_cast<const O&>(*out->type());
+ const auto in_scale = in_type.scale();
+ const auto out_scale = out_type.scale();
+
+ if (options.allow_decimal_truncate) {
+ if (in_scale < out_scale) {
+ // Unsafe upscale
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeUpscaleDecimal> kernel(
+ UnsafeUpscaleDecimal{out_scale - in_scale});
+ return kernel.Exec(ctx, batch, out);
+ } else {
+ // Unsafe downscale
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeDownscaleDecimal> kernel(
+ UnsafeDownscaleDecimal{in_scale - out_scale});
+ return kernel.Exec(ctx, batch, out);
+ }
+ }
+
+ // Safe rescale
+ applicator::ScalarUnaryNotNullStateful<O, I, SafeRescaleDecimal> kernel(
+ SafeRescaleDecimal{out_scale, out_type.precision(), in_scale});
+ return kernel.Exec(ctx, batch, out);
+ }
+};
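+
+// Worked example (editorial, based on the conversions above): casting
+// decimal128(5, 2) -> decimal128(7, 4) upscales by 2, so 123.45 becomes
+// 123.4500. Casting 123.45 to decimal128(4, 1) without allow_decimal_truncate
+// goes through Rescale, which returns Status::Invalid because the trailing
+// digit cannot be represented; with truncation allowed, ReduceScaleBy(1, false)
+// yields 123.4.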
+
+// ----------------------------------------------------------------------
+// Real to decimal
+
+struct RealToDecimal {
+ template <typename OutValue, typename RealType>
+ OutValue Call(KernelContext*, RealType val, Status* st) const {
+ auto maybe_decimal = OutValue::FromReal(val, out_precision_, out_scale_);
+
+ if (ARROW_PREDICT_TRUE(maybe_decimal.ok())) {
+ return maybe_decimal.MoveValueUnsafe();
+ }
+
+ if (!allow_truncate_) {
+ *st = maybe_decimal.status();
+ }
+ return {}; // Zero
+ }
+
+ int32_t out_scale_, out_precision_;
+ bool allow_truncate_;
+};
+
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_decimal_type<O>::value && is_floating_type<I>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& options = checked_cast<const CastState*>(ctx->state())->options;
+ const auto& out_type = checked_cast<const O&>(*out->type());
+ const auto out_scale = out_type.scale();
+ const auto out_precision = out_type.precision();
+
+ applicator::ScalarUnaryNotNullStateful<O, I, RealToDecimal> kernel(
+ RealToDecimal{out_scale, out_precision, options.allow_decimal_truncate});
+ return kernel.Exec(ctx, batch, out);
+ }
+};
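+
+// Editorial sketch: FromReal rounds the input to the requested scale and fails
+// if the result does not fit the precision. For decimal128(5, 2), 1.2345
+// becomes 1.23, while 123456.0 needs eight digits and yields Status::Invalid;
+// with allow_decimal_truncate set, the error is swallowed and zero is emitted
+// for the failing slot (see RealToDecimal above).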
+
+// ----------------------------------------------------------------------
+// Decimal to real
+
+struct DecimalToReal {
+ template <typename RealType, typename Arg0Value>
+ RealType Call(KernelContext*, const Arg0Value& val, Status*) const {
+ return val.template ToReal<RealType>(in_scale_);
+ }
+
+ int32_t in_scale_;
+};
+
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_floating_type<O>::value && is_decimal_type<I>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& in_type = checked_cast<const I&>(*batch[0].type());
+ const auto in_scale = in_type.scale();
+
+ applicator::ScalarUnaryNotNullStateful<O, I, DecimalToReal> kernel(
+ DecimalToReal{in_scale});
+ return kernel.Exec(ctx, batch, out);
+ }
+};
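+
+// Editorial note: the opposite direction consults no options; ToReal simply
+// divides the stored integer by 10^scale, so decimal128(5, 2) value 123.45
+// becomes the double closest to 123.45.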
+
+// ----------------------------------------------------------------------
+// Top-level kernel instantiation
+
+namespace {
+
+template <typename OutType>
+void AddCommonNumberCasts(const std::shared_ptr<DataType>& out_ty, CastFunction* func) {
+ AddCommonCasts(out_ty->id(), out_ty, func);
+
+ // Cast from boolean to number
+ DCHECK_OK(func->AddKernel(Type::BOOL, {boolean()}, out_ty,
+ CastFunctor<OutType, BooleanType>::Exec));
+
+ // Cast from other strings
+ for (const std::shared_ptr<DataType>& in_ty : BaseBinaryTypes()) {
+ auto exec = GenerateVarBinaryBase<CastFunctor, OutType>(*in_ty);
+ DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, exec));
+ }
+}
+
+template <typename OutType>
+std::shared_ptr<CastFunction> GetCastToInteger(std::string name) {
+ auto func = std::make_shared<CastFunction>(std::move(name), OutType::type_id);
+ auto out_ty = TypeTraits<OutType>::type_singleton();
+
+ for (const std::shared_ptr<DataType>& in_ty : IntTypes()) {
+ DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, CastIntegerToInteger));
+ }
+
+ // Cast from floating point
+ for (const std::shared_ptr<DataType>& in_ty : FloatingPointTypes()) {
+ DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, CastFloatingToInteger));
+ }
+
+ // From other numbers to integer
+ AddCommonNumberCasts<OutType>(out_ty, func.get());
+
+ // From decimal to integer
+ DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty,
+ CastFunctor<OutType, Decimal128Type>::Exec));
+ DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
+ CastFunctor<OutType, Decimal256Type>::Exec));
+ return func;
+}
+
+template <typename OutType>
+std::shared_ptr<CastFunction> GetCastToFloating(std::string name) {
+ auto func = std::make_shared<CastFunction>(std::move(name), OutType::type_id);
+ auto out_ty = TypeTraits<OutType>::type_singleton();
+
+ // Casts from integer to floating point
+ for (const std::shared_ptr<DataType>& in_ty : IntTypes()) {
+ DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, CastIntegerToFloating));
+ }
+
+ // Cast from floating point
+ for (const std::shared_ptr<DataType>& in_ty : FloatingPointTypes()) {
+ DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, CastFloatingToFloating));
+ }
+
+ // From other numbers to floating point
+ AddCommonNumberCasts<OutType>(out_ty, func.get());
+
+ // From decimal to floating point
+ DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty,
+ CastFunctor<OutType, Decimal128Type>::Exec));
+ DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
+ CastFunctor<OutType, Decimal256Type>::Exec));
+ return func;
+}
+
+std::shared_ptr<CastFunction> GetCastToDecimal128() {
+ OutputType sig_out_ty(ResolveOutputFromOptions);
+
+ auto func = std::make_shared<CastFunction>("cast_decimal", Type::DECIMAL128);
+ AddCommonCasts(Type::DECIMAL128, sig_out_ty, func.get());
+
+ // Cast from floating point
+ DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty,
+ CastFunctor<Decimal128Type, FloatType>::Exec));
+ DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty,
+ CastFunctor<Decimal128Type, DoubleType>::Exec));
+
+ // Cast from other decimal
+ auto exec = CastFunctor<Decimal128Type, Decimal128Type>::Exec;
+ // We resolve the output type of this kernel from the CastOptions
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec));
+ exec = CastFunctor<Decimal128Type, Decimal256Type>::Exec;
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec));
+ return func;
+}
+
+std::shared_ptr<CastFunction> GetCastToDecimal256() {
+ OutputType sig_out_ty(ResolveOutputFromOptions);
+
+ auto func = std::make_shared<CastFunction>("cast_decimal256", Type::DECIMAL256);
+ AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get());
+
+ // Cast from floating point
+ DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty,
+ CastFunctor<Decimal256Type, FloatType>::Exec));
+ DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty,
+ CastFunctor<Decimal256Type, DoubleType>::Exec));
+
+ // Cast from other decimal
+ auto exec = CastFunctor<Decimal256Type, Decimal128Type>::Exec;
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec));
+ exec = CastFunctor<Decimal256Type, Decimal256Type>::Exec;
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec));
+ return func;
+}
+
+} // namespace
+
+std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
+ std::vector<std::shared_ptr<CastFunction>> functions;
+
+  // Make a trivial cast to null. Casting dict<null> -> null has no obvious
+  // use case, but unit tests exercise it, so it must be supported.
+ auto cast_null = std::make_shared<CastFunction>("cast_null", Type::NA);
+ DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, null(),
+ OutputAllNull));
+ functions.push_back(cast_null);
+
+ functions.push_back(GetCastToInteger<Int8Type>("cast_int8"));
+ functions.push_back(GetCastToInteger<Int16Type>("cast_int16"));
+
+ auto cast_int32 = GetCastToInteger<Int32Type>("cast_int32");
+ // Convert DATE32 or TIME32 to INT32 zero copy
+ AddZeroCopyCast(Type::DATE32, date32(), int32(), cast_int32.get());
+ AddZeroCopyCast(Type::TIME32, InputType(Type::TIME32), int32(), cast_int32.get());
+ functions.push_back(cast_int32);
+
+ auto cast_int64 = GetCastToInteger<Int64Type>("cast_int64");
+ // Convert DATE64, DURATION, TIMESTAMP, TIME64 to INT64 zero copy
+ AddZeroCopyCast(Type::DATE64, InputType(Type::DATE64), int64(), cast_int64.get());
+ AddZeroCopyCast(Type::DURATION, InputType(Type::DURATION), int64(), cast_int64.get());
+ AddZeroCopyCast(Type::TIMESTAMP, InputType(Type::TIMESTAMP), int64(), cast_int64.get());
+ AddZeroCopyCast(Type::TIME64, InputType(Type::TIME64), int64(), cast_int64.get());
+ functions.push_back(cast_int64);
+
+ functions.push_back(GetCastToInteger<UInt8Type>("cast_uint8"));
+ functions.push_back(GetCastToInteger<UInt16Type>("cast_uint16"));
+ functions.push_back(GetCastToInteger<UInt32Type>("cast_uint32"));
+ functions.push_back(GetCastToInteger<UInt64Type>("cast_uint64"));
+
+  // HalfFloat casts are largely unimplemented for now, so only the common
+  // casts are registered
+ auto cast_half_float =
+ std::make_shared<CastFunction>("cast_half_float", Type::HALF_FLOAT);
+ AddCommonCasts(Type::HALF_FLOAT, float16(), cast_half_float.get());
+ functions.push_back(cast_half_float);
+
+ functions.push_back(GetCastToFloating<FloatType>("cast_float"));
+ functions.push_back(GetCastToFloating<DoubleType>("cast_double"));
+
+ functions.push_back(GetCastToDecimal128());
+ functions.push_back(GetCastToDecimal256());
+
+ return functions;
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
new file mode 100644
index 00000000000..3ce537b7223
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
@@ -0,0 +1,247 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <limits>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/result.h"
+#include "arrow/util/formatting.h"
+#include "arrow/util/int_util.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/utf8.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::StringFormatter;
+using util::InitializeUTF8;
+using util::ValidateUTF8;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+// ----------------------------------------------------------------------
+// Number / Boolean to String
+
+template <typename O, typename I>
+struct NumericToStringCastFunctor {
+ using value_type = typename TypeTraits<I>::CType;
+ using BuilderType = typename TypeTraits<O>::BuilderType;
+ using FormatterType = StringFormatter<I>;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK(out->is_array());
+ const ArrayData& input = *batch[0].array();
+ ArrayData* output = out->mutable_array();
+ return Convert(ctx, input, output);
+ }
+
+ static Status Convert(KernelContext* ctx, const ArrayData& input, ArrayData* output) {
+ FormatterType formatter(input.type);
+ BuilderType builder(input.type, ctx->memory_pool());
+ RETURN_NOT_OK(VisitArrayDataInline<I>(
+ input,
+ [&](value_type v) {
+ return formatter(v, [&](util::string_view v) { return builder.Append(v); });
+ },
+ [&]() { return builder.AppendNull(); }));
+
+ std::shared_ptr<Array> output_array;
+ RETURN_NOT_OK(builder.Finish(&output_array));
+ *output = std::move(*output_array->data());
+ return Status::OK();
+ }
+};
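+
+// Editorial sketch of the formatter/builder pair above, assuming the public
+// Cast API and a hypothetical `int32_array`:
+//
+//   ARROW_ASSIGN_OR_RAISE(Datum strs, Cast(int32_array, utf8()));
+//   // [1, 2, null] -> ["1", "2", null]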
+
+// ----------------------------------------------------------------------
+// Binary-like to binary-like
+
+#if defined(_MSC_VER)
+// Silence warning: """'visitor': unreferenced local variable"""
+#pragma warning(push)
+#pragma warning(disable : 4101)
+#endif
+
+struct Utf8Validator {
+ Status VisitNull() { return Status::OK(); }
+
+ Status VisitValue(util::string_view str) {
+ if (ARROW_PREDICT_FALSE(!ValidateUTF8(str))) {
+ return Status::Invalid("Invalid UTF8 payload");
+ }
+ return Status::OK();
+ }
+};
+
+template <typename I, typename O>
+Status CastBinaryToBinaryOffsets(KernelContext* ctx, const ArrayData& input,
+ ArrayData* output) {
+ static_assert(std::is_same<I, O>::value, "Cast same-width offsets (no-op)");
+ return Status::OK();
+}
+
+// Upcast offsets
+template <>
+Status CastBinaryToBinaryOffsets<int32_t, int64_t>(KernelContext* ctx,
+ const ArrayData& input,
+ ArrayData* output) {
+ using input_offset_type = int32_t;
+ using output_offset_type = int64_t;
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[1],
+ ctx->Allocate((output->length + output->offset + 1) * sizeof(output_offset_type)));
+ memset(output->buffers[1]->mutable_data(), 0,
+ output->offset * sizeof(output_offset_type));
+ ::arrow::internal::CastInts(input.GetValues<input_offset_type>(1),
+ output->GetMutableValues<output_offset_type>(1),
+ output->length + 1);
+ return Status::OK();
+}
+
+// Downcast offsets
+template <>
+Status CastBinaryToBinaryOffsets<int64_t, int32_t>(KernelContext* ctx,
+ const ArrayData& input,
+ ArrayData* output) {
+ using input_offset_type = int64_t;
+ using output_offset_type = int32_t;
+
+ constexpr input_offset_type kMaxOffset = std::numeric_limits<output_offset_type>::max();
+
+ auto input_offsets = input.GetValues<input_offset_type>(1);
+
+ // Binary offsets are ascending, so it's enough to check the last one for overflow.
+ if (input_offsets[input.length] > kMaxOffset) {
+ return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
+ output->type->ToString(), ": input array too large");
+ } else {
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate((output->length + output->offset + 1) *
+ sizeof(output_offset_type)));
+ memset(output->buffers[1]->mutable_data(), 0,
+ output->offset * sizeof(output_offset_type));
+ ::arrow::internal::CastInts(input.GetValues<input_offset_type>(1),
+ output->GetMutableValues<output_offset_type>(1),
+ output->length + 1);
+ return Status::OK();
+ }
+}
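+
+// Editorial note: the downcast above is why large_utf8 -> utf8 fails once the
+// values span more than INT32_MAX bytes; offsets are non-decreasing, so only
+// the final offset needs the overflow check.
+//
+//   // Hypothetical `large_string_array` with > 2 GiB of character data:
+//   // Cast(large_string_array, utf8()) -> Status::Invalid("... too large")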
+
+template <typename O, typename I>
+Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK(out->is_array());
+ const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+ const ArrayData& input = *batch[0].array();
+
+ if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) {
+ InitializeUTF8();
+
+ ArrayDataVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+
+  // Start with a zero-copy cast, then rewrite the offsets buffer to the
+  // expected offset width
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
+ ctx, input, out->mutable_array());
+}
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+// ----------------------------------------------------------------------
+// Cast functions registration
+
+template <typename OutType>
+void AddNumberToStringCasts(CastFunction* func) {
+ auto out_ty = TypeTraits<OutType>::type_singleton();
+
+ DCHECK_OK(func->AddKernel(Type::BOOL, {boolean()}, out_ty,
+ TrivialScalarUnaryAsArraysExec(
+ NumericToStringCastFunctor<OutType, BooleanType>::Exec),
+ NullHandling::COMPUTED_NO_PREALLOCATE));
+
+ for (const std::shared_ptr<DataType>& in_ty : NumericTypes()) {
+ DCHECK_OK(
+ func->AddKernel(in_ty->id(), {in_ty}, out_ty,
+ TrivialScalarUnaryAsArraysExec(
+ GenerateNumeric<NumericToStringCastFunctor, OutType>(*in_ty)),
+ NullHandling::COMPUTED_NO_PREALLOCATE));
+ }
+}
+
+template <typename OutType, typename InType>
+void AddBinaryToBinaryCast(CastFunction* func) {
+ auto in_ty = TypeTraits<InType>::type_singleton();
+ auto out_ty = TypeTraits<OutType>::type_singleton();
+
+ DCHECK_OK(func->AddKernel(
+ InType::type_id, {in_ty}, out_ty,
+ TrivialScalarUnaryAsArraysExec(BinaryToBinaryCastExec<OutType, InType>),
+ NullHandling::COMPUTED_NO_PREALLOCATE));
+}
+
+template <typename OutType>
+void AddBinaryToBinaryCast(CastFunction* func) {
+ AddBinaryToBinaryCast<OutType, StringType>(func);
+ AddBinaryToBinaryCast<OutType, BinaryType>(func);
+ AddBinaryToBinaryCast<OutType, LargeStringType>(func);
+ AddBinaryToBinaryCast<OutType, LargeBinaryType>(func);
+}
+
+} // namespace
+
+std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
+ auto cast_binary = std::make_shared<CastFunction>("cast_binary", Type::BINARY);
+ AddCommonCasts(Type::BINARY, binary(), cast_binary.get());
+ AddBinaryToBinaryCast<BinaryType>(cast_binary.get());
+
+ auto cast_large_binary =
+ std::make_shared<CastFunction>("cast_large_binary", Type::LARGE_BINARY);
+ AddCommonCasts(Type::LARGE_BINARY, large_binary(), cast_large_binary.get());
+ AddBinaryToBinaryCast<LargeBinaryType>(cast_large_binary.get());
+
+ auto cast_string = std::make_shared<CastFunction>("cast_string", Type::STRING);
+ AddCommonCasts(Type::STRING, utf8(), cast_string.get());
+ AddNumberToStringCasts<StringType>(cast_string.get());
+ AddBinaryToBinaryCast<StringType>(cast_string.get());
+
+ auto cast_large_string =
+ std::make_shared<CastFunction>("cast_large_string", Type::LARGE_STRING);
+ AddCommonCasts(Type::LARGE_STRING, large_utf8(), cast_large_string.get());
+ AddNumberToStringCasts<LargeStringType>(cast_large_string.get());
+ AddBinaryToBinaryCast<LargeStringType>(cast_large_string.get());
+
+ auto cast_fsb =
+ std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY);
+ AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions),
+ cast_fsb.get());
+
+ return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb};
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
new file mode 100644
index 00000000000..1a58fce7c74
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
@@ -0,0 +1,452 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implementation of casting to (or between) temporal types
+
+#include <limits>
+
+#include "arrow/array/builder_time.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/time.h"
+#include "arrow/util/value_parsing.h"
+
+namespace arrow {
+
+using internal::ParseValue;
+
+namespace compute {
+namespace internal {
+
+constexpr int64_t kMillisecondsInDay = 86400000;
+
+// ----------------------------------------------------------------------
+// From one timestamp to another
+
+template <typename in_type, typename out_type>
+Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
+ const int64_t factor, const ArrayData& input, ArrayData* output) {
+ const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+ auto in_data = input.GetValues<in_type>(1);
+ auto out_data = output->GetMutableValues<out_type>(1);
+
+ if (factor == 1) {
+ for (int64_t i = 0; i < input.length; i++) {
+ out_data[i] = static_cast<out_type>(in_data[i]);
+ }
+ } else if (factor_op == util::MULTIPLY) {
+ if (options.allow_time_overflow) {
+ for (int64_t i = 0; i < input.length; i++) {
+ out_data[i] = static_cast<out_type>(in_data[i] * factor);
+ }
+ } else {
+#define RAISE_OVERFLOW_CAST(VAL) \
+ return Status::Invalid("Casting from ", input.type->ToString(), " to ", \
+ output->type->ToString(), " would result in ", \
+ "out of bounds timestamp: ", VAL);
+
+ int64_t max_val = std::numeric_limits<int64_t>::max() / factor;
+ int64_t min_val = std::numeric_limits<int64_t>::min() / factor;
+ if (input.null_count != 0) {
+ BitmapReader bit_reader(input.buffers[0]->data(), input.offset, input.length);
+ for (int64_t i = 0; i < input.length; i++) {
+ if (bit_reader.IsSet() && (in_data[i] < min_val || in_data[i] > max_val)) {
+ RAISE_OVERFLOW_CAST(in_data[i]);
+ }
+ out_data[i] = static_cast<out_type>(in_data[i] * factor);
+ bit_reader.Next();
+ }
+ } else {
+ for (int64_t i = 0; i < input.length; i++) {
+ if (in_data[i] < min_val || in_data[i] > max_val) {
+ RAISE_OVERFLOW_CAST(in_data[i]);
+ }
+ out_data[i] = static_cast<out_type>(in_data[i] * factor);
+ }
+ }
+
+#undef RAISE_OVERFLOW_CAST
+ }
+ } else {
+ if (options.allow_time_truncate) {
+ for (int64_t i = 0; i < input.length; i++) {
+ out_data[i] = static_cast<out_type>(in_data[i] / factor);
+ }
+ } else {
+#define RAISE_INVALID_CAST(VAL) \
+ return Status::Invalid("Casting from ", input.type->ToString(), " to ", \
+ output->type->ToString(), " would lose data: ", VAL);
+
+ if (input.null_count != 0) {
+ BitmapReader bit_reader(input.buffers[0]->data(), input.offset, input.length);
+ for (int64_t i = 0; i < input.length; i++) {
+ out_data[i] = static_cast<out_type>(in_data[i] / factor);
+ if (bit_reader.IsSet() && (out_data[i] * factor != in_data[i])) {
+ RAISE_INVALID_CAST(in_data[i]);
+ }
+ bit_reader.Next();
+ }
+ } else {
+ for (int64_t i = 0; i < input.length; i++) {
+ out_data[i] = static_cast<out_type>(in_data[i] / factor);
+ if (out_data[i] * factor != in_data[i]) {
+ RAISE_INVALID_CAST(in_data[i]);
+ }
+ }
+ }
+
+#undef RAISE_INVALID_CAST
+ }
+ }
+
+ return Status::OK();
+}
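+
+// Editorial sketch of the factor logic above: timestamp[s] -> timestamp[ms]
+// multiplies by 1000 and bounds-checks against the int64 limits unless
+// allow_time_overflow is set; timestamp[ms] -> timestamp[s] divides by 1000
+// and raises on a non-zero remainder unless allow_time_truncate is set.
+//
+//   CastOptions options = CastOptions::Safe(timestamp(TimeUnit::SECOND));
+//   // a slot holding 1500 ms -> Status::Invalid (would lose 500 ms)
+//   options.allow_time_truncate = true;  // 1500 ms -> 1 s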
+
+// <TimestampType, TimestampType> and <DurationType, DurationType>
+template <typename O, typename I>
+struct CastFunctor<
+ O, I,
+ enable_if_t<(is_timestamp_type<O>::value && is_timestamp_type<I>::value) ||
+ (is_duration_type<O>::value && is_duration_type<I>::value)>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ const ArrayData& input = *batch[0].array();
+ ArrayData* output = out->mutable_array();
+
+ // If units are the same, zero copy, otherwise convert
+ const auto& in_type = checked_cast<const I&>(*batch[0].type());
+ const auto& out_type = checked_cast<const O&>(*output->type);
+
+    // The units may be equal even when the time zones differ, so this is not
+    // necessarily zero copy; making that case zero copy is left for the future
+
+ auto conversion = util::GetTimestampConversion(in_type.unit(), out_type.unit());
+ return ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second, input,
+ output);
+ }
+};
+
+template <>
+struct CastFunctor<Date32Type, TimestampType> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ const ArrayData& input = *batch[0].array();
+ ArrayData* output = out->mutable_array();
+
+ const auto& in_type = checked_cast<const TimestampType&>(*input.type);
+
+ static const int64_t kTimestampToDateFactors[4] = {
+ 86400LL, // SECOND
+ 86400LL * 1000LL, // MILLI
+ 86400LL * 1000LL * 1000LL, // MICRO
+ 86400LL * 1000LL * 1000LL * 1000LL, // NANO
+ };
+
+ const int64_t factor = kTimestampToDateFactors[static_cast<int>(in_type.unit())];
+ return ShiftTime<int64_t, int32_t>(ctx, util::DIVIDE, factor, input, output);
+ }
+};
+
+template <>
+struct CastFunctor<Date64Type, TimestampType> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+ const ArrayData& input = *batch[0].array();
+ ArrayData* output = out->mutable_array();
+ const auto& in_type = checked_cast<const TimestampType&>(*input.type);
+
+ auto conversion = util::GetTimestampConversion(in_type.unit(), TimeUnit::MILLI);
+ RETURN_NOT_OK((ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second,
+ input, output)));
+
+ // Ensure that intraday milliseconds have been zeroed out
+ auto out_data = output->GetMutableValues<int64_t>(1);
+
+ if (input.null_count != 0) {
+ BitmapReader bit_reader(input.buffers[0]->data(), input.offset, input.length);
+
+ for (int64_t i = 0; i < input.length; ++i) {
+ const int64_t remainder = out_data[i] % kMillisecondsInDay;
+ if (ARROW_PREDICT_FALSE(!options.allow_time_truncate && bit_reader.IsSet() &&
+ remainder > 0)) {
+ return Status::Invalid("Timestamp value had non-zero intraday milliseconds");
+ }
+ out_data[i] -= remainder;
+ bit_reader.Next();
+ }
+ } else {
+ for (int64_t i = 0; i < input.length; ++i) {
+ const int64_t remainder = out_data[i] % kMillisecondsInDay;
+ if (ARROW_PREDICT_FALSE(!options.allow_time_truncate && remainder > 0)) {
+ return Status::Invalid("Timestamp value had non-zero intraday milliseconds");
+ }
+ out_data[i] -= remainder;
+ }
+ }
+
+ return Status::OK();
+ }
+};
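+
+// Editorial example: date64 must hold a whole number of days in milliseconds,
+// so a timestamp[ms] slot at 2021-01-01T12:00:00 fails a safe cast to date64;
+// with allow_time_truncate set, the 43200000 intraday milliseconds are
+// subtracted instead, per the loop above.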
+
+// ----------------------------------------------------------------------
+// From one time32 or time64 to another
+
+template <typename O, typename I>
+struct CastFunctor<O, I, enable_if_t<is_time_type<I>::value && is_time_type<O>::value>> {
+ using in_t = typename I::c_type;
+ using out_t = typename O::c_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ const ArrayData& input = *batch[0].array();
+ ArrayData* output = out->mutable_array();
+
+ // If units are the same, zero copy, otherwise convert
+ const auto& in_type = checked_cast<const I&>(*input.type);
+ const auto& out_type = checked_cast<const O&>(*output->type);
+ DCHECK_NE(in_type.unit(), out_type.unit()) << "Do not cast equal types";
+ auto conversion = util::GetTimestampConversion(in_type.unit(), out_type.unit());
+ return ShiftTime<in_t, out_t>(ctx, conversion.first, conversion.second, input,
+ output);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Between date32 and date64
+
+template <>
+struct CastFunctor<Date64Type, Date32Type> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ return ShiftTime<int32_t, int64_t>(ctx, util::MULTIPLY, kMillisecondsInDay,
+ *batch[0].array(), out->mutable_array());
+ }
+};
+
+template <>
+struct CastFunctor<Date32Type, Date64Type> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ return ShiftTime<int64_t, int32_t>(ctx, util::DIVIDE, kMillisecondsInDay,
+ *batch[0].array(), out->mutable_array());
+ }
+};
+
+// ----------------------------------------------------------------------
+// date32, date64 to timestamp
+
+template <>
+struct CastFunctor<TimestampType, Date32Type> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ const auto& out_type = checked_cast<const TimestampType&>(*out->type());
+ // get conversion SECOND -> unit
+ auto conversion = util::GetTimestampConversion(TimeUnit::SECOND, out_type.unit());
+ DCHECK_EQ(conversion.first, util::MULTIPLY);
+
+ // multiply to achieve days -> unit
+ conversion.second *= kMillisecondsInDay / 1000;
+ return ShiftTime<int32_t, int64_t>(ctx, util::MULTIPLY, conversion.second,
+ *batch[0].array(), out->mutable_array());
+ }
+};
+
+template <>
+struct CastFunctor<TimestampType, Date64Type> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ const auto& out_type = checked_cast<const TimestampType&>(*out->type());
+
+ // date64 is ms since epoch
+ auto conversion = util::GetTimestampConversion(TimeUnit::MILLI, out_type.unit());
+ return ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second,
+ *batch[0].array(), out->mutable_array());
+ }
+};
+
+// ----------------------------------------------------------------------
+// String to Timestamp
+
+struct ParseTimestamp {
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
+ OutValue result = 0;
+ if (ARROW_PREDICT_FALSE(!ParseValue(type, val.data(), val.size(), &result))) {
+ *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
+ type.ToString());
+ }
+ return result;
+ }
+
+ const TimestampType& type;
+};
+
+template <typename I>
+struct CastFunctor<TimestampType, I, enable_if_t<is_base_binary_type<I>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& out_type = checked_cast<const TimestampType&>(*out->type());
+ applicator::ScalarUnaryNotNullStateful<TimestampType, I, ParseTimestamp> kernel(
+ ParseTimestamp{out_type});
+ return kernel.Exec(ctx, batch, out);
+ }
+};
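+
+// Editorial sketch: ParseValue accepts ISO-8601-style strings, so each slot of
+// a string array is parsed individually and any unparsable value fails the
+// cast. Assuming the public API and a hypothetical `string_array`:
+//
+//   ARROW_ASSIGN_OR_RAISE(Datum ts,
+//                         Cast(string_array, timestamp(TimeUnit::MILLI)));
+//   // "2021-01-01 00:00:00" -> 1609459200000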
+
+template <typename Type>
+void AddCrossUnitCast(CastFunction* func) {
+ ScalarKernel kernel;
+ kernel.exec = TrivialScalarUnaryAsArraysExec(CastFunctor<Type, Type>::Exec);
+ kernel.signature = KernelSignature::Make({InputType(Type::type_id)}, kOutputTargetType);
+ DCHECK_OK(func->AddKernel(Type::type_id, std::move(kernel)));
+}
+
+std::shared_ptr<CastFunction> GetDate32Cast() {
+ auto func = std::make_shared<CastFunction>("cast_date32", Type::DATE32);
+ auto out_ty = date32();
+ AddCommonCasts(Type::DATE32, out_ty, func.get());
+
+ // int32 -> date32
+ AddZeroCopyCast(Type::INT32, int32(), date32(), func.get());
+
+ // date64 -> date32
+ AddSimpleCast<Date64Type, Date32Type>(date64(), date32(), func.get());
+
+ // timestamp -> date32
+ AddSimpleCast<TimestampType, Date32Type>(InputType(Type::TIMESTAMP), date32(),
+ func.get());
+ return func;
+}
+
+std::shared_ptr<CastFunction> GetDate64Cast() {
+ auto func = std::make_shared<CastFunction>("cast_date64", Type::DATE64);
+ auto out_ty = date64();
+ AddCommonCasts(Type::DATE64, out_ty, func.get());
+
+ // int64 -> date64
+ AddZeroCopyCast(Type::INT64, int64(), date64(), func.get());
+
+ // date32 -> date64
+ AddSimpleCast<Date32Type, Date64Type>(date32(), date64(), func.get());
+
+ // timestamp -> date64
+ AddSimpleCast<TimestampType, Date64Type>(InputType(Type::TIMESTAMP), date64(),
+ func.get());
+ return func;
+}
+
+std::shared_ptr<CastFunction> GetDurationCast() {
+ auto func = std::make_shared<CastFunction>("cast_duration", Type::DURATION);
+ AddCommonCasts(Type::DURATION, kOutputTargetType, func.get());
+
+ // Same integer representation
+ AddZeroCopyCast(Type::INT64, /*in_type=*/int64(), kOutputTargetType, func.get());
+
+ // Between durations
+ AddCrossUnitCast<DurationType>(func.get());
+
+ return func;
+}
+
+std::shared_ptr<CastFunction> GetTime32Cast() {
+ auto func = std::make_shared<CastFunction>("cast_time32", Type::TIME32);
+ AddCommonCasts(Type::TIME32, kOutputTargetType, func.get());
+
+  // int32 -> time32: same underlying integer representation, so zero copy
+ AddZeroCopyCast(Type::INT32, /*in_type=*/int32(), kOutputTargetType, func.get());
+
+ // time64 -> time32
+ AddSimpleCast<Time64Type, Time32Type>(InputType(Type::TIME64), kOutputTargetType,
+ func.get());
+
+ // time32 -> time32
+ AddCrossUnitCast<Time32Type>(func.get());
+
+ return func;
+}
+
+std::shared_ptr<CastFunction> GetTime64Cast() {
+ auto func = std::make_shared<CastFunction>("cast_time64", Type::TIME64);
+ AddCommonCasts(Type::TIME64, kOutputTargetType, func.get());
+
+  // int64 -> time64: same underlying integer representation, so zero copy
+ AddZeroCopyCast(Type::INT64, /*in_type=*/int64(), kOutputTargetType, func.get());
+
+ // time32 -> time64
+ AddSimpleCast<Time32Type, Time64Type>(InputType(Type::TIME32), kOutputTargetType,
+ func.get());
+
+  // Between time64 units
+ AddCrossUnitCast<Time64Type>(func.get());
+
+ return func;
+}
+
+std::shared_ptr<CastFunction> GetTimestampCast() {
+ auto func = std::make_shared<CastFunction>("cast_timestamp", Type::TIMESTAMP);
+ AddCommonCasts(Type::TIMESTAMP, kOutputTargetType, func.get());
+
+ // Same integer representation
+ AddZeroCopyCast(Type::INT64, /*in_type=*/int64(), kOutputTargetType, func.get());
+
+ // From date types
+ // TODO: ARROW-8876, these casts are not directly tested
+ AddSimpleCast<Date32Type, TimestampType>(InputType(Type::DATE32), kOutputTargetType,
+ func.get());
+ AddSimpleCast<Date64Type, TimestampType>(InputType(Type::DATE64), kOutputTargetType,
+ func.get());
+
+ // string -> timestamp
+ AddSimpleCast<StringType, TimestampType>(utf8(), kOutputTargetType, func.get());
+ // large_string -> timestamp
+ AddSimpleCast<LargeStringType, TimestampType>(large_utf8(), kOutputTargetType,
+ func.get());
+
+ // From one timestamp to another
+ AddCrossUnitCast<TimestampType>(func.get());
+
+ return func;
+}
+
+std::vector<std::shared_ptr<CastFunction>> GetTemporalCasts() {
+ std::vector<std::shared_ptr<CastFunction>> functions;
+
+ functions.push_back(GetDate32Cast());
+ functions.push_back(GetDate64Cast());
+ functions.push_back(GetDurationCast());
+ functions.push_back(GetTime32Cast());
+ functions.push_back(GetTime64Cast());
+ functions.push_back(GetTimestampCast());
+ return functions;
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc
new file mode 100644
index 00000000000..4342d776c38
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc
@@ -0,0 +1,524 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cmath>
+#include <limits>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bitmap_ops.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+using util::string_view;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+struct Equal {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ return left == right;
+ }
+};
+
+struct NotEqual {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ return left != right;
+ }
+};
+
+struct Greater {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ return left > right;
+ }
+};
+
+struct GreaterEqual {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ return left >= right;
+ }
+};
+
+template <typename T>
+using is_unsigned_integer = std::integral_constant<bool, std::is_integral<T>::value &&
+ std::is_unsigned<T>::value>;
+
+template <typename T>
+using is_signed_integer =
+ std::integral_constant<bool, std::is_integral<T>::value && std::is_signed<T>::value>;
+
+template <typename T>
+using enable_if_integer =
+ enable_if_t<is_signed_integer<T>::value || is_unsigned_integer<T>::value, T>;
+
+template <typename T>
+using enable_if_floating_point = enable_if_t<std::is_floating_point<T>::value, T>;
+
+struct Minimum {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::fmin(left, right);
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::min(left, right);
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<float, T>::value, T> antiextreme() {
+ return std::nanf("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<double, T>::value, T> antiextreme() {
+ return std::nan("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_integer<T> antiextreme() {
+ return std::numeric_limits<T>::max();
+ }
+};
+
+struct Maximum {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::fmax(left, right);
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::max(left, right);
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<float, T>::value, T> antiextreme() {
+ return std::nanf("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<double, T>::value, T> antiextreme() {
+ return std::nan("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_integer<T> antiextreme() {
+ return std::numeric_limits<T>::min();
+ }
+};
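+
+// Editorial note: antiextreme() is the identity element for the running
+// element-wise fold below: std::fmax(NaN, x) == x for any valid x, so NaN is
+// neutral for floating-point max (and std::fmin likewise), while
+// numeric_limits<T>::min()/max() are neutral for integer max/min.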
+
+// Implement Less, LessEqual by flipping arguments to Greater, GreaterEqual
+
+template <typename Op>
+void AddIntegerCompare(const std::shared_ptr<DataType>& ty, ScalarFunction* func) {
+ auto exec =
+ GeneratePhysicalInteger<applicator::ScalarBinaryEqualTypes, BooleanType, Op>(*ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, boolean(), std::move(exec)));
+}
+
+template <typename InType, typename Op>
+void AddGenericCompare(const std::shared_ptr<DataType>& ty, ScalarFunction* func) {
+ DCHECK_OK(
+ func->AddKernel({ty, ty}, boolean(),
+ applicator::ScalarBinaryEqualTypes<BooleanType, InType, Op>::Exec));
+}
+
+struct CompareFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+ ReplaceNullWithOtherType(values);
+
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ } else if (auto type = CommonTimestamp(*values)) {
+ ReplaceTypes(type, values);
+ } else if (auto type = CommonBinary(*values)) {
+ ReplaceTypes(type, values);
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
+struct VarArgsCompareFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ } else if (auto type = CommonTimestamp(*values)) {
+ ReplaceTypes(type, values);
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeCompareFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<CompareFunction>(name, Arity::Binary(), doc);
+
+ DCHECK_OK(func->AddKernel(
+ {boolean(), boolean()}, boolean(),
+ applicator::ScalarBinary<BooleanType, BooleanType, BooleanType, Op>::Exec));
+
+ for (const std::shared_ptr<DataType>& ty : IntTypes()) {
+ AddIntegerCompare<Op>(ty, func.get());
+ }
+ AddIntegerCompare<Op>(date32(), func.get());
+ AddIntegerCompare<Op>(date64(), func.get());
+
+ AddGenericCompare<FloatType, Op>(float32(), func.get());
+ AddGenericCompare<DoubleType, Op>(float64(), func.get());
+
+ // Add timestamp kernels
+ for (auto unit : AllTimeUnits()) {
+ InputType in_type(match::TimestampTypeUnit(unit));
+ auto exec =
+ GeneratePhysicalInteger<applicator::ScalarBinaryEqualTypes, BooleanType, Op>(
+ int64());
+ DCHECK_OK(func->AddKernel({in_type, in_type}, boolean(), std::move(exec)));
+ }
+
+ // Duration
+ for (auto unit : AllTimeUnits()) {
+ InputType in_type(match::DurationTypeUnit(unit));
+ auto exec =
+ GeneratePhysicalInteger<applicator::ScalarBinaryEqualTypes, BooleanType, Op>(
+ int64());
+ DCHECK_OK(func->AddKernel({in_type, in_type}, boolean(), std::move(exec)));
+ }
+
+ // Time32 and Time64
+ for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI}) {
+ InputType in_type(match::Time32TypeUnit(unit));
+ auto exec =
+ GeneratePhysicalInteger<applicator::ScalarBinaryEqualTypes, BooleanType, Op>(
+ int32());
+ DCHECK_OK(func->AddKernel({in_type, in_type}, boolean(), std::move(exec)));
+ }
+ for (auto unit : {TimeUnit::MICRO, TimeUnit::NANO}) {
+ InputType in_type(match::Time64TypeUnit(unit));
+ auto exec =
+ GeneratePhysicalInteger<applicator::ScalarBinaryEqualTypes, BooleanType, Op>(
+ int64());
+ DCHECK_OK(func->AddKernel({in_type, in_type}, boolean(), std::move(exec)));
+ }
+
+ for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+ auto exec =
+ GenerateVarBinaryBase<applicator::ScalarBinaryEqualTypes, BooleanType, Op>(*ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, boolean(), std::move(exec)));
+ }
+
+ return func;
+}
+
+std::shared_ptr<ScalarFunction> MakeFlippedFunction(std::string name,
+ const ScalarFunction& func,
+ const FunctionDoc* doc) {
+ auto flipped_func = std::make_shared<CompareFunction>(name, Arity::Binary(), doc);
+ for (const ScalarKernel* kernel : func.kernels()) {
+ ScalarKernel flipped_kernel = *kernel;
+ flipped_kernel.exec = MakeFlippedBinaryExec(kernel->exec);
+ DCHECK_OK(flipped_func->AddKernel(std::move(flipped_kernel)));
+ }
+ return flipped_func;
+}
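+
+// Editorial sketch: the flip above is how "less" reuses the "greater" kernels,
+// i.e. less(x, y) executes greater(y, x) without a separate Less operator
+// struct. Usage, assuming CallFunction from the public API:
+//
+//   ARROW_ASSIGN_OR_RAISE(Datum mask, CallFunction("less", {x, y}));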
+
+using MinMaxState = OptionsWrapper<ElementWiseAggregateOptions>;
+
+// Implement a variadic scalar min/max kernel.
+template <typename OutType, typename Op>
+struct ScalarMinMax {
+ using OutValue = typename GetOutputType<OutType>::T;
+
+ static void ExecScalar(const ExecBatch& batch,
+ const ElementWiseAggregateOptions& options, Scalar* out) {
+ // All arguments are scalar
+ OutValue value{};
+ bool valid = false;
+ for (const auto& arg : batch.values) {
+      // Ignore non-scalar arguments so this helper can also serve the
+      // mixed-scalar-and-array case
+ if (!arg.is_scalar()) continue;
+ const auto& scalar = *arg.scalar();
+ if (!scalar.is_valid) {
+ if (options.skip_nulls) continue;
+ out->is_valid = false;
+ return;
+ }
+ if (!valid) {
+ value = UnboxScalar<OutType>::Unbox(scalar);
+ valid = true;
+ } else {
+ value = Op::template Call<OutValue, OutValue, OutValue>(
+ value, UnboxScalar<OutType>::Unbox(scalar));
+ }
+ }
+ out->is_valid = valid;
+ if (valid) {
+ BoxScalar<OutType>::Box(value, out);
+ }
+ }
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ const auto descrs = batch.GetDescriptors();
+ const size_t scalar_count =
+ static_cast<size_t>(std::count_if(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); }));
+ if (scalar_count == batch.values.size()) {
+ ExecScalar(batch, options, out->scalar().get());
+ return Status::OK();
+ }
+
+ ArrayData* output = out->mutable_array();
+
+ // At least one array, two or more arguments
+ ArrayDataVector arrays;
+ for (const auto& arg : batch.values) {
+ if (!arg.is_array()) continue;
+ arrays.push_back(arg.array());
+ }
+
+ bool initialize_output = true;
+ if (scalar_count > 0) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> temp_scalar,
+ MakeScalar(out->type(), 0));
+ ExecScalar(batch, options, temp_scalar.get());
+ if (temp_scalar->is_valid) {
+ const auto value = UnboxScalar<OutType>::Unbox(*temp_scalar);
+ initialize_output = false;
+ OutValue* out = output->GetMutableValues<OutValue>(1);
+ std::fill(out, out + batch.length, value);
+ } else if (!options.skip_nulls) {
+ // Abort early
+ ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*temp_scalar, batch.length,
+ ctx->memory_pool()));
+ *output = *array->data();
+ return Status::OK();
+ }
+ }
+
+ if (initialize_output) {
+ OutValue* out = output->GetMutableValues<OutValue>(1);
+ std::fill(out, out + batch.length, Op::template antiextreme<OutValue>());
+ }
+
+ // Precompute the validity buffer
+ if (options.skip_nulls && initialize_output) {
+ // OR together the validity buffers of all arrays
+ if (std::all_of(arrays.begin(), arrays.end(),
+ [](const std::shared_ptr<ArrayData>& arr) {
+ return arr->MayHaveNulls();
+ })) {
+ for (const auto& arr : arrays) {
+ if (!arr->MayHaveNulls()) continue;
+ if (!output->buffers[0]) {
+ ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(batch.length));
+          ::arrow::internal::CopyBitmap(arr->buffers[0]->data(), arr->offset,
+                                        batch.length,
+                                        output->buffers[0]->mutable_data(),
+                                        /*dest_offset=*/0);
+ } else {
+ ::arrow::internal::BitmapOr(
+ output->buffers[0]->data(), /*left_offset=*/0, arr->buffers[0]->data(),
+ arr->offset, batch.length,
+ /*out_offset=*/0, output->buffers[0]->mutable_data());
+ }
+ }
+ }
+ } else if (!options.skip_nulls) {
+ // AND together the validity buffers of all arrays
+ for (const auto& arr : arrays) {
+ if (!arr->MayHaveNulls()) continue;
+ if (!output->buffers[0]) {
+ ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(batch.length));
+ ::arrow::internal::CopyBitmap(arr->buffers[0]->data(), arr->offset,
+ batch.length, output->buffers[0]->mutable_data(),
+ /*dest_offset=*/0);
+ } else {
+ ::arrow::internal::BitmapAnd(output->buffers[0]->data(), /*left_offset=*/0,
+ arr->buffers[0]->data(), arr->offset, batch.length,
+ /*out_offset=*/0,
+ output->buffers[0]->mutable_data());
+ }
+ }
+ }
+
+ for (const auto& array : arrays) {
+ OutputArrayWriter<OutType> writer(out->mutable_array());
+ ArrayIterator<OutType> out_it(*output);
+ int64_t index = 0;
+ VisitArrayValuesInline<OutType>(
+ *array,
+ [&](OutValue value) {
+ auto u = out_it();
+ if (!output->buffers[0] ||
+ BitUtil::GetBit(output->buffers[0]->data(), index)) {
+ writer.Write(Op::template Call<OutValue, OutValue, OutValue>(u, value));
+ } else {
+ writer.Write(value);
+ }
+ index++;
+ },
+ [&]() {
+ // RHS is null, preserve the LHS
+ writer.values++;
+ index++;
+ out_it();
+ });
+ }
+ output->null_count = output->buffers[0] ? -1 : 0;
+ return Status::OK();
+ }
+};
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeScalarMinMax(std::string name,
+ const FunctionDoc* doc) {
+ static auto default_element_wise_aggregate_options =
+ ElementWiseAggregateOptions::Defaults();
+
+ auto func = std::make_shared<VarArgsCompareFunction>(
+ name, Arity::VarArgs(), doc, &default_element_wise_aggregate_options);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = GeneratePhysicalNumeric<ScalarMinMax, Op>(ty);
+ ScalarKernel kernel{KernelSignature::Make({ty}, ty, /*is_varargs=*/true), exec,
+ MinMaxState::Init};
+ kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ for (const auto& ty : TemporalTypes()) {
+ auto exec = GeneratePhysicalNumeric<ScalarMinMax, Op>(ty);
+ ScalarKernel kernel{KernelSignature::Make({ty}, ty, /*is_varargs=*/true), exec,
+ MinMaxState::Init};
+ kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ return func;
+}
+
+const FunctionDoc equal_doc{"Compare values for equality (x == y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc not_equal_doc{"Compare values for inequality (x != y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc greater_doc{"Compare values for ordered inequality (x > y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc greater_equal_doc{
+ "Compare values for ordered inequality (x >= y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc less_doc{"Compare values for ordered inequality (x < y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc less_equal_doc{
+ "Compare values for ordered inequality (x <= y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc min_element_wise_doc{
+ "Find the element-wise minimum value",
+ ("Nulls will be ignored (default) or propagated. "
+ "NaN will be taken over null, but not over any valid float."),
+ {"*args"},
+ "ElementWiseAggregateOptions"};
+
+const FunctionDoc max_element_wise_doc{
+ "Find the element-wise maximum value",
+ ("Nulls will be ignored (default) or propagated. "
+ "NaN will be taken over null, but not over any valid float."),
+ {"*args"},
+ "ElementWiseAggregateOptions"};
+} // namespace
+
+void RegisterScalarComparison(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunction(MakeCompareFunction<Equal>("equal", &equal_doc)));
+ DCHECK_OK(
+ registry->AddFunction(MakeCompareFunction<NotEqual>("not_equal", &not_equal_doc)));
+
+ auto greater = MakeCompareFunction<Greater>("greater", &greater_doc);
+ auto greater_equal =
+ MakeCompareFunction<GreaterEqual>("greater_equal", &greater_equal_doc);
+
+ auto less = MakeFlippedFunction("less", *greater, &less_doc);
+ auto less_equal = MakeFlippedFunction("less_equal", *greater_equal, &less_equal_doc);
+ DCHECK_OK(registry->AddFunction(std::move(less)));
+ DCHECK_OK(registry->AddFunction(std::move(less_equal)));
+ DCHECK_OK(registry->AddFunction(std::move(greater)));
+ DCHECK_OK(registry->AddFunction(std::move(greater_equal)));
+
+ // ----------------------------------------------------------------------
+ // Variadic element-wise functions
+
+ auto min_element_wise =
+ MakeScalarMinMax<Minimum>("min_element_wise", &min_element_wise_doc);
+ DCHECK_OK(registry->AddFunction(std::move(min_element_wise)));
+
+ auto max_element_wise =
+ MakeScalarMinMax<Maximum>("max_element_wise", &max_element_wise_doc);
+ DCHECK_OK(registry->AddFunction(std::move(max_element_wise)));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc
new file mode 100644
index 00000000000..cf22b0de3dc
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstring>
+
+#include "arrow/compute/kernels/common.h"
+#include "arrow/scalar.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+
+namespace arrow {
+
+using internal::BitBlockCount;
+using internal::BitBlockCounter;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+template <typename Type, typename Enable = void>
+struct FillNullFunctor {};
+
+// Numeric inputs
+
+template <typename Type>
+struct FillNullFunctor<Type, enable_if_t<is_number_type<Type>::value>> {
+ using T = typename TypeTraits<Type>::CType;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ArrayData& data = *batch[0].array();
+ const Scalar& fill_value = *batch[1].scalar();
+ ArrayData* output = out->mutable_array();
+
+ // Ensure the kernel is configured properly to have no validity bitmap /
+ // null count 0 unless we explicitly propagate it below.
+ DCHECK(output->buffers[0] == nullptr);
+
+ T value = UnboxScalar<Type>::Unbox(fill_value);
+    if (data.MayHaveNulls() && fill_value.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
+ ctx->Allocate(data.length * sizeof(T)));
+
+ const uint8_t* is_valid = data.buffers[0]->data();
+ const T* in_values = data.GetValues<T>(1);
+ T* out_values = reinterpret_cast<T*>(out_buf->mutable_data());
+ int64_t offset = data.offset;
+ BitBlockCounter bit_counter(is_valid, data.offset, data.length);
+ while (offset < data.offset + data.length) {
+ BitBlockCount block = bit_counter.NextWord();
+ if (block.AllSet()) {
+ // Block all not null
+ std::memcpy(out_values, in_values, block.length * sizeof(T));
+ } else if (block.NoneSet()) {
+ // Block all null
+ std::fill(out_values, out_values + block.length, value);
+ } else {
+ for (int64_t i = 0; i < block.length; ++i) {
+ out_values[i] = BitUtil::GetBit(is_valid, offset + i) ? in_values[i] : value;
+ }
+ }
+ offset += block.length;
+ out_values += block.length;
+ in_values += block.length;
+ }
+ output->buffers[1] = out_buf;
+ output->null_count = 0;
+ } else {
+ *output = data;
+ }
+
+ return Status::OK();
+ }
+};
+
+// Boolean input
+
+template <typename Type>
+struct FillNullFunctor<Type, enable_if_t<is_boolean_type<Type>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ArrayData& data = *batch[0].array();
+ const Scalar& fill_value = *batch[1].scalar();
+ ArrayData* output = out->mutable_array();
+
+ bool value = UnboxScalar<BooleanType>::Unbox(fill_value);
+    if (data.MayHaveNulls() && fill_value.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
+ ctx->AllocateBitmap(data.length));
+
+ const uint8_t* is_valid = data.buffers[0]->data();
+ const uint8_t* data_bitmap = data.buffers[1]->data();
+ uint8_t* out_bitmap = out_buf->mutable_data();
+
+ int64_t data_offset = data.offset;
+ BitBlockCounter bit_counter(is_valid, data.offset, data.length);
+
+ int64_t out_offset = 0;
+ while (out_offset < data.length) {
+ BitBlockCount block = bit_counter.NextWord();
+ if (block.AllSet()) {
+ // Block all not null
+ ::arrow::internal::CopyBitmap(data_bitmap, data_offset, block.length,
+ out_bitmap, out_offset);
+ } else if (block.NoneSet()) {
+ // Block all null
+ BitUtil::SetBitsTo(out_bitmap, out_offset, block.length, value);
+ } else {
+ for (int64_t i = 0; i < block.length; ++i) {
+ BitUtil::SetBitTo(out_bitmap, out_offset + i,
+ BitUtil::GetBit(is_valid, data_offset + i)
+ ? BitUtil::GetBit(data_bitmap, data_offset + i)
+ : value);
+ }
+ }
+ data_offset += block.length;
+ out_offset += block.length;
+ }
+ output->buffers[1] = out_buf;
+ output->null_count = 0;
+ } else {
+ *output = data;
+ }
+
+ return Status::OK();
+ }
+};
+
+// Null input
+
+template <typename Type>
+struct FillNullFunctor<Type, enable_if_t<is_null_type<Type>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // Nothing preallocated, so we assign into the output
+ *out->mutable_array() = *batch[0].array();
+ return Status::OK();
+ }
+};
+
+// Binary-like input
+
+template <typename Type>
+struct FillNullFunctor<Type, enable_if_t<is_base_binary_type<Type>::value>> {
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ArrayData& input = *batch[0].array();
+ const auto& fill_value_scalar =
+ checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+ ArrayData* output = out->mutable_array();
+
+ // Ensure the kernel is configured properly to have no validity bitmap /
+ // null count 0 unless we explicitly propagate it below.
+ DCHECK(output->buffers[0] == nullptr);
+
+ const int64_t null_count = input.GetNullCount();
+
+ if (null_count > 0 && fill_value_scalar.is_valid) {
+ util::string_view fill_value(*fill_value_scalar.value);
+ BuilderType builder(input.type, ctx->memory_pool());
+ RETURN_NOT_OK(builder.ReserveData(input.buffers[2]->size() +
+ fill_value.length() * null_count));
+ RETURN_NOT_OK(builder.Resize(input.length));
+
+ VisitArrayDataInline<Type>(
+ input, [&](util::string_view s) { builder.UnsafeAppend(s); },
+ [&]() { builder.UnsafeAppend(fill_value); });
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *output = *string_array->data();
+ // The builder does not match the logical type, due to
+ // GenerateTypeAgnosticVarBinaryBase
+ output->type = input.type;
+ } else {
+ *output = input;
+ }
+
+ return Status::OK();
+ }
+};
+
+void AddBasicFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
+ auto AddKernels = [&](const std::vector<std::shared_ptr<DataType>>& types) {
+ for (const std::shared_ptr<DataType>& ty : types) {
+ kernel.signature =
+ KernelSignature::Make({InputType::Array(ty), InputType::Scalar(ty)}, ty);
+ kernel.exec = GenerateTypeAgnosticPrimitive<FillNullFunctor>(*ty);
+ DCHECK_OK(func->AddKernel(kernel));
+ }
+ };
+ AddKernels(NumericTypes());
+ AddKernels(TemporalTypes());
+ AddKernels({boolean(), null()});
+}
+
+void AddBinaryFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
+ for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+ kernel.signature =
+ KernelSignature::Make({InputType::Array(ty), InputType::Scalar(ty)}, ty);
+ kernel.exec = GenerateTypeAgnosticVarBinaryBase<FillNullFunctor>(*ty);
+ DCHECK_OK(func->AddKernel(kernel));
+ }
+}
+
+const FunctionDoc fill_null_doc{
+ "Replace null elements",
+ ("`fill_value` must be a scalar of the same type as `values`.\n"
+ "Each non-null value in `values` is emitted as-is.\n"
+ "Each null value in `values` is replaced with `fill_value`."),
+ {"values", "fill_value"}};
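+
+// For illustration, the documented semantics imply:
+//   fill_null([1, null, 2, null], 5)    -> [1, 5, 2, 5]
+//   fill_null([1, null, 2, null], null) -> [1, null, 2, null]   (the input is
+//   returned unchanged when fill_value is null)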
+
+} // namespace
+
+void RegisterScalarFillNull(FunctionRegistry* registry) {
+ {
+ ScalarKernel fill_null_base;
+ fill_null_base.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ fill_null_base.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ auto fill_null =
+ std::make_shared<ScalarFunction>("fill_null", Arity::Binary(), &fill_null_doc);
+ AddBasicFillNullKernels(fill_null_base, fill_null.get());
+ AddBinaryFillNullKernels(fill_null_base, fill_null.get());
+ DCHECK_OK(registry->AddFunction(fill_null));
+ }
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc
new file mode 100644
index 00000000000..ff308a673a3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -0,0 +1,1730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/compute/api.h>
+#include <arrow/compute/kernels/codegen_internal.h>
+#include <arrow/compute/util_internal.h>
+#include <arrow/util/bit_block_counter.h>
+#include <arrow/util/bitmap.h>
+#include <arrow/util/bitmap_ops.h>
+#include <arrow/util/bitmap_reader.h>
+
+namespace arrow {
+using internal::BitBlockCount;
+using internal::BitBlockCounter;
+using internal::Bitmap;
+using internal::BitmapWordReader;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+constexpr uint64_t kAllNull = 0;
+constexpr uint64_t kAllValid = ~kAllNull;
+
+util::optional<uint64_t> GetConstantValidityWord(const Datum& data) {
+ if (data.is_scalar()) {
+ return data.scalar()->is_valid ? kAllValid : kAllNull;
+ }
+
+ if (data.array()->null_count == data.array()->length) return kAllNull;
+
+ if (!data.array()->MayHaveNulls()) return kAllValid;
+
+ // no constant validity word available
+ return {};
+}
+
+inline Bitmap GetBitmap(const Datum& datum, int i) {
+ if (datum.is_scalar()) return {};
+ const ArrayData& a = *datum.array();
+ return Bitmap{a.buffers[i], a.offset, a.length};
+}
+
+// If the condition is null, the output is null; otherwise the validity is taken
+// from the selected argument,
+// i.e. cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
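+//
+// A worked single-word example (illustrative only, 4-bit view): with
+//   cond.valid = 0b1110, cond.data = 0b0101, left.valid = 0b1111, right.valid = 0b0011
+// the result is 0b1110 & ((0b0101 & 0b1111) | (0b1010 & 0b0011)) = 0b0110.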
+template <typename AllocateNullBitmap>
+Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum& left_d,
+ const Datum& right_d, ArrayData* output) {
+ auto cond_const = GetConstantValidityWord(cond_d);
+ auto left_const = GetConstantValidityWord(left_d);
+ auto right_const = GetConstantValidityWord(right_d);
+
+ enum { COND_CONST = 1, LEFT_CONST = 2, RIGHT_CONST = 4 };
+ auto flag = COND_CONST * cond_const.has_value() | LEFT_CONST * left_const.has_value() |
+ RIGHT_CONST * right_const.has_value();
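+  // e.g. an all-valid cond array with a scalar left argument and a right array that
+  // may contain nulls yields flag == COND_CONST | LEFT_CONST == 3, dispatching to
+  // the matching case in the switch below.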
+
+ const ArrayData& cond = *cond_d.array();
+ // cond.data will always be available
+ Bitmap cond_data{cond.buffers[1], cond.offset, cond.length};
+ Bitmap cond_valid{cond.buffers[0], cond.offset, cond.length};
+ Bitmap left_valid = GetBitmap(left_d, 0);
+ Bitmap right_valid = GetBitmap(right_d, 0);
+
+  // cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
+  // In the following cases, we don't need to allocate the out_valid bitmap.
+
+  // If cond, left and right are all valid, then the output is all valid:
+  // if the output validity buffer is already allocated (NullHandling::
+  // COMPUTED_PREALLOCATE) -> set all bits;
+  // else, leave the output validity buffer as nullptr.
+ if (cond_const == kAllValid && left_const == kAllValid && right_const == kAllValid) {
+ if (AllocateNullBitmap::value) { // NullHandling::COMPUTED_NO_PREALLOCATE
+ output->buffers[0] = nullptr;
+ } else { // NullHandling::COMPUTED_PREALLOCATE
+ BitUtil::SetBitmap(output->buffers[0]->mutable_data(), output->offset,
+ output->length);
+ }
+ return Status::OK();
+ }
+
+ if (left_const == kAllValid && right_const == kAllValid) {
+    // if both left and right are all valid, there is no need to calculate the
+    // out_valid bitmap; copy the cond validity buffer instead
+ if (AllocateNullBitmap::value) { // NullHandling::COMPUTED_NO_PREALLOCATE
+ // if there's an offset, copy bitmap (cannot slice a bitmap)
+ if (cond.offset) {
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[0],
+ arrow::internal::CopyBitmap(ctx->memory_pool(), cond.buffers[0]->data(),
+ cond.offset, cond.length));
+ } else { // just copy assign cond validity buffer
+ output->buffers[0] = cond.buffers[0];
+ }
+ } else { // NullHandling::COMPUTED_PREALLOCATE
+ arrow::internal::CopyBitmap(cond.buffers[0]->data(), cond.offset, cond.length,
+ output->buffers[0]->mutable_data(), output->offset);
+ }
+ return Status::OK();
+ }
+
+ // lambda function that will be used inside the visitor
+ auto apply = [&](uint64_t c_valid, uint64_t c_data, uint64_t l_valid,
+ uint64_t r_valid) {
+ return c_valid & ((c_data & l_valid) | (~c_data & r_valid));
+ };
+
+ if (AllocateNullBitmap::value) {
+    // The following cases require a separate out_valid buffer;
+    // COMPUTED_NO_PREALLOCATE would not have allocated one.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(cond.length));
+ }
+
+ std::array<Bitmap, 1> out_bitmaps{
+ Bitmap{output->buffers[0], output->offset, output->length}};
+
+ switch (flag) {
+ case COND_CONST | LEFT_CONST | RIGHT_CONST: {
+ std::array<Bitmap, 1> bitmaps{cond_data};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 1>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ *left_const, *right_const);
+ });
+ break;
+ }
+ case LEFT_CONST | RIGHT_CONST: {
+ std::array<Bitmap, 2> bitmaps{cond_valid, cond_data};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 2>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ *left_const, *right_const);
+ });
+ break;
+ }
+ case COND_CONST | RIGHT_CONST: {
+      // cond_valid and right_valid may be null buffers; their constant words are
+      // used instead, so only cond_data and left_valid need to be visited
+ std::array<Bitmap, 2> bitmaps{cond_data, left_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 2>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ words_in[1], *right_const);
+ });
+ break;
+ }
+ case RIGHT_CONST: {
+      // right_valid may be a null buffer; its constant word is used instead
+ std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, left_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 3>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ words_in[2], *right_const);
+ });
+ break;
+ }
+ case COND_CONST | LEFT_CONST: {
+      // cond_valid and left_valid may be null buffers; their constant words are
+      // used instead, so only cond_data and right_valid need to be visited
+ std::array<Bitmap, 2> bitmaps{cond_data, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 2>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ *left_const, words_in[1]);
+ });
+ break;
+ }
+ case LEFT_CONST: {
+      // left_valid may be a null buffer; its constant word is used instead
+ std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 3>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ *left_const, words_in[2]);
+ });
+ break;
+ }
+ case COND_CONST: {
+      // cond_valid may be a null buffer; its constant word is used instead
+ std::array<Bitmap, 3> bitmaps{cond_data, left_valid, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 3>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ words_in[1], words_in[2]);
+ });
+ break;
+ }
+ case 0: {
+ std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 4>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ words_in[2], words_in[3]);
+ });
+ break;
+ }
+ }
+ return Status::OK();
+}
+
+using Word = uint64_t;
+static constexpr int64_t word_len = sizeof(Word) * 8;
+
+/// Runs the main if_else loop. Here, it is expected that the right data has already
+/// been copied to the output.
+/// `invert` inverts the interpretation of cond.data: if it is set to `true`, the
+/// condition bits are negated before calling the handle_block function.
+/// This is useful when left is an array and right is a scalar: rather than copying
+/// data from the right to the output, we can copy the left data to the output and
+/// invert the cond data to fill in the right values. Filling with a scalar is
+/// presumed to be more efficient than filling with an array.
+///
+/// `HandleBlock` has the signature:
+/// [](int64_t offset, int64_t length){...}
+/// It should copy `length` elements from the source array to the output array,
+/// starting at `offset` in both arrays
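+///
+/// For instance (illustrative), the numeric kernels below pass a handler like:
+///   [&](int64_t offset, int64_t length) {
+///     std::memcpy(out_values + offset, left_data + offset, length * sizeof(T));
+///   }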
+template <typename HandleBlock, bool invert = false>
+void RunIfElseLoop(const ArrayData& cond, const HandleBlock& handle_block) {
+ int64_t data_offset = 0;
+ int64_t bit_offset = cond.offset;
+ const auto* cond_data = cond.buffers[1]->data(); // this is a BoolArray
+
+ BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
+
+ constexpr Word pickAll = invert ? 0 : UINT64_MAX;
+ constexpr Word pickNone = ~pickAll;
+
+ int64_t cnt = cond_reader.words();
+ while (cnt--) {
+ Word word = cond_reader.NextWord();
+
+ if (word == pickAll) {
+ handle_block(data_offset, word_len);
+ } else if (word != pickNone) {
+ for (int64_t i = 0; i < word_len; ++i) {
+ if (BitUtil::GetBit(cond_data, bit_offset + i) != invert) {
+ handle_block(data_offset + i, 1);
+ }
+ }
+ }
+ data_offset += word_len;
+ bit_offset += word_len;
+ }
+
+ constexpr uint8_t pickAllByte = invert ? 0 : UINT8_MAX;
+  // byte-wise bit inversion promotes to int, hence the XOR with 0xff instead
+ constexpr uint8_t pickNoneByte = pickAllByte ^ 0xff;
+
+ cnt = cond_reader.trailing_bytes();
+ while (cnt--) {
+ int valid_bits;
+ uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+
+ if (byte == pickAllByte && valid_bits == 8) {
+ handle_block(data_offset, 8);
+ } else if (byte != pickNoneByte) {
+ for (int i = 0; i < valid_bits; ++i) {
+ if (BitUtil::GetBit(cond_data, bit_offset + i) != invert) {
+ handle_block(data_offset + i, 1);
+ }
+ }
+ }
+ data_offset += 8;
+ bit_offset += 8;
+ }
+}
+
+template <typename HandleBlock>
+void RunIfElseLoopInverted(const ArrayData& cond, const HandleBlock& handle_block) {
+ RunIfElseLoop<HandleBlock, true>(cond, handle_block);
+}
+
+/// Runs if-else when cond is a scalar. Two specialized functions are required:
+/// 1. CopyArrayData, 2. BroadcastScalar
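+/// (Illustrative) For numeric types, CopyArrayData reduces to a std::memcpy of the
+/// valid side's values, and BroadcastScalar to a std::fill of the unboxed scalar.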
+template <typename CopyArrayData, typename BroadcastScalar>
+Status RunIfElseScalar(const BooleanScalar& cond, const Datum& left, const Datum& right,
+ Datum* out, const CopyArrayData& copy_array_data,
+ const BroadcastScalar& broadcast_scalar) {
+ if (left.is_scalar() && right.is_scalar()) { // output will be a scalar
+ if (cond.is_valid) {
+ *out = cond.value ? left.scalar() : right.scalar();
+ } else {
+ *out = MakeNullScalar(left.type());
+ }
+ return Status::OK();
+ }
+
+  // either left or right is an array; the output is always an array
+ const std::shared_ptr<ArrayData>& out_array = out->array();
+ if (!cond.is_valid) {
+ // cond is null; output is all null --> clear validity buffer
+ BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ return Status::OK();
+ }
+
+ // cond is a non-null scalar
+ const auto& valid_data = cond.value ? left : right;
+ if (valid_data.is_array()) {
+ // valid_data is an array. Hence copy data to the output buffers
+ const auto& valid_array = valid_data.array();
+ if (valid_array->MayHaveNulls()) {
+ arrow::internal::CopyBitmap(
+ valid_array->buffers[0]->data(), valid_array->offset, valid_array->length,
+ out_array->buffers[0]->mutable_data(), out_array->offset);
+ } else { // validity buffer is nullptr --> set all bits
+ BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ }
+ copy_array_data(*valid_array, out_array.get());
+ return Status::OK();
+
+ } else { // valid data is scalar
+ // valid data is a scalar that needs to be broadcasted
+ const auto& valid_scalar = *valid_data.scalar();
+ if (valid_scalar.is_valid) { // if the scalar is non-null, broadcast
+ BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ broadcast_scalar(*valid_data.scalar(), out_array.get());
+ } else { // scalar is null, clear the output validity buffer
+ BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ }
+ return Status::OK();
+ }
+}
+
+template <typename Type, typename Enable = void>
+struct IfElseFunctor {};
+
+// Only number types need to be handled for fixed-size primitive data types, because
+// internal::GenerateTypeAgnosticPrimitive forwards other types to the corresponding
+// unsigned int type of the same width
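+// (e.g. date32() and float32() both dispatch to the 32-bit integer specialization;
+// only the common fixed width matters here, not the numeric semantics)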
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_number<Type>> {
+ using T = typename TypeTraits<Type>::CType;
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ return RunIfElseScalar(
+ cond, left, right, out,
+ /*CopyArrayData*/
+ [&](const ArrayData& valid_array, ArrayData* out_array) {
+ std::memcpy(out_array->GetMutableValues<T>(1), valid_array.GetValues<T>(1),
+ valid_array.length * sizeof(T));
+ },
+ /*BroadcastScalar*/
+ [&](const Scalar& scalar, ArrayData* out_array) {
+ T scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
+ std::fill(out_array->GetMutableValues<T>(1),
+ out_array->GetMutableValues<T>(1) + out_array->length, scalar_data);
+ });
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy right data to out_buff
+ const T* right_data = right.GetValues<T>(1);
+ std::memcpy(out_values, right_data, right.length * sizeof(T));
+
+ // selectively copy values from left data
+ const T* left_data = left.GetValues<T>(1);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::memcpy(out_values + data_offset, left_data + data_offset,
+ num_elems * sizeof(T));
+ });
+
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy right data to out_buff
+ const T* right_data = right.GetValues<T>(1);
+ std::memcpy(out_values, right_data, right.length * sizeof(T));
+
+ // selectively copy values from left data
+ T left_data = internal::UnboxScalar<Type>::Unbox(left);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+ left_data);
+ });
+
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy left data to out_buff
+ const T* left_data = left.GetValues<T>(1);
+ std::memcpy(out_values, left_data, left.length * sizeof(T));
+
+ T right_data = internal::UnboxScalar<Type>::Unbox(right);
+
+ RunIfElseLoopInverted(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+ right_data);
+ });
+
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy right data to out_buff
+ T right_data = internal::UnboxScalar<Type>::Unbox(right);
+ std::fill(out_values, out_values + cond.length, right_data);
+
+ // selectively copy values from left data
+ T left_data = internal::UnboxScalar<Type>::Unbox(left);
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+ left_data);
+ });
+
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_boolean<Type>> {
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ return RunIfElseScalar(
+ cond, left, right, out,
+ /*CopyArrayData*/
+ [&](const ArrayData& valid_array, ArrayData* out_array) {
+ arrow::internal::CopyBitmap(
+ valid_array.buffers[1]->data(), valid_array.offset, valid_array.length,
+ out_array->buffers[1]->mutable_data(), out_array->offset);
+ },
+ /*BroadcastScalar*/
+ [&](const Scalar& scalar, ArrayData* out_array) {
+ bool scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
+ BitUtil::SetBitsTo(out_array->buffers[1]->mutable_data(), out_array->offset,
+ out_array->length, scalar_data);
+ });
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ // out_buff = right & ~cond
+ const auto& out_buf = out->buffers[1];
+ arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length,
+ out->offset, out_buf->mutable_data());
+
+ // out_buff = left & cond
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> temp_buf,
+ arrow::internal::BitmapAnd(
+ ctx->memory_pool(), left.buffers[1]->data(), left.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length, 0));
+
+ arrow::internal::BitmapOr(out_buf->data(), out->offset, temp_buf->data(), 0,
+ cond.length, out->offset, out_buf->mutable_data());
+
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ // out_buff = right & ~cond
+ const auto& out_buf = out->buffers[1];
+ arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length,
+ out->offset, out_buf->mutable_data());
+
+ // out_buff = left & cond
+ bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
+ if (left_data) {
+ arrow::internal::BitmapOr(out_buf->data(), out->offset, cond.buffers[1]->data(),
+ cond.offset, cond.length, out->offset,
+ out_buf->mutable_data());
+ }
+
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ // out_buff = left & cond
+ const auto& out_buf = out->buffers[1];
+ arrow::internal::BitmapAnd(left.buffers[1]->data(), left.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length,
+ out->offset, out_buf->mutable_data());
+
+ bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
+
+ // out_buff = left & cond | right & ~cond
+ if (right_data) {
+ arrow::internal::BitmapOrNot(out_buf->data(), out->offset, cond.buffers[1]->data(),
+ cond.offset, cond.length, out->offset,
+ out_buf->mutable_data());
+ }
+
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
+ bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
+
+ const auto& out_buf = out->buffers[1];
+
+ // out_buf = left & cond | right & ~cond
+ if (left_data) {
+ if (right_data) {
+ // out_buf = ones
+ BitUtil::SetBitmap(out_buf->mutable_data(), out->offset, cond.length);
+ } else {
+ // out_buf = cond
+ arrow::internal::CopyBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
+ out_buf->mutable_data(), out->offset);
+ }
+ } else {
+ if (right_data) {
+ // out_buf = ~cond
+ arrow::internal::InvertBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
+ out_buf->mutable_data(), out->offset);
+ } else {
+ // out_buf = zeros
+ BitUtil::ClearBitmap(out_buf->mutable_data(), out->offset, cond.length);
+ }
+ }
+
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+ using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ if (left.is_scalar() && right.is_scalar()) {
+ if (cond.is_valid) {
+ *out = cond.value ? left.scalar() : right.scalar();
+ } else {
+ *out = MakeNullScalar(left.type());
+ }
+ return Status::OK();
+ }
+ // either left or right is an array. Output is always an array
+ int64_t out_arr_len = std::max(left.length(), right.length());
+ if (!cond.is_valid) {
+ // cond is null; just create a null array
+ ARROW_ASSIGN_OR_RAISE(*out,
+ MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()))
+ return Status::OK();
+ }
+
+ const auto& valid_data = cond.value ? left : right;
+ if (valid_data.is_array()) {
+ *out = valid_data;
+ } else {
+ // valid data is a scalar that needs to be broadcasted
+ ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+ ctx->memory_pool()));
+ }
+ return Status::OK();
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ const auto* left_offsets = left.GetValues<OffsetType>(1);
+ const uint8_t* left_data = left.buffers[2]->data();
+ const auto* right_offsets = right.GetValues<OffsetType>(1);
+ const uint8_t* right_data = right.buffers[2]->data();
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc = left_offsets[left.length] - left_offsets[0] +
+ right_offsets[right.length] - right_offsets[0];
+
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out,
+ [&](int64_t i) {
+ builder.UnsafeAppend(left_data + left_offsets[i],
+ left_offsets[i + 1] - left_offsets[i]);
+ },
+ [&](int64_t i) {
+ builder.UnsafeAppend(right_data + right_offsets[i],
+ right_offsets[i + 1] - right_offsets[i]);
+ },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
+ auto left_size = static_cast<OffsetType>(left_data.size());
+
+ const auto* right_offsets = right.GetValues<OffsetType>(1);
+ const uint8_t* right_data = right.buffers[2]->data();
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc =
+ left_size * cond.length + right_offsets[right.length] - right_offsets[0];
+
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out, [&](int64_t i) { builder.UnsafeAppend(left_data.data(), left_size); },
+ [&](int64_t i) {
+ builder.UnsafeAppend(right_data + right_offsets[i],
+ right_offsets[i + 1] - right_offsets[i]);
+ },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ const auto* left_offsets = left.GetValues<OffsetType>(1);
+ const uint8_t* left_data = left.buffers[2]->data();
+
+ util::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
+ auto right_size = static_cast<OffsetType>(right_data.size());
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc =
+ right_size * cond.length + left_offsets[left.length] - left_offsets[0];
+
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out,
+ [&](int64_t i) {
+ builder.UnsafeAppend(left_data + left_offsets[i],
+ left_offsets[i + 1] - left_offsets[i]);
+ },
+ [&](int64_t i) { builder.UnsafeAppend(right_data.data(), right_size); },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
+ auto left_size = static_cast<OffsetType>(left_data.size());
+
+ util::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
+ auto right_size = static_cast<OffsetType>(right_data.size());
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc = std::max(right_size, left_size) * cond.length;
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out, [&](int64_t i) { builder.UnsafeAppend(left_data.data(), left_size); },
+ [&](int64_t i) { builder.UnsafeAppend(right_data.data(), right_size); },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ template <typename HandleLeft, typename HandleRight, typename HandleNull>
+ static void RunLoop(const ArrayData& cond, const ArrayData& output,
+ HandleLeft&& handle_left, HandleRight&& handle_right,
+ HandleNull&& handle_null) {
+ const auto* cond_data = cond.buffers[1]->data();
+
+ if (output.buffers[0]) { // output may have nulls
+      // The output validity buffer is allocated internally by the IfElseFunctor, so
+      // it has length cond.length and offset 0.
+ const auto* out_valid = output.buffers[0]->data();
+
+ for (int64_t i = 0; i < cond.length; i++) {
+ if (BitUtil::GetBit(out_valid, i)) {
+ BitUtil::GetBit(cond_data, cond.offset + i) ? handle_left(i) : handle_right(i);
+ } else {
+ handle_null();
+ }
+ }
+ } else { // output is all valid (no nulls)
+ for (int64_t i = 0; i < cond.length; i++) {
+ BitUtil::GetBit(cond_data, cond.offset + i) ? handle_left(i) : handle_right(i);
+ }
+ }
+ }
+};
+
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_fixed_size_binary<Type>> {
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type(), *right.type()));
+ return RunIfElseScalar(
+ cond, left, right, out,
+ /*CopyArrayData*/
+ [&](const ArrayData& valid_array, ArrayData* out_array) {
+ std::memcpy(
+ out_array->buffers[1]->mutable_data() + out_array->offset * byte_width,
+ valid_array.buffers[1]->data() + valid_array.offset * byte_width,
+ valid_array.length * byte_width);
+ },
+ /*BroadcastScalar*/
+ [&](const Scalar& scalar, ArrayData* out_array) {
+ const util::string_view& scalar_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(scalar);
+ uint8_t* start =
+ out_array->buffers[1]->mutable_data() + out_array->offset * byte_width;
+ for (int64_t i = 0; i < out_array->length; i++) {
+ std::memcpy(start + i * byte_width, scalar_data.data(), scalar_data.size());
+ }
+ });
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy right data to out_buff
+ const uint8_t* right_data = right.buffers[1]->data() + right.offset * byte_width;
+ std::memcpy(out_values, right_data, right.length * byte_width);
+
+ // selectively copy values from left data
+ const uint8_t* left_data = left.buffers[1]->data() + left.offset * byte_width;
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::memcpy(out_values + data_offset * byte_width,
+ left_data + data_offset * byte_width, num_elems * byte_width);
+ });
+
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy right data to out_buff
+ const uint8_t* right_data = right.buffers[1]->data() + right.offset * byte_width;
+ std::memcpy(out_values, right_data, right.length * byte_width);
+
+ // selectively copy values from left data
+ const util::string_view& left_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(left);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ if (left_data.data()) {
+ for (int64_t i = 0; i < num_elems; i++) {
+ std::memcpy(out_values + (data_offset + i) * byte_width, left_data.data(),
+ left_data.size());
+ }
+ }
+ });
+
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy left data to out_buff
+ const uint8_t* left_data = left.buffers[1]->data() + left.offset * byte_width;
+ std::memcpy(out_values, left_data, left.length * byte_width);
+
+ const util::string_view& right_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(right);
+
+ RunIfElseLoopInverted(cond, [&](int64_t data_offset, int64_t num_elems) {
+ if (right_data.data()) {
+ for (int64_t i = 0; i < num_elems; i++) {
+ std::memcpy(out_values + (data_offset + i) * byte_width, right_data.data(),
+ right_data.size());
+ }
+ }
+ });
+
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy right data to out_buff
+ const util::string_view& right_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(right);
+ if (right_data.data()) {
+ for (int64_t i = 0; i < cond.length; i++) {
+ std::memcpy(out_values + i * byte_width, right_data.data(), right_data.size());
+ }
+ }
+
+ // selectively copy values from left data
+ const util::string_view& left_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(left);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ if (left_data.data()) {
+ for (int64_t i = 0; i < num_elems; i++) {
+ std::memcpy(out_values + (data_offset + i) * byte_width, left_data.data(),
+ left_data.size());
+ }
+ }
+ });
+
+ return Status::OK();
+ }
+
+ static Result<int32_t> GetByteWidth(const DataType& left_type,
+ const DataType& right_type) {
+ int width = checked_cast<const FixedSizeBinaryType&>(left_type).byte_width();
+ if (width == checked_cast<const FixedSizeBinaryType&>(right_type).byte_width()) {
+ return width;
+ } else {
+ return Status::Invalid("FixedSizeBinaryType byte_widths should be equal");
+ }
+ }
+};
+
+template <typename Type, typename AllocateMem>
+struct ResolveIfElseExec {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // cond is scalar
+ if (batch[0].is_scalar()) {
+ const auto& cond = batch[0].scalar_as<BooleanScalar>();
+ return IfElseFunctor<Type>::Call(ctx, cond, batch[1], batch[2], out);
+ }
+
+    // cond is an array: promote nulls into the output validity bitmap, then
+    // dispatch on the left/right shapes
+ ARROW_RETURN_NOT_OK(PromoteNullsVisitor<AllocateMem>(ctx, batch[0], batch[1],
+ batch[2], out->mutable_array()));
+
+ if (batch[1].kind() == Datum::ARRAY) {
+ if (batch[2].kind() == Datum::ARRAY) { // AAA
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].array(),
+ *batch[2].array(), out->mutable_array());
+ } else { // AAS
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].array(),
+ *batch[2].scalar(), out->mutable_array());
+ }
+ } else {
+ if (batch[2].kind() == Datum::ARRAY) { // ASA
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].scalar(),
+ *batch[2].array(), out->mutable_array());
+ } else { // ASS
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].scalar(),
+ *batch[2].scalar(), out->mutable_array());
+ }
+ }
+ }
+};
+
+template <typename AllocateMem>
+struct ResolveIfElseExec<NullType, AllocateMem> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // if all are scalars, return a null scalar
+ if (batch[0].is_scalar() && batch[1].is_scalar() && batch[2].is_scalar()) {
+ *out = MakeNullScalar(null());
+ } else {
+ ARROW_ASSIGN_OR_RAISE(*out,
+ MakeArrayOfNull(null(), batch.length, ctx->memory_pool()));
+ }
+ return Status::OK();
+ }
+};
+
+struct IfElseFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ // if 0th descriptor is null, replace with bool
+ if (values->at(0).type->id() == Type::NA) {
+ values->at(0).type = boolean();
+ }
+
+    // if_else's 0th descriptor is bool, so skip it
+ std::vector<ValueDescr> values_copy(values->begin() + 1, values->end());
+ internal::EnsureDictionaryDecoded(&values_copy);
+ internal::ReplaceNullWithOtherType(&values_copy);
+
+ if (auto type = internal::CommonNumeric(values_copy)) {
+ internal::ReplaceTypes(type, &values_copy);
+ }
+
+ std::move(values_copy.begin(), values_copy.end(), values->begin() + 1);
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
+void AddNullIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
+ ScalarKernel kernel({boolean(), null(), null()}, null(),
+ ResolveIfElseExec<NullType,
+ /*AllocateMem=*/std::true_type>::Exec);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ kernel.can_write_into_slices = false;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+void AddPrimitiveIfElseKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec =
+ internal::GenerateTypeAgnosticPrimitive<ResolveIfElseExec,
+ /*AllocateMem=*/std::false_type>(*type);
+ // cond array needs to be boolean always
+ ScalarKernel kernel({boolean(), type, type}, type, exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = true;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+ }
+}
+
+void AddBinaryIfElseKernels(const std::shared_ptr<IfElseFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec =
+ internal::GenerateTypeAgnosticVarBinaryBase<ResolveIfElseExec,
+ /*AllocateMem=*/std::true_type>(
+ *type);
+ // cond array needs to be boolean always
+ ScalarKernel kernel({boolean(), type, type}, type, exec);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ kernel.can_write_into_slices = false;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+ }
+}
+
+void AddFSBinaryIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
+ // cond array needs to be boolean always
+ ScalarKernel kernel(
+ {boolean(), InputType(Type::FIXED_SIZE_BINARY), InputType(Type::FIXED_SIZE_BINARY)},
+ OutputType([](KernelContext*, const std::vector<ValueDescr>& descrs) {
+ return ValueDescr(descrs[1].type, ValueDescr::ANY);
+ }),
+ ResolveIfElseExec<FixedSizeBinaryType, /*AllocateMem=*/std::false_type>::Exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = true;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+// Helper to copy or broadcast fixed-width values between buffers.
+template <typename Type, typename Enable = void>
+struct CopyFixedWidth {};
+template <>
+struct CopyFixedWidth<BooleanType> {
+ static void CopyScalar(const Scalar& scalar, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const bool value = UnboxScalar<BooleanType>::Unbox(scalar);
+ BitUtil::SetBitsTo(raw_out_values, out_offset, length, value);
+ }
+ static void CopyArray(const DataType&, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ arrow::internal::CopyBitmap(in_values, in_offset, length, raw_out_values, out_offset);
+ }
+};
+template <typename Type>
+struct CopyFixedWidth<Type, enable_if_number<Type>> {
+ using CType = typename TypeTraits<Type>::CType;
+ static void CopyScalar(const Scalar& scalar, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ CType* out_values = reinterpret_cast<CType*>(raw_out_values);
+ const CType value = UnboxScalar<Type>::Unbox(scalar);
+ std::fill(out_values + out_offset, out_values + out_offset + length, value);
+ }
+ static void CopyArray(const DataType&, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ std::memcpy(raw_out_values + out_offset * sizeof(CType),
+ in_values + in_offset * sizeof(CType), length * sizeof(CType));
+ }
+};
+template <typename Type>
+struct CopyFixedWidth<Type, enable_if_same<Type, FixedSizeBinaryType>> {
+ static void CopyScalar(const Scalar& values, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width =
+ checked_cast<const FixedSizeBinaryType&>(*values.type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ const auto& scalar = checked_cast<const FixedSizeBinaryScalar&>(values);
+ // Scalar may have null value buffer
+ if (!scalar.value) {
+ std::memset(next, 0x00, width * length);
+ } else {
+ DCHECK_EQ(scalar.value->size(), width);
+ for (int i = 0; i < length; i++) {
+ std::memcpy(next, scalar.value->data(), width);
+ next += width;
+ }
+ }
+ }
+ static void CopyArray(const DataType& type, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ std::memcpy(next, in_values + in_offset * width, length * width);
+ }
+};
+template <typename Type>
+struct CopyFixedWidth<Type, enable_if_decimal<Type>> {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ static void CopyScalar(const Scalar& values, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width =
+ checked_cast<const FixedSizeBinaryType&>(*values.type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ const auto& scalar = checked_cast<const ScalarType&>(values);
+ const auto value = scalar.value.ToBytes();
+ for (int i = 0; i < length; i++) {
+ std::memcpy(next, value.data(), width);
+ next += width;
+ }
+ }
+ static void CopyArray(const DataType& type, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ std::memcpy(next, in_values + in_offset * width, length * width);
+ }
+};
+// Copy fixed-width values from a scalar/array datum into an output values buffer
+template <typename Type>
+void CopyValues(const Datum& in_values, const int64_t in_offset, const int64_t length,
+ uint8_t* out_valid, uint8_t* out_values, const int64_t out_offset) {
+ if (in_values.is_scalar()) {
+ const auto& scalar = *in_values.scalar();
+ if (out_valid) {
+ BitUtil::SetBitsTo(out_valid, out_offset, length, scalar.is_valid);
+ }
+ CopyFixedWidth<Type>::CopyScalar(scalar, length, out_values, out_offset);
+ } else {
+ const ArrayData& array = *in_values.array();
+ if (out_valid) {
+ if (array.MayHaveNulls()) {
+ if (length == 1) {
+ // CopyBitmap is slow for short runs
+ BitUtil::SetBitTo(
+ out_valid, out_offset,
+ BitUtil::GetBit(array.buffers[0]->data(), array.offset + in_offset));
+ } else {
+ arrow::internal::CopyBitmap(array.buffers[0]->data(), array.offset + in_offset,
+ length, out_valid, out_offset);
+ }
+ } else {
+ BitUtil::SetBitsTo(out_valid, out_offset, length, true);
+ }
+ }
+ CopyFixedWidth<Type>::CopyArray(*array.type, array.buffers[1]->data(),
+ array.offset + in_offset, length, out_values,
+ out_offset);
+ }
+}
+
+// Specialized helper to copy a single value from a source array. Allows avoiding
+// repeatedly calling MayHaveNulls and Buffer::data() which have internal checks that
+// add up when called in a loop.
+template <typename Type>
+void CopyOneArrayValue(const DataType& type, const uint8_t* in_valid,
+ const uint8_t* in_values, const int64_t in_offset,
+ uint8_t* out_valid, uint8_t* out_values,
+ const int64_t out_offset) {
+ if (out_valid) {
+ BitUtil::SetBitTo(out_valid, out_offset,
+ !in_valid || BitUtil::GetBit(in_valid, in_offset));
+ }
+ CopyFixedWidth<Type>::CopyArray(type, in_values, in_offset, /*length=*/1, out_values,
+ out_offset);
+}
+
+struct CaseWhenFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+    // The first argument is a struct of booleans, where the number of fields in the
+    // struct is either equal to the number of other arguments or is one less.
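+    // e.g. case_when(struct<bool, bool>, x, y) has no ELSE value, while
+    // case_when(struct<bool>, x, y) treats y as the ELSE value (illustrative).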
+ RETURN_NOT_OK(CheckArity(*values));
+ EnsureDictionaryDecoded(values);
+ auto first_type = (*values)[0].type;
+ if (first_type->id() != Type::STRUCT) {
+ return Status::TypeError("case_when: first argument must be STRUCT, not ",
+ *first_type);
+ }
+ auto num_fields = static_cast<size_t>(first_type->num_fields());
+ if (num_fields < values->size() - 2 || num_fields >= values->size()) {
+ return Status::Invalid(
+ "case_when: number of struct fields must be equal to or one less than count of "
+ "remaining arguments (",
+ values->size() - 1, "), got: ", first_type->num_fields());
+ }
+ for (const auto& field : first_type->fields()) {
+ if (field->type()->id() != Type::BOOL) {
+ return Status::TypeError(
+ "case_when: all fields of first argument must be BOOL, but ", field->name(),
+ " was of type: ", *field->type());
+ }
+ }
+
+ if (auto type = CommonNumeric(values->data() + 1, values->size() - 1)) {
+ for (auto it = values->begin() + 1; it != values->end(); it++) {
+ it->type = type;
+ }
+ }
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
+// Implement a 'case when' (SQL)/'select' (NumPy) function for any scalar conditions
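+// e.g. (illustrative) with conds = {true, ...} the first value argument is selected;
+// if every condition is false or null and there is no ELSE argument, the output is
+// null.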
+template <typename Type>
+Status ExecScalarCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& conds = checked_cast<const StructScalar&>(*batch.values[0].scalar());
+ if (!conds.is_valid) {
+ return Status::Invalid("cond struct must not be null");
+ }
+ Datum result;
+ for (size_t i = 0; i < batch.values.size() - 1; i++) {
+ if (i < conds.value.size()) {
+ const Scalar& cond = *conds.value[i];
+ if (cond.is_valid && internal::UnboxScalar<BooleanType>::Unbox(cond)) {
+ result = batch[i + 1];
+ break;
+ }
+ } else {
+ // ELSE clause
+ result = batch[i + 1];
+ break;
+ }
+ }
+ if (out->is_scalar()) {
+ *out = result.is_scalar() ? result.scalar() : MakeNullScalar(out->type());
+ return Status::OK();
+ }
+ ArrayData* output = out->mutable_array();
+ if (!result.is_value()) {
+ // All conditions false, no 'else' argument
+ result = MakeNullScalar(out->type());
+ }
+ CopyValues<Type>(result, /*in_offset=*/0, batch.length,
+ output->GetMutableValues<uint8_t>(0, 0),
+ output->GetMutableValues<uint8_t>(1, 0), output->offset);
+ return Status::OK();
+}
+
+// Implement 'case when' for any mix of scalar/array arguments for any fixed-width type,
+// given helper functions to copy data from a source array to a target array
+template <typename Type>
+Status ExecArrayCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& conds_array = *batch.values[0].array();
+ if (conds_array.GetNullCount() > 0) {
+ return Status::Invalid("cond struct must not have top-level nulls");
+ }
+ ArrayData* output = out->mutable_array();
+ const int64_t out_offset = output->offset;
+ const auto num_value_args = batch.values.size() - 1;
+ const bool have_else_arg =
+ static_cast<size_t>(conds_array.type->num_fields()) < num_value_args;
+ uint8_t* out_valid = output->buffers[0]->mutable_data();
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+ if (have_else_arg) {
+ // Copy 'else' value into output
+ CopyValues<Type>(batch.values.back(), /*in_offset=*/0, batch.length, out_valid,
+ out_values, out_offset);
+ } else {
+ // There's no 'else' argument, so we should have an all-null validity bitmap
+ BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
+ }
+
+ // Allocate a temporary bitmap to determine which elements still need setting.
+ ARROW_ASSIGN_OR_RAISE(auto mask_buffer, ctx->AllocateBitmap(batch.length));
+ uint8_t* mask = mask_buffer->mutable_data();
+ std::memset(mask, 0xFF, mask_buffer->size());
+
+ // Then iterate through each argument in turn and set elements.
+ for (size_t i = 0; i < batch.values.size() - (have_else_arg ? 2 : 1); i++) {
+ const ArrayData& cond_array = *conds_array.child_data[i];
+ const int64_t cond_offset = conds_array.offset + cond_array.offset;
+ const uint8_t* cond_values = cond_array.buffers[1]->data();
+ const Datum& values_datum = batch[i + 1];
+ int64_t offset = 0;
+
+ if (cond_array.GetNullCount() == 0) {
+      // cond has no nulls, so visit the mask & cond value bitmaps simultaneously
+ BinaryBitBlockCounter counter(mask, /*start_offset=*/0, cond_values, cond_offset,
+ batch.length);
+ while (offset < batch.length) {
+ const auto block = counter.NextAndWord();
+ if (block.AllSet()) {
+ CopyValues<Type>(values_datum, offset, block.length, out_valid, out_values,
+ out_offset + offset);
+ BitUtil::SetBitsTo(mask, offset, block.length, false);
+ } else if (block.popcount) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (BitUtil::GetBit(mask, offset + j) &&
+ BitUtil::GetBit(cond_values, cond_offset + offset + j)) {
+ CopyValues<Type>(values_datum, offset + j, /*length=*/1, out_valid,
+ out_values, out_offset + offset + j);
+ BitUtil::SetBitTo(mask, offset + j, false);
+ }
+ }
+ }
+ offset += block.length;
+ }
+ } else {
+ // Visit mask & cond bitmap & cond validity
+ const uint8_t* cond_valid = cond_array.buffers[0]->data();
+ Bitmap bitmaps[3] = {{mask, /*offset=*/0, batch.length},
+ {cond_values, cond_offset, batch.length},
+ {cond_valid, cond_offset, batch.length}};
+ Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 3> words) {
+ const uint64_t word = words[0] & words[1] & words[2];
+ const int64_t block_length = std::min<int64_t>(64, batch.length - offset);
+ if (word == std::numeric_limits<uint64_t>::max()) {
+ CopyValues<Type>(values_datum, offset, block_length, out_valid, out_values,
+ out_offset + offset);
+ BitUtil::SetBitsTo(mask, offset, block_length, false);
+ } else if (word) {
+ for (int64_t j = 0; j < block_length; ++j) {
+ if (BitUtil::GetBit(mask, offset + j) &&
+ BitUtil::GetBit(cond_valid, cond_offset + offset + j) &&
+ BitUtil::GetBit(cond_values, cond_offset + offset + j)) {
+ CopyValues<Type>(values_datum, offset + j, /*length=*/1, out_valid,
+ out_values, out_offset + offset + j);
+ BitUtil::SetBitTo(mask, offset + j, false);
+ }
+ }
+ }
+ });
+ }
+ }
+ if (!have_else_arg) {
+ // Need to initialize any remaining null slots (uninitialized memory)
+ BitBlockCounter counter(mask, /*offset=*/0, batch.length);
+ int64_t offset = 0;
+ auto bit_width = checked_cast<const FixedWidthType&>(*out->type()).bit_width();
+ auto byte_width = BitUtil::BytesForBits(bit_width);
+ while (offset < batch.length) {
+ const auto block = counter.NextWord();
+ if (block.AllSet()) {
+ if (bit_width == 1) {
+ BitUtil::SetBitsTo(out_values, out_offset + offset, block.length, false);
+ } else {
+ std::memset(out_values + (out_offset + offset) * byte_width, 0x00,
+ byte_width * block.length);
+ }
+ } else if (!block.NoneSet()) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (BitUtil::GetBit(out_valid, out_offset + offset + j)) continue;
+ if (bit_width == 1) {
+ BitUtil::ClearBit(out_values, out_offset + offset + j);
+ } else {
+ std::memset(out_values + (out_offset + offset + j) * byte_width, 0x00,
+ byte_width);
+ }
+ }
+ }
+ offset += block.length;
+ }
+ }
+ return Status::OK();
+}
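+
+// A minimal usage sketch (hypothetical datums; the "case_when" function is
+// registered below): with a one-field struct-of-booleans `cond` and two value
+// datums, the second value acts as the "else" case:
+//
+//   ARROW_ASSIGN_OR_RAISE(Datum result,
+//                         CallFunction("case_when", {cond, values, fallback}));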
+
+template <typename Type, typename Enable = void>
+struct CaseWhenFunctor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch.values[0].is_array()) {
+ return ExecArrayCaseWhen<Type>(ctx, batch, out);
+ }
+ return ExecScalarCaseWhen<Type>(ctx, batch, out);
+ }
+};
+
+template <>
+struct CaseWhenFunctor<NullType> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::OK();
+ }
+};
+
+struct CoalesceFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ EnsureDictionaryDecoded(values);
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ }
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
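+
+// For example, coalesce(int32, float64) is dispatched to the float64 kernel:
+// CommonNumeric promotes both descriptors before the second exact-dispatch
+// attempt.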
+
+// Implement a 'coalesce' (SQL) operator for any number of scalar inputs
+Status ExecScalarCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ for (const auto& datum : batch.values) {
+ if (datum.scalar()->is_valid) {
+ *out = datum;
+ break;
+ }
+ }
+ return Status::OK();
+}
+
+// Helper: copy from a source datum into all null slots of the output
+template <typename Type>
+void CopyValuesAllValid(Datum source, uint8_t* out_valid, uint8_t* out_values,
+ const int64_t out_offset, const int64_t length) {
+ BitBlockCounter counter(out_valid, out_offset, length);
+ int64_t offset = 0;
+ while (offset < length) {
+ const auto block = counter.NextWord();
+ if (block.NoneSet()) {
+ CopyValues<Type>(source, offset, block.length, out_valid, out_values,
+ out_offset + offset);
+ } else if (!block.AllSet()) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (!BitUtil::GetBit(out_valid, out_offset + offset + j)) {
+ CopyValues<Type>(source, offset + j, 1, out_valid, out_values,
+ out_offset + offset + j);
+ }
+ }
+ }
+ offset += block.length;
+ }
+}
+
+// Helper: zero the values buffer of the output wherever the slot is null
+void InitializeNullSlots(const DataType& type, uint8_t* out_valid, uint8_t* out_values,
+ const int64_t out_offset, const int64_t length) {
+ BitBlockCounter counter(out_valid, out_offset, length);
+ int64_t offset = 0;
+ auto bit_width = checked_cast<const FixedWidthType&>(type).bit_width();
+ auto byte_width = BitUtil::BytesForBits(bit_width);
+ while (offset < length) {
+ const auto block = counter.NextWord();
+ if (block.NoneSet()) {
+ if (bit_width == 1) {
+ BitUtil::SetBitsTo(out_values, out_offset + offset, block.length, false);
+ } else {
+ std::memset(out_values + (out_offset + offset) * byte_width, 0x00,
+ byte_width * block.length);
+ }
+ } else if (!block.AllSet()) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (BitUtil::GetBit(out_valid, out_offset + offset + j)) continue;
+ if (bit_width == 1) {
+ BitUtil::ClearBit(out_values, out_offset + offset + j);
+ } else {
+ std::memset(out_values + (out_offset + offset + j) * byte_width, 0x00,
+ byte_width);
+ }
+ }
+ }
+ offset += block.length;
+ }
+}
+
+// Implement 'coalesce' for any mix of scalar/array arguments for any fixed-width type
+template <typename Type>
+Status ExecArrayCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ArrayData* output = out->mutable_array();
+ const int64_t out_offset = output->offset;
+ // Use output validity buffer as mask to decide what values to copy
+ uint8_t* out_valid = output->buffers[0]->mutable_data();
+ // Clear output buffer - no values are set initially
+ BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+
+ for (const auto& datum : batch.values) {
+ if ((datum.is_scalar() && datum.scalar()->is_valid) ||
+ (datum.is_array() && !datum.array()->MayHaveNulls())) {
+ // Valid scalar, or all-valid array
+ CopyValuesAllValid<Type>(datum, out_valid, out_values, out_offset, batch.length);
+ break;
+ } else if (datum.is_array()) {
+ // Array with nulls
+ const ArrayData& arr = *datum.array();
+ const DataType& type = *datum.type();
+ const uint8_t* in_valid = arr.buffers[0]->data();
+ const uint8_t* in_values = arr.buffers[1]->data();
+ BinaryBitBlockCounter counter(in_valid, arr.offset, out_valid, out_offset,
+ batch.length);
+ int64_t offset = 0;
+ while (offset < batch.length) {
+ const auto block = counter.NextAndNotWord();
+ if (block.AllSet()) {
+ CopyValues<Type>(datum, offset, block.length, out_valid, out_values,
+ out_offset + offset);
+ } else if (block.popcount) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (!BitUtil::GetBit(out_valid, out_offset + offset + j) &&
+ BitUtil::GetBit(in_valid, arr.offset + offset + j)) {
+ // This version lets us avoid calling MayHaveNulls() on every iteration
+ // (which does an atomic load and can add up)
+ CopyOneArrayValue<Type>(type, in_valid, in_values, arr.offset + offset + j,
+ out_valid, out_values, out_offset + offset + j);
+ }
+ }
+ }
+ offset += block.length;
+ }
+ }
+ }
+
+ // Initialize any remaining null slots (uninitialized memory)
+ InitializeNullSlots(*out->type(), out_valid, out_values, out_offset, batch.length);
+ return Status::OK();
+}
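+
+// For example (values shown logically):
+//   coalesce([null, 1, null], [2, null, null]) -> [2, 1, null]
+// Earlier arguments win wherever they are valid; later arguments only fill
+// slots that are still null.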
+
+template <typename Type, typename Enable = void>
+struct CoalesceFunctor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ for (const auto& datum : batch.values) {
+ if (datum.is_array()) {
+ return ExecArrayCoalesce<Type>(ctx, batch, out);
+ }
+ }
+ return ExecScalarCoalesce(ctx, batch, out);
+ }
+};
+
+template <>
+struct CoalesceFunctor<NullType> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct CoalesceFunctor<Type, enable_if_base_binary<Type>> {
+ using offset_type = typename Type::offset_type;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ for (const auto& datum : batch.values) {
+ if (datum.is_array()) {
+ return ExecArray(ctx, batch, out);
+ }
+ }
+ return ExecScalarCoalesce(ctx, batch, out);
+ }
+
+ static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // Special case: grab any leading non-null scalar or array arguments
+ for (const auto& datum : batch.values) {
+ if (datum.is_scalar()) {
+ if (!datum.scalar()->is_valid) continue;
+ ARROW_ASSIGN_OR_RAISE(
+ *out, MakeArrayFromScalar(*datum.scalar(), batch.length, ctx->memory_pool()));
+ return Status::OK();
+ } else if (datum.is_array() && !datum.array()->MayHaveNulls()) {
+ *out = datum;
+ return Status::OK();
+ }
+ break;
+ }
+ ArrayData* output = out->mutable_array();
+ BuilderType builder(batch[0].type(), ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ for (int64_t i = 0; i < batch.length; i++) {
+ bool set = false;
+ for (const auto& datum : batch.values) {
+ if (datum.is_scalar()) {
+ if (datum.scalar()->is_valid) {
+ RETURN_NOT_OK(builder.Append(UnboxScalar<Type>::Unbox(*datum.scalar())));
+ set = true;
+ break;
+ }
+ } else {
+ const ArrayData& source = *datum.array();
+ if (!source.MayHaveNulls() ||
+ BitUtil::GetBit(source.buffers[0]->data(), source.offset + i)) {
+ const uint8_t* data = source.buffers[2]->data();
+ const offset_type* offsets = source.GetValues<offset_type>(1);
+ const offset_type offset0 = offsets[i];
+ const offset_type offset1 = offsets[i + 1];
+ RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
+ set = true;
+ break;
+ }
+ }
+ }
+ if (!set) RETURN_NOT_OK(builder.AppendNull());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto temp_output, builder.Finish());
+ *output = *temp_output->data();
+ // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
+ output->type = batch[0].type();
+ return Status::OK();
+ }
+};
+
+Result<ValueDescr> LastType(KernelContext*, const std::vector<ValueDescr>& descrs) {
+ ValueDescr result = descrs.back();
+ result.shape = GetBroadcastShape(descrs);
+ return result;
+}
+
+void AddCaseWhenKernel(const std::shared_ptr<CaseWhenFunction>& scalar_function,
+ detail::GetTypeId get_id, ArrayKernelExec exec) {
+ ScalarKernel kernel(
+ KernelSignature::Make({InputType(Type::STRUCT), InputType(get_id.id)},
+ OutputType(LastType),
+ /*is_varargs=*/true),
+ exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = is_fixed_width(get_id.id);
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+void AddPrimitiveCaseWhenKernels(const std::shared_ptr<CaseWhenFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec = GenerateTypeAgnosticPrimitive<CaseWhenFunctor>(*type);
+ AddCaseWhenKernel(scalar_function, type, std::move(exec));
+ }
+}
+
+void AddCoalesceKernel(const std::shared_ptr<ScalarFunction>& scalar_function,
+ detail::GetTypeId get_id, ArrayKernelExec exec) {
+ ScalarKernel kernel(KernelSignature::Make({InputType(get_id.id)}, OutputType(FirstType),
+ /*is_varargs=*/true),
+ exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = is_fixed_width(get_id.id);
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+void AddPrimitiveCoalesceKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec = GenerateTypeAgnosticPrimitive<CoalesceFunctor>(*type);
+ AddCoalesceKernel(scalar_function, type, std::move(exec));
+ }
+}
+
+const FunctionDoc if_else_doc{"Choose values based on a condition",
+                              ("`cond` must be a Boolean scalar or array.\n"
+                               "`left` and `right` must be scalars or arrays of "
+                               "the same type.\n"
+                               "Null values in `cond` will be promoted to the "
+                               "output."),
+                              {"cond", "left", "right"}};
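+
+// For example:
+//   if_else([true, false, null], [1, 2, 3], [4, 5, 6]) -> [1, 5, null]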
+
+const FunctionDoc case_when_doc{
+ "Choose values based on multiple conditions",
+ ("`cond` must be a struct of Boolean values. `cases` can be a mix "
+ "of scalar and array arguments (of any type, but all must be the "
+ "same type or castable to a common type), with either exactly one "
+ "datum per child of `cond`, or one more `cases` than children of "
+ "`cond` (in which case we have an \"else\" value).\n"
+ "Each row of the output will be the corresponding value of the "
+ "first datum in `cases` for which the corresponding child of `cond` "
+ "is true, or otherwise the \"else\" value (if given), or null. "
+ "Essentially, this implements a switch-case or if-else, if-else... "
+ "statement."),
+ {"cond", "*cases"}};
+
+const FunctionDoc coalesce_doc{
+ "Select the first non-null value in each slot",
+ ("Each row of the output will be the value from the first corresponding input "
+ "for which the value is not null. If all inputs are null in a row, the output "
+ "will be null."),
+ {"*values"}};
+} // namespace
+
+void RegisterScalarIfElse(FunctionRegistry* registry) {
+ {
+ auto func =
+ std::make_shared<IfElseFunction>("if_else", Arity::Ternary(), &if_else_doc);
+
+ AddPrimitiveIfElseKernels(func, NumericTypes());
+ AddPrimitiveIfElseKernels(func, TemporalTypes());
+ AddPrimitiveIfElseKernels(func, {boolean(), day_time_interval(), month_interval()});
+ AddNullIfElseKernel(func);
+ AddBinaryIfElseKernels(func, BaseBinaryTypes());
+ AddFSBinaryIfElseKernel(func);
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<CaseWhenFunction>(
+ "case_when", Arity::VarArgs(/*min_args=*/1), &case_when_doc);
+ AddPrimitiveCaseWhenKernels(func, NumericTypes());
+ AddPrimitiveCaseWhenKernels(func, TemporalTypes());
+ AddPrimitiveCaseWhenKernels(
+ func, {boolean(), null(), day_time_interval(), month_interval()});
+ AddCaseWhenKernel(func, Type::FIXED_SIZE_BINARY,
+ CaseWhenFunctor<FixedSizeBinaryType>::Exec);
+ AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor<Decimal128Type>::Exec);
+ AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor<Decimal256Type>::Exec);
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<CoalesceFunction>(
+ "coalesce", Arity::VarArgs(/*min_args=*/1), &coalesce_doc);
+ AddPrimitiveCoalesceKernels(func, NumericTypes());
+ AddPrimitiveCoalesceKernels(func, TemporalTypes());
+ AddPrimitiveCoalesceKernels(
+ func, {boolean(), null(), day_time_interval(), month_interval()});
+ AddCoalesceKernel(func, Type::FIXED_SIZE_BINARY,
+ CoalesceFunctor<FixedSizeBinaryType>::Exec);
+ AddCoalesceKernel(func, Type::DECIMAL128, CoalesceFunctor<Decimal128Type>::Exec);
+ AddCoalesceKernel(func, Type::DECIMAL256, CoalesceFunctor<Decimal256Type>::Exec);
+ for (const auto& ty : BaseBinaryTypes()) {
+ AddCoalesceKernel(func, ty, GenerateTypeAgnosticVarBinaryBase<CoalesceFunctor>(ty));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc
new file mode 100644
index 00000000000..e9f0696c8fd
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Scalar kernels involving nested types
+
+#include "arrow/array/array_base.h"
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/result.h"
+#include "arrow/util/bit_block_counter.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+namespace {
+
+template <typename Type, typename offset_type = typename Type::offset_type>
+Status ListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using OffsetScalarType = typename TypeTraits<Type>::OffsetScalarType;
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ typename TypeTraits<Type>::ArrayType list(batch[0].array());
+ ArrayData* out_arr = out->mutable_array();
+ auto out_values = out_arr->GetMutableValues<offset_type>(1);
+ const offset_type* offsets = list.raw_value_offsets();
+ ::arrow::internal::VisitBitBlocksVoid(
+ list.data()->buffers[0], list.offset(), list.length(),
+ [&](int64_t position) {
+ *out_values++ = offsets[position + 1] - offsets[position];
+ },
+ [&]() { *out_values++ = 0; });
+ } else {
+ const auto& arg0 = batch[0].scalar_as<ScalarType>();
+ if (arg0.is_valid) {
+ checked_cast<OffsetScalarType*>(out->scalar().get())->value =
+ static_cast<offset_type>(arg0.value->length());
+ }
+ }
+
+ return Status::OK();
+}
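+
+// For example (nulls in `lists` emit nulls, per the doc below):
+//   list_value_length([[1, 2], null, []]) -> [2, null, 0]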
+
+const FunctionDoc list_value_length_doc{
+ "Compute list lengths",
+ ("`lists` must have a list-like type.\n"
+ "For each non-null value in `lists`, its length is emitted.\n"
+ "Null values emit a null in the output."),
+ {"lists"}};
+
+Result<ValueDescr> MakeStructResolve(KernelContext* ctx,
+ const std::vector<ValueDescr>& descrs) {
+ auto names = OptionsWrapper<MakeStructOptions>::Get(ctx).field_names;
+ auto nullable = OptionsWrapper<MakeStructOptions>::Get(ctx).field_nullability;
+ auto metadata = OptionsWrapper<MakeStructOptions>::Get(ctx).field_metadata;
+
+ if (names.size() == 0) {
+ names.resize(descrs.size());
+ nullable.resize(descrs.size(), true);
+ metadata.resize(descrs.size(), nullptr);
+ int i = 0;
+ for (auto& name : names) {
+ name = std::to_string(i++);
+ }
+ } else if (names.size() != descrs.size() || nullable.size() != descrs.size() ||
+ metadata.size() != descrs.size()) {
+ return Status::Invalid("make_struct() was passed ", descrs.size(), " arguments but ",
+ names.size(), " field names, ", nullable.size(),
+ " nullability bits, and ", metadata.size(),
+ " metadata dictionaries.");
+ }
+
+ size_t i = 0;
+ FieldVector fields(descrs.size());
+
+ ValueDescr::Shape shape = ValueDescr::SCALAR;
+ for (const ValueDescr& descr : descrs) {
+ if (descr.shape != ValueDescr::SCALAR) {
+ shape = ValueDescr::ARRAY;
+ } else {
+ switch (descr.type->id()) {
+ case Type::EXTENSION:
+ case Type::DENSE_UNION:
+ case Type::SPARSE_UNION:
+ return Status::NotImplemented("Broadcasting scalars of type ", *descr.type);
+ default:
+ break;
+ }
+ }
+
+ fields[i] =
+ field(std::move(names[i]), descr.type, nullable[i], std::move(metadata[i]));
+ ++i;
+ }
+
+ return ValueDescr{struct_(std::move(fields)), shape};
+}
+
+Status MakeStructExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(auto descr, MakeStructResolve(ctx, batch.GetDescriptors()));
+
+ for (int i = 0; i < batch.num_values(); ++i) {
+ const auto& field = checked_cast<const StructType&>(*descr.type).field(i);
+ if (batch[i].null_count() > 0 && !field->nullable()) {
+ return Status::Invalid("Output field ", field, " (#", i,
+ ") does not allow nulls but the corresponding "
+ "argument was not entirely valid.");
+ }
+ }
+
+ if (descr.shape == ValueDescr::SCALAR) {
+ ScalarVector scalars(batch.num_values());
+ for (int i = 0; i < batch.num_values(); ++i) {
+ scalars[i] = batch[i].scalar();
+ }
+
+ *out =
+ Datum(std::make_shared<StructScalar>(std::move(scalars), std::move(descr.type)));
+ return Status::OK();
+ }
+
+ ArrayVector arrays(batch.num_values());
+ for (int i = 0; i < batch.num_values(); ++i) {
+ if (batch[i].is_array()) {
+ arrays[i] = batch[i].make_array();
+ continue;
+ }
+
+ ARROW_ASSIGN_OR_RAISE(arrays[i], MakeArrayFromScalar(*batch[i].scalar(), batch.length,
+ ctx->memory_pool()));
+ }
+
+ *out = std::make_shared<StructArray>(descr.type, batch.length, std::move(arrays));
+ return Status::OK();
+}
+
+const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray",
+ ("Names of the StructArray's fields are\n"
+ "specified through MakeStructOptions."),
+ {"*args"},
+ "MakeStructOptions"};
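+
+// A usage sketch (hypothetical arrays `xs` and `ys`; field names are
+// caller-chosen via MakeStructOptions): wrapping two columns into a struct:
+//
+//   MakeStructOptions options({"x", "y"});
+//   ARROW_ASSIGN_OR_RAISE(Datum st,
+//                         CallFunction("make_struct", {xs, ys}, &options));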
+
+} // namespace
+
+void RegisterScalarNested(FunctionRegistry* registry) {
+ auto list_value_length = std::make_shared<ScalarFunction>(
+ "list_value_length", Arity::Unary(), &list_value_length_doc);
+ DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(),
+ ListValueLength<ListType>));
+ DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(),
+ ListValueLength<LargeListType>));
+ DCHECK_OK(registry->AddFunction(std::move(list_value_length)));
+
+ static MakeStructOptions kDefaultMakeStructOptions;
+ auto make_struct_function = std::make_shared<ScalarFunction>(
+ "make_struct", Arity::VarArgs(), &make_struct_doc, &kDefaultMakeStructOptions);
+
+ ScalarKernel kernel{KernelSignature::Make({InputType{}}, OutputType{MakeStructResolve},
+ /*is_varargs=*/true),
+ MakeStructExec, OptionsWrapper<MakeStructOptions>::Init};
+ kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ DCHECK_OK(make_struct_function->AddKernel(std::move(kernel)));
+ DCHECK_OK(registry->AddFunction(std::move(make_struct_function)));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
new file mode 100644
index 00000000000..3e2e95e5401
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
@@ -0,0 +1,513 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/cast.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/hashing.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::HashTraits;
+
+namespace compute {
+namespace internal {
+namespace {
+
+template <typename Type>
+struct SetLookupState : public KernelState {
+ explicit SetLookupState(MemoryPool* pool) : lookup_table(pool, 0) {}
+
+ Status Init(const SetLookupOptions& options) {
+ if (options.value_set.kind() == Datum::ARRAY) {
+ const ArrayData& value_set = *options.value_set.array();
+ memo_index_to_value_index.reserve(value_set.length);
+ RETURN_NOT_OK(AddArrayValueSet(options, *options.value_set.array()));
+ } else if (options.value_set.kind() == Datum::CHUNKED_ARRAY) {
+ const ChunkedArray& value_set = *options.value_set.chunked_array();
+ memo_index_to_value_index.reserve(value_set.length());
+ int64_t offset = 0;
+ for (const std::shared_ptr<Array>& chunk : value_set.chunks()) {
+ RETURN_NOT_OK(AddArrayValueSet(options, *chunk->data(), offset));
+ offset += chunk->length();
+ }
+ } else {
+ return Status::Invalid("value_set should be an array or chunked array");
+ }
+ if (!options.skip_nulls && lookup_table.GetNull() >= 0) {
+ null_index = memo_index_to_value_index[lookup_table.GetNull()];
+ }
+ return Status::OK();
+ }
+
+ Status AddArrayValueSet(const SetLookupOptions& options, const ArrayData& data,
+ int64_t start_index = 0) {
+ using T = typename GetViewType<Type>::T;
+ int32_t index = static_cast<int32_t>(start_index);
+ auto visit_valid = [&](T v) {
+ const auto memo_size = static_cast<int32_t>(memo_index_to_value_index.size());
+ int32_t unused_memo_index;
+ auto on_found = [&](int32_t memo_index) { DCHECK_LT(memo_index, memo_size); };
+ auto on_not_found = [&](int32_t memo_index) {
+ DCHECK_EQ(memo_index, memo_size);
+ memo_index_to_value_index.push_back(index);
+ };
+ RETURN_NOT_OK(lookup_table.GetOrInsert(
+ v, std::move(on_found), std::move(on_not_found), &unused_memo_index));
+ ++index;
+ return Status::OK();
+ };
+ auto visit_null = [&]() {
+ const auto memo_size = static_cast<int32_t>(memo_index_to_value_index.size());
+ auto on_found = [&](int32_t memo_index) { DCHECK_LT(memo_index, memo_size); };
+ auto on_not_found = [&](int32_t memo_index) {
+ DCHECK_EQ(memo_index, memo_size);
+ memo_index_to_value_index.push_back(index);
+ };
+ lookup_table.GetOrInsertNull(std::move(on_found), std::move(on_not_found));
+ ++index;
+ return Status::OK();
+ };
+
+ return VisitArrayDataInline<Type>(data, visit_valid, visit_null);
+ }
+
+ using MemoTable = typename HashTraits<Type>::MemoTableType;
+ MemoTable lookup_table;
+ // When there are duplicates in value_set, the MemoTable indices must
+ // be mapped back to indices in the value_set.
+ std::vector<int32_t> memo_index_to_value_index;
+ int32_t null_index = -1;
+};
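+
+// For example, if value_set is [5, 3, 5], the memo table stores the distinct
+// values {5, 3} and memo_index_to_value_index is [0, 1]: looking up 5 maps
+// back to index 0, the first occurrence in value_set.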
+
+template <>
+struct SetLookupState<NullType> : public KernelState {
+ explicit SetLookupState(MemoryPool*) {}
+
+ Status Init(const SetLookupOptions& options) {
+ value_set_has_null = (options.value_set.length() > 0) && !options.skip_nulls;
+ return Status::OK();
+ }
+
+ bool value_set_has_null;
+};
+
+// TODO: Put this concept somewhere reusable
+template <int width>
+struct UnsignedIntType;
+
+template <>
+struct UnsignedIntType<1> {
+ using Type = UInt8Type;
+};
+
+template <>
+struct UnsignedIntType<2> {
+ using Type = UInt16Type;
+};
+
+template <>
+struct UnsignedIntType<4> {
+ using Type = UInt32Type;
+};
+
+template <>
+struct UnsignedIntType<8> {
+ using Type = UInt64Type;
+};
+
+// Constructing the SetLookupState requires a compile-time type parameter, so
+// the input type is dispatched through this visitor
+struct InitStateVisitor {
+ KernelContext* ctx;
+ SetLookupOptions options;
+ const std::shared_ptr<DataType>& arg_type;
+ std::unique_ptr<KernelState> result;
+
+ InitStateVisitor(KernelContext* ctx, const KernelInitArgs& args)
+ : ctx(ctx),
+ options(*checked_cast<const SetLookupOptions*>(args.options)),
+ arg_type(args.inputs[0].type) {}
+
+ template <typename Type>
+ Status Init() {
+ using StateType = SetLookupState<Type>;
+ result.reset(new StateType(ctx->exec_context()->memory_pool()));
+ return static_cast<StateType*>(result.get())->Init(options);
+ }
+
+ Status Visit(const DataType&) { return Init<NullType>(); }
+
+ template <typename Type>
+ enable_if_boolean<Type, Status> Visit(const Type&) {
+ return Init<BooleanType>();
+ }
+
+ template <typename Type>
+ enable_if_t<has_c_type<Type>::value && !is_boolean_type<Type>::value, Status> Visit(
+ const Type&) {
+ return Init<typename UnsignedIntType<sizeof(typename Type::c_type)>::Type>();
+ }
+
+ template <typename Type>
+ enable_if_base_binary<Type, Status> Visit(const Type&) {
+ return Init<typename Type::PhysicalType>();
+ }
+
+ // Handle Decimal128Type, FixedSizeBinaryType
+ Status Visit(const FixedSizeBinaryType& type) { return Init<FixedSizeBinaryType>(); }
+
+ Result<std::unique_ptr<KernelState>> GetResult() {
+ if (!options.value_set.type()->Equals(arg_type)) {
+ ARROW_ASSIGN_OR_RAISE(
+ options.value_set,
+ Cast(options.value_set, CastOptions::Safe(arg_type), ctx->exec_context()));
+ }
+
+ RETURN_NOT_OK(VisitTypeInline(*arg_type, this));
+ return std::move(result);
+ }
+};
+
+Result<std::unique_ptr<KernelState>> InitSetLookup(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ if (args.options == nullptr) {
+ return Status::Invalid(
+ "Attempted to call a set lookup function without SetLookupOptions");
+ }
+
+ return InitStateVisitor{ctx, args}.GetResult();
+}
+
+struct IndexInVisitor {
+ KernelContext* ctx;
+ const ArrayData& data;
+ Datum* out;
+ Int32Builder builder;
+
+ IndexInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out)
+ : ctx(ctx), data(data), out(out), builder(ctx->exec_context()->memory_pool()) {}
+
+ Status Visit(const DataType& type) {
+ DCHECK_EQ(type.id(), Type::NA);
+ const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
+ if (data.length != 0) {
+ // skip_nulls is honored for consistency with other types
+ if (state.value_set_has_null) {
+ RETURN_NOT_OK(this->builder.Reserve(data.length));
+ for (int64_t i = 0; i < data.length; ++i) {
+ this->builder.UnsafeAppend(0);
+ }
+ } else {
+ RETURN_NOT_OK(this->builder.AppendNulls(data.length));
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename Type>
+ Status ProcessIndexIn() {
+ using T = typename GetViewType<Type>::T;
+
+ const auto& state = checked_cast<const SetLookupState<Type>&>(*ctx->state());
+
+ RETURN_NOT_OK(this->builder.Reserve(data.length));
+ VisitArrayDataInline<Type>(
+ data,
+ [&](T v) {
+ int32_t index = state.lookup_table.Get(v);
+ if (index != -1) {
+ // matching needle; output index from value_set
+ this->builder.UnsafeAppend(state.memo_index_to_value_index[index]);
+ } else {
+ // no matching needle; output null
+ this->builder.UnsafeAppendNull();
+ }
+ },
+ [&]() {
+ if (state.null_index != -1) {
+ // value_set included null
+ this->builder.UnsafeAppend(state.null_index);
+ } else {
+ // value_set does not include null; output null
+ this->builder.UnsafeAppendNull();
+ }
+ });
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_boolean<Type, Status> Visit(const Type&) {
+ return ProcessIndexIn<BooleanType>();
+ }
+
+ template <typename Type>
+ enable_if_t<has_c_type<Type>::value && !is_boolean_type<Type>::value, Status> Visit(
+ const Type&) {
+ return ProcessIndexIn<
+ typename UnsignedIntType<sizeof(typename Type::c_type)>::Type>();
+ }
+
+ template <typename Type>
+ enable_if_base_binary<Type, Status> Visit(const Type&) {
+ return ProcessIndexIn<typename Type::PhysicalType>();
+ }
+
+ // Handle Decimal128Type, FixedSizeBinaryType
+ Status Visit(const FixedSizeBinaryType& type) {
+ return ProcessIndexIn<FixedSizeBinaryType>();
+ }
+
+ Status Execute() {
+ Status s = VisitTypeInline(*data.type, this);
+ if (!s.ok()) {
+ return s;
+ }
+ std::shared_ptr<ArrayData> out_data;
+ RETURN_NOT_OK(this->builder.FinishInternal(&out_data));
+ out->value = std::move(out_data);
+ return Status::OK();
+ }
+};
+
+Status ExecIndexIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return IndexInVisitor(ctx, *batch[0].array(), out).Execute();
+}
+
+// ----------------------------------------------------------------------
+
+// IsIn writes the results into a preallocated boolean data bitmap
+struct IsInVisitor {
+ KernelContext* ctx;
+ const ArrayData& data;
+ Datum* out;
+
+ IsInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out)
+ : ctx(ctx), data(data), out(out) {}
+
+ Status Visit(const DataType& type) {
+ DCHECK_EQ(type.id(), Type::NA);
+ const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
+ ArrayData* output = out->mutable_array();
+ // skip_nulls is honored for consistency with other types
+ BitUtil::SetBitsTo(output->buffers[1]->mutable_data(), output->offset, output->length,
+ state.value_set_has_null);
+ return Status::OK();
+ }
+
+ template <typename Type>
+ Status ProcessIsIn() {
+ using T = typename GetViewType<Type>::T;
+ const auto& state = checked_cast<const SetLookupState<Type>&>(*ctx->state());
+ ArrayData* output = out->mutable_array();
+
+ FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(), output->offset,
+ output->length);
+
+ VisitArrayDataInline<Type>(
+ this->data,
+ [&](T v) {
+ if (state.lookup_table.Get(v) != -1) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
+ writer.Next();
+ },
+ [&]() {
+ if (state.null_index != -1) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
+ writer.Next();
+ });
+ writer.Finish();
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_boolean<Type, Status> Visit(const Type&) {
+ return ProcessIsIn<BooleanType>();
+ }
+
+ template <typename Type>
+ enable_if_t<has_c_type<Type>::value && !is_boolean_type<Type>::value, Status> Visit(
+ const Type&) {
+ return ProcessIsIn<typename UnsignedIntType<sizeof(typename Type::c_type)>::Type>();
+ }
+
+ template <typename Type>
+ enable_if_base_binary<Type, Status> Visit(const Type&) {
+ return ProcessIsIn<typename Type::PhysicalType>();
+ }
+
+ // Handle Decimal128Type, FixedSizeBinaryType
+ Status Visit(const FixedSizeBinaryType& type) {
+ return ProcessIsIn<FixedSizeBinaryType>();
+ }
+
+ Status Execute() { return VisitTypeInline(*data.type, this); }
+};
+
+Status ExecIsIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return IsInVisitor(ctx, *batch[0].array(), out).Execute();
+}
+
+// Unary set lookup kernels available for the following input types
+//
+// * Null type
+// * Boolean
+// * Numeric
+// * Simple temporal types (date, time, timestamp)
+// * Base binary types
+// * Decimal
+
+void AddBasicSetLookupKernels(ScalarKernel kernel,
+ const std::shared_ptr<DataType>& out_ty,
+ ScalarFunction* func) {
+ auto AddKernels = [&](const std::vector<std::shared_ptr<DataType>>& types) {
+ for (const std::shared_ptr<DataType>& ty : types) {
+ kernel.signature = KernelSignature::Make({ty}, out_ty);
+ DCHECK_OK(func->AddKernel(kernel));
+ }
+ };
+
+ AddKernels(BaseBinaryTypes());
+ AddKernels(NumericTypes());
+ AddKernels(TemporalTypes());
+
+ std::vector<Type::type> other_types = {Type::BOOL, Type::DECIMAL,
+ Type::FIXED_SIZE_BINARY};
+ for (auto ty : other_types) {
+ kernel.signature = KernelSignature::Make({InputType::Array(ty)}, out_ty);
+ DCHECK_OK(func->AddKernel(kernel));
+ }
+}
+
+// Enables calling is_in with CallFunction as though it were binary.
+class IsInMetaBinary : public MetaFunction {
+ public:
+ IsInMetaBinary()
+ : MetaFunction("is_in_meta_binary", Arity::Binary(), /*doc=*/nullptr) {}
+
+ Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const override {
+ if (options != nullptr) {
+ return Status::Invalid("Unexpected options for 'is_in_meta_binary' function");
+ }
+ return IsIn(args[0], args[1], ctx);
+ }
+};
+
+// Enables calling index_in with CallFunction as though it were binary.
+class IndexInMetaBinary : public MetaFunction {
+ public:
+ IndexInMetaBinary()
+ : MetaFunction("index_in_meta_binary", Arity::Binary(), /*doc=*/nullptr) {}
+
+ Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const override {
+ if (options != nullptr) {
+ return Status::Invalid("Unexpected options for 'index_in_meta_binary' function");
+ }
+ return IndexIn(args[0], args[1], ctx);
+ }
+};
+
+struct SetLookupFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ EnsureDictionaryDecoded(values);
+ return DispatchExact(*values);
+ }
+};
+
+const FunctionDoc is_in_doc{
+ "Find each element in a set of values",
+ ("For each element in `values`, return true if it is found in a given\n"
+ "set of values, false otherwise.\n"
+ "The set of values to look for must be given in SetLookupOptions.\n"
+ "By default, nulls are matched against the value set, this can be\n"
+ "changed in SetLookupOptions."),
+ {"values"},
+ "SetLookupOptions"};
+
+const FunctionDoc index_in_doc{
+ "Return index of each element in a set of values",
+ ("For each element in `values`, return its index in a given set of\n"
+ "values, or null if it is not found there.\n"
+ "The set of values to look for must be given in SetLookupOptions.\n"
+ "By default, nulls are matched against the value set, this can be\n"
+ "changed in SetLookupOptions."),
+ {"values"},
+ "SetLookupOptions"};
+
+} // namespace
+
+void RegisterScalarSetLookup(FunctionRegistry* registry) {
+ // IsIn writes its boolean output into preallocated memory
+ {
+ ScalarKernel isin_base;
+ isin_base.init = InitSetLookup;
+ isin_base.exec =
+ TrivialScalarUnaryAsArraysExec(ExecIsIn, NullHandling::OUTPUT_NOT_NULL);
+ isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL;
+ auto is_in = std::make_shared<SetLookupFunction>("is_in", Arity::Unary(), &is_in_doc);
+
+ AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), is_in.get());
+
+ isin_base.signature = KernelSignature::Make({null()}, boolean());
+ DCHECK_OK(is_in->AddKernel(isin_base));
+ DCHECK_OK(registry->AddFunction(is_in));
+
+ DCHECK_OK(registry->AddFunction(std::make_shared<IsInMetaBinary>()));
+ }
+
+ // IndexIn uses Int32Builder and so is responsible for all its own allocation
+ {
+ ScalarKernel index_in_base;
+ index_in_base.init = InitSetLookup;
+ index_in_base.exec = TrivialScalarUnaryAsArraysExec(
+ ExecIndexIn, NullHandling::COMPUTED_NO_PREALLOCATE);
+ index_in_base.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ index_in_base.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ auto index_in =
+ std::make_shared<SetLookupFunction>("index_in", Arity::Unary(), &index_in_doc);
+
+ AddBasicSetLookupKernels(index_in_base, /*output_type=*/int32(), index_in.get());
+
+ index_in_base.signature = KernelSignature::Make({null()}, int32());
+ DCHECK_OK(index_in->AddKernel(index_in_base));
+ DCHECK_OK(registry->AddFunction(index_in));
+
+ DCHECK_OK(registry->AddFunction(std::make_shared<IndexInMetaBinary>()));
+ }
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc
new file mode 100644
index 00000000000..ab0a490eeb3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -0,0 +1,4145 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cctype>
+#include <iterator>
+#include <string>
+
+#ifdef ARROW_WITH_UTF8PROC
+#include <utf8proc.h>
+#endif
+
+#ifdef ARROW_WITH_RE2
+#include <re2/re2.h>
+#endif
+
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/buffer_builder.h"
+
+#include "arrow/builder.h"
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/utf8.h"
+#include "arrow/util/value_parsing.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+#ifdef ARROW_WITH_RE2
+util::string_view ToStringView(re2::StringPiece piece) {
+ return {piece.data(), piece.length()};
+}
+
+re2::StringPiece ToStringPiece(util::string_view view) {
+ return {view.data(), view.length()};
+}
+
+Status RegexStatus(const RE2& regex) {
+ if (!regex.ok()) {
+ return Status::Invalid("Invalid regular expression: ", regex.error());
+ }
+ return Status::OK();
+}
+#endif
+
+// Code units in the range [A-Za-z] can only be an encoding of an ASCII
+// character/codepoint, never the 2nd, 3rd or 4th code unit (byte) of a
+// different codepoint. This is guaranteed by the non-overlap design of the
+// Unicode standard (see section 2.5 of the Unicode Standard Core
+// Specification v13.0).
+
+static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) {
+ return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32)
+ : utf8_code_unit;
+}
+
+static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
+ return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32)
+ : utf8_code_unit;
+}
+
+template <typename T>
+static inline bool IsAsciiCharacter(T character) {
+ return character < 128;
+}
+
+struct BinaryLength {
+ template <typename OutValue, typename Arg0Value = util::string_view>
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ return static_cast<OutValue>(val.size());
+ }
+};
+
+struct Utf8Length {
+ template <typename OutValue, typename Arg0Value = util::string_view>
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ auto str = reinterpret_cast<const uint8_t*>(val.data());
+ auto strlen = val.size();
+ return static_cast<OutValue>(util::UTF8Length(str, str + strlen));
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+
+// Direct lookup tables for unicode properties
+constexpr uint32_t kMaxCodepointLookup =
+    0xffff;  // codepoints up to and including this value use the lookup tables
+std::vector<uint32_t> lut_upper_codepoint;
+std::vector<uint32_t> lut_lower_codepoint;
+std::vector<utf8proc_category_t> lut_category;
+std::once_flag flag_case_luts;
+
+void EnsureLookupTablesFilled() {
+ std::call_once(flag_case_luts, []() {
+ lut_upper_codepoint.reserve(kMaxCodepointLookup + 1);
+ lut_lower_codepoint.reserve(kMaxCodepointLookup + 1);
+ for (uint32_t i = 0; i <= kMaxCodepointLookup; i++) {
+ lut_upper_codepoint.push_back(utf8proc_toupper(i));
+ lut_lower_codepoint.push_back(utf8proc_tolower(i));
+ lut_category.push_back(utf8proc_category(i));
+ }
+ });
+}
+
+#else
+
+void EnsureLookupTablesFilled() {}
+
+#endif // ARROW_WITH_UTF8PROC
+
+constexpr int64_t kTransformError = -1;
+
+struct StringTransformBase {
+ virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::OK();
+ }
+
+ // Return the maximum total size of the output in codeunits (i.e. bytes)
+ // given input characteristics.
+ virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
+ return input_ncodeunits;
+ }
+
+ virtual Status InvalidStatus() {
+ return Status::Invalid("Invalid UTF8 sequence in input");
+ }
+
+ // Derived classes should also define this method:
+ // int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ // uint8_t* output);
+};
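+
+// A minimal derived transform (illustrative sketch, not used below): copy the
+// input unchanged.
+//
+//   struct IdentityTransform : StringTransformBase {
+//     int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+//                       uint8_t* output) {
+//       std::memcpy(output, input, input_string_ncodeunits);
+//       return input_string_ncodeunits;
+//     }
+//   };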
+
+template <typename Type, typename StringTransform>
+struct StringTransformExecBase {
+ using offset_type = typename Type::offset_type;
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ static Status Execute(KernelContext* ctx, StringTransform* transform,
+ const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::ARRAY) {
+ return ExecArray(ctx, transform, batch[0].array(), out);
+ }
+ DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
+ return ExecScalar(ctx, transform, batch[0].scalar(), out);
+ }
+
+ static Status ExecArray(KernelContext* ctx, StringTransform* transform,
+ const std::shared_ptr<ArrayData>& data, Datum* out) {
+ ArrayType input(data);
+ ArrayData* output = out->mutable_array();
+
+ const int64_t input_ncodeunits = input.total_values_length();
+ const int64_t input_nstrings = input.length();
+
+ const int64_t output_ncodeunits_max =
+ transform->MaxCodeunits(input_nstrings, input_ncodeunits);
+ if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+ return Status::CapacityError(
+ "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto values_buffer, ctx->Allocate(output_ncodeunits_max));
+ output->buffers[2] = values_buffer;
+
+ // String offsets are preallocated
+ offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
+ uint8_t* output_str = output->buffers[2]->mutable_data();
+ offset_type output_ncodeunits = 0;
+
+ output_string_offsets[0] = 0;
+ for (int64_t i = 0; i < input_nstrings; i++) {
+ if (!input.IsNull(i)) {
+ offset_type input_string_ncodeunits;
+ const uint8_t* input_string = input.GetValue(i, &input_string_ncodeunits);
+ auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
+ input_string, input_string_ncodeunits, output_str + output_ncodeunits));
+ if (encoded_nbytes < 0) {
+ return transform->InvalidStatus();
+ }
+ output_ncodeunits += encoded_nbytes;
+ }
+ output_string_offsets[i + 1] = output_ncodeunits;
+ }
+ DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
+
+ // Trim the codepoint buffer, since we allocated too much
+ return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
+ }
+
+ static Status ExecScalar(KernelContext* ctx, StringTransform* transform,
+ const std::shared_ptr<Scalar>& scalar, Datum* out) {
+ const auto& input = checked_cast<const BaseBinaryScalar&>(*scalar);
+ if (!input.is_valid) {
+ return Status::OK();
+ }
+ auto* result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ result->is_valid = true;
+ const int64_t data_nbytes = static_cast<int64_t>(input.value->size());
+
+ const int64_t output_ncodeunits_max = transform->MaxCodeunits(1, data_nbytes);
+ if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+ return Status::CapacityError(
+ "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+ }
+ ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(output_ncodeunits_max));
+ result->value = value_buffer;
+ auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
+ input.value->data(), data_nbytes, value_buffer->mutable_data()));
+ if (encoded_nbytes < 0) {
+ return transform->InvalidStatus();
+ }
+ DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
+ return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
+ }
+};
+
+template <typename Type, typename StringTransform>
+struct StringTransformExec : public StringTransformExecBase<Type, StringTransform> {
+ using StringTransformExecBase<Type, StringTransform>::Execute;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ StringTransform transform;
+ RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
+ return Execute(ctx, &transform, batch, out);
+ }
+};
+
+template <typename Type, typename StringTransform>
+struct StringTransformExecWithState
+ : public StringTransformExecBase<Type, StringTransform> {
+ using State = typename StringTransform::State;
+ using StringTransformExecBase<Type, StringTransform>::Execute;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ StringTransform transform(State::Get(ctx));
+ RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
+ return Execute(ctx, &transform, batch, out);
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+
+template <typename CodepointTransform>
+struct StringTransformCodepoint : public StringTransformBase {
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ EnsureLookupTablesFilled();
+ return Status::OK();
+ }
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ return CodepointTransform::MaxCodeunits(ninputs, input_ncodeunits);
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ uint8_t* output_start = output;
+ if (ARROW_PREDICT_FALSE(
+ !arrow::util::UTF8Transform(input, input + input_string_ncodeunits, &output,
+ CodepointTransform::TransformCodepoint))) {
+ return kTransformError;
+ }
+ return output - output_start;
+ }
+};
+
+struct CaseMappingTransform {
+ static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
+    // Section 5.18 of the Unicode spec claims that the number of codepoints a
+    // case mapping produces can grow by a factor of 3, which would mean a
+    // factor of 3 in bytes as well. However, since we don't support all
+    // casings (SpecialCasing.txt), the growth in bytes is actually at most 3/2
+    // (as covered by the unittest). Note that rounding 3/2 down is ok, since
+    // only codepoints encoded by two code units (an even count) can grow to 3
+    // code units.
+ return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
+ }
+};
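+
+// Worked example of the 3/2 bound (assuming utf8proc's single-codepoint
+// mapping): U+00DF (LATIN SMALL LETTER SHARP S) is two code units in UTF-8
+// (0xC3 0x9F) and uppercases to U+1E9E, which takes three code units
+// (0xE1 0xBA 0x9E) -- growth of exactly 3/2.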
+
+struct UTF8UpperTransform : public CaseMappingTransform {
+ static uint32_t TransformCodepoint(uint32_t codepoint) {
+ return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint]
+ : utf8proc_toupper(codepoint);
+ }
+};
+
+template <typename Type>
+using UTF8Upper = StringTransformExec<Type, StringTransformCodepoint<UTF8UpperTransform>>;
+
+struct UTF8LowerTransform : public CaseMappingTransform {
+ static uint32_t TransformCodepoint(uint32_t codepoint) {
+ return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint]
+ : utf8proc_tolower(codepoint);
+ }
+};
+
+template <typename Type>
+using UTF8Lower = StringTransformExec<Type, StringTransformCodepoint<UTF8LowerTransform>>;
+
+#endif // ARROW_WITH_UTF8PROC
+
+struct AsciiReverseTransform : public StringTransformBase {
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ uint8_t utf8_char_found = 0;
+ for (int64_t i = 0; i < input_string_ncodeunits; i++) {
+      // Record any non-ASCII code unit (high bit set) in utf8_char_found
+ utf8_char_found |= input[i] & 0x80;
+ output[input_string_ncodeunits - i - 1] = input[i];
+ }
+ return utf8_char_found ? kTransformError : input_string_ncodeunits;
+ }
+
+ Status InvalidStatus() override {
+ return Status::Invalid("Non-ASCII sequence in input");
+ }
+};
+
+template <typename Type>
+using AsciiReverse = StringTransformExec<Type, AsciiReverseTransform>;
+
+struct Utf8ReverseTransform : public StringTransformBase {
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ int64_t i = 0;
+ while (i < input_string_ncodeunits) {
+ int64_t char_end = std::min(i + util::ValidUtf8CodepointByteSize(input + i),
+ input_string_ncodeunits);
+ std::copy(input + i, input + char_end, output + input_string_ncodeunits - char_end);
+ i = char_end;
+ }
+ return input_string_ncodeunits;
+ }
+};
+
+template <typename Type>
+using Utf8Reverse = StringTransformExec<Type, Utf8ReverseTransform>;
+
+using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;
+
+// Transform a buffer of offsets into one which begins with 0 and preserves the
+// value lengths.
+template <typename T>
+Status GetShiftedOffsets(KernelContext* ctx, const Buffer& input_buffer, int64_t offset,
+ int64_t length, std::shared_ptr<Buffer>* out) {
+ ARROW_ASSIGN_OR_RAISE(*out, ctx->Allocate((length + 1) * sizeof(T)));
+ const T* input_offsets = reinterpret_cast<const T*>(input_buffer.data()) + offset;
+ T* out_offsets = reinterpret_cast<T*>((*out)->mutable_data());
+ T first_offset = *input_offsets;
+ for (int64_t i = 0; i < length; ++i) {
+ *out_offsets++ = input_offsets[i] - first_offset;
+ }
+ *out_offsets = input_offsets[length] - first_offset;
+ return Status::OK();
+}
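+
+// For example, a sliced input whose offsets view is [10, 13, 18] is rebased to
+// [0, 3, 8]: the value lengths (3 and 5) are preserved.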
+
+// Apply `transform` to the input character data; the transform cannot change
+// the data length.
+template <typename Type>
+Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
+ TransformFunc transform, Datum* out) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using offset_type = typename Type::offset_type;
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ const ArrayData& input = *batch[0].array();
+ ArrayType input_boxed(batch[0].array());
+
+ ArrayData* out_arr = out->mutable_array();
+
+ if (input.offset == 0) {
+ // We can reuse offsets from input
+ out_arr->buffers[1] = input.buffers[1];
+ } else {
+ DCHECK(input.buffers[1]);
+ // We must allocate new space for the offsets and shift the existing offsets
+ RETURN_NOT_OK(GetShiftedOffsets<offset_type>(ctx, *input.buffers[1], input.offset,
+ input.length, &out_arr->buffers[1]));
+ }
+
+ // Allocate space for output data
+ int64_t data_nbytes = input_boxed.total_values_length();
+ RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&out_arr->buffers[2]));
+ if (input.length > 0) {
+ transform(input.buffers[2]->data() + input_boxed.value_offset(0), data_nbytes,
+ out_arr->buffers[2]->mutable_data());
+ }
+ } else {
+ const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
+ auto result = checked_pointer_cast<BaseBinaryScalar>(MakeNullScalar(out->type()));
+ if (input.is_valid) {
+ result->is_valid = true;
+ int64_t data_nbytes = input.value->size();
+ RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&result->value));
+ transform(input.value->data(), data_nbytes, result->value->mutable_data());
+ }
+ out->value = result;
+ }
+
+ return Status::OK();
+}
+
+void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) {
+ std::transform(input, input + length, output, ascii_toupper);
+}
+
+template <typename Type>
+struct AsciiUpper {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return StringDataTransform<Type>(ctx, batch, TransformAsciiUpper, out);
+ }
+};
+
+void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output) {
+ std::transform(input, input + length, output, ascii_tolower);
+}
+
+template <typename Type>
+struct AsciiLower {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return StringDataTransform<Type>(ctx, batch, TransformAsciiLower, out);
+ }
+};
+
+// ----------------------------------------------------------------------
+// exact pattern detection
+
+using StrToBoolTransformFunc =
+ std::function<void(const void*, const uint8_t*, int64_t, int64_t, uint8_t*)>;
+
+// Apply a string-to-boolean `transform` to the input character data, writing
+// one output bit per input string.
+template <typename Type>
+void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,
+ StrToBoolTransformFunc transform, Datum* out) {
+ using offset_type = typename Type::offset_type;
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ const ArrayData& input = *batch[0].array();
+ ArrayData* out_arr = out->mutable_array();
+ if (input.length > 0) {
+ transform(
+ reinterpret_cast<const offset_type*>(input.buffers[1]->data()) + input.offset,
+ input.buffers[2]->data(), input.length, out_arr->offset,
+ out_arr->buffers[1]->mutable_data());
+ }
+ } else {
+ const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
+ if (input.is_valid) {
+ uint8_t result_value = 0;
+ std::array<offset_type, 2> offsets{0,
+ static_cast<offset_type>(input.value->size())};
+ transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0,
+ &result_value);
+ out->value = std::make_shared<BooleanScalar>(result_value > 0);
+ }
+ }
+}
+
+using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>;
+
+// This is an implementation of the Knuth-Morris-Pratt algorithm
+struct PlainSubstringMatcher {
+ const MatchSubstringOptions& options_;
+ std::vector<int64_t> prefix_table;
+
+ static Result<std::unique_ptr<PlainSubstringMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // Should be handled by partial template specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainSubstringMatcher>(options);
+ }
+
+ explicit PlainSubstringMatcher(const MatchSubstringOptions& options)
+ : options_(options) {
+ // Phase 1: Build the prefix table
+ const auto pattern_length = options_.pattern.size();
+ prefix_table.resize(pattern_length + 1, /*value=*/0);
+ int64_t prefix_length = -1;
+ prefix_table[0] = -1;
+ for (size_t pos = 0; pos < pattern_length; ++pos) {
+ // The prefix cannot be expanded, reset.
+ while (prefix_length >= 0 &&
+ options_.pattern[pos] != options_.pattern[prefix_length]) {
+ prefix_length = prefix_table[prefix_length];
+ }
+ prefix_length++;
+ prefix_table[pos + 1] = prefix_length;
+ }
+ }
+
+ int64_t Find(util::string_view current) const {
+ // Phase 2: Find the prefix in the data
+ const auto pattern_length = options_.pattern.size();
+ int64_t pattern_pos = 0;
+ int64_t pos = 0;
+ if (pattern_length == 0) return 0;
+ for (const auto c : current) {
+ while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
+ pattern_pos = prefix_table[pattern_pos];
+ }
+ pattern_pos++;
+ if (static_cast<size_t>(pattern_pos) == pattern_length) {
+ return pos + 1 - pattern_length;
+ }
+ pos++;
+ }
+ return -1;
+ }
+
+ bool Match(util::string_view current) const { return Find(current) >= 0; }
+};
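+
+// For example, for pattern "abab" the prefix table built above is
+// [-1, 0, 0, 1, 2]: after matching "aba" and then mismatching, the search
+// resumes at pattern position prefix_table[3] == 1 instead of restarting at 0.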
+
+struct PlainStartsWithMatcher {
+ const MatchSubstringOptions& options_;
+
+ explicit PlainStartsWithMatcher(const MatchSubstringOptions& options)
+ : options_(options) {}
+
+ static Result<std::unique_ptr<PlainStartsWithMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // Should be handled by partial template specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainStartsWithMatcher>(options);
+ }
+
+ bool Match(util::string_view current) const {
+ // string_view::starts_with is C++20
+ return current.substr(0, options_.pattern.size()) == options_.pattern;
+ }
+};
+
+struct PlainEndsWithMatcher {
+ const MatchSubstringOptions& options_;
+
+ explicit PlainEndsWithMatcher(const MatchSubstringOptions& options)
+ : options_(options) {}
+
+ static Result<std::unique_ptr<PlainEndsWithMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // Should be handled by partial template specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainEndsWithMatcher>(options);
+ }
+
+ bool Match(util::string_view current) const {
+ // string_view::ends_with is C++20
+ return current.size() >= options_.pattern.size() &&
+ current.substr(current.size() - options_.pattern.size(),
+ options_.pattern.size()) == options_.pattern;
+ }
+};
+
+#ifdef ARROW_WITH_RE2
+struct RegexSubstringMatcher {
+ const MatchSubstringOptions& options_;
+ const RE2 regex_match_;
+
+ static Result<std::unique_ptr<RegexSubstringMatcher>> Make(
+ const MatchSubstringOptions& options, bool literal = false) {
+ auto matcher =
+ ::arrow::internal::make_unique<RegexSubstringMatcher>(options, literal);
+ RETURN_NOT_OK(RegexStatus(matcher->regex_match_));
+ return std::move(matcher);
+ }
+
+ explicit RegexSubstringMatcher(const MatchSubstringOptions& options,
+ bool literal = false)
+ : options_(options),
+ regex_match_(options_.pattern, MakeRE2Options(options, literal)) {}
+
+ bool Match(util::string_view current) const {
+ auto piece = re2::StringPiece(current.data(), current.length());
+ return re2::RE2::PartialMatch(piece, regex_match_);
+ }
+
+ static RE2::Options MakeRE2Options(const MatchSubstringOptions& options,
+ bool literal) {
+ RE2::Options re2_options(RE2::Quiet);
+ re2_options.set_case_sensitive(!options.ignore_case);
+ re2_options.set_literal(literal);
+ return re2_options;
+ }
+};
+#endif
+
+template <typename Type, typename Matcher>
+struct MatchSubstringImpl {
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out,
+ const Matcher* matcher) {
+ StringBoolTransform<Type>(
+ ctx, batch,
+ [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
+ int64_t output_offset, uint8_t* output) {
+ const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
+ FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
+ for (int64_t i = 0; i < length; ++i) {
+ const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
+ int64_t current_length = offsets[i + 1] - offsets[i];
+ if (matcher->Match(util::string_view(current_data, current_length))) {
+ bitmap_writer.Set();
+ }
+ bitmap_writer.Next();
+ }
+ bitmap_writer.Finish();
+ },
+ out);
+ return Status::OK();
+ }
+};
+
+template <typename Type, typename Matcher>
+struct MatchSubstring {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // TODO Cache matcher across invocations (for regex compilation)
+ ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
+ return MatchSubstringImpl<Type, Matcher>::Exec(ctx, batch, out, matcher.get());
+ }
+};
+
+template <typename Type>
+struct MatchSubstring<Type, PlainSubstringMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ ARROW_ASSIGN_OR_RAISE(auto matcher,
+ RegexSubstringMatcher::Make(options, /*literal=*/true));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainSubstringMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
+
+template <typename Type>
+struct MatchSubstring<Type, PlainStartsWithMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ MatchSubstringOptions converted_options = options;
+ converted_options.pattern = "^" + RE2::QuoteMeta(options.pattern);
+ ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainStartsWithMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainStartsWithMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
+
+template <typename Type>
+struct MatchSubstring<Type, PlainEndsWithMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ MatchSubstringOptions converted_options = options;
+ converted_options.pattern = RE2::QuoteMeta(options.pattern) + "$";
+ ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainEndsWithMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainEndsWithMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
+
+const FunctionDoc match_substring_doc(
+ "Match strings against literal pattern",
+ ("For each string in `strings`, emit true iff it contains a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+const FunctionDoc starts_with_doc(
+ "Check if strings start with a literal pattern",
+ ("For each string in `strings`, emit true iff it starts with a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+const FunctionDoc ends_with_doc(
+ "Check if strings end with a literal pattern",
+ ("For each string in `strings`, emit true iff it ends with a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+const FunctionDoc match_substring_regex_doc(
+ "Match strings against regex pattern",
+ ("For each string in `strings`, emit true iff it matches a given pattern at any "
+ "position.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+// SQL LIKE match
+
+/// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern
+std::string MakeLikeRegex(const MatchSubstringOptions& options) {
+ // Allow . to match \n
+ std::string like_pattern = "(?s:^";
+ like_pattern.reserve(options.pattern.size() + 7);
+ bool escaped = false;
+ for (const char c : options.pattern) {
+ if (!escaped && c == '%') {
+ like_pattern.append(".*");
+ } else if (!escaped && c == '_') {
+ like_pattern.append(".");
+ } else if (!escaped && c == '\\') {
+ escaped = true;
+ } else {
+ switch (c) {
+ case '.':
+ case '?':
+ case '+':
+ case '*':
+ case '^':
+ case '$':
+ case '\\':
+ case '[':
+ case '{':
+ case '(':
+ case ')':
+ case '|': {
+ like_pattern.push_back('\\');
+ like_pattern.push_back(c);
+ escaped = false;
+ break;
+ }
+ default: {
+ like_pattern.push_back(c);
+ escaped = false;
+ break;
+ }
+ }
+ }
+ }
+ like_pattern.append("$)");
+ return like_pattern;
+}
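+
+// For example, the LIKE pattern "foo%bar_" becomes the regex "(?s:^foo.*bar.$)",
+// and "100\%" (with an escaped '%') becomes "(?s:^100%$)".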
+
+// Evaluate a SQL-style LIKE pattern by translating it to a regexp or
+// substring search as appropriate. See what Apache Impala does:
+// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
+template <typename StringType>
+struct MatchLike {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // NOTE: avoid making those constants global to avoid compiling regexes at startup
+ // A LIKE pattern matching this regex can be translated into a substring search.
+ static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
+ // A LIKE pattern matching this regex can be translated into a prefix search.
+ static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
+ // A LIKE pattern matching this regex can be translated into a suffix search.
+ static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");
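+ // For example, "%foo%" degrades to a plain substring search for "foo",
+ // "foo%" to a prefix search, and "%foo" to a suffix search; anything else
+ // (e.g. "f_o%") falls through to the compiled regex below.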
+
+ auto original_options = MatchSubstringState::Get(ctx);
+ auto original_state = ctx->state();
+
+ Status status;
+ std::string pattern;
+ if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
+ } else if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec(ctx, batch, out);
+ } else if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec(ctx, batch, out);
+ } else {
+ MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
+ original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);
+ }
+ ctx->SetState(original_state);
+ return status;
+ }
+};
+
+const FunctionDoc match_like_doc(
+ "Match strings against SQL-style LIKE pattern",
+ ("For each string in `strings`, emit true iff the string fully matches a given "
+ "pattern. That is, '%' will match any number of characters, '_' will "
+ "match exactly one character, and any other character matches itself. To "
+ "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+
+#endif
+
+void AddMatchSubstring(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("match_substring", Arity::Unary(),
+ &match_substring_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainSubstringMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
+ &starts_with_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainStartsWithMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<ScalarFunction>("ends_with", Arity::Unary(),
+ &ends_with_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainEndsWithMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#ifdef ARROW_WITH_RE2
+ {
+ auto func = std::make_shared<ScalarFunction>("match_substring_regex", Arity::Unary(),
+ &match_substring_regex_doc);
+ auto exec_32 = MatchSubstring<StringType, RegexSubstringMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, RegexSubstringMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func =
+ std::make_shared<ScalarFunction>("match_like", Arity::Unary(), &match_like_doc);
+ auto exec_32 = MatchLike<StringType>::Exec;
+ auto exec_64 = MatchLike<LargeStringType>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#endif
+}
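+
+// Usage sketch for the kernels registered above (assuming the public
+// arrow::compute API; `strings` is any utf8 array Datum):
+//
+//   MatchSubstringOptions options("foo", /*ignore_case=*/false);
+//   ARROW_ASSIGN_OR_RAISE(Datum matched,
+//                         CallFunction("match_substring", {strings}, &options));
+//
+// `matched` is a boolean Datum that is true where the input contains "foo".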
+
+// Substring find - lfind/index/etc.
+
+struct FindSubstring {
+ const PlainSubstringMatcher matcher_;
+
+ explicit FindSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ return static_cast<OutValue>(matcher_.Find(val));
+ }
+};
+
+#ifdef ARROW_WITH_RE2
+struct FindSubstringRegex {
+ std::unique_ptr<RE2> regex_match_;
+
+ explicit FindSubstringRegex(const MatchSubstringOptions& options,
+ bool literal = false) {
+ std::string regex = "(";
+ regex.reserve(options.pattern.length() + 2);
+ regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
+ regex += ")";
+ // `literal` is handled by quoting the pattern above; RE2's literal mode must
+ // stay off here, or the wrapping capture group would be taken literally.
+ regex_match_.reset(new RE2(std::move(regex), RegexSubstringMatcher::MakeRE2Options(
+ options, /*literal=*/false)));
+ }
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ re2::StringPiece piece(val.data(), val.length());
+ re2::StringPiece match;
+ if (re2::RE2::PartialMatch(piece, *regex_match_, &match)) {
+ return static_cast<OutValue>(match.data() - piece.data());
+ }
+ return -1;
+ }
+};
+#endif
+
+template <typename InputType>
+struct FindSubstringExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
+ kernel{FindSubstringRegex(options, /*literal=*/true)};
+ return kernel.Exec(ctx, batch, out);
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstring> kernel{
+ FindSubstring(PlainSubstringMatcher(options))};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+const FunctionDoc find_substring_doc(
+ "Find first occurrence of substring",
+ ("For each string in `strings`, emit the index of the first occurrence of the given "
+ "pattern, or -1 if not found.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+template <typename InputType>
+struct FindSubstringRegexExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
+ kernel{FindSubstringRegex(options, /*literal=*/false)};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+const FunctionDoc find_substring_regex_doc(
+ "Find location of first match of regex pattern",
+ ("For each string in `strings`, emit the index of the first match of the given "
+ "pattern, or -1 if not found.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+#endif
+
+void AddFindSubstring(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
+ &find_substring_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#ifdef ARROW_WITH_RE2
+ {
+ auto func = std::make_shared<ScalarFunction>("find_substring_regex", Arity::Unary(),
+ &find_substring_regex_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(
+ func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<FindSubstringRegexExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#endif
+}
+
+// Substring count
+
+struct CountSubstring {
+ const PlainSubstringMatcher matcher_;
+
+ explicit CountSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ OutValue count = 0;
+ uint64_t start = 0;
+ const auto pattern_size = std::max<uint64_t>(1, matcher_.options_.pattern.size());
+ while (start <= val.size()) {
+ const int64_t index = matcher_.Find(val.substr(start));
+ if (index >= 0) {
+ count++;
+ start += index + pattern_size;
+ } else {
+ break;
+ }
+ }
+ return count;
+ }
+};
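+
+// Note: occurrences are counted without overlap; e.g. counting "aa" in "aaaa"
+// yields 2, not 3. An empty pattern matches before every codeunit and once at
+// the end, yielding length + 1.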
+
+#ifdef ARROW_WITH_RE2
+struct CountSubstringRegex {
+ std::unique_ptr<RE2> regex_match_;
+
+ explicit CountSubstringRegex(const MatchSubstringOptions& options, bool literal = false)
+ : regex_match_(new RE2(options.pattern,
+ RegexSubstringMatcher::MakeRE2Options(options, literal))) {}
+
+ static Result<CountSubstringRegex> Make(const MatchSubstringOptions& options,
+ bool literal = false) {
+ CountSubstringRegex counter(options, literal);
+ RETURN_NOT_OK(RegexStatus(*counter.regex_match_));
+ return std::move(counter);
+ }
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ OutValue count = 0;
+ re2::StringPiece input(val.data(), val.size());
+ auto last_size = input.size();
+ while (re2::RE2::FindAndConsume(&input, *regex_match_)) {
+ count++;
+ if (last_size == input.size()) {
+ // 0-length match
+ if (input.size() > 0) {
+ input.remove_prefix(1);
+ } else {
+ break;
+ }
+ }
+ last_size = input.size();
+ }
+ return count;
+ }
+};
+
+template <typename InputType>
+struct CountSubstringRegexExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ ARROW_ASSIGN_OR_RAISE(auto counter, CountSubstringRegex::Make(options));
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
+ kernel{std::move(counter)};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+#endif
+
+template <typename InputType>
+struct CountSubstringExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ ARROW_ASSIGN_OR_RAISE(auto counter,
+ CountSubstringRegex::Make(options, /*literal=*/true));
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
+ kernel{std::move(counter)};
+ return kernel.Exec(ctx, batch, out);
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstring> kernel{
+ CountSubstring(PlainSubstringMatcher(options))};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+const FunctionDoc count_substring_doc(
+ "Count occurrences of substring",
+ ("For each string in `strings`, emit the number of occurrences of the given "
+ "pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+const FunctionDoc count_substring_regex_doc(
+ "Count occurrences of substring",
+ ("For each string in `strings`, emit the number of occurrences of the given "
+ "regex pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+#endif
+
+void AddCountSubstring(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("count_substring", Arity::Unary(),
+ &count_substring_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<CountSubstringExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#ifdef ARROW_WITH_RE2
+ {
+ auto func = std::make_shared<ScalarFunction>("count_substring_regex", Arity::Unary(),
+ &count_substring_regex_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(
+ func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<CountSubstringRegexExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#endif
+}
+
+// Slicing
+
+struct SliceTransformBase : public StringTransformBase {
+ using State = OptionsWrapper<SliceOptions>;
+
+ const SliceOptions* options;
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ options = &State::Get(ctx);
+ if (options->step == 0) {
+ return Status::Invalid("Slice step cannot be zero");
+ }
+ return Status::OK();
+ }
+};
+
+struct SliceCodeunitsTransform : SliceTransformBase {
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ const SliceOptions& opt = *this->options;
+ if ((opt.start >= 0) != (opt.stop >= 0)) {
+ // If start and stop don't have the same sign, we can't guess an upper bound
+ // on the resulting slice lengths, so return a worst case estimate.
+ return input_ncodeunits;
+ }
+ int64_t max_slice_codepoints = (opt.stop - opt.start + opt.step - 1) / opt.step;
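+ // e.g. start=1, stop=5, step=2 gives (5 - 1 + 2 - 1) / 2 = 2 codepoints at most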
+ // The maximum UTF8 byte size of a codepoint is 4
+ return std::min(input_ncodeunits,
+ 4 * ninputs * std::max<int64_t>(0, max_slice_codepoints));
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ if (options->step >= 1) {
+ return SliceForward(input, input_string_ncodeunits, output);
+ }
+ return SliceBackward(input, input_string_ncodeunits, output);
+ }
+
+#define RETURN_IF_UTF8_ERROR(expr) \
+ do { \
+ if (ARROW_PREDICT_FALSE(!(expr))) { \
+ return kTransformError; \
+ } \
+ } while (0)
+
+ int64_t SliceForward(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ // Slice in forward order (step > 0)
+ const SliceOptions& opt = *this->options;
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* begin_sliced = begin;
+ const uint8_t* end_sliced = end;
+
+ // First, compute begin_sliced and end_sliced
+ if (opt.start >= 0) {
+ // start counting from the left
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start));
+ if (opt.stop > opt.start) {
+ // continue counting from begin_sliced
+ const int64_t length = opt.stop - opt.start;
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length));
+ } else if (opt.stop < 0) {
+ // or from the end (but we will never need to < begin_sliced)
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin_sliced, end, &end_sliced, -opt.stop));
+ } else {
+ // zero length slice
+ return 0;
+ }
+ } else {
+ // start counting from the right
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin, end, &begin_sliced, -opt.start));
+ if (opt.stop > 0) {
+ // continue counting from the left, we cannot start from begin_sliced because we
+ // don't know how many codepoints are between begin and begin_sliced
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop));
+ // and therefore we also need this check
+ if (end_sliced <= begin_sliced) {
+ // zero length slice
+ return 0;
+ }
+ } else if ((opt.stop < 0) && (opt.stop > opt.start)) {
+ // stop is negative, but larger than start, so we count again from the right
+ // in some cases we can optimize this, depending on the shortest path (from end
+ // or begin_sliced), but begin_sliced and opt.start can be 'out of sync',
+ // for instance when start=-100, when the string length is only 10.
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin_sliced, end, &end_sliced, -opt.stop));
+ } else {
+ // zero length slice
+ return 0;
+ }
+ }
+
+ // Second, copy computed slice to output
+ DCHECK(begin_sliced <= end_sliced);
+ if (opt.step == 1) {
+ // fast case, where we can simply finish with a plain byte copy
+ std::copy(begin_sliced, end_sliced, output);
+ return end_sliced - begin_sliced;
+ }
+ uint8_t* dest = output;
+ const uint8_t* i = begin_sliced;
+
+ while (i < end_sliced) {
+ uint32_t codepoint = 0;
+ // write a single codepoint
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
+ dest = arrow::util::UTF8Encode(dest, codepoint);
+ // and skip the remainder
+ int64_t skips = opt.step - 1;
+ while ((skips--) && (i < end_sliced)) {
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
+ }
+ }
+ return dest - output;
+ }
+
+ int64_t SliceBackward(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ // Slice in reverse order (step < 0)
+ const SliceOptions& opt = *this->options;
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* begin_sliced = begin;
+ const uint8_t* end_sliced = end;
+
+ // Serious +1 -1 kung fu because begin_sliced and end_sliced act like
+ // reverse iterators.
+ if (opt.start >= 0) {
+ // +1 because begin_sliced acts as the end of a reverse iterator
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start + 1));
+ } else {
+ // -1 because start=-1 means the last codeunit, which is 0 advances from the end
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin, end, &begin_sliced, -opt.start - 1));
+ }
+ // make it point at the last codeunit of the previous codepoint
+ begin_sliced--;
+
+ // similar to opt.start
+ if (opt.stop >= 0) {
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop + 1));
+ } else {
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin, end, &end_sliced, -opt.stop - 1));
+ }
+ end_sliced--;
+
+ // Copy computed slice to output
+ uint8_t* dest = output;
+ const uint8_t* i = begin_sliced;
+ while (i > end_sliced) {
+ uint32_t codepoint = 0;
+ // write a single codepoint
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
+ dest = arrow::util::UTF8Encode(dest, codepoint);
+ // and skip the remainder
+ int64_t skips = -opt.step - 1;
+ while ((skips--) && (i > end_sliced)) {
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
+ }
+ }
+ return dest - output;
+ }
+
+#undef RETURN_IF_UTF8_ERROR
+};
+
+template <typename Type>
+using SliceCodeunits = StringTransformExec<Type, SliceCodeunitsTransform>;
+
+const FunctionDoc utf8_slice_codeunits_doc(
+ "Slice string",
+ ("For each string in `strings`, slice into a substring defined by\n"
+ "(`start`, `stop`, `step`) as given by `SliceOptions`, where `start` is\n"
+ "inclusive and `stop` is exclusive, both measured in codeunits. If `step` is\n"
+ "negative, the string is advanced in reverse order. A `step` of zero is\n"
+ "considered an error.\n"
+ "Null inputs emit null."),
+ {"strings"}, "SliceOptions");
+
+void AddSlice(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("utf8_slice_codeunits", Arity::Unary(),
+ &utf8_slice_codeunits_doc);
+ using t32 = SliceCodeunits<StringType>;
+ using t64 = SliceCodeunits<LargeStringType>;
+ DCHECK_OK(
+ func->AddKernel({utf8()}, utf8(), t32::Exec, SliceCodeunitsTransform::State::Init));
+ DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), t64::Exec,
+ SliceCodeunitsTransform::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
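+
+// Usage sketch (assuming the public arrow::compute API):
+//
+//   SliceOptions options(/*start=*/1, /*stop=*/4);
+//   ARROW_ASSIGN_OR_RAISE(Datum sliced,
+//                         CallFunction("utf8_slice_codeunits", {strings}, &options));
+//
+// maps "abcdef" to "bcd"; negative indices count from the end, as in Python.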
+
+// IsAlpha/Digit etc
+
+#ifdef ARROW_WITH_UTF8PROC
+
+static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) {
+ utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup
+ ? lut_category[codepoint]
+ : utf8proc_category(codepoint);
+ uint32_t general_category_bit = 1 << general_category;
+ // e.g. for undefined (but valid) codepoints, general_category == 0 ==
+ // UTF8PROC_CATEGORY_CN
+ return (general_category != UTF8PROC_CATEGORY_CN) &&
+ ((general_category_bit & mask) != 0);
+}
+
+template <typename... Categories>
+static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask,
+ utf8proc_category_t category,
+ Categories... categories) {
+ return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...);
+}
+
+template <typename... Categories>
+static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint,
+ utf8proc_category_t category,
+ Categories... categories) {
+ return HasAnyUnicodeGeneralCategory(codepoint, static_cast<uint32_t>(1u << category),
+ categories...);
+}
+
+static inline bool IsCasedCharacterUnicode(uint32_t codepoint) {
+ return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
+ UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) ||
+ ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) != codepoint) ||
+ (static_cast<uint32_t>(utf8proc_tolower(codepoint)) != codepoint));
+}
+
+static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) {
+ // Although this trick seems to work for upper case, it is not enough for lower
+ // case testing; see https://github.com/JuliaStrings/utf8proc/issues/195 .
+ // But it is currently the best we can do.
+ return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL) ||
+ ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) != codepoint) &&
+ (static_cast<uint32_t>(utf8proc_tolower(codepoint)) == codepoint))) &&
+ !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
+}
+
+static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) {
+ // this seems to be a good workaround for utf8proc not having case information
+ // https://github.com/JuliaStrings/utf8proc/issues/195
+ return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) ||
+ ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) == codepoint) &&
+ (static_cast<uint32_t>(utf8proc_tolower(codepoint)) != codepoint))) &&
+ !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
+}
+
+static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint) {
+ return HasAnyUnicodeGeneralCategory(
+ codepoint, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
+ UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_ND,
+ UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO);
+}
+
+static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) {
+ return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
+ UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
+ UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO);
+}
+
+static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) {
+ return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
+}
+
+static inline bool IsDigitCharacterUnicode(uint32_t codepoint) {
+ // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal.
+ // utf8proc has no support for this; this is the best we can do:
+ return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
+}
+
+static inline bool IsNumericCharacterUnicode(uint32_t codepoint) {
+ // Formally this is not correct, but utf8proc does not allow us to query for Numerical
+ // properties, e.g. Numeric_Value and Numeric_Type
+ // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or
+ // Numeric_Type=Numeric.
+ return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND,
+ UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO);
+}
+
+static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) {
+ auto property = utf8proc_get_property(codepoint);
+ return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) ||
+ property->bidi_class == UTF8PROC_BIDI_CLASS_WS ||
+ property->bidi_class == UTF8PROC_BIDI_CLASS_B ||
+ property->bidi_class == UTF8PROC_BIDI_CLASS_S;
+}
+
+static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) {
+ uint32_t general_category = utf8proc_category(codepoint);
+ return (general_category != UTF8PROC_CATEGORY_CN) &&
+ !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC,
+ UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS,
+ UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS,
+ UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP);
+}
+
+#endif
+
+static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) {
+ return (ascii_character >= 'a') && (ascii_character <= 'z');
+}
+
+static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) {
+ return (ascii_character >= 'A') && (ascii_character <= 'Z');
+}
+
+static inline bool IsCasedCharacterAscii(uint8_t ascii_character) {
+ return IsLowerCaseCharacterAscii(ascii_character) ||
+ IsUpperCaseCharacterAscii(ascii_character);
+}
+
+static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) {
+ return IsCasedCharacterAscii(ascii_character); // same
+}
+
+static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) {
+ return ((ascii_character >= '0') && (ascii_character <= '9')) ||
+ ((ascii_character >= 'a') && (ascii_character <= 'z')) ||
+ ((ascii_character >= 'A') && (ascii_character <= 'Z'));
+}
+
+static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
+ return ((ascii_character >= '0') && (ascii_character <= '9'));
+}
+
+static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) {
+ return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) ||
+ (ascii_character == ' ');
+}
+
+static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
+ return ((ascii_character >= ' ') && (ascii_character <= '~'));
+}
+
+template <typename Derived, bool allow_empty = false>
+struct CharacterPredicateUnicode {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status* st) {
+ if (allow_empty && input_string_ncodeunits == 0) {
+ return true;
+ }
+ bool all;
+ bool any = false;
+ if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf(
+ input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) {
+ any |= Derived::PredicateCharacterAny(codepoint);
+ return Derived::PredicateCharacterAll(codepoint);
+ }))) {
+ *st = Status::Invalid("Invalid UTF8 sequence in input");
+ return false;
+ }
+ return all & any;
+ }
+
+ static inline bool PredicateCharacterAny(uint32_t) {
+ return true; // default condition makes sure there is at least 1 character
+ }
+};
+
+template <typename Derived, bool allow_empty = false>
+struct CharacterPredicateAscii {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status*) {
+ if (allow_empty && input_string_ncodeunits == 0) {
+ return true;
+ }
+ bool any = false;
+ // MB: A simple for loop seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii
+ // benchmark. I don't consider that worth it.
+ bool all = std::all_of(input, input + input_string_ncodeunits,
+ [&any](uint8_t ascii_character) {
+ any |= Derived::PredicateCharacterAny(ascii_character);
+ return Derived::PredicateCharacterAll(ascii_character);
+ });
+ return all & any;
+ }
+
+ static inline bool PredicateCharacterAny(uint8_t) {
+ return true; // default condition makes sure there is at least 1 character
+ }
+};
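+
+// Combined, the "all" and "any" predicates give Python-like semantics; e.g.
+// for the IsLower predicates below, "ab1" is true (all cased characters are
+// lower case and at least one cased character exists) while "123" is false
+// (no cased character at all).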
+
+#ifdef ARROW_WITH_UTF8PROC
+struct IsAlphaNumericUnicode : CharacterPredicateUnicode<IsAlphaNumericUnicode> {
+ static inline bool PredicateCharacterAll(uint32_t codepoint) {
+ return IsAlphaNumericCharacterUnicode(codepoint);
+ }
+};
+#endif
+
+struct IsAlphaNumericAscii : CharacterPredicateAscii<IsAlphaNumericAscii> {
+ static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+ return IsAlphaNumericCharacterAscii(ascii_character);
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+struct IsAlphaUnicode : CharacterPredicateUnicode<IsAlphaUnicode> {
+ static inline bool PredicateCharacterAll(uint32_t codepoint) {
+ return IsAlphaCharacterUnicode(codepoint);
+ }
+};
+#endif
+
+struct IsAlphaAscii : CharacterPredicateAscii<IsAlphaAscii> {
+ static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+ return IsAlphaCharacterAscii(ascii_character);
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+struct IsDecimalUnicode : CharacterPredicateUnicode<IsDecimalUnicode> {
+ static inline bool PredicateCharacterAll(uint32_t codepoint) {
+ return IsDecimalCharacterUnicode(codepoint);
+ }
+};
+#endif
+
+struct IsDecimalAscii : CharacterPredicateAscii<IsDecimalAscii> {
+ static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+ return IsDecimalCharacterAscii(ascii_character);
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+struct IsDigitUnicode : CharacterPredicateUnicode<IsDigitUnicode> {
+ static inline bool PredicateCharacterAll(uint32_t codepoint) {
+ return IsDigitCharacterUnicode(codepoint);
+ }
+};
+
+struct IsNumericUnicode : CharacterPredicateUnicode<IsNumericUnicode> {
+ static inline bool PredicateCharacterAll(uint32_t codepoint) {
+ return IsNumericCharacterUnicode(codepoint);
+ }
+};
+#endif
+
+struct IsAscii {
+ static bool Call(KernelContext*, const uint8_t* input,
+ size_t input_string_nascii_characters, Status*) {
+ return std::all_of(input, input + input_string_nascii_characters,
+ IsAsciiCharacter<uint8_t>);
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+struct IsLowerUnicode : CharacterPredicateUnicode<IsLowerUnicode> {
+ static inline bool PredicateCharacterAll(uint32_t codepoint) {
+ // Only cased characters need to be lower case
+ return !IsCasedCharacterUnicode(codepoint) || IsLowerCaseCharacterUnicode(codepoint);
+ }
+ static inline bool PredicateCharacterAny(uint32_t codepoint) {
+ return IsCasedCharacterUnicode(codepoint); // at least 1 cased character
+ }
+};
+#endif
+
+struct IsLowerAscii : CharacterPredicateAscii<IsLowerAscii> {
+ static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+ // Only cased characters need to be lower case
+ return !IsCasedCharacterAscii(ascii_character) ||
+ IsLowerCaseCharacterAscii(ascii_character);
+ }
+ static inline bool PredicateCharacterAny(uint8_t ascii_character) {
+ return IsCasedCharacterAscii(ascii_character); // at least 1 cased character
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+struct IsPrintableUnicode
+ : CharacterPredicateUnicode<IsPrintableUnicode, /*allow_empty=*/true> {
+ static inline bool PredicateCharacterAll(uint32_t codepoint) {
+ return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint);
+ }
+};
+#endif
+
+struct IsPrintableAscii
+ : CharacterPredicateAscii<IsPrintableAscii, /*allow_empty=*/true> {
+ static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+ return IsPrintableCharacterAscii(ascii_character);
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+struct IsSpaceUnicode : CharacterPredicateUnicode<IsSpaceUnicode> {
+ static inline bool PredicateCharacterAll(uint32_t codepoint) {
+ return IsSpaceCharacterUnicode(codepoint);
+ }
+};
+#endif
+
+struct IsSpaceAscii : CharacterPredicateAscii<IsSpaceAscii> {
+ static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+ return IsSpaceCharacterAscii(ascii_character);
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+struct IsTitleUnicode {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status* st) {
+ // rules:
+ // * 1: lower case follows cased
+ // * 2: upper case follows uncased
+ // * 3: at least 1 cased character (which logically should be upper/title)
+ bool rules_1_and_2;
+ bool previous_cased = false; // in LL, LU or LT
+ bool rule_3 = false;
+ bool status =
+ arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2,
+ [&previous_cased, &rule_3](uint32_t codepoint) {
+ if (IsLowerCaseCharacterUnicode(codepoint)) {
+ if (!previous_cased) return false; // rule 1 broken
+ previous_cased = true;
+ } else if (IsCasedCharacterUnicode(codepoint)) {
+ if (previous_cased) return false; // rule 2 broken
+ // next should be a lower case or uncased
+ previous_cased = true;
+ rule_3 = true; // rule 3 obeyed
+ } else {
+ // a non-cased char, like _ or 1
+ // next should be upper case or more uncased
+ previous_cased = false;
+ }
+ return true;
+ });
+ if (!ARROW_PREDICT_TRUE(status)) {
+ *st = Status::Invalid("Invalid UTF8 sequence in input");
+ return false;
+ }
+ return rules_1_and_2 & rule_3;
+ }
+};
+#endif
+
+struct IsTitleAscii {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status*) {
+ // rules:
+ // * 1: lower case follows cased
+ // * 2: upper case follows uncased
+ // * 3: at least 1 cased character (which logically should be upper/title)
+ bool rules_1_and_2 = true;
+ bool previous_cased = false; // in LL, LU or LT
+ bool rule_3 = false;
+ // we cannot rely on std::all_of because we need guaranteed order
+ for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) {
+ if (IsLowerCaseCharacterAscii(*c)) {
+ if (!previous_cased) {
+ // rule 1 broken
+ rules_1_and_2 = false;
+ break;
+ }
+ previous_cased = true;
+ } else if (IsCasedCharacterAscii(*c)) {
+ if (previous_cased) {
+ // rule 2 broken
+ rules_1_and_2 = false;
+ break;
+ }
+ // next should be a lower case or uncased
+ previous_cased = true;
+ rule_3 = true; // rule 3 obeyed
+ } else {
+ // a non-cased char, like _ or 1
+ // next should be upper case or more uncased
+ previous_cased = false;
+ }
+ }
+ return rules_1_and_2 & rule_3;
+ }
+};
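+
+// For example, under these rules "Ab Cd" and "A1" are title-cased, while "AB"
+// (rule 2: upper case after a cased character) and "ab" (rule 1: lower case
+// without a preceding cased character) are not.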
+
+#ifdef ARROW_WITH_UTF8PROC
+struct IsUpperUnicode : CharacterPredicateUnicode<IsUpperUnicode> {
+ static inline bool PredicateCharacterAll(uint32_t codepoint) {
+ // Only cased characters need to be upper case
+ return !IsCasedCharacterUnicode(codepoint) || IsUpperCaseCharacterUnicode(codepoint);
+ }
+ static inline bool PredicateCharacterAny(uint32_t codepoint) {
+ return IsCasedCharacterUnicode(codepoint); // at least 1 cased character
+ }
+};
+#endif
+
+struct IsUpperAscii : CharacterPredicateAscii<IsUpperAscii> {
+ static inline bool PredicateCharacterAll(uint8_t ascii_character) {
+ // Only cased characters need to be upper case
+ return !IsCasedCharacterAscii(ascii_character) ||
+ IsUpperCaseCharacterAscii(ascii_character);
+ }
+ static inline bool PredicateCharacterAny(uint8_t ascii_character) {
+ return IsCasedCharacterAscii(ascii_character); // at least 1 cased character
+ }
+};
+
+// splitting
+
+template <typename Options>
+struct SplitFinderBase {
+ virtual Status PreExec(const Options& options) { return Status::OK(); }
+
+ // Derived classes should also define these methods:
+ // static bool Find(const uint8_t* begin, const uint8_t* end,
+ // const uint8_t** separator_begin,
+ // const uint8_t** separator_end,
+ // const SplitPatternOptions& options);
+ //
+ // static bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ // const uint8_t** separator_begin,
+ // const uint8_t** separator_end,
+ // const SplitPatternOptions& options);
+};
+
+template <typename Type, typename ListType, typename SplitFinder,
+ typename Options = typename SplitFinder::Options>
+struct SplitExec {
+ using string_offset_type = typename Type::offset_type;
+ using list_offset_type = typename ListType::offset_type;
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using ArrayListType = typename TypeTraits<ListType>::ArrayType;
+ using ListScalarType = typename TypeTraits<ListType>::ScalarType;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
+ using State = OptionsWrapper<Options>;
+
+ // Keep the temporary storage across individual values, to minimize reallocations
+ std::vector<util::string_view> parts;
+ Options options;
+
+ explicit SplitExec(const Options& options) : options(options) {}
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return SplitExec{State::Get(ctx)}.Execute(ctx, batch, out);
+ }
+
+ Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ SplitFinder finder;
+ RETURN_NOT_OK(finder.PreExec(options));
+ if (batch[0].kind() == Datum::ARRAY) {
+ return Execute(ctx, &finder, batch[0].array(), out);
+ }
+ DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
+ return Execute(ctx, &finder, batch[0].scalar(), out);
+ }
+
+ Status Execute(KernelContext* ctx, SplitFinder* finder,
+ const std::shared_ptr<ArrayData>& data, Datum* out) {
+ const ArrayType input(data);
+
+ BuilderType builder(input.type(), ctx->memory_pool());
+ // A slight overestimate of the data needed
+ RETURN_NOT_OK(builder.ReserveData(input.total_values_length()));
+ // The minimum number of strings needed
+ RETURN_NOT_OK(builder.Resize(input.length() - input.null_count()));
+
+ ArrayData* output_list = out->mutable_array();
+ // List offsets were preallocated
+ auto* list_offsets = output_list->GetMutableValues<list_offset_type>(1);
+ DCHECK_NE(list_offsets, nullptr);
+ // Initial value
+ *list_offsets++ = 0;
+ for (int64_t i = 0; i < input.length(); ++i) {
+ if (!input.IsNull(i)) {
+ RETURN_NOT_OK(SplitString(input.GetView(i), finder, &builder));
+ if (ARROW_PREDICT_FALSE(builder.length() >
+ std::numeric_limits<list_offset_type>::max())) {
+ return Status::CapacityError("List offset does not fit into 32 bits");
+ }
+ }
+ *list_offsets++ = static_cast<list_offset_type>(builder.length());
+ }
+ // Assign string array to list child data
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ output_list->child_data.push_back(string_array->data());
+ return Status::OK();
+ }
+
+ Status Execute(KernelContext* ctx, SplitFinder* finder,
+ const std::shared_ptr<Scalar>& scalar, Datum* out) {
+ const auto& input = checked_cast<const ScalarType&>(*scalar);
+ auto result = checked_cast<ListScalarType*>(out->scalar().get());
+ if (input.is_valid) {
+ result->is_valid = true;
+ BuilderType builder(input.type, ctx->memory_pool());
+ util::string_view s(*input.value);
+ RETURN_NOT_OK(SplitString(s, finder, &builder));
+ RETURN_NOT_OK(builder.Finish(&result->value));
+ }
+ return Status::OK();
+ }
+
+ Status SplitString(const util::string_view& s, SplitFinder* finder,
+ BuilderType* builder) {
+ const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
+ const uint8_t* end = begin + s.length();
+
+ int64_t max_splits = options.max_splits;
+ // if there is no max splits, reversing yields the same parts (and is probably
+ // less efficient), but it is useful for testing
+ if (options.reverse) {
+ // note that i points one past the 'current' position
+ const uint8_t* i = end;
+ // we will record the parts in reverse order
+ parts.clear();
+ if (max_splits > -1) {
+ parts.reserve(max_splits + 1);
+ }
+ while (max_splits != 0) {
+ const uint8_t *separator_begin, *separator_end;
+ // find with whatever algo the part we will 'cut out'
+ if (finder->FindReverse(begin, i, &separator_begin, &separator_end, options)) {
+ parts.emplace_back(reinterpret_cast<const char*>(separator_end),
+ i - separator_end);
+ i = separator_begin;
+ max_splits--;
+ } else {
+ // if we cannot find a separator, we're done
+ break;
+ }
+ }
+ parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
+ // now we do the copying
+ for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
+ RETURN_NOT_OK(builder->Append(*it));
+ }
+ } else {
+ const uint8_t* i = begin;
+ while (max_splits != 0) {
+ const uint8_t *separator_begin, *separator_end;
+ // find with whatever algo the part we will 'cut out'
+ if (finder->Find(i, end, &separator_begin, &separator_end, options)) {
+ // the part till the beginning of the 'cut'
+ RETURN_NOT_OK(
+ builder->Append(i, static_cast<string_offset_type>(separator_begin - i)));
+ i = separator_end;
+ max_splits--;
+ } else {
+ // if we cannot find a separator, we're done
+ break;
+ }
+ }
+ // trailing part
+ RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - i)));
+ }
+ return Status::OK();
+ }
+};
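+
+// For example, splitting "a,b,c" on pattern "," with max_splits=1 yields
+// ["a", "b,c"] in forward mode and ["a,b", "c"] with reverse=true; the
+// remainder is always appended as the final (or first) part.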
+
+struct SplitPatternFinder : public SplitFinderBase<SplitPatternOptions> {
+ using Options = SplitPatternOptions;
+
+ Status PreExec(const SplitPatternOptions& options) override {
+ if (options.pattern.length() == 0) {
+ return Status::Invalid("Empty separator");
+ }
+ return Status::OK();
+ }
+
+ static bool Find(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitPatternOptions& options) {
+ const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
+ const int64_t pattern_length = options.pattern.length();
+ const uint8_t* i = begin;
+ // this is O(n*m) complexity; we could reuse the Knuth-Morris-Pratt algorithm
+ // from the match kernel
+ while ((i + pattern_length <= end)) {
+ i = std::search(i, end, pattern, pattern + pattern_length);
+ if (i != end) {
+ *separator_begin = i;
+ *separator_end = i + pattern_length;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ static bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitPatternOptions& options) {
+ const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
+ const int64_t pattern_length = options.pattern.length();
+ // this is O(n*m) complexity; we could reuse the Knuth-Morris-Pratt algorithm
+ // from the match kernel
+ std::reverse_iterator<const uint8_t*> ri(end);
+ std::reverse_iterator<const uint8_t*> rend(begin);
+ std::reverse_iterator<const uint8_t*> pattern_rbegin(pattern + pattern_length);
+ std::reverse_iterator<const uint8_t*> pattern_rend(pattern);
+ while (begin <= ri.base() - pattern_length) {
+ ri = std::search(ri, rend, pattern_rbegin, pattern_rend);
+ if (ri != rend) {
+ *separator_begin = ri.base() - pattern_length;
+ *separator_end = ri.base();
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitPatternExec = SplitExec<Type, ListType, SplitPatternFinder>;
+
+const FunctionDoc split_pattern_doc(
+ "Split string according to separator",
+ ("Split each string according to the exact `pattern` defined in\n"
+ "SplitPatternOptions. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitPatternOptions."),
+ {"strings"}, "SplitPatternOptions");
+
+const FunctionDoc ascii_split_whitespace_doc(
+ "Split string according to any ASCII whitespace",
+ ("Split each string according to any non-zero length sequence of ASCII\n"
+ "whitespace characters. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitOptions."),
+ {"strings"}, "SplitOptions");
+
+const FunctionDoc utf8_split_whitespace_doc(
+ "Split string according to any Unicode whitespace",
+ ("Split each string according to any non-zero length sequence of Unicode\n"
+ "whitespace characters. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitOptions."),
+ {"strings"}, "SplitOptions");
+
+void AddSplitPattern(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("split_pattern", Arity::Unary(),
+ &split_pattern_doc);
+ using t32 = SplitPatternExec<StringType, ListType>;
+ using t64 = SplitPatternExec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+struct SplitWhitespaceAsciiFinder : public SplitFinderBase<SplitOptions> {
+ using Options = SplitOptions;
+
+ static bool Find(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitOptions& options) {
+ const uint8_t* i = begin;
+ while (i < end) {
+ if (IsSpaceCharacterAscii(*i)) {
+ *separator_begin = i;
+ do {
+ i++;
+ } while (i < end && IsSpaceCharacterAscii(*i)); // bounds check first: no read past end
+ *separator_end = i;
+ return true;
+ }
+ i++;
+ }
+ return false;
+ }
+
+ static bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitOptions& options) {
+ const uint8_t* i = end - 1;
+ while ((i >= begin)) {
+ if (IsSpaceCharacterAscii(*i)) {
+ *separator_end = i + 1;
+ do {
+ i--;
+ } while (i >= begin && IsSpaceCharacterAscii(*i)); // bounds check first: no read before begin
+ *separator_begin = i + 1;
+ return true;
+ }
+ i--;
+ }
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitWhitespaceAsciiExec = SplitExec<Type, ListType, SplitWhitespaceAsciiFinder>;
+
+void AddSplitWhitespaceAscii(FunctionRegistry* registry) {
+ static const SplitOptions default_options{};
+ auto func =
+ std::make_shared<ScalarFunction>("ascii_split_whitespace", Arity::Unary(),
+ &ascii_split_whitespace_doc, &default_options);
+ using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
+ using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+#ifdef ARROW_WITH_UTF8PROC
+struct SplitWhitespaceUtf8Finder : public SplitFinderBase<SplitOptions> {
+ using Options = SplitOptions;
+
+ Status PreExec(const SplitOptions& options) override {
+ EnsureLookupTablesFilled();
+ return Status::OK();
+ }
+
+ bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
+ const uint8_t** separator_end, const SplitOptions& options) {
+ const uint8_t* i = begin;
+ while ((i < end)) {
+ uint32_t codepoint = 0;
+ *separator_begin = i;
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ if (IsSpaceCharacterUnicode(codepoint)) {
+ // extend the separator over the whole run of whitespace, checking the
+ // bounds before each decode so we never read past `end`
+ *separator_end = i;
+ while (i < end) {
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ if (!IsSpaceCharacterUnicode(codepoint)) break;
+ *separator_end = i;
+ }
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitOptions& options) {
+ const uint8_t* i = end - 1;
+ while ((i >= begin)) {
+ uint32_t codepoint = 0;
+ *separator_end = i + 1;
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
+ return false;
+ }
+ if (IsSpaceCharacterUnicode(codepoint)) {
+ // extend the separator over the whole run of whitespace, checking the
+ // bounds before each decode so we never read before `begin`
+ *separator_begin = i + 1;
+ while (i >= begin) {
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
+ return false;
+ }
+ if (!IsSpaceCharacterUnicode(codepoint)) break;
+ *separator_begin = i + 1;
+ }
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitWhitespaceUtf8Exec = SplitExec<Type, ListType, SplitWhitespaceUtf8Finder>;
+
+void AddSplitWhitespaceUTF8(FunctionRegistry* registry) {
+ static const SplitOptions default_options{};
+ auto func =
+ std::make_shared<ScalarFunction>("utf8_split_whitespace", Arity::Unary(),
+ &utf8_split_whitespace_doc, &default_options);
+ using t32 = SplitWhitespaceUtf8Exec<StringType, ListType>;
+ using t64 = SplitWhitespaceUtf8Exec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+#endif // ARROW_WITH_UTF8PROC
+
+#ifdef ARROW_WITH_RE2
+struct SplitRegexFinder : public SplitFinderBase<SplitPatternOptions> {
+ using Options = SplitPatternOptions;
+
+ util::optional<RE2> regex_split;
+
+ Status PreExec(const SplitPatternOptions& options) override {
+ if (options.reverse) {
+ return Status::NotImplemented("Cannot split in reverse with regex");
+ }
+ // RE2 does *not* give you the full match! Must wrap the regex in a capture group
+ // There is FindAndConsume, but it would give only the end of the separator
+ std::string pattern = "(";
+ pattern.reserve(options.pattern.size() + 2);
+ pattern += options.pattern;
+ pattern += ')';
+ regex_split.emplace(std::move(pattern));
+ return RegexStatus(*regex_split);
+ }
+
+ bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
+ const uint8_t** separator_end, const SplitPatternOptions& options) {
+ re2::StringPiece piece(reinterpret_cast<const char*>(begin),
+ std::distance(begin, end));
+ // "StringPiece is mutated to point to matched piece"
+ re2::StringPiece result;
+ if (!re2::RE2::PartialMatch(piece, *regex_split, &result)) {
+ return false;
+ }
+ *separator_begin = reinterpret_cast<const uint8_t*>(result.data());
+ *separator_end = reinterpret_cast<const uint8_t*>(result.data() + result.size());
+ return true;
+ }
+
+ bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitPatternOptions& options) {
+ // Unsupported (see PreExec)
+ return false;
+ }
+};
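+
+// Usage sketch (illustrative only, assuming the arrow::compute API):
+//   SplitPatternOptions options("\\d+");
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//       CallFunction("split_pattern_regex", {strings}, &options));
+//   // "a1b22c" -> ["a", "b", "c"]; reverse splitting is not supported here.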
+
+template <typename Type, typename ListType>
+using SplitRegexExec = SplitExec<Type, ListType, SplitRegexFinder>;
+
+const FunctionDoc split_pattern_regex_doc(
+ "Split string according to regex pattern",
+ ("Split each string according to the regex `pattern` defined in\n"
+ "SplitPatternOptions. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitPatternOptions."),
+ {"strings"}, "SplitPatternOptions");
+
+void AddSplitRegex(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("split_pattern_regex", Arity::Unary(),
+ &split_pattern_regex_doc);
+ using t32 = SplitRegexExec<StringType, ListType>;
+ using t64 = SplitRegexExec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+#endif // ARROW_WITH_RE2
+
+void AddSplit(FunctionRegistry* registry) {
+ AddSplitPattern(registry);
+ AddSplitWhitespaceAscii(registry);
+#ifdef ARROW_WITH_UTF8PROC
+ AddSplitWhitespaceUTF8(registry);
+#endif
+#ifdef ARROW_WITH_RE2
+ AddSplitRegex(registry);
+#endif
+}
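+
+// Usage sketch for the plain splitter (illustrative, assuming arrow::compute):
+//   SplitPatternOptions options(", ", /*max_splits=*/1);
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//       CallFunction("split_pattern", {strings}, &options));
+//   // "a, b, c" -> ["a", "b, c"]: at most one split, counted from the left.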
+
+// ----------------------------------------------------------------------
+// Replace substring (plain, regex)
+
+template <typename Type, typename Replacer>
+struct ReplaceSubString {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using offset_type = typename Type::offset_type;
+ using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
+ using OffsetBuilder = TypedBufferBuilder<offset_type>;
+ using State = OptionsWrapper<ReplaceSubstringOptions>;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // TODO Cache replacer across invocations (for regex compilation)
+ ARROW_ASSIGN_OR_RAISE(auto replacer, Replacer::Make(State::Get(ctx)));
+ return Replace(ctx, batch, *replacer, out);
+ }
+
+ static Status Replace(KernelContext* ctx, const ExecBatch& batch,
+ const Replacer& replacer, Datum* out) {
+ ValueDataBuilder value_data_builder(ctx->memory_pool());
+ OffsetBuilder offset_builder(ctx->memory_pool());
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ // We already know how many strings we have, so we can use Reserve/UnsafeAppend
+ RETURN_NOT_OK(offset_builder.Reserve(batch[0].array()->length + 1));
+ offset_builder.UnsafeAppend(0); // offsets start at 0
+
+ const ArrayData& input = *batch[0].array();
+ RETURN_NOT_OK(VisitArrayDataInline<Type>(
+ input,
+ [&](util::string_view s) {
+ RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
+ offset_builder.UnsafeAppend(
+ static_cast<offset_type>(value_data_builder.length()));
+ return Status::OK();
+ },
+ [&]() {
+ // offset for null value
+ offset_builder.UnsafeAppend(
+ static_cast<offset_type>(value_data_builder.length()));
+ return Status::OK();
+ }));
+ ArrayData* output = out->mutable_array();
+ RETURN_NOT_OK(value_data_builder.Finish(&output->buffers[2]));
+ RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1]));
+ } else {
+ const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
+ auto result = std::make_shared<ScalarType>();
+ if (input.is_valid) {
+ util::string_view s = static_cast<util::string_view>(*input.value);
+ RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
+ RETURN_NOT_OK(value_data_builder.Finish(&result->value));
+ result->is_valid = true;
+ }
+ out->value = result;
+ }
+
+ return Status::OK();
+ }
+};
+
+struct PlainSubStringReplacer {
+ const ReplaceSubstringOptions& options_;
+
+ static Result<std::unique_ptr<PlainSubStringReplacer>> Make(
+ const ReplaceSubstringOptions& options) {
+ return arrow::internal::make_unique<PlainSubStringReplacer>(options);
+ }
+
+ explicit PlainSubStringReplacer(const ReplaceSubstringOptions& options)
+ : options_(options) {}
+
+ Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
+ const char* i = s.begin();
+ const char* end = s.end();
+ int64_t max_replacements = options_.max_replacements;
+ while ((i < end) && (max_replacements != 0)) {
+ const char* pos =
+ std::search(i, end, options_.pattern.begin(), options_.pattern.end());
+ if (pos == end) {
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i)));
+ i = end;
+ } else {
+ // the string before the pattern
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(pos - i)));
+ // the replacement
+ RETURN_NOT_OK(
+ builder->Append(reinterpret_cast<const uint8_t*>(options_.replacement.data()),
+ options_.replacement.length()));
+ // skip pattern
+ i = pos + options_.pattern.length();
+ max_replacements--;
+ }
+ }
+ // if we exited early due to max_replacements, add the trailing part
+ return builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i));
+ }
+};
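+
+// Illustrative behavior: with pattern "aa", replacement "b" and
+// max_replacements == 1, "aaaa" -> "baa": matches are non-overlapping,
+// counted from the left, and the remainder is appended verbatim.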
+
+#ifdef ARROW_WITH_RE2
+struct RegexSubStringReplacer {
+ const ReplaceSubstringOptions& options_;
+ const RE2 regex_find_;
+ const RE2 regex_replacement_;
+
+ static Result<std::unique_ptr<RegexSubStringReplacer>> Make(
+ const ReplaceSubstringOptions& options) {
+ auto replacer = arrow::internal::make_unique<RegexSubStringReplacer>(options);
+
+ RETURN_NOT_OK(RegexStatus(replacer->regex_find_));
+ RETURN_NOT_OK(RegexStatus(replacer->regex_replacement_));
+
+ std::string replacement_error;
+ if (!replacer->regex_replacement_.CheckRewriteString(replacer->options_.replacement,
+ &replacement_error)) {
+ return Status::Invalid("Invalid replacement string: ",
+ std::move(replacement_error));
+ }
+
+ return std::move(replacer);
+ }
+
+  // RE2::FindAndConsume only reports the match if the pattern is wrapped in a
+  // capture group, so we keep two regexes: one with () around the pattern
+  // (for finding) and one without (for replacing).
+ explicit RegexSubStringReplacer(const ReplaceSubstringOptions& options)
+ : options_(options),
+ regex_find_("(" + options_.pattern + ")", RE2::Quiet),
+ regex_replacement_(options_.pattern, RE2::Quiet) {}
+
+ Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
+ re2::StringPiece replacement(options_.replacement);
+
+ if (options_.max_replacements == -1) {
+ std::string s_copy(s.to_string());
+ re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement);
+ return builder->Append(reinterpret_cast<const uint8_t*>(s_copy.data()),
+ s_copy.length());
+ }
+
+    // Since RE2 does not have the concept of max_replacements, we have to do
+    // some work ourselves.
+    // This could be made faster, in the style of RE2::GlobalReplace, by using
+    // Match and Rewrite directly.
+ const char* i = s.begin();
+ const char* end = s.end();
+ re2::StringPiece piece(s.data(), s.length());
+
+ int64_t max_replacements = options_.max_replacements;
+ while ((i < end) && (max_replacements != 0)) {
+ std::string found;
+ if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) {
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i)));
+ i = end;
+ } else {
+ // wind back to the beginning of the match
+ const char* pos = piece.begin() - found.length();
+ // the string before the pattern
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(pos - i)));
+ // replace the pattern in what we found
+ if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) {
+ return Status::Invalid("Regex found, but replacement failed");
+ }
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(found.data()),
+ static_cast<int64_t>(found.length())));
+ // skip pattern
+ i = piece.begin();
+ max_replacements--;
+ }
+ }
+ // If we exited early due to max_replacements, add the trailing part
+ return builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i));
+ }
+};
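+
+// Illustrative behavior: with pattern "(\\w)x" and replacement "\\1y",
+// max_replacements == -1 rewrites every match ("axbx" -> "ayby"), while
+// max_replacements == 1 rewrites only the first ("axbx" -> "aybx").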
+#endif
+
+template <typename Type>
+using ReplaceSubStringPlain = ReplaceSubString<Type, PlainSubStringReplacer>;
+
+const FunctionDoc replace_substring_doc(
+ "Replace non-overlapping substrings that match pattern by replacement",
+ ("For each string in `strings`, replace non-overlapping substrings that match\n"
+ "`pattern` by `replacement`. If `max_replacements != -1`, it determines the\n"
+ "maximum amount of replacements made, counting from the left. Null values emit\n"
+ "null."),
+ {"strings"}, "ReplaceSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+template <typename Type>
+using ReplaceSubStringRegex = ReplaceSubString<Type, RegexSubStringReplacer>;
+
+const FunctionDoc replace_substring_regex_doc(
+ "Replace non-overlapping substrings that match regex `pattern` by `replacement`",
+ ("For each string in `strings`, replace non-overlapping substrings that match the\n"
+ "regular expression `pattern` by `replacement` using the Google RE2 library.\n"
+ "If `max_replacements != -1`, it determines the maximum amount of replacements\n"
+ "made, counting from the left. Note that if the pattern contains groups,\n"
+ "backreferencing macan be used. Null values emit null."),
+ {"strings"}, "ReplaceSubstringOptions");
+#endif
+
+// ----------------------------------------------------------------------
+// Replace slice
+
+struct ReplaceSliceTransformBase : public StringTransformBase {
+ using State = OptionsWrapper<ReplaceSliceOptions>;
+
+ const ReplaceSliceOptions* options;
+
+ explicit ReplaceSliceTransformBase(const ReplaceSliceOptions& options)
+ : options{&options} {}
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ return ninputs * options->replacement.size() + input_ncodeunits;
+ }
+};
+
+struct BinaryReplaceSliceTransform : ReplaceSliceTransformBase {
+ using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const auto& opts = *options;
+ int64_t before_slice = 0;
+ int64_t after_slice = 0;
+ uint8_t* output_start = output;
+
+ if (opts.start >= 0) {
+ // Count from left
+ before_slice = std::min<int64_t>(input_string_ncodeunits, opts.start);
+ } else {
+ // Count from right
+ before_slice = std::max<int64_t>(0, input_string_ncodeunits + opts.start);
+ }
+ // Mimic Pandas: if stop would be before start, treat as 0-length slice
+ if (opts.stop >= 0) {
+ // Count from left
+ after_slice =
+ std::min<int64_t>(input_string_ncodeunits, std::max(before_slice, opts.stop));
+ } else {
+ // Count from right
+ after_slice = std::max<int64_t>(before_slice, input_string_ncodeunits + opts.stop);
+ }
+ output = std::copy(input, input + before_slice, output);
+ output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
+ output = std::copy(input + after_slice, input + input_string_ncodeunits, output);
+ return output - output_start;
+ }
+};
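+
+// Illustrative behavior: for "hello" with start=1, stop=3, replacement="XY",
+// before_slice=1 and after_slice=3, so the output is "h" + "XY" + "lo" ==
+// "hXYlo"; negative start/stop count from the end of the string.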
+
+struct Utf8ReplaceSliceTransform : ReplaceSliceTransformBase {
+ using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const auto& opts = *options;
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t *begin_sliced, *end_sliced;
+ uint8_t* output_start = output;
+
+ // Mimic Pandas: if stop would be before start, treat as 0-length slice
+ if (opts.start >= 0) {
+ // Count from left
+ if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opts.start)) {
+ return kTransformError;
+ }
+      if (opts.stop > opts.start) {
+        // Continue counting from left
+        const int64_t length = opts.stop - opts.start;
+ if (!arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length)) {
+ return kTransformError;
+ }
+ } else if (opts.stop < 0) {
+ // Count from right
+ if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
+ -opts.stop)) {
+ return kTransformError;
+ }
+ } else {
+ // Zero-length slice
+ end_sliced = begin_sliced;
+ }
+ } else {
+ // Count from right
+ if (!arrow::util::UTF8AdvanceCodepointsReverse(begin, end, &begin_sliced,
+ -opts.start)) {
+ return kTransformError;
+ }
+ if (opts.stop >= 0) {
+ // Restart counting from left
+ if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opts.stop)) {
+ return kTransformError;
+ }
+ if (end_sliced <= begin_sliced) {
+ // Zero-length slice
+ end_sliced = begin_sliced;
+ }
+    } else if ((opts.stop < 0) && (opts.stop > opts.start)) {
+ // Count from right
+ if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
+ -opts.stop)) {
+ return kTransformError;
+ }
+ } else {
+ // zero-length slice
+ end_sliced = begin_sliced;
+ }
+ }
+ output = std::copy(begin, begin_sliced, output);
+    output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
+ output = std::copy(end_sliced, end, output);
+ return output - output_start;
+ }
+};
+
+template <typename Type>
+using BinaryReplaceSlice =
+ StringTransformExecWithState<Type, BinaryReplaceSliceTransform>;
+template <typename Type>
+using Utf8ReplaceSlice = StringTransformExecWithState<Type, Utf8ReplaceSliceTransform>;
+
+const FunctionDoc binary_replace_slice_doc(
+ "Replace a slice of a binary string with `replacement`",
+ ("For each string in `strings`, replace a slice of the string defined by `start`"
+ "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive, "
+ "and both are measured in bytes.\n"
+ "Null values emit null."),
+ {"strings"}, "ReplaceSliceOptions");
+
+const FunctionDoc utf8_replace_slice_doc(
+ "Replace a slice of a string with `replacement`",
+ ("For each string in `strings`, replace a slice of the string defined by `start`"
+ "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive, "
+ "and both are measured in codeunits.\n"
+ "Null values emit null."),
+ {"strings"}, "ReplaceSliceOptions");
+
+void AddReplaceSlice(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("binary_replace_slice", Arity::Unary(),
+ &binary_replace_slice_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ DCHECK_OK(func->AddKernel({ty}, ty,
+ GenerateTypeAgnosticVarBinaryBase<BinaryReplaceSlice>(ty),
+ ReplaceSliceTransformBase::State::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+
+ {
+ auto func = std::make_shared<ScalarFunction>("utf8_replace_slice", Arity::Unary(),
+ &utf8_replace_slice_doc);
+ DCHECK_OK(func->AddKernel({utf8()}, utf8(), Utf8ReplaceSlice<StringType>::Exec,
+ ReplaceSliceTransformBase::State::Init));
+ DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(),
+ Utf8ReplaceSlice<LargeStringType>::Exec,
+ ReplaceSliceTransformBase::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
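+
+// Usage sketch (illustrative, assuming arrow::compute):
+//   ReplaceSliceOptions options(/*start=*/0, /*stop=*/2, /*replacement=*/"--");
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//       CallFunction("utf8_replace_slice", {strings}, &options));
+//   // "héllo" -> "--llo": start/stop are counted in codepoints, not bytes.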
+
+// ----------------------------------------------------------------------
+// Extract with regex
+
+#ifdef ARROW_WITH_RE2
+
+// TODO cache this once per ExtractRegexOptions
+struct ExtractRegexData {
+ // Use unique_ptr<> because RE2 is non-movable
+ std::unique_ptr<RE2> regex;
+ std::vector<std::string> group_names;
+
+ static Result<ExtractRegexData> Make(const ExtractRegexOptions& options) {
+ ExtractRegexData data(options.pattern);
+ RETURN_NOT_OK(RegexStatus(*data.regex));
+
+ const int group_count = data.regex->NumberOfCapturingGroups();
+ const auto& name_map = data.regex->CapturingGroupNames();
+ data.group_names.reserve(group_count);
+
+ for (int i = 0; i < group_count; i++) {
+ auto item = name_map.find(i + 1); // re2 starts counting from 1
+ if (item == name_map.end()) {
+ // XXX should we instead just create fields with an empty name?
+ return Status::Invalid("Regular expression contains unnamed groups");
+ }
+ data.group_names.emplace_back(item->second);
+ }
+ return std::move(data);
+ }
+
+ Result<ValueDescr> ResolveOutputType(const std::vector<ValueDescr>& args) const {
+ const auto& input_type = args[0].type;
+ if (input_type == nullptr) {
+ // No input type specified => propagate shape
+ return args[0];
+ }
+ // Input type is either String or LargeString and is also the type of each
+ // field in the output struct type.
+ DCHECK(input_type->id() == Type::STRING || input_type->id() == Type::LARGE_STRING);
+ FieldVector fields;
+ fields.reserve(group_names.size());
+ std::transform(group_names.begin(), group_names.end(), std::back_inserter(fields),
+ [&](const std::string& name) { return field(name, input_type); });
+ return struct_(std::move(fields));
+ }
+
+ private:
+ explicit ExtractRegexData(const std::string& pattern)
+ : regex(new RE2(pattern, RE2::Quiet)) {}
+};
+
+Result<ValueDescr> ResolveExtractRegexOutput(KernelContext* ctx,
+ const std::vector<ValueDescr>& args) {
+ using State = OptionsWrapper<ExtractRegexOptions>;
+ ExtractRegexOptions options = State::Get(ctx);
+ ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
+ return data.ResolveOutputType(args);
+}
+
+struct ExtractRegexBase {
+ const ExtractRegexData& data;
+ const int group_count;
+ std::vector<re2::StringPiece> found_values;
+ std::vector<re2::RE2::Arg> args;
+ std::vector<const re2::RE2::Arg*> args_pointers;
+ const re2::RE2::Arg** args_pointers_start;
+ const re2::RE2::Arg* null_arg = nullptr;
+
+ explicit ExtractRegexBase(const ExtractRegexData& data)
+ : data(data),
+ group_count(static_cast<int>(data.group_names.size())),
+ found_values(group_count) {
+ args.reserve(group_count);
+ args_pointers.reserve(group_count);
+
+ for (int i = 0; i < group_count; i++) {
+ args.emplace_back(&found_values[i]);
+ // Since we reserved capacity, we're guaranteed the pointer remains valid
+ args_pointers.push_back(&args[i]);
+ }
+ // Avoid null pointer if there is no capture group
+ args_pointers_start = (group_count > 0) ? args_pointers.data() : &null_arg;
+ }
+
+ bool Match(util::string_view s) {
+ return re2::RE2::PartialMatchN(ToStringPiece(s), *data.regex, args_pointers_start,
+ group_count);
+ }
+};
+
+template <typename Type>
+struct ExtractRegex : public ExtractRegexBase {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using State = OptionsWrapper<ExtractRegexOptions>;
+
+ using ExtractRegexBase::ExtractRegexBase;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ExtractRegexOptions options = State::Get(ctx);
+ ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
+ return ExtractRegex{data}.Extract(ctx, batch, out);
+ }
+
+ Status Extract(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(auto descr, data.ResolveOutputType(batch.GetDescriptors()));
+ DCHECK_NE(descr.type, nullptr);
+ const auto& type = descr.type;
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ std::unique_ptr<ArrayBuilder> array_builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), type, &array_builder));
+ StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
+
+ std::vector<BuilderType*> field_builders;
+ field_builders.reserve(group_count);
+ for (int i = 0; i < group_count; i++) {
+ field_builders.push_back(
+ checked_cast<BuilderType*>(struct_builder->field_builder(i)));
+ }
+
+ auto visit_null = [&]() { return struct_builder->AppendNull(); };
+ auto visit_value = [&](util::string_view s) {
+ if (Match(s)) {
+ for (int i = 0; i < group_count; i++) {
+ RETURN_NOT_OK(field_builders[i]->Append(ToStringView(found_values[i])));
+ }
+ return struct_builder->Append();
+ } else {
+ return struct_builder->AppendNull();
+ }
+ };
+ const ArrayData& input = *batch[0].array();
+ RETURN_NOT_OK(VisitArrayDataInline<Type>(input, visit_value, visit_null));
+
+ std::shared_ptr<Array> out_array;
+ RETURN_NOT_OK(struct_builder->Finish(&out_array));
+ *out = std::move(out_array);
+ } else {
+ const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
+ auto result = std::make_shared<StructScalar>(type);
+ if (input.is_valid && Match(util::string_view(*input.value))) {
+ result->value.reserve(group_count);
+ for (int i = 0; i < group_count; i++) {
+ result->value.push_back(
+ std::make_shared<ScalarType>(found_values[i].as_string()));
+ }
+ result->is_valid = true;
+ } else {
+ result->is_valid = false;
+ }
+ out->value = std::move(result);
+ }
+
+ return Status::OK();
+ }
+};
+
+const FunctionDoc extract_regex_doc(
+ "Extract substrings captured by a regex pattern",
+ ("For each string in `strings`, match the regular expression and, if\n"
+ "successful, emit a struct with field names and values coming from the\n"
+ "regular expression's named capture groups. If the input is null or the\n"
+ "regular expression fails matching, a null output value is emitted.\n"
+ "\n"
+ "Regular expression matching is done using the Google RE2 library."),
+ {"strings"}, "ExtractRegexOptions");
+
+void AddExtractRegex(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("extract_regex", Arity::Unary(),
+ &extract_regex_doc);
+ using t32 = ExtractRegex<StringType>;
+ using t64 = ExtractRegex<LargeStringType>;
+ OutputType out_ty(ResolveExtractRegexOutput);
+ ScalarKernel kernel;
+
+ // Null values will be computed based on regex match or not
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ kernel.signature.reset(new KernelSignature({utf8()}, out_ty));
+ kernel.exec = t32::Exec;
+ kernel.init = t32::State::Init;
+ DCHECK_OK(func->AddKernel(kernel));
+ kernel.signature.reset(new KernelSignature({large_utf8()}, out_ty));
+ kernel.exec = t64::Exec;
+ kernel.init = t64::State::Init;
+ DCHECK_OK(func->AddKernel(kernel));
+
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
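+
+// Usage sketch (illustrative, assuming arrow::compute):
+//   ExtractRegexOptions options("(?P<letter>[ab])(?P<digit>\\d)");
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//       CallFunction("extract_regex", {strings}, &options));
+//   // "a1" -> {letter: "a", digit: "1"}; a non-matching input yields null.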
+#endif // ARROW_WITH_RE2
+
+// ----------------------------------------------------------------------
+// strptime string parsing
+
+using StrptimeState = OptionsWrapper<StrptimeOptions>;
+
+struct ParseStrptime {
+ explicit ParseStrptime(const StrptimeOptions& options)
+ : parser(TimestampParser::MakeStrptime(options.format)), unit(options.unit) {}
+
+ template <typename... Ignored>
+ int64_t Call(KernelContext*, util::string_view val, Status* st) const {
+ int64_t result = 0;
+ if (!(*parser)(val.data(), val.size(), unit, &result)) {
+ *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
+ TimestampType(unit).ToString());
+ }
+ return result;
+ }
+
+ std::shared_ptr<TimestampParser> parser;
+ TimeUnit::type unit;
+};
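+
+// Illustrative behavior: with StrptimeOptions("%Y-%m-%d", TimeUnit::SECOND),
+// "2021-01-01" parses to the corresponding timestamp[s] value, while a
+// non-matching non-null string makes Call() set an Invalid status.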
+
+template <typename InputType>
+Status StrptimeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ applicator::ScalarUnaryNotNullStateful<TimestampType, InputType, ParseStrptime> kernel{
+ ParseStrptime(StrptimeState::Get(ctx))};
+ return kernel.Exec(ctx, batch, out);
+}
+
+Result<ValueDescr> StrptimeResolve(KernelContext* ctx, const std::vector<ValueDescr>&) {
+ if (ctx->state()) {
+ return ::arrow::timestamp(StrptimeState::Get(ctx).unit);
+ }
+
+ return Status::Invalid("strptime does not provide default StrptimeOptions");
+}
+
+// ----------------------------------------------------------------------
+// string padding
+
+template <bool PadLeft, bool PadRight>
+struct AsciiPadTransform : public StringTransformBase {
+ using State = OptionsWrapper<PadOptions>;
+
+ const PadOptions& options_;
+
+ explicit AsciiPadTransform(const PadOptions& options) : options_(options) {}
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ if (options_.padding.size() != 1) {
+ return Status::Invalid("Padding must be one byte, got '", options_.padding, "'");
+ }
+ return Status::OK();
+ }
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ // This is likely very overallocated but hard to do better without
+ // actually looking at each string (because of strings that may be
+ // longer than the given width)
+ return input_ncodeunits + ninputs * options_.width;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ if (input_string_ncodeunits >= options_.width) {
+ std::copy(input, input + input_string_ncodeunits, output);
+ return input_string_ncodeunits;
+ }
+ const int64_t spaces = options_.width - input_string_ncodeunits;
+ int64_t left = 0;
+ int64_t right = 0;
+ if (PadLeft && PadRight) {
+ // If odd number of spaces, put the extra space on the right
+ left = spaces / 2;
+ right = spaces - left;
+ } else if (PadLeft) {
+ left = spaces;
+ } else if (PadRight) {
+ right = spaces;
+ } else {
+ DCHECK(false) << "unreachable";
+ return 0;
+ }
+ std::fill(output, output + left, options_.padding[0]);
+ output += left;
+ output = std::copy(input, input + input_string_ncodeunits, output);
+ std::fill(output, output + right, options_.padding[0]);
+ return options_.width;
+ }
+};
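+
+// Illustrative behavior: with width=5 and padding="*", "ab" becomes "***ab"
+// when only PadLeft is set, "ab***" when only PadRight is set, and "*ab**"
+// when both are set (the extra padding character goes on the right).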
+
+template <bool PadLeft, bool PadRight>
+struct Utf8PadTransform : public StringTransformBase {
+ using State = OptionsWrapper<PadOptions>;
+
+ const PadOptions& options_;
+
+ explicit Utf8PadTransform(const PadOptions& options) : options_(options) {}
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
+ auto strlen = options_.padding.size();
+ if (util::UTF8Length(str, str + strlen) != 1) {
+ return Status::Invalid("Padding must be one codepoint, got '", options_.padding,
+ "'");
+ }
+ return Status::OK();
+ }
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ // This is likely very overallocated but hard to do better without
+ // actually looking at each string (because of strings that may be
+ // longer than the given width)
+ // One codepoint may be up to 4 bytes
+ return input_ncodeunits + 4 * ninputs * options_.width;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const int64_t input_width = util::UTF8Length(input, input + input_string_ncodeunits);
+ if (input_width >= options_.width) {
+ std::copy(input, input + input_string_ncodeunits, output);
+ return input_string_ncodeunits;
+ }
+ const int64_t spaces = options_.width - input_width;
+ int64_t left = 0;
+ int64_t right = 0;
+ if (PadLeft && PadRight) {
+ // If odd number of spaces, put the extra space on the right
+ left = spaces / 2;
+ right = spaces - left;
+ } else if (PadLeft) {
+ left = spaces;
+ } else if (PadRight) {
+ right = spaces;
+ } else {
+ DCHECK(false) << "unreachable";
+ return 0;
+ }
+ uint8_t* start = output;
+ while (left) {
+ output = std::copy(options_.padding.begin(), options_.padding.end(), output);
+ left--;
+ }
+ output = std::copy(input, input + input_string_ncodeunits, output);
+ while (right) {
+ output = std::copy(options_.padding.begin(), options_.padding.end(), output);
+ right--;
+ }
+ return output - start;
+ }
+};
+
+template <typename Type>
+using AsciiLPad = StringTransformExecWithState<Type, AsciiPadTransform<true, false>>;
+template <typename Type>
+using AsciiRPad = StringTransformExecWithState<Type, AsciiPadTransform<false, true>>;
+template <typename Type>
+using AsciiCenter = StringTransformExecWithState<Type, AsciiPadTransform<true, true>>;
+template <typename Type>
+using Utf8LPad = StringTransformExecWithState<Type, Utf8PadTransform<true, false>>;
+template <typename Type>
+using Utf8RPad = StringTransformExecWithState<Type, Utf8PadTransform<false, true>>;
+template <typename Type>
+using Utf8Center = StringTransformExecWithState<Type, Utf8PadTransform<true, true>>;
+
+// ----------------------------------------------------------------------
+// string trimming
+
+#ifdef ARROW_WITH_UTF8PROC
+
+template <bool TrimLeft, bool TrimRight>
+struct UTF8TrimWhitespaceTransform : public StringTransformBase {
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ EnsureLookupTablesFilled();
+ return Status::OK();
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
+ if (TrimLeft && !ARROW_PREDICT_TRUE(
+ arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
+ return kTransformError;
+ }
+ if (TrimRight && begin_trimmed < end) {
+ if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
+ predicate, &end_trimmed))) {
+ return kTransformError;
+ }
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using UTF8TrimWhitespace =
+ StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, true>>;
+
+template <typename Type>
+using UTF8LTrimWhitespace =
+ StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, false>>;
+
+template <typename Type>
+using UTF8RTrimWhitespace =
+ StringTransformExec<Type, UTF8TrimWhitespaceTransform<false, true>>;
+
+struct UTF8TrimState {
+ TrimOptions options_;
+ std::vector<bool> codepoints_;
+ Status status_ = Status::OK();
+
+ explicit UTF8TrimState(KernelContext* ctx, TrimOptions options)
+ : options_(std::move(options)) {
+ if (!ARROW_PREDICT_TRUE(
+ arrow::util::UTF8ForEach(options_.characters, [&](uint32_t c) {
+ codepoints_.resize(
+ std::max(c + 1, static_cast<uint32_t>(codepoints_.size())));
+ codepoints_.at(c) = true;
+ }))) {
+ status_ = Status::Invalid("Invalid UTF8 sequence in input");
+ }
+ }
+};
+
+template <bool TrimLeft, bool TrimRight>
+struct UTF8TrimTransform : public StringTransformBase {
+ using State = KernelStateFromFunctionOptions<UTF8TrimState, TrimOptions>;
+
+ const UTF8TrimState& state_;
+
+ explicit UTF8TrimTransform(const UTF8TrimState& state) : state_(state) {}
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ return state_.status_;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [&](uint32_t c) { return !state_.codepoints_[c]; };
+ if (TrimLeft && !ARROW_PREDICT_TRUE(
+ arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
+ return kTransformError;
+ }
+ if (TrimRight && begin_trimmed < end) {
+ if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
+ predicate, &end_trimmed))) {
+ return kTransformError;
+ }
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using UTF8Trim = StringTransformExecWithState<Type, UTF8TrimTransform<true, true>>;
+
+template <typename Type>
+using UTF8LTrim = StringTransformExecWithState<Type, UTF8TrimTransform<true, false>>;
+
+template <typename Type>
+using UTF8RTrim = StringTransformExecWithState<Type, UTF8TrimTransform<false, true>>;
+
+#endif
+
+template <bool TrimLeft, bool TrimRight>
+struct AsciiTrimWhitespaceTransform : public StringTransformBase {
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
+ if (TrimLeft) {
+ begin_trimmed = std::find_if(begin, end, predicate);
+ }
+ if (TrimRight && begin_trimmed < end) {
+ std::reverse_iterator<const uint8_t*> rbegin(end);
+ std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+ end_trimmed = std::find_if(rbegin, rend, predicate).base();
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using AsciiTrimWhitespace =
+ StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, true>>;
+
+template <typename Type>
+using AsciiLTrimWhitespace =
+ StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, false>>;
+
+template <typename Type>
+using AsciiRTrimWhitespace =
+ StringTransformExec<Type, AsciiTrimWhitespaceTransform<false, true>>;
+
+struct AsciiTrimState {
+ TrimOptions options_;
+ std::vector<bool> characters_;
+
+ explicit AsciiTrimState(KernelContext* ctx, TrimOptions options)
+ : options_(std::move(options)), characters_(256) {
+ for (const auto c : options_.characters) {
+ characters_[static_cast<unsigned char>(c)] = true;
+ }
+ }
+};
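+
+// The 256-entry table turns the trim predicate into a single indexed load:
+// e.g. TrimOptions("ab") marks characters_['a'] and characters_['b'], so the
+// transforms below strip any run of 'a'/'b' from the selected end(s).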
+
+template <bool TrimLeft, bool TrimRight>
+struct AsciiTrimTransform : public StringTransformBase {
+ using State = KernelStateFromFunctionOptions<AsciiTrimState, TrimOptions>;
+
+ const AsciiTrimState& state_;
+
+ explicit AsciiTrimTransform(const AsciiTrimState& state) : state_(state) {}
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [&](uint8_t c) { return !state_.characters_[c]; };
+ if (TrimLeft) {
+ begin_trimmed = std::find_if(begin, end, predicate);
+ }
+ if (TrimRight && begin_trimmed < end) {
+ std::reverse_iterator<const uint8_t*> rbegin(end);
+ std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+ end_trimmed = std::find_if(rbegin, rend, predicate).base();
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using AsciiTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, true>>;
+
+template <typename Type>
+using AsciiLTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, false>>;
+
+template <typename Type>
+using AsciiRTrim = StringTransformExecWithState<Type, AsciiTrimTransform<false, true>>;
+
+const FunctionDoc utf8_center_doc(
+    "Center strings by padding with a given character",
+    ("For each string in `strings`, emit a centered string by padding both sides\n"
+     "with the given UTF8 codeunit.\nNull values emit null."),
+    {"strings"}, "PadOptions");
+
+const FunctionDoc utf8_lpad_doc(
+    "Right-align strings by padding with a given character",
+    ("For each string in `strings`, emit a right-aligned string by prepending\n"
+     "the given UTF8 codeunit.\nNull values emit null."),
+    {"strings"}, "PadOptions");
+
+const FunctionDoc utf8_rpad_doc(
+    "Left-align strings by padding with a given character",
+    ("For each string in `strings`, emit a left-aligned string by appending\n"
+     "the given UTF8 codeunit.\nNull values emit null."),
+    {"strings"}, "PadOptions");
+
+const FunctionDoc ascii_center_doc(
+    utf8_center_doc.summary + "",
+    ("For each string in `strings`, emit a centered string by padding both sides\n"
+     "with the given ASCII character.\nNull values emit null."),
+    {"strings"}, "PadOptions");
+
+const FunctionDoc ascii_lpad_doc(
+    utf8_lpad_doc.summary + "",
+    ("For each string in `strings`, emit a right-aligned string by prepending\n"
+     "the given ASCII character.\nNull values emit null."),
+    {"strings"}, "PadOptions");
+
+const FunctionDoc ascii_rpad_doc(
+    utf8_rpad_doc.summary + "",
+    ("For each string in `strings`, emit a left-aligned string by appending\n"
+     "the given ASCII character.\nNull values emit null."),
+    {"strings"}, "PadOptions");
+
+const FunctionDoc utf8_trim_whitespace_doc(
+ "Trim leading and trailing whitespace characters",
+ ("For each string in `strings`, emit a string with leading and trailing whitespace\n"
+ "characters removed, where whitespace characters are defined by the Unicode\n"
+ "standard. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_ltrim_whitespace_doc(
+ "Trim leading whitespace characters",
+ ("For each string in `strings`, emit a string with leading whitespace\n"
+ "characters removed, where whitespace characters are defined by the Unicode\n"
+ "standard. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_rtrim_whitespace_doc(
+ "Trim trailing whitespace characters",
+ ("For each string in `strings`, emit a string with trailing whitespace\n"
+ "characters removed, where whitespace characters are defined by the Unicode\n"
+ "standard. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc ascii_trim_whitespace_doc(
+ "Trim leading and trailing ASCII whitespace characters",
+ ("For each string in `strings`, emit a string with leading and trailing ASCII\n"
+ "whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode\n"
+ "whitespace characters. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc ascii_ltrim_whitespace_doc(
+ "Trim leading ASCII whitespace characters",
+ ("For each string in `strings`, emit a string with leading ASCII whitespace\n"
+ "characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode\n"
+ "whitespace characters. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc ascii_rtrim_whitespace_doc(
+ "Trim trailing ASCII whitespace characters",
+ ("For each string in `strings`, emit a string with trailing ASCII whitespace\n"
+ "characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode\n"
+ "whitespace characters. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_trim_doc(
+ "Trim leading and trailing characters present in the `characters` arguments",
+ ("For each string in `strings`, emit a string with leading and trailing\n"
+ "characters removed that are present in the `characters` argument. Null values\n"
+ "emit null."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc utf8_ltrim_doc(
+ "Trim leading characters present in the `characters` arguments",
+ ("For each string in `strings`, emit a string with leading\n"
+ "characters removed that are present in the `characters` argument. Null values\n"
+ "emit null."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc utf8_rtrim_doc(
+ "Trim trailing characters present in the `characters` arguments",
+ ("For each string in `strings`, emit a string with leading "
+ "characters removed that are present in the `characters` argument. Null values\n"
+ "emit null."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc ascii_trim_doc(
+ utf8_trim_doc.summary + "",
+ utf8_trim_doc.description +
+ ("\nBoth the input string as the `characters` argument are interepreted as\n"
+ "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc ascii_ltrim_doc(
+ utf8_ltrim_doc.summary + "",
+ utf8_ltrim_doc.description +
+ ("\nBoth the input string as the `characters` argument are interepreted as\n"
+ "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc ascii_rtrim_doc(
+ utf8_rtrim_doc.summary + "",
+ utf8_rtrim_doc.description +
+ ("\nBoth the input string as the `characters` argument are interepreted as\n"
+ "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc strptime_doc(
+ "Parse timestamps",
+ ("For each string in `strings`, parse it as a timestamp.\n"
+ "The timestamp unit and the expected string pattern must be given\n"
+ "in StrptimeOptions. Null inputs emit null. If a non-null string\n"
+ "fails parsing, an error is returned."),
+ {"strings"}, "StrptimeOptions");
+
+const FunctionDoc binary_length_doc(
+ "Compute string lengths",
+ ("For each string in `strings`, emit the number of bytes. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_length_doc("Compute UTF8 string lengths",
+ ("For each string in `strings`, emit the number of "
+ "UTF8 characters. Null values emit null."),
+ {"strings"});
+
+void AddStrptime(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);
+ DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve),
+ StrptimeExec<StringType>, StrptimeState::Init));
+ DCHECK_OK(func->AddKernel({large_utf8()}, OutputType(StrptimeResolve),
+ StrptimeExec<LargeStringType>, StrptimeState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+void AddBinaryLength(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("binary_length", Arity::Unary(),
+ &binary_length_doc);
+ ArrayKernelExec exec_offset_32 =
+ applicator::ScalarUnaryNotNull<Int32Type, StringType, BinaryLength>::Exec;
+ ArrayKernelExec exec_offset_64 =
+ applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, BinaryLength>::Exec;
+ for (const auto& input_type : {binary(), utf8()}) {
+ DCHECK_OK(func->AddKernel({input_type}, int32(), exec_offset_32));
+ }
+ for (const auto& input_type : {large_binary(), large_utf8()}) {
+ DCHECK_OK(func->AddKernel({input_type}, int64(), exec_offset_64));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+void AddUtf8Length(FunctionRegistry* registry) {
+ auto func =
+ std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), &utf8_length_doc);
+
+ ArrayKernelExec exec_offset_32 =
+ applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32)));
+
+ ArrayKernelExec exec_offset_64 =
+ applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, Utf8Length>::Exec;
+ DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64)));
+
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+template <typename BinaryType, typename ListType>
+struct BinaryJoin {
+ using ArrayType = typename TypeTraits<BinaryType>::ArrayType;
+ using ListArrayType = typename TypeTraits<ListType>::ArrayType;
+ using ListScalarType = typename TypeTraits<ListType>::ScalarType;
+ using ListOffsetType = typename ListArrayType::offset_type;
+ using BuilderType = typename TypeTraits<BinaryType>::BuilderType;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::SCALAR) {
+ if (batch[1].kind() == Datum::SCALAR) {
+ return ExecScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out);
+ }
+ DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
+ return ExecScalarArray(ctx, *batch[0].scalar(), batch[1].array(), out);
+ }
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+ if (batch[1].kind() == Datum::SCALAR) {
+ return ExecArrayScalar(ctx, batch[0].array(), *batch[1].scalar(), out);
+ }
+ DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
+ return ExecArrayArray(ctx, batch[0].array(), batch[1].array(), out);
+ }
+
+ struct ListScalarOffsetLookup {
+ const ArrayType& values;
+
+ int64_t GetStart(int64_t i) { return 0; }
+ int64_t GetStop(int64_t i) { return values.length(); }
+ bool IsNull(int64_t i) { return false; }
+ };
+
+ struct ListArrayOffsetLookup {
+ explicit ListArrayOffsetLookup(const ListArrayType& lists)
+ : lists_(lists), offsets_(lists.raw_value_offsets()) {}
+
+ int64_t GetStart(int64_t i) { return offsets_[i]; }
+ int64_t GetStop(int64_t i) { return offsets_[i + 1]; }
+ bool IsNull(int64_t i) { return lists_.IsNull(i); }
+
+ private:
+ const ListArrayType& lists_;
+ const ListOffsetType* offsets_;
+ };
+
+ struct SeparatorScalarLookup {
+ const util::string_view separator;
+
+ bool IsNull(int64_t i) { return false; }
+ util::string_view GetView(int64_t i) { return separator; }
+ };
+
+ struct SeparatorArrayLookup {
+ const ArrayType& separators;
+
+ bool IsNull(int64_t i) { return separators.IsNull(i); }
+ util::string_view GetView(int64_t i) { return separators.GetView(i); }
+ };
+
+ // Scalar, scalar -> scalar
+ static Status ExecScalarScalar(KernelContext* ctx, const Scalar& left,
+ const Scalar& right, Datum* out) {
+ const auto& list = checked_cast<const ListScalarType&>(left);
+ const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
+ if (!list.is_valid || !separator_scalar.is_valid) {
+ return Status::OK();
+ }
+ util::string_view separator(*separator_scalar.value);
+
+ const auto& strings = checked_cast<const ArrayType&>(*list.value);
+ if (strings.null_count() > 0) {
+ out->scalar()->is_valid = false;
+ return Status::OK();
+ }
+
+ TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+ auto Append = [&](util::string_view value) {
+ return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+ static_cast<int64_t>(value.size()));
+ };
+ if (strings.length() > 0) {
+ auto data_length =
+ strings.total_values_length() + (strings.length() - 1) * separator.length();
+ RETURN_NOT_OK(builder.Reserve(data_length));
+ RETURN_NOT_OK(Append(strings.GetView(0)));
+ for (int64_t j = 1; j < strings.length(); j++) {
+ RETURN_NOT_OK(Append(separator));
+ RETURN_NOT_OK(Append(strings.GetView(j)));
+ }
+ }
+ auto out_scalar = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ return builder.Finish(&out_scalar->value);
+ }
+
+ // Scalar, array -> array
+ static Status ExecScalarArray(KernelContext* ctx, const Scalar& left,
+ const std::shared_ptr<ArrayData>& right, Datum* out) {
+ const auto& list_scalar = checked_cast<const BaseListScalar&>(left);
+ if (!list_scalar.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
+ *out = *nulls->data();
+ return Status::OK();
+ }
+ const auto& strings = checked_cast<const ArrayType&>(*list_scalar.value);
+ if (strings.null_count() != 0) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
+ *out = *nulls->data();
+ return Status::OK();
+ }
+ const ArrayType separators(right);
+
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(separators.length()));
+
+ // Presize data to avoid multiple reallocations when joining strings
+ int64_t total_data_length = 0;
+ const int64_t list_length = strings.length();
+ if (list_length) {
+ const int64_t string_length = strings.total_values_length();
+ total_data_length +=
+ string_length * (separators.length() - separators.null_count());
+ for (int64_t i = 0; i < separators.length(); ++i) {
+ if (separators.IsNull(i)) {
+ continue;
+ }
+ total_data_length += (list_length - 1) * separators.value_length(i);
+ }
+ }
+ RETURN_NOT_OK(builder.ReserveData(total_data_length));
+
+ return JoinStrings(separators.length(), strings, ListScalarOffsetLookup{strings},
+ SeparatorArrayLookup{separators}, &builder, out);
+ }
+
+ // Array, scalar -> array
+ static Status ExecArrayScalar(KernelContext* ctx,
+ const std::shared_ptr<ArrayData>& left,
+ const Scalar& right, Datum* out) {
+ const ListArrayType lists(left);
+ const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
+
+ if (!separator_scalar.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto nulls,
+ MakeArrayOfNull(lists.value_type(), lists.length(), ctx->memory_pool()));
+ *out = *nulls->data();
+ return Status::OK();
+ }
+
+ util::string_view separator(*separator_scalar.value);
+ const auto& strings = checked_cast<const ArrayType&>(*lists.values());
+ const auto list_offsets = lists.raw_value_offsets();
+
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(lists.length()));
+
+ // Presize data to avoid multiple reallocations when joining strings
+ int64_t total_data_length = strings.total_values_length();
+ for (int64_t i = 0; i < lists.length(); ++i) {
+ const auto start = list_offsets[i], end = list_offsets[i + 1];
+ if (end > start && !ValuesContainNull(strings, start, end)) {
+ total_data_length += (end - start - 1) * separator.length();
+ }
+ }
+ RETURN_NOT_OK(builder.ReserveData(total_data_length));
+
+ return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
+ SeparatorScalarLookup{separator}, &builder, out);
+ }
+
+ // Array, array -> array
+ static Status ExecArrayArray(KernelContext* ctx, const std::shared_ptr<ArrayData>& left,
+ const std::shared_ptr<ArrayData>& right, Datum* out) {
+ const ListArrayType lists(left);
+ const auto& strings = checked_cast<const ArrayType&>(*lists.values());
+ const auto list_offsets = lists.raw_value_offsets();
+ const auto string_offsets = strings.raw_value_offsets();
+ const ArrayType separators(right);
+
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(lists.length()));
+
+ // Presize data to avoid multiple reallocations when joining strings
+ int64_t total_data_length = 0;
+ for (int64_t i = 0; i < lists.length(); ++i) {
+ if (separators.IsNull(i)) {
+ continue;
+ }
+ const auto start = list_offsets[i], end = list_offsets[i + 1];
+ if (end > start && !ValuesContainNull(strings, start, end)) {
+ total_data_length += string_offsets[end] - string_offsets[start];
+ total_data_length += (end - start - 1) * separators.value_length(i);
+ }
+ }
+ RETURN_NOT_OK(builder.ReserveData(total_data_length));
+
+    return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
+                       SeparatorArrayLookup{separators}, &builder, out);
+ }
+
+ template <typename ListOffsetLookup, typename SeparatorLookup>
+ static Status JoinStrings(int64_t length, const ArrayType& strings,
+ ListOffsetLookup&& list_offsets, SeparatorLookup&& separators,
+ BuilderType* builder, Datum* out) {
+ for (int64_t i = 0; i < length; ++i) {
+ if (list_offsets.IsNull(i) || separators.IsNull(i)) {
+ builder->UnsafeAppendNull();
+ continue;
+ }
+ const auto j_start = list_offsets.GetStart(i), j_end = list_offsets.GetStop(i);
+ if (j_start == j_end) {
+ builder->UnsafeAppendEmptyValue();
+ continue;
+ }
+ if (ValuesContainNull(strings, j_start, j_end)) {
+ builder->UnsafeAppendNull();
+ continue;
+ }
+ builder->UnsafeAppend(strings.GetView(j_start));
+ for (int64_t j = j_start + 1; j < j_end; ++j) {
+ builder->UnsafeExtendCurrent(separators.GetView(i));
+ builder->UnsafeExtendCurrent(strings.GetView(j));
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder->Finish(&string_array));
+ *out = *string_array->data();
+ // Correct the output type based on the input
+ out->mutable_array()->type = strings.type();
+ return Status::OK();
+ }
+
+ static bool ValuesContainNull(const ArrayType& values, int64_t start, int64_t end) {
+ if (values.null_count() == 0) {
+ return false;
+ }
+ for (int64_t i = start; i < end; ++i) {
+ if (values.IsNull(i)) {
+ return true;
+ }
+ }
+ return false;
+ }
+};
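+
+// Usage sketch (illustrative, assuming arrow::compute):
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//       CallFunction("binary_join", {list_of_strings, MakeScalar("-")}));
+//   // ["a", "b", "c"] -> "a-b-c"; a list containing a null joins to null.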
+
+using BinaryJoinElementWiseState = OptionsWrapper<JoinOptions>;
+
+template <typename Type>
+struct BinaryJoinElementWise {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ JoinOptions options = BinaryJoinElementWiseState::Get(ctx);
+ // Last argument is the separator (for consistency with binary_join)
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx, const JoinOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ BaseBinaryScalar* output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ const size_t num_args = batch.values.size();
+ if (num_args == 1) {
+ // Only separator, no values
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
+ output->is_valid = batch.values[0].scalar()->is_valid;
+ return Status::OK();
+ }
+
+ int64_t final_size = CalculateRowSize(options, batch, 0);
+ if (final_size < 0) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
+ output->is_valid = false;
+ return Status::OK();
+ }
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size));
+ const auto separator = UnboxScalar<Type>::Unbox(*batch.values.back().scalar());
+ uint8_t* buf = output->value->mutable_data();
+ bool first = true;
+ for (size_t i = 0; i < num_args - 1; i++) {
+ const Scalar& scalar = *batch[i].scalar();
+ util::string_view s;
+ if (scalar.is_valid) {
+ s = UnboxScalar<Type>::Unbox(scalar);
+ } else {
+ switch (options.null_handling) {
+ case JoinOptions::EMIT_NULL:
+ // Handled by CalculateRowSize
+ DCHECK(false) << "unreachable";
+ break;
+ case JoinOptions::SKIP:
+ continue;
+ case JoinOptions::REPLACE:
+ s = options.null_replacement;
+ break;
+ }
+ }
+ if (!first) {
+ buf = std::copy(separator.begin(), separator.end(), buf);
+ }
+ first = false;
+ buf = std::copy(s.begin(), s.end(), buf);
+ }
+ output->is_valid = true;
+ DCHECK_EQ(final_size, buf - output->value->mutable_data());
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx, const JoinOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ // Presize data to avoid reallocations
+ int64_t final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSize(options, batch, i);
+ if (size > 0) final_size += size;
+ }
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(final_size));
+
+ std::vector<util::string_view> valid_cols(batch.values.size());
+ for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
+ size_t num_valid = 0; // Not counting separator
+ for (size_t col = 0; col < batch.values.size(); col++) {
+ if (batch[col].is_scalar()) {
+ const auto& scalar = *batch[col].scalar();
+ if (scalar.is_valid) {
+ valid_cols[col] = UnboxScalar<Type>::Unbox(scalar);
+ if (col < batch.values.size() - 1) num_valid++;
+ } else {
+ valid_cols[col] = util::string_view();
+ }
+ } else {
+ const ArrayData& array = *batch[col].array();
+ if (!array.MayHaveNulls() ||
+ BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) {
+ const offset_type* offsets = array.GetValues<offset_type>(1);
+ const uint8_t* data = array.GetValues<uint8_t>(2, /*absolute_offset=*/0);
+ const int64_t length = offsets[row + 1] - offsets[row];
+ valid_cols[col] = util::string_view(
+ reinterpret_cast<const char*>(data + offsets[row]), length);
+ if (col < batch.values.size() - 1) num_valid++;
+ } else {
+ valid_cols[col] = util::string_view();
+ }
+ }
+ }
+
+ if (!valid_cols.back().data()) {
+ // Separator is null
+ builder.UnsafeAppendNull();
+ continue;
+ } else if (batch.values.size() == 1) {
+ // Only given separator
+ builder.UnsafeAppendEmptyValue();
+ continue;
+ } else if (num_valid < batch.values.size() - 1) {
+ // We had some nulls
+ if (options.null_handling == JoinOptions::EMIT_NULL) {
+ builder.UnsafeAppendNull();
+ continue;
+ }
+ }
+ const auto separator = valid_cols.back();
+ bool first = true;
+ for (size_t col = 0; col < batch.values.size() - 1; col++) {
+ util::string_view value = valid_cols[col];
+ if (!value.data()) {
+ switch (options.null_handling) {
+ case JoinOptions::EMIT_NULL:
+ DCHECK(false) << "unreachable";
+ break;
+ case JoinOptions::SKIP:
+ continue;
+ case JoinOptions::REPLACE:
+ value = options.null_replacement;
+ break;
+ }
+ }
+ if (first) {
+ builder.UnsafeAppend(value);
+ first = false;
+ continue;
+ }
+ builder.UnsafeExtendCurrent(separator);
+ builder.UnsafeExtendCurrent(value);
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *out = *string_array->data();
+ out->mutable_array()->type = batch[0].type();
+ DCHECK_EQ(batch.length, out->array()->length);
+ DCHECK_EQ(final_size,
+ checked_cast<const ArrayType&>(*string_array).total_values_length());
+ return Status::OK();
+ }
+
+ // Compute the length of the output for the given position, or -1 if it would be null.
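+  // For example, the row ["a", null, "bc"] with separator "-" and SKIP null
+  // handling yields 1 + 2 element bytes plus one separator byte = 4 ("a-bc");
+  // with EMIT_NULL the same row yields -1.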
+ static int64_t CalculateRowSize(const JoinOptions& options, const ExecBatch& batch,
+ const int64_t index) {
+ const auto num_args = batch.values.size();
+ int64_t final_size = 0;
+ int64_t num_non_null_args = 0;
+ for (size_t i = 0; i < num_args; i++) {
+ int64_t element_size = 0;
+ bool valid = true;
+ if (batch[i].is_scalar()) {
+ const Scalar& scalar = *batch[i].scalar();
+ valid = scalar.is_valid;
+ element_size = UnboxScalar<Type>::Unbox(scalar).size();
+ } else {
+ const ArrayData& array = *batch[i].array();
+ valid = !array.MayHaveNulls() ||
+ BitUtil::GetBit(array.buffers[0]->data(), array.offset + index);
+ const offset_type* offsets = array.GetValues<offset_type>(1);
+ element_size = offsets[index + 1] - offsets[index];
+ }
+ if (i == num_args - 1) {
+ if (!valid) return -1;
+ if (num_non_null_args > 1) {
+ // Add separator size (only if there were values to join)
+ final_size += (num_non_null_args - 1) * element_size;
+ }
+ break;
+ }
+ if (!valid) {
+ switch (options.null_handling) {
+ case JoinOptions::EMIT_NULL:
+ return -1;
+ case JoinOptions::SKIP:
+ continue;
+ case JoinOptions::REPLACE:
+ element_size = options.null_replacement.size();
+ break;
+ }
+ }
+ num_non_null_args++;
+ final_size += element_size;
+ }
+ return final_size;
+ }
+};
+
+const FunctionDoc binary_join_doc(
+ "Join a list of strings together with a `separator` to form a single string",
+ ("Insert `separator` between `list` elements, and concatenate them.\n"
+ "Any null input and any null `list` element emits a null output.\n"),
+ {"list", "separator"});
+
+const FunctionDoc binary_join_element_wise_doc(
+ "Join string arguments into one, using the last argument as the separator",
+ ("Insert the last argument of `strings` between the rest of the elements, "
+ "and concatenate them.\n"
+ "Any null separator element emits a null output. Null elements either "
+ "emit a null (the default), are skipped, or replaced with a given string.\n"),
+ {"*strings"}, "JoinOptions");
+
+const auto kDefaultJoinOptions = JoinOptions::Defaults();
+
+template <typename ListType>
+void AddBinaryJoinForListType(ScalarFunction* func) {
+ for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+ auto exec = GenerateTypeAgnosticVarBinaryBase<BinaryJoin, ListType>(*ty);
+ auto list_ty = std::make_shared<ListType>(ty);
+ DCHECK_OK(func->AddKernel({InputType(list_ty), InputType(ty)}, ty, exec));
+ }
+}
+
+void AddBinaryJoin(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("binary_join", Arity::Binary(),
+ &binary_join_doc);
+ AddBinaryJoinForListType<ListType>(func.get());
+ AddBinaryJoinForListType<LargeListType>(func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<ScalarFunction>(
+ "binary_join_element_wise", Arity::VarArgs(/*min_args=*/1),
+ &binary_join_element_wise_doc, &kDefaultJoinOptions);
+ for (const auto& ty : BaseBinaryTypes()) {
+ ScalarKernel kernel{KernelSignature::Make({InputType(ty)}, ty, /*is_varargs=*/true),
+ GenerateTypeAgnosticVarBinaryBase<BinaryJoinElementWise>(ty),
+ BinaryJoinElementWiseState::Init};
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
+
+template <template <typename> class ExecFunctor>
+void MakeUnaryStringBatchKernel(
+ std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
+ MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+ {
+ auto exec_32 = ExecFunctor<StringType>::Exec;
+ ScalarKernel kernel{{utf8()}, utf8(), exec_32};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ {
+ auto exec_64 = ExecFunctor<LargeStringType>::Exec;
+ ScalarKernel kernel{{large_utf8()}, large_utf8(), exec_64};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+template <template <typename> class ExecFunctor>
+void MakeUnaryStringBatchKernelWithState(
+ std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
+ MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+ {
+ using t32 = ExecFunctor<StringType>;
+ ScalarKernel kernel{{utf8()}, utf8(), t32::Exec, t32::State::Init};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ {
+ using t64 = ExecFunctor<LargeStringType>;
+ ScalarKernel kernel{{large_utf8()}, large_utf8(), t64::Exec, t64::State::Init};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+#ifdef ARROW_WITH_UTF8PROC
+
+template <template <typename> class Transformer>
+void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+ ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
+ ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_32));
+ DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), exec_64));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+#endif
+
+// NOTE: predicates should only populate 'status' with errors;
+// they must leave it unmodified to indicate success (Status::OK())
+using StringPredicate =
+ std::function<bool(KernelContext*, const uint8_t*, size_t, Status*)>;
+
+template <typename Type>
+Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
+ StringPredicate predicate, Datum* out) {
+ Status st = Status::OK();
+ EnsureLookupTablesFilled();
+ if (batch[0].kind() == Datum::ARRAY) {
+ const ArrayData& input = *batch[0].array();
+ ArrayIterator<Type> input_it(input);
+ ArrayData* out_arr = out->mutable_array();
+ ::arrow::internal::GenerateBitsUnrolled(
+ out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
+ [&]() -> bool {
+ util::string_view val = input_it();
+ return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size(),
+ &st);
+ });
+ } else {
+ const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
+ if (input.is_valid) {
+ bool boolean_result = predicate(ctx, input.value->data(),
+ static_cast<size_t>(input.value->size()), &st);
+      // Only set the result if the predicate did not signal a decoding error
+ if (st.ok()) {
+ out->value = std::make_shared<BooleanScalar>(boolean_result);
+ }
+ }
+ }
+ return st;
+}
+
+template <typename Predicate>
+void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+ auto exec_32 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return ApplyPredicate<StringType>(ctx, batch, Predicate::Call, out);
+ };
+ auto exec_64 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return ApplyPredicate<LargeStringType>(ctx, batch, Predicate::Call, out);
+ };
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), std::move(exec_32)));
+ DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), std::move(exec_64)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+FunctionDoc StringPredicateDoc(std::string summary, std::string description) {
+ return FunctionDoc{std::move(summary), std::move(description), {"strings"}};
+}
+
+FunctionDoc StringClassifyDoc(std::string class_summary, std::string class_desc,
+ bool non_empty) {
+ std::string summary, description;
+ {
+ std::stringstream ss;
+ ss << "Classify strings as " << class_summary;
+ summary = ss.str();
+ }
+ {
+ std::stringstream ss;
+ if (non_empty) {
+ ss
+ << ("For each string in `strings`, emit true iff the string is non-empty\n"
+ "and consists only of ");
+ } else {
+ ss
+ << ("For each string in `strings`, emit true iff the string consists only\n"
+ "of ");
+ }
+ ss << class_desc << ". Null strings emit null.";
+ description = ss.str();
+ }
+ return StringPredicateDoc(std::move(summary), std::move(description));
+}
+
+const auto string_is_ascii_doc = StringClassifyDoc("ASCII", "ASCII characters", false);
+
+const auto ascii_is_alnum_doc =
+ StringClassifyDoc("ASCII alphanumeric", "alphanumeric ASCII characters", true);
+const auto ascii_is_alpha_doc =
+ StringClassifyDoc("ASCII alphabetic", "alphabetic ASCII characters", true);
+const auto ascii_is_decimal_doc =
+ StringClassifyDoc("ASCII decimal", "decimal ASCII characters", true);
+const auto ascii_is_lower_doc =
+ StringClassifyDoc("ASCII lowercase", "lowercase ASCII characters", true);
+const auto ascii_is_printable_doc =
+ StringClassifyDoc("ASCII printable", "printable ASCII characters", true);
+const auto ascii_is_space_doc =
+ StringClassifyDoc("ASCII whitespace", "whitespace ASCII characters", true);
+const auto ascii_is_upper_doc =
+ StringClassifyDoc("ASCII uppercase", "uppercase ASCII characters", true);
+
+const auto ascii_is_title_doc = StringPredicateDoc(
+ "Classify strings as ASCII titlecase",
+ ("For each string in `strings`, emit true iff the string is title-cased,\n"
+ "i.e. it has at least one cased character, each uppercase character\n"
+ "follows a non-cased character, and each lowercase character follows\n"
+ "an uppercase character.\n"));
+
+const auto utf8_is_alnum_doc =
+ StringClassifyDoc("alphanumeric", "alphanumeric Unicode characters", true);
+const auto utf8_is_alpha_doc =
+ StringClassifyDoc("alphabetic", "alphabetic Unicode characters", true);
+const auto utf8_is_decimal_doc =
+ StringClassifyDoc("decimal", "decimal Unicode characters", true);
+const auto utf8_is_digit_doc = StringClassifyDoc("digits", "Unicode digits", true);
+const auto utf8_is_lower_doc =
+ StringClassifyDoc("lowercase", "lowercase Unicode characters", true);
+const auto utf8_is_numeric_doc =
+ StringClassifyDoc("numeric", "numeric Unicode characters", true);
+const auto utf8_is_printable_doc =
+ StringClassifyDoc("printable", "printable Unicode characters", true);
+const auto utf8_is_space_doc =
+ StringClassifyDoc("whitespace", "whitespace Unicode characters", true);
+const auto utf8_is_upper_doc =
+ StringClassifyDoc("uppercase", "uppercase Unicode characters", true);
+
+const auto utf8_is_title_doc = StringPredicateDoc(
+ "Classify strings as titlecase",
+ ("For each string in `strings`, emit true iff the string is title-cased,\n"
+ "i.e. it has at least one cased character, each uppercase character\n"
+ "follows a non-cased character, and each lowercase character follows\n"
+ "an uppercase character.\n"));
+
+const FunctionDoc ascii_upper_doc(
+ "Transform ASCII input to uppercase",
+ ("For each string in `strings`, return an uppercase version.\n\n"
+ "This function assumes the input is fully ASCII. It it may contain\n"
+ "non-ASCII characters, use \"utf8_upper\" instead."),
+ {"strings"});
+
+const FunctionDoc ascii_lower_doc(
+ "Transform ASCII input to lowercase",
+ ("For each string in `strings`, return a lowercase version.\n\n"
+ "This function assumes the input is fully ASCII. If it may contain\n"
+ "non-ASCII characters, use \"utf8_lower\" instead."),
+ {"strings"});
+
+const FunctionDoc utf8_upper_doc(
+ "Transform input to uppercase",
+ ("For each string in `strings`, return an uppercase version."), {"strings"});
+
+const FunctionDoc utf8_lower_doc(
+ "Transform input to lowercase",
+ ("For each string in `strings`, return a lowercase version."), {"strings"});
+
+const FunctionDoc ascii_reverse_doc(
+ "Reverse ASCII input",
+ ("For each ASCII string in `strings`, return a reversed version.\n\n"
+ "This function assumes the input is fully ASCII. If it may contain\n"
+ "non-ASCII characters, use \"utf8_reverse\" instead."),
+ {"strings"});
+
+const FunctionDoc utf8_reverse_doc(
+ "Reverse utf8 input",
+ ("For each utf8 string in `strings`, return a reversed version.\n\n"
+ "This function operates on codepoints/UTF-8 code units, not grapheme\n"
+ "clusters. Hence, it will not correctly reverse grapheme clusters\n"
+ "composed of multiple codepoints."),
+ {"strings"});
+
+} // namespace
+
+void RegisterScalarStringAscii(FunctionRegistry* registry) {
+ // ascii_upper and ascii_lower are able to reuse the original offsets buffer,
+ // so don't preallocate them in the output.
+ MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry, &ascii_upper_doc,
+ MemAllocation::NO_PREALLOCATE);
+ MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry, &ascii_lower_doc,
+ MemAllocation::NO_PREALLOCATE);
+ MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
+ &ascii_trim_whitespace_doc);
+ MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
+ &ascii_ltrim_whitespace_doc);
+ MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
+ &ascii_rtrim_whitespace_doc);
+ MakeUnaryStringBatchKernel<AsciiReverse>("ascii_reverse", registry, &ascii_reverse_doc);
+ MakeUnaryStringBatchKernel<Utf8Reverse>("utf8_reverse", registry, &utf8_reverse_doc);
+
+ MakeUnaryStringBatchKernelWithState<AsciiCenter>("ascii_center", registry,
+ &ascii_center_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiLPad>("ascii_lpad", registry, &ascii_lpad_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiRPad>("ascii_rpad", registry, &ascii_rpad_doc);
+ MakeUnaryStringBatchKernelWithState<Utf8Center>("utf8_center", registry,
+ &utf8_center_doc);
+ MakeUnaryStringBatchKernelWithState<Utf8LPad>("utf8_lpad", registry, &utf8_lpad_doc);
+ MakeUnaryStringBatchKernelWithState<Utf8RPad>("utf8_rpad", registry, &utf8_rpad_doc);
+
+ MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry, &ascii_trim_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
+ &ascii_ltrim_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiRTrim>("ascii_rtrim", registry,
+ &ascii_rtrim_doc);
+
+ AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry, &string_is_ascii_doc);
+
+ AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_is_alnum", registry,
+ &ascii_is_alnum_doc);
+ AddUnaryStringPredicate<IsAlphaAscii>("ascii_is_alpha", registry, &ascii_is_alpha_doc);
+ AddUnaryStringPredicate<IsDecimalAscii>("ascii_is_decimal", registry,
+ &ascii_is_decimal_doc);
+ // no is_digit for ascii, since it is the same as is_decimal
+ AddUnaryStringPredicate<IsLowerAscii>("ascii_is_lower", registry, &ascii_is_lower_doc);
+ // no is_numeric for ascii, since it is the same as is_decimal
+ AddUnaryStringPredicate<IsPrintableAscii>("ascii_is_printable", registry,
+ &ascii_is_printable_doc);
+ AddUnaryStringPredicate<IsSpaceAscii>("ascii_is_space", registry, &ascii_is_space_doc);
+ AddUnaryStringPredicate<IsTitleAscii>("ascii_is_title", registry, &ascii_is_title_doc);
+ AddUnaryStringPredicate<IsUpperAscii>("ascii_is_upper", registry, &ascii_is_upper_doc);
+
+#ifdef ARROW_WITH_UTF8PROC
+ MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry, &utf8_upper_doc);
+ MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
+ MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
+ &utf8_trim_whitespace_doc);
+ MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,
+ &utf8_ltrim_whitespace_doc);
+ MakeUnaryStringBatchKernel<UTF8RTrimWhitespace>("utf8_rtrim_whitespace", registry,
+ &utf8_rtrim_whitespace_doc);
+ MakeUnaryStringBatchKernelWithState<UTF8Trim>("utf8_trim", registry, &utf8_trim_doc);
+ MakeUnaryStringBatchKernelWithState<UTF8LTrim>("utf8_ltrim", registry, &utf8_ltrim_doc);
+ MakeUnaryStringBatchKernelWithState<UTF8RTrim>("utf8_rtrim", registry, &utf8_rtrim_doc);
+
+ AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_is_alnum", registry,
+ &utf8_is_alnum_doc);
+ AddUnaryStringPredicate<IsAlphaUnicode>("utf8_is_alpha", registry, &utf8_is_alpha_doc);
+ AddUnaryStringPredicate<IsDecimalUnicode>("utf8_is_decimal", registry,
+ &utf8_is_decimal_doc);
+ AddUnaryStringPredicate<IsDigitUnicode>("utf8_is_digit", registry, &utf8_is_digit_doc);
+ AddUnaryStringPredicate<IsLowerUnicode>("utf8_is_lower", registry, &utf8_is_lower_doc);
+ AddUnaryStringPredicate<IsNumericUnicode>("utf8_is_numeric", registry,
+ &utf8_is_numeric_doc);
+ AddUnaryStringPredicate<IsPrintableUnicode>("utf8_is_printable", registry,
+ &utf8_is_printable_doc);
+ AddUnaryStringPredicate<IsSpaceUnicode>("utf8_is_space", registry, &utf8_is_space_doc);
+ AddUnaryStringPredicate<IsTitleUnicode>("utf8_is_title", registry, &utf8_is_title_doc);
+ AddUnaryStringPredicate<IsUpperUnicode>("utf8_is_upper", registry, &utf8_is_upper_doc);
+#endif
+
+ AddBinaryLength(registry);
+ AddUtf8Length(registry);
+ AddMatchSubstring(registry);
+ AddFindSubstring(registry);
+ AddCountSubstring(registry);
+ MakeUnaryStringBatchKernelWithState<ReplaceSubStringPlain>(
+ "replace_substring", registry, &replace_substring_doc,
+ MemAllocation::NO_PREALLOCATE);
+#ifdef ARROW_WITH_RE2
+ MakeUnaryStringBatchKernelWithState<ReplaceSubStringRegex>(
+ "replace_substring_regex", registry, &replace_substring_regex_doc,
+ MemAllocation::NO_PREALLOCATE);
+ AddExtractRegex(registry);
+#endif
+ AddReplaceSlice(registry);
+ AddSlice(registry);
+ AddSplit(registry);
+ AddStrptime(registry);
+ AddBinaryJoin(registry);
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc
new file mode 100644
index 00000000000..f0257772d4a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc
@@ -0,0 +1,663 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/builder.h"
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/time.h"
+#include "arrow/vendored/datetime.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+using arrow_vendored::date::days;
+using arrow_vendored::date::floor;
+using arrow_vendored::date::hh_mm_ss;
+using arrow_vendored::date::sys_time;
+using arrow_vendored::date::trunc;
+using arrow_vendored::date::weekday;
+using arrow_vendored::date::weeks;
+using arrow_vendored::date::year_month_day;
+using arrow_vendored::date::years;
+using arrow_vendored::date::literals::dec;
+using arrow_vendored::date::literals::jan;
+using arrow_vendored::date::literals::last;
+using arrow_vendored::date::literals::mon;
+using arrow_vendored::date::literals::thu;
+using internal::applicator::ScalarUnaryNotNull;
+using internal::applicator::SimpleUnary;
+
+using DayOfWeekState = OptionsWrapper<DayOfWeekOptions>;
+
+const std::string& GetInputTimezone(const Datum& datum) {
+ return checked_cast<const TimestampType&>(*datum.type()).timezone();
+}
+
+const std::string& GetInputTimezone(const Scalar& scalar) {
+ return checked_cast<const TimestampType&>(*scalar.type).timezone();
+}
+
+const std::string& GetInputTimezone(const ArrayData& array) {
+ return checked_cast<const TimestampType&>(*array.type).timezone();
+}
+
+template <typename T>
+Status TemporalComponentExtractCheckTimezone(const T& input) {
+ const auto& timezone = GetInputTimezone(input);
+ if (!timezone.empty()) {
+ return Status::NotImplemented(
+ "Cannot extract components from timestamp with specific timezone: ", timezone);
+ }
+ return Status::OK();
+}
+
+template <typename Op, typename OutType>
+struct TemporalComponentExtract {
+ using OutValue = typename internal::GetOutputType<OutType>::T;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0]));
+ return ScalarUnaryNotNull<OutType, TimestampType, Op>::Exec(ctx, batch, out);
+ }
+};
+
+template <typename Op, typename OutType>
+struct DayOfWeekExec {
+ using OutValue = typename internal::GetOutputType<OutType>::T;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const DayOfWeekOptions& options = DayOfWeekState::Get(ctx);
+ if (options.week_start < 1 || 7 < options.week_start) {
+ return Status::Invalid(
+ "week_start must follow ISO convention (Monday=1, Sunday=7). Got week_start=",
+ options.week_start);
+ }
+
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0]));
+ applicator::ScalarUnaryNotNullStateful<OutType, TimestampType, Op> kernel{
+ Op(options)};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract year from timestamp
+
+template <typename Duration>
+struct Year {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ return static_cast<T>(static_cast<const int32_t>(
+ year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).year()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract month from timestamp
+
+template <typename Duration>
+struct Month {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ return static_cast<T>(static_cast<const uint32_t>(
+ year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).month()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract day from timestamp
+
+template <typename Duration>
+struct Day {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ return static_cast<T>(static_cast<const uint32_t>(
+ year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).day()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract day of week from timestamp
+//
+// By default the week starts on Monday (represented by 0) and ends on Sunday
+// (represented by 6). The start day of the week (Monday=1, Sunday=7) and the
+// numbering base (0 or 1) can be set using DayOfWeekOptions.
+
+template <typename Duration>
+struct DayOfWeek {
+ explicit DayOfWeek(const DayOfWeekOptions& options) {
+ for (int i = 0; i < 7; i++) {
+ lookup_table[i] = i + 8 - options.week_start;
+ lookup_table[i] = (lookup_table[i] > 6) ? lookup_table[i] - 7 : lookup_table[i];
+ lookup_table[i] += options.one_based_numbering;
+ }
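+    // For example, with week_start=7 (Sunday) and zero-based numbering, the
+    // resulting table maps ISO weekdays Mon..Sun to {1, 2, 3, 4, 5, 6, 0}:
+    // Sunday yields 0 and Saturday yields 6.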
+ }
+
+ template <typename T, typename Arg0>
+ T Call(KernelContext*, Arg0 arg, Status*) const {
+ const auto wd = arrow_vendored::date::year_month_weekday(
+ floor<days>(sys_time<Duration>(Duration{arg})))
+ .weekday()
+ .iso_encoding();
+ return lookup_table[wd - 1];
+ }
+ std::array<int64_t, 7> lookup_table;
+};
+
+// ----------------------------------------------------------------------
+// Extract day of year from timestamp
+
+template <typename Duration>
+struct DayOfYear {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ return static_cast<T>(
+ (t - sys_time<days>(year_month_day(t).year() / jan / 0)).count());
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract ISO Year values from timestamp
+//
+// First week of an ISO year has the majority (4 or more) of its days in January.
+// Last week of an ISO year has the year's last Thursday in it.
+
+template <typename Duration>
+struct ISOYear {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ auto y = year_month_day{t + days{3}}.year();
+ auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ if (t < start) {
+ --y;
+ }
+ return static_cast<T>(static_cast<int32_t>(y));
+ }
+};
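+
+// For example, 2016-01-01 (a Friday) belongs to ISO year 2015, and
+// 2014-12-29 (a Monday) also belongs to ISO year 2015, as the first day of
+// its week 1.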
+
+// ----------------------------------------------------------------------
+// Extract ISO week from timestamp
+//
+// First week of an ISO year has the majority (4 or more) of its days in January.
+// Last week of an ISO year has the year's last Thursday in it.
+// Based on
+// https://github.com/HowardHinnant/date/blob/6e921e1b1d21e84a5c82416ba7ecd98e33a436d0/include/date/iso_week.h#L1503
+template <typename Duration>
+struct ISOWeek {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ auto y = year_month_day{t + days{3}}.year();
+ auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ if (t < start) {
+ --y;
+ start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ }
+ return static_cast<T>(trunc<weeks>(t - start).count() + 1);
+ }
+};
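+
+// In both structs above, 'start' is the Monday after the last Thursday of
+// December of y-1, i.e. the first day of ISO week 1 of year y; the week
+// number is the count of whole weeks from 'start' to 't', plus one.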
+
+// ----------------------------------------------------------------------
+// Extract quarter from timestamp
+
+template <typename Duration>
+struct Quarter {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto ymd = year_month_day(floor<days>(sys_time<Duration>(Duration{arg})));
+ return static_cast<T>((static_cast<const uint32_t>(ymd.month()) - 1) / 3 + 1);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract hour from timestamp
+
+template <typename Duration>
+struct Hour {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>((t - floor<days>(t)) / std::chrono::hours(1));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract minute from timestamp
+
+template <typename Duration>
+struct Minute {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>((t - floor<std::chrono::hours>(t)) / std::chrono::minutes(1));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract second from timestamp
+
+template <typename Duration>
+struct Second {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>((t - floor<std::chrono::minutes>(t)) / std::chrono::seconds(1));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract subsecond from timestamp
+
+template <typename Duration>
+struct Subsecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ (std::chrono::duration<double>(t - floor<std::chrono::seconds>(t)).count()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract milliseconds from timestamp
+
+template <typename Duration>
+struct Millisecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ ((t - floor<std::chrono::seconds>(t)) / std::chrono::milliseconds(1)) % 1000);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract microseconds from timestamp
+
+template <typename Duration>
+struct Microsecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ ((t - floor<std::chrono::seconds>(t)) / std::chrono::microseconds(1)) % 1000);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract nanoseconds from timestamp
+
+template <typename Duration>
+struct Nanosecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ ((t - floor<std::chrono::seconds>(t)) / std::chrono::nanoseconds(1)) % 1000);
+ }
+};
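+
+// Worked example for the subsecond extractors: for a timestamp 1.234567890 s
+// past midnight (nanosecond unit), Subsecond yields 0.23456789, Millisecond
+// yields 234, Microsecond yields 567 and Nanosecond yields 890.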
+
+template <typename Duration>
+inline std::vector<int64_t> get_iso_calendar(int64_t arg) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ const auto ymd = year_month_day(t);
+ auto y = year_month_day{t + days{3}}.year();
+ auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ if (t < start) {
+ --y;
+ start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ }
+ return {static_cast<int64_t>(static_cast<int32_t>(y)),
+ static_cast<int64_t>(trunc<weeks>(t - start).count() + 1),
+ static_cast<int64_t>(weekday(ymd).iso_encoding())};
+}
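+
+// For example, get_iso_calendar for 2014-12-29 returns {2015, 1, 1}: the
+// first Monday of ISO year 2015.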
+
+// ----------------------------------------------------------------------
+// Extract ISO calendar values from timestamp
+
+template <typename Duration>
+struct ISOCalendar {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in));
+ if (in.is_valid) {
+ const std::shared_ptr<DataType> iso_calendar_type =
+ struct_({field("iso_year", int64()), field("iso_week", int64()),
+ field("iso_day_of_week", int64())});
+ const auto& in_val = internal::UnboxScalar<const TimestampType>::Unbox(in);
+ const auto iso_calendar = get_iso_calendar<Duration>(in_val);
+
+ std::vector<std::shared_ptr<Scalar>> values = {
+ std::make_shared<Int64Scalar>(iso_calendar[0]),
+ std::make_shared<Int64Scalar>(iso_calendar[1]),
+ std::make_shared<Int64Scalar>(iso_calendar[2])};
+ *checked_cast<StructScalar*>(out) = StructScalar(values, iso_calendar_type);
+ } else {
+ out->is_valid = false;
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
+ using BuilderType = typename TypeTraits<Int64Type>::BuilderType;
+
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in));
+ const std::shared_ptr<DataType> iso_calendar_type =
+ struct_({field("iso_year", int64()), field("iso_week", int64()),
+ field("iso_day_of_week", int64())});
+
+ std::unique_ptr<ArrayBuilder> array_builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), iso_calendar_type, &array_builder));
+ StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
+ RETURN_NOT_OK(struct_builder->Reserve(in.length));
+
+ std::vector<BuilderType*> field_builders;
+ field_builders.reserve(3);
+ for (int i = 0; i < 3; i++) {
+ field_builders.push_back(
+ checked_cast<BuilderType*>(struct_builder->field_builder(i)));
+ RETURN_NOT_OK(field_builders[i]->Reserve(1));
+ }
+ auto visit_null = [&]() { return struct_builder->AppendNull(); };
+ auto visit_value = [&](int64_t arg) {
+ const auto iso_calendar = get_iso_calendar<Duration>(arg);
+ field_builders[0]->UnsafeAppend(iso_calendar[0]);
+ field_builders[1]->UnsafeAppend(iso_calendar[1]);
+ field_builders[2]->UnsafeAppend(iso_calendar[2]);
+ return struct_builder->Append();
+ };
+ RETURN_NOT_OK(VisitArrayDataInline<Int64Type>(in, visit_value, visit_null));
+
+ std::shared_ptr<Array> out_array;
+ RETURN_NOT_OK(struct_builder->Finish(&out_array));
+ *out = *std::move(out_array->data());
+
+ return Status::OK();
+ }
+};
+
+template <template <typename...> class Op, typename OutType>
+std::shared_ptr<ScalarFunction> MakeTemporal(std::string name, const FunctionDoc* doc) {
+ const auto& out_type = TypeTraits<OutType>::type_singleton();
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ for (auto unit : internal::AllTimeUnits()) {
+ InputType in_type{match::TimestampTypeUnit(unit)};
+ switch (unit) {
+ case TimeUnit::SECOND: {
+ auto exec = TemporalComponentExtract<Op<std::chrono::seconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MILLI: {
+ auto exec =
+ TemporalComponentExtract<Op<std::chrono::milliseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MICRO: {
+ auto exec =
+ TemporalComponentExtract<Op<std::chrono::microseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::NANO: {
+ auto exec = TemporalComponentExtract<Op<std::chrono::nanoseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ }
+ }
+ return func;
+}
+
+template <template <typename...> class Op, typename OutType>
+std::shared_ptr<ScalarFunction> MakeTemporalWithOptions(
+ std::string name, const FunctionDoc* doc, const DayOfWeekOptions& default_options,
+ KernelInit init) {
+ const auto& out_type = TypeTraits<OutType>::type_singleton();
+ auto func =
+ std::make_shared<ScalarFunction>(name, Arity::Unary(), doc, &default_options);
+
+ for (auto unit : internal::AllTimeUnits()) {
+ InputType in_type{match::TimestampTypeUnit(unit)};
+ switch (unit) {
+ case TimeUnit::SECOND: {
+ auto exec = DayOfWeekExec<Op<std::chrono::seconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ case TimeUnit::MILLI: {
+ auto exec = DayOfWeekExec<Op<std::chrono::milliseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ case TimeUnit::MICRO: {
+ auto exec = DayOfWeekExec<Op<std::chrono::microseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ case TimeUnit::NANO: {
+ auto exec = DayOfWeekExec<Op<std::chrono::nanoseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ }
+ }
+ return func;
+}
+
+template <template <typename...> class Op>
+std::shared_ptr<ScalarFunction> MakeStructTemporal(std::string name,
+ const FunctionDoc* doc) {
+ const auto& out_type = struct_({field("iso_year", int64()), field("iso_week", int64()),
+ field("iso_day_of_week", int64())});
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ for (auto unit : internal::AllTimeUnits()) {
+ InputType in_type{match::TimestampTypeUnit(unit)};
+ switch (unit) {
+ case TimeUnit::SECOND: {
+ auto exec = SimpleUnary<Op<std::chrono::seconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MILLI: {
+ auto exec = SimpleUnary<Op<std::chrono::milliseconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MICRO: {
+ auto exec = SimpleUnary<Op<std::chrono::microseconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::NANO: {
+ auto exec = SimpleUnary<Op<std::chrono::nanoseconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ }
+ }
+ return func;
+}
+
+const FunctionDoc year_doc{
+ "Extract year from timestamp",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc month_doc{
+ "Extract month number",
+ ("Month is encoded as January=1, December=12.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc day_doc{
+ "Extract day number",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc day_of_week_doc{
+ "Extract day of the week number",
+ ("By default, the week starts on Monday represented by 0 and ends on Sunday "
+ "represented by 6.\n"
+ "DayOfWeekOptions.week_start can be used to set another starting day using ISO "
+ "convention (Monday=1, Sunday=7). Day numbering can start with 0 or 1 using "
+ "DayOfWeekOptions.one_based_numbering parameter.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"},
+ "DayOfWeekOptions"};
+
+const FunctionDoc day_of_year_doc{
+ "Extract number of day of year",
+ ("January 1st maps to day number 1, February 1st to 32, etc.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc iso_year_doc{
+ "Extract ISO year number",
+ ("First week of an ISO year has the majority (4 or more) of its days in January."
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc iso_week_doc{
+ "Extract ISO week of year number",
+ ("First ISO week has the majority (4 or more) of its days in January.\n"
+ "Week of the year starts with 1 and can run up to 53.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc iso_calendar_doc{
+ "Extract (ISO year, ISO week, ISO day of week) struct",
+ ("ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc quarter_doc{
+ "Extract quarter of year number",
+ ("First quarter maps to 1 and forth quarter maps to 4.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc hour_doc{
+ "Extract hour value",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc minute_doc{
+ "Extract minute values",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc second_doc{
+ "Extract second values",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc millisecond_doc{
+ "Extract millisecond values",
+ ("Millisecond returns number of milliseconds since the last full second.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc microsecond_doc{
+ "Extract microsecond values",
+ ("Millisecond returns number of microseconds since the last full millisecond.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc nanosecond_doc{
+ "Extract nanosecond values",
+ ("Nanosecond returns number of nanoseconds since the last full microsecond.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc subsecond_doc{
+ "Extract subsecond values",
+ ("Subsecond returns the fraction of a second since the last full second.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+} // namespace
+
+void RegisterScalarTemporal(FunctionRegistry* registry) {
+ auto year = MakeTemporal<Year, Int64Type>("year", &year_doc);
+ DCHECK_OK(registry->AddFunction(std::move(year)));
+
+  auto month = MakeTemporal<Month, Int64Type>("month", &month_doc);
+ DCHECK_OK(registry->AddFunction(std::move(month)));
+
+  auto day = MakeTemporal<Day, Int64Type>("day", &day_doc);
+ DCHECK_OK(registry->AddFunction(std::move(day)));
+
+ static auto default_day_of_week_options = DayOfWeekOptions::Defaults();
+ auto day_of_week = MakeTemporalWithOptions<DayOfWeek, Int64Type>(
+ "day_of_week", &day_of_week_doc, default_day_of_week_options, DayOfWeekState::Init);
+ DCHECK_OK(registry->AddFunction(std::move(day_of_week)));
+
+ auto day_of_year = MakeTemporal<DayOfYear, Int64Type>("day_of_year", &day_of_year_doc);
+ DCHECK_OK(registry->AddFunction(std::move(day_of_year)));
+
+ auto iso_year = MakeTemporal<ISOYear, Int64Type>("iso_year", &iso_year_doc);
+ DCHECK_OK(registry->AddFunction(std::move(iso_year)));
+
+ auto iso_week = MakeTemporal<ISOWeek, Int64Type>("iso_week", &iso_week_doc);
+ DCHECK_OK(registry->AddFunction(std::move(iso_week)));
+
+ auto iso_calendar = MakeStructTemporal<ISOCalendar>("iso_calendar", &iso_calendar_doc);
+ DCHECK_OK(registry->AddFunction(std::move(iso_calendar)));
+
+ auto quarter = MakeTemporal<Quarter, Int64Type>("quarter", &quarter_doc);
+ DCHECK_OK(registry->AddFunction(std::move(quarter)));
+
+ auto hour = MakeTemporal<Hour, Int64Type>("hour", &hour_doc);
+ DCHECK_OK(registry->AddFunction(std::move(hour)));
+
+ auto minute = MakeTemporal<Minute, Int64Type>("minute", &minute_doc);
+ DCHECK_OK(registry->AddFunction(std::move(minute)));
+
+ auto second = MakeTemporal<Second, Int64Type>("second", &second_doc);
+ DCHECK_OK(registry->AddFunction(std::move(second)));
+
+ auto millisecond =
+ MakeTemporal<Millisecond, Int64Type>("millisecond", &millisecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(millisecond)));
+
+ auto microsecond =
+ MakeTemporal<Microsecond, Int64Type>("microsecond", &microsecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(microsecond)));
+
+ auto nanosecond = MakeTemporal<Nanosecond, Int64Type>("nanosecond", &nanosecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(nanosecond)));
+
+ auto subsecond = MakeTemporal<Subsecond, DoubleType>("subsecond", &subsecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(subsecond)));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc
new file mode 100644
index 00000000000..ead88abc0f2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc
@@ -0,0 +1,230 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cmath>
+
+#include "arrow/compute/kernels/common.h"
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+
+namespace arrow {
+
+using internal::CopyBitmap;
+using internal::InvertBitmap;
+
+namespace compute {
+namespace internal {
+namespace {
+
+struct IsValidOperator {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ checked_cast<BooleanScalar*>(out)->value = in.is_valid;
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
+ DCHECK_EQ(out->offset, 0);
+ DCHECK_LE(out->length, arr.length);
+ if (arr.MayHaveNulls()) {
+ // Input has nulls => output is the null (validity) bitmap.
+ // To avoid copying the null bitmap, slice from the starting byte offset
+ // and set the offset to the remaining bit offset.
+ out->offset = arr.offset % 8;
+ out->buffers[1] =
+ arr.offset == 0 ? arr.buffers[0]
+ : SliceBuffer(arr.buffers[0], arr.offset / 8,
+ BitUtil::BytesForBits(out->length + out->offset));
+ return Status::OK();
+ }
+
+ // Input has no nulls => output is entirely true.
+ ARROW_ASSIGN_OR_RAISE(out->buffers[1],
+ ctx->AllocateBitmap(out->length + out->offset));
+ BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, true);
+ return Status::OK();
+ }
+};
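+
+// Example for IsValidOperator above: an input with offset 13 reuses the
+// validity bitmap sliced from byte 1 (13 / 8) with a remaining bit offset of
+// 5 (13 % 8), so the bitmap is never copied.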
+
+struct IsFiniteOperator {
+ template <typename OutType, typename InType>
+ static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
+ return std::isfinite(value);
+ }
+};
+
+struct IsInfOperator {
+ template <typename OutType, typename InType>
+ static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
+ return std::isinf(value);
+ }
+};
+
+struct IsNullOperator {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ checked_cast<BooleanScalar*>(out)->value = !in.is_valid;
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
+ if (arr.MayHaveNulls()) {
+ // Input has nulls => output is the inverted null (validity) bitmap.
+ InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length,
+ out->buffers[1]->mutable_data(), out->offset);
+ } else {
+ // Input has no nulls => output is entirely false.
+ BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length,
+ false);
+ }
+ return Status::OK();
+ }
+};
+
+struct IsNanOperator {
+ template <typename OutType, typename InType>
+ static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
+ return std::isnan(value);
+ }
+};
+
+void MakeFunction(std::string name, const FunctionDoc* doc,
+ std::vector<InputType> in_types, OutputType out_type,
+ ArrayKernelExec exec, FunctionRegistry* registry,
+ MemAllocation::type mem_allocation, bool can_write_into_slices) {
+ Arity arity{static_cast<int>(in_types.size())};
+ auto func = std::make_shared<ScalarFunction>(name, arity, doc);
+
+ ScalarKernel kernel(std::move(in_types), out_type, exec);
+ kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
+ kernel.can_write_into_slices = can_write_into_slices;
+ kernel.mem_allocation = mem_allocation;
+
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+template <typename InType, typename Op>
+void AddFloatValidityKernel(const std::shared_ptr<DataType>& ty, ScalarFunction* func) {
+ DCHECK_OK(func->AddKernel({ty}, boolean(),
+ applicator::ScalarUnary<BooleanType, InType, Op>::Exec));
+}
+
+std::shared_ptr<ScalarFunction> MakeIsFiniteFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ AddFloatValidityKernel<FloatType, IsFiniteOperator>(float32(), func.get());
+ AddFloatValidityKernel<DoubleType, IsFiniteOperator>(float64(), func.get());
+
+ return func;
+}
+
+std::shared_ptr<ScalarFunction> MakeIsInfFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ AddFloatValidityKernel<FloatType, IsInfOperator>(float32(), func.get());
+ AddFloatValidityKernel<DoubleType, IsInfOperator>(float64(), func.get());
+
+ return func;
+}
+
+std::shared_ptr<ScalarFunction> MakeIsNanFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ AddFloatValidityKernel<FloatType, IsNanOperator>(float32(), func.get());
+ AddFloatValidityKernel<DoubleType, IsNanOperator>(float64(), func.get());
+
+ return func;
+}
+
+Status IsValidExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const Datum& arg0 = batch[0];
+ if (arg0.type()->id() == Type::NA) {
+ auto false_value = std::make_shared<BooleanScalar>(false);
+ if (arg0.kind() == Datum::SCALAR) {
+ out->value = false_value;
+ } else {
+ std::shared_ptr<Array> false_values;
+ RETURN_NOT_OK(MakeArrayFromScalar(*false_value, out->length(), ctx->memory_pool())
+ .Value(&false_values));
+ out->value = false_values->data();
+ }
+ return Status::OK();
+ } else {
+ return applicator::SimpleUnary<IsValidOperator>(ctx, batch, out);
+ }
+}
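+
+// Note that for the null type is_valid is always false: e.g. a NullArray of
+// length 3 yields [false, false, false] without consulting any bitmap.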
+
+Status IsNullExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const Datum& arg0 = batch[0];
+ if (arg0.type()->id() == Type::NA) {
+ if (arg0.kind() == Datum::SCALAR) {
+ out->value = std::make_shared<BooleanScalar>(true);
+ } else {
+ // Data is preallocated
+ ArrayData* out_arr = out->mutable_array();
+ BitUtil::SetBitsTo(out_arr->buffers[1]->mutable_data(), out_arr->offset,
+ out_arr->length, true);
+ }
+ return Status::OK();
+ } else {
+ return applicator::SimpleUnary<IsNullOperator>(ctx, batch, out);
+ }
+}
+
+const FunctionDoc is_valid_doc(
+ "Return true if non-null",
+ ("For each input value, emit true iff the value is valid (non-null)."), {"values"});
+
+const FunctionDoc is_finite_doc(
+ "Return true if value is finite",
+ ("For each input value, emit true iff the value is finite (not NaN, inf, or -inf)."),
+ {"values"});
+
+const FunctionDoc is_inf_doc(
+ "Return true if infinity",
+ ("For each input value, emit true iff the value is infinite (inf or -inf)."),
+ {"values"});
+
+const FunctionDoc is_null_doc("Return true if null",
+ ("For each input value, emit true iff the value is null."),
+ {"values"});
+
+const FunctionDoc is_nan_doc("Return true if NaN",
+ ("For each input value, emit true iff the value is NaN."),
+ {"values"});
+
+} // namespace
+
+void RegisterScalarValidity(FunctionRegistry* registry) {
+ MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec,
+ registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false);
+
+ MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec,
+ registry, MemAllocation::PREALLOCATE,
+ /*can_write_into_slices=*/true);
+
+ DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc)));
+ DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc)));
+ DCHECK_OK(registry->AddFunction(MakeIsNanFunction("is_nan", &is_nan_doc)));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc
new file mode 100644
index 00000000000..846fa26baf2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/kernels/util_internal.h"
+
+#include <cstdint>
+
+#include "arrow/array/data.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace compute {
+namespace internal {
+
+const uint8_t* GetValidityBitmap(const ArrayData& data) {
+ const uint8_t* bitmap = nullptr;
+ if (data.buffers[0]) {
+ bitmap = data.buffers[0]->data();
+ }
+ return bitmap;
+}
+
+int GetBitWidth(const DataType& type) {
+ return checked_cast<const FixedWidthType&>(type).bit_width();
+}
+
+PrimitiveArg GetPrimitiveArg(const ArrayData& arr) {
+ PrimitiveArg arg;
+ arg.is_valid = GetValidityBitmap(arr);
+ arg.data = arr.buffers[1]->data();
+ arg.bit_width = GetBitWidth(*arr.type);
+ arg.offset = arr.offset;
+ arg.length = arr.length;
+ if (arg.bit_width > 1) {
+ arg.data += arr.offset * arg.bit_width / 8;
+ }
+ // This may be kUnknownNullCount
+ arg.null_count = (arg.is_valid != nullptr) ? arr.null_count.load() : 0;
+ return arg;
+}
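+
+// For example, an int32 array (bit_width 32) with offset 5 has its data
+// pointer advanced by 20 bytes (5 * 32 / 8); boolean (bit-packed) data is
+// left unshifted, and 'offset' must be applied when indexing.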
+
+ArrayKernelExec TrivialScalarUnaryAsArraysExec(ArrayKernelExec exec,
+ NullHandling::type null_handling) {
+ return [=](KernelContext* ctx, const ExecBatch& batch, Datum* out) -> Status {
+ if (out->is_array()) {
+ return exec(ctx, batch, out);
+ }
+
+ if (null_handling == NullHandling::INTERSECTION && !batch[0].scalar()->is_valid) {
+ out->scalar()->is_valid = false;
+ return Status::OK();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(Datum array_in, MakeArrayFromScalar(*batch[0].scalar(), 1));
+ ARROW_ASSIGN_OR_RAISE(Datum array_out, MakeArrayFromScalar(*out->scalar(), 1));
+ RETURN_NOT_OK(exec(ctx, ExecBatch{{std::move(array_in)}, 1}, &array_out));
+ ARROW_ASSIGN_OR_RAISE(*out, array_out.make_array()->GetScalar(0));
+ return Status::OK();
+ };
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h
new file mode 100644
index 00000000000..394e08da581
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <utility>
+
+#include "arrow/array/util.h"
+#include "arrow/buffer.h"
+#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/util/bit_run_reader.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+// Used in some kernels and in testing. These constants are not provided by
+// default in MSVC, and _USE_MATH_DEFINES is not reliable with unity builds.
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+#ifndef M_PI_2
+#define M_PI_2 1.57079632679489661923
+#endif
+#ifndef M_PI_4
+#define M_PI_4 0.785398163397448309616
+#endif
+
+// An internal data structure for unpacking a primitive argument to pass to a
+// kernel implementation
+struct PrimitiveArg {
+ const uint8_t* is_valid;
+ // If the bit_width is a multiple of 8 (i.e. not boolean), then "data" should
+ // be shifted by offset * (bit_width / 8). For bit-packed data, the offset
+ // must be used when indexing.
+ const uint8_t* data;
+ int bit_width;
+ int64_t length;
+ int64_t offset;
+ // This may be kUnknownNullCount if the null_count has not yet been computed,
+ // so use null_count != 0 to determine "may have nulls".
+ int64_t null_count;
+};
+
+// Get validity bitmap data or return nullptr if there is no validity buffer
+const uint8_t* GetValidityBitmap(const ArrayData& data);
+
+int GetBitWidth(const DataType& type);
+
+// Reduce code size by handling the unboxing of the kernel inputs once
+// rather than duplicating the compiled code in each kernel.
+PrimitiveArg GetPrimitiveArg(const ArrayData& arr);
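+
+// A minimal usage sketch for GetPrimitiveArg (illustrative only; "arr" stands
+// for any primitive ArrayData a kernel receives):
+//
+//   PrimitiveArg arg = GetPrimitiveArg(arr);
+//   if (arg.null_count != 0) {
+//     // Consult arg.is_valid when reading values.
+//   }
+//   // For arg.bit_width >= 8, arg.data is already adjusted for arr.offset;
+//   // bit-packed (boolean) data must still be indexed using arg.offset.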
+
+// Augment a unary ArrayKernelExec which supports only array-like inputs with support
+// for scalar inputs. Scalars will be transformed into length-1 arrays with the scalar's
+// value (or null if the scalar is null) as their only element. This length-1 array will
+// be passed to the original exec, and the only element of the resulting array will be
+// extracted as the output scalar. This could be far more efficient, but instead of
+// optimizing this path it would be better to support scalar inputs "upstream" in the
+// original exec.
+ArrayKernelExec TrivialScalarUnaryAsArraysExec(
+ ArrayKernelExec exec, NullHandling::type null_handling = NullHandling::INTERSECTION);
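+
+// For example (a sketch; "MyArrayOnlyExec" is a hypothetical exec that handles
+// only array inputs):
+//
+//   ArrayKernelExec exec = TrivialScalarUnaryAsArraysExec(MyArrayOnlyExec);
+//   // "exec" now also accepts scalar inputs by round-tripping them through
+//   // length-1 arrays.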
+
+// Return (min, max) of a numerical array, ignoring nulls.
+// For an empty array, return the type's maximum value as 'min' and its lowest
+// value as 'max'.
+template <typename T>
+ARROW_NOINLINE std::pair<T, T> GetMinMax(const ArrayData& data) {
+ T min = std::numeric_limits<T>::max();
+ T max = std::numeric_limits<T>::lowest();
+
+ const T* values = data.GetValues<T>(1);
+ arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ min = std::min(min, values[pos + i]);
+ max = std::max(max, values[pos + i]);
+ }
+ });
+
+ return std::make_pair(min, max);
+}
+
+template <typename T>
+std::pair<T, T> GetMinMax(const Datum& datum) {
+ T min = std::numeric_limits<T>::max();
+ T max = std::numeric_limits<T>::lowest();
+
+ for (const auto& array : datum.chunks()) {
+ T local_min, local_max;
+ std::tie(local_min, local_max) = GetMinMax<T>(*array->data());
+ min = std::min(min, local_min);
+ max = std::max(max, local_max);
+ }
+
+ return std::make_pair(min, max);
+}
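+
+// Usage sketch (assumes "datum" holds int32 data, possibly chunked and with
+// nulls):
+//
+//   std::pair<int32_t, int32_t> mm = GetMinMax<int32_t>(datum);
+//   // mm.first is the minimum non-null value, mm.second the maximum; for
+//   // all-null or empty input the limits described above are returned.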
+
+// Count the occurrences of each value in an array, ignoring nulls.
+// 'counts' must be zero-initialized and large enough to index every
+// (value - min) in the input.
+template <typename T>
+ARROW_NOINLINE int64_t CountValues(uint64_t* counts, const ArrayData& data, T min) {
+ const int64_t n = data.length - data.GetNullCount();
+ if (n > 0) {
+ const T* values = data.GetValues<T>(1);
+ arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ ++counts[values[pos + i] - min];
+ }
+ });
+ }
+ return n;
+}
+
+template <typename T>
+int64_t CountValues(uint64_t* counts, const Datum& datum, T min) {
+ int64_t n = 0;
+ for (const auto& array : datum.chunks()) {
+ n += CountValues<T>(counts, *array->data(), min);
+ }
+ return n;
+}
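+
+// Usage sketch (hypothetical int32 input; 'counts' is indexed by value - min):
+//
+//   std::pair<int32_t, int32_t> mm = GetMinMax<int32_t>(datum);
+//   std::vector<uint64_t> counts(static_cast<size_t>(mm.second - mm.first) + 1);
+//   int64_t non_null = CountValues<int32_t>(counts.data(), datum, mm.first);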
+
+// Copy the values of a numerical array to a buffer, skipping nulls.
+template <typename T>
+ARROW_NOINLINE int64_t CopyNonNullValues(const ArrayData& data, T* out) {
+ const int64_t n = data.length - data.GetNullCount();
+ if (n > 0) {
+ int64_t index = 0;
+ const T* values = data.GetValues<T>(1);
+ arrow::internal::VisitSetBitRunsVoid(
+ data.buffers[0], data.offset, data.length, [&](int64_t pos, int64_t len) {
+ memcpy(out + index, values + pos, len * sizeof(T));
+ index += len;
+ });
+ }
+ return n;
+}
+
+template <typename T>
+int64_t CopyNonNullValues(const Datum& datum, T* out) {
+ int64_t n = 0;
+ for (const auto& array : datum.chunks()) {
+ n += CopyNonNullValues(*array->data(), out + n);
+ }
+ return n;
+}
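+
+// Usage sketch: 'out' must have room for at least length - null_count values.
+//
+//   std::vector<int32_t> compacted(static_cast<size_t>(datum.length()));
+//   int64_t n = CopyNonNullValues(datum, compacted.data());
+//   compacted.resize(static_cast<size_t>(n));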
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc
new file mode 100644
index 00000000000..a68e78130f2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -0,0 +1,782 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstring>
+#include <mutex>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_dict.h"
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/array/concatenate.h"
+#include "arrow/array/dict_internal.h"
+#include "arrow/array/util.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/result.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/make_unique.h"
+
+namespace arrow {
+
+using internal::DictionaryTraits;
+using internal::HashTraits;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+class ActionBase {
+ public:
+ ActionBase(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+ : type_(type), pool_(pool) {}
+
+ protected:
+ std::shared_ptr<DataType> type_;
+ MemoryPool* pool_;
+};
+
+// ----------------------------------------------------------------------
+// Unique
+
+class UniqueAction final : public ActionBase {
+ public:
+ using ActionBase::ActionBase;
+
+ static constexpr bool with_error_status = false;
+
+ UniqueAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : ActionBase(type, pool) {}
+
+ Status Reset() { return Status::OK(); }
+
+ Status Reserve(const int64_t length) { return Status::OK(); }
+
+ template <class Index>
+ void ObserveNullFound(Index index) {}
+
+ template <class Index>
+ void ObserveNullNotFound(Index index) {}
+
+ template <class Index>
+ void ObserveFound(Index index) {}
+
+ template <class Index>
+ void ObserveNotFound(Index index) {}
+
+ bool ShouldEncodeNulls() { return true; }
+
+ Status Flush(Datum* out) { return Status::OK(); }
+
+ Status FlushFinal(Datum* out) { return Status::OK(); }
+};
+
+// ----------------------------------------------------------------------
+// Count values
+
+class ValueCountsAction final : ActionBase {
+ public:
+ using ActionBase::ActionBase;
+
+ static constexpr bool with_error_status = true;
+
+ ValueCountsAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : ActionBase(type, pool), count_builder_(pool) {}
+
+ Status Reserve(const int64_t length) {
+ // builder size is independent of input array size.
+ return Status::OK();
+ }
+
+ Status Reset() {
+ count_builder_.Reset();
+ return Status::OK();
+ }
+
+ // Don't do anything on flush because we don't want to finalize the builder
+ // or incur the cost of memory copies.
+ Status Flush(Datum* out) { return Status::OK(); }
+
+  // Return the counts corresponding to the MemoTable keys.
+ Status FlushFinal(Datum* out) {
+ std::shared_ptr<ArrayData> result;
+ RETURN_NOT_OK(count_builder_.FinishInternal(&result));
+ out->value = std::move(result);
+ return Status::OK();
+ }
+
+ template <class Index>
+ void ObserveNullFound(Index index) {
+ count_builder_[index]++;
+ }
+
+ template <class Index>
+ void ObserveNullNotFound(Index index) {
+    ARROW_LOG(FATAL) << "ObserveNullNotFound without error status should not be called";
+ }
+
+ template <class Index>
+ void ObserveNullNotFound(Index index, Status* status) {
+ Status s = count_builder_.Append(1);
+ if (ARROW_PREDICT_FALSE(!s.ok())) {
+ *status = s;
+ }
+ }
+
+ template <class Index>
+ void ObserveFound(Index slot) {
+ count_builder_[slot]++;
+ }
+
+ template <class Index>
+ void ObserveNotFound(Index slot, Status* status) {
+ Status s = count_builder_.Append(1);
+ if (ARROW_PREDICT_FALSE(!s.ok())) {
+ *status = s;
+ }
+ }
+
+ bool ShouldEncodeNulls() const { return true; }
+
+ private:
+ Int64Builder count_builder_;
+};
+
+// ----------------------------------------------------------------------
+// Dictionary encode implementation
+
+class DictEncodeAction final : public ActionBase {
+ public:
+ using ActionBase::ActionBase;
+
+ static constexpr bool with_error_status = false;
+
+ DictEncodeAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : ActionBase(type, pool), indices_builder_(pool) {
+ if (auto options_ptr = static_cast<const DictionaryEncodeOptions*>(options)) {
+ encode_options_ = *options_ptr;
+ }
+ }
+
+ Status Reset() {
+ indices_builder_.Reset();
+ return Status::OK();
+ }
+
+ Status Reserve(const int64_t length) { return indices_builder_.Reserve(length); }
+
+ template <class Index>
+ void ObserveNullFound(Index index) {
+ if (encode_options_.null_encoding_behavior == DictionaryEncodeOptions::MASK) {
+ indices_builder_.UnsafeAppendNull();
+ } else {
+ indices_builder_.UnsafeAppend(index);
+ }
+ }
+
+ template <class Index>
+ void ObserveNullNotFound(Index index) {
+ ObserveNullFound(index);
+ }
+
+ template <class Index>
+ void ObserveFound(Index index) {
+ indices_builder_.UnsafeAppend(index);
+ }
+
+ template <class Index>
+ void ObserveNotFound(Index index) {
+ ObserveFound(index);
+ }
+
+ bool ShouldEncodeNulls() {
+ return encode_options_.null_encoding_behavior == DictionaryEncodeOptions::ENCODE;
+ }
+
+ Status Flush(Datum* out) {
+ std::shared_ptr<ArrayData> result;
+ RETURN_NOT_OK(indices_builder_.FinishInternal(&result));
+ out->value = std::move(result);
+ return Status::OK();
+ }
+
+ Status FlushFinal(Datum* out) { return Status::OK(); }
+
+ private:
+ Int32Builder indices_builder_;
+ DictionaryEncodeOptions encode_options_;
+};
+
+class HashKernel : public KernelState {
+ public:
+ HashKernel() : options_(nullptr) {}
+ explicit HashKernel(const FunctionOptions* options) : options_(options) {}
+
+ // Reset for another run.
+ virtual Status Reset() = 0;
+
+ // Flush out accumulated results from the last invocation of Call.
+ virtual Status Flush(Datum* out) = 0;
+ // Flush out accumulated results across all invocations of Call. The kernel
+ // should not be used until after Reset() is called.
+ virtual Status FlushFinal(Datum* out) = 0;
+ // Get the values (keys) accumulated in the dictionary so far.
+ virtual Status GetDictionary(std::shared_ptr<ArrayData>* out) = 0;
+
+ virtual std::shared_ptr<DataType> value_type() const = 0;
+
+ Status Append(KernelContext* ctx, const ArrayData& input) {
+ std::lock_guard<std::mutex> guard(lock_);
+ return Append(input);
+ }
+
+ // Prepare the Action for the given input (e.g. reserve appropriately sized
+ // data structures) and visit the given input with Action.
+ virtual Status Append(const ArrayData& arr) = 0;
+
+ protected:
+ const FunctionOptions* options_;
+ std::mutex lock_;
+};
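+
+// The expected kernel-state lifecycle is roughly (sketch):
+//
+//   RETURN_NOT_OK(kernel->Reset());
+//   for (each input chunk) {
+//     RETURN_NOT_OK(kernel->Append(ctx, chunk));  // accumulate into memo table
+//     RETURN_NOT_OK(kernel->Flush(&out));         // per-chunk results, if any
+//   }
+//   RETURN_NOT_OK(kernel->FlushFinal(&out));      // results across all chunks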
+
+// ----------------------------------------------------------------------
+// Base class for all "regular" hash kernel implementations
+// (NullType has a separate implementation)
+
+template <typename Type, typename Scalar, typename Action,
+ bool with_error_status = Action::with_error_status>
+class RegularHashKernel : public HashKernel {
+ public:
+ RegularHashKernel(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : HashKernel(options), pool_(pool), type_(type), action_(type, options, pool) {}
+
+ Status Reset() override {
+ memo_table_.reset(new MemoTable(pool_, 0));
+ return action_.Reset();
+ }
+
+ Status Append(const ArrayData& arr) override {
+ RETURN_NOT_OK(action_.Reserve(arr.length));
+ return DoAppend(arr);
+ }
+
+ Status Flush(Datum* out) override { return action_.Flush(out); }
+
+ Status FlushFinal(Datum* out) override { return action_.FlushFinal(out); }
+
+ Status GetDictionary(std::shared_ptr<ArrayData>* out) override {
+ return DictionaryTraits<Type>::GetDictionaryArrayData(pool_, type_, *memo_table_,
+ 0 /* start_offset */, out);
+ }
+
+ std::shared_ptr<DataType> value_type() const override { return type_; }
+
+ template <bool HasError = with_error_status>
+ enable_if_t<!HasError, Status> DoAppend(const ArrayData& arr) {
+ return VisitArrayDataInline<Type>(
+ arr,
+ [this](Scalar v) {
+ auto on_found = [this](int32_t memo_index) {
+ action_.ObserveFound(memo_index);
+ };
+ auto on_not_found = [this](int32_t memo_index) {
+ action_.ObserveNotFound(memo_index);
+ };
+
+ int32_t unused_memo_index;
+ return memo_table_->GetOrInsert(v, std::move(on_found), std::move(on_not_found),
+ &unused_memo_index);
+ },
+ [this]() {
+ if (action_.ShouldEncodeNulls()) {
+ auto on_found = [this](int32_t memo_index) {
+ action_.ObserveNullFound(memo_index);
+ };
+ auto on_not_found = [this](int32_t memo_index) {
+ action_.ObserveNullNotFound(memo_index);
+ };
+ memo_table_->GetOrInsertNull(std::move(on_found), std::move(on_not_found));
+ } else {
+ action_.ObserveNullNotFound(-1);
+ }
+ return Status::OK();
+ });
+ }
+
+ template <bool HasError = with_error_status>
+ enable_if_t<HasError, Status> DoAppend(const ArrayData& arr) {
+ return VisitArrayDataInline<Type>(
+ arr,
+ [this](Scalar v) {
+ Status s = Status::OK();
+ auto on_found = [this](int32_t memo_index) {
+ action_.ObserveFound(memo_index);
+ };
+ auto on_not_found = [this, &s](int32_t memo_index) {
+ action_.ObserveNotFound(memo_index, &s);
+ };
+
+ int32_t unused_memo_index;
+ RETURN_NOT_OK(memo_table_->GetOrInsert(
+ v, std::move(on_found), std::move(on_not_found), &unused_memo_index));
+ return s;
+ },
+ [this]() {
+ // Null
+ Status s = Status::OK();
+ auto on_found = [this](int32_t memo_index) {
+ action_.ObserveNullFound(memo_index);
+ };
+ auto on_not_found = [this, &s](int32_t memo_index) {
+ action_.ObserveNullNotFound(memo_index, &s);
+ };
+ if (action_.ShouldEncodeNulls()) {
+ memo_table_->GetOrInsertNull(std::move(on_found), std::move(on_not_found));
+ }
+ return s;
+ });
+ }
+
+ protected:
+ using MemoTable = typename HashTraits<Type>::MemoTableType;
+
+ MemoryPool* pool_;
+ std::shared_ptr<DataType> type_;
+ Action action_;
+ std::unique_ptr<MemoTable> memo_table_;
+};
+
+// ----------------------------------------------------------------------
+// Hash kernel implementation for nulls
+
+template <typename Action, bool with_error_status = Action::with_error_status>
+class NullHashKernel : public HashKernel {
+ public:
+ NullHashKernel(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : pool_(pool), type_(type), action_(type, options, pool) {}
+
+ Status Reset() override { return action_.Reset(); }
+
+ Status Append(const ArrayData& arr) override { return DoAppend(arr); }
+
+ template <bool HasError = with_error_status>
+ enable_if_t<!HasError, Status> DoAppend(const ArrayData& arr) {
+ RETURN_NOT_OK(action_.Reserve(arr.length));
+ for (int64_t i = 0; i < arr.length; ++i) {
+ if (i == 0) {
+ seen_null_ = true;
+ action_.ObserveNullNotFound(0);
+ } else {
+ action_.ObserveNullFound(0);
+ }
+ }
+ return Status::OK();
+ }
+
+ template <bool HasError = with_error_status>
+ enable_if_t<HasError, Status> DoAppend(const ArrayData& arr) {
+ Status s = Status::OK();
+ RETURN_NOT_OK(action_.Reserve(arr.length));
+ for (int64_t i = 0; i < arr.length; ++i) {
+ if (seen_null_ == false && i == 0) {
+ seen_null_ = true;
+ action_.ObserveNullNotFound(0, &s);
+ } else {
+ action_.ObserveNullFound(0);
+ }
+ }
+ return s;
+ }
+
+ Status Flush(Datum* out) override { return action_.Flush(out); }
+ Status FlushFinal(Datum* out) override { return action_.FlushFinal(out); }
+
+ Status GetDictionary(std::shared_ptr<ArrayData>* out) override {
+ std::shared_ptr<NullArray> null_array;
+ if (seen_null_) {
+ null_array = std::make_shared<NullArray>(1);
+ } else {
+ null_array = std::make_shared<NullArray>(0);
+ }
+ *out = null_array->data();
+ return Status::OK();
+ }
+
+ std::shared_ptr<DataType> value_type() const override { return type_; }
+
+ protected:
+ MemoryPool* pool_;
+ std::shared_ptr<DataType> type_;
+ bool seen_null_ = false;
+ Action action_;
+};
+
+// ----------------------------------------------------------------------
+// Hashing for dictionary type
+
+class DictionaryHashKernel : public HashKernel {
+ public:
+ explicit DictionaryHashKernel(std::unique_ptr<HashKernel> indices_kernel)
+ : indices_kernel_(std::move(indices_kernel)) {}
+
+ Status Reset() override { return indices_kernel_->Reset(); }
+
+ Status Append(const ArrayData& arr) override {
+ if (!dictionary_) {
+ dictionary_ = arr.dictionary;
+ } else if (!MakeArray(dictionary_)->Equals(*MakeArray(arr.dictionary))) {
+ // NOTE: This approach computes a new dictionary unification per chunk.
+ // This is in effect O(n*k) where n is the total chunked array length and
+ // k is the number of chunks (therefore O(n**2) if chunks have a fixed size).
+ //
+ // A better approach may be to run the kernel over each individual chunk,
+ // and then hash-aggregate all results (for example sum-group-by for
+ // the "value_counts" kernel).
+ auto out_dict_type = dictionary_->type;
+ std::shared_ptr<Buffer> transpose_map;
+ std::shared_ptr<Array> out_dict;
+ ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(out_dict_type));
+
+ ARROW_CHECK_OK(unifier->Unify(*MakeArray(dictionary_)));
+ ARROW_CHECK_OK(unifier->Unify(*MakeArray(arr.dictionary), &transpose_map));
+ ARROW_CHECK_OK(unifier->GetResult(&out_dict_type, &out_dict));
+
+ this->dictionary_ = out_dict->data();
+ auto transpose = reinterpret_cast<const int32_t*>(transpose_map->data());
+ auto in_dict_array = MakeArray(std::make_shared<ArrayData>(arr));
+ ARROW_ASSIGN_OR_RAISE(
+ auto tmp, arrow::internal::checked_cast<const DictionaryArray&>(*in_dict_array)
+ .Transpose(arr.type, out_dict, transpose));
+ return indices_kernel_->Append(*tmp->data());
+ }
+
+ return indices_kernel_->Append(arr);
+ }
+
+ Status Flush(Datum* out) override { return indices_kernel_->Flush(out); }
+
+ Status FlushFinal(Datum* out) override { return indices_kernel_->FlushFinal(out); }
+
+ Status GetDictionary(std::shared_ptr<ArrayData>* out) override {
+ return indices_kernel_->GetDictionary(out);
+ }
+
+ std::shared_ptr<DataType> value_type() const override {
+ return indices_kernel_->value_type();
+ }
+
+ std::shared_ptr<ArrayData> dictionary() const { return dictionary_; }
+
+ private:
+ std::unique_ptr<HashKernel> indices_kernel_;
+ std::shared_ptr<ArrayData> dictionary_;
+};
+
+// ----------------------------------------------------------------------
+
+template <typename Type, typename Action, typename Enable = void>
+struct HashKernelTraits {};
+
+template <typename Type, typename Action>
+struct HashKernelTraits<Type, Action, enable_if_null<Type>> {
+ using HashKernel = NullHashKernel<Action>;
+};
+
+template <typename Type, typename Action>
+struct HashKernelTraits<Type, Action, enable_if_has_c_type<Type>> {
+ using HashKernel = RegularHashKernel<Type, typename Type::c_type, Action>;
+};
+
+template <typename Type, typename Action>
+struct HashKernelTraits<Type, Action, enable_if_has_string_view<Type>> {
+ using HashKernel = RegularHashKernel<Type, util::string_view, Action>;
+};
+
+template <typename Type, typename Action>
+Result<std::unique_ptr<HashKernel>> HashInitImpl(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ using HashKernelType = typename HashKernelTraits<Type, Action>::HashKernel;
+ auto result = ::arrow::internal::make_unique<HashKernelType>(
+ args.inputs[0].type, args.options, ctx->memory_pool());
+ RETURN_NOT_OK(result->Reset());
+ return std::move(result);
+}
+
+template <typename Type, typename Action>
+Result<std::unique_ptr<KernelState>> HashInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ return HashInitImpl<Type, Action>(ctx, args);
+}
+
+template <typename Action>
+KernelInit GetHashInit(Type::type type_id) {
+ // ARROW-8933: Generate only a single hash kernel per physical data
+ // representation
+ switch (type_id) {
+ case Type::NA:
+ return HashInit<NullType, Action>;
+ case Type::BOOL:
+ return HashInit<BooleanType, Action>;
+ case Type::INT8:
+ case Type::UINT8:
+ return HashInit<UInt8Type, Action>;
+ case Type::INT16:
+ case Type::UINT16:
+ return HashInit<UInt16Type, Action>;
+ case Type::INT32:
+ case Type::UINT32:
+ case Type::FLOAT:
+ case Type::DATE32:
+ case Type::TIME32:
+ return HashInit<UInt32Type, Action>;
+ case Type::INT64:
+ case Type::UINT64:
+ case Type::DOUBLE:
+ case Type::DATE64:
+ case Type::TIME64:
+ case Type::TIMESTAMP:
+ case Type::DURATION:
+ return HashInit<UInt64Type, Action>;
+ case Type::BINARY:
+ case Type::STRING:
+ return HashInit<BinaryType, Action>;
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ return HashInit<LargeBinaryType, Action>;
+ case Type::FIXED_SIZE_BINARY:
+ case Type::DECIMAL128:
+ case Type::DECIMAL256:
+ return HashInit<FixedSizeBinaryType, Action>;
+ default:
+ DCHECK(false);
+ return nullptr;
+ }
+}
+
+using DictionaryEncodeState = OptionsWrapper<DictionaryEncodeOptions>;
+
+template <typename Action>
+Result<std::unique_ptr<KernelState>> DictionaryHashInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ const auto& dict_type = checked_cast<const DictionaryType&>(*args.inputs[0].type);
+ Result<std::unique_ptr<HashKernel>> indices_hasher;
+ switch (dict_type.index_type()->id()) {
+ case Type::INT8:
+ indices_hasher = HashInitImpl<UInt8Type, Action>(ctx, args);
+ break;
+ case Type::INT16:
+ indices_hasher = HashInitImpl<UInt16Type, Action>(ctx, args);
+ break;
+ case Type::INT32:
+ indices_hasher = HashInitImpl<UInt32Type, Action>(ctx, args);
+ break;
+ case Type::INT64:
+ indices_hasher = HashInitImpl<UInt64Type, Action>(ctx, args);
+ break;
+ default:
+ DCHECK(false) << "Unsupported dictionary index type";
+ break;
+ }
+ RETURN_NOT_OK(indices_hasher);
+ return ::arrow::internal::make_unique<DictionaryHashKernel>(
+ std::move(indices_hasher.ValueOrDie()));
+}
+
+Status HashExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto hash_impl = checked_cast<HashKernel*>(ctx->state());
+ RETURN_NOT_OK(hash_impl->Append(ctx, *batch[0].array()));
+ RETURN_NOT_OK(hash_impl->Flush(out));
+ return Status::OK();
+}
+
+Status UniqueFinalize(KernelContext* ctx, std::vector<Datum>* out) {
+ auto hash_impl = checked_cast<HashKernel*>(ctx->state());
+ std::shared_ptr<ArrayData> uniques;
+ RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
+ *out = {Datum(uniques)};
+ return Status::OK();
+}
+
+Status DictEncodeFinalize(KernelContext* ctx, std::vector<Datum>* out) {
+ auto hash_impl = checked_cast<HashKernel*>(ctx->state());
+ std::shared_ptr<ArrayData> uniques;
+ RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
+ auto dict_type = dictionary(int32(), uniques->type);
+ auto dict = MakeArray(uniques);
+ for (size_t i = 0; i < out->size(); ++i) {
+ (*out)[i] =
+ std::make_shared<DictionaryArray>(dict_type, (*out)[i].make_array(), dict);
+ }
+ return Status::OK();
+}
+
+std::shared_ptr<ArrayData> BoxValueCounts(const std::shared_ptr<ArrayData>& uniques,
+ const std::shared_ptr<ArrayData>& counts) {
+ auto data_type =
+ struct_({field(kValuesFieldName, uniques->type), field(kCountsFieldName, int64())});
+ ArrayVector children = {MakeArray(uniques), MakeArray(counts)};
+ return std::make_shared<StructArray>(data_type, uniques->length, children)->data();
+}
+
+Status ValueCountsFinalize(KernelContext* ctx, std::vector<Datum>* out) {
+ auto hash_impl = checked_cast<HashKernel*>(ctx->state());
+ std::shared_ptr<ArrayData> uniques;
+ Datum value_counts;
+
+ RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
+ RETURN_NOT_OK(hash_impl->FlushFinal(&value_counts));
+ *out = {Datum(BoxValueCounts(uniques, value_counts.array()))};
+ return Status::OK();
+}
+
+Status UniqueFinalizeDictionary(KernelContext* ctx, std::vector<Datum>* out) {
+ RETURN_NOT_OK(UniqueFinalize(ctx, out));
+ auto hash = checked_cast<DictionaryHashKernel*>(ctx->state());
+ (*out)[0].mutable_array()->dictionary = hash->dictionary();
+ return Status::OK();
+}
+
+Status ValueCountsFinalizeDictionary(KernelContext* ctx, std::vector<Datum>* out) {
+ auto hash = checked_cast<DictionaryHashKernel*>(ctx->state());
+ std::shared_ptr<ArrayData> uniques;
+ Datum value_counts;
+ RETURN_NOT_OK(hash->GetDictionary(&uniques));
+ RETURN_NOT_OK(hash->FlushFinal(&value_counts));
+ uniques->dictionary = hash->dictionary();
+ *out = {Datum(BoxValueCounts(uniques, value_counts.array()))};
+ return Status::OK();
+}
+
+ValueDescr DictEncodeOutput(KernelContext*, const std::vector<ValueDescr>& descrs) {
+ return ValueDescr::Array(dictionary(int32(), descrs[0].type));
+}
+
+ValueDescr ValueCountsOutput(KernelContext*, const std::vector<ValueDescr>& descrs) {
+ return ValueDescr::Array(struct_(
+ {field(kValuesFieldName, descrs[0].type), field(kCountsFieldName, int64())}));
+}
+
+template <typename Action>
+void AddHashKernels(VectorFunction* func, VectorKernel base, OutputType out_ty) {
+ for (const auto& ty : PrimitiveTypes()) {
+ base.init = GetHashInit<Action>(ty->id());
+ base.signature = KernelSignature::Make({InputType::Array(ty)}, out_ty);
+ DCHECK_OK(func->AddKernel(base));
+ }
+
+ // Example parametric types that we want to match only on Type::type
+ auto parametric_types = {time32(TimeUnit::SECOND), time64(TimeUnit::MICRO),
+ timestamp(TimeUnit::SECOND), fixed_size_binary(0)};
+ for (const auto& ty : parametric_types) {
+ base.init = GetHashInit<Action>(ty->id());
+ base.signature = KernelSignature::Make({InputType::Array(ty->id())}, out_ty);
+ DCHECK_OK(func->AddKernel(base));
+ }
+
+ for (auto t : {Type::DECIMAL128, Type::DECIMAL256}) {
+ base.init = GetHashInit<Action>(t);
+ base.signature = KernelSignature::Make({InputType::Array(t)}, out_ty);
+ DCHECK_OK(func->AddKernel(base));
+ }
+}
+
+const FunctionDoc unique_doc(
+ "Compute unique elements",
+ ("Return an array with distinct values. Nulls in the input are ignored."),
+ {"array"});
+
+const FunctionDoc value_counts_doc(
+ "Compute counts of unique elements",
+ ("For each distinct value, compute the number of times it occurs in the array.\n"
+ "The result is returned as an array of `struct<input type, int64>`.\n"
+ "Nulls in the input are ignored."),
+ {"array"});
+
+const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults();
+const FunctionDoc dictionary_encode_doc(
+ "Dictionary-encode array",
+ ("Return a dictionary-encoded version of the input array."), {"array"},
+ "DictionaryEncodeOptions");
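+
+// Callers typically reach these kernels through the function registry, e.g.
+// (sketch, using the generic CallFunction entry point):
+//
+//   ARROW_ASSIGN_OR_RAISE(Datum uniques, CallFunction("unique", {array}));
+//   ARROW_ASSIGN_OR_RAISE(Datum counts, CallFunction("value_counts", {array}));
+//   ARROW_ASSIGN_OR_RAISE(Datum encoded,
+//                         CallFunction("dictionary_encode", {array}));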
+
+} // namespace
+
+void RegisterVectorHash(FunctionRegistry* registry) {
+ VectorKernel base;
+ base.exec = HashExec;
+
+ // ----------------------------------------------------------------------
+ // unique
+
+ base.finalize = UniqueFinalize;
+ base.output_chunked = false;
+ auto unique = std::make_shared<VectorFunction>("unique", Arity::Unary(), &unique_doc);
+ AddHashKernels<UniqueAction>(unique.get(), base, OutputType(FirstType));
+
+ // Dictionary unique
+ base.init = DictionaryHashInit<UniqueAction>;
+ base.finalize = UniqueFinalizeDictionary;
+ base.signature =
+ KernelSignature::Make({InputType::Array(Type::DICTIONARY)}, OutputType(FirstType));
+ DCHECK_OK(unique->AddKernel(base));
+
+ DCHECK_OK(registry->AddFunction(std::move(unique)));
+
+ // ----------------------------------------------------------------------
+ // value_counts
+
+ base.finalize = ValueCountsFinalize;
+ auto value_counts =
+ std::make_shared<VectorFunction>("value_counts", Arity::Unary(), &value_counts_doc);
+ AddHashKernels<ValueCountsAction>(value_counts.get(), base,
+ OutputType(ValueCountsOutput));
+
+ // Dictionary value counts
+ base.init = DictionaryHashInit<ValueCountsAction>;
+ base.finalize = ValueCountsFinalizeDictionary;
+ base.signature = KernelSignature::Make({InputType::Array(Type::DICTIONARY)},
+ OutputType(ValueCountsOutput));
+ DCHECK_OK(value_counts->AddKernel(base));
+
+ DCHECK_OK(registry->AddFunction(std::move(value_counts)));
+
+ // ----------------------------------------------------------------------
+ // dictionary_encode
+
+ base.finalize = DictEncodeFinalize;
+ // Unique and ValueCounts output unchunked arrays
+ base.output_chunked = true;
+ auto dict_encode = std::make_shared<VectorFunction>("dictionary_encode", Arity::Unary(),
+ &dictionary_encode_doc,
+ &kDefaultDictionaryEncodeOptions);
+ AddHashKernels<DictEncodeAction>(dict_encode.get(), base, OutputType(DictEncodeOutput));
+
+  // Calling dictionary_encode on dictionary input is not supported, but if it
+  // ends up being needed (or convenient), a kernel could be added to make it
+  // a no-op
+
+ DCHECK_OK(registry->AddFunction(std::move(dict_encode)));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc
new file mode 100644
index 00000000000..b84640854ed
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Vector kernels involving nested types
+
+#include "arrow/array/array_base.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+namespace {
+
+template <typename Type>
+Status ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ typename TypeTraits<Type>::ArrayType list_array(batch[0].array());
+ ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool()));
+ out->value = result->data();
+ return Status::OK();
+}
+
+template <typename Type, typename offset_type = typename Type::offset_type>
+Status ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ typename TypeTraits<Type>::ArrayType list(batch[0].array());
+ ArrayData* out_arr = out->mutable_array();
+
+ const offset_type* offsets = list.raw_value_offsets();
+ offset_type values_length = offsets[list.length()] - offsets[0];
+
+ out_arr->length = values_length;
+ out_arr->null_count = 0;
+ ARROW_ASSIGN_OR_RAISE(out_arr->buffers[1],
+ ctx->Allocate(values_length * sizeof(offset_type)));
+ auto out_indices = reinterpret_cast<offset_type*>(out_arr->buffers[1]->mutable_data());
+ for (int64_t i = 0; i < list.length(); ++i) {
+    // Note: In most cases, null slots are empty, but when they are non-empty
+    // we write out the indices so that they are accounted for. This
+    // behavior could be changed if needed in the future.
+ for (offset_type j = offsets[i]; j < offsets[i + 1]; ++j) {
+ *out_indices++ = static_cast<offset_type>(i);
+ }
+ }
+ return Status::OK();
+}
+
+Result<ValueDescr> ValuesType(KernelContext*, const std::vector<ValueDescr>& args) {
+ const auto& list_type = checked_cast<const BaseListType&>(*args[0].type);
+ return ValueDescr::Array(list_type.value_type());
+}
+
+const FunctionDoc list_flatten_doc(
+ "Flatten list values",
+ ("`lists` must have a list-like type.\n"
+ "Return an array with the top list level flattened.\n"
+     "Top-level null values in `lists` do not emit anything in the output."),
+ {"lists"});
+
+const FunctionDoc list_parent_indices_doc(
+ "Compute parent indices of nested list values",
+ ("`lists` must have a list-like type.\n"
+ "For each value in each list of `lists`, the top-level list index\n"
+ "is emitted."),
+ {"lists"});
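+
+// For example, given lists = [[0, 1], null, [2, 3]]:
+//   list_flatten        -> [0, 1, 2, 3]
+//   list_parent_indices -> [0, 0, 2, 2]
+// (assuming, as is typical, that the null slot spans no child values)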
+
+} // namespace
+
+void RegisterVectorNested(FunctionRegistry* registry) {
+ auto flatten =
+ std::make_shared<VectorFunction>("list_flatten", Arity::Unary(), &list_flatten_doc);
+ DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LIST)}, OutputType(ValuesType),
+ ListFlatten<ListType>));
+ DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LARGE_LIST)},
+ OutputType(ValuesType), ListFlatten<LargeListType>));
+ DCHECK_OK(registry->AddFunction(std::move(flatten)));
+
+ auto list_parent_indices = std::make_shared<VectorFunction>(
+ "list_parent_indices", Arity::Unary(), &list_parent_indices_doc);
+ DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LIST)}, int32(),
+ ListParentIndices<ListType>));
+ DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LARGE_LIST)}, int64(),
+ ListParentIndices<LargeListType>));
+ DCHECK_OK(registry->AddFunction(std::move(list_parent_indices)));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc
new file mode 100644
index 00000000000..644aec2a4e9
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc
@@ -0,0 +1,540 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bitmap_ops.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+Status ReplacementArrayTooShort(int64_t expected, int64_t actual) {
+ return Status::Invalid("Replacement array must be of appropriate length (expected ",
+ expected, " items but got ", actual, " items)");
+}
+
+// Helper to implement the replace_with_mask kernel with a scalar mask for
+// fixed-width types, using callbacks to handle both bool and byte-sized types
+template <typename Functor>
+Status ReplaceWithScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ Datum source = array;
+ if (!mask.is_valid) {
+ // Output = null
+ source = MakeNullScalar(output->type);
+ } else if (mask.value) {
+ // Output = replacement
+ source = replacements;
+ }
+ uint8_t* out_bitmap = output->buffers[0]->mutable_data();
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+ const int64_t out_offset = output->offset;
+ if (source.is_array()) {
+ const ArrayData& in_data = *source.array();
+ if (in_data.length < array.length) {
+ return ReplacementArrayTooShort(array.length, in_data.length);
+ }
+ Functor::CopyData(*array.type, out_values, out_offset, in_data, /*in_offset=*/0,
+ array.length);
+ if (in_data.MayHaveNulls()) {
+ arrow::internal::CopyBitmap(in_data.buffers[0]->data(), in_data.offset,
+ array.length, out_bitmap, out_offset);
+ } else {
+ BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, true);
+ }
+ } else {
+ const Scalar& in_data = *source.scalar();
+ Functor::CopyData(*array.type, out_values, out_offset, in_data, /*in_offset=*/0,
+ array.length);
+ BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, in_data.is_valid);
+ }
+ return Status::OK();
+}
+
+struct CopyArrayBitmap {
+ const uint8_t* in_bitmap;
+ int64_t in_offset;
+
+ void CopyBitmap(uint8_t* out_bitmap, int64_t out_offset, int64_t offset,
+ int64_t length) const {
+ arrow::internal::CopyBitmap(in_bitmap, in_offset + offset, length, out_bitmap,
+ out_offset);
+ }
+
+ void SetBit(uint8_t* out_bitmap, int64_t out_offset, int64_t offset) const {
+ BitUtil::SetBitTo(out_bitmap, out_offset,
+ BitUtil::GetBit(in_bitmap, in_offset + offset));
+ }
+};
+
+struct CopyScalarBitmap {
+ const bool is_valid;
+
+ void CopyBitmap(uint8_t* out_bitmap, int64_t out_offset, int64_t offset,
+ int64_t length) const {
+ BitUtil::SetBitsTo(out_bitmap, out_offset, length, is_valid);
+ }
+
+ void SetBit(uint8_t* out_bitmap, int64_t out_offset, int64_t offset) const {
+ BitUtil::SetBitTo(out_bitmap, out_offset, is_valid);
+ }
+};
+
+// Helper to implement the replace_with_mask kernel with an array mask for
+// fixed-width types, using callbacks to handle both bool and byte-sized types
+// and to handle scalar and array replacements
+template <typename Functor, typename Data, typename CopyBitmap>
+void ReplaceWithArrayMaskImpl(const ArrayData& array, const ArrayData& mask,
+ const Data& replacements, bool replacements_bitmap,
+ const CopyBitmap& copy_bitmap, const uint8_t* mask_bitmap,
+ const uint8_t* mask_values, uint8_t* out_bitmap,
+ uint8_t* out_values, const int64_t out_offset) {
+ Functor::CopyData(*array.type, out_values, /*out_offset=*/0, array, /*in_offset=*/0,
+ array.length);
+ arrow::internal::OptionalBinaryBitBlockCounter counter(
+ mask_values, mask.offset, mask_bitmap, mask.offset, mask.length);
+ int64_t write_offset = 0;
+ int64_t replacements_offset = 0;
+ while (write_offset < array.length) {
+ BitBlockCount block = counter.NextAndBlock();
+ if (block.AllSet()) {
+ // Copy from replacement array
+ Functor::CopyData(*array.type, out_values, out_offset + write_offset, replacements,
+ replacements_offset, block.length);
+ if (replacements_bitmap) {
+ copy_bitmap.CopyBitmap(out_bitmap, out_offset + write_offset, replacements_offset,
+ block.length);
+ } else if (!replacements_bitmap && out_bitmap) {
+ BitUtil::SetBitsTo(out_bitmap, out_offset + write_offset, block.length, true);
+ }
+ replacements_offset += block.length;
+ } else if (block.popcount) {
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (BitUtil::GetBit(mask_values, write_offset + mask.offset + i) &&
+ (!mask_bitmap ||
+ BitUtil::GetBit(mask_bitmap, write_offset + mask.offset + i))) {
+ Functor::CopyData(*array.type, out_values, out_offset + write_offset + i,
+ replacements, replacements_offset, /*length=*/1);
+ if (replacements_bitmap) {
+ copy_bitmap.SetBit(out_bitmap, out_offset + write_offset + i,
+ replacements_offset);
+ }
+ replacements_offset++;
+ }
+ }
+ }
+ write_offset += block.length;
+ }
+}
+
+template <typename Functor>
+Status ReplaceWithArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ const int64_t out_offset = output->offset;
+ uint8_t* out_bitmap = nullptr;
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+ const uint8_t* mask_bitmap = mask.MayHaveNulls() ? mask.buffers[0]->data() : nullptr;
+ const uint8_t* mask_values = mask.buffers[1]->data();
+ const bool replacements_bitmap = replacements.is_array()
+ ? replacements.array()->MayHaveNulls()
+ : !replacements.scalar()->is_valid;
+ if (replacements.is_array()) {
+ // Check that we have enough replacement values
+ const int64_t replacements_length = replacements.array()->length;
+
+ BooleanArray mask_arr(mask.length, mask.buffers[1], mask.buffers[0], mask.null_count,
+ mask.offset);
+ const int64_t count = mask_arr.true_count();
+ if (count > replacements_length) {
+ return ReplacementArrayTooShort(count, replacements_length);
+ }
+ }
+ if (array.MayHaveNulls() || mask.MayHaveNulls() || replacements_bitmap) {
+ out_bitmap = output->buffers[0]->mutable_data();
+ output->null_count = -1;
+ if (array.MayHaveNulls()) {
+ // Copy array's bitmap
+ arrow::internal::CopyBitmap(array.buffers[0]->data(), array.offset, array.length,
+ out_bitmap, out_offset);
+ } else {
+ // Array has no bitmap but mask/replacements do, generate an all-valid bitmap
+ BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, true);
+ }
+ } else {
+ BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), out_offset, array.length,
+ true);
+ output->null_count = 0;
+ }
+
+ if (replacements.is_array()) {
+ const ArrayData& array_repl = *replacements.array();
+ ReplaceWithArrayMaskImpl<Functor>(
+ array, mask, array_repl, replacements_bitmap,
+ CopyArrayBitmap{replacements_bitmap ? array_repl.buffers[0]->data() : nullptr,
+ array_repl.offset},
+ mask_bitmap, mask_values, out_bitmap, out_values, out_offset);
+ } else {
+ const Scalar& scalar_repl = *replacements.scalar();
+ ReplaceWithArrayMaskImpl<Functor>(array, mask, scalar_repl, replacements_bitmap,
+ CopyScalarBitmap{scalar_repl.is_valid}, mask_bitmap,
+ mask_values, out_bitmap, out_values, out_offset);
+ }
+
+ if (mask.MayHaveNulls()) {
+ arrow::internal::BitmapAnd(out_bitmap, out_offset, mask.buffers[0]->data(),
+ mask.offset, array.length, out_offset, out_bitmap);
+ }
+ return Status::OK();
+}
+
+template <typename Type, typename Enable = void>
+struct ReplaceWithMask {};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_number<Type>> {
+ using T = typename TypeTraits<Type>::CType;
+
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * sizeof(T));
+ std::memcpy(out + (out_offset * sizeof(T)), in_arr, length * sizeof(T));
+ }
+
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ T* begin = reinterpret_cast<T*>(out + (out_offset * sizeof(T)));
+ T* end = begin + length;
+ std::fill(begin, end, UnboxScalar<Type>::Unbox(in));
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_boolean<Type>> {
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const auto in_arr = in.GetValues<uint8_t>(1, /*absolute_offset=*/0);
+ arrow::internal::CopyBitmap(in_arr, in_offset + in.offset, length, out, out_offset);
+ }
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ BitUtil::SetBitsTo(out, out_offset, length, in.is_valid);
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_same<Type, FixedSizeBinaryType>> {
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * width);
+ std::memcpy(begin, in_arr, length * width);
+ }
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto& scalar = checked_cast<const FixedSizeBinaryScalar&>(in);
+ // Null scalar may have null value buffer
+ if (!scalar.value) return;
+ const Buffer& buffer = *scalar.value;
+ const uint8_t* value = buffer.data();
+ DCHECK_GE(buffer.size(), width);
+ for (int i = 0; i < length; i++) {
+ std::memcpy(begin, value, width);
+ begin += width;
+ }
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_decimal<Type>> {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * width);
+ std::memcpy(begin, in_arr, length * width);
+ }
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto& scalar = checked_cast<const ScalarType&>(in);
+ const auto value = scalar.value.ToBytes();
+ for (int i = 0; i < length; i++) {
+ std::memcpy(begin, value.data(), width);
+ begin += width;
+ }
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_null<Type>> {
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ *output = array;
+ return Status::OK();
+ }
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ *output = array;
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_base_binary<Type>> {
+ using offset_type = typename Type::offset_type;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ if (!mask.is_valid) {
+ // Output = null
+ ARROW_ASSIGN_OR_RAISE(
+ auto replacement_array,
+ MakeArrayOfNull(array.type, array.length, ctx->memory_pool()));
+ *output = *replacement_array->data();
+ } else if (mask.value) {
+ // Output = replacement
+ if (replacements.is_scalar()) {
+ ARROW_ASSIGN_OR_RAISE(auto replacement_array,
+ MakeArrayFromScalar(*replacements.scalar(), array.length,
+ ctx->memory_pool()));
+ *output = *replacement_array->data();
+ } else {
+ const ArrayData& replacement_array = *replacements.array();
+ if (replacement_array.length < array.length) {
+ return ReplacementArrayTooShort(array.length, replacement_array.length);
+ }
+ *output = replacement_array;
+ output->length = array.length;
+ }
+ } else {
+ // Output = input
+ *output = array;
+ }
+ return Status::OK();
+ }
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ BuilderType builder(array.type, ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(array.length));
+ RETURN_NOT_OK(builder.ReserveData(array.buffers[2]->size()));
+ int64_t source_offset = 0;
+ int64_t replacements_offset = 0;
+ RETURN_NOT_OK(VisitArrayDataInline<BooleanType>(
+ mask,
+ [&](bool replace) {
+ if (replace && replacements.is_scalar()) {
+ const Scalar& scalar = *replacements.scalar();
+ if (scalar.is_valid) {
+ RETURN_NOT_OK(builder.Append(UnboxScalar<Type>::Unbox(scalar)));
+ } else {
+ RETURN_NOT_OK(builder.AppendNull());
+ }
+ } else {
+ const ArrayData& source = replace ? *replacements.array() : array;
+ const int64_t offset = replace ? replacements_offset++ : source_offset;
+ if (!source.MayHaveNulls() ||
+ BitUtil::GetBit(source.buffers[0]->data(), source.offset + offset)) {
+ const uint8_t* data = source.buffers[2]->data();
+ const offset_type* offsets = source.GetValues<offset_type>(1);
+ const offset_type offset0 = offsets[offset];
+ const offset_type offset1 = offsets[offset + 1];
+ RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
+ } else {
+ RETURN_NOT_OK(builder.AppendNull());
+ }
+ }
+ source_offset++;
+ return Status::OK();
+ },
+ [&]() {
+ RETURN_NOT_OK(builder.AppendNull());
+ source_offset++;
+ return Status::OK();
+ }));
+ std::shared_ptr<Array> temp_output;
+ RETURN_NOT_OK(builder.Finish(&temp_output));
+ *output = *temp_output->data();
+ // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
+ output->type = array.type;
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMaskFunctor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ArrayData& array = *batch[0].array();
+ const Datum& replacements = batch[2];
+ ArrayData* output = out->array().get();
+ output->length = array.length;
+
+ // Needed for FixedSizeBinary/parameterized types
+ if (!array.type->Equals(*replacements.type(), /*check_metadata=*/false)) {
+ return Status::Invalid("Replacements must be of same type (expected ",
+ array.type->ToString(), " but got ",
+ replacements.type()->ToString(), ")");
+ }
+
+ if (!replacements.is_array() && !replacements.is_scalar()) {
+ return Status::Invalid("Replacements must be array or scalar");
+ }
+
+ if (batch[1].is_scalar()) {
+ return ReplaceWithMask<Type>::ExecScalarMask(
+ ctx, array, batch[1].scalar_as<BooleanScalar>(), replacements, output);
+ }
+ const ArrayData& mask = *batch[1].array();
+ if (array.length != mask.length) {
+ return Status::Invalid("Mask must be of same length as array (expected ",
+ array.length, " items but got ", mask.length, " items)");
+ }
+ return ReplaceWithMask<Type>::ExecArrayMask(ctx, array, mask, replacements, output);
+ }
+};
+
+} // namespace
+
+const FunctionDoc replace_with_mask_doc(
+ "Replace items using a mask and replacement values",
+ ("Given an array and a Boolean mask (either scalar or of equal length), "
+ "along with replacement values (either scalar or array), "
+ "each element of the array for which the corresponding mask element is "
+ "true will be replaced by the next value from the replacements, "
+ "or with null if the mask is null. "
+ "Hence, for replacement arrays, len(replacements) == sum(mask == true)."),
+ {"values", "mask", "replacements"});
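+
+// For example (sketch): values = [1, 2, 3, 4], mask = [false, true, null, true]
+// and replacements = [10, 20] produce [1, 10, null, 20].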
+
+void RegisterVectorReplace(FunctionRegistry* registry) {
+ auto func = std::make_shared<VectorFunction>("replace_with_mask", Arity::Ternary(),
+ &replace_with_mask_doc);
+ auto add_kernel = [&](detail::GetTypeId get_id, ArrayKernelExec exec) {
+ VectorKernel kernel;
+ kernel.can_execute_chunkwise = false;
+ if (is_fixed_width(get_id.id)) {
+ kernel.null_handling = NullHandling::type::COMPUTED_PREALLOCATE;
+ } else {
+ kernel.can_write_into_slices = false;
+ kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
+ }
+ kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
+ kernel.signature = KernelSignature::Make(
+ {InputType::Array(get_id.id), InputType(boolean()), InputType(get_id.id)},
+ OutputType(FirstType));
+ kernel.exec = std::move(exec);
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ };
+ auto add_primitive_kernel = [&](detail::GetTypeId get_id) {
+ add_kernel(get_id, GenerateTypeAgnosticPrimitive<ReplaceWithMaskFunctor>(get_id));
+ };
+ for (const auto& ty : NumericTypes()) {
+ add_primitive_kernel(ty);
+ }
+ for (const auto& ty : TemporalTypes()) {
+ add_primitive_kernel(ty);
+ }
+ add_primitive_kernel(null());
+ add_primitive_kernel(boolean());
+ add_primitive_kernel(day_time_interval());
+ add_primitive_kernel(month_interval());
+ add_kernel(Type::FIXED_SIZE_BINARY, ReplaceWithMaskFunctor<FixedSizeBinaryType>::Exec);
+ add_kernel(Type::DECIMAL128, ReplaceWithMaskFunctor<Decimal128Type>::Exec);
+ add_kernel(Type::DECIMAL256, ReplaceWithMaskFunctor<Decimal256Type>::Exec);
+ for (const auto& ty : BaseBinaryTypes()) {
+ add_kernel(ty->id(), GenerateTypeAgnosticVarBinaryBase<ReplaceWithMaskFunctor>(*ty));
+ }
+ // TODO: list types
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ // TODO(ARROW-9431): "replace_with_indices"
+}
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc
new file mode 100644
index 00000000000..5845a7ee2d0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc
@@ -0,0 +1,2268 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstring>
+#include <limits>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_binary.h"
+#include "arrow/array/array_dict.h"
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/array/concatenate.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/extension_type.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/int_util.h"
+
+namespace arrow {
+
+using internal::BinaryBitBlockCounter;
+using internal::BitBlockCount;
+using internal::BitBlockCounter;
+using internal::CheckIndexBounds;
+using internal::CopyBitmap;
+using internal::CountSetBits;
+using internal::GetArrayView;
+using internal::GetByteWidth;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitIndexer;
+
+namespace compute {
+namespace internal {
+
+int64_t GetFilterOutputSize(const ArrayData& filter,
+ FilterOptions::NullSelectionBehavior null_selection) {
+ int64_t output_size = 0;
+
+ if (filter.MayHaveNulls()) {
+ const uint8_t* filter_is_valid = filter.buffers[0]->data();
+ BinaryBitBlockCounter bit_counter(filter.buffers[1]->data(), filter.offset,
+ filter_is_valid, filter.offset, filter.length);
+ int64_t position = 0;
+ if (null_selection == FilterOptions::EMIT_NULL) {
+ while (position < filter.length) {
+ BitBlockCount block = bit_counter.NextOrNotWord();
+ output_size += block.popcount;
+ position += block.length;
+ }
+ } else {
+ while (position < filter.length) {
+ BitBlockCount block = bit_counter.NextAndWord();
+ output_size += block.popcount;
+ position += block.length;
+ }
+ }
+ } else {
+ // The filter has no nulls, so we can use CountSetBits
+ output_size = CountSetBits(filter.buffers[1]->data(), filter.offset, filter.length);
+ }
+ return output_size;
+}
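+
+// Worked example (illustrative): for filter slots [true, null, false, true],
+// EMIT_NULL counts the true-or-null slots and returns 3, while DROP counts
+// only the true-and-valid slots and returns 2.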
+
+namespace {
+
+template <typename IndexType>
+Result<std::shared_ptr<ArrayData>> GetTakeIndicesImpl(
+ const ArrayData& filter, FilterOptions::NullSelectionBehavior null_selection,
+ MemoryPool* memory_pool) {
+ using T = typename IndexType::c_type;
+
+ const uint8_t* filter_data = filter.buffers[1]->data();
+ const bool have_filter_nulls = filter.MayHaveNulls();
+ const uint8_t* filter_is_valid =
+ have_filter_nulls ? filter.buffers[0]->data() : nullptr;
+
+ if (have_filter_nulls && null_selection == FilterOptions::EMIT_NULL) {
+ // Most complex case: the filter may have nulls and we don't drop them.
+ // The logic is ternary:
+ // - filter is null: emit null
+ // - filter is valid and true: emit index
+ // - filter is valid and false: don't emit anything
+
+ typename TypeTraits<IndexType>::BuilderType builder(memory_pool);
+
+ // The position relative to the start of the filter
+ T position = 0;
+ // The current position taking the filter offset into account
+ int64_t position_with_offset = filter.offset;
+
+ // To count blocks where filter_data[i] || !filter_is_valid[i]
+ BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid,
+ filter.offset, filter.length);
+ BitBlockCounter is_valid_counter(filter_is_valid, filter.offset, filter.length);
+ while (position < filter.length) {
+ // true OR NOT valid
+ BitBlockCount selected_or_null_block = filter_counter.NextOrNotWord();
+ if (selected_or_null_block.NoneSet()) {
+ position += selected_or_null_block.length;
+ position_with_offset += selected_or_null_block.length;
+ continue;
+ }
+ RETURN_NOT_OK(builder.Reserve(selected_or_null_block.popcount));
+
+      // If the filter slots are all valid and the selected_or_null_block is
+      // full, then all the slots are true and we can skip the bit checking
+ BitBlockCount is_valid_block = is_valid_counter.NextWord();
+
+ if (selected_or_null_block.AllSet() && is_valid_block.AllSet()) {
+ // All the values are selected and non-null
+ for (int64_t i = 0; i < selected_or_null_block.length; ++i) {
+ builder.UnsafeAppend(position++);
+ }
+ position_with_offset += selected_or_null_block.length;
+ } else {
+ // Some of the values are false or null
+ for (int64_t i = 0; i < selected_or_null_block.length; ++i) {
+ if (BitUtil::GetBit(filter_is_valid, position_with_offset)) {
+ if (BitUtil::GetBit(filter_data, position_with_offset)) {
+ builder.UnsafeAppend(position);
+ }
+ } else {
+ // Null slot, so append a null
+ builder.UnsafeAppendNull();
+ }
+ ++position;
+ ++position_with_offset;
+ }
+ }
+ }
+ std::shared_ptr<ArrayData> result;
+ RETURN_NOT_OK(builder.FinishInternal(&result));
+ return result;
+ }
+
+ // Other cases don't emit nulls and are therefore simpler.
+ TypedBufferBuilder<T> builder(memory_pool);
+
+ if (have_filter_nulls) {
+ // The filter may have nulls, so we scan the validity bitmap and the filter
+ // data bitmap together.
+ DCHECK_EQ(null_selection, FilterOptions::DROP);
+
+ // The position relative to the start of the filter
+ T position = 0;
+ // The current position taking the filter offset into account
+ int64_t position_with_offset = filter.offset;
+
+ BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid,
+ filter.offset, filter.length);
+ while (position < filter.length) {
+ BitBlockCount and_block = filter_counter.NextAndWord();
+ RETURN_NOT_OK(builder.Reserve(and_block.popcount));
+ if (and_block.AllSet()) {
+ // All the values are selected and non-null
+ for (int64_t i = 0; i < and_block.length; ++i) {
+ builder.UnsafeAppend(position++);
+ }
+ position_with_offset += and_block.length;
+ } else if (!and_block.NoneSet()) {
+ // Some of the values are false or null
+ for (int64_t i = 0; i < and_block.length; ++i) {
+ if (BitUtil::GetBit(filter_is_valid, position_with_offset) &&
+ BitUtil::GetBit(filter_data, position_with_offset)) {
+ builder.UnsafeAppend(position);
+ }
+ ++position;
+ ++position_with_offset;
+ }
+ } else {
+ position += and_block.length;
+ position_with_offset += and_block.length;
+ }
+ }
+ } else {
+ // The filter has no nulls, so we need only look for true values
+ RETURN_NOT_OK(::arrow::internal::VisitSetBitRuns(
+ filter_data, filter.offset, filter.length, [&](int64_t offset, int64_t length) {
+ // Append the consecutive run of indices
+ RETURN_NOT_OK(builder.Reserve(length));
+ for (int64_t i = 0; i < length; ++i) {
+ builder.UnsafeAppend(static_cast<T>(offset + i));
+ }
+ return Status::OK();
+ }));
+ }
+
+ const int64_t length = builder.length();
+ std::shared_ptr<Buffer> out_buffer;
+ RETURN_NOT_OK(builder.Finish(&out_buffer));
+ return std::make_shared<ArrayData>(TypeTraits<IndexType>::type_singleton(), length,
+ BufferVector{nullptr, out_buffer}, /*null_count=*/0);
+}
+
+} // namespace
+
+Result<std::shared_ptr<ArrayData>> GetTakeIndices(
+ const ArrayData& filter, FilterOptions::NullSelectionBehavior null_selection,
+ MemoryPool* memory_pool) {
+ DCHECK_EQ(filter.type->id(), Type::BOOL);
+ if (filter.length <= std::numeric_limits<uint16_t>::max()) {
+ return GetTakeIndicesImpl<UInt16Type>(filter, null_selection, memory_pool);
+ } else if (filter.length <= std::numeric_limits<uint32_t>::max()) {
+ return GetTakeIndicesImpl<UInt32Type>(filter, null_selection, memory_pool);
+ } else {
+ // Arrays over 4 billion elements, not especially likely.
+ return Status::NotImplemented(
+ "Filter length exceeds UINT32_MAX, "
+ "consider a different strategy for selecting elements");
+ }
+}
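+
+// Worked example (illustrative): an all-valid filter [true, false, true, true]
+// yields take indices [0, 2, 3] under either null selection behavior; with a
+// null in slot 1 instead of false, EMIT_NULL yields [0, null, 2, 3] while
+// DROP yields [0, 2, 3].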
+
+namespace {
+
+using FilterState = OptionsWrapper<FilterOptions>;
+using TakeState = OptionsWrapper<TakeOptions>;
+
+Status PreallocateData(KernelContext* ctx, int64_t length, int bit_width,
+ bool allocate_validity, ArrayData* out) {
+ // Preallocate memory
+ out->length = length;
+ out->buffers.resize(2);
+
+ if (allocate_validity) {
+ ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length));
+ }
+ if (bit_width == 1) {
+ ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->Allocate(length * bit_width / 8));
+ }
+ return Status::OK();
+}
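+
+// Note: callers in this file pass bit_width == 1 (allocated as a bitmap) or a
+// multiple of 8 (8/16/32/64), so length * bit_width / 8 above cannot truncate.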
+
+// ----------------------------------------------------------------------
+// Implement optimized take for primitive types, from boolean to 1/2/4/8-byte
+// C-type based types. A common implementation is shared across byte widths,
+// and code is generated only for unsigned integer indices: boundschecking
+// catches negative indices, so signed index types can then be safely
+// reinterpret_cast to unsigned.
+
+/// \brief The Take implementation for primitive (fixed-width) types does not
+/// use the logical Arrow type but rather the physical C type. This way we
+/// only generate one take function for each byte width.
+///
+/// This function assumes that the indices have been boundschecked.
+template <typename IndexCType, typename ValueCType>
+struct PrimitiveTakeImpl {
+ static void Exec(const PrimitiveArg& values, const PrimitiveArg& indices,
+ ArrayData* out_arr) {
+ auto values_data = reinterpret_cast<const ValueCType*>(values.data);
+ auto values_is_valid = values.is_valid;
+ auto values_offset = values.offset;
+
+ auto indices_data = reinterpret_cast<const IndexCType*>(indices.data);
+ auto indices_is_valid = indices.is_valid;
+ auto indices_offset = indices.offset;
+
+ auto out = out_arr->GetMutableValues<ValueCType>(1);
+ auto out_is_valid = out_arr->buffers[0]->mutable_data();
+ auto out_offset = out_arr->offset;
+
+ // If either the values or indices have nulls, we preemptively zero out the
+ // out validity bitmap so that we don't have to use ClearBit in each
+ // iteration for nulls.
+ if (values.null_count != 0 || indices.null_count != 0) {
+ BitUtil::SetBitsTo(out_is_valid, out_offset, indices.length, false);
+ }
+
+ OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset,
+ indices.length);
+ int64_t position = 0;
+ int64_t valid_count = 0;
+ while (position < indices.length) {
+ BitBlockCount block = indices_bit_counter.NextBlock();
+ if (values.null_count == 0) {
+ // Values are never null, so things are easier
+ valid_count += block.popcount;
+ if (block.popcount == block.length) {
+          // Fastest path: neither the values nor the indices are null
+ BitUtil::SetBitsTo(out_is_valid, out_offset + position, block.length, true);
+ for (int64_t i = 0; i < block.length; ++i) {
+ out[position] = values_data[indices_data[position]];
+ ++position;
+ }
+ } else if (block.popcount > 0) {
+          // Slow path: some but not all indices are null
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (BitUtil::GetBit(indices_is_valid, indices_offset + position)) {
+ // index is not null
+ BitUtil::SetBit(out_is_valid, out_offset + position);
+ out[position] = values_data[indices_data[position]];
+ } else {
+ out[position] = ValueCType{};
+ }
+ ++position;
+ }
+ } else {
+ memset(out + position, 0, sizeof(ValueCType) * block.length);
+ position += block.length;
+ }
+ } else {
+ // Values have nulls, so we must do random access into the values bitmap
+ if (block.popcount == block.length) {
+ // Faster path: indices are not null but values may be
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (BitUtil::GetBit(values_is_valid,
+ values_offset + indices_data[position])) {
+ // value is not null
+ out[position] = values_data[indices_data[position]];
+ BitUtil::SetBit(out_is_valid, out_offset + position);
+ ++valid_count;
+ } else {
+ out[position] = ValueCType{};
+ }
+ ++position;
+ }
+ } else if (block.popcount > 0) {
+ // Slow path: some but not all indices are null. Since we are doing
+ // random access in general we have to check the value nullness one by
+ // one.
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (BitUtil::GetBit(indices_is_valid, indices_offset + position) &&
+ BitUtil::GetBit(values_is_valid,
+ values_offset + indices_data[position])) {
+ // index is not null && value is not null
+ out[position] = values_data[indices_data[position]];
+ BitUtil::SetBit(out_is_valid, out_offset + position);
+ ++valid_count;
+ } else {
+ out[position] = ValueCType{};
+ }
+ ++position;
+ }
+ } else {
+ memset(out + position, 0, sizeof(ValueCType) * block.length);
+ position += block.length;
+ }
+ }
+ }
+ out_arr->null_count = out_arr->length - valid_count;
+ }
+};
+
+template <typename IndexCType>
+struct BooleanTakeImpl {
+ static void Exec(const PrimitiveArg& values, const PrimitiveArg& indices,
+ ArrayData* out_arr) {
+ const uint8_t* values_data = values.data;
+ auto values_is_valid = values.is_valid;
+ auto values_offset = values.offset;
+
+ auto indices_data = reinterpret_cast<const IndexCType*>(indices.data);
+ auto indices_is_valid = indices.is_valid;
+ auto indices_offset = indices.offset;
+
+ auto out = out_arr->buffers[1]->mutable_data();
+ auto out_is_valid = out_arr->buffers[0]->mutable_data();
+ auto out_offset = out_arr->offset;
+
+ // If either the values or indices have nulls, we preemptively zero out the
+ // out validity bitmap so that we don't have to use ClearBit in each
+ // iteration for nulls.
+ if (values.null_count != 0 || indices.null_count != 0) {
+ BitUtil::SetBitsTo(out_is_valid, out_offset, indices.length, false);
+ }
+    // Avoid uninitialized data in the output values buffer
+ BitUtil::SetBitsTo(out, out_offset, indices.length, false);
+
+ auto PlaceDataBit = [&](int64_t loc, IndexCType index) {
+ BitUtil::SetBitTo(out, out_offset + loc,
+ BitUtil::GetBit(values_data, values_offset + index));
+ };
+
+ OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset,
+ indices.length);
+ int64_t position = 0;
+ int64_t valid_count = 0;
+ while (position < indices.length) {
+ BitBlockCount block = indices_bit_counter.NextBlock();
+ if (values.null_count == 0) {
+ // Values are never null, so things are easier
+ valid_count += block.popcount;
+ if (block.popcount == block.length) {
+          // Fastest path: neither the values nor the indices are null
+ BitUtil::SetBitsTo(out_is_valid, out_offset + position, block.length, true);
+ for (int64_t i = 0; i < block.length; ++i) {
+ PlaceDataBit(position, indices_data[position]);
+ ++position;
+ }
+ } else if (block.popcount > 0) {
+ // Slow path: some but not all indices are null
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (BitUtil::GetBit(indices_is_valid, indices_offset + position)) {
+ // index is not null
+ BitUtil::SetBit(out_is_valid, out_offset + position);
+ PlaceDataBit(position, indices_data[position]);
+ }
+ ++position;
+ }
+ } else {
+ position += block.length;
+ }
+ } else {
+ // Values have nulls, so we must do random access into the values bitmap
+ if (block.popcount == block.length) {
+ // Faster path: indices are not null but values may be
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (BitUtil::GetBit(values_is_valid,
+ values_offset + indices_data[position])) {
+ // value is not null
+ BitUtil::SetBit(out_is_valid, out_offset + position);
+ PlaceDataBit(position, indices_data[position]);
+ ++valid_count;
+ }
+ ++position;
+ }
+ } else if (block.popcount > 0) {
+ // Slow path: some but not all indices are null. Since we are doing
+ // random access in general we have to check the value nullness one by
+ // one.
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (BitUtil::GetBit(indices_is_valid, indices_offset + position)) {
+ // index is not null
+ if (BitUtil::GetBit(values_is_valid,
+ values_offset + indices_data[position])) {
+ // value is not null
+ PlaceDataBit(position, indices_data[position]);
+ BitUtil::SetBit(out_is_valid, out_offset + position);
+ ++valid_count;
+ }
+ }
+ ++position;
+ }
+ } else {
+ position += block.length;
+ }
+ }
+ }
+ out_arr->null_count = out_arr->length - valid_count;
+ }
+};
+
+template <template <typename...> class TakeImpl, typename... Args>
+void TakeIndexDispatch(const PrimitiveArg& values, const PrimitiveArg& indices,
+ ArrayData* out) {
+ // With the simplifying assumption that boundschecking has taken place
+ // already at a higher level, we can now assume that the index values are all
+ // non-negative. Thus, we can interpret signed integers as unsigned and avoid
+ // having to generate double the amount of binary code to handle each integer
+ // width.
+ switch (indices.bit_width) {
+ case 8:
+ return TakeImpl<uint8_t, Args...>::Exec(values, indices, out);
+ case 16:
+ return TakeImpl<uint16_t, Args...>::Exec(values, indices, out);
+ case 32:
+ return TakeImpl<uint32_t, Args...>::Exec(values, indices, out);
+ case 64:
+ return TakeImpl<uint64_t, Args...>::Exec(values, indices, out);
+ default:
+      DCHECK(false) << "Invalid indices bit width";
+ break;
+ }
+}
+
+Status PrimitiveTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (TakeState::Get(ctx).boundscheck) {
+ RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
+ }
+
+ PrimitiveArg values = GetPrimitiveArg(*batch[0].array());
+ PrimitiveArg indices = GetPrimitiveArg(*batch[1].array());
+
+ ArrayData* out_arr = out->mutable_array();
+
+ // TODO: When neither values nor indices contain nulls, we can skip
+ // allocating the validity bitmap altogether and save time and space. A
+ // streamlined PrimitiveTakeImpl would need to be written that skips all
+ // interactions with the output validity bitmap, though.
+ RETURN_NOT_OK(PreallocateData(ctx, indices.length, values.bit_width,
+ /*allocate_validity=*/true, out_arr));
+ switch (values.bit_width) {
+ case 1:
+ TakeIndexDispatch<BooleanTakeImpl>(values, indices, out_arr);
+ break;
+ case 8:
+ TakeIndexDispatch<PrimitiveTakeImpl, int8_t>(values, indices, out_arr);
+ break;
+ case 16:
+ TakeIndexDispatch<PrimitiveTakeImpl, int16_t>(values, indices, out_arr);
+ break;
+ case 32:
+ TakeIndexDispatch<PrimitiveTakeImpl, int32_t>(values, indices, out_arr);
+ break;
+ case 64:
+ TakeIndexDispatch<PrimitiveTakeImpl, int64_t>(values, indices, out_arr);
+ break;
+ default:
+      DCHECK(false) << "Invalid values bit width";
+ break;
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Optimized and streamlined filter for primitive types
+
+// Use either BitBlockCounter or BinaryBitBlockCounter to quickly scan the
+// filter a word at a time for the DROP null selection behavior.
+class DropNullCounter {
+ public:
+ // validity bitmap may be null
+ DropNullCounter(const uint8_t* validity, const uint8_t* data, int64_t offset,
+ int64_t length)
+ : data_counter_(data, offset, length),
+ data_and_validity_counter_(data, offset, validity, offset, length),
+ has_validity_(validity != nullptr) {}
+
+ BitBlockCount NextBlock() {
+ if (has_validity_) {
+ // filter is true AND not null
+ return data_and_validity_counter_.NextAndWord();
+ } else {
+ return data_counter_.NextWord();
+ }
+ }
+
+ private:
+ // For when just data is present, but no validity bitmap
+ BitBlockCounter data_counter_;
+
+ // For when both validity bitmap and data are present
+ BinaryBitBlockCounter data_and_validity_counter_;
+ const bool has_validity_;
+};
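+
+// Illustrative example: for filter validity [1, 0, 1, 1] and filter data
+// [1, 1, 0, 1], NextBlock() reports a popcount of 2 (positions 0 and 3):
+// null filter slots count as dropped.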
+
+/// \brief The Filter implementation for primitive (fixed-width) types does not
+/// use the logical Arrow type but rather the physical C type. This way we only
+/// generate one filter function for each byte width. We use the same
+/// implementation here for boolean and fixed-byte-size inputs with some
+/// template specialization.
+template <typename ArrowType>
+class PrimitiveFilterImpl {
+ public:
+ using T = typename std::conditional<std::is_same<ArrowType, BooleanType>::value,
+ uint8_t, typename ArrowType::c_type>::type;
+
+ PrimitiveFilterImpl(const PrimitiveArg& values, const PrimitiveArg& filter,
+ FilterOptions::NullSelectionBehavior null_selection,
+ ArrayData* out_arr)
+ : values_is_valid_(values.is_valid),
+ values_data_(reinterpret_cast<const T*>(values.data)),
+ values_null_count_(values.null_count),
+ values_offset_(values.offset),
+ values_length_(values.length),
+ filter_is_valid_(filter.is_valid),
+ filter_data_(filter.data),
+ filter_null_count_(filter.null_count),
+ filter_offset_(filter.offset),
+ null_selection_(null_selection) {
+ if (out_arr->buffers[0] != nullptr) {
+ // May not be allocated if neither filter nor values contains nulls
+ out_is_valid_ = out_arr->buffers[0]->mutable_data();
+ }
+ out_data_ = reinterpret_cast<T*>(out_arr->buffers[1]->mutable_data());
+ out_offset_ = out_arr->offset;
+ out_length_ = out_arr->length;
+ out_position_ = 0;
+ }
+
+ void ExecNonNull() {
+ // Fast filter when values and filter are not null
+ ::arrow::internal::VisitSetBitRunsVoid(
+ filter_data_, filter_offset_, values_length_,
+ [&](int64_t position, int64_t length) { WriteValueSegment(position, length); });
+ }
+
+ void Exec() {
+ if (filter_null_count_ == 0 && values_null_count_ == 0) {
+ return ExecNonNull();
+ }
+
+ // Bit counters used for both null_selection behaviors
+ DropNullCounter drop_null_counter(filter_is_valid_, filter_data_, filter_offset_,
+ values_length_);
+ OptionalBitBlockCounter data_counter(values_is_valid_, values_offset_,
+ values_length_);
+ OptionalBitBlockCounter filter_valid_counter(filter_is_valid_, filter_offset_,
+ values_length_);
+
+ auto WriteNotNull = [&](int64_t index) {
+ BitUtil::SetBit(out_is_valid_, out_offset_ + out_position_);
+ // Increments out_position_
+ WriteValue(index);
+ };
+
+ auto WriteMaybeNull = [&](int64_t index) {
+ BitUtil::SetBitTo(out_is_valid_, out_offset_ + out_position_,
+ BitUtil::GetBit(values_is_valid_, values_offset_ + index));
+ // Increments out_position_
+ WriteValue(index);
+ };
+
+ int64_t in_position = 0;
+ while (in_position < values_length_) {
+ BitBlockCount filter_block = drop_null_counter.NextBlock();
+ BitBlockCount filter_valid_block = filter_valid_counter.NextWord();
+ BitBlockCount data_block = data_counter.NextWord();
+ if (filter_block.AllSet() && data_block.AllSet()) {
+ // Fastest path: all values in block are included and not null
+ BitUtil::SetBitsTo(out_is_valid_, out_offset_ + out_position_,
+ filter_block.length, true);
+ WriteValueSegment(in_position, filter_block.length);
+ in_position += filter_block.length;
+ } else if (filter_block.AllSet()) {
+ // Faster: all values are selected, but some values are null
+ // Batch copy bits from values validity bitmap to output validity bitmap
+ CopyBitmap(values_is_valid_, values_offset_ + in_position, filter_block.length,
+ out_is_valid_, out_offset_ + out_position_);
+ WriteValueSegment(in_position, filter_block.length);
+ in_position += filter_block.length;
+ } else if (filter_block.NoneSet() && null_selection_ == FilterOptions::DROP) {
+ // For this exceedingly common case in low-selectivity filters we can
+ // skip further analysis of the data and move on to the next block.
+ in_position += filter_block.length;
+ } else {
+ // Some filter values are false or null
+ if (data_block.AllSet()) {
+ // No values are null
+ if (filter_valid_block.AllSet()) {
+            // Filter is non-null but some filter values are false
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ if (BitUtil::GetBit(filter_data_, filter_offset_ + in_position)) {
+ WriteNotNull(in_position);
+ }
+ ++in_position;
+ }
+ } else if (null_selection_ == FilterOptions::DROP) {
+ // If any values are selected, they ARE NOT null
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ if (BitUtil::GetBit(filter_is_valid_, filter_offset_ + in_position) &&
+ BitUtil::GetBit(filter_data_, filter_offset_ + in_position)) {
+ WriteNotNull(in_position);
+ }
+ ++in_position;
+ }
+ } else { // null_selection == FilterOptions::EMIT_NULL
+ // Data values in this block are not null
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ const bool is_valid =
+ BitUtil::GetBit(filter_is_valid_, filter_offset_ + in_position);
+ if (is_valid &&
+ BitUtil::GetBit(filter_data_, filter_offset_ + in_position)) {
+ // Filter slot is non-null and set
+ WriteNotNull(in_position);
+ } else if (!is_valid) {
+ // Filter slot is null, so we have a null in the output
+ BitUtil::ClearBit(out_is_valid_, out_offset_ + out_position_);
+ WriteNull();
+ }
+ ++in_position;
+ }
+ }
+ } else { // !data_block.AllSet()
+ // Some values are null
+ if (filter_valid_block.AllSet()) {
+            // Filter is non-null but some filter values are false
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ if (BitUtil::GetBit(filter_data_, filter_offset_ + in_position)) {
+ WriteMaybeNull(in_position);
+ }
+ ++in_position;
+ }
+ } else if (null_selection_ == FilterOptions::DROP) {
+ // If any values are selected, they ARE NOT null
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ if (BitUtil::GetBit(filter_is_valid_, filter_offset_ + in_position) &&
+ BitUtil::GetBit(filter_data_, filter_offset_ + in_position)) {
+ WriteMaybeNull(in_position);
+ }
+ ++in_position;
+ }
+ } else { // null_selection == FilterOptions::EMIT_NULL
+            // Data values in this block may be null, so check each one
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ const bool is_valid =
+ BitUtil::GetBit(filter_is_valid_, filter_offset_ + in_position);
+ if (is_valid &&
+ BitUtil::GetBit(filter_data_, filter_offset_ + in_position)) {
+ // Filter slot is non-null and set
+ WriteMaybeNull(in_position);
+ } else if (!is_valid) {
+ // Filter slot is null, so we have a null in the output
+ BitUtil::ClearBit(out_is_valid_, out_offset_ + out_position_);
+ WriteNull();
+ }
+ ++in_position;
+ }
+ }
+ }
+ } // !filter_block.AllSet()
+ } // while(in_position < values_length_)
+ }
+
+ // Write the next out_position given the selected in_position for the input
+ // data and advance out_position
+ void WriteValue(int64_t in_position) {
+ out_data_[out_position_++] = values_data_[in_position];
+ }
+
+ void WriteValueSegment(int64_t in_start, int64_t length) {
+ std::memcpy(out_data_ + out_position_, values_data_ + in_start, length * sizeof(T));
+ out_position_ += length;
+ }
+
+ void WriteNull() {
+ // Zero the memory
+ out_data_[out_position_++] = T{};
+ }
+
+ private:
+ const uint8_t* values_is_valid_;
+ const T* values_data_;
+ int64_t values_null_count_;
+ int64_t values_offset_;
+ int64_t values_length_;
+ const uint8_t* filter_is_valid_;
+ const uint8_t* filter_data_;
+ int64_t filter_null_count_;
+ int64_t filter_offset_;
+ FilterOptions::NullSelectionBehavior null_selection_;
+ uint8_t* out_is_valid_;
+ T* out_data_;
+ int64_t out_offset_;
+ int64_t out_length_;
+ int64_t out_position_;
+};
+
+template <>
+inline void PrimitiveFilterImpl<BooleanType>::WriteValue(int64_t in_position) {
+ BitUtil::SetBitTo(out_data_, out_offset_ + out_position_++,
+ BitUtil::GetBit(values_data_, values_offset_ + in_position));
+}
+
+template <>
+inline void PrimitiveFilterImpl<BooleanType>::WriteValueSegment(int64_t in_start,
+ int64_t length) {
+ CopyBitmap(values_data_, values_offset_ + in_start, length, out_data_,
+ out_offset_ + out_position_);
+ out_position_ += length;
+}
+
+template <>
+inline void PrimitiveFilterImpl<BooleanType>::WriteNull() {
+ // Zero the bit
+ BitUtil::ClearBit(out_data_, out_offset_ + out_position_++);
+}
+
+Status PrimitiveFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ PrimitiveArg values = GetPrimitiveArg(*batch[0].array());
+ PrimitiveArg filter = GetPrimitiveArg(*batch[1].array());
+ FilterOptions::NullSelectionBehavior null_selection =
+ FilterState::Get(ctx).null_selection_behavior;
+
+ int64_t output_length = GetFilterOutputSize(*batch[1].array(), null_selection);
+
+ ArrayData* out_arr = out->mutable_array();
+
+  // The output's precomputed null count is unknown except in the narrow
+  // case that the values contain no nulls and the filter cannot introduce
+  // any new ones.
+ if (values.null_count == 0 &&
+ (null_selection == FilterOptions::DROP || filter.null_count == 0)) {
+ out_arr->null_count = 0;
+ } else {
+ out_arr->null_count = kUnknownNullCount;
+ }
+
+ // When neither the values nor filter is known to have any nulls, we will
+ // elect the optimized ExecNonNull path where there is no need to populate a
+ // validity bitmap.
+ bool allocate_validity = values.null_count != 0 || filter.null_count != 0;
+
+ RETURN_NOT_OK(
+ PreallocateData(ctx, output_length, values.bit_width, allocate_validity, out_arr));
+
+ switch (values.bit_width) {
+ case 1:
+ PrimitiveFilterImpl<BooleanType>(values, filter, null_selection, out_arr).Exec();
+ break;
+ case 8:
+ PrimitiveFilterImpl<UInt8Type>(values, filter, null_selection, out_arr).Exec();
+ break;
+ case 16:
+ PrimitiveFilterImpl<UInt16Type>(values, filter, null_selection, out_arr).Exec();
+ break;
+ case 32:
+ PrimitiveFilterImpl<UInt32Type>(values, filter, null_selection, out_arr).Exec();
+ break;
+ case 64:
+ PrimitiveFilterImpl<UInt64Type>(values, filter, null_selection, out_arr).Exec();
+ break;
+ default:
+ DCHECK(false) << "Invalid values bit width";
+ break;
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Optimized filter for base binary types (32-bit and 64-bit)
+
+#define BINARY_FILTER_SETUP_COMMON() \
+ auto raw_offsets = \
+ reinterpret_cast<const offset_type*>(values.buffers[1]->data()) + values.offset; \
+ const uint8_t* raw_data = values.buffers[2]->data(); \
+ \
+ TypedBufferBuilder<offset_type> offset_builder(ctx->memory_pool()); \
+ TypedBufferBuilder<uint8_t> data_builder(ctx->memory_pool()); \
+ RETURN_NOT_OK(offset_builder.Reserve(output_length + 1)); \
+ \
+ /* Presize the data builder with a rough estimate */ \
+ if (values.length > 0) { \
+ const double mean_value_length = (raw_offsets[values.length] - raw_offsets[0]) / \
+ static_cast<double>(values.length); \
+ RETURN_NOT_OK( \
+ data_builder.Reserve(static_cast<int64_t>(mean_value_length * output_length))); \
+ } \
+ int64_t space_available = data_builder.capacity(); \
+ offset_type offset = 0;
+
+#define APPEND_RAW_DATA(DATA, NBYTES) \
+ if (ARROW_PREDICT_FALSE(NBYTES > space_available)) { \
+ RETURN_NOT_OK(data_builder.Reserve(NBYTES)); \
+ space_available = data_builder.capacity() - data_builder.length(); \
+ } \
+ data_builder.UnsafeAppend(DATA, NBYTES); \
+ space_available -= NBYTES
+
+#define APPEND_SINGLE_VALUE() \
+ do { \
+ offset_type val_size = raw_offsets[in_position + 1] - raw_offsets[in_position]; \
+ APPEND_RAW_DATA(raw_data + raw_offsets[in_position], val_size); \
+ offset += val_size; \
+ } while (0)
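+
+// Note: these macros assume the names introduced by BINARY_FILTER_SETUP_COMMON
+// (raw_offsets, raw_data, data_builder, space_available, offset) and, for
+// APPEND_SINGLE_VALUE, a loop variable named in_position are in scope at the
+// expansion site.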
+
+// Optimized binary filter for the case where neither values nor filter have
+// nulls
+template <typename Type>
+Status BinaryFilterNonNullImpl(KernelContext* ctx, const ArrayData& values,
+ const ArrayData& filter, int64_t output_length,
+ FilterOptions::NullSelectionBehavior null_selection,
+ ArrayData* out) {
+ using offset_type = typename Type::offset_type;
+ const auto filter_data = filter.buffers[1]->data();
+
+ BINARY_FILTER_SETUP_COMMON();
+
+ RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
+ filter_data, filter.offset, filter.length, [&](int64_t position, int64_t length) {
+ // Bulk-append raw data
+ const offset_type run_data_bytes =
+ (raw_offsets[position + length] - raw_offsets[position]);
+ APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
+ // Append offsets
+ offset_type cur_offset = raw_offsets[position];
+ for (int64_t i = 0; i < length; ++i) {
+ offset_builder.UnsafeAppend(offset);
+ offset += raw_offsets[i + position + 1] - cur_offset;
+ cur_offset = raw_offsets[i + position + 1];
+ }
+ return Status::OK();
+ }));
+
+ offset_builder.UnsafeAppend(offset);
+ out->length = output_length;
+ RETURN_NOT_OK(offset_builder.Finish(&out->buffers[1]));
+ return data_builder.Finish(&out->buffers[2]);
+}
+
+template <typename Type>
+Status BinaryFilterImpl(KernelContext* ctx, const ArrayData& values,
+ const ArrayData& filter, int64_t output_length,
+ FilterOptions::NullSelectionBehavior null_selection,
+ ArrayData* out) {
+ using offset_type = typename Type::offset_type;
+
+ const auto filter_data = filter.buffers[1]->data();
+ const uint8_t* filter_is_valid = GetValidityBitmap(filter);
+ const int64_t filter_offset = filter.offset;
+
+ const uint8_t* values_is_valid = GetValidityBitmap(values);
+ const int64_t values_offset = values.offset;
+
+ uint8_t* out_is_valid = out->buffers[0]->mutable_data();
+ // Zero bits and then only have to set valid values to true
+ BitUtil::SetBitsTo(out_is_valid, 0, output_length, false);
+
+ // We use 3 block counters for fast scanning of the filter
+ //
+ // * values_valid_counter: for values null/not-null
+ // * filter_valid_counter: for filter null/not-null
+ // * filter_counter: for filter true/false
+ OptionalBitBlockCounter values_valid_counter(values_is_valid, values_offset,
+ values.length);
+ OptionalBitBlockCounter filter_valid_counter(filter_is_valid, filter_offset,
+ filter.length);
+ BitBlockCounter filter_counter(filter_data, filter_offset, filter.length);
+
+ BINARY_FILTER_SETUP_COMMON();
+
+ int64_t in_position = 0;
+ int64_t out_position = 0;
+ while (in_position < filter.length) {
+ BitBlockCount filter_valid_block = filter_valid_counter.NextWord();
+ BitBlockCount values_valid_block = values_valid_counter.NextWord();
+ BitBlockCount filter_block = filter_counter.NextWord();
+ if (filter_block.NoneSet() && null_selection == FilterOptions::DROP) {
+ // For this exceedingly common case in low-selectivity filters we can
+ // skip further analysis of the data and move on to the next block.
+ in_position += filter_block.length;
+ } else if (filter_valid_block.AllSet()) {
+ // Simpler path: no filter values are null
+ if (filter_block.AllSet()) {
+ // Fastest path: filter values are all true and not null
+ if (values_valid_block.AllSet()) {
+ // The values aren't null either
+ BitUtil::SetBitsTo(out_is_valid, out_position, filter_block.length, true);
+
+ // Bulk-append raw data
+ offset_type block_data_bytes =
+ (raw_offsets[in_position + filter_block.length] - raw_offsets[in_position]);
+ APPEND_RAW_DATA(raw_data + raw_offsets[in_position], block_data_bytes);
+ // Append offsets
+ for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
+ offset_builder.UnsafeAppend(offset);
+ offset += raw_offsets[in_position + 1] - raw_offsets[in_position];
+ }
+ out_position += filter_block.length;
+ } else {
+ // Some of the values in this block are null
+ for (int64_t i = 0; i < filter_block.length;
+ ++i, ++in_position, ++out_position) {
+ offset_builder.UnsafeAppend(offset);
+ if (BitUtil::GetBit(values_is_valid, values_offset + in_position)) {
+ BitUtil::SetBit(out_is_valid, out_position);
+ APPEND_SINGLE_VALUE();
+ }
+ }
+ }
+ } else { // !filter_block.AllSet()
+        // Some of the filter values are false, but none are null
+ if (values_valid_block.AllSet()) {
+ // All the values are not-null, so we can skip null checking for
+ // them
+ for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
+ if (BitUtil::GetBit(filter_data, filter_offset + in_position)) {
+ offset_builder.UnsafeAppend(offset);
+ BitUtil::SetBit(out_is_valid, out_position++);
+ APPEND_SINGLE_VALUE();
+ }
+ }
+ } else {
+ // Some of the values in the block are null, so we have to check
+ // each one
+ for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
+ if (BitUtil::GetBit(filter_data, filter_offset + in_position)) {
+ offset_builder.UnsafeAppend(offset);
+ if (BitUtil::GetBit(values_is_valid, values_offset + in_position)) {
+ BitUtil::SetBit(out_is_valid, out_position);
+ APPEND_SINGLE_VALUE();
+ }
+ ++out_position;
+ }
+ }
+ }
+ }
+ } else { // !filter_valid_block.AllSet()
+ // Some of the filter values are null, so we have to handle the DROP
+ // versus EMIT_NULL null selection behavior.
+ if (null_selection == FilterOptions::DROP) {
+ // Filter null values are treated as false.
+ if (values_valid_block.AllSet()) {
+ for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
+ if (BitUtil::GetBit(filter_is_valid, filter_offset + in_position) &&
+ BitUtil::GetBit(filter_data, filter_offset + in_position)) {
+ offset_builder.UnsafeAppend(offset);
+ BitUtil::SetBit(out_is_valid, out_position++);
+ APPEND_SINGLE_VALUE();
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
+ if (BitUtil::GetBit(filter_is_valid, filter_offset + in_position) &&
+ BitUtil::GetBit(filter_data, filter_offset + in_position)) {
+ offset_builder.UnsafeAppend(offset);
+ if (BitUtil::GetBit(values_is_valid, values_offset + in_position)) {
+ BitUtil::SetBit(out_is_valid, out_position);
+ APPEND_SINGLE_VALUE();
+ }
+ ++out_position;
+ }
+ }
+ }
+ } else {
+ // EMIT_NULL
+
+ // Filter null values are appended to output as null whether the
+ // value in the corresponding slot is valid or not
+ if (values_valid_block.AllSet()) {
+ for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
+ const bool filter_not_null =
+ BitUtil::GetBit(filter_is_valid, filter_offset + in_position);
+ if (filter_not_null &&
+ BitUtil::GetBit(filter_data, filter_offset + in_position)) {
+ offset_builder.UnsafeAppend(offset);
+ BitUtil::SetBit(out_is_valid, out_position++);
+ APPEND_SINGLE_VALUE();
+ } else if (!filter_not_null) {
+ offset_builder.UnsafeAppend(offset);
+ ++out_position;
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
+ const bool filter_not_null =
+ BitUtil::GetBit(filter_is_valid, filter_offset + in_position);
+ if (filter_not_null &&
+ BitUtil::GetBit(filter_data, filter_offset + in_position)) {
+ offset_builder.UnsafeAppend(offset);
+ if (BitUtil::GetBit(values_is_valid, values_offset + in_position)) {
+ BitUtil::SetBit(out_is_valid, out_position);
+ APPEND_SINGLE_VALUE();
+ }
+ ++out_position;
+ } else if (!filter_not_null) {
+ offset_builder.UnsafeAppend(offset);
+ ++out_position;
+ }
+ }
+ }
+ }
+ }
+ }
+ offset_builder.UnsafeAppend(offset);
+ out->length = output_length;
+ RETURN_NOT_OK(offset_builder.Finish(&out->buffers[1]));
+ return data_builder.Finish(&out->buffers[2]);
+}
+
+#undef BINARY_FILTER_SETUP_COMMON
+#undef APPEND_RAW_DATA
+#undef APPEND_SINGLE_VALUE
+
+Status BinaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ FilterOptions::NullSelectionBehavior null_selection =
+ FilterState::Get(ctx).null_selection_behavior;
+
+ const ArrayData& values = *batch[0].array();
+ const ArrayData& filter = *batch[1].array();
+ int64_t output_length = GetFilterOutputSize(filter, null_selection);
+ ArrayData* out_arr = out->mutable_array();
+
+  // The output's precomputed null count is unknown except in the narrow
+  // case that the values contain no nulls and the filter cannot introduce
+  // any new ones.
+ if (values.null_count == 0 &&
+ (null_selection == FilterOptions::DROP || filter.null_count == 0)) {
+ out_arr->null_count = 0;
+ } else {
+ out_arr->null_count = kUnknownNullCount;
+ }
+ Type::type type_id = values.type->id();
+ if (values.null_count == 0 && filter.null_count == 0) {
+ // Faster no-nulls case
+ if (is_binary_like(type_id)) {
+ RETURN_NOT_OK(BinaryFilterNonNullImpl<BinaryType>(
+ ctx, values, filter, output_length, null_selection, out_arr));
+ } else if (is_large_binary_like(type_id)) {
+ RETURN_NOT_OK(BinaryFilterNonNullImpl<LargeBinaryType>(
+ ctx, values, filter, output_length, null_selection, out_arr));
+ } else {
+ DCHECK(false);
+ }
+ } else {
+ // Output may have nulls
+ RETURN_NOT_OK(ctx->AllocateBitmap(output_length).Value(&out_arr->buffers[0]));
+ if (is_binary_like(type_id)) {
+ RETURN_NOT_OK(BinaryFilterImpl<BinaryType>(ctx, values, filter, output_length,
+ null_selection, out_arr));
+ } else if (is_large_binary_like(type_id)) {
+ RETURN_NOT_OK(BinaryFilterImpl<LargeBinaryType>(ctx, values, filter, output_length,
+ null_selection, out_arr));
+ } else {
+ DCHECK(false);
+ }
+ }
+
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Null take and filter
+
+Status NullTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (TakeState::Get(ctx).boundscheck) {
+ RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
+ }
+ // batch.length doesn't take into account the take indices
+ auto new_length = batch[1].array()->length;
+ out->value = std::make_shared<NullArray>(new_length)->data();
+ return Status::OK();
+}
+
+Status NullFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ int64_t output_length = GetFilterOutputSize(
+ *batch[1].array(), FilterState::Get(ctx).null_selection_behavior);
+ out->value = std::make_shared<NullArray>(output_length)->data();
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Dictionary take and filter
+
+Status DictionaryTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DictionaryArray values(batch[0].array());
+ Datum result;
+ RETURN_NOT_OK(
+ Take(Datum(values.indices()), batch[1], TakeState::Get(ctx), ctx->exec_context())
+ .Value(&result));
+ DictionaryArray taken_values(values.type(), result.make_array(), values.dictionary());
+ out->value = taken_values.data();
+ return Status::OK();
+}
+
+Status DictionaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DictionaryArray dict_values(batch[0].array());
+ Datum result;
+ RETURN_NOT_OK(Filter(Datum(dict_values.indices()), batch[1].array(),
+ FilterState::Get(ctx), ctx->exec_context())
+ .Value(&result));
+ DictionaryArray filtered_values(dict_values.type(), result.make_array(),
+ dict_values.dictionary());
+ out->value = filtered_values.data();
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Extension take and filter
+
+Status ExtensionTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ExtensionArray values(batch[0].array());
+ Datum result;
+ RETURN_NOT_OK(
+ Take(Datum(values.storage()), batch[1], TakeState::Get(ctx), ctx->exec_context())
+ .Value(&result));
+ ExtensionArray taken_values(values.type(), result.make_array());
+ out->value = taken_values.data();
+ return Status::OK();
+}
+
+Status ExtensionFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ExtensionArray ext_values(batch[0].array());
+ Datum result;
+ RETURN_NOT_OK(Filter(Datum(ext_values.storage()), batch[1].array(),
+ FilterState::Get(ctx), ctx->exec_context())
+ .Value(&result));
+ ExtensionArray filtered_values(ext_values.type(), result.make_array());
+ out->value = filtered_values.data();
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Implement take for other data types where there is less performance
+// sensitivity by visiting the selected indices.
+
+// Use CRTP to dispatch to type-specific processing of take indices for each
+// unsigned integer type.
+template <typename Impl, typename Type>
+struct Selection {
+ using ValuesArrayType = typename TypeTraits<Type>::ArrayType;
+
+ // Forwards the generic value visitors to the take index visitor template
+ template <typename IndexCType>
+ struct TakeAdapter {
+ static constexpr bool is_take = true;
+
+ Impl* impl;
+ explicit TakeAdapter(Impl* impl) : impl(impl) {}
+ template <typename ValidVisitor, typename NullVisitor>
+ Status Generate(ValidVisitor&& visit_valid, NullVisitor&& visit_null) {
+ return impl->template VisitTake<IndexCType>(std::forward<ValidVisitor>(visit_valid),
+ std::forward<NullVisitor>(visit_null));
+ }
+ };
+
+ // Forwards the generic value visitors to the VisitFilter template
+ struct FilterAdapter {
+ static constexpr bool is_take = false;
+
+ Impl* impl;
+ explicit FilterAdapter(Impl* impl) : impl(impl) {}
+ template <typename ValidVisitor, typename NullVisitor>
+ Status Generate(ValidVisitor&& visit_valid, NullVisitor&& visit_null) {
+ return impl->VisitFilter(std::forward<ValidVisitor>(visit_valid),
+ std::forward<NullVisitor>(visit_null));
+ }
+ };
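+
+  // Both adapters share one contract: Generate(visit_valid, visit_null) calls
+  // visit_valid with the input index to copy (a take index or a filter
+  // position) for each non-null output slot, and visit_null() for each null
+  // output slot, so a single GenerateOutput implementation per Impl serves
+  // both take and filter.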
+
+ KernelContext* ctx;
+ std::shared_ptr<ArrayData> values;
+ std::shared_ptr<ArrayData> selection;
+ int64_t output_length;
+ ArrayData* out;
+ TypedBufferBuilder<bool> validity_builder;
+
+ Selection(KernelContext* ctx, const ExecBatch& batch, int64_t output_length, Datum* out)
+ : ctx(ctx),
+ values(batch[0].array()),
+ selection(batch[1].array()),
+ output_length(output_length),
+ out(out->mutable_array()),
+ validity_builder(ctx->memory_pool()) {}
+
+ virtual ~Selection() = default;
+
+ Status FinishCommon() {
+ out->buffers.resize(values->buffers.size());
+ out->length = validity_builder.length();
+ out->null_count = validity_builder.false_count();
+ return validity_builder.Finish(&out->buffers[0]);
+ }
+
+ template <typename IndexCType, typename ValidVisitor, typename NullVisitor>
+ Status VisitTake(ValidVisitor&& visit_valid, NullVisitor&& visit_null) {
+ const auto indices_values = selection->GetValues<IndexCType>(1);
+ const uint8_t* is_valid = GetValidityBitmap(*selection);
+ OptionalBitIndexer indices_is_valid(selection->buffers[0], selection->offset);
+ OptionalBitIndexer values_is_valid(values->buffers[0], values->offset);
+
+ const bool values_have_nulls = values->MayHaveNulls();
+ OptionalBitBlockCounter bit_counter(is_valid, selection->offset, selection->length);
+ int64_t position = 0;
+ while (position < selection->length) {
+ BitBlockCount block = bit_counter.NextBlock();
+ const bool indices_have_nulls = block.popcount < block.length;
+ if (!indices_have_nulls && !values_have_nulls) {
+ // Fastest path, neither indices nor values have nulls
+ validity_builder.UnsafeAppend(block.length, true);
+ for (int64_t i = 0; i < block.length; ++i) {
+ RETURN_NOT_OK(visit_valid(indices_values[position++]));
+ }
+ } else if (block.popcount > 0) {
+ // Since we have to branch on whether the indices are null or not, we
+ // combine the "non-null indices block but some values null" and
+ // "some-null indices block but values non-null" into a single loop.
+ for (int64_t i = 0; i < block.length; ++i) {
+ if ((!indices_have_nulls || indices_is_valid[position]) &&
+ values_is_valid[indices_values[position]]) {
+ validity_builder.UnsafeAppend(true);
+ RETURN_NOT_OK(visit_valid(indices_values[position]));
+ } else {
+ validity_builder.UnsafeAppend(false);
+ RETURN_NOT_OK(visit_null());
+ }
+ ++position;
+ }
+ } else {
+ // The whole block is null
+ validity_builder.UnsafeAppend(block.length, false);
+ for (int64_t i = 0; i < block.length; ++i) {
+ RETURN_NOT_OK(visit_null());
+ }
+ position += block.length;
+ }
+ }
+ return Status::OK();
+ }
+
+ // We use the NullVisitor both for "selected" nulls as well as "emitted"
+ // nulls coming from the filter when using FilterOptions::EMIT_NULL
+ template <typename ValidVisitor, typename NullVisitor>
+ Status VisitFilter(ValidVisitor&& visit_valid, NullVisitor&& visit_null) {
+ auto null_selection = FilterState::Get(ctx).null_selection_behavior;
+
+ const auto filter_data = selection->buffers[1]->data();
+
+ const uint8_t* filter_is_valid = GetValidityBitmap(*selection);
+ const int64_t filter_offset = selection->offset;
+ OptionalBitIndexer values_is_valid(values->buffers[0], values->offset);
+
+ // We use 3 block counters for fast scanning of the filter
+ //
+ // * values_valid_counter: for values null/not-null
+ // * filter_valid_counter: for filter null/not-null
+ // * filter_counter: for filter true/false
+ OptionalBitBlockCounter values_valid_counter(GetValidityBitmap(*values),
+ values->offset, values->length);
+ OptionalBitBlockCounter filter_valid_counter(filter_is_valid, filter_offset,
+ selection->length);
+ BitBlockCounter filter_counter(filter_data, filter_offset, selection->length);
+ int64_t in_position = 0;
+
+ auto AppendNotNull = [&](int64_t index) -> Status {
+ validity_builder.UnsafeAppend(true);
+ return visit_valid(index);
+ };
+
+ auto AppendNull = [&]() -> Status {
+ validity_builder.UnsafeAppend(false);
+ return visit_null();
+ };
+
+ auto AppendMaybeNull = [&](int64_t index) -> Status {
+ if (values_is_valid[index]) {
+ return AppendNotNull(index);
+ } else {
+ return AppendNull();
+ }
+ };
+
+ while (in_position < selection->length) {
+ BitBlockCount filter_valid_block = filter_valid_counter.NextWord();
+ BitBlockCount values_valid_block = values_valid_counter.NextWord();
+ BitBlockCount filter_block = filter_counter.NextWord();
+ if (filter_block.NoneSet() && null_selection == FilterOptions::DROP) {
+ // For this exceedingly common case in low-selectivity filters we can
+ // skip further analysis of the data and move on to the next block.
+ in_position += filter_block.length;
+ } else if (filter_valid_block.AllSet()) {
+ // Simpler path: no filter values are null
+ if (filter_block.AllSet()) {
+ // Fastest path: filter values are all true and not null
+ if (values_valid_block.AllSet()) {
+ // The values aren't null either
+ validity_builder.UnsafeAppend(filter_block.length, true);
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ RETURN_NOT_OK(visit_valid(in_position++));
+ }
+ } else {
+ // Some of the values in this block are null
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ RETURN_NOT_OK(AppendMaybeNull(in_position++));
+ }
+ }
+ } else { // !filter_block.AllSet()
+          // Some of the filter values are false, but none are null
+ if (values_valid_block.AllSet()) {
+ // All the values are not-null, so we can skip null checking for
+ // them
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ if (BitUtil::GetBit(filter_data, filter_offset + in_position)) {
+ RETURN_NOT_OK(AppendNotNull(in_position));
+ }
+ ++in_position;
+ }
+ } else {
+ // Some of the values in the block are null, so we have to check
+ // each one
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ if (BitUtil::GetBit(filter_data, filter_offset + in_position)) {
+ RETURN_NOT_OK(AppendMaybeNull(in_position));
+ }
+ ++in_position;
+ }
+ }
+ }
+ } else { // !filter_valid_block.AllSet()
+ // Some of the filter values are null, so we have to handle the DROP
+ // versus EMIT_NULL null selection behavior.
+ if (null_selection == FilterOptions::DROP) {
+ // Filter null values are treated as false.
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ if (BitUtil::GetBit(filter_is_valid, filter_offset + in_position) &&
+ BitUtil::GetBit(filter_data, filter_offset + in_position)) {
+ RETURN_NOT_OK(AppendMaybeNull(in_position));
+ }
+ ++in_position;
+ }
+ } else {
+ // Filter null values are appended to output as null whether the
+ // value in the corresponding slot is valid or not
+ for (int64_t i = 0; i < filter_block.length; ++i) {
+ const bool filter_not_null =
+ BitUtil::GetBit(filter_is_valid, filter_offset + in_position);
+ if (filter_not_null &&
+ BitUtil::GetBit(filter_data, filter_offset + in_position)) {
+ RETURN_NOT_OK(AppendMaybeNull(in_position));
+ } else if (!filter_not_null) {
+ // EMIT_NULL case
+ RETURN_NOT_OK(AppendNull());
+ }
+ ++in_position;
+ }
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ virtual Status Init() { return Status::OK(); }
+
+ // Implementation specific finish logic
+ virtual Status Finish() = 0;
+
+ Status ExecTake() {
+ RETURN_NOT_OK(this->validity_builder.Reserve(output_length));
+ RETURN_NOT_OK(Init());
+ int index_width = GetByteWidth(*this->selection->type);
+
+    // CRTP dispatch here
+ switch (index_width) {
+ case 1: {
+ Status s =
+ static_cast<Impl*>(this)->template GenerateOutput<TakeAdapter<uint8_t>>();
+ RETURN_NOT_OK(s);
+ } break;
+ case 2: {
+ Status s =
+ static_cast<Impl*>(this)->template GenerateOutput<TakeAdapter<uint16_t>>();
+ RETURN_NOT_OK(s);
+ } break;
+ case 4: {
+ Status s =
+ static_cast<Impl*>(this)->template GenerateOutput<TakeAdapter<uint32_t>>();
+ RETURN_NOT_OK(s);
+ } break;
+ case 8: {
+ Status s =
+ static_cast<Impl*>(this)->template GenerateOutput<TakeAdapter<uint64_t>>();
+ RETURN_NOT_OK(s);
+ } break;
+ default:
+ DCHECK(false) << "Invalid index width";
+ break;
+ }
+ RETURN_NOT_OK(this->FinishCommon());
+ return Finish();
+ }
+
+ Status ExecFilter() {
+ RETURN_NOT_OK(this->validity_builder.Reserve(output_length));
+ RETURN_NOT_OK(Init());
+ // CRTP dispatch
+ Status s = static_cast<Impl*>(this)->template GenerateOutput<FilterAdapter>();
+ RETURN_NOT_OK(s);
+ RETURN_NOT_OK(this->FinishCommon());
+ return Finish();
+ }
+};
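+
+// Each concrete Impl is expected to provide a
+// `template <typename Adapter> Status GenerateOutput()` method; ExecTake and
+// ExecFilter dispatch to it via CRTP with the matching adapter, then run
+// FinishCommon() followed by the Impl's virtual Finish().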
+
+#define LIFT_BASE_MEMBERS() \
+ using ValuesArrayType = typename Base::ValuesArrayType; \
+ using Base::ctx; \
+ using Base::values; \
+ using Base::selection; \
+ using Base::output_length; \
+ using Base::out; \
+ using Base::validity_builder
+
+static inline Status VisitNoop() { return Status::OK(); }
+
+// A selection implementation for 32-bit and 64-bit variable binary
+// types. Common generated kernels are shared between Binary/String and
+// LargeBinary/LargeString
+template <typename Type>
+struct VarBinaryImpl : public Selection<VarBinaryImpl<Type>, Type> {
+ using offset_type = typename Type::offset_type;
+
+ using Base = Selection<VarBinaryImpl<Type>, Type>;
+ LIFT_BASE_MEMBERS();
+
+ std::shared_ptr<ArrayData> values_as_binary;
+ TypedBufferBuilder<offset_type> offset_builder;
+ TypedBufferBuilder<uint8_t> data_builder;
+
+ static constexpr int64_t kOffsetLimit = std::numeric_limits<offset_type>::max() - 1;
+
+ VarBinaryImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length,
+ Datum* out)
+ : Base(ctx, batch, output_length, out),
+ offset_builder(ctx->memory_pool()),
+ data_builder(ctx->memory_pool()) {}
+
+ template <typename Adapter>
+ Status GenerateOutput() {
+ ValuesArrayType typed_values(this->values_as_binary);
+
+ // Presize the data builder with a rough estimate of the required data size
+ if (values->length > 0) {
+ const double mean_value_length =
+ (typed_values.total_values_length() / static_cast<double>(values->length));
+
+ // TODO: See if possible to reduce output_length for take/filter cases
+ // where there are nulls in the selection array
+ RETURN_NOT_OK(
+ data_builder.Reserve(static_cast<int64_t>(mean_value_length * output_length)));
+ }
+ int64_t space_available = data_builder.capacity();
+
+ const offset_type* raw_offsets = typed_values.raw_value_offsets();
+ const uint8_t* raw_data = typed_values.raw_data();
+
+ offset_type offset = 0;
+ Adapter adapter(this);
+ RETURN_NOT_OK(adapter.Generate(
+ [&](int64_t index) {
+ offset_builder.UnsafeAppend(offset);
+ offset_type val_offset = raw_offsets[index];
+ offset_type val_size = raw_offsets[index + 1] - val_offset;
+
+ // Use static property to prune this code from the filter path in
+ // optimized builds
+          if (Adapter::is_take &&
+              ARROW_PREDICT_FALSE(static_cast<int64_t>(offset) +
+                                      static_cast<int64_t>(val_size) >
+                                  kOffsetLimit)) {
+ return Status::Invalid("Take operation overflowed binary array capacity");
+ }
+ offset += val_size;
+ if (ARROW_PREDICT_FALSE(val_size > space_available)) {
+ RETURN_NOT_OK(data_builder.Reserve(val_size));
+ space_available = data_builder.capacity() - data_builder.length();
+ }
+ data_builder.UnsafeAppend(raw_data + val_offset, val_size);
+ space_available -= val_size;
+ return Status::OK();
+ },
+ [&]() {
+ offset_builder.UnsafeAppend(offset);
+ return Status::OK();
+ }));
+ offset_builder.UnsafeAppend(offset);
+ return Status::OK();
+ }
+
+ Status Init() override {
+ ARROW_ASSIGN_OR_RAISE(this->values_as_binary,
+ GetArrayView(this->values, TypeTraits<Type>::type_singleton()));
+ return offset_builder.Reserve(output_length + 1);
+ }
+
+ Status Finish() override {
+ RETURN_NOT_OK(offset_builder.Finish(&out->buffers[1]));
+ return data_builder.Finish(&out->buffers[2]);
+ }
+};
+
+struct FSBImpl : public Selection<FSBImpl, FixedSizeBinaryType> {
+ using Base = Selection<FSBImpl, FixedSizeBinaryType>;
+ LIFT_BASE_MEMBERS();
+
+ TypedBufferBuilder<uint8_t> data_builder;
+
+ FSBImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length, Datum* out)
+ : Base(ctx, batch, output_length, out), data_builder(ctx->memory_pool()) {}
+
+ template <typename Adapter>
+ Status GenerateOutput() {
+ FixedSizeBinaryArray typed_values(this->values);
+ int32_t value_size = typed_values.byte_width();
+
+ RETURN_NOT_OK(data_builder.Reserve(value_size * output_length));
+ Adapter adapter(this);
+ return adapter.Generate(
+ [&](int64_t index) {
+ auto val = typed_values.GetView(index);
+ data_builder.UnsafeAppend(reinterpret_cast<const uint8_t*>(val.data()),
+ value_size);
+ return Status::OK();
+ },
+ [&]() {
+ data_builder.UnsafeAppend(value_size, static_cast<uint8_t>(0x00));
+ return Status::OK();
+ });
+ }
+
+ Status Finish() override { return data_builder.Finish(&out->buffers[1]); }
+};
+
+template <typename Type>
+struct ListImpl : public Selection<ListImpl<Type>, Type> {
+ using offset_type = typename Type::offset_type;
+
+ using Base = Selection<ListImpl<Type>, Type>;
+ LIFT_BASE_MEMBERS();
+
+ TypedBufferBuilder<offset_type> offset_builder;
+ typename TypeTraits<Type>::OffsetBuilderType child_index_builder;
+
+ ListImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length, Datum* out)
+ : Base(ctx, batch, output_length, out),
+ offset_builder(ctx->memory_pool()),
+ child_index_builder(ctx->memory_pool()) {}
+
+ template <typename Adapter>
+ Status GenerateOutput() {
+ ValuesArrayType typed_values(this->values);
+
+ // TODO presize child_index_builder with a similar heuristic as VarBinaryImpl
+
+ offset_type offset = 0;
+ Adapter adapter(this);
+ RETURN_NOT_OK(adapter.Generate(
+ [&](int64_t index) {
+ offset_builder.UnsafeAppend(offset);
+ offset_type value_offset = typed_values.value_offset(index);
+ offset_type value_length = typed_values.value_length(index);
+ offset += value_length;
+ RETURN_NOT_OK(child_index_builder.Reserve(value_length));
+ for (offset_type j = value_offset; j < value_offset + value_length; ++j) {
+ child_index_builder.UnsafeAppend(j);
+ }
+ return Status::OK();
+ },
+ [&]() {
+ offset_builder.UnsafeAppend(offset);
+ return Status::OK();
+ }));
+ offset_builder.UnsafeAppend(offset);
+ return Status::OK();
+ }
+
+ Status Init() override {
+ RETURN_NOT_OK(offset_builder.Reserve(output_length + 1));
+ return Status::OK();
+ }
+
+ Status Finish() override {
+ std::shared_ptr<Array> child_indices;
+ RETURN_NOT_OK(child_index_builder.Finish(&child_indices));
+
+ ValuesArrayType typed_values(this->values);
+
+ // No need to boundscheck the child values indices
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> taken_child,
+ Take(*typed_values.values(), *child_indices,
+ TakeOptions::NoBoundsCheck(), ctx->exec_context()));
+ RETURN_NOT_OK(offset_builder.Finish(&out->buffers[1]));
+ out->child_data = {taken_child->data()};
+ return Status::OK();
+ }
+};
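+
+// Worked sketch of ListImpl::GenerateOutput above (hypothetical values):
+// taking indices [2, 0] from the list array [[a, b], [c], [d, e, f]]
+// (child values [a, b, c, d, e, f], offsets [0, 2, 3, 6]) appends child
+// indices [3, 4, 5] and then [0, 1], and emits output offsets [0, 3, 5];
+// the Take on the child array in Finish then materializes
+// [[d, e, f], [a, b]].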
+
+struct DenseUnionImpl : public Selection<DenseUnionImpl, DenseUnionType> {
+ using Base = Selection<DenseUnionImpl, DenseUnionType>;
+ LIFT_BASE_MEMBERS();
+
+ TypedBufferBuilder<int32_t> value_offset_buffer_builder_;
+ TypedBufferBuilder<int8_t> child_id_buffer_builder_;
+ std::vector<int8_t> type_codes_;
+ std::vector<Int32Builder> child_indices_builders_;
+
+ DenseUnionImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length,
+ Datum* out)
+ : Base(ctx, batch, output_length, out),
+ value_offset_buffer_builder_(ctx->memory_pool()),
+ child_id_buffer_builder_(ctx->memory_pool()),
+ type_codes_(checked_cast<const UnionType&>(*this->values->type).type_codes()),
+ child_indices_builders_(type_codes_.size()) {
+ for (auto& child_indices_builder : child_indices_builders_) {
+ child_indices_builder = Int32Builder(ctx->memory_pool());
+ }
+ }
+
+ template <typename Adapter>
+ Status GenerateOutput() {
+ DenseUnionArray typed_values(this->values);
+ Adapter adapter(this);
+ RETURN_NOT_OK(adapter.Generate(
+ [&](int64_t index) {
+ int8_t child_id = typed_values.child_id(index);
+ child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]);
+ int32_t value_offset = typed_values.value_offset(index);
+ value_offset_buffer_builder_.UnsafeAppend(
+ static_cast<int32_t>(child_indices_builders_[child_id].length()));
+ RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1));
+ child_indices_builders_[child_id].UnsafeAppend(value_offset);
+ return Status::OK();
+ },
+ [&]() {
+ int8_t child_id = 0;
+ child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]);
+ value_offset_buffer_builder_.UnsafeAppend(
+ static_cast<int32_t>(child_indices_builders_[child_id].length()));
+ RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1));
+ child_indices_builders_[child_id].UnsafeAppendNull();
+ return Status::OK();
+ }));
+ return Status::OK();
+ }
+
+ Status Init() override {
+ RETURN_NOT_OK(child_id_buffer_builder_.Reserve(output_length));
+ RETURN_NOT_OK(value_offset_buffer_builder_.Reserve(output_length));
+ return Status::OK();
+ }
+
+ Status Finish() override {
+ ARROW_ASSIGN_OR_RAISE(auto child_ids_buffer, child_id_buffer_builder_.Finish());
+ ARROW_ASSIGN_OR_RAISE(auto value_offsets_buffer,
+ value_offset_buffer_builder_.Finish());
+ DenseUnionArray typed_values(this->values);
+ auto num_fields = typed_values.num_fields();
+ auto num_rows = child_ids_buffer->size();
+ BufferVector buffers{nullptr, std::move(child_ids_buffer),
+ std::move(value_offsets_buffer)};
+ *out = ArrayData(typed_values.type(), num_rows, std::move(buffers), 0);
+ for (auto i = 0; i < num_fields; i++) {
+ ARROW_ASSIGN_OR_RAISE(auto child_indices_array,
+ child_indices_builders_[i].Finish());
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> child_array,
+ Take(*typed_values.field(i), *child_indices_array));
+ out->child_data.push_back(child_array->data());
+ }
+ return Status::OK();
+ }
+};
+
+struct FSLImpl : public Selection<FSLImpl, FixedSizeListType> {
+ Int64Builder child_index_builder;
+
+ using Base = Selection<FSLImpl, FixedSizeListType>;
+ LIFT_BASE_MEMBERS();
+
+ FSLImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length, Datum* out)
+ : Base(ctx, batch, output_length, out), child_index_builder(ctx->memory_pool()) {}
+
+ template <typename Adapter>
+ Status GenerateOutput() {
+ ValuesArrayType typed_values(this->values);
+ int32_t list_size = typed_values.list_type()->list_size();
+
+    // We must take list_size elements even for null elements of
+    // indices.
+ RETURN_NOT_OK(child_index_builder.Reserve(output_length * list_size));
+
+ Adapter adapter(this);
+ return adapter.Generate(
+ [&](int64_t index) {
+ int64_t offset = index * list_size;
+ for (int64_t j = offset; j < offset + list_size; ++j) {
+ child_index_builder.UnsafeAppend(j);
+ }
+ return Status::OK();
+ },
+ [&]() { return child_index_builder.AppendNulls(list_size); });
+ }
+
+ Status Finish() override {
+ std::shared_ptr<Array> child_indices;
+ RETURN_NOT_OK(child_index_builder.Finish(&child_indices));
+
+ ValuesArrayType typed_values(this->values);
+
+ // No need to boundscheck the child values indices
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> taken_child,
+ Take(*typed_values.values(), *child_indices,
+ TakeOptions::NoBoundsCheck(), ctx->exec_context()));
+ out->child_data = {taken_child->data()};
+ return Status::OK();
+ }
+};
+
+// ----------------------------------------------------------------------
+// Struct selection implementations
+
+// We need a slightly different approach for StructType. For Take, we can
+// invoke Take on each struct field's data with boundschecking disabled. For
+// Filter on the other hand, if we naively call Filter on each field, then the
+// filter output length will have to be redundantly computed. Thus, for Filter
+// we instead convert the filter to selection indices and then invoke Take
+// (see the worked sketch after StructFilter below).
+
+// Struct selection implementation. ONLY used for Take
+struct StructImpl : public Selection<StructImpl, StructType> {
+ using Base = Selection<StructImpl, StructType>;
+ LIFT_BASE_MEMBERS();
+ using Base::Base;
+
+ template <typename Adapter>
+ Status GenerateOutput() {
+ StructArray typed_values(values);
+ Adapter adapter(this);
+ // There's nothing to do for Struct except to generate the validity bitmap
+ return adapter.Generate([&](int64_t index) { return Status::OK(); },
+ /*visit_null=*/VisitNoop);
+ }
+
+ Status Finish() override {
+ StructArray typed_values(values);
+
+ // Select from children without boundschecking
+ out->child_data.resize(values->type->num_fields());
+ for (int field_index = 0; field_index < values->type->num_fields(); ++field_index) {
+ ARROW_ASSIGN_OR_RAISE(Datum taken_field,
+ Take(Datum(typed_values.field(field_index)), Datum(selection),
+ TakeOptions::NoBoundsCheck(), ctx->exec_context()));
+ out->child_data[field_index] = taken_field.array();
+ }
+ return Status::OK();
+ }
+};
+
+Status StructFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // Transform filter to selection indices and then use Take.
+ std::shared_ptr<ArrayData> indices;
+ RETURN_NOT_OK(GetTakeIndices(*batch[1].array(),
+ FilterState::Get(ctx).null_selection_behavior,
+ ctx->memory_pool())
+ .Value(&indices));
+
+ Datum result;
+ RETURN_NOT_OK(
+ Take(batch[0], Datum(indices), TakeOptions::NoBoundsCheck(), ctx->exec_context())
+ .Value(&result));
+ out->value = result.array();
+ return Status::OK();
+}
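+
+// Worked sketch of the conversion above (hypothetical rows): filtering rows
+// [r0, r1, r2] with the boolean mask [true, null, false] produces selection
+// indices [0] under FilterOptions::DROP and [0, null] under
+// FilterOptions::EMIT_NULL; the subsequent Take then yields [r0] or
+// [r0, null] respectively, without recomputing the output length per field.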
+
+#undef LIFT_BASE_MEMBERS
+
+// ----------------------------------------------------------------------
+// Implement Filter metafunction
+
+Result<std::shared_ptr<RecordBatch>> FilterRecordBatch(const RecordBatch& batch,
+ const Datum& filter,
+ const FunctionOptions* options,
+ ExecContext* ctx) {
+ if (batch.num_rows() != filter.length()) {
+ return Status::Invalid("Filter inputs must all be the same length");
+ }
+
+ // Convert filter to selection vector/indices and use Take
+ const auto& filter_opts = *static_cast<const FilterOptions*>(options);
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<ArrayData> indices,
+ GetTakeIndices(*filter.array(), filter_opts.null_selection_behavior,
+ ctx->memory_pool()));
+ std::vector<std::shared_ptr<Array>> columns(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(Datum out, Take(batch.column(i)->data(), Datum(indices),
+ TakeOptions::NoBoundsCheck(), ctx));
+ columns[i] = out.make_array();
+ }
+ return RecordBatch::Make(batch.schema(), indices->length, std::move(columns));
+}
+
+Result<std::shared_ptr<Table>> FilterTable(const Table& table, const Datum& filter,
+ const FunctionOptions* options,
+ ExecContext* ctx) {
+ if (table.num_rows() != filter.length()) {
+ return Status::Invalid("Filter inputs must all be the same length");
+ }
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+
+ // Last input element will be the filter array
+ const int num_columns = table.num_columns();
+ std::vector<ArrayVector> inputs(num_columns + 1);
+
+ // Fetch table columns
+ for (int i = 0; i < num_columns; ++i) {
+ inputs[i] = table.column(i)->chunks();
+ }
+ // Fetch filter
+ const auto& filter_opts = *static_cast<const FilterOptions*>(options);
+ switch (filter.kind()) {
+ case Datum::ARRAY:
+ inputs.back().push_back(filter.make_array());
+ break;
+ case Datum::CHUNKED_ARRAY:
+ inputs.back() = filter.chunked_array()->chunks();
+ break;
+ default:
+ return Status::NotImplemented("Filter should be array-like");
+ }
+
+ // Rechunk inputs to allow consistent iteration over their respective chunks
+ inputs = arrow::internal::RechunkArraysConsistently(inputs);
+
+ // Instead of filtering each column with the boolean filter
+ // (which would be slow if the table has a large number of columns: ARROW-10569),
+ // convert each filter chunk to indices, and take() the column.
+ const int64_t num_chunks = static_cast<int64_t>(inputs.back().size());
+ std::vector<ArrayVector> out_columns(num_columns);
+ int64_t out_num_rows = 0;
+
+ for (int64_t i = 0; i < num_chunks; ++i) {
+ const ArrayData& filter_chunk = *inputs.back()[i]->data();
+ ARROW_ASSIGN_OR_RAISE(
+ const auto indices,
+ GetTakeIndices(filter_chunk, filter_opts.null_selection_behavior,
+ ctx->memory_pool()));
+
+ if (indices->length > 0) {
+ // Take from all input columns
+ Datum indices_datum{std::move(indices)};
+ for (int col = 0; col < num_columns; ++col) {
+ const auto& column_chunk = inputs[col][i];
+ ARROW_ASSIGN_OR_RAISE(Datum out, Take(column_chunk, indices_datum,
+ TakeOptions::NoBoundsCheck(), ctx));
+ out_columns[col].push_back(std::move(out).make_array());
+ }
+ out_num_rows += indices->length;
+ }
+ }
+
+ ChunkedArrayVector out_chunks(num_columns);
+ for (int i = 0; i < num_columns; ++i) {
+ out_chunks[i] = std::make_shared<ChunkedArray>(std::move(out_columns[i]),
+ table.column(i)->type());
+ }
+ return Table::Make(table.schema(), std::move(out_chunks), out_num_rows);
+}
+
+static auto kDefaultFilterOptions = FilterOptions::Defaults();
+
+const FunctionDoc filter_doc(
+ "Filter with a boolean selection filter",
+ ("The output is populated with values from the input at positions\n"
+ "where the selection filter is non-zero. Nulls in the selection filter\n"
+ "are handled based on FilterOptions."),
+ {"input", "selection_filter"}, "FilterOptions");
+
+class FilterMetaFunction : public MetaFunction {
+ public:
+ FilterMetaFunction()
+ : MetaFunction("filter", Arity::Binary(), &filter_doc, &kDefaultFilterOptions) {}
+
+ Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const override {
+ if (args[1].type()->id() != Type::BOOL) {
+ return Status::NotImplemented("Filter argument must be boolean type");
+ }
+
+ if (args[0].kind() == Datum::RECORD_BATCH) {
+ auto values_batch = args[0].record_batch();
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<RecordBatch> out_batch,
+ FilterRecordBatch(*args[0].record_batch(), args[1], options, ctx));
+ return Datum(out_batch);
+ } else if (args[0].kind() == Datum::TABLE) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Table> out_table,
+ FilterTable(*args[0].table(), args[1], options, ctx));
+ return Datum(out_table);
+ } else {
+ return CallFunction("array_filter", args, options, ctx);
+ }
+ }
+};
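+
+// A minimal caller-side sketch of the "filter" metafunction (illustrative
+// only; FilterBatch and its inputs are hypothetical, the helper is the
+// public arrow::compute::Filter from arrow/compute/api_vector.h):
+//
+//   arrow::Result<std::shared_ptr<arrow::RecordBatch>> FilterBatch(
+//       const std::shared_ptr<arrow::RecordBatch>& batch,
+//       const std::shared_ptr<arrow::Array>& mask /* boolean, same length */) {
+//     ARROW_ASSIGN_OR_RAISE(arrow::Datum out,
+//                           arrow::compute::Filter(batch, mask));
+//     return out.record_batch();
+//   }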
+
+// ----------------------------------------------------------------------
+// Implement Take metafunction
+
+// Shorthand naming of these functions
+// A -> Array
+// C -> ChunkedArray
+// R -> RecordBatch
+// T -> Table
+
+Result<std::shared_ptr<Array>> TakeAA(const Array& values, const Array& indices,
+ const TakeOptions& options, ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result,
+ CallFunction("array_take", {values, indices}, &options, ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> TakeCA(const ChunkedArray& values,
+ const Array& indices,
+ const TakeOptions& options,
+ ExecContext* ctx) {
+ auto num_chunks = values.num_chunks();
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ std::shared_ptr<Array> current_chunk;
+
+ // Case 1: `values` has a single chunk, so just use it
+ if (num_chunks == 1) {
+ current_chunk = values.chunk(0);
+ } else {
+ // TODO Case 2: See if all `indices` fall in the same chunk and call Array Take on it
+ // See
+ // https://github.com/apache/arrow/blob/6f2c9041137001f7a9212f244b51bc004efc29af/r/src/compute.cpp#L123-L151
+ // TODO Case 3: If indices are sorted, can slice them and call Array Take
+
+ // Case 4: Else, concatenate chunks and call Array Take
+ ARROW_ASSIGN_OR_RAISE(current_chunk,
+ Concatenate(values.chunks(), ctx->memory_pool()));
+ }
+ // Call Array Take on our single chunk
+ ARROW_ASSIGN_OR_RAISE(new_chunks[0], TakeAA(*current_chunk, indices, options, ctx));
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<ChunkedArray>> TakeCC(const ChunkedArray& values,
+ const ChunkedArray& indices,
+ const TakeOptions& options,
+ ExecContext* ctx) {
+ auto num_chunks = indices.num_chunks();
+ std::vector<std::shared_ptr<Array>> new_chunks(num_chunks);
+ for (int i = 0; i < num_chunks; i++) {
+ // Take with that indices chunk
+ // Note that as currently implemented, this is inefficient because `values`
+ // will get concatenated on every iteration of this loop
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ChunkedArray> current_chunk,
+ TakeCA(values, *indices.chunk(i), options, ctx));
+ // Concatenate the result to make a single array for this chunk
+ ARROW_ASSIGN_OR_RAISE(new_chunks[i],
+ Concatenate(current_chunk->chunks(), ctx->memory_pool()));
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<ChunkedArray>> TakeAC(const Array& values,
+ const ChunkedArray& indices,
+ const TakeOptions& options,
+ ExecContext* ctx) {
+ auto num_chunks = indices.num_chunks();
+ std::vector<std::shared_ptr<Array>> new_chunks(num_chunks);
+ for (int i = 0; i < num_chunks; i++) {
+ // Take with that indices chunk
+ ARROW_ASSIGN_OR_RAISE(new_chunks[i], TakeAA(values, *indices.chunk(i), options, ctx));
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> TakeRA(const RecordBatch& batch,
+ const Array& indices,
+ const TakeOptions& options,
+ ExecContext* ctx) {
+ auto ncols = batch.num_columns();
+ auto nrows = indices.length();
+ std::vector<std::shared_ptr<Array>> columns(ncols);
+ for (int j = 0; j < ncols; j++) {
+ ARROW_ASSIGN_OR_RAISE(columns[j], TakeAA(*batch.column(j), indices, options, ctx));
+ }
+ return RecordBatch::Make(batch.schema(), nrows, std::move(columns));
+}
+
+Result<std::shared_ptr<Table>> TakeTA(const Table& table, const Array& indices,
+ const TakeOptions& options, ExecContext* ctx) {
+ auto ncols = table.num_columns();
+ std::vector<std::shared_ptr<ChunkedArray>> columns(ncols);
+
+ for (int j = 0; j < ncols; j++) {
+ ARROW_ASSIGN_OR_RAISE(columns[j], TakeCA(*table.column(j), indices, options, ctx));
+ }
+ return Table::Make(table.schema(), std::move(columns));
+}
+
+Result<std::shared_ptr<Table>> TakeTC(const Table& table, const ChunkedArray& indices,
+ const TakeOptions& options, ExecContext* ctx) {
+ auto ncols = table.num_columns();
+ std::vector<std::shared_ptr<ChunkedArray>> columns(ncols);
+ for (int j = 0; j < ncols; j++) {
+ ARROW_ASSIGN_OR_RAISE(columns[j], TakeCC(*table.column(j), indices, options, ctx));
+ }
+ return Table::Make(table.schema(), std::move(columns));
+}
+
+static auto kDefaultTakeOptions = TakeOptions::Defaults();
+
+const FunctionDoc take_doc(
+ "Select values from an input based on indices from another array",
+ ("The output is populated with values from the input at positions\n"
+ "given by `indices`. Nulls in `indices` emit null in the output."),
+ {"input", "indices"}, "TakeOptions");
+
+// Metafunction for dispatching to different Take implementations other than
+// Array-Array.
+//
+// TODO: Revamp the approach to executing Take operations. In addition to
+// the overly complex dispatching, there is no parallelization.
+class TakeMetaFunction : public MetaFunction {
+ public:
+ TakeMetaFunction()
+ : MetaFunction("take", Arity::Binary(), &take_doc, &kDefaultTakeOptions) {}
+
+ Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const override {
+ Datum::Kind index_kind = args[1].kind();
+ const TakeOptions& take_opts = static_cast<const TakeOptions&>(*options);
+ switch (args[0].kind()) {
+ case Datum::ARRAY:
+ if (index_kind == Datum::ARRAY) {
+ return TakeAA(*args[0].make_array(), *args[1].make_array(), take_opts, ctx);
+ } else if (index_kind == Datum::CHUNKED_ARRAY) {
+ return TakeAC(*args[0].make_array(), *args[1].chunked_array(), take_opts, ctx);
+ }
+ break;
+ case Datum::CHUNKED_ARRAY:
+ if (index_kind == Datum::ARRAY) {
+ return TakeCA(*args[0].chunked_array(), *args[1].make_array(), take_opts, ctx);
+ } else if (index_kind == Datum::CHUNKED_ARRAY) {
+ return TakeCC(*args[0].chunked_array(), *args[1].chunked_array(), take_opts,
+ ctx);
+ }
+ break;
+ case Datum::RECORD_BATCH:
+ if (index_kind == Datum::ARRAY) {
+ return TakeRA(*args[0].record_batch(), *args[1].make_array(), take_opts, ctx);
+ }
+ break;
+ case Datum::TABLE:
+ if (index_kind == Datum::ARRAY) {
+ return TakeTA(*args[0].table(), *args[1].make_array(), take_opts, ctx);
+ } else if (index_kind == Datum::CHUNKED_ARRAY) {
+ return TakeTC(*args[0].table(), *args[1].chunked_array(), take_opts, ctx);
+ }
+ break;
+ default:
+ break;
+ }
+ return Status::NotImplemented(
+ "Unsupported types for take operation: "
+ "values=",
+        args[0].ToString(), ", indices=", args[1].ToString());
+ }
+};
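+
+// A minimal caller-side sketch of the "take" metafunction (illustrative
+// only; TakeExample and its inputs are hypothetical, the helper is the
+// public arrow::compute::Take from arrow/compute/api_vector.h):
+//
+//   arrow::Result<std::shared_ptr<arrow::Array>> TakeExample(
+//       const std::shared_ptr<arrow::Array>& values,
+//       const std::shared_ptr<arrow::Array>& indices /* integer-typed */) {
+//     ARROW_ASSIGN_OR_RAISE(arrow::Datum out,
+//                           arrow::compute::Take(values, indices));
+//     return out.make_array();
+//   }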
+
+// ----------------------------------------------------------------------
+
+template <typename Impl>
+Status FilterExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // TODO: where are the values and filter length equality checked?
+ int64_t output_length = GetFilterOutputSize(
+ *batch[1].array(), FilterState::Get(ctx).null_selection_behavior);
+ Impl kernel(ctx, batch, output_length, out);
+ return kernel.ExecFilter();
+}
+
+template <typename Impl>
+Status TakeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (TakeState::Get(ctx).boundscheck) {
+ RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
+ }
+ Impl kernel(ctx, batch, /*output_length=*/batch[1].length(), out);
+ return kernel.ExecTake();
+}
+
+struct SelectionKernelDescr {
+ InputType input;
+ ArrayKernelExec exec;
+};
+
+void RegisterSelectionFunction(const std::string& name, const FunctionDoc* doc,
+ VectorKernel base_kernel, InputType selection_type,
+ const std::vector<SelectionKernelDescr>& descrs,
+ const FunctionOptions* default_options,
+ FunctionRegistry* registry) {
+ auto func =
+ std::make_shared<VectorFunction>(name, Arity::Binary(), doc, default_options);
+ for (auto& descr : descrs) {
+ base_kernel.signature = KernelSignature::Make(
+ {std::move(descr.input), selection_type}, OutputType(FirstType));
+ base_kernel.exec = descr.exec;
+ DCHECK_OK(func->AddKernel(base_kernel));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+const FunctionDoc array_filter_doc(
+ "Filter with a boolean selection filter",
+ ("The output is populated with values from the input `array` at positions\n"
+ "where the selection filter is non-zero. Nulls in the selection filter\n"
+ "are handled based on FilterOptions."),
+ {"array", "selection_filter"}, "FilterOptions");
+
+const FunctionDoc array_take_doc(
+ "Select values from an array based on indices from another array",
+ ("The output is populated with values from the input array at positions\n"
+ "given by `indices`. Nulls in `indices` emit null in the output."),
+ {"array", "indices"}, "TakeOptions");
+
+} // namespace
+
+void RegisterVectorSelection(FunctionRegistry* registry) {
+ // Filter kernels
+ std::vector<SelectionKernelDescr> filter_kernel_descrs = {
+ {InputType(match::Primitive(), ValueDescr::ARRAY), PrimitiveFilter},
+ {InputType(match::BinaryLike(), ValueDescr::ARRAY), BinaryFilter},
+ {InputType(match::LargeBinaryLike(), ValueDescr::ARRAY), BinaryFilter},
+ {InputType::Array(Type::FIXED_SIZE_BINARY), FilterExec<FSBImpl>},
+ {InputType::Array(null()), NullFilter},
+ {InputType::Array(Type::DECIMAL), FilterExec<FSBImpl>},
+ {InputType::Array(Type::DICTIONARY), DictionaryFilter},
+ {InputType::Array(Type::EXTENSION), ExtensionFilter},
+ {InputType::Array(Type::LIST), FilterExec<ListImpl<ListType>>},
+ {InputType::Array(Type::LARGE_LIST), FilterExec<ListImpl<LargeListType>>},
+ {InputType::Array(Type::FIXED_SIZE_LIST), FilterExec<FSLImpl>},
+ {InputType::Array(Type::DENSE_UNION), FilterExec<DenseUnionImpl>},
+ {InputType::Array(Type::STRUCT), StructFilter},
+ // TODO: Reuse ListType kernel for MAP
+ {InputType::Array(Type::MAP), FilterExec<ListImpl<MapType>>},
+ };
+
+ VectorKernel filter_base;
+ filter_base.init = FilterState::Init;
+ RegisterSelectionFunction("array_filter", &array_filter_doc, filter_base,
+ /*selection_type=*/InputType::Array(boolean()),
+ filter_kernel_descrs, &kDefaultFilterOptions, registry);
+
+ DCHECK_OK(registry->AddFunction(std::make_shared<FilterMetaFunction>()));
+
+ // Take kernels
+ std::vector<SelectionKernelDescr> take_kernel_descrs = {
+ {InputType(match::Primitive(), ValueDescr::ARRAY), PrimitiveTake},
+ {InputType(match::BinaryLike(), ValueDescr::ARRAY),
+ TakeExec<VarBinaryImpl<BinaryType>>},
+ {InputType(match::LargeBinaryLike(), ValueDescr::ARRAY),
+ TakeExec<VarBinaryImpl<LargeBinaryType>>},
+ {InputType::Array(Type::FIXED_SIZE_BINARY), TakeExec<FSBImpl>},
+ {InputType::Array(null()), NullTake},
+ {InputType::Array(Type::DECIMAL128), TakeExec<FSBImpl>},
+ {InputType::Array(Type::DECIMAL256), TakeExec<FSBImpl>},
+ {InputType::Array(Type::DICTIONARY), DictionaryTake},
+ {InputType::Array(Type::EXTENSION), ExtensionTake},
+ {InputType::Array(Type::LIST), TakeExec<ListImpl<ListType>>},
+ {InputType::Array(Type::LARGE_LIST), TakeExec<ListImpl<LargeListType>>},
+ {InputType::Array(Type::FIXED_SIZE_LIST), TakeExec<FSLImpl>},
+ {InputType::Array(Type::DENSE_UNION), TakeExec<DenseUnionImpl>},
+ {InputType::Array(Type::STRUCT), TakeExec<StructImpl>},
+ // TODO: Reuse ListType kernel for MAP
+ {InputType::Array(Type::MAP), TakeExec<ListImpl<MapType>>},
+ };
+
+ VectorKernel take_base;
+ take_base.init = TakeState::Init;
+ take_base.can_execute_chunkwise = false;
+ RegisterSelectionFunction(
+ "array_take", &array_take_doc, take_base,
+ /*selection_type=*/InputType(match::Integer(), ValueDescr::ARRAY),
+ take_kernel_descrs, &kDefaultTakeOptions, registry);
+
+ DCHECK_OK(registry->AddFunction(std::make_shared<TakeMetaFunction>()));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc
new file mode 100644
index 00000000000..7fa43e715d8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc
@@ -0,0 +1,1838 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <numeric>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/array/data.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/table.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bitmap.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/optional.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace compute {
+namespace internal {
+
+// Visit all physical types for which sorting is implemented.
+#define VISIT_PHYSICAL_TYPES(VISIT) \
+ VISIT(BooleanType) \
+ VISIT(Int8Type) \
+ VISIT(Int16Type) \
+ VISIT(Int32Type) \
+ VISIT(Int64Type) \
+ VISIT(UInt8Type) \
+ VISIT(UInt16Type) \
+ VISIT(UInt32Type) \
+ VISIT(UInt64Type) \
+ VISIT(FloatType) \
+ VISIT(DoubleType) \
+ VISIT(BinaryType) \
+ VISIT(LargeBinaryType) \
+ VISIT(FixedSizeBinaryType) \
+ VISIT(Decimal128Type) \
+ VISIT(Decimal256Type)
+
+namespace {
+
+// The target chunk in a chunked array.
+template <typename ArrayType>
+struct ResolvedChunk {
+ using V = GetViewType<typename ArrayType::TypeClass>;
+ using LogicalValueType = typename V::T;
+
+ // The target array in chunked array.
+ const ArrayType* array;
+ // The index in the target array.
+ const int64_t index;
+
+ ResolvedChunk(const ArrayType* array, int64_t index) : array(array), index(index) {}
+
+ bool IsNull() const { return array->IsNull(index); }
+
+ LogicalValueType Value() const { return V::LogicalValue(array->GetView(index)); }
+};
+
+// ResolvedChunk specialization for untyped arrays when all that is needed is a null lookup
+template <>
+struct ResolvedChunk<Array> {
+ // The target array in chunked array.
+ const Array* array;
+ // The index in the target array.
+ const int64_t index;
+
+ ResolvedChunk(const Array* array, int64_t index) : array(array), index(index) {}
+
+ bool IsNull() const { return array->IsNull(index); }
+};
+
+// An object that resolves an array chunk depending on the index.
+struct ChunkedArrayResolver {
+ explicit ChunkedArrayResolver(const std::vector<const Array*>& chunks)
+ : num_chunks_(static_cast<int64_t>(chunks.size())),
+ chunks_(chunks.data()),
+ offsets_(MakeEndOffsets(chunks)),
+ cached_chunk_(0) {}
+
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> Resolve(int64_t index) const {
+ // It is common for the algorithms below to make consecutive accesses at
+ // a relatively small distance from each other, hence often falling in
+ // the same chunk.
+    // This is trivially the case when merging (assuming each side of the
+    // merge uses its own resolver), and it also tends to hold in the inner
+    // recursive invocations of partitioning.
+ const bool cache_hit =
+ (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]);
+ if (ARROW_PREDICT_TRUE(cache_hit)) {
+ return ResolvedChunk<ArrayType>(
+ checked_cast<const ArrayType*>(chunks_[cached_chunk_]),
+ index - offsets_[cached_chunk_]);
+ } else {
+ return ResolveMissBisect<ArrayType>(index);
+ }
+ }
+
+ private:
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> ResolveMissBisect(int64_t index) const {
+ // Like std::upper_bound(), but hand-written as it can help the compiler.
+ const int64_t* raw_offsets = offsets_.data();
+ // Search [lo, lo + n)
+ int64_t lo = 0, n = num_chunks_;
+ while (n > 1) {
+ int64_t m = n >> 1;
+ int64_t mid = lo + m;
+ if (index >= raw_offsets[mid]) {
+ lo = mid;
+ n -= m;
+ } else {
+ n = m;
+ }
+ }
+ cached_chunk_ = lo;
+ return ResolvedChunk<ArrayType>(checked_cast<const ArrayType*>(chunks_[lo]),
+ index - offsets_[lo]);
+ }
+
+ static std::vector<int64_t> MakeEndOffsets(const std::vector<const Array*>& chunks) {
+ std::vector<int64_t> end_offsets(chunks.size() + 1);
+ int64_t offset = 0;
+ end_offsets[0] = 0;
+ std::transform(chunks.begin(), chunks.end(), end_offsets.begin() + 1,
+ [&](const Array* chunk) {
+ offset += chunk->length();
+ return offset;
+ });
+ return end_offsets;
+ }
+
+ int64_t num_chunks_;
+ const Array* const* chunks_;
+ std::vector<int64_t> offsets_;
+
+ mutable int64_t cached_chunk_;
+};
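+
+// Worked sketch of the resolver above (hypothetical chunk lengths): for
+// chunks of lengths [3, 2, 4], the end offsets are [0, 3, 5, 9], so
+// Resolve(4) locates logical index 4 in chunk 1 at in-chunk index
+// 4 - 3 = 1 and caches chunk 1 for the next lookup.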
+
+// We could try to reproduce the concrete Array classes' facilities
+// (such as cached raw values pointer) in a separate hierarchy of
+// physical accessors, but doing so ends up too cumbersome.
+// Instead, we simply create the desired concrete Array objects.
+std::shared_ptr<Array> GetPhysicalArray(const Array& array,
+ const std::shared_ptr<DataType>& physical_type) {
+ auto new_data = array.data()->Copy();
+ new_data->type = physical_type;
+ return MakeArray(std::move(new_data));
+}
+
+ArrayVector GetPhysicalChunks(const ChunkedArray& chunked_array,
+ const std::shared_ptr<DataType>& physical_type) {
+ const auto& chunks = chunked_array.chunks();
+ ArrayVector physical(chunks.size());
+ std::transform(chunks.begin(), chunks.end(), physical.begin(),
+ [&](const std::shared_ptr<Array>& array) {
+ return GetPhysicalArray(*array, physical_type);
+ });
+ return physical;
+}
+
+std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays) {
+ std::vector<const Array*> pointers(arrays.size());
+ std::transform(arrays.begin(), arrays.end(), pointers.begin(),
+ [&](const std::shared_ptr<Array>& array) { return array.get(); });
+ return pointers;
+}
+
+// NOTE: std::partition is usually faster than std::stable_partition.
+
+struct NonStablePartitioner {
+ template <typename Predicate>
+ uint64_t* operator()(uint64_t* indices_begin, uint64_t* indices_end, Predicate&& pred) {
+ return std::partition(indices_begin, indices_end, std::forward<Predicate>(pred));
+ }
+};
+
+struct StablePartitioner {
+ template <typename Predicate>
+ uint64_t* operator()(uint64_t* indices_begin, uint64_t* indices_end, Predicate&& pred) {
+ return std::stable_partition(indices_begin, indices_end,
+ std::forward<Predicate>(pred));
+ }
+};
+
+// TODO factor out value comparison and NaN checking?
+
+template <typename TypeClass, typename Enable = void>
+struct NullTraits {
+ static constexpr bool has_null_like_values = false;
+};
+
+template <typename TypeClass>
+struct NullTraits<TypeClass, enable_if_floating_point<TypeClass>> {
+ static constexpr bool has_null_like_values = true;
+};
+
+// Move nulls (but not null-like values such as NaN) to the end of the array.
+// Returns the position where the nulls start.
+//
+// `offset` is used when this is called on a chunk of a chunked array
+template <typename Partitioner>
+uint64_t* PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
+ const Array& values, int64_t offset) {
+ if (values.null_count() == 0) {
+ return indices_end;
+ }
+ Partitioner partitioner;
+ return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) {
+ return !values.IsNull(ind - offset);
+ });
+}
+
+// For chunked array.
+template <typename Partitioner>
+uint64_t* PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays,
+ int64_t null_count) {
+ if (null_count == 0) {
+ return indices_end;
+ }
+ ChunkedArrayResolver resolver(arrays);
+ Partitioner partitioner;
+ return partitioner(indices_begin, indices_end, [&](uint64_t ind) {
+ const auto chunk = resolver.Resolve<Array>(ind);
+ return !chunk.IsNull();
+ });
+}
+
+// Move non-null but null-like values (e.g. NaN) to the end of the array.
+// Returns the position where the null-like values start.
+//
+// `offset` is used when this is called on a chunk of a chunked array
+template <typename ArrayType, typename Partitioner>
+enable_if_t<!is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset) {
+ return indices_end;
+}
+
+// For chunked array.
+template <typename ArrayType, typename Partitioner>
+enable_if_t<!is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count) {
+ return indices_end;
+}
+
+template <typename ArrayType, typename Partitioner>
+enable_if_t<is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset) {
+ Partitioner partitioner;
+ return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) {
+ return !std::isnan(values.GetView(ind - offset));
+ });
+}
+
+template <typename ArrayType, typename Partitioner>
+enable_if_t<is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count) {
+ Partitioner partitioner;
+ ChunkedArrayResolver resolver(arrays);
+ return partitioner(indices_begin, indices_end, [&](uint64_t ind) {
+ const auto chunk = resolver.Resolve<ArrayType>(ind);
+ return !std::isnan(chunk.Value());
+ });
+}
+
+// Move nulls to the end of the array. Returns the position where the nulls
+// start.
+//
+// `offset` is used when this is called on a chunk of a chunked array
+template <typename ArrayType, typename Partitioner>
+uint64_t* PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset) {
+ // Partition nulls at end, and null-like values just before
+ uint64_t* nulls_begin =
+ PartitionNullsOnly<Partitioner>(indices_begin, indices_end, values, offset);
+ return PartitionNullLikes<ArrayType, Partitioner>(indices_begin, nulls_begin, values,
+ offset);
+}
+
+// For chunked array.
+template <typename ArrayType, typename Partitioner>
+uint64_t* PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count) {
+ // Partition nulls at end, and null-like values just before
+ uint64_t* nulls_begin =
+ PartitionNullsOnly<Partitioner>(indices_begin, indices_end, arrays, null_count);
+ return PartitionNullLikes<ArrayType, Partitioner>(indices_begin, nulls_begin, arrays,
+ null_count);
+}
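+
+// Worked sketch of the combined partitioning above (hypothetical float
+// values): for values [1.5, NaN, null, 0.5] with identity indices, the
+// stable partitions reorder the indices so the values read
+// [1.5, 0.5, NaN, null], and the returned pointer marks the start of the
+// NaN block.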
+
+// ----------------------------------------------------------------------
+// partition_nth_indices implementation
+
+// We need to preserve the options
+using PartitionNthToIndicesState = internal::OptionsWrapper<PartitionNthOptions>;
+
+template <typename OutType, typename InType>
+struct PartitionNthToIndices {
+ using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ using GetView = GetViewType<InType>;
+
+ if (ctx->state() == nullptr) {
+ return Status::Invalid("NthToIndices requires PartitionNthOptions");
+ }
+
+ ArrayType arr(batch[0].array());
+
+ int64_t pivot = PartitionNthToIndicesState::Get(ctx).pivot;
+ if (pivot > arr.length()) {
+      return Status::IndexError("NthToIndices index out of bounds");
+ }
+ ArrayData* out_arr = out->mutable_array();
+ uint64_t* out_begin = out_arr->GetMutableValues<uint64_t>(1);
+ uint64_t* out_end = out_begin + arr.length();
+ std::iota(out_begin, out_end, 0);
+ if (pivot == arr.length()) {
+ return Status::OK();
+ }
+ auto nulls_begin =
+ PartitionNulls<ArrayType, NonStablePartitioner>(out_begin, out_end, arr, 0);
+ auto nth_begin = out_begin + pivot;
+ if (nth_begin < nulls_begin) {
+ std::nth_element(out_begin, nth_begin, nulls_begin,
+ [&arr](uint64_t left, uint64_t right) {
+ const auto lval = GetView::LogicalValue(arr.GetView(left));
+ const auto rval = GetView::LogicalValue(arr.GetView(right));
+ return lval < rval;
+ });
+ }
+ return Status::OK();
+ }
+};
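+
+// Worked sketch of the kernel's guarantee (hypothetical values): for the
+// array [5, 1, 4, 2, 3] and pivot = 2, one valid output orders the indices
+// so the values read [2, 1, 3, 5, 4]: position 2 holds the value a full
+// sort would put there, everything before it compares <= and everything
+// after it compares >=.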
+
+// ----------------------------------------------------------------------
+// Array sorting implementations
+
+template <typename ArrayType, typename VisitorNotNull, typename VisitorNull>
+inline void VisitRawValuesInline(const ArrayType& values,
+ VisitorNotNull&& visitor_not_null,
+ VisitorNull&& visitor_null) {
+ const auto data = values.raw_values();
+ VisitBitBlocksVoid(
+ values.null_bitmap(), values.offset(), values.length(),
+ [&](int64_t i) { visitor_not_null(data[i]); }, [&]() { visitor_null(); });
+}
+
+template <typename VisitorNotNull, typename VisitorNull>
+inline void VisitRawValuesInline(const BooleanArray& values,
+ VisitorNotNull&& visitor_not_null,
+ VisitorNull&& visitor_null) {
+ if (values.null_count() != 0) {
+ const uint8_t* data = values.data()->GetValues<uint8_t>(1, 0);
+ VisitBitBlocksVoid(
+ values.null_bitmap(), values.offset(), values.length(),
+ [&](int64_t i) { visitor_not_null(BitUtil::GetBit(data, values.offset() + i)); },
+ [&]() { visitor_null(); });
+ } else {
+ // Can avoid GetBit() overhead in the no-nulls case
+ VisitBitBlocksVoid(
+ values.data()->buffers[1], values.offset(), values.length(),
+ [&](int64_t i) { visitor_not_null(true); }, [&]() { visitor_not_null(false); });
+ }
+}
+
+template <typename ArrowType>
+class ArrayCompareSorter {
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+ using GetView = GetViewType<ArrowType>;
+
+ public:
+ // Returns where null starts.
+ //
+ // `offset` is used when this is called on a chunk of a chunked array
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
+ int64_t offset, const ArraySortOptions& options) {
+ auto nulls_begin = PartitionNulls<ArrayType, StablePartitioner>(
+ indices_begin, indices_end, values, offset);
+ if (options.order == SortOrder::Ascending) {
+ std::stable_sort(
+ indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) {
+ const auto lhs = GetView::LogicalValue(values.GetView(left - offset));
+ const auto rhs = GetView::LogicalValue(values.GetView(right - offset));
+ return lhs < rhs;
+ });
+ } else {
+ std::stable_sort(
+ indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) {
+ const auto lhs = GetView::LogicalValue(values.GetView(left - offset));
+ const auto rhs = GetView::LogicalValue(values.GetView(right - offset));
+            // We compare 'rhs < lhs' instead of 'lhs > rhs' so that the
+            // value type only needs to provide operator<.
+ return rhs < lhs;
+ });
+ }
+ return nulls_begin;
+ }
+};
+
+template <typename ArrowType>
+class ArrayCountSorter {
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+ using c_type = typename ArrowType::c_type;
+
+ public:
+ ArrayCountSorter() = default;
+
+ explicit ArrayCountSorter(c_type min, c_type max) { SetMinMax(min, max); }
+
+ // Assume: max >= min && (max - min) < 4Gi
+ void SetMinMax(c_type min, c_type max) {
+ min_ = min;
+ value_range_ = static_cast<uint32_t>(max - min) + 1;
+ }
+
+ // Returns where null starts.
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
+ int64_t offset, const ArraySortOptions& options) {
+    // A 32-bit counter performs much better than a 64-bit one
+ if (values.length() < (1LL << 32)) {
+ return SortInternal<uint32_t>(indices_begin, indices_end, values, offset, options);
+ } else {
+ return SortInternal<uint64_t>(indices_begin, indices_end, values, offset, options);
+ }
+ }
+
+ private:
+ c_type min_{0};
+ uint32_t value_range_{0};
+
+ // Returns where null starts.
+ //
+ // `offset` is used when this is called on a chunk of a chunked array
+ template <typename CounterType>
+ uint64_t* SortInternal(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset,
+ const ArraySortOptions& options) {
+ const uint32_t value_range = value_range_;
+
+ // first slot reserved for prefix sum
+ std::vector<CounterType> counts(1 + value_range);
+
+ if (options.order == SortOrder::Ascending) {
+ VisitRawValuesInline(
+ values, [&](c_type v) { ++counts[v - min_ + 1]; }, []() {});
+ for (uint32_t i = 1; i <= value_range; ++i) {
+ counts[i] += counts[i - 1];
+ }
+ auto null_position = counts[value_range];
+ auto nulls_begin = indices_begin + null_position;
+ int64_t index = offset;
+ VisitRawValuesInline(
+ values, [&](c_type v) { indices_begin[counts[v - min_]++] = index++; },
+ [&]() { indices_begin[null_position++] = index++; });
+ return nulls_begin;
+ } else {
+ VisitRawValuesInline(
+ values, [&](c_type v) { ++counts[v - min_]; }, []() {});
+ for (uint32_t i = value_range; i >= 1; --i) {
+ counts[i - 1] += counts[i];
+ }
+ auto null_position = counts[0];
+ auto nulls_begin = indices_begin + null_position;
+ int64_t index = offset;
+ VisitRawValuesInline(
+ values, [&](c_type v) { indices_begin[counts[v - min_ + 1]++] = index++; },
+ [&]() { indices_begin[null_position++] = index++; });
+ return nulls_begin;
+ }
+ }
+};
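+
+// Worked sketch of the ascending counting sort above (hypothetical values):
+// for values [3, 1, 3, null, 2] with min_ = 1, the counting pass fills
+// counts = [0, 1, 1, 2] (slot 0 is reserved for the prefix sum), the prefix
+// sum turns this into [0, 1, 2, 4], and the scatter pass then places the
+// indices so that the output reads [1, 2, 3, 3, null].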
+
+using ::arrow::internal::Bitmap;
+
+template <>
+class ArrayCountSorter<BooleanType> {
+ public:
+ ArrayCountSorter() = default;
+
+ // Returns where null starts.
+ // `offset` is used when this is called on a chunk of a chunked array
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
+ const BooleanArray& values, int64_t offset,
+ const ArraySortOptions& options) {
+ std::array<int64_t, 2> counts{0, 0};
+
+ const int64_t nulls = values.null_count();
+ const int64_t ones = values.true_count();
+ const int64_t zeros = values.length() - ones - nulls;
+
+ int64_t null_position = values.length() - nulls;
+ int64_t index = offset;
+ const auto nulls_begin = indices_begin + null_position;
+
+ if (options.order == SortOrder::Ascending) {
+ // ones start after zeros
+ counts[1] = zeros;
+ } else {
+ // zeros start after ones
+ counts[0] = ones;
+ }
+ VisitRawValuesInline(
+ values, [&](bool v) { indices_begin[counts[v]++] = index++; },
+ [&]() { indices_begin[null_position++] = index++; });
+ return nulls_begin;
+ }
+};
+
+// Sort integers with a counting sort or a comparison-based sorting algorithm:
+// - use the O(n) counting sort if the values lie in a small range
+// - use the O(n log n) std::stable_sort otherwise
+template <typename ArrowType>
+class ArrayCountOrCompareSorter {
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+ using c_type = typename ArrowType::c_type;
+
+ public:
+ // Returns where null starts.
+ //
+ // `offset` is used when this is called on a chunk of a chunked array
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
+ int64_t offset, const ArraySortOptions& options) {
+ if (values.length() >= countsort_min_len_ && values.length() > values.null_count()) {
+ c_type min, max;
+ std::tie(min, max) = GetMinMax<c_type>(*values.data());
+
+ // For signed int32/64, (max - min) may overflow and trigger UBSAN.
+      // Cast to the largest unsigned type (uint64_t) before subtraction.
+ if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <=
+ countsort_max_range_) {
+ count_sorter_.SetMinMax(min, max);
+ return count_sorter_.Sort(indices_begin, indices_end, values, offset, options);
+ }
+ }
+
+ return compare_sorter_.Sort(indices_begin, indices_end, values, offset, options);
+ }
+
+ private:
+ ArrayCompareSorter<ArrowType> compare_sorter_;
+ ArrayCountSorter<ArrowType> count_sorter_;
+
+  // Crossover point at which counting sort is preferred over
+  // std::stable_sort (a merge sort); counting sort is used when:
+  // - the array to be sorted is longer than "countsort_min_len_"
+  // - the value range (max - min) is within "countsort_max_range_"
+  //
+  // The optimal setting depends heavily on the CPU this runs on. The
+  // settings below are conservative, to adapt to various hardware and keep
+  // the code simple. It's possible to decrease the length threshold and/or
+  // increase the value range to cover more cases, or to set up a table of
+  // the best length/range combinations.
+  // See https://issues.apache.org/jira/browse/ARROW-1571 for detailed analysis.
+ static const uint32_t countsort_min_len_ = 1024;
+ static const uint32_t countsort_max_range_ = 4096;
+};
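+
+// Dispatch sketch for the crossover above (hypothetical values): a 2048-long
+// int32 array with min 10 and max 100 has a value range of 90, which is
+// within countsort_max_range_, so the O(n) counting sort runs; the same
+// array with max 10000000 exceeds the range and falls back to the
+// comparison-based sorter.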
+
+template <typename Type, typename Enable = void>
+struct ArraySorter;
+
+template <>
+struct ArraySorter<BooleanType> {
+ ArrayCountSorter<BooleanType> impl;
+};
+
+template <>
+struct ArraySorter<UInt8Type> {
+ ArrayCountSorter<UInt8Type> impl;
+ ArraySorter() : impl(0, 255) {}
+};
+
+template <>
+struct ArraySorter<Int8Type> {
+ ArrayCountSorter<Int8Type> impl;
+ ArraySorter() : impl(-128, 127) {}
+};
+
+template <typename Type>
+struct ArraySorter<Type, enable_if_t<(is_integer_type<Type>::value &&
+ (sizeof(typename Type::c_type) > 1)) ||
+ is_temporal_type<Type>::value>> {
+ ArrayCountOrCompareSorter<Type> impl;
+};
+
+template <typename Type>
+struct ArraySorter<
+ Type, enable_if_t<is_floating_type<Type>::value || is_base_binary_type<Type>::value ||
+ is_fixed_size_binary_type<Type>::value>> {
+ ArrayCompareSorter<Type> impl;
+};
+
+using ArraySortIndicesState = internal::OptionsWrapper<ArraySortOptions>;
+
+template <typename OutType, typename InType>
+struct ArraySortIndices {
+ using ArrayType = typename TypeTraits<InType>::ArrayType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& options = ArraySortIndicesState::Get(ctx);
+
+ ArrayType arr(batch[0].array());
+ ArrayData* out_arr = out->mutable_array();
+ uint64_t* out_begin = out_arr->GetMutableValues<uint64_t>(1);
+ uint64_t* out_end = out_begin + arr.length();
+ std::iota(out_begin, out_end, 0);
+
+ ArraySorter<InType> sorter;
+ sorter.impl.Sort(out_begin, out_end, arr, 0, options);
+
+ return Status::OK();
+ }
+};
+
+// Sort indices kernels implemented for
+//
+// * Boolean type
+// * Number types
+// * Temporal types
+// * Decimal types
+// * Base binary types
+// * Fixed-size binary type
+
+template <template <typename...> class ExecTemplate>
+void AddSortingKernels(VectorKernel base, VectorFunction* func) {
+ // bool type
+ base.signature = KernelSignature::Make({InputType::Array(boolean())}, uint64());
+ base.exec = ExecTemplate<UInt64Type, BooleanType>::Exec;
+ DCHECK_OK(func->AddKernel(base));
+
+ for (const auto& ty : NumericTypes()) {
+ auto physical_type = GetPhysicalType(ty);
+ base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
+ base.exec = GenerateNumeric<ExecTemplate, UInt64Type>(*physical_type);
+ DCHECK_OK(func->AddKernel(base));
+ }
+ for (const auto& ty : TemporalTypes()) {
+ auto physical_type = GetPhysicalType(ty);
+ base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
+ base.exec = GenerateNumeric<ExecTemplate, UInt64Type>(*physical_type);
+ DCHECK_OK(func->AddKernel(base));
+ }
+ for (const auto id : DecimalTypeIds()) {
+ base.signature = KernelSignature::Make({InputType::Array(id)}, uint64());
+ base.exec = GenerateDecimal<ExecTemplate, UInt64Type>(id);
+ DCHECK_OK(func->AddKernel(base));
+ }
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto physical_type = GetPhysicalType(ty);
+ base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
+ base.exec = GenerateVarBinaryBase<ExecTemplate, UInt64Type>(*physical_type);
+ DCHECK_OK(func->AddKernel(base));
+ }
+ base.signature =
+ KernelSignature::Make({InputType::Array(Type::FIXED_SIZE_BINARY)}, uint64());
+ base.exec = ExecTemplate<UInt64Type, FixedSizeBinaryType>::Exec;
+ DCHECK_OK(func->AddKernel(base));
+}
+
+// ----------------------------------------------------------------------
+// ChunkedArray sorting implementations
+
+// Sort a chunked array directly without sorting each array in the
+// chunked array. This is used for processing the second and following
+// sort keys in TableRadixSorter.
+//
+// This uses the same algorithm as ArrayCompareSorter.
+template <typename Type>
+class ChunkedArrayCompareSorter {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ public:
+ // Returns where null starts.
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count,
+ const ArraySortOptions& options) {
+ auto nulls_begin = PartitionNulls<ArrayType, StablePartitioner>(
+ indices_begin, indices_end, arrays, null_count);
+ ChunkedArrayResolver resolver(arrays);
+ if (options.order == SortOrder::Ascending) {
+ std::stable_sort(indices_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = resolver.Resolve<ArrayType>(right);
+ return chunk_left.Value() < chunk_right.Value();
+ });
+ } else {
+ std::stable_sort(indices_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = resolver.Resolve<ArrayType>(right);
+        // We compare 'chunk_right < chunk_left' instead of
+        // 'chunk_left > chunk_right' so that the value type only needs to
+        // provide operator<.
+ return chunk_right.Value() < chunk_left.Value();
+ });
+ }
+ return nulls_begin;
+ }
+};
+
+// Sort a chunked array by sorting each array in the chunked array.
+//
+// TODO: This is a naive implementation; its performance can be improved,
+// for example by sorting the individual arrays on separate threads.
+class ChunkedArraySorter : public TypeVisitor {
+ public:
+ ChunkedArraySorter(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+ const ChunkedArray& chunked_array, const SortOrder order,
+ bool can_use_array_sorter = true)
+ : TypeVisitor(),
+ indices_begin_(indices_begin),
+ indices_end_(indices_end),
+ chunked_array_(chunked_array),
+ physical_type_(GetPhysicalType(chunked_array.type())),
+ physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+ order_(order),
+ can_use_array_sorter_(can_use_array_sorter),
+ ctx_(ctx) {}
+
+ Status Sort() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ private:
+ template <typename Type>
+ Status SortInternal() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ ArraySortOptions options(order_);
+ const auto num_chunks = chunked_array_.num_chunks();
+ if (num_chunks == 0) {
+ return Status::OK();
+ }
+ const auto arrays = GetArrayPointers(physical_chunks_);
+ if (can_use_array_sorter_) {
+ // Sort each chunk independently and merge to sorted indices.
+ // This is a serial implementation.
+ ArraySorter<Type> sorter;
+ struct SortedChunk {
+ int64_t begin_offset;
+ int64_t end_offset;
+ int64_t nulls_offset;
+ };
+ std::vector<SortedChunk> sorted(num_chunks);
+
+ // First sort all individual chunks
+ int64_t begin_offset = 0;
+ int64_t end_offset = 0;
+ int64_t null_count = 0;
+ for (int i = 0; i < num_chunks; ++i) {
+ const auto array = checked_cast<const ArrayType*>(arrays[i]);
+ end_offset += array->length();
+ null_count += array->null_count();
+ uint64_t* nulls_begin =
+ sorter.impl.Sort(indices_begin_ + begin_offset, indices_begin_ + end_offset,
+ *array, begin_offset, options);
+ sorted[i] = {begin_offset, end_offset, nulls_begin - indices_begin_};
+ begin_offset = end_offset;
+ }
+ DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
+
+ std::unique_ptr<Buffer> temp_buffer;
+ uint64_t* temp_indices = nullptr;
+ if (sorted.size() > 1) {
+ ARROW_ASSIGN_OR_RAISE(
+ temp_buffer,
+ AllocateBuffer(sizeof(int64_t) * (indices_end_ - indices_begin_ - null_count),
+ ctx_->memory_pool()));
+ temp_indices = reinterpret_cast<uint64_t*>(temp_buffer->mutable_data());
+ }
+
+ // Then merge them by pairs, recursively
+ while (sorted.size() > 1) {
+ auto out_it = sorted.begin();
+ auto it = sorted.begin();
+ while (it < sorted.end() - 1) {
+ const auto& left = *it++;
+ const auto& right = *it++;
+ DCHECK_EQ(left.end_offset, right.begin_offset);
+ DCHECK_GE(left.nulls_offset, left.begin_offset);
+ DCHECK_LE(left.nulls_offset, left.end_offset);
+ DCHECK_GE(right.nulls_offset, right.begin_offset);
+ DCHECK_LE(right.nulls_offset, right.end_offset);
+ uint64_t* nulls_begin = Merge<ArrayType>(
+ indices_begin_ + left.begin_offset, indices_begin_ + left.end_offset,
+ indices_begin_ + right.end_offset, indices_begin_ + left.nulls_offset,
+ indices_begin_ + right.nulls_offset, arrays, null_count, order_,
+ temp_indices);
+ *out_it++ = {left.begin_offset, right.end_offset, nulls_begin - indices_begin_};
+ }
+ if (it < sorted.end()) {
+ *out_it++ = *it++;
+ }
+ sorted.erase(out_it, sorted.end());
+ }
+ DCHECK_EQ(sorted.size(), 1);
+ DCHECK_EQ(sorted[0].begin_offset, 0);
+ DCHECK_EQ(sorted[0].end_offset, chunked_array_.length());
+ // Note that "nulls" can also include NaNs, hence the >= check
+ DCHECK_GE(chunked_array_.length() - sorted[0].nulls_offset, null_count);
+ } else {
+      // Sort the chunked array directly.
+ ChunkedArrayCompareSorter<Type> sorter;
+ sorter.Sort(indices_begin_, indices_end_, arrays, chunked_array_.null_count(),
+ options);
+ }
+ return Status::OK();
+ }
+
+  // Merges two sorted indices arrays and returns the position where the
+  // nulls start. The next merge uses that position to locate the
+  // already-sorted (non-null) index ranges.
+ template <typename ArrayType>
+ uint64_t* Merge(uint64_t* indices_begin, uint64_t* indices_middle,
+ uint64_t* indices_end, uint64_t* left_nulls_begin,
+ uint64_t* right_nulls_begin, const std::vector<const Array*>& arrays,
+ int64_t null_count, const SortOrder order, uint64_t* temp_indices) {
+ // Input layout:
+ // [left non-nulls .... left nulls .... right non-nulls .... right nulls]
+ // ^ ^ ^ ^
+ // | | | |
+ // indices_begin left_nulls_begin indices_middle right_nulls_begin
+ auto left_num_non_nulls = left_nulls_begin - indices_begin;
+ auto right_num_non_nulls = right_nulls_begin - indices_middle;
+
+ // Mutate the input, stably, to obtain the following layout:
+ // [left non-nulls .... right non-nulls .... left nulls .... right nulls]
+ // ^ ^ ^ ^
+ // | | | |
+ // indices_begin indices_middle nulls_begin right_nulls_begin
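+    // Concrete sketch (hypothetical values): with left = [2, 7, null] and
+    // right = [1, null], the indices enter as
+    // [i(2), i(7), i(null) | i(1), i(null)] and the rotate produces
+    // [i(2), i(7), i(1), i(null), i(null)], leaving only the non-null
+    // prefixes to be merged below.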
+ std::rotate(left_nulls_begin, indices_middle, right_nulls_begin);
+ auto nulls_begin = indices_begin + left_num_non_nulls + right_num_non_nulls;
+ // If the type has null-like values (such as NaN), ensure those plus regular
+ // nulls are partitioned in the right order. Note this assumes that all
+ // null-like values (e.g. NaN) are ordered equally.
+ if (NullTraits<typename ArrayType::TypeClass>::has_null_like_values) {
+ PartitionNullsOnly<StablePartitioner>(nulls_begin, indices_end, arrays, null_count);
+ }
+
+ // Merge the non-null values into temp area
+ indices_middle = indices_begin + left_num_non_nulls;
+ indices_end = indices_middle + right_num_non_nulls;
+ const ChunkedArrayResolver left_resolver(arrays);
+ const ChunkedArrayResolver right_resolver(arrays);
+ if (order == SortOrder::Ascending) {
+ std::merge(indices_begin, indices_middle, indices_middle, indices_end, temp_indices,
+ [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = left_resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = right_resolver.Resolve<ArrayType>(right);
+ return chunk_left.Value() < chunk_right.Value();
+ });
+ } else {
+ std::merge(indices_begin, indices_middle, indices_middle, indices_end, temp_indices,
+ [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = left_resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = right_resolver.Resolve<ArrayType>(right);
+                   // We compare 'chunk_right < chunk_left' instead of
+                   // 'chunk_left > chunk_right' so that the value type
+                   // only needs to provide operator<.
+ return chunk_right.Value() < chunk_left.Value();
+ });
+ }
+ // Copy back temp area into main buffer
+ std::copy(temp_indices, temp_indices + (nulls_begin - indices_begin), indices_begin);
+ return nulls_begin;
+ }
+
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+ const ChunkedArray& chunked_array_;
+ const std::shared_ptr<DataType> physical_type_;
+ const ArrayVector physical_chunks_;
+ const SortOrder order_;
+ const bool can_use_array_sorter_;
+ ExecContext* ctx_;
+};
+
+// ----------------------------------------------------------------------
+// Record batch sorting implementation(s)
+
+// Visit contiguous ranges of equal values. All entries are assumed
+// to be non-null.
+template <typename ArrayType, typename Visitor>
+void VisitConstantRanges(const ArrayType& array, uint64_t* indices_begin,
+ uint64_t* indices_end, Visitor&& visit) {
+ using GetView = GetViewType<typename ArrayType::TypeClass>;
+
+ if (indices_begin == indices_end) {
+ return;
+ }
+ auto range_start = indices_begin;
+ auto range_cur = range_start;
+ auto last_value = GetView::LogicalValue(array.GetView(*range_cur));
+ while (++range_cur != indices_end) {
+ auto v = GetView::LogicalValue(array.GetView(*range_cur));
+ if (v != last_value) {
+ visit(range_start, range_cur);
+ range_start = range_cur;
+ last_value = v;
+ }
+ }
+ if (range_start != range_cur) {
+ visit(range_start, range_cur);
+ }
+}
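+
+// Worked sketch (hypothetical values): if the indices point at the values
+// [a, a, b, b, b, c], the visitor above is invoked three times, once per
+// constant range: [a, a], [b, b, b] and [c].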
+
+// A sorter for a single column of a RecordBatch, deferring to the next column
+// for ranges of equal values.
+class RecordBatchColumnSorter {
+ public:
+ explicit RecordBatchColumnSorter(RecordBatchColumnSorter* next_column = nullptr)
+ : next_column_(next_column) {}
+ virtual ~RecordBatchColumnSorter() {}
+
+ virtual void SortRange(uint64_t* indices_begin, uint64_t* indices_end) = 0;
+
+ protected:
+ RecordBatchColumnSorter* next_column_;
+};
+
+template <typename Type>
+class ConcreteRecordBatchColumnSorter : public RecordBatchColumnSorter {
+ public:
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ ConcreteRecordBatchColumnSorter(std::shared_ptr<Array> array, SortOrder order,
+ RecordBatchColumnSorter* next_column = nullptr)
+ : RecordBatchColumnSorter(next_column),
+ owned_array_(std::move(array)),
+ array_(checked_cast<const ArrayType&>(*owned_array_)),
+ order_(order),
+ null_count_(array_.null_count()) {}
+
+ void SortRange(uint64_t* indices_begin, uint64_t* indices_end) {
+ using GetView = GetViewType<Type>;
+
+ constexpr int64_t offset = 0;
+ uint64_t* nulls_begin;
+ if (null_count_ == 0) {
+ nulls_begin = indices_end;
+ } else {
+ // NOTE that null_count_ is merely an upper bound on the number of nulls
+ // in this particular range.
+ nulls_begin = PartitionNullsOnly<StablePartitioner>(indices_begin, indices_end,
+ array_, offset);
+ DCHECK_LE(indices_end - nulls_begin, null_count_);
+ }
+ uint64_t* null_likes_begin = PartitionNullLikes<ArrayType, StablePartitioner>(
+ indices_begin, nulls_begin, array_, offset);
+
+ // TODO This is roughly the same as ArrayCompareSorter.
+ // Also, we would like to use a counting sort if possible. This requires
+ // a counting sort compatible with indirect indexing.
+ if (order_ == SortOrder::Ascending) {
+ std::stable_sort(
+ indices_begin, null_likes_begin, [&](uint64_t left, uint64_t right) {
+ const auto lhs = GetView::LogicalValue(array_.GetView(left - offset));
+ const auto rhs = GetView::LogicalValue(array_.GetView(right - offset));
+ return lhs < rhs;
+ });
+ } else {
+ std::stable_sort(
+ indices_begin, null_likes_begin, [&](uint64_t left, uint64_t right) {
+            // Use 'rhs < lhs' rather than 'lhs > rhs' so that only
+            // operator '<' is required.
+            const auto lhs = GetView::LogicalValue(array_.GetView(left - offset));
+            const auto rhs = GetView::LogicalValue(array_.GetView(right - offset));
+            return rhs < lhs;
+ });
+ }
+
+ if (next_column_ != nullptr) {
+ // Visit all ranges of equal values in this column and sort them on
+ // the next column.
+ SortNextColumn(null_likes_begin, nulls_begin);
+ SortNextColumn(nulls_begin, indices_end);
+ VisitConstantRanges(array_, indices_begin, null_likes_begin,
+ [&](uint64_t* range_start, uint64_t* range_end) {
+ SortNextColumn(range_start, range_end);
+ });
+ }
+ }
+
+ void SortNextColumn(uint64_t* indices_begin, uint64_t* indices_end) {
+ // Avoid the cost of a virtual method call in trivial cases
+ if (indices_end - indices_begin > 1) {
+ next_column_->SortRange(indices_begin, indices_end);
+ }
+ }
+
+ protected:
+ const std::shared_ptr<Array> owned_array_;
+ const ArrayType& array_;
+ const SortOrder order_;
+ const int64_t null_count_;
+};
+
+// Sort a batch using a single-pass left-to-right radix sort.
+class RadixRecordBatchSorter {
+ public:
+ RadixRecordBatchSorter(uint64_t* indices_begin, uint64_t* indices_end,
+ const RecordBatch& batch, const SortOptions& options)
+ : batch_(batch),
+ options_(options),
+ indices_begin_(indices_begin),
+ indices_end_(indices_end) {}
+
+ Status Sort() {
+ ARROW_ASSIGN_OR_RAISE(const auto sort_keys,
+ ResolveSortKeys(batch_, options_.sort_keys));
+
+ // Create column sorters from right to left
+ std::vector<std::unique_ptr<RecordBatchColumnSorter>> column_sorts(sort_keys.size());
+ RecordBatchColumnSorter* next_column = nullptr;
+ for (int64_t i = static_cast<int64_t>(sort_keys.size() - 1); i >= 0; --i) {
+ ColumnSortFactory factory(sort_keys[i], next_column);
+ ARROW_ASSIGN_OR_RAISE(column_sorts[i], factory.MakeColumnSort());
+ next_column = column_sorts[i].get();
+ }
+
+ // Sort from left to right
+ column_sorts.front()->SortRange(indices_begin_, indices_end_);
+ return Status::OK();
+ }
+
+ protected:
+ struct ResolvedSortKey {
+ std::shared_ptr<Array> array;
+ SortOrder order;
+ };
+
+ struct ColumnSortFactory {
+ ColumnSortFactory(const ResolvedSortKey& sort_key,
+ RecordBatchColumnSorter* next_column)
+ : physical_type(GetPhysicalType(sort_key.array->type())),
+ array(GetPhysicalArray(*sort_key.array, physical_type)),
+ order(sort_key.order),
+ next_column(next_column) {}
+
+ Result<std::unique_ptr<RecordBatchColumnSorter>> MakeColumnSort() {
+ RETURN_NOT_OK(VisitTypeInline(*physical_type, this));
+ DCHECK_NE(result, nullptr);
+ return std::move(result);
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) { return VisitGeneric(type); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("Unsupported type for RecordBatch sorting: ",
+ type.ToString());
+ }
+
+ template <typename Type>
+ Status VisitGeneric(const Type&) {
+ result.reset(new ConcreteRecordBatchColumnSorter<Type>(array, order, next_column));
+ return Status::OK();
+ }
+
+ std::shared_ptr<DataType> physical_type;
+ std::shared_ptr<Array> array;
+ SortOrder order;
+ RecordBatchColumnSorter* next_column;
+ std::unique_ptr<RecordBatchColumnSorter> result;
+ };
+
+ static Result<std::vector<ResolvedSortKey>> ResolveSortKeys(
+ const RecordBatch& batch, const std::vector<SortKey>& sort_keys) {
+ std::vector<ResolvedSortKey> resolved;
+ resolved.reserve(sort_keys.size());
+ for (const auto& sort_key : sort_keys) {
+ auto array = batch.GetColumnByName(sort_key.name);
+ if (!array) {
+ return Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ }
+ resolved.push_back({std::move(array), sort_key.order});
+ }
+ return resolved;
+ }
+
+ const RecordBatch& batch_;
+ const SortOptions& options_;
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+};
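+
+// For sort keys (a, b, c), RadixRecordBatchSorter builds the chain
+// a -> b -> c (constructing the sorters from c backwards), then a single
+// SortRange() call on column a recursively re-sorts each run of equal
+// a-values on b, and each run of equal (a, b) pairs on c.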
+
+// Compare two records in the same RecordBatch or Table
+// (indexing is handled through ResolvedSortKey)
+template <typename ResolvedSortKey>
+class MultipleKeyComparator {
+ public:
+ explicit MultipleKeyComparator(const std::vector<ResolvedSortKey>& sort_keys)
+ : sort_keys_(sort_keys) {}
+
+ Status status() const { return status_; }
+
+  // Returns true if the value at index `left` should be ordered before
+  // the value at index `right`, false otherwise. Sort keys from
+  // `start_sort_key_index` onward are used for the comparison.
+ bool Compare(uint64_t left, uint64_t right, size_t start_sort_key_index) {
+ current_left_ = left;
+ current_right_ = right;
+ current_compared_ = 0;
+ auto num_sort_keys = sort_keys_.size();
+ for (size_t i = start_sort_key_index; i < num_sort_keys; ++i) {
+ current_sort_key_index_ = i;
+ status_ = VisitTypeInline(*sort_keys_[i].type, this);
+      // If the left value equals the right value, continue with
+      // the next sort key.
+ if (current_compared_ != 0) {
+ break;
+ }
+ }
+ return current_compared_ < 0;
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) { \
+ current_compared_ = CompareType<TYPE>(); \
+ return Status::OK(); \
+ }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("Unsupported type for RecordBatch sorting: ",
+ type.ToString());
+ }
+
+ private:
+  // Compares two records in the same table and returns -1, 0 or 1.
+  //
+  // -1: The left is less than the right.
+  // 0: The left equals the right.
+  // 1: The left is greater than the right.
+  //
+  // Both null and NaN are supported: null is handled here and NaN
+  // is handled in CompareTypeValue().
+ template <typename Type>
+ int32_t CompareType() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ const auto& sort_key = sort_keys_[current_sort_key_index_];
+ auto order = sort_key.order;
+ const auto chunk_left = sort_key.template GetChunk<ArrayType>(current_left_);
+ const auto chunk_right = sort_key.template GetChunk<ArrayType>(current_right_);
+ if (sort_key.null_count > 0) {
+ auto is_null_left = chunk_left.IsNull();
+ auto is_null_right = chunk_right.IsNull();
+ if (is_null_left && is_null_right) {
+ return 0;
+ } else if (is_null_left) {
+ return 1;
+ } else if (is_null_right) {
+ return -1;
+ }
+ }
+ return CompareTypeValue<Type>(chunk_left, chunk_right, order);
+ }
+
+ // For non-float types. Value is never NaN.
+ template <typename Type>
+ enable_if_t<!is_floating_type<Type>::value, int32_t> CompareTypeValue(
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_left,
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_right,
+ const SortOrder order) {
+ const auto left = chunk_left.Value();
+ const auto right = chunk_right.Value();
+ int32_t compared;
+ if (left == right) {
+ compared = 0;
+ } else if (left > right) {
+ compared = 1;
+ } else {
+ compared = -1;
+ }
+ if (order == SortOrder::Descending) {
+ compared = -compared;
+ }
+ return compared;
+ }
+
+ // For float types. Value may be NaN.
+ template <typename Type>
+ enable_if_t<is_floating_type<Type>::value, int32_t> CompareTypeValue(
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_left,
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_right,
+ const SortOrder order) {
+ const auto left = chunk_left.Value();
+ const auto right = chunk_right.Value();
+ auto is_nan_left = std::isnan(left);
+ auto is_nan_right = std::isnan(right);
+ if (is_nan_left && is_nan_right) {
+ return 0;
+ } else if (is_nan_left) {
+ return 1;
+ } else if (is_nan_right) {
+ return -1;
+ }
+ int32_t compared;
+ if (left == right) {
+ compared = 0;
+ } else if (left > right) {
+ compared = 1;
+ } else {
+ compared = -1;
+ }
+ if (order == SortOrder::Descending) {
+ compared = -compared;
+ }
+ return compared;
+ }
+
+ const std::vector<ResolvedSortKey>& sort_keys_;
+ Status status_;
+ int64_t current_left_;
+ int64_t current_right_;
+ size_t current_sort_key_index_;
+ int32_t current_compared_;
+};
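+
+// Example: with sort keys (a ASC, b DESC), Compare(i, j, 0) first compares
+// a[i] with a[j]; only when they are equal does it fall through to b, with
+// the result inverted for the descending key. A minimal sketch
+// (`sort_keys`, `i` and `j` are hypothetical):
+//
+//   MultipleKeyComparator<ResolvedSortKey> comparator(sort_keys);
+//   bool i_before_j = comparator.Compare(i, j, /*start_sort_key_index=*/0);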
+
+// Sort a batch using a single sort and multiple-key comparisons.
+class MultipleKeyRecordBatchSorter : public TypeVisitor {
+ private:
+ // Preprocessed sort key.
+ struct ResolvedSortKey {
+ ResolvedSortKey(const std::shared_ptr<Array>& array, const SortOrder order)
+ : type(GetPhysicalType(array->type())),
+ owned_array(GetPhysicalArray(*array, type)),
+ array(*owned_array),
+ order(order),
+ null_count(array->null_count()) {}
+
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> GetChunk(int64_t index) const {
+ return {&checked_cast<const ArrayType&>(array), index};
+ }
+
+ const std::shared_ptr<DataType> type;
+ std::shared_ptr<Array> owned_array;
+ const Array& array;
+ SortOrder order;
+ int64_t null_count;
+ };
+
+ using Comparator = MultipleKeyComparator<ResolvedSortKey>;
+
+ public:
+ MultipleKeyRecordBatchSorter(uint64_t* indices_begin, uint64_t* indices_end,
+ const RecordBatch& batch, const SortOptions& options)
+ : indices_begin_(indices_begin),
+ indices_end_(indices_end),
+ sort_keys_(ResolveSortKeys(batch, options.sort_keys, &status_)),
+ comparator_(sort_keys_) {}
+
+  // This is optimized for the first sort key: it is sorted in this
+  // class, while the second and following sort keys are handled by
+  // Comparator.
+ Status Sort() {
+ RETURN_NOT_OK(status_);
+ return sort_keys_[0].type->Accept(this);
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ private:
+ static std::vector<ResolvedSortKey> ResolveSortKeys(
+ const RecordBatch& batch, const std::vector<SortKey>& sort_keys, Status* status) {
+ std::vector<ResolvedSortKey> resolved;
+ for (const auto& sort_key : sort_keys) {
+ auto array = batch.GetColumnByName(sort_key.name);
+ if (!array) {
+ *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ break;
+ }
+ resolved.emplace_back(array, sort_key.order);
+ }
+ return resolved;
+ }
+
+ template <typename Type>
+ Status SortInternal() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ auto& comparator = comparator_;
+ const auto& first_sort_key = sort_keys_[0];
+ const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
+    auto nulls_begin = PartitionNullsInternal<Type>(first_sort_key);
+ // Sort first-key non-nulls
+ std::stable_sort(indices_begin_, nulls_begin, [&](uint64_t left, uint64_t right) {
+ // Both values are never null nor NaN
+ // (otherwise they've been partitioned away above).
+ const auto value_left = array.GetView(left);
+ const auto value_right = array.GetView(right);
+ if (value_left != value_right) {
+ bool compared = value_left < value_right;
+ if (first_sort_key.order == SortOrder::Ascending) {
+ return compared;
+ } else {
+ return !compared;
+ }
+ }
+      // If the left value equals the right value, we need to
+      // compare the second and following sort keys.
+ return comparator.Compare(left, right, 1);
+ });
+ return comparator_.status();
+ }
+
+  // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For non-float types.
+ template <typename Type>
+ enable_if_t<!is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ if (first_sort_key.null_count == 0) {
+ return indices_end_;
+ }
+ const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
+ StablePartitioner partitioner;
+ auto nulls_begin = partitioner(indices_begin_, indices_end_,
+ [&](uint64_t index) { return !array.IsNull(index); });
+ // Sort all nulls by second and following sort keys
+ // TODO: could we instead run an independent sort from the second key on
+ // this slice?
+ if (nulls_begin != indices_end_) {
+ auto& comparator = comparator_;
+ std::stable_sort(nulls_begin, indices_end_,
+ [&comparator](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ }
+ return nulls_begin;
+ }
+
+  // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For float types.
+ template <typename Type>
+ enable_if_t<is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
+ StablePartitioner partitioner;
+ uint64_t* nulls_begin;
+ if (first_sort_key.null_count == 0) {
+ nulls_begin = indices_end_;
+ } else {
+ nulls_begin = partitioner(indices_begin_, indices_end_,
+ [&](uint64_t index) { return !array.IsNull(index); });
+ }
+ uint64_t* nans_and_nulls_begin =
+ partitioner(indices_begin_, nulls_begin,
+ [&](uint64_t index) { return !std::isnan(array.GetView(index)); });
+ auto& comparator = comparator_;
+ if (nans_and_nulls_begin != nulls_begin) {
+ // Sort all NaNs by the second and following sort keys.
+ // TODO: could we instead run an independent sort from the second key on
+ // this slice?
+ std::stable_sort(nans_and_nulls_begin, nulls_begin,
+ [&comparator](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ }
+ if (nulls_begin != indices_end_) {
+ // Sort all nulls by the second and following sort keys.
+ // TODO: could we instead run an independent sort from the second key on
+ // this slice?
+ std::stable_sort(nulls_begin, indices_end_,
+ [&comparator](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ }
+ return nans_and_nulls_begin;
+ }
+
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+ Status status_;
+ std::vector<ResolvedSortKey> sort_keys_;
+ Comparator comparator_;
+};
+
+// ----------------------------------------------------------------------
+// Table sorting implementations
+
+// Sort a table using a radix sort-like algorithm.
+// A distinct stable sort is called for each sort key, from the last key to the first.
+class TableRadixSorter {
+ public:
+ Status Sort(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+ const Table& table, const SortOptions& options) {
+ for (auto i = options.sort_keys.size(); i > 0; --i) {
+ const auto& sort_key = options.sort_keys[i - 1];
+ const auto& chunked_array = table.GetColumnByName(sort_key.name);
+ if (!chunked_array) {
+ return Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ }
+ // We can use ArraySorter only for the sort key that is
+ // processed first because ArraySorter doesn't care about
+ // existing indices.
+      const auto can_use_array_sorter = (i == options.sort_keys.size());
+ ChunkedArraySorter sorter(ctx, indices_begin, indices_end, *chunked_array.get(),
+ sort_key.order, can_use_array_sorter);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ }
+ return Status::OK();
+ }
+};
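+
+// Example of the pass order: for sort keys (a, b), TableRadixSorter first
+// stable-sorts all indices by b, then stable-sorts them by a; the stability
+// of the second pass preserves the b-order within each run of equal a-values.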
+
+// Sort a table using a single sort and multiple-key comparisons.
+class MultipleKeyTableSorter : public TypeVisitor {
+ private:
+ // TODO instead of resolving chunks for each column independently, we could
+ // split the table into RecordBatches and pay the cost of chunked indexing
+ // at the first column only.
+
+ // Preprocessed sort key.
+ struct ResolvedSortKey {
+ ResolvedSortKey(const ChunkedArray& chunked_array, const SortOrder order)
+ : order(order),
+ type(GetPhysicalType(chunked_array.type())),
+ chunks(GetPhysicalChunks(chunked_array, type)),
+ chunk_pointers(GetArrayPointers(chunks)),
+ null_count(chunked_array.null_count()),
+ num_chunks(chunked_array.num_chunks()),
+ resolver(chunk_pointers) {}
+
+    // Finds the target chunk and the index within that chunk for an
+    // index into the chunked array.
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> GetChunk(int64_t index) const {
+ return resolver.Resolve<ArrayType>(index);
+ }
+
+ const SortOrder order;
+ const std::shared_ptr<DataType> type;
+ const ArrayVector chunks;
+ const std::vector<const Array*> chunk_pointers;
+ const int64_t null_count;
+ const int num_chunks;
+ const ChunkedArrayResolver resolver;
+ };
+
+ using Comparator = MultipleKeyComparator<ResolvedSortKey>;
+
+ public:
+ MultipleKeyTableSorter(uint64_t* indices_begin, uint64_t* indices_end,
+ const Table& table, const SortOptions& options)
+ : indices_begin_(indices_begin),
+ indices_end_(indices_end),
+ sort_keys_(ResolveSortKeys(table, options.sort_keys, &status_)),
+ comparator_(sort_keys_) {}
+
+  // This is optimized for the first sort key: it is sorted in this
+  // class, while the second and following sort keys are handled by
+  // Comparator.
+ Status Sort() {
+ ARROW_RETURN_NOT_OK(status_);
+ return sort_keys_[0].type->Accept(this);
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ private:
+ static std::vector<ResolvedSortKey> ResolveSortKeys(
+ const Table& table, const std::vector<SortKey>& sort_keys, Status* status) {
+ std::vector<ResolvedSortKey> resolved;
+ resolved.reserve(sort_keys.size());
+ for (const auto& sort_key : sort_keys) {
+ const auto& chunked_array = table.GetColumnByName(sort_key.name);
+ if (!chunked_array) {
+ *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ break;
+ }
+ resolved.emplace_back(*chunked_array, sort_key.order);
+ }
+ return resolved;
+ }
+
+ template <typename Type>
+ Status SortInternal() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ auto& comparator = comparator_;
+ const auto& first_sort_key = sort_keys_[0];
+    auto nulls_begin = PartitionNullsInternal<Type>(first_sort_key);
+ std::stable_sort(indices_begin_, nulls_begin, [&](uint64_t left, uint64_t right) {
+ // Both values are never null nor NaN.
+ auto chunk_left = first_sort_key.GetChunk<ArrayType>(left);
+ auto chunk_right = first_sort_key.GetChunk<ArrayType>(right);
+ auto value_left = chunk_left.Value();
+ auto value_right = chunk_right.Value();
+ if (value_left == value_right) {
+        // If the left value equals the right value, we need to
+        // compare the second and following sort keys.
+ return comparator.Compare(left, right, 1);
+ } else {
+ auto compared = value_left < value_right;
+ if (first_sort_key.order == SortOrder::Ascending) {
+ return compared;
+ } else {
+ return !compared;
+ }
+ }
+ });
+ return comparator_.status();
+ }
+
+  // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For non-float types.
+ template <typename Type>
+ enable_if_t<!is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ if (first_sort_key.null_count == 0) {
+ return indices_end_;
+ }
+ StablePartitioner partitioner;
+ auto nulls_begin =
+ partitioner(indices_begin_, indices_end_, [&first_sort_key](uint64_t index) {
+ const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
+ return !chunk.IsNull();
+ });
+ DCHECK_EQ(indices_end_ - nulls_begin, first_sort_key.null_count);
+ auto& comparator = comparator_;
+ std::stable_sort(nulls_begin, indices_end_, [&](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ return nulls_begin;
+ }
+
+  // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For float types.
+ template <typename Type>
+ enable_if_t<is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ StablePartitioner partitioner;
+ uint64_t* nulls_begin;
+ if (first_sort_key.null_count == 0) {
+ nulls_begin = indices_end_;
+ } else {
+ nulls_begin = partitioner(indices_begin_, indices_end_, [&](uint64_t index) {
+ const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
+ return !chunk.IsNull();
+ });
+ }
+ DCHECK_EQ(indices_end_ - nulls_begin, first_sort_key.null_count);
+ uint64_t* nans_begin = partitioner(indices_begin_, nulls_begin, [&](uint64_t index) {
+ const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
+ return !std::isnan(chunk.Value());
+ });
+ auto& comparator = comparator_;
+ // Sort all NaNs by the second and following sort keys.
+ std::stable_sort(nans_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ // Sort all nulls by the second and following sort keys.
+ std::stable_sort(nulls_begin, indices_end_, [&](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ return nans_begin;
+ }
+
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+ Status status_;
+ std::vector<ResolvedSortKey> sort_keys_;
+ Comparator comparator_;
+};
+
+// ----------------------------------------------------------------------
+// Top-level sort functions
+
+const auto kDefaultSortOptions = SortOptions::Defaults();
+
+const FunctionDoc sort_indices_doc(
+ "Return the indices that would sort an array, record batch or table",
+ ("This function computes an array of indices that define a stable sort\n"
+ "of the input array, record batch or table. Null values are considered\n"
+ "greater than any other value and are therefore sorted at the end of the\n"
+ "input. For floating-point types, NaNs are considered greater than any\n"
+ "other non-null value, but smaller than null values."),
+ {"input"}, "SortOptions");
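+
+// Worked example of the ordering described above: sorting the float array
+// [null, 1.0, NaN, 0.0] in ascending order yields the indices [3, 1, 2, 0],
+// i.e. the value order [0.0, 1.0, NaN, null].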
+
+class SortIndicesMetaFunction : public MetaFunction {
+ public:
+ SortIndicesMetaFunction()
+ : MetaFunction("sort_indices", Arity::Unary(), &sort_indices_doc,
+ &kDefaultSortOptions) {}
+
+ Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const override {
+ const SortOptions& sort_options = static_cast<const SortOptions&>(*options);
+    switch (args[0].kind()) {
+      case Datum::ARRAY:
+        return SortIndices(*args[0].make_array(), sort_options, ctx);
+      case Datum::CHUNKED_ARRAY:
+        return SortIndices(*args[0].chunked_array(), sort_options, ctx);
+      case Datum::RECORD_BATCH:
+        return SortIndices(*args[0].record_batch(), sort_options, ctx);
+      case Datum::TABLE:
+        return SortIndices(*args[0].table(), sort_options, ctx);
+      default:
+        break;
+    }
+ return Status::NotImplemented(
+ "Unsupported types for sort_indices operation: "
+ "values=",
+ args[0].ToString());
+ }
+
+ private:
+ Result<Datum> SortIndices(const Array& values, const SortOptions& options,
+ ExecContext* ctx) const {
+ SortOrder order = SortOrder::Ascending;
+ if (!options.sort_keys.empty()) {
+ order = options.sort_keys[0].order;
+ }
+ ArraySortOptions array_options(order);
+ return CallFunction("array_sort_indices", {values}, &array_options, ctx);
+ }
+
+ Result<Datum> SortIndices(const ChunkedArray& chunked_array, const SortOptions& options,
+ ExecContext* ctx) const {
+ SortOrder order = SortOrder::Ascending;
+ if (!options.sort_keys.empty()) {
+ order = options.sort_keys[0].order;
+ }
+
+ auto out_type = uint64();
+ auto length = chunked_array.length();
+ auto buffer_size = BitUtil::BytesForBits(
+ length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
+ std::vector<std::shared_ptr<Buffer>> buffers(2);
+ ARROW_ASSIGN_OR_RAISE(buffers[1],
+ AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
+ auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
+ auto out_begin = out->GetMutableValues<uint64_t>(1);
+ auto out_end = out_begin + length;
+ std::iota(out_begin, out_end, 0);
+
+ ChunkedArraySorter sorter(ctx, out_begin, out_end, chunked_array, order);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ return Datum(out);
+ }
+
+ Result<Datum> SortIndices(const RecordBatch& batch, const SortOptions& options,
+ ExecContext* ctx) const {
+ auto n_sort_keys = options.sort_keys.size();
+ if (n_sort_keys == 0) {
+ return Status::Invalid("Must specify one or more sort keys");
+ }
+ if (n_sort_keys == 1) {
+ auto array = batch.GetColumnByName(options.sort_keys[0].name);
+ if (!array) {
+ return Status::Invalid("Nonexistent sort key column: ",
+ options.sort_keys[0].name);
+ }
+ return SortIndices(*array, options, ctx);
+ }
+
+ auto out_type = uint64();
+ auto length = batch.num_rows();
+ auto buffer_size = BitUtil::BytesForBits(
+ length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
+ BufferVector buffers(2);
+ ARROW_ASSIGN_OR_RAISE(buffers[1],
+ AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
+ auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
+ auto out_begin = out->GetMutableValues<uint64_t>(1);
+ auto out_end = out_begin + length;
+ std::iota(out_begin, out_end, 0);
+
+ // Radix sorting is consistently faster except when there is a large number
+ // of sort keys, in which case it can end up degrading catastrophically.
+ // Cut off above 8 sort keys.
+ if (n_sort_keys <= 8) {
+ RadixRecordBatchSorter sorter(out_begin, out_end, batch, options);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ } else {
+ MultipleKeyRecordBatchSorter sorter(out_begin, out_end, batch, options);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ }
+ return Datum(out);
+ }
+
+ Result<Datum> SortIndices(const Table& table, const SortOptions& options,
+ ExecContext* ctx) const {
+ auto n_sort_keys = options.sort_keys.size();
+ if (n_sort_keys == 0) {
+ return Status::Invalid("Must specify one or more sort keys");
+ }
+ if (n_sort_keys == 1) {
+ auto chunked_array = table.GetColumnByName(options.sort_keys[0].name);
+ if (!chunked_array) {
+ return Status::Invalid("Nonexistent sort key column: ",
+ options.sort_keys[0].name);
+ }
+ return SortIndices(*chunked_array, options, ctx);
+ }
+
+ auto out_type = uint64();
+ auto length = table.num_rows();
+ auto buffer_size = BitUtil::BytesForBits(
+ length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
+ std::vector<std::shared_ptr<Buffer>> buffers(2);
+ ARROW_ASSIGN_OR_RAISE(buffers[1],
+ AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
+ auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
+ auto out_begin = out->GetMutableValues<uint64_t>(1);
+ auto out_end = out_begin + length;
+ std::iota(out_begin, out_end, 0);
+
+    // TODO: We should choose a suitable sort implementation
+    // automatically. The current TableRadixSorter implementation is
+    // faster than MultipleKeyTableSorter only when the number of
+    // sort keys is 2 and counting sort is used, so we always use
+    // MultipleKeyTableSorter for now.
+ //
+ // TableRadixSorter sorter;
+ // ARROW_RETURN_NOT_OK(sorter.Sort(ctx, out_begin, out_end, table, options));
+ MultipleKeyTableSorter sorter(out_begin, out_end, table, options);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ return Datum(out);
+ }
+};
+
+const auto kDefaultArraySortOptions = ArraySortOptions::Defaults();
+
+const FunctionDoc array_sort_indices_doc(
+ "Return the indices that would sort an array",
+ ("This function computes an array of indices that define a stable sort\n"
+ "of the input array. Null values are considered greater than any\n"
+ "other value and are therefore sorted at the end of the array.\n"
+ "For floating-point types, NaNs are considered greater than any\n"
+ "other non-null value, but smaller than null values."),
+ {"array"}, "ArraySortOptions");
+
+const FunctionDoc partition_nth_indices_doc(
+ "Return the indices that would partition an array around a pivot",
+    ("This function computes an array of indices that define a non-stable\n"
+ "partial sort of the input array.\n"
+ "\n"
+     "The output is such that the `N`'th index points to the `N`'th element\n"
+     "of the input in sorted order, and all indices before the `N`'th point\n"
+     "to elements in the input less than or equal to elements at or after the `N`'th.\n"
+ "\n"
+ "Null values are considered greater than any other value and are\n"
+ "therefore partitioned towards the end of the array.\n"
+ "For floating-point types, NaNs are considered greater than any\n"
+ "other non-null value, but smaller than null values.\n"
+ "\n"
+ "The pivot index `N` must be given in PartitionNthOptions."),
+ {"array"}, "PartitionNthOptions");
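+
+// Worked example: partitioning [5, 2, 9, 1] around pivot N=2 may yield
+// indices such as [3, 1, 0, 2]: position 2 then holds the index of value 5
+// (its sorted place), everything before it points to values <= 5 and
+// everything after to values >= 5. The exact permutation is unspecified
+// since the partition is non-stable.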
+
+} // namespace
+
+void RegisterVectorSort(FunctionRegistry* registry) {
+ // The kernel outputs into preallocated memory and is never null
+ VectorKernel base;
+ base.mem_allocation = MemAllocation::PREALLOCATE;
+ base.null_handling = NullHandling::OUTPUT_NOT_NULL;
+
+ auto array_sort_indices = std::make_shared<VectorFunction>(
+ "array_sort_indices", Arity::Unary(), &array_sort_indices_doc,
+ &kDefaultArraySortOptions);
+ base.init = ArraySortIndicesState::Init;
+ AddSortingKernels<ArraySortIndices>(base, array_sort_indices.get());
+ DCHECK_OK(registry->AddFunction(std::move(array_sort_indices)));
+
+ DCHECK_OK(registry->AddFunction(std::make_shared<SortIndicesMetaFunction>()));
+
+ // partition_nth_indices has a parameter so needs its init function
+ auto part_indices = std::make_shared<VectorFunction>(
+ "partition_nth_indices", Arity::Unary(), &partition_nth_indices_doc);
+ base.init = PartitionNthToIndicesState::Init;
+ AddSortingKernels<PartitionNthToIndices>(base, part_indices.get());
+ DCHECK_OK(registry->AddFunction(std::move(part_indices)));
+}
+
+#undef VISIT_PHYSICAL_TYPES
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc
new file mode 100644
index 00000000000..ca7b6137306
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc
@@ -0,0 +1,199 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/registry.h"
+
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+
+#include "arrow/compute/function.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry_internal.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace compute {
+
+class FunctionRegistry::FunctionRegistryImpl {
+ public:
+ Status AddFunction(std::shared_ptr<Function> function, bool allow_overwrite) {
+ RETURN_NOT_OK(function->Validate());
+
+ std::lock_guard<std::mutex> mutation_guard(lock_);
+
+ const std::string& name = function->name();
+ auto it = name_to_function_.find(name);
+ if (it != name_to_function_.end() && !allow_overwrite) {
+ return Status::KeyError("Already have a function registered with name: ", name);
+ }
+ name_to_function_[name] = std::move(function);
+ return Status::OK();
+ }
+
+ Status AddAlias(const std::string& target_name, const std::string& source_name) {
+ std::lock_guard<std::mutex> mutation_guard(lock_);
+
+ auto it = name_to_function_.find(source_name);
+ if (it == name_to_function_.end()) {
+ return Status::KeyError("No function registered with name: ", source_name);
+ }
+ name_to_function_[target_name] = it->second;
+ return Status::OK();
+ }
+
+ Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
+ bool allow_overwrite = false) {
+ std::lock_guard<std::mutex> mutation_guard(lock_);
+
+ const std::string name = options_type->type_name();
+ auto it = name_to_options_type_.find(name);
+ if (it != name_to_options_type_.end() && !allow_overwrite) {
+ return Status::KeyError(
+ "Already have a function options type registered with name: ", name);
+ }
+ name_to_options_type_[name] = options_type;
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<Function>> GetFunction(const std::string& name) const {
+ auto it = name_to_function_.find(name);
+ if (it == name_to_function_.end()) {
+ return Status::KeyError("No function registered with name: ", name);
+ }
+ return it->second;
+ }
+
+ std::vector<std::string> GetFunctionNames() const {
+ std::vector<std::string> results;
+    for (const auto& it : name_to_function_) {
+ results.push_back(it.first);
+ }
+ std::sort(results.begin(), results.end());
+ return results;
+ }
+
+ Result<const FunctionOptionsType*> GetFunctionOptionsType(
+ const std::string& name) const {
+ auto it = name_to_options_type_.find(name);
+ if (it == name_to_options_type_.end()) {
+ return Status::KeyError("No function options type registered with name: ", name);
+ }
+ return it->second;
+ }
+
+ int num_functions() const { return static_cast<int>(name_to_function_.size()); }
+
+ private:
+ std::mutex lock_;
+ std::unordered_map<std::string, std::shared_ptr<Function>> name_to_function_;
+ std::unordered_map<std::string, const FunctionOptionsType*> name_to_options_type_;
+};
+
+std::unique_ptr<FunctionRegistry> FunctionRegistry::Make() {
+ return std::unique_ptr<FunctionRegistry>(new FunctionRegistry());
+}
+
+FunctionRegistry::FunctionRegistry() { impl_.reset(new FunctionRegistryImpl()); }
+
+FunctionRegistry::~FunctionRegistry() {}
+
+Status FunctionRegistry::AddFunction(std::shared_ptr<Function> function,
+ bool allow_overwrite) {
+ return impl_->AddFunction(std::move(function), allow_overwrite);
+}
+
+Status FunctionRegistry::AddAlias(const std::string& target_name,
+ const std::string& source_name) {
+ return impl_->AddAlias(target_name, source_name);
+}
+
+Status FunctionRegistry::AddFunctionOptionsType(const FunctionOptionsType* options_type,
+ bool allow_overwrite) {
+ return impl_->AddFunctionOptionsType(options_type, allow_overwrite);
+}
+
+Result<std::shared_ptr<Function>> FunctionRegistry::GetFunction(
+ const std::string& name) const {
+ return impl_->GetFunction(name);
+}
+
+std::vector<std::string> FunctionRegistry::GetFunctionNames() const {
+ return impl_->GetFunctionNames();
+}
+
+Result<const FunctionOptionsType*> FunctionRegistry::GetFunctionOptionsType(
+ const std::string& name) const {
+ return impl_->GetFunctionOptionsType(name);
+}
+
+int FunctionRegistry::num_functions() const { return impl_->num_functions(); }
+
+namespace internal {
+
+static std::unique_ptr<FunctionRegistry> CreateBuiltInRegistry() {
+ auto registry = FunctionRegistry::Make();
+
+ // Scalar functions
+ RegisterScalarArithmetic(registry.get());
+ RegisterScalarBoolean(registry.get());
+ RegisterScalarCast(registry.get());
+ RegisterScalarComparison(registry.get());
+ RegisterScalarNested(registry.get());
+ RegisterScalarSetLookup(registry.get());
+ RegisterScalarStringAscii(registry.get());
+ RegisterScalarValidity(registry.get());
+ RegisterScalarFillNull(registry.get());
+ RegisterScalarIfElse(registry.get());
+ RegisterScalarTemporal(registry.get());
+
+ RegisterScalarOptions(registry.get());
+
+ // Vector functions
+ RegisterVectorHash(registry.get());
+ RegisterVectorReplace(registry.get());
+ RegisterVectorSelection(registry.get());
+ RegisterVectorNested(registry.get());
+ RegisterVectorSort(registry.get());
+
+ RegisterVectorOptions(registry.get());
+
+ // Aggregate functions
+ RegisterScalarAggregateBasic(registry.get());
+ RegisterScalarAggregateMode(registry.get());
+ RegisterScalarAggregateQuantile(registry.get());
+ RegisterScalarAggregateTDigest(registry.get());
+ RegisterScalarAggregateVariance(registry.get());
+ RegisterHashAggregateBasic(registry.get());
+
+ RegisterAggregateOptions(registry.get());
+
+ return registry;
+}
+
+} // namespace internal
+
+FunctionRegistry* GetFunctionRegistry() {
+ static auto g_registry = internal::CreateBuiltInRegistry();
+ return g_registry.get();
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h
new file mode 100644
index 00000000000..e83036db6ac
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+class Function;
+class FunctionOptionsType;
+
+/// \brief A mutable central function registry for built-in functions as well
+/// as user-defined functions. Functions are implementations of
+/// arrow::compute::Function.
+///
+/// Generally, each function contains kernels which are implementations of a
+/// function for a specific argument signature. After looking up a function in
+/// the registry, one can either execute it eagerly with Function::Execute or
+/// use one of the function's dispatch methods to pick a suitable kernel for
+/// lower-level function execution.
+class ARROW_EXPORT FunctionRegistry {
+ public:
+ ~FunctionRegistry();
+
+ /// \brief Construct a new registry. Most users only need to use the global
+ /// registry
+ static std::unique_ptr<FunctionRegistry> Make();
+
+ /// \brief Add a new function to the registry. Returns Status::KeyError if a
+ /// function with the same name is already registered
+ Status AddFunction(std::shared_ptr<Function> function, bool allow_overwrite = false);
+
+ /// \brief Add aliases for the given function name. Returns Status::KeyError if the
+ /// function with the given name is not registered
+ Status AddAlias(const std::string& target_name, const std::string& source_name);
+
+ /// \brief Add a new function options type to the registry. Returns Status::KeyError if
+ /// a function options type with the same name is already registered
+ Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
+ bool allow_overwrite = false);
+
+ /// \brief Retrieve a function by name from the registry
+ Result<std::shared_ptr<Function>> GetFunction(const std::string& name) const;
+
+ /// \brief Return vector of all entry names in the registry. Helpful for
+ /// displaying a manifest of available functions
+ std::vector<std::string> GetFunctionNames() const;
+
+ /// \brief Retrieve a function options type by name from the registry
+ Result<const FunctionOptionsType*> GetFunctionOptionsType(
+ const std::string& name) const;
+
+ /// \brief The number of currently registered functions
+ int num_functions() const;
+
+ private:
+ FunctionRegistry();
+
+  // Use the PIMPL pattern to avoid exposing std::unordered_map here
+ class FunctionRegistryImpl;
+ std::unique_ptr<FunctionRegistryImpl> impl_;
+};
+
+/// \brief Return the process-global function registry
+ARROW_EXPORT FunctionRegistry* GetFunctionRegistry();
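+
+// A minimal usage sketch (the "add" entry name is only an example of a
+// registered function; `lhs`, `rhs` and `ctx` are hypothetical Datum and
+// ExecContext values):
+//
+//   auto* registry = arrow::compute::GetFunctionRegistry();
+//   ARROW_ASSIGN_OR_RAISE(auto func, registry->GetFunction("add"));
+//   ARROW_ASSIGN_OR_RAISE(Datum result,
+//                         func->Execute({lhs, rhs}, /*options=*/nullptr, &ctx));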
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h
new file mode 100644
index 00000000000..892b54341da
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+namespace compute {
+
+class FunctionRegistry;
+
+namespace internal {
+
+// Built-in scalar / elementwise functions
+void RegisterScalarArithmetic(FunctionRegistry* registry);
+void RegisterScalarBoolean(FunctionRegistry* registry);
+void RegisterScalarCast(FunctionRegistry* registry);
+void RegisterScalarComparison(FunctionRegistry* registry);
+void RegisterScalarNested(FunctionRegistry* registry);
+void RegisterScalarSetLookup(FunctionRegistry* registry);
+void RegisterScalarStringAscii(FunctionRegistry* registry);
+void RegisterScalarValidity(FunctionRegistry* registry);
+void RegisterScalarFillNull(FunctionRegistry* registry);
+void RegisterScalarIfElse(FunctionRegistry* registry);
+void RegisterScalarTemporal(FunctionRegistry* registry);
+
+void RegisterScalarOptions(FunctionRegistry* registry);
+
+// Vector functions
+void RegisterVectorHash(FunctionRegistry* registry);
+void RegisterVectorReplace(FunctionRegistry* registry);
+void RegisterVectorSelection(FunctionRegistry* registry);
+void RegisterVectorNested(FunctionRegistry* registry);
+void RegisterVectorSort(FunctionRegistry* registry);
+
+void RegisterVectorOptions(FunctionRegistry* registry);
+
+// Aggregate functions
+void RegisterScalarAggregateBasic(FunctionRegistry* registry);
+void RegisterScalarAggregateMode(FunctionRegistry* registry);
+void RegisterScalarAggregateQuantile(FunctionRegistry* registry);
+void RegisterScalarAggregateTDigest(FunctionRegistry* registry);
+void RegisterScalarAggregateVariance(FunctionRegistry* registry);
+void RegisterHashAggregateBasic(FunctionRegistry* registry);
+
+void RegisterAggregateOptions(FunctionRegistry* registry);
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h
new file mode 100644
index 00000000000..eebc8c1b678
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+
+struct Datum;
+struct ValueDescr;
+
+namespace compute {
+
+class Function;
+class FunctionOptions;
+
+class CastOptions;
+
+struct ExecBatch;
+class ExecContext;
+class KernelContext;
+
+struct Kernel;
+struct ScalarKernel;
+struct ScalarAggregateKernel;
+struct VectorKernel;
+
+struct KernelState;
+
+class Expression;
+class ExecNode;
+class ExecPlan;
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/util_internal.h
new file mode 100644
index 00000000000..396c2ca2a0b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/util_internal.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstring>
+
+#include "arrow/buffer.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+static inline void ZeroMemory(Buffer* buffer) {
+ std::memset(buffer->mutable_data(), 0, buffer->size());
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/config.cc b/contrib/libs/apache/arrow/cpp/src/arrow/config.cc
new file mode 100644
index 00000000000..b93f207161d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/config.cc
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/config.h"
+
+#include <cstdint>
+
+#include "arrow/util/config.h"
+#include "arrow/util/cpu_info.h"
+
+namespace arrow {
+
+using internal::CpuInfo;
+
+namespace {
+
+const BuildInfo kBuildInfo = {
+ // clang-format off
+ ARROW_VERSION,
+ ARROW_VERSION_MAJOR,
+ ARROW_VERSION_MINOR,
+ ARROW_VERSION_PATCH,
+ ARROW_VERSION_STRING,
+ ARROW_SO_VERSION,
+ ARROW_FULL_SO_VERSION,
+ ARROW_CXX_COMPILER_ID,
+ ARROW_CXX_COMPILER_VERSION,
+ ARROW_CXX_COMPILER_FLAGS,
+ ARROW_GIT_ID,
+ ARROW_GIT_DESCRIPTION,
+ ARROW_PACKAGE_KIND,
+ // clang-format on
+};
+
+template <typename QueryFlagFunction>
+std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) {
+ if (query_flag(CpuInfo::AVX512)) {
+ return "avx512";
+ } else if (query_flag(CpuInfo::AVX2)) {
+ return "avx2";
+ } else if (query_flag(CpuInfo::AVX)) {
+ return "avx";
+ } else if (query_flag(CpuInfo::SSE4_2)) {
+ return "sse4_2";
+ } else {
+ return "none";
+ }
+}
+
+}  // namespace
+
+const BuildInfo& GetBuildInfo() { return kBuildInfo; }
+
+RuntimeInfo GetRuntimeInfo() {
+ RuntimeInfo info;
+ auto cpu_info = CpuInfo::GetInstance();
+ info.simd_level =
+ MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsSupported(flags); });
+ info.detected_simd_level =
+ MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsDetected(flags); });
+ return info;
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/config.h b/contrib/libs/apache/arrow/cpp/src/arrow/config.h
new file mode 100644
index 00000000000..5ae7e223164
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/config.h
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/util/config.h" // IWYU pragma: export
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+struct BuildInfo {
+ /// The packed version number, e.g. 1002003 (decimal) for Arrow 1.2.3
+ int version;
+ /// The "major" version number, e.g. 1 for Arrow 1.2.3
+ int version_major;
+ /// The "minor" version number, e.g. 2 for Arrow 1.2.3
+ int version_minor;
+ /// The "patch" version number, e.g. 3 for Arrow 1.2.3
+ int version_patch;
+ /// The version string, e.g. "1.2.3"
+ std::string version_string;
+ std::string so_version;
+ std::string full_so_version;
+ std::string compiler_id;
+ std::string compiler_version;
+ std::string compiler_flags;
+ std::string git_id;
+ std::string git_description;
+ std::string package_kind;
+};
+
+struct RuntimeInfo {
+ /// The enabled SIMD level
+ ///
+ /// This can be less than `detected_simd_level` if the ARROW_USER_SIMD_LEVEL
+ /// environment variable is set to another value.
+ std::string simd_level;
+
+ /// The SIMD level available on the OS and CPU
+ std::string detected_simd_level;
+};
+
+/// \brief Get runtime build info.
+///
+/// The returned values correspond to the exact loaded version of the Arrow
+/// library, rather than the values frozen at application compile-time through
+/// the `ARROW_*` preprocessor definitions.
+ARROW_EXPORT
+const BuildInfo& GetBuildInfo();
+
+/// \brief Get runtime info.
+///
+ARROW_EXPORT
+RuntimeInfo GetRuntimeInfo();
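+
+// A minimal usage sketch:
+//
+//   const auto& build = arrow::GetBuildInfo();
+//   std::cout << "Arrow " << build.version_string << ", SIMD level: "
+//             << arrow::GetRuntimeInfo().simd_level << std::endl;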
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/api.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/api.h
new file mode 100644
index 00000000000..7bf39315767
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/api.h
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/csv/options.h"
+#include "arrow/csv/reader.h"
+
+// The writer depends on compute module for casting.
+#ifdef ARROW_COMPUTE
+#include "arrow/csv/writer.h"
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/chunker.cc b/contrib/libs/apache/arrow/cpp/src/arrow/csv/chunker.cc
new file mode 100644
index 00000000000..b3a0dead593
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/chunker.cc
@@ -0,0 +1,300 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/csv/chunker.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/string_view.h"
+
+namespace arrow {
+namespace csv {
+
+namespace {
+
+// NOTE: csvmonkey (https://github.com/dw/csvmonkey) has optimization ideas
+
+template <bool quoting, bool escaping>
+class Lexer {
+ public:
+ enum State {
+ FIELD_START,
+ IN_FIELD,
+ AT_ESCAPE,
+ IN_QUOTED_FIELD,
+ AT_QUOTED_QUOTE,
+ AT_QUOTED_ESCAPE
+ };
+
+ explicit Lexer(const ParseOptions& options) : options_(options) {
+ DCHECK_EQ(quoting, options_.quoting);
+ DCHECK_EQ(escaping, options_.escaping);
+ }
+
+ const char* ReadLine(const char* data, const char* data_end) {
+ // The parsing state machine
+ char c;
+ DCHECK_GT(data_end - data, 0);
+ if (ARROW_PREDICT_TRUE(state_ == FIELD_START)) {
+ goto FieldStart;
+ }
+ switch (state_) {
+ case FIELD_START:
+ goto FieldStart;
+ case IN_FIELD:
+ goto InField;
+ case AT_ESCAPE:
+ goto AtEscape;
+ case IN_QUOTED_FIELD:
+ goto InQuotedField;
+ case AT_QUOTED_QUOTE:
+ goto AtQuotedQuote;
+ case AT_QUOTED_ESCAPE:
+ goto AtQuotedEscape;
+ }
+
+ FieldStart:
+ // At the start of a field
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ state_ = FIELD_START;
+ goto AbortLine;
+ }
+ // Quoting is only recognized at start of field
+ if (quoting && *data == options_.quote_char) {
+ data++;
+ goto InQuotedField;
+ } else {
+ goto InField;
+ }
+
+ InField:
+ // Inside a non-quoted part of a field
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ state_ = IN_FIELD;
+ goto AbortLine;
+ }
+ c = *data++;
+ if (escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ state_ = AT_ESCAPE;
+ goto AbortLine;
+ }
+ data++;
+ goto InField;
+ }
+ if (ARROW_PREDICT_FALSE(c == '\r')) {
+ if (ARROW_PREDICT_TRUE(data != data_end) && *data == '\n') {
+ data++;
+ }
+ goto LineEnd;
+ }
+ if (ARROW_PREDICT_FALSE(c == '\n')) {
+ goto LineEnd;
+ }
+ if (ARROW_PREDICT_FALSE(c == options_.delimiter)) {
+ goto FieldEnd;
+ }
+ goto InField;
+
+ AtEscape:
+ // Coming here if last block ended on a non-quoted escape
+ data++;
+ goto InField;
+
+ InQuotedField:
+ // Inside a quoted part of a field
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ state_ = IN_QUOTED_FIELD;
+ goto AbortLine;
+ }
+ c = *data++;
+ if (escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ state_ = AT_QUOTED_ESCAPE;
+ goto AbortLine;
+ }
+ data++;
+ goto InQuotedField;
+ }
+ if (ARROW_PREDICT_FALSE(c == options_.quote_char)) {
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ state_ = AT_QUOTED_QUOTE;
+ goto AbortLine;
+ }
+ if (options_.double_quote && *data == options_.quote_char) {
+ // Double-quoting
+ data++;
+ } else {
+ // End of single-quoting
+ goto InField;
+ }
+ }
+ goto InQuotedField;
+
+ AtQuotedEscape:
+ // Coming here if last block ended on a quoted escape
+ data++;
+ goto InQuotedField;
+
+ AtQuotedQuote:
+ // Coming here if last block ended on a quoted quote
+ if (options_.double_quote && *data == options_.quote_char) {
+ // Double-quoting
+ data++;
+ goto InQuotedField;
+ } else {
+ // End of single-quoting
+ goto InField;
+ }
+
+ FieldEnd:
+ // At the end of a field
+ goto FieldStart;
+
+ LineEnd:
+ state_ = FIELD_START;
+ return data;
+
+ AbortLine:
+ // Truncated line
+ return nullptr;
+ }
+
+ protected:
+ const ParseOptions& options_;
+ State state_ = FIELD_START;
+};
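+
+// Illustrative sketch (comment only, not part of the upstream sources):
+// ReadLine() is resumable.  When a block ends mid-record it returns
+// nullptr and saves its position in state_, so the next call picks up
+// where the previous one stopped.  Assuming options with quoting enabled
+// and escaping disabled:
+//
+//   Lexer<true, false> lexer(options);
+//   // Block 1 is truncated mid-record: returns nullptr, state_ is saved.
+//   const char* line_end = lexer.ReadLine(buf1, buf1_end);
+//   // Block 2 resumes from the saved state until a line ending is found.
+//   line_end = lexer.ReadLine(buf2, buf2_end);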
+
+// A BoundaryFinder implementation that assumes CSV cells may contain raw
+// newlines, and therefore uses actual CSV lexing to find line boundaries.
+template <bool quoting, bool escaping>
+class LexingBoundaryFinder : public BoundaryFinder {
+ public:
+ explicit LexingBoundaryFinder(ParseOptions options) : options_(std::move(options)) {}
+
+ Status FindFirst(util::string_view partial, util::string_view block,
+ int64_t* out_pos) override {
+ Lexer<quoting, escaping> lexer(options_);
+
+ const char* line_end =
+ lexer.ReadLine(partial.data(), partial.data() + partial.size());
+ DCHECK_EQ(line_end, nullptr); // Otherwise `partial` is a whole CSV line
+ line_end = lexer.ReadLine(block.data(), block.data() + block.size());
+
+ if (line_end == nullptr) {
+ // No complete CSV line
+ *out_pos = -1;
+ } else {
+ *out_pos = static_cast<int64_t>(line_end - block.data());
+ DCHECK_GT(*out_pos, 0);
+ }
+ return Status::OK();
+ }
+
+ Status FindLast(util::string_view block, int64_t* out_pos) override {
+ Lexer<quoting, escaping> lexer(options_);
+
+ const char* data = block.data();
+ const char* const data_end = block.data() + block.size();
+
+ while (data < data_end) {
+ const char* line_end = lexer.ReadLine(data, data_end);
+ if (line_end == nullptr) {
+ // Cannot read any further
+ break;
+ }
+ DCHECK_GT(line_end, data);
+ data = line_end;
+ }
+ if (data == block.data()) {
+ // No complete CSV line
+ *out_pos = -1;
+ } else {
+ *out_pos = static_cast<int64_t>(data - block.data());
+ DCHECK_GT(*out_pos, 0);
+ }
+ return Status::OK();
+ }
+
+ Status FindNth(util::string_view partial, util::string_view block, int64_t count,
+ int64_t* out_pos, int64_t* num_found) override {
+ Lexer<quoting, escaping> lexer(options_);
+ int64_t found = 0;
+ const char* data = block.data();
+ const char* const data_end = block.data() + block.size();
+
+ const char* line_end;
+ if (partial.size()) {
+ line_end = lexer.ReadLine(partial.data(), partial.data() + partial.size());
+ DCHECK_EQ(line_end, nullptr); // Otherwise `partial` is a whole CSV line
+ }
+
+ for (; data < data_end && found < count; ++found) {
+ line_end = lexer.ReadLine(data, data_end);
+ if (line_end == nullptr) {
+ // Cannot read any further
+ break;
+ }
+ DCHECK_GT(line_end, data);
+ data = line_end;
+ }
+
+ if (data == block.data()) {
+ // No complete CSV line
+ *out_pos = kNoDelimiterFound;
+ } else {
+ *out_pos = static_cast<int64_t>(data - block.data());
+ }
+ *num_found = found;
+ return Status::OK();
+ }
+
+ protected:
+ ParseOptions options_;
+};
+
+} // namespace
+
+std::unique_ptr<Chunker> MakeChunker(const ParseOptions& options) {
+ std::shared_ptr<BoundaryFinder> delimiter;
+ if (!options.newlines_in_values) {
+ delimiter = MakeNewlineBoundaryFinder();
+ } else {
+ if (options.quoting) {
+ if (options.escaping) {
+ delimiter = std::make_shared<LexingBoundaryFinder<true, true>>(options);
+ } else {
+ delimiter = std::make_shared<LexingBoundaryFinder<true, false>>(options);
+ }
+ } else {
+ if (options.escaping) {
+ delimiter = std::make_shared<LexingBoundaryFinder<false, true>>(options);
+ } else {
+ delimiter = std::make_shared<LexingBoundaryFinder<false, false>>(options);
+ }
+ }
+ }
+ return internal::make_unique<Chunker>(std::move(delimiter));
+}
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/chunker.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/chunker.h
new file mode 100644
index 00000000000..662b16ec40a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/chunker.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/csv/options.h"
+#include "arrow/status.h"
+#include "arrow/util/delimiting.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace csv {
+
+ARROW_EXPORT
+std::unique_ptr<Chunker> MakeChunker(const ParseOptions& options);
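+
+// Illustrative usage sketch (not part of the upstream sources): with
+// newlines_in_values enabled, the returned Chunker lexes quoted fields,
+// so newlines embedded in cells do not split a record.
+//
+//   auto opts = ParseOptions::Defaults();
+//   opts.newlines_in_values = true;
+//   std::unique_ptr<Chunker> chunker = MakeChunker(opts);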
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_builder.cc b/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_builder.cc
new file mode 100644
index 00000000000..bc974428734
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_builder.cc
@@ -0,0 +1,367 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/chunked_array.h"
+#include "arrow/csv/column_builder.h"
+#include "arrow/csv/converter.h"
+#include "arrow/csv/inference_internal.h"
+#include "arrow/csv/options.h"
+#include "arrow/csv/parser.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/task_group.h"
+
+namespace arrow {
+namespace csv {
+
+class BlockParser;
+
+using internal::TaskGroup;
+
+class ConcreteColumnBuilder : public ColumnBuilder {
+ public:
+ explicit ConcreteColumnBuilder(MemoryPool* pool,
+ std::shared_ptr<internal::TaskGroup> task_group,
+ int32_t col_index = -1)
+ : ColumnBuilder(std::move(task_group)), pool_(pool), col_index_(col_index) {}
+
+ void Append(const std::shared_ptr<BlockParser>& parser) override {
+ Insert(static_cast<int64_t>(chunks_.size()), parser);
+ }
+
+ Result<std::shared_ptr<ChunkedArray>> Finish() override {
+ std::lock_guard<std::mutex> lock(mutex_);
+
+ return FinishUnlocked();
+ }
+
+ protected:
+ virtual std::shared_ptr<DataType> type() const = 0;
+
+ Result<std::shared_ptr<ChunkedArray>> FinishUnlocked() {
+ auto type = this->type();
+ for (const auto& chunk : chunks_) {
+ if (chunk == nullptr) {
+        return Status::UnknownError("a chunk failed to convert for an unknown reason");
+ }
+ DCHECK_EQ(chunk->type()->id(), type->id()) << "Chunk types not equal!";
+ }
+ return std::make_shared<ChunkedArray>(chunks_, std::move(type));
+ }
+
+ void ReserveChunks(int64_t block_index) {
+    // Create a null Array pointer at the back of the list.
+ std::lock_guard<std::mutex> lock(mutex_);
+ ReserveChunksUnlocked(block_index);
+ }
+
+ void ReserveChunksUnlocked(int64_t block_index) {
+    // Create a null Array pointer at the back of the list.
+ size_t chunk_index = static_cast<size_t>(block_index);
+ if (chunks_.size() <= chunk_index) {
+ chunks_.resize(chunk_index + 1);
+ }
+ }
+
+ Status SetChunk(int64_t chunk_index, Result<std::shared_ptr<Array>> maybe_array) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ return SetChunkUnlocked(chunk_index, std::move(maybe_array));
+ }
+
+ Status SetChunkUnlocked(int64_t chunk_index,
+ Result<std::shared_ptr<Array>> maybe_array) {
+ // Should not insert an already built chunk
+ DCHECK_EQ(chunks_[chunk_index], nullptr);
+
+ if (maybe_array.ok()) {
+ chunks_[chunk_index] = *std::move(maybe_array);
+ return Status::OK();
+ } else {
+ return WrapConversionError(maybe_array.status());
+ }
+ }
+
+ Status WrapConversionError(const Status& st) {
+ if (ARROW_PREDICT_TRUE(st.ok())) {
+ return st;
+ } else {
+ std::stringstream ss;
+ ss << "In CSV column #" << col_index_ << ": " << st.message();
+ return st.WithMessage(ss.str());
+ }
+ }
+
+ MemoryPool* pool_;
+ int32_t col_index_;
+
+ ArrayVector chunks_;
+
+ std::mutex mutex_;
+};
+
+//////////////////////////////////////////////////////////////////////////
+// Null column builder implementation (for a column not in the CSV file)
+
+class NullColumnBuilder : public ConcreteColumnBuilder {
+ public:
+ explicit NullColumnBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
+ const std::shared_ptr<internal::TaskGroup>& task_group)
+ : ConcreteColumnBuilder(pool, task_group), type_(type) {}
+
+ void Insert(int64_t block_index, const std::shared_ptr<BlockParser>& parser) override;
+
+ protected:
+ std::shared_ptr<DataType> type() const override { return type_; }
+
+ std::shared_ptr<DataType> type_;
+};
+
+void NullColumnBuilder::Insert(int64_t block_index,
+ const std::shared_ptr<BlockParser>& parser) {
+ ReserveChunks(block_index);
+
+ // Spawn a task that will build an array of nulls with the right DataType
+ const int32_t num_rows = parser->num_rows();
+ DCHECK_GE(num_rows, 0);
+
+ task_group_->Append([=]() -> Status {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(pool_, type_, &builder));
+ std::shared_ptr<Array> res;
+ RETURN_NOT_OK(builder->AppendNulls(num_rows));
+ RETURN_NOT_OK(builder->Finish(&res));
+
+ return SetChunk(block_index, res);
+ });
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Pre-typed column builder implementation
+
+class TypedColumnBuilder : public ConcreteColumnBuilder {
+ public:
+ TypedColumnBuilder(const std::shared_ptr<DataType>& type, int32_t col_index,
+ const ConvertOptions& options, MemoryPool* pool,
+ const std::shared_ptr<internal::TaskGroup>& task_group)
+ : ConcreteColumnBuilder(pool, task_group, col_index),
+ type_(type),
+ options_(options) {}
+
+ Status Init();
+
+ void Insert(int64_t block_index, const std::shared_ptr<BlockParser>& parser) override;
+
+ protected:
+ std::shared_ptr<DataType> type() const override { return type_; }
+
+ std::shared_ptr<DataType> type_;
+ // CAUTION: ConvertOptions can grow large (if it customizes hundreds or
+ // thousands of columns), so avoid copying it in each TypedColumnBuilder.
+ const ConvertOptions& options_;
+
+ std::shared_ptr<Converter> converter_;
+};
+
+Status TypedColumnBuilder::Init() {
+ ARROW_ASSIGN_OR_RAISE(converter_, Converter::Make(type_, options_, pool_));
+ return Status::OK();
+}
+
+void TypedColumnBuilder::Insert(int64_t block_index,
+ const std::shared_ptr<BlockParser>& parser) {
+ DCHECK_NE(converter_, nullptr);
+
+ ReserveChunks(block_index);
+
+ // We're careful that all references in the closure outlive the Append() call
+ task_group_->Append([=]() -> Status {
+ return SetChunk(block_index, converter_->Convert(*parser, col_index_));
+ });
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Type-inferring column builder implementation
+
+class InferringColumnBuilder : public ConcreteColumnBuilder {
+ public:
+ InferringColumnBuilder(int32_t col_index, const ConvertOptions& options,
+ MemoryPool* pool,
+ const std::shared_ptr<internal::TaskGroup>& task_group)
+ : ConcreteColumnBuilder(pool, task_group, col_index),
+ options_(options),
+ infer_status_(options) {}
+
+ Status Init();
+
+ void Insert(int64_t block_index, const std::shared_ptr<BlockParser>& parser) override;
+ Result<std::shared_ptr<ChunkedArray>> Finish() override;
+
+ protected:
+ std::shared_ptr<DataType> type() const override {
+ DCHECK_NE(converter_, nullptr);
+ return converter_->type();
+ }
+
+ Status UpdateType();
+ Status TryConvertChunk(int64_t chunk_index);
+ // This must be called unlocked!
+ void ScheduleConvertChunk(int64_t chunk_index);
+
+ // CAUTION: ConvertOptions can grow large (if it customizes hundreds or
+ // thousands of columns), so avoid copying it in each InferringColumnBuilder.
+ const ConvertOptions& options_;
+
+ // Current inference status
+ InferStatus infer_status_;
+ std::shared_ptr<Converter> converter_;
+
+ // The parsers corresponding to each chunk (for reconverting)
+ std::vector<std::shared_ptr<BlockParser>> parsers_;
+};
+
+Status InferringColumnBuilder::Init() { return UpdateType(); }
+
+Status InferringColumnBuilder::UpdateType() {
+ return infer_status_.MakeConverter(pool_).Value(&converter_);
+}
+
+void InferringColumnBuilder::ScheduleConvertChunk(int64_t chunk_index) {
+ task_group_->Append([=]() { return TryConvertChunk(chunk_index); });
+}
+
+Status InferringColumnBuilder::TryConvertChunk(int64_t chunk_index) {
+ std::unique_lock<std::mutex> lock(mutex_);
+ std::shared_ptr<Converter> converter = converter_;
+ std::shared_ptr<BlockParser> parser = parsers_[chunk_index];
+ InferKind kind = infer_status_.kind();
+
+ DCHECK_NE(parser, nullptr);
+
+ lock.unlock();
+ auto maybe_array = converter->Convert(*parser, col_index_);
+ lock.lock();
+
+ if (kind != infer_status_.kind()) {
+ // infer_kind_ was changed by another task, reconvert
+ lock.unlock();
+ ScheduleConvertChunk(chunk_index);
+ return Status::OK();
+ }
+
+ if (maybe_array.ok() || !infer_status_.can_loosen_type()) {
+ // Conversion succeeded, or failed definitively
+ if (!infer_status_.can_loosen_type()) {
+ // We won't try to reconvert anymore
+ parsers_[chunk_index].reset();
+ }
+ return SetChunkUnlocked(chunk_index, maybe_array);
+ }
+
+ // Conversion failed, try another type
+ infer_status_.LoosenType(maybe_array.status());
+ RETURN_NOT_OK(UpdateType());
+
+ // Reconvert past finished chunks
+ // (unfinished chunks will notice by themselves if they need reconverting)
+ const auto nchunks = static_cast<int64_t>(chunks_.size());
+ for (int64_t i = 0; i < nchunks; ++i) {
+ if (i != chunk_index && chunks_[i]) {
+ // We're assuming the chunk was converted using the wrong type
+ // (which should be true unless the executor reorders tasks)
+ chunks_[i].reset();
+ lock.unlock();
+ ScheduleConvertChunk(i);
+ lock.lock();
+ }
+ }
+
+ // Reconvert this chunk
+ lock.unlock();
+ ScheduleConvertChunk(chunk_index);
+
+ return Status::OK();
+}
+
+void InferringColumnBuilder::Insert(int64_t block_index,
+ const std::shared_ptr<BlockParser>& parser) {
+ // Create a slot for the new chunk and spawn a task to convert it
+ size_t chunk_index = static_cast<size_t>(block_index);
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+
+ DCHECK_NE(converter_, nullptr);
+ if (parsers_.size() <= chunk_index) {
+ parsers_.resize(chunk_index + 1);
+ }
+ // Should not insert an already converting chunk
+ DCHECK_EQ(parsers_[chunk_index], nullptr);
+ parsers_[chunk_index] = parser;
+ ReserveChunksUnlocked(block_index);
+ }
+
+ ScheduleConvertChunk(chunk_index);
+}
+
+Result<std::shared_ptr<ChunkedArray>> InferringColumnBuilder::Finish() {
+ std::lock_guard<std::mutex> lock(mutex_);
+
+ parsers_.clear();
+ return FinishUnlocked();
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Factory functions
+
+Result<std::shared_ptr<ColumnBuilder>> ColumnBuilder::Make(
+ MemoryPool* pool, const std::shared_ptr<DataType>& type, int32_t col_index,
+ const ConvertOptions& options, const std::shared_ptr<TaskGroup>& task_group) {
+ auto ptr =
+ std::make_shared<TypedColumnBuilder>(type, col_index, options, pool, task_group);
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+}
+
+Result<std::shared_ptr<ColumnBuilder>> ColumnBuilder::Make(
+ MemoryPool* pool, int32_t col_index, const ConvertOptions& options,
+ const std::shared_ptr<TaskGroup>& task_group) {
+ auto ptr =
+ std::make_shared<InferringColumnBuilder>(col_index, options, pool, task_group);
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+}
+
+Result<std::shared_ptr<ColumnBuilder>> ColumnBuilder::MakeNull(
+ MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<internal::TaskGroup>& task_group) {
+ return std::make_shared<NullColumnBuilder>(type, pool, task_group);
+}
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_builder.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_builder.h
new file mode 100644
index 00000000000..170a8ad0673
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_builder.h
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace csv {
+
+class BlockParser;
+struct ConvertOptions;
+
+class ARROW_EXPORT ColumnBuilder {
+ public:
+ virtual ~ColumnBuilder() = default;
+
+ /// Spawn a task that will try to convert and append the given CSV block.
+  /// All calls to Append() should happen on the same thread; otherwise,
+  /// call Insert() instead.
+ virtual void Append(const std::shared_ptr<BlockParser>& parser) = 0;
+
+ /// Spawn a task that will try to convert and insert the given CSV block
+ virtual void Insert(int64_t block_index,
+ const std::shared_ptr<BlockParser>& parser) = 0;
+
+ /// Return the final chunked array. The TaskGroup _must_ have finished!
+ virtual Result<std::shared_ptr<ChunkedArray>> Finish() = 0;
+
+ std::shared_ptr<internal::TaskGroup> task_group() { return task_group_; }
+
+ /// Construct a strictly-typed ColumnBuilder.
+ static Result<std::shared_ptr<ColumnBuilder>> Make(
+ MemoryPool* pool, const std::shared_ptr<DataType>& type, int32_t col_index,
+ const ConvertOptions& options,
+ const std::shared_ptr<internal::TaskGroup>& task_group);
+
+ /// Construct a type-inferring ColumnBuilder.
+ static Result<std::shared_ptr<ColumnBuilder>> Make(
+ MemoryPool* pool, int32_t col_index, const ConvertOptions& options,
+ const std::shared_ptr<internal::TaskGroup>& task_group);
+
+ /// Construct a ColumnBuilder for a column of nulls
+ /// (i.e. not present in the CSV file).
+ static Result<std::shared_ptr<ColumnBuilder>> MakeNull(
+ MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<internal::TaskGroup>& task_group);
+
+ protected:
+ explicit ColumnBuilder(std::shared_ptr<internal::TaskGroup> task_group)
+ : task_group_(std::move(task_group)) {}
+
+ std::shared_ptr<internal::TaskGroup> task_group_;
+};
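+
+// Illustrative usage sketch (not part of the upstream sources; assumes a
+// serial task group and an already parsed block `parsed_block`):
+//
+//   auto tg = arrow::internal::TaskGroup::MakeSerial();
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto builder, ColumnBuilder::Make(default_memory_pool(), int64(),
+//                                         /*col_index=*/0,
+//                                         ConvertOptions::Defaults(), tg));
+//   builder->Append(parsed_block);  // spawns a conversion task
+//   RETURN_NOT_OK(tg->Finish());    // waits for pending conversions
+//   ARROW_ASSIGN_OR_RAISE(auto chunked_array, builder->Finish());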
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_decoder.cc b/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_decoder.cc
new file mode 100644
index 00000000000..436d703a9cc
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_decoder.cc
@@ -0,0 +1,243 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/csv/column_decoder.h"
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/csv/converter.h"
+#include "arrow/csv/inference_internal.h"
+#include "arrow/csv/options.h"
+#include "arrow/csv/parser.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/future.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/task_group.h"
+
+namespace arrow {
+namespace csv {
+
+using internal::TaskGroup;
+
+class ConcreteColumnDecoder : public ColumnDecoder {
+ public:
+ explicit ConcreteColumnDecoder(MemoryPool* pool, int32_t col_index = -1)
+ : ColumnDecoder(), pool_(pool), col_index_(col_index) {}
+
+ protected:
+ // XXX useful?
+ virtual std::shared_ptr<DataType> type() const = 0;
+
+ Result<std::shared_ptr<Array>> WrapConversionError(
+ const Result<std::shared_ptr<Array>>& result) {
+ if (ARROW_PREDICT_TRUE(result.ok())) {
+ return result;
+ } else {
+ const auto& st = result.status();
+ std::stringstream ss;
+ ss << "In CSV column #" << col_index_ << ": " << st.message();
+ return st.WithMessage(ss.str());
+ }
+ }
+
+ MemoryPool* pool_;
+ int32_t col_index_;
+  internal::Executor* executor_ = nullptr;
+};
+
+//////////////////////////////////////////////////////////////////////////
+// Null column decoder implementation (for a column not in the CSV file)
+
+class NullColumnDecoder : public ConcreteColumnDecoder {
+ public:
+ explicit NullColumnDecoder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+ : ConcreteColumnDecoder(pool), type_(type) {}
+
+ Future<std::shared_ptr<Array>> Decode(
+ const std::shared_ptr<BlockParser>& parser) override;
+
+ protected:
+ std::shared_ptr<DataType> type() const override { return type_; }
+
+ std::shared_ptr<DataType> type_;
+};
+
+Future<std::shared_ptr<Array>> NullColumnDecoder::Decode(
+ const std::shared_ptr<BlockParser>& parser) {
+ DCHECK_GE(parser->num_rows(), 0);
+ return WrapConversionError(MakeArrayOfNull(type_, parser->num_rows(), pool_));
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Pre-typed column decoder implementation
+
+class TypedColumnDecoder : public ConcreteColumnDecoder {
+ public:
+ TypedColumnDecoder(const std::shared_ptr<DataType>& type, int32_t col_index,
+ const ConvertOptions& options, MemoryPool* pool)
+ : ConcreteColumnDecoder(pool, col_index), type_(type), options_(options) {}
+
+ Status Init();
+
+ Future<std::shared_ptr<Array>> Decode(
+ const std::shared_ptr<BlockParser>& parser) override;
+
+ protected:
+ std::shared_ptr<DataType> type() const override { return type_; }
+
+ std::shared_ptr<DataType> type_;
+ // CAUTION: ConvertOptions can grow large (if it customizes hundreds or
+ // thousands of columns), so avoid copying it in each TypedColumnDecoder.
+ const ConvertOptions& options_;
+
+ std::shared_ptr<Converter> converter_;
+};
+
+Status TypedColumnDecoder::Init() {
+ ARROW_ASSIGN_OR_RAISE(converter_, Converter::Make(type_, options_, pool_));
+ return Status::OK();
+}
+
+Future<std::shared_ptr<Array>> TypedColumnDecoder::Decode(
+ const std::shared_ptr<BlockParser>& parser) {
+ DCHECK_NE(converter_, nullptr);
+ return Future<std::shared_ptr<Array>>::MakeFinished(
+ WrapConversionError(converter_->Convert(*parser, col_index_)));
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Type-inferring column decoder implementation
+
+class InferringColumnDecoder : public ConcreteColumnDecoder {
+ public:
+ InferringColumnDecoder(int32_t col_index, const ConvertOptions& options,
+ MemoryPool* pool)
+ : ConcreteColumnDecoder(pool, col_index),
+ options_(options),
+ infer_status_(options),
+ type_frozen_(false) {
+ first_inference_run_ = Future<>::Make();
+ first_inferrer_ = 0;
+ }
+
+ Status Init();
+
+ Future<std::shared_ptr<Array>> Decode(
+ const std::shared_ptr<BlockParser>& parser) override;
+
+ protected:
+ std::shared_ptr<DataType> type() const override {
+ DCHECK_NE(converter_, nullptr);
+ return converter_->type();
+ }
+
+ Status UpdateType();
+ Result<std::shared_ptr<Array>> RunInference(const std::shared_ptr<BlockParser>& parser);
+
+ // CAUTION: ConvertOptions can grow large (if it customizes hundreds or
+ // thousands of columns), so avoid copying it in each InferringColumnDecoder.
+ const ConvertOptions& options_;
+
+ // Current inference status
+ InferStatus infer_status_;
+ bool type_frozen_;
+ std::atomic<int> first_inferrer_;
+ Future<> first_inference_run_;
+ std::shared_ptr<Converter> converter_;
+};
+
+Status InferringColumnDecoder::Init() { return UpdateType(); }
+
+Status InferringColumnDecoder::UpdateType() {
+ return infer_status_.MakeConverter(pool_).Value(&converter_);
+}
+
+Result<std::shared_ptr<Array>> InferringColumnDecoder::RunInference(
+ const std::shared_ptr<BlockParser>& parser) {
+ while (true) {
+ // (no one else should be updating converter_ concurrently)
+ auto maybe_array = converter_->Convert(*parser, col_index_);
+
+ if (maybe_array.ok() || !infer_status_.can_loosen_type()) {
+ // Conversion succeeded, or failed definitively
+ DCHECK(!type_frozen_);
+ type_frozen_ = true;
+ return maybe_array;
+ }
+ // Conversion failed temporarily, try another type
+ infer_status_.LoosenType(maybe_array.status());
+ auto update_status = UpdateType();
+ if (!update_status.ok()) {
+ return update_status;
+ }
+ }
+}
+
+Future<std::shared_ptr<Array>> InferringColumnDecoder::Decode(
+ const std::shared_ptr<BlockParser>& parser) {
+ bool already_taken = first_inferrer_.fetch_or(1);
+ // First block: run inference
+ if (!already_taken) {
+ auto maybe_array = RunInference(parser);
+ first_inference_run_.MarkFinished();
+ return Future<std::shared_ptr<Array>>::MakeFinished(std::move(maybe_array));
+ }
+
+ // Non-first block: wait for inference to finish on first block now,
+ // without blocking a TaskGroup thread.
+ return first_inference_run_.Then([this, parser] {
+ DCHECK(type_frozen_);
+ auto maybe_array = converter_->Convert(*parser, col_index_);
+    return WrapConversionError(maybe_array);
+ });
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Factory functions
+
+Result<std::shared_ptr<ColumnDecoder>> ColumnDecoder::Make(
+ MemoryPool* pool, int32_t col_index, const ConvertOptions& options) {
+ auto ptr = std::make_shared<InferringColumnDecoder>(col_index, options, pool);
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+}
+
+Result<std::shared_ptr<ColumnDecoder>> ColumnDecoder::Make(
+ MemoryPool* pool, std::shared_ptr<DataType> type, int32_t col_index,
+ const ConvertOptions& options) {
+ auto ptr =
+ std::make_shared<TypedColumnDecoder>(std::move(type), col_index, options, pool);
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+}
+
+Result<std::shared_ptr<ColumnDecoder>> ColumnDecoder::MakeNull(
+ MemoryPool* pool, std::shared_ptr<DataType> type) {
+ return std::make_shared<NullColumnDecoder>(std::move(type), pool);
+}
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_decoder.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_decoder.h
new file mode 100644
index 00000000000..5fbbd5df58b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/column_decoder.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace csv {
+
+class BlockParser;
+struct ConvertOptions;
+
+class ARROW_EXPORT ColumnDecoder {
+ public:
+ virtual ~ColumnDecoder() = default;
+
+  /// Convert the given CSV block asynchronously.
+  /// The returned future yields the decoded array.
+ virtual Future<std::shared_ptr<Array>> Decode(
+ const std::shared_ptr<BlockParser>& parser) = 0;
+
+ /// Construct a strictly-typed ColumnDecoder.
+ static Result<std::shared_ptr<ColumnDecoder>> Make(MemoryPool* pool,
+ std::shared_ptr<DataType> type,
+ int32_t col_index,
+ const ConvertOptions& options);
+
+ /// Construct a type-inferring ColumnDecoder.
+  /// Inference will run only on the first block; the type is frozen afterwards.
+ static Result<std::shared_ptr<ColumnDecoder>> Make(MemoryPool* pool, int32_t col_index,
+ const ConvertOptions& options);
+
+ /// Construct a ColumnDecoder for a column of nulls
+ /// (i.e. not present in the CSV file).
+ static Result<std::shared_ptr<ColumnDecoder>> MakeNull(MemoryPool* pool,
+ std::shared_ptr<DataType> type);
+
+ protected:
+ ColumnDecoder() = default;
+};
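+
+// Illustrative usage sketch (not part of the upstream sources): decode
+// blocks of a type-inferring column.  Inference runs only on the first
+// decoded block; later blocks wait on it through the returned futures.
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto decoder, ColumnDecoder::Make(default_memory_pool(),
+//                                         /*col_index=*/0,
+//                                         ConvertOptions::Defaults()));
+//   Future<std::shared_ptr<Array>> fut = decoder->Decode(parsed_block);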
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/csv/converter.cc
new file mode 100644
index 00000000000..cb72b22b405
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/converter.cc
@@ -0,0 +1,692 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/csv/converter.h"
+
+#include <cstring>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_decimal.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/csv/parser.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/trie.h"
+#include "arrow/util/utf8.h"
+#include "arrow/util/value_parsing.h" // IWYU pragma: keep
+
+namespace arrow {
+namespace csv {
+
+using internal::checked_cast;
+using internal::Trie;
+using internal::TrieBuilder;
+
+namespace {
+
+Status GenericConversionError(const std::shared_ptr<DataType>& type, const uint8_t* data,
+ uint32_t size) {
+ return Status::Invalid("CSV conversion error to ", type->ToString(),
+ ": invalid value '",
+ std::string(reinterpret_cast<const char*>(data), size), "'");
+}
+
+inline bool IsWhitespace(uint8_t c) {
+ if (ARROW_PREDICT_TRUE(c > ' ')) {
+ return false;
+ }
+ return c == ' ' || c == '\t';
+}
+
+// Updates data_inout and size_inout to not include leading/trailing whitespace
+// characters.
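+// For example, {data = " 42 ", size = 4} becomes {data = "42", size = 2}.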
+inline void TrimWhiteSpace(const uint8_t** data_inout, uint32_t* size_inout) {
+ const uint8_t*& data = *data_inout;
+ uint32_t& size = *size_inout;
+ // Skip trailing whitespace
+ if (ARROW_PREDICT_TRUE(size > 0) && ARROW_PREDICT_FALSE(IsWhitespace(data[size - 1]))) {
+ const uint8_t* p = data + size - 1;
+ while (size > 0 && IsWhitespace(*p)) {
+ --size;
+ --p;
+ }
+ }
+ // Skip leading whitespace
+ if (ARROW_PREDICT_TRUE(size > 0) && ARROW_PREDICT_FALSE(IsWhitespace(data[0]))) {
+ while (size > 0 && IsWhitespace(*data)) {
+ --size;
+ ++data;
+ }
+ }
+}
+
+Status InitializeTrie(const std::vector<std::string>& inputs, Trie* trie) {
+ TrieBuilder builder;
+ for (const auto& s : inputs) {
+ RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */));
+ }
+ *trie = builder.Finish();
+ return Status::OK();
+}
+
+// Presize a builder based on parser contents
+template <typename BuilderType>
+enable_if_t<!is_base_binary_type<typename BuilderType::TypeClass>::value, Status>
+PresizeBuilder(const BlockParser& parser, BuilderType* builder) {
+ return builder->Resize(parser.num_rows());
+}
+
+// Same, for variable-sized binary builders
+template <typename T>
+Status PresizeBuilder(const BlockParser& parser, BaseBinaryBuilder<T>* builder) {
+ RETURN_NOT_OK(builder->Resize(parser.num_rows()));
+ return builder->ReserveData(parser.num_bytes());
+}
+
+/////////////////////////////////////////////////////////////////////////
+// Per-type value decoders
+
+struct ValueDecoder {
+ explicit ValueDecoder(const std::shared_ptr<DataType>& type,
+ const ConvertOptions& options)
+ : type_(type), options_(options) {}
+
+ Status Initialize() {
+ // TODO no need to build a separate Trie for each instance
+ return InitializeTrie(options_.null_values, &null_trie_);
+ }
+
+ bool IsNull(const uint8_t* data, uint32_t size, bool quoted) {
+ if (quoted) {
+ return false;
+ }
+ return null_trie_.Find(
+ util::string_view(reinterpret_cast<const char*>(data), size)) >= 0;
+ }
+
+ protected:
+ Trie null_trie_;
+ std::shared_ptr<DataType> type_;
+ const ConvertOptions& options_;
+};
+
+//
+// Value decoder for fixed-size binary
+//
+
+struct FixedSizeBinaryValueDecoder : public ValueDecoder {
+ using value_type = const uint8_t*;
+
+ explicit FixedSizeBinaryValueDecoder(const std::shared_ptr<DataType>& type,
+ const ConvertOptions& options)
+ : ValueDecoder(type, options),
+ byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
+
+ Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
+ if (ARROW_PREDICT_FALSE(size != byte_width_)) {
+ return Status::Invalid("CSV conversion error to ", type_->ToString(), ": got a ",
+ size, "-byte long string");
+ }
+ *out = data;
+ return Status::OK();
+ }
+
+ protected:
+ const uint32_t byte_width_;
+};
+
+//
+// Value decoder for variable-size binary
+//
+
+template <bool CheckUTF8>
+struct BinaryValueDecoder : public ValueDecoder {
+ using value_type = util::string_view;
+
+ using ValueDecoder::ValueDecoder;
+
+ Status Initialize() {
+ util::InitializeUTF8();
+ return ValueDecoder::Initialize();
+ }
+
+ Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
+ if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) {
+ return Status::Invalid("CSV conversion error to ", type_->ToString(),
+ ": invalid UTF8 data");
+ }
+ *out = {reinterpret_cast<const char*>(data), size};
+ return Status::OK();
+ }
+
+ bool IsNull(const uint8_t* data, uint32_t size, bool quoted) {
+ return options_.strings_can_be_null &&
+ (!quoted || options_.quoted_strings_can_be_null) &&
+ ValueDecoder::IsNull(data, size, false /* quoted */);
+ }
+};
+
+//
+// Value decoder for integers and floats
+//
+
+template <typename T>
+struct NumericValueDecoder : public ValueDecoder {
+ using value_type = typename T::c_type;
+
+ using ValueDecoder::ValueDecoder;
+
+ Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
+ // XXX should quoted values be allowed at all?
+ TrimWhiteSpace(&data, &size);
+ if (ARROW_PREDICT_FALSE(
+ !internal::ParseValue<T>(reinterpret_cast<const char*>(data), size, out))) {
+ return GenericConversionError(type_, data, size);
+ }
+ return Status::OK();
+ }
+};
+
+//
+// Value decoder for booleans
+//
+
+struct BooleanValueDecoder : public ValueDecoder {
+ using value_type = bool;
+
+ using ValueDecoder::ValueDecoder;
+
+ Status Initialize() {
+ // TODO no need to build separate Tries for each instance
+ RETURN_NOT_OK(InitializeTrie(options_.true_values, &true_trie_));
+ RETURN_NOT_OK(InitializeTrie(options_.false_values, &false_trie_));
+ return ValueDecoder::Initialize();
+ }
+
+ Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
+ // XXX should quoted values be allowed at all?
+ if (false_trie_.Find(util::string_view(reinterpret_cast<const char*>(data), size)) >=
+ 0) {
+ *out = false;
+ return Status::OK();
+ }
+ if (ARROW_PREDICT_TRUE(true_trie_.Find(util::string_view(
+ reinterpret_cast<const char*>(data), size)) >= 0)) {
+ *out = true;
+ return Status::OK();
+ }
+ return GenericConversionError(type_, data, size);
+ }
+
+ protected:
+ Trie true_trie_;
+ Trie false_trie_;
+};
+
+//
+// Value decoder for decimals
+//
+
+struct DecimalValueDecoder : public ValueDecoder {
+ using value_type = Decimal128;
+
+ explicit DecimalValueDecoder(const std::shared_ptr<DataType>& type,
+ const ConvertOptions& options)
+ : ValueDecoder(type, options),
+ decimal_type_(internal::checked_cast<const DecimalType&>(*type_)),
+ type_precision_(decimal_type_.precision()),
+ type_scale_(decimal_type_.scale()) {}
+
+ Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
+ TrimWhiteSpace(&data, &size);
+ Decimal128 decimal;
+ int32_t precision, scale;
+ util::string_view view(reinterpret_cast<const char*>(data), size);
+ RETURN_NOT_OK(Decimal128::FromString(view, &decimal, &precision, &scale));
+ if (precision > type_precision_) {
+ return Status::Invalid("Error converting '", view, "' to ", type_->ToString(),
+ ": precision not supported by type.");
+ }
+ if (scale != type_scale_) {
+ ARROW_ASSIGN_OR_RAISE(*out, decimal.Rescale(scale, type_scale_));
+ } else {
+ *out = std::move(decimal);
+ }
+ return Status::OK();
+ }
+
+ protected:
+ const DecimalType& decimal_type_;
+ const int32_t type_precision_;
+ const int32_t type_scale_;
+};
+
+//
+// Value decoders for timestamps
+//
+
+struct InlineISO8601ValueDecoder : public ValueDecoder {
+ using value_type = int64_t;
+
+ explicit InlineISO8601ValueDecoder(const std::shared_ptr<DataType>& type,
+ const ConvertOptions& options)
+ : ValueDecoder(type, options),
+ unit_(checked_cast<const TimestampType&>(*type_).unit()) {}
+
+ Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
+ if (ARROW_PREDICT_FALSE(!internal::ParseTimestampISO8601(
+ reinterpret_cast<const char*>(data), size, unit_, out))) {
+ return GenericConversionError(type_, data, size);
+ }
+ return Status::OK();
+ }
+
+ protected:
+ TimeUnit::type unit_;
+};
+
+struct SingleParserTimestampValueDecoder : public ValueDecoder {
+ using value_type = int64_t;
+
+ explicit SingleParserTimestampValueDecoder(const std::shared_ptr<DataType>& type,
+ const ConvertOptions& options)
+ : ValueDecoder(type, options),
+ unit_(checked_cast<const TimestampType&>(*type_).unit()),
+ parser_(*options_.timestamp_parsers[0]) {}
+
+ Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
+ if (ARROW_PREDICT_FALSE(
+ !parser_(reinterpret_cast<const char*>(data), size, unit_, out))) {
+ return GenericConversionError(type_, data, size);
+ }
+ return Status::OK();
+ }
+
+ protected:
+ TimeUnit::type unit_;
+ const TimestampParser& parser_;
+};
+
+struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
+ using value_type = int64_t;
+
+ explicit MultipleParsersTimestampValueDecoder(const std::shared_ptr<DataType>& type,
+ const ConvertOptions& options)
+ : ValueDecoder(type, options),
+ unit_(checked_cast<const TimestampType&>(*type_).unit()),
+ parsers_(GetParsers(options_)) {}
+
+ Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
+ for (const auto& parser : parsers_) {
+ if (parser->operator()(reinterpret_cast<const char*>(data), size, unit_, out)) {
+ return Status::OK();
+ }
+ }
+ return GenericConversionError(type_, data, size);
+ }
+
+ protected:
+ using ParserVector = std::vector<const TimestampParser*>;
+
+ static ParserVector GetParsers(const ConvertOptions& options) {
+ ParserVector parsers(options.timestamp_parsers.size());
+ for (size_t i = 0; i < options.timestamp_parsers.size(); ++i) {
+ parsers[i] = options.timestamp_parsers[i].get();
+ }
+ return parsers;
+ }
+
+ TimeUnit::type unit_;
+ std::vector<const TimestampParser*> parsers_;
+};
+
+/////////////////////////////////////////////////////////////////////////
+// Concrete Converter hierarchy
+
+class ConcreteConverter : public Converter {
+ public:
+ using Converter::Converter;
+};
+
+class ConcreteDictionaryConverter : public DictionaryConverter {
+ public:
+ using DictionaryConverter::DictionaryConverter;
+};
+
+//
+// Concrete Converter for nulls
+//
+
+class NullConverter : public ConcreteConverter {
+ public:
+ NullConverter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
+ MemoryPool* pool)
+ : ConcreteConverter(type, options, pool), decoder_(type_, options_) {}
+
+ Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
+ int32_t col_index) override {
+ NullBuilder builder(pool_);
+
+ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ if (ARROW_PREDICT_TRUE(decoder_.IsNull(data, size, quoted))) {
+ return builder.AppendNull();
+ } else {
+ return GenericConversionError(type_, data, size);
+ }
+ };
+ RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+ std::shared_ptr<Array> res;
+ RETURN_NOT_OK(builder.Finish(&res));
+ return res;
+ }
+
+ protected:
+ Status Initialize() override { return decoder_.Initialize(); }
+
+ ValueDecoder decoder_;
+};
+
+//
+// Concrete Converter for primitives
+//
+
+template <typename T, typename ValueDecoderType>
+class PrimitiveConverter : public ConcreteConverter {
+ public:
+ PrimitiveConverter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
+ MemoryPool* pool)
+ : ConcreteConverter(type, options, pool), decoder_(type_, options_) {}
+
+ Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
+ int32_t col_index) override {
+ using BuilderType = typename TypeTraits<T>::BuilderType;
+ using value_type = typename ValueDecoderType::value_type;
+
+ BuilderType builder(type_, pool_);
+ RETURN_NOT_OK(PresizeBuilder(parser, &builder));
+
+ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ if (decoder_.IsNull(data, size, quoted /* quoted */)) {
+ return builder.AppendNull();
+ }
+ value_type value{};
+ RETURN_NOT_OK(decoder_.Decode(data, size, quoted, &value));
+ builder.UnsafeAppend(value);
+ return Status::OK();
+ };
+ RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+
+ std::shared_ptr<Array> res;
+ RETURN_NOT_OK(builder.Finish(&res));
+ return res;
+ }
+
+ protected:
+ Status Initialize() override { return decoder_.Initialize(); }
+
+ ValueDecoderType decoder_;
+};
+
+//
+// Concrete Converter for dictionaries
+//
+
+template <typename T, typename ValueDecoderType>
+class TypedDictionaryConverter : public ConcreteDictionaryConverter {
+ public:
+ TypedDictionaryConverter(const std::shared_ptr<DataType>& value_type,
+ const ConvertOptions& options, MemoryPool* pool)
+ : ConcreteDictionaryConverter(value_type, options, pool),
+ decoder_(value_type, options_) {}
+
+ Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
+ int32_t col_index) override {
+ // We use a fixed index width so that all column chunks get the same index type
+ using BuilderType = Dictionary32Builder<T>;
+ using value_type = typename ValueDecoderType::value_type;
+
+ BuilderType builder(value_type_, pool_);
+ RETURN_NOT_OK(PresizeBuilder(parser, &builder));
+
+ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ if (decoder_.IsNull(data, size, quoted /* quoted */)) {
+ return builder.AppendNull();
+ }
+ if (ARROW_PREDICT_FALSE(builder.dictionary_length() > max_cardinality_)) {
+ return Status::IndexError("Dictionary length exceeded max cardinality");
+ }
+ value_type value{};
+ RETURN_NOT_OK(decoder_.Decode(data, size, quoted, &value));
+ return builder.Append(value);
+ };
+ RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+
+ std::shared_ptr<Array> res;
+ RETURN_NOT_OK(builder.Finish(&res));
+ return res;
+ }
+
+ void SetMaxCardinality(int32_t max_length) override { max_cardinality_ = max_length; }
+
+ protected:
+ Status Initialize() override {
+ util::InitializeUTF8();
+ return decoder_.Initialize();
+ }
+
+ ValueDecoderType decoder_;
+ int32_t max_cardinality_ = std::numeric_limits<int32_t>::max();
+};
+
+//
+// Concrete Converter factory for timestamps
+//
+
+template <template <typename, typename> class ConverterType>
+std::shared_ptr<Converter> MakeTimestampConverter(const std::shared_ptr<DataType>& type,
+ const ConvertOptions& options,
+ MemoryPool* pool) {
+ if (options.timestamp_parsers.size() == 0) {
+ // Default to ISO-8601
+ return std::make_shared<ConverterType<TimestampType, InlineISO8601ValueDecoder>>(
+ type, options, pool);
+ } else if (options.timestamp_parsers.size() == 1) {
+ // Single user-supplied converter
+ return std::make_shared<
+ ConverterType<TimestampType, SingleParserTimestampValueDecoder>>(type, options,
+ pool);
+ } else {
+ // Multiple converters, must iterate for each value
+ return std::make_shared<
+ ConverterType<TimestampType, MultipleParsersTimestampValueDecoder>>(type, options,
+ pool);
+ }
+}
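+
+// Illustrative note (not part of the upstream sources): the dispatch above
+// keys off ConvertOptions::timestamp_parsers, e.g. (assuming the
+// TimestampParser factory helpers from arrow/util/value_parsing.h):
+//
+//   auto opts = ConvertOptions::Defaults();
+//   opts.timestamp_parsers = {TimestampParser::MakeISO8601(),
+//                             TimestampParser::MakeStrptime("%d/%m/%Y")};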
+
+} // namespace
+
+/////////////////////////////////////////////////////////////////////////
+// Base Converter class implementation
+
+Converter::Converter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
+ MemoryPool* pool)
+ : options_(options), pool_(pool), type_(type) {}
+
+DictionaryConverter::DictionaryConverter(const std::shared_ptr<DataType>& value_type,
+ const ConvertOptions& options, MemoryPool* pool)
+ : Converter(dictionary(int32(), value_type), options, pool),
+ value_type_(value_type) {}
+
+Result<std::shared_ptr<Converter>> Converter::Make(const std::shared_ptr<DataType>& type,
+ const ConvertOptions& options,
+ MemoryPool* pool) {
+ std::shared_ptr<Converter> ptr;
+
+ switch (type->id()) {
+#define CONVERTER_CASE(TYPE_ID, CONVERTER_TYPE) \
+ case TYPE_ID: \
+ ptr.reset(new CONVERTER_TYPE(type, options, pool)); \
+ break;
+
+#define NUMERIC_CONVERTER_CASE(TYPE_ID, TYPE_CLASS) \
+ CONVERTER_CASE(TYPE_ID, \
+ (PrimitiveConverter<TYPE_CLASS, NumericValueDecoder<TYPE_CLASS>>))
+
+ CONVERTER_CASE(Type::NA, NullConverter)
+ NUMERIC_CONVERTER_CASE(Type::INT8, Int8Type)
+ NUMERIC_CONVERTER_CASE(Type::INT16, Int16Type)
+ NUMERIC_CONVERTER_CASE(Type::INT32, Int32Type)
+ NUMERIC_CONVERTER_CASE(Type::INT64, Int64Type)
+ NUMERIC_CONVERTER_CASE(Type::UINT8, UInt8Type)
+ NUMERIC_CONVERTER_CASE(Type::UINT16, UInt16Type)
+ NUMERIC_CONVERTER_CASE(Type::UINT32, UInt32Type)
+ NUMERIC_CONVERTER_CASE(Type::UINT64, UInt64Type)
+ NUMERIC_CONVERTER_CASE(Type::FLOAT, FloatType)
+ NUMERIC_CONVERTER_CASE(Type::DOUBLE, DoubleType)
+ NUMERIC_CONVERTER_CASE(Type::DATE32, Date32Type)
+ NUMERIC_CONVERTER_CASE(Type::DATE64, Date64Type)
+ CONVERTER_CASE(Type::BOOL, (PrimitiveConverter<BooleanType, BooleanValueDecoder>))
+ CONVERTER_CASE(Type::BINARY,
+ (PrimitiveConverter<BinaryType, BinaryValueDecoder<false>>))
+ CONVERTER_CASE(Type::LARGE_BINARY,
+ (PrimitiveConverter<LargeBinaryType, BinaryValueDecoder<false>>))
+ CONVERTER_CASE(Type::FIXED_SIZE_BINARY,
+ (PrimitiveConverter<FixedSizeBinaryType, FixedSizeBinaryValueDecoder>))
+ CONVERTER_CASE(Type::DECIMAL,
+ (PrimitiveConverter<Decimal128Type, DecimalValueDecoder>))
+
+ case Type::TIMESTAMP:
+ ptr = MakeTimestampConverter<PrimitiveConverter>(type, options, pool);
+ break;
+
+ case Type::STRING:
+ if (options.check_utf8) {
+ ptr = std::make_shared<PrimitiveConverter<StringType, BinaryValueDecoder<true>>>(
+ type, options, pool);
+ } else {
+ ptr = std::make_shared<PrimitiveConverter<StringType, BinaryValueDecoder<false>>>(
+ type, options, pool);
+ }
+ break;
+
+ case Type::LARGE_STRING:
+ if (options.check_utf8) {
+ ptr = std::make_shared<
+ PrimitiveConverter<LargeStringType, BinaryValueDecoder<true>>>(type, options,
+ pool);
+ } else {
+ ptr = std::make_shared<
+ PrimitiveConverter<LargeStringType, BinaryValueDecoder<false>>>(type, options,
+ pool);
+ }
+ break;
+
+ case Type::DICTIONARY: {
+ const auto& dict_type = checked_cast<const DictionaryType&>(*type);
+ if (dict_type.index_type()->id() != Type::INT32) {
+ return Status::NotImplemented(
+ "CSV conversion to dictionary only supported for int32 indices, "
+ "got ",
+ type->ToString());
+ }
+ return DictionaryConverter::Make(dict_type.value_type(), options, pool);
+ }
+
+ default: {
+ return Status::NotImplemented("CSV conversion to ", type->ToString(),
+ " is not supported");
+ }
+
+#undef CONVERTER_CASE
+#undef NUMERIC_CONVERTER_CASE
+ }
+ RETURN_NOT_OK(ptr->Initialize());
+ return ptr;
+}
+
+Result<std::shared_ptr<DictionaryConverter>> DictionaryConverter::Make(
+ const std::shared_ptr<DataType>& type, const ConvertOptions& options,
+ MemoryPool* pool) {
+ std::shared_ptr<DictionaryConverter> ptr;
+
+ switch (type->id()) {
+#define CONVERTER_CASE(TYPE_ID, TYPE, VALUE_DECODER_TYPE) \
+ case TYPE_ID: \
+ ptr.reset( \
+ new TypedDictionaryConverter<TYPE, VALUE_DECODER_TYPE>(type, options, pool)); \
+ break;
+
+ // XXX Are 32-bit types useful?
+ CONVERTER_CASE(Type::INT32, Int32Type, NumericValueDecoder<Int32Type>)
+ CONVERTER_CASE(Type::INT64, Int64Type, NumericValueDecoder<Int64Type>)
+ CONVERTER_CASE(Type::UINT32, UInt32Type, NumericValueDecoder<UInt32Type>)
+ CONVERTER_CASE(Type::UINT64, UInt64Type, NumericValueDecoder<UInt64Type>)
+ CONVERTER_CASE(Type::FLOAT, FloatType, NumericValueDecoder<FloatType>)
+ CONVERTER_CASE(Type::DOUBLE, DoubleType, NumericValueDecoder<DoubleType>)
+ CONVERTER_CASE(Type::DECIMAL, Decimal128Type, DecimalValueDecoder)
+ CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryType,
+ FixedSizeBinaryValueDecoder)
+ CONVERTER_CASE(Type::BINARY, BinaryType, BinaryValueDecoder<false>)
+ CONVERTER_CASE(Type::LARGE_BINARY, LargeBinaryType, BinaryValueDecoder<false>)
+
+ case Type::STRING:
+ if (options.check_utf8) {
+ ptr = std::make_shared<
+ TypedDictionaryConverter<StringType, BinaryValueDecoder<true>>>(type, options,
+ pool);
+ } else {
+ ptr = std::make_shared<
+ TypedDictionaryConverter<StringType, BinaryValueDecoder<false>>>(
+ type, options, pool);
+ }
+ break;
+
+ case Type::LARGE_STRING:
+ if (options.check_utf8) {
+ ptr = std::make_shared<
+ TypedDictionaryConverter<LargeStringType, BinaryValueDecoder<true>>>(
+ type, options, pool);
+ } else {
+ ptr = std::make_shared<
+ TypedDictionaryConverter<LargeStringType, BinaryValueDecoder<false>>>(
+ type, options, pool);
+ }
+ break;
+
+ default: {
+ return Status::NotImplemented("CSV dictionary conversion to ", type->ToString(),
+ " is not supported");
+ }
+
+#undef CONVERTER_CASE
+ }
+ RETURN_NOT_OK(ptr->Initialize());
+ return ptr;
+}
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/converter.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/converter.h
new file mode 100644
index 00000000000..639f692f26a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/converter.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/csv/options.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace csv {
+
+class BlockParser;
+
+class ARROW_EXPORT Converter {
+ public:
+ Converter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
+ MemoryPool* pool);
+ virtual ~Converter() = default;
+
+ virtual Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
+ int32_t col_index) = 0;
+
+ std::shared_ptr<DataType> type() const { return type_; }
+
+ // Create a Converter for the given data type
+ static Result<std::shared_ptr<Converter>> Make(
+ const std::shared_ptr<DataType>& type, const ConvertOptions& options,
+ MemoryPool* pool = default_memory_pool());
+
+ protected:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Converter);
+
+ virtual Status Initialize() = 0;
+
+ // CAUTION: ConvertOptions can grow large (if it customizes hundreds or
+ // thousands of columns), so avoid copying it in each Converter.
+ const ConvertOptions& options_;
+ MemoryPool* pool_;
+ std::shared_ptr<DataType> type_;
+};
+
+class ARROW_EXPORT DictionaryConverter : public Converter {
+ public:
+ DictionaryConverter(const std::shared_ptr<DataType>& value_type,
+ const ConvertOptions& options, MemoryPool* pool);
+
+ // If the dictionary length goes above this value, conversion will fail
+ // with Status::IndexError.
+ virtual void SetMaxCardinality(int32_t max_length) = 0;
+
+ // Create a Converter for the given dictionary value type.
+ // The dictionary index type will always be Int32.
+ static Result<std::shared_ptr<DictionaryConverter>> Make(
+ const std::shared_ptr<DataType>& value_type, const ConvertOptions& options,
+ MemoryPool* pool = default_memory_pool());
+
+ protected:
+ std::shared_ptr<DataType> value_type_;
+};
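+
+// Illustrative usage sketch (not part of the upstream sources): convert
+// one column of an already parsed BlockParser `parsed_block`.
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto converter, Converter::Make(timestamp(TimeUnit::SECOND),
+//                                       ConvertOptions::Defaults()));
+//   ARROW_ASSIGN_OR_RAISE(auto array,
+//                         converter->Convert(parsed_block, /*col_index=*/0));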
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/inference_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/inference_internal.h
new file mode 100644
index 00000000000..42486a1ebaf
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/inference_internal.h
@@ -0,0 +1,150 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/csv/converter.h"
+#include "arrow/csv/options.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace csv {
+
+enum class InferKind {
+ Null,
+ Integer,
+ Boolean,
+ Real,
+ Date,
+ Timestamp,
+ TimestampNS,
+ TextDict,
+ BinaryDict,
+ Text,
+ Binary
+};
+
+class InferStatus {
+ public:
+ explicit InferStatus(const ConvertOptions& options)
+ : kind_(InferKind::Null), can_loosen_type_(true), options_(options) {}
+
+ InferKind kind() const { return kind_; }
+
+ bool can_loosen_type() const { return can_loosen_type_; }
+
+ void LoosenType(const Status& conversion_error) {
+ DCHECK(can_loosen_type_);
+
+ switch (kind_) {
+ case InferKind::Null:
+ return SetKind(InferKind::Integer);
+ case InferKind::Integer:
+ return SetKind(InferKind::Boolean);
+ case InferKind::Boolean:
+ return SetKind(InferKind::Date);
+ case InferKind::Date:
+ return SetKind(InferKind::Timestamp);
+ case InferKind::Timestamp:
+ return SetKind(InferKind::TimestampNS);
+ case InferKind::TimestampNS:
+ return SetKind(InferKind::Real);
+ case InferKind::Real:
+ if (options_.auto_dict_encode) {
+ return SetKind(InferKind::TextDict);
+ } else {
+ return SetKind(InferKind::Text);
+ }
+ case InferKind::TextDict:
+ if (conversion_error.IsIndexError()) {
+ // Cardinality too large, fall back to non-dict encoding
+ return SetKind(InferKind::Text);
+ } else {
+ // Assuming UTF8 validation failure
+ return SetKind(InferKind::BinaryDict);
+ }
+ case InferKind::BinaryDict:
+ // Assuming cardinality too large
+ return SetKind(InferKind::Binary);
+ case InferKind::Text:
+ // Assuming UTF8 validation failure
+ return SetKind(InferKind::Binary);
+ default:
+ ARROW_LOG(FATAL) << "Shouldn't come here";
+ }
+ }
+
+ Result<std::shared_ptr<Converter>> MakeConverter(MemoryPool* pool) {
+ auto make_converter =
+ [&](std::shared_ptr<DataType> type) -> Result<std::shared_ptr<Converter>> {
+ return Converter::Make(type, options_, pool);
+ };
+
+ auto make_dict_converter =
+ [&](std::shared_ptr<DataType> type) -> Result<std::shared_ptr<Converter>> {
+ ARROW_ASSIGN_OR_RAISE(auto dict_converter,
+ DictionaryConverter::Make(type, options_, pool));
+ dict_converter->SetMaxCardinality(options_.auto_dict_max_cardinality);
+ return dict_converter;
+ };
+
+ switch (kind_) {
+ case InferKind::Null:
+ return make_converter(null());
+ case InferKind::Integer:
+ return make_converter(int64());
+ case InferKind::Boolean:
+ return make_converter(boolean());
+ case InferKind::Date:
+ return make_converter(date32());
+ case InferKind::Timestamp:
+ return make_converter(timestamp(TimeUnit::SECOND));
+ case InferKind::TimestampNS:
+ return make_converter(timestamp(TimeUnit::NANO));
+ case InferKind::Real:
+ return make_converter(float64());
+ case InferKind::Text:
+ return make_converter(utf8());
+ case InferKind::Binary:
+ return make_converter(binary());
+ case InferKind::TextDict:
+ return make_dict_converter(utf8());
+ case InferKind::BinaryDict:
+ return make_dict_converter(binary());
+ }
+ return Status::UnknownError("Shouldn't come here");
+ }
+
+ protected:
+ void SetKind(InferKind kind) {
+ kind_ = kind;
+ if (kind == InferKind::Binary) {
+ // Binary is the catch-all type
+ can_loosen_type_ = false;
+ }
+ }
+
+ InferKind kind_;
+ bool can_loosen_type_;
+ const ConvertOptions& options_;
+};
+
+} // namespace csv
+} // namespace arrow
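
The loosening ladder above implies a retry loop; the following is a sketch of how
InferStatus might drive it (assumed inputs, simplified error handling, not code
from this diff):

    // Sketch: retry conversion with progressively looser types until it
    // succeeds or Binary (the catch-all kind) is reached.
    arrow::Result<std::shared_ptr<arrow::Array>> InferAndConvert(
        const arrow::csv::BlockParser& parser, int32_t col_index,
        const arrow::csv::ConvertOptions& options, arrow::MemoryPool* pool) {
      arrow::csv::InferStatus infer_status(options);
      while (true) {
        ARROW_ASSIGN_OR_RAISE(auto converter, infer_status.MakeConverter(pool));
        auto maybe_array = converter->Convert(parser, col_index);
        if (maybe_array.ok() || !infer_status.can_loosen_type()) {
          return maybe_array;
        }
        // Loosen the inferred type based on the failure and retry
        infer_status.LoosenType(maybe_array.status());
      }
    }
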
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/options.cc b/contrib/libs/apache/arrow/cpp/src/arrow/csv/options.cc
new file mode 100644
index 00000000000..c71cfdaf295
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/options.cc
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/csv/options.h"
+
+namespace arrow {
+namespace csv {
+
+ParseOptions ParseOptions::Defaults() { return ParseOptions(); }
+
+Status ParseOptions::Validate() const {
+ if (ARROW_PREDICT_FALSE(delimiter == '\n' || delimiter == '\r')) {
+ return Status::Invalid("ParseOptions: delimiter cannot be \\r or \\n");
+ }
+ if (ARROW_PREDICT_FALSE(quoting && (quote_char == '\n' || quote_char == '\r'))) {
+ return Status::Invalid("ParseOptions: quote_char cannot be \\r or \\n");
+ }
+ if (ARROW_PREDICT_FALSE(escaping && (escape_char == '\n' || escape_char == '\r'))) {
+ return Status::Invalid("ParseOptions: escape_char cannot be \\r or \\n");
+ }
+ return Status::OK();
+}
+
+ConvertOptions ConvertOptions::Defaults() {
+ auto options = ConvertOptions();
+ // Same default null / true / false spellings as in Pandas.
+ options.null_values = {"", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN",
+ "-NaN", "-nan", "1.#IND", "1.#QNAN", "N/A", "NA",
+ "NULL", "NaN", "n/a", "nan", "null"};
+ options.true_values = {"1", "True", "TRUE", "true"};
+ options.false_values = {"0", "False", "FALSE", "false"};
+ return options;
+}
+
+Status ConvertOptions::Validate() const { return Status::OK(); }
+
+ReadOptions ReadOptions::Defaults() { return ReadOptions(); }
+
+Status ReadOptions::Validate() const {
+ if (ARROW_PREDICT_FALSE(block_size < 1)) {
+ // Min is 1 because some tests use really small block sizes
+ return Status::Invalid("ReadOptions: block_size must be at least 1: ", block_size);
+ }
+ if (ARROW_PREDICT_FALSE(skip_rows < 0)) {
+ return Status::Invalid("ReadOptions: skip_rows cannot be negative: ", skip_rows);
+ }
+ if (ARROW_PREDICT_FALSE(skip_rows_after_names < 0)) {
+ return Status::Invalid("ReadOptions: skip_rows_after_names cannot be negative: ",
+ skip_rows_after_names);
+ }
+ if (ARROW_PREDICT_FALSE(autogenerate_column_names && !column_names.empty())) {
+ return Status::Invalid(
+ "ReadOptions: autogenerate_column_names cannot be true when column_names are "
+ "provided");
+ }
+ return Status::OK();
+}
+
+WriteOptions WriteOptions::Defaults() { return WriteOptions(); }
+
+Status WriteOptions::Validate() const {
+ if (ARROW_PREDICT_FALSE(batch_size < 1)) {
+ return Status::Invalid("WriteOptions: batch_size must be at least 1: ", batch_size);
+ }
+ return Status::OK();
+}
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/options.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/options.h
new file mode 100644
index 00000000000..5face6f32d8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/options.h
@@ -0,0 +1,189 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/csv/type_fwd.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class DataType;
+class TimestampParser;
+
+namespace csv {
+
+// Silly workaround for https://github.com/michaeljones/breathe/issues/453
+constexpr char kDefaultEscapeChar = '\\';
+
+struct ARROW_EXPORT ParseOptions {
+ // Parsing options
+
+ /// Field delimiter
+ char delimiter = ',';
+ /// Whether quoting is used
+ bool quoting = true;
+ /// Quoting character (if `quoting` is true)
+ char quote_char = '"';
+ /// Whether a quote inside a value is double-quoted
+ bool double_quote = true;
+ /// Whether escaping is used
+ bool escaping = false;
+ /// Escaping character (if `escaping` is true)
+ char escape_char = kDefaultEscapeChar;
+ /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
+ bool newlines_in_values = false;
+ /// Whether empty lines are ignored. If false, an empty line represents
+ /// a single empty value (assuming a one-column CSV file).
+ bool ignore_empty_lines = true;
+
+ /// Create parsing options with default values
+ static ParseOptions Defaults();
+
+ /// \brief Test that all set options are valid
+ Status Validate() const;
+};
+
+struct ARROW_EXPORT ConvertOptions {
+ // Conversion options
+
+ /// Whether to check UTF8 validity of string columns
+ bool check_utf8 = true;
+ /// Optional per-column types (disabling type inference on those columns)
+ std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
+ /// Recognized spellings for null values
+ std::vector<std::string> null_values;
+ /// Recognized spellings for boolean true values
+ std::vector<std::string> true_values;
+ /// Recognized spellings for boolean false values
+ std::vector<std::string> false_values;
+
+ /// Whether string / binary columns can have null values.
+ ///
+ /// If true, then strings in "null_values" are considered null for string columns.
+ /// If false, then all strings are valid string values.
+ bool strings_can_be_null = false;
+ /// Whether string / binary columns can have quoted null values.
+ ///
+ /// If true *and* `strings_can_be_null` is true, then quoted strings in
+ /// "null_values" are also considered null for string columns. Otherwise,
+ /// quoted strings are never considered null.
+ bool quoted_strings_can_be_null = true;
+
+ /// Whether to try to automatically dict-encode string / binary data.
+ /// If true, then when type inference detects a string or binary column,
+ /// it is dict-encoded up to `auto_dict_max_cardinality` distinct values
+ /// (per chunk), after which it switches to regular encoding.
+ ///
+ /// This setting is ignored for non-inferred columns (those in `column_types`).
+ bool auto_dict_encode = false;
+ int32_t auto_dict_max_cardinality = 50;
+
+ // XXX Should we have a separate FilterOptions?
+
+  /// If non-empty, indicates the names of columns from the CSV file that should
+  /// actually be read and converted (in the vector's order).
+ /// Columns not in this vector will be ignored.
+ std::vector<std::string> include_columns;
+ /// If false, columns in `include_columns` but not in the CSV file will error out.
+ /// If true, columns in `include_columns` but not in the CSV file will produce
+ /// a column of nulls (whose type is selected using `column_types`,
+  /// or null by default).
+ /// This option is ignored if `include_columns` is empty.
+ bool include_missing_columns = false;
+
+ /// User-defined timestamp parsers, using the virtual parser interface in
+ /// arrow/util/value_parsing.h. More than one parser can be specified, and
+ /// the CSV conversion logic will try parsing values starting from the
+ /// beginning of this vector. If no parsers are specified, we use the default
+ /// built-in ISO-8601 parser.
+ std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers;
+
+ /// Create conversion options with default values, including conventional
+ /// values for `null_values`, `true_values` and `false_values`
+ static ConvertOptions Defaults();
+
+ /// \brief Test that all set options are valid
+ Status Validate() const;
+};
+
+struct ARROW_EXPORT ReadOptions {
+ // Reader options
+
+ /// Whether to use the global CPU thread pool
+ bool use_threads = true;
+
+ /// \brief Block size we request from the IO layer.
+ ///
+ /// This will determine multi-threading granularity as well as
+ /// the size of individual record batches.
+ /// Minimum valid value for block size is 1
+ int32_t block_size = 1 << 20; // 1 MB
+
+ /// Number of header rows to skip (not including the row of column names, if any)
+ int32_t skip_rows = 0;
+
+ /// Number of rows to skip after the column names are read, if any
+ int32_t skip_rows_after_names = 0;
+
+ /// Column names for the target table.
+ /// If empty, fall back on autogenerate_column_names.
+ std::vector<std::string> column_names;
+
+ /// Whether to autogenerate column names if `column_names` is empty.
+ /// If true, column names will be of the form "f0", "f1"...
+ /// If false, column names will be read from the first CSV row after `skip_rows`.
+ bool autogenerate_column_names = false;
+
+ /// Create read options with default values
+ static ReadOptions Defaults();
+
+ /// \brief Test that all set options are valid
+ Status Validate() const;
+};
+
+/// Experimental
+struct ARROW_EXPORT WriteOptions {
+ /// Whether to write an initial header line with column names
+ bool include_header = true;
+
+ /// \brief Maximum number of rows processed at a time
+ ///
+ /// The CSV writer converts and writes data in batches of N rows.
+ /// This number can impact performance.
+ int32_t batch_size = 1024;
+
+ /// \brief IO context for writing.
+ io::IOContext io_context;
+
+ /// Create write options with default values
+ static WriteOptions Defaults();
+
+ /// \brief Test that all set options are valid
+ Status Validate() const;
+};
+
+} // namespace csv
+} // namespace arrow
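
A configuration sketch for the option structs above (values are illustrative and
"id" is a hypothetical column name): obtain defaults, tweak, then check with
Validate():

    #include "arrow/csv/options.h"
    #include "arrow/type_fwd.h"

    arrow::Status ConfigureOptions() {
      auto read_options = arrow::csv::ReadOptions::Defaults();
      read_options.block_size = 1 << 22;  // 4 MB blocks (illustrative)
      ARROW_RETURN_NOT_OK(read_options.Validate());

      auto parse_options = arrow::csv::ParseOptions::Defaults();
      parse_options.delimiter = ';';  // semicolon-separated input (illustrative)
      ARROW_RETURN_NOT_OK(parse_options.Validate());

      auto convert_options = arrow::csv::ConvertOptions::Defaults();
      // Pin a column's type, disabling inference for that column
      convert_options.column_types["id"] = arrow::int64();
      return convert_options.Validate();
    }
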
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/parser.cc b/contrib/libs/apache/arrow/cpp/src/arrow/csv/parser.cc
new file mode 100644
index 00000000000..446f36a4ee5
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/parser.cc
@@ -0,0 +1,581 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/csv/parser.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <limits>
+#include <utility>
+
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace csv {
+
+using detail::DataBatch;
+using detail::ParsedValueDesc;
+
+namespace {
+
+template <typename... Args>
+Status ParseError(Args&&... args) {
+ return Status::Invalid("CSV parse error: ", std::forward<Args>(args)...);
+}
+
+Status MismatchingColumns(int32_t expected, int32_t actual, int64_t row_num,
+ util::string_view row) {
+ std::string ellipse;
+ if (row.length() > 100) {
+ row = row.substr(0, 96);
+ ellipse = " ...";
+ }
+ if (row_num < 0) {
+ return ParseError("Expected ", expected, " columns, got ", actual, ": ", row,
+ ellipse);
+ }
+ return ParseError("Row #", row_num, ": Expected ", expected, " columns, got ", actual,
+ ": ", row, ellipse);
+}
+
+inline bool IsControlChar(uint8_t c) { return c < ' '; }
+
+template <bool Quoting, bool Escaping>
+class SpecializedOptions {
+ public:
+ static constexpr bool quoting = Quoting;
+ static constexpr bool escaping = Escaping;
+};
+
+// A helper class allocating the buffer for parsed values and writing into it
+// without any further resizes, except at the end.
+class PresizedDataWriter {
+ public:
+ PresizedDataWriter(MemoryPool* pool, uint32_t size)
+ : parsed_size_(0), parsed_capacity_(size) {
+ parsed_buffer_ = *AllocateResizableBuffer(parsed_capacity_, pool);
+ parsed_ = parsed_buffer_->mutable_data();
+ }
+
+ void Finish(std::shared_ptr<Buffer>* out_parsed) {
+ ARROW_CHECK_OK(parsed_buffer_->Resize(parsed_size_));
+ *out_parsed = parsed_buffer_;
+ }
+
+ void BeginLine() { saved_parsed_size_ = parsed_size_; }
+
+ void PushFieldChar(char c) {
+ DCHECK_LT(parsed_size_, parsed_capacity_);
+ parsed_[parsed_size_++] = static_cast<uint8_t>(c);
+ }
+
+ // Rollback the state that was saved in BeginLine()
+ void RollbackLine() { parsed_size_ = saved_parsed_size_; }
+
+ int64_t size() { return parsed_size_; }
+
+ protected:
+ std::shared_ptr<ResizableBuffer> parsed_buffer_;
+ uint8_t* parsed_;
+ int64_t parsed_size_;
+ int64_t parsed_capacity_;
+ // Checkpointing, for when an incomplete line is encountered at end of block
+ int64_t saved_parsed_size_;
+};
+
+template <typename Derived>
+class ValueDescWriter {
+ public:
+ Derived* derived() { return static_cast<Derived*>(this); }
+
+ template <typename DataWriter>
+ void Start(DataWriter& parsed_writer) {
+ derived()->PushValue(
+ {static_cast<uint32_t>(parsed_writer.size()) & 0x7fffffffU, false});
+ }
+
+ void BeginLine() { saved_values_size_ = values_size_; }
+
+ // Rollback the state that was saved in BeginLine()
+ void RollbackLine() { values_size_ = saved_values_size_; }
+
+ void StartField(bool quoted) { quoted_ = quoted; }
+
+ template <typename DataWriter>
+ void FinishField(DataWriter* parsed_writer) {
+ derived()->PushValue(
+ {static_cast<uint32_t>(parsed_writer->size()) & 0x7fffffffU, quoted_});
+ }
+
+ void Finish(std::shared_ptr<Buffer>* out_values) {
+ ARROW_CHECK_OK(values_buffer_->Resize(values_size_ * sizeof(*values_)));
+ *out_values = values_buffer_;
+ }
+
+ protected:
+ ValueDescWriter(MemoryPool* pool, int64_t values_capacity)
+ : values_size_(0), values_capacity_(values_capacity) {
+ values_buffer_ = *AllocateResizableBuffer(values_capacity_ * sizeof(*values_), pool);
+ values_ = reinterpret_cast<ParsedValueDesc*>(values_buffer_->mutable_data());
+ }
+
+ std::shared_ptr<ResizableBuffer> values_buffer_;
+ ParsedValueDesc* values_;
+ int64_t values_size_;
+ int64_t values_capacity_;
+ bool quoted_;
+ // Checkpointing, for when an incomplete line is encountered at end of block
+ int64_t saved_values_size_;
+};
+
+// A helper class handling a growable buffer for values offsets. This class is
+// used when the number of columns is not yet known and we therefore cannot
+// efficiently presize the target area for a given number of rows.
+class ResizableValueDescWriter : public ValueDescWriter<ResizableValueDescWriter> {
+ public:
+ explicit ResizableValueDescWriter(MemoryPool* pool)
+ : ValueDescWriter(pool, /*values_capacity=*/256) {}
+
+ void PushValue(ParsedValueDesc v) {
+ if (ARROW_PREDICT_FALSE(values_size_ == values_capacity_)) {
+ values_capacity_ = values_capacity_ * 2;
+ ARROW_CHECK_OK(values_buffer_->Resize(values_capacity_ * sizeof(*values_)));
+ values_ = reinterpret_cast<ParsedValueDesc*>(values_buffer_->mutable_data());
+ }
+ values_[values_size_++] = v;
+ }
+};
+
+// A helper class allocating the buffer for values offsets and writing into it
+// without any further resizes, except at the end. This class is used once the
+// number of columns is known, as it eliminates resizes and generates simpler,
+// faster CSV parsing code.
+class PresizedValueDescWriter : public ValueDescWriter<PresizedValueDescWriter> {
+ public:
+ PresizedValueDescWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols)
+ : ValueDescWriter(pool, /*values_capacity=*/1 + num_rows * num_cols) {}
+
+ void PushValue(ParsedValueDesc v) {
+ DCHECK_LT(values_size_, values_capacity_);
+ values_[values_size_++] = v;
+ }
+};
+
+} // namespace
+
+class BlockParserImpl {
+ public:
+ BlockParserImpl(MemoryPool* pool, ParseOptions options, int32_t num_cols,
+ int64_t first_row, int32_t max_num_rows)
+ : pool_(pool),
+ options_(options),
+ first_row_(first_row),
+ max_num_rows_(max_num_rows),
+ batch_(num_cols) {}
+
+ const DataBatch& parsed_batch() const { return batch_; }
+
+ int64_t first_row_num() const { return first_row_; }
+
+ template <typename SpecializedOptions, typename ValueDescWriter, typename DataWriter>
+ Status ParseLine(ValueDescWriter* values_writer, DataWriter* parsed_writer,
+ const char* data, const char* data_end, bool is_final,
+ const char** out_data) {
+ int32_t num_cols = 0;
+ char c;
+ const auto start = data;
+
+ DCHECK_GT(data_end, data);
+
+ auto FinishField = [&]() { values_writer->FinishField(parsed_writer); };
+
+ values_writer->BeginLine();
+ parsed_writer->BeginLine();
+
+ // The parsing state machine
+
+ // Special case empty lines: do we start with a newline separator?
+ c = *data;
+ if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
+ if (c == '\r') {
+ data++;
+ if (data < data_end && *data == '\n') {
+ data++;
+ }
+ goto EmptyLine;
+ }
+ if (c == '\n') {
+ data++;
+ goto EmptyLine;
+ }
+ }
+
+ FieldStart:
+ // At the start of a field
+ // Quoting is only recognized at start of field
+ if (SpecializedOptions::quoting &&
+ ARROW_PREDICT_FALSE(*data == options_.quote_char)) {
+ ++data;
+ values_writer->StartField(true /* quoted */);
+ goto InQuotedField;
+ } else {
+ values_writer->StartField(false /* quoted */);
+ goto InField;
+ }
+
+ InField:
+ // Inside a non-quoted part of a field
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ goto AbortLine;
+ }
+ c = *data++;
+ if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ goto AbortLine;
+ }
+ c = *data++;
+ parsed_writer->PushFieldChar(c);
+ goto InField;
+ }
+ if (ARROW_PREDICT_FALSE(c == options_.delimiter)) {
+ goto FieldEnd;
+ }
+ if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
+ if (c == '\r') {
+ // In the middle of a newline separator?
+ if (ARROW_PREDICT_TRUE(data < data_end) && *data == '\n') {
+ data++;
+ }
+ goto LineEnd;
+ }
+ if (c == '\n') {
+ goto LineEnd;
+ }
+ }
+ parsed_writer->PushFieldChar(c);
+ goto InField;
+
+ InQuotedField:
+ // Inside a quoted part of a field
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ goto AbortLine;
+ }
+ c = *data++;
+ if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ goto AbortLine;
+ }
+ c = *data++;
+ parsed_writer->PushFieldChar(c);
+ goto InQuotedField;
+ }
+ if (ARROW_PREDICT_FALSE(c == options_.quote_char)) {
+ if (options_.double_quote && ARROW_PREDICT_TRUE(data < data_end) &&
+ ARROW_PREDICT_FALSE(*data == options_.quote_char)) {
+ // Double-quoting
+ ++data;
+ } else {
+ // End of single-quoting
+ goto InField;
+ }
+ }
+ parsed_writer->PushFieldChar(c);
+ goto InQuotedField;
+
+ FieldEnd:
+ // At the end of a field
+ FinishField();
+ ++num_cols;
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ goto AbortLine;
+ }
+ goto FieldStart;
+
+ LineEnd:
+ // At the end of line
+ FinishField();
+ ++num_cols;
+ if (ARROW_PREDICT_FALSE(num_cols != batch_.num_cols_)) {
+ if (batch_.num_cols_ == -1) {
+ batch_.num_cols_ = num_cols;
+ } else {
+ // Find the end of the line without newline or carriage return
+ auto end = data;
+ if (*(end - 1) == '\n') {
+ --end;
+ }
+ if (*(end - 1) == '\r') {
+ --end;
+ }
+ return MismatchingColumns(batch_.num_cols_, num_cols,
+ first_row_ < 0 ? -1 : first_row_ + batch_.num_rows_,
+ util::string_view(start, end - start));
+ }
+ }
+ ++batch_.num_rows_;
+ *out_data = data;
+ return Status::OK();
+
+ AbortLine:
+ // Not a full line except perhaps if in final block
+ if (is_final) {
+ goto LineEnd;
+ }
+ // Truncated line at end of block, rewind parsed state
+ values_writer->RollbackLine();
+ parsed_writer->RollbackLine();
+ return Status::OK();
+
+ EmptyLine:
+ if (!options_.ignore_empty_lines) {
+ if (batch_.num_cols_ == -1) {
+ // Consider as single value
+ batch_.num_cols_ = 1;
+ }
+ // Record as row of empty (null?) values
+ while (num_cols++ < batch_.num_cols_) {
+ values_writer->StartField(false /* quoted */);
+ FinishField();
+ }
+ ++batch_.num_rows_;
+ }
+ *out_data = data;
+ return Status::OK();
+ }
+
+ template <typename SpecializedOptions, typename ValueDescWriter, typename DataWriter>
+ Status ParseChunk(ValueDescWriter* values_writer, DataWriter* parsed_writer,
+ const char* data, const char* data_end, bool is_final,
+ int32_t rows_in_chunk, const char** out_data,
+ bool* finished_parsing) {
+ int32_t num_rows_deadline = batch_.num_rows_ + rows_in_chunk;
+
+ while (data < data_end && batch_.num_rows_ < num_rows_deadline) {
+ const char* line_end = data;
+ RETURN_NOT_OK(ParseLine<SpecializedOptions>(values_writer, parsed_writer, data,
+ data_end, is_final, &line_end));
+ if (line_end == data) {
+ // Cannot parse any further
+ *finished_parsing = true;
+ break;
+ }
+ data = line_end;
+ }
+ // Append new buffers and update size
+ std::shared_ptr<Buffer> values_buffer;
+ values_writer->Finish(&values_buffer);
+ if (values_buffer->size() > 0) {
+ values_size_ +=
+ static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc) - 1);
+ batch_.values_buffers_.push_back(std::move(values_buffer));
+ }
+ *out_data = data;
+ return Status::OK();
+ }
+
+ template <typename SpecializedOptions>
+ Status ParseSpecialized(const std::vector<util::string_view>& views, bool is_final,
+ uint32_t* out_size) {
+ batch_ = DataBatch{batch_.num_cols_};
+ values_size_ = 0;
+
+ size_t total_view_length = 0;
+ for (const auto& view : views) {
+ total_view_length += view.length();
+ }
+ if (total_view_length > std::numeric_limits<uint32_t>::max()) {
+ return Status::Invalid("CSV block too large");
+ }
+
+ PresizedDataWriter parsed_writer(pool_, static_cast<uint32_t>(total_view_length));
+ uint32_t total_parsed_length = 0;
+
+ for (const auto& view : views) {
+ const char* data = view.data();
+ const char* data_end = view.data() + view.length();
+ bool finished_parsing = false;
+
+ if (batch_.num_cols_ == -1) {
+        // Can't presize values when the number of columns is not known, so first
+        // parse a single line
+ const int32_t rows_in_chunk = 1;
+ ResizableValueDescWriter values_writer(pool_);
+ values_writer.Start(parsed_writer);
+
+ RETURN_NOT_OK(ParseChunk<SpecializedOptions>(&values_writer, &parsed_writer, data,
+ data_end, is_final, rows_in_chunk,
+ &data, &finished_parsing));
+ if (batch_.num_cols_ == -1) {
+ return ParseError("Empty CSV file or block: cannot infer number of columns");
+ }
+ }
+
+ while (!finished_parsing && data < data_end && batch_.num_rows_ < max_num_rows_) {
+ // We know the number of columns, so can presize a values array for
+ // a given number of rows
+ DCHECK_GE(batch_.num_cols_, 0);
+
+ int32_t rows_in_chunk;
+ constexpr int32_t kTargetChunkSize = 32768; // in number of values
+ if (batch_.num_cols_ > 0) {
+ rows_in_chunk = std::min(std::max(kTargetChunkSize / batch_.num_cols_, 512),
+ max_num_rows_ - batch_.num_rows_);
+ } else {
+ rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - batch_.num_rows_);
+ }
+
+ PresizedValueDescWriter values_writer(pool_, rows_in_chunk, batch_.num_cols_);
+ values_writer.Start(parsed_writer);
+
+ RETURN_NOT_OK(ParseChunk<SpecializedOptions>(&values_writer, &parsed_writer, data,
+ data_end, is_final, rows_in_chunk,
+ &data, &finished_parsing));
+ }
+ DCHECK_GE(data, view.data());
+ DCHECK_LE(data, data_end);
+ total_parsed_length += static_cast<uint32_t>(data - view.data());
+
+ if (data < data_end) {
+ // Stopped early, for some reason
+ break;
+ }
+ }
+
+ parsed_writer.Finish(&batch_.parsed_buffer_);
+ batch_.parsed_size_ = static_cast<int32_t>(batch_.parsed_buffer_->size());
+ batch_.parsed_ = batch_.parsed_buffer_->data();
+
+ if (batch_.num_cols_ == -1) {
+ DCHECK_EQ(batch_.num_rows_, 0);
+ }
+ DCHECK_EQ(values_size_, batch_.num_rows_ * batch_.num_cols_);
+#ifndef NDEBUG
+ if (batch_.num_rows_ > 0) {
+ // Ending parsed offset should be equal to number of parsed bytes
+ DCHECK_GT(batch_.values_buffers_.size(), 0);
+ const auto& last_values_buffer = batch_.values_buffers_.back();
+ const auto last_values =
+ reinterpret_cast<const ParsedValueDesc*>(last_values_buffer->data());
+ const auto last_values_size = last_values_buffer->size() / sizeof(ParsedValueDesc);
+ const auto check_parsed_size =
+ static_cast<int32_t>(last_values[last_values_size - 1].offset);
+ DCHECK_EQ(batch_.parsed_size_, check_parsed_size);
+ } else {
+ DCHECK_EQ(batch_.parsed_size_, 0);
+ }
+#endif
+ *out_size = static_cast<uint32_t>(total_parsed_length);
+ return Status::OK();
+ }
+
+ Status Parse(const std::vector<util::string_view>& data, bool is_final,
+ uint32_t* out_size) {
+ if (options_.quoting) {
+ if (options_.escaping) {
+ return ParseSpecialized<SpecializedOptions<true, true>>(data, is_final, out_size);
+ } else {
+ return ParseSpecialized<SpecializedOptions<true, false>>(data, is_final,
+ out_size);
+ }
+ } else {
+ if (options_.escaping) {
+ return ParseSpecialized<SpecializedOptions<false, true>>(data, is_final,
+ out_size);
+ } else {
+ return ParseSpecialized<SpecializedOptions<false, false>>(data, is_final,
+ out_size);
+ }
+ }
+ }
+
+ protected:
+ MemoryPool* pool_;
+ const ParseOptions options_;
+ const int64_t first_row_;
+ // The maximum number of rows to parse from a block
+ int32_t max_num_rows_;
+
+ // Unparsed data size
+ int32_t values_size_;
+ // Parsed data batch
+ DataBatch batch_;
+};
+
+BlockParser::BlockParser(ParseOptions options, int32_t num_cols, int64_t first_row,
+ int32_t max_num_rows)
+ : BlockParser(default_memory_pool(), options, num_cols, first_row, max_num_rows) {}
+
+BlockParser::BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols,
+ int64_t first_row, int32_t max_num_rows)
+ : impl_(new BlockParserImpl(pool, std::move(options), num_cols, first_row,
+ max_num_rows)) {}
+
+BlockParser::~BlockParser() {}
+
+Status BlockParser::Parse(const std::vector<util::string_view>& data,
+ uint32_t* out_size) {
+ return impl_->Parse(data, false /* is_final */, out_size);
+}
+
+Status BlockParser::ParseFinal(const std::vector<util::string_view>& data,
+ uint32_t* out_size) {
+ return impl_->Parse(data, true /* is_final */, out_size);
+}
+
+Status BlockParser::Parse(util::string_view data, uint32_t* out_size) {
+ return impl_->Parse({data}, false /* is_final */, out_size);
+}
+
+Status BlockParser::ParseFinal(util::string_view data, uint32_t* out_size) {
+ return impl_->Parse({data}, true /* is_final */, out_size);
+}
+
+const DataBatch& BlockParser::parsed_batch() const { return impl_->parsed_batch(); }
+
+int64_t BlockParser::first_row_num() const { return impl_->first_row_num(); }
+
+int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
+ const uint8_t** out_data) {
+ const auto end = data + size;
+ int32_t skipped_rows = 0;
+ *out_data = data;
+
+ for (; skipped_rows < num_rows; ++skipped_rows) {
+ uint8_t c;
+ do {
+ while (ARROW_PREDICT_FALSE(data < end && !IsControlChar(*data))) {
+ ++data;
+ }
+ if (ARROW_PREDICT_FALSE(data == end)) {
+ return skipped_rows;
+ }
+ c = *data++;
+ } while (c != '\r' && c != '\n');
+ if (c == '\r' && data < end && *data == '\n') {
+ ++data;
+ }
+ *out_data = data;
+ }
+
+ return skipped_rows;
+}
+
+} // namespace csv
+} // namespace arrow
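
A small sketch of the SkipRows helper implemented above (the CSV content is made
up): it advances past up to `num_rows` newline-terminated rows and returns how
many it actually skipped:

    #include <cstring>
    #include "arrow/csv/parser.h"

    int32_t SkipJunkRows() {
      const char* csv = "junk line 1\njunk line 2\na,b,c\n";  // made-up input
      const uint8_t* rest = nullptr;
      // After the call, `rest` points at "a,b,c\n" and the result is 2
      return arrow::csv::SkipRows(reinterpret_cast<const uint8_t*>(csv),
                                  static_cast<uint32_t>(std::strlen(csv)),
                                  /*num_rows=*/2, &rest);
    }
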
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/parser.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/parser.h
new file mode 100644
index 00000000000..ffc735c228f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/parser.h
@@ -0,0 +1,202 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/csv/options.h"
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace csv {
+
+/// Skip at most num_rows from the given input. The input pointer is updated
+/// and the number of actually skipped rows is returned (may be less than
+/// requested if the input is too short).
+ARROW_EXPORT
+int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
+ const uint8_t** out_data);
+
+class BlockParserImpl;
+
+namespace detail {
+
+struct ParsedValueDesc {
+ uint32_t offset : 31;
+ bool quoted : 1;
+};
+
+class ARROW_EXPORT DataBatch {
+ public:
+ explicit DataBatch(int32_t num_cols) : num_cols_(num_cols) {}
+
+ /// \brief Return the number of parsed rows
+ int32_t num_rows() const { return num_rows_; }
+ /// \brief Return the number of parsed columns
+ int32_t num_cols() const { return num_cols_; }
+ /// \brief Return the total size in bytes of parsed data
+ uint32_t num_bytes() const { return parsed_size_; }
+
+ template <typename Visitor>
+ Status VisitColumn(int32_t col_index, int64_t first_row, Visitor&& visit) const {
+ using detail::ParsedValueDesc;
+
+ int64_t row = first_row;
+ for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) {
+ const auto& values_buffer = values_buffers_[buf_index];
+ const auto values = reinterpret_cast<const ParsedValueDesc*>(values_buffer->data());
+ const auto max_pos =
+ static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc)) - 1;
+ for (int32_t pos = col_index; pos < max_pos; pos += num_cols_, ++row) {
+ auto start = values[pos].offset;
+ auto stop = values[pos + 1].offset;
+ auto quoted = values[pos + 1].quoted;
+ Status status = visit(parsed_ + start, stop - start, quoted);
+ if (ARROW_PREDICT_FALSE(!status.ok())) {
+ if (first_row >= 0) {
+ status = status.WithMessage("Row #", row, ": ", status.message());
+ }
+ ARROW_RETURN_NOT_OK(status);
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename Visitor>
+ Status VisitLastRow(Visitor&& visit) const {
+ using detail::ParsedValueDesc;
+
+ const auto& values_buffer = values_buffers_.back();
+ const auto values = reinterpret_cast<const ParsedValueDesc*>(values_buffer->data());
+ const auto start_pos =
+ static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc)) -
+ num_cols_ - 1;
+ for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
+ auto start = values[start_pos + col_index].offset;
+ auto stop = values[start_pos + col_index + 1].offset;
+ auto quoted = values[start_pos + col_index + 1].quoted;
+ ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
+ }
+ return Status::OK();
+ }
+
+ protected:
+ // The number of rows in this batch
+ int32_t num_rows_ = 0;
+ // The number of columns
+ int32_t num_cols_ = 0;
+
+ // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes?
+ // It may help with null parsing...
+ std::vector<std::shared_ptr<Buffer>> values_buffers_;
+ std::shared_ptr<Buffer> parsed_buffer_;
+ const uint8_t* parsed_ = NULLPTR;
+ int32_t parsed_size_ = 0;
+
+ friend class ::arrow::csv::BlockParserImpl;
+};
+
+} // namespace detail
+
+constexpr int32_t kMaxParserNumRows = 100000;
+
+/// \class BlockParser
+/// \brief A reusable block-based parser for CSV data
+///
+/// The parser takes a block of CSV data and delimits rows and fields,
+/// unquoting and unescaping them on the fly. Parsed data is owned by the
+/// parser, so the original buffer can be discarded after Parse() returns.
+///
+/// If the block is truncated (i.e. not all data can be parsed), it is up
+/// to the caller to arrange the next block to start with the trailing data.
+/// Also, if the previous block ends with CR (0x0d) and a new block starts
+/// with LF (0x0a), the parser will consider the leading newline as an empty
+/// line; the caller should therefore strip it.
+class ARROW_EXPORT BlockParser {
+ public:
+ explicit BlockParser(ParseOptions options, int32_t num_cols = -1,
+ int64_t first_row = -1, int32_t max_num_rows = kMaxParserNumRows);
+ explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -1,
+ int64_t first_row = -1, int32_t max_num_rows = kMaxParserNumRows);
+ ~BlockParser();
+
+ /// \brief Parse a block of data
+ ///
+ /// Parse a block of CSV data, ingesting up to max_num_rows rows.
+ /// The number of bytes actually parsed is returned in out_size.
+ Status Parse(util::string_view data, uint32_t* out_size);
+
+ /// \brief Parse sequential blocks of data
+ ///
+ /// Only the last block is allowed to be truncated.
+ Status Parse(const std::vector<util::string_view>& data, uint32_t* out_size);
+
+ /// \brief Parse the final block of data
+ ///
+ /// Like Parse(), but called with the final block in a file.
+ /// The last row may lack a trailing line separator.
+ Status ParseFinal(util::string_view data, uint32_t* out_size);
+
+ /// \brief Parse the final sequential blocks of data
+ ///
+ /// Only the last block is allowed to be truncated.
+ Status ParseFinal(const std::vector<util::string_view>& data, uint32_t* out_size);
+
+ /// \brief Return the number of parsed rows
+ int32_t num_rows() const { return parsed_batch().num_rows(); }
+ /// \brief Return the number of parsed columns
+ int32_t num_cols() const { return parsed_batch().num_cols(); }
+ /// \brief Return the total size in bytes of parsed data
+ uint32_t num_bytes() const { return parsed_batch().num_bytes(); }
+ /// \brief Return the row number of the first row in the block or -1 if unsupported
+ int64_t first_row_num() const;
+
+ /// \brief Visit parsed values in a column
+ ///
+ /// The signature of the visitor is
+ /// Status(const uint8_t* data, uint32_t size, bool quoted)
+ template <typename Visitor>
+ Status VisitColumn(int32_t col_index, Visitor&& visit) const {
+ return parsed_batch().VisitColumn(col_index, first_row_num(),
+ std::forward<Visitor>(visit));
+ }
+
+ template <typename Visitor>
+ Status VisitLastRow(Visitor&& visit) const {
+ return parsed_batch().VisitLastRow(std::forward<Visitor>(visit));
+ }
+
+ protected:
+ std::unique_ptr<BlockParserImpl> impl_;
+
+ const detail::DataBatch& parsed_batch() const;
+};
+
+} // namespace csv
+} // namespace arrow
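
An end-to-end sketch of the BlockParser API above (the CSV text is illustrative):
parse a final block and visit the values of the first column using the visitor
signature documented for VisitColumn:

    arrow::Status ParseAndVisit() {
      arrow::csv::BlockParser parser(arrow::csv::ParseOptions::Defaults());
      uint32_t parsed_size = 0;
      // ParseFinal tolerates a missing trailing line separator on the last row
      ARROW_RETURN_NOT_OK(parser.ParseFinal(
          arrow::util::string_view("a,b\n1,2\n3,4"), &parsed_size));
      // Visit every value in column 0 ("a", "1", "3")
      return parser.VisitColumn(
          /*col_index=*/0,
          [](const uint8_t* data, uint32_t size, bool quoted) -> arrow::Status {
            // `data`/`size` delimit the unquoted field bytes
            return arrow::Status::OK();
          });
    }
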
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/reader.cc b/contrib/libs/apache/arrow/cpp/src/arrow/csv/reader.cc
new file mode 100644
index 00000000000..1a7836561da
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/reader.cc
@@ -0,0 +1,1279 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/csv/reader.h"
+
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/csv/chunker.h"
+#include "arrow/csv/column_builder.h"
+#include "arrow/csv/column_decoder.h"
+#include "arrow/csv/options.h"
+#include "arrow/csv/parser.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/future.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/task_group.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/utf8.h"
+#include "arrow/util/vector.h"
+
+namespace arrow {
+namespace csv {
+
+using internal::Executor;
+
+namespace {
+
+struct ConversionSchema {
+ struct Column {
+ std::string name;
+ // Physical column index in CSV file
+ int32_t index;
+ // If true, make a column of nulls
+ bool is_missing;
+ // If set, convert the CSV column to this type
+ // If unset (and is_missing is false), infer the type from the CSV column
+ std::shared_ptr<DataType> type;
+ };
+
+ static Column NullColumn(std::string col_name, std::shared_ptr<DataType> type) {
+ return Column{std::move(col_name), -1, true, std::move(type)};
+ }
+
+ static Column TypedColumn(std::string col_name, int32_t col_index,
+ std::shared_ptr<DataType> type) {
+ return Column{std::move(col_name), col_index, false, std::move(type)};
+ }
+
+ static Column InferredColumn(std::string col_name, int32_t col_index) {
+ return Column{std::move(col_name), col_index, false, nullptr};
+ }
+
+ std::vector<Column> columns;
+};
+
+// An iterator of Buffers that makes sure there is no straddling CRLF sequence.
+class CSVBufferIterator {
+ public:
+ static Iterator<std::shared_ptr<Buffer>> Make(
+ Iterator<std::shared_ptr<Buffer>> buffer_iterator) {
+ Transformer<std::shared_ptr<Buffer>, std::shared_ptr<Buffer>> fn =
+ CSVBufferIterator();
+ return MakeTransformedIterator(std::move(buffer_iterator), fn);
+ }
+
+ static AsyncGenerator<std::shared_ptr<Buffer>> MakeAsync(
+ AsyncGenerator<std::shared_ptr<Buffer>> buffer_iterator) {
+ Transformer<std::shared_ptr<Buffer>, std::shared_ptr<Buffer>> fn =
+ CSVBufferIterator();
+ return MakeTransformedGenerator(std::move(buffer_iterator), fn);
+ }
+
+ Result<TransformFlow<std::shared_ptr<Buffer>>> operator()(std::shared_ptr<Buffer> buf) {
+ if (buf == nullptr) {
+ // EOF
+ return TransformFinish();
+ }
+
+ int64_t offset = 0;
+ if (first_buffer_) {
+ ARROW_ASSIGN_OR_RAISE(auto data, util::SkipUTF8BOM(buf->data(), buf->size()));
+ offset += data - buf->data();
+ DCHECK_GE(offset, 0);
+ first_buffer_ = false;
+ }
+
+ if (trailing_cr_ && buf->data()[offset] == '\n') {
+ // Skip '\r\n' line separator that started at the end of previous buffer
+ ++offset;
+ }
+
+ trailing_cr_ = (buf->data()[buf->size() - 1] == '\r');
+ buf = SliceBuffer(buf, offset);
+ if (buf->size() == 0) {
+ // EOF
+ return TransformFinish();
+ } else {
+ return TransformYield(buf);
+ }
+ }
+
+ protected:
+ bool first_buffer_ = true;
+ // Whether there was a trailing CR at the end of last received buffer
+ bool trailing_cr_ = false;
+};
+
+struct CSVBlock {
+ // (partial + completion + buffer) is an entire delimited CSV buffer.
+ std::shared_ptr<Buffer> partial;
+ std::shared_ptr<Buffer> completion;
+ std::shared_ptr<Buffer> buffer;
+ int64_t block_index;
+ bool is_final;
+ int64_t bytes_skipped;
+ std::function<Status(int64_t)> consume_bytes;
+};
+
+} // namespace
+} // namespace csv
+
+template <>
+struct IterationTraits<csv::CSVBlock> {
+ static csv::CSVBlock End() { return csv::CSVBlock{{}, {}, {}, -1, true, 0, {}}; }
+ static bool IsEnd(const csv::CSVBlock& val) { return val.block_index < 0; }
+};
+
+namespace csv {
+namespace {
+
+// This is a callable that can be used to transform an iterator. The source iterator
+// will contain buffers of data and the output iterator will contain delimited CSV
+// blocks. util::optional is used so that there is an end token (required by the
+// iterator APIs, e.g. Visit) even though an empty optional is never used in this code.
+class BlockReader {
+ public:
+ BlockReader(std::unique_ptr<Chunker> chunker, std::shared_ptr<Buffer> first_buffer,
+ int64_t skip_rows)
+ : chunker_(std::move(chunker)),
+ partial_(std::make_shared<Buffer>("")),
+ buffer_(std::move(first_buffer)),
+ skip_rows_(skip_rows) {}
+
+ protected:
+ std::unique_ptr<Chunker> chunker_;
+ std::shared_ptr<Buffer> partial_, buffer_;
+ int64_t skip_rows_;
+ int64_t block_index_ = 0;
+ // Whether there was a trailing CR at the end of last received buffer
+ bool trailing_cr_ = false;
+};
+
+// An object that reads delimited CSV blocks for serial use.
+// The number of bytes consumed should be reported after each read,
+// using CSVBlock::consume_bytes.
+class SerialBlockReader : public BlockReader {
+ public:
+ using BlockReader::BlockReader;
+
+ static Iterator<CSVBlock> MakeIterator(
+ Iterator<std::shared_ptr<Buffer>> buffer_iterator, std::unique_ptr<Chunker> chunker,
+ std::shared_ptr<Buffer> first_buffer, int64_t skip_rows) {
+ auto block_reader =
+ std::make_shared<SerialBlockReader>(std::move(chunker), first_buffer, skip_rows);
+ // Wrap shared pointer in callable
+ Transformer<std::shared_ptr<Buffer>, CSVBlock> block_reader_fn =
+ [block_reader](std::shared_ptr<Buffer> buf) {
+ return (*block_reader)(std::move(buf));
+ };
+ return MakeTransformedIterator(std::move(buffer_iterator), block_reader_fn);
+ }
+
+ static AsyncGenerator<CSVBlock> MakeAsyncIterator(
+ AsyncGenerator<std::shared_ptr<Buffer>> buffer_generator,
+ std::unique_ptr<Chunker> chunker, std::shared_ptr<Buffer> first_buffer,
+ int64_t skip_rows) {
+ auto block_reader =
+ std::make_shared<SerialBlockReader>(std::move(chunker), first_buffer, skip_rows);
+ // Wrap shared pointer in callable
+ Transformer<std::shared_ptr<Buffer>, CSVBlock> block_reader_fn =
+ [block_reader](std::shared_ptr<Buffer> next) {
+ return (*block_reader)(std::move(next));
+ };
+ return MakeTransformedGenerator(std::move(buffer_generator), block_reader_fn);
+ }
+
+ Result<TransformFlow<CSVBlock>> operator()(std::shared_ptr<Buffer> next_buffer) {
+ if (buffer_ == nullptr) {
+ return TransformFinish();
+ }
+
+ bool is_final = (next_buffer == nullptr);
+ int64_t bytes_skipped = 0;
+
+ if (skip_rows_) {
+ bytes_skipped += partial_->size();
+ auto orig_size = buffer_->size();
+ RETURN_NOT_OK(
+ chunker_->ProcessSkip(partial_, buffer_, is_final, &skip_rows_, &buffer_));
+ bytes_skipped += orig_size - buffer_->size();
+ auto empty = std::make_shared<Buffer>(nullptr, 0);
+ if (skip_rows_) {
+        // Still have rows beyond this buffer to skip; return an empty block
+ partial_ = std::move(buffer_);
+ buffer_ = next_buffer;
+ return TransformYield<CSVBlock>(CSVBlock{empty, empty, empty, block_index_++,
+ is_final, bytes_skipped,
+ [](int64_t) { return Status::OK(); }});
+ }
+ partial_ = std::move(empty);
+ }
+
+ std::shared_ptr<Buffer> completion;
+
+ if (is_final) {
+ // End of file reached => compute completion from penultimate block
+ RETURN_NOT_OK(chunker_->ProcessFinal(partial_, buffer_, &completion, &buffer_));
+ } else {
+ // Get completion of partial from previous block.
+ RETURN_NOT_OK(
+ chunker_->ProcessWithPartial(partial_, buffer_, &completion, &buffer_));
+ }
+ int64_t bytes_before_buffer = partial_->size() + completion->size();
+
+ auto consume_bytes = [this, bytes_before_buffer,
+ next_buffer](int64_t nbytes) -> Status {
+ DCHECK_GE(nbytes, 0);
+ auto offset = nbytes - bytes_before_buffer;
+ if (offset < 0) {
+ // Should not happen
+ return Status::Invalid("CSV parser got out of sync with chunker");
+ }
+ partial_ = SliceBuffer(buffer_, offset);
+ buffer_ = next_buffer;
+ return Status::OK();
+ };
+
+ return TransformYield<CSVBlock>(CSVBlock{partial_, completion, buffer_,
+ block_index_++, is_final, bytes_skipped,
+ std::move(consume_bytes)});
+ }
+};
+
+// An object that reads delimited CSV blocks for threaded use.
+class ThreadedBlockReader : public BlockReader {
+ public:
+ using BlockReader::BlockReader;
+
+ static AsyncGenerator<CSVBlock> MakeAsyncIterator(
+ AsyncGenerator<std::shared_ptr<Buffer>> buffer_generator,
+ std::unique_ptr<Chunker> chunker, std::shared_ptr<Buffer> first_buffer,
+ int64_t skip_rows) {
+ auto block_reader = std::make_shared<ThreadedBlockReader>(std::move(chunker),
+ first_buffer, skip_rows);
+ // Wrap shared pointer in callable
+ Transformer<std::shared_ptr<Buffer>, CSVBlock> block_reader_fn =
+ [block_reader](std::shared_ptr<Buffer> next) { return (*block_reader)(next); };
+ return MakeTransformedGenerator(std::move(buffer_generator), block_reader_fn);
+ }
+
+ Result<TransformFlow<CSVBlock>> operator()(std::shared_ptr<Buffer> next_buffer) {
+ if (buffer_ == nullptr) {
+ // EOF
+ return TransformFinish();
+ }
+
+ bool is_final = (next_buffer == nullptr);
+
+ auto current_partial = std::move(partial_);
+ auto current_buffer = std::move(buffer_);
+ int64_t bytes_skipped = 0;
+
+ if (skip_rows_) {
+ auto orig_size = current_buffer->size();
+ bytes_skipped = current_partial->size();
+ RETURN_NOT_OK(chunker_->ProcessSkip(current_partial, current_buffer, is_final,
+ &skip_rows_, &current_buffer));
+ bytes_skipped += orig_size - current_buffer->size();
+ current_partial = std::make_shared<Buffer>(nullptr, 0);
+ if (skip_rows_) {
+ partial_ = std::move(current_buffer);
+ buffer_ = std::move(next_buffer);
+ return TransformYield<CSVBlock>(CSVBlock{current_partial,
+ current_partial,
+ current_partial,
+ block_index_++,
+ is_final,
+ bytes_skipped,
+ {}});
+ }
+ }
+
+ std::shared_ptr<Buffer> whole, completion, next_partial;
+
+ if (is_final) {
+ // End of file reached => compute completion from penultimate block
+ RETURN_NOT_OK(
+ chunker_->ProcessFinal(current_partial, current_buffer, &completion, &whole));
+ } else {
+ // Get completion of partial from previous block.
+ std::shared_ptr<Buffer> starts_with_whole;
+ RETURN_NOT_OK(chunker_->ProcessWithPartial(current_partial, current_buffer,
+ &completion, &starts_with_whole));
+
+ // Get a complete CSV block inside `partial + block`, and keep
+ // the rest for the next iteration.
+ RETURN_NOT_OK(chunker_->Process(starts_with_whole, &whole, &next_partial));
+ }
+
+ partial_ = std::move(next_partial);
+ buffer_ = std::move(next_buffer);
+
+ return TransformYield<CSVBlock>(CSVBlock{
+ current_partial, completion, whole, block_index_++, is_final, bytes_skipped, {}});
+ }
+};
+
+struct ParsedBlock {
+ std::shared_ptr<BlockParser> parser;
+ int64_t block_index;
+ int64_t bytes_parsed_or_skipped;
+};
+
+struct DecodedBlock {
+ std::shared_ptr<RecordBatch> record_batch;
+  // The number of input bytes represented by this batch.
+  // This includes bytes skipped when skipping rows after the header.
+ int64_t bytes_processed;
+};
+
+} // namespace
+
+} // namespace csv
+
+template <>
+struct IterationTraits<csv::ParsedBlock> {
+ static csv::ParsedBlock End() { return csv::ParsedBlock{nullptr, -1, -1}; }
+ static bool IsEnd(const csv::ParsedBlock& val) { return val.block_index < 0; }
+};
+
+template <>
+struct IterationTraits<csv::DecodedBlock> {
+ static csv::DecodedBlock End() { return csv::DecodedBlock{nullptr, -1}; }
+ static bool IsEnd(const csv::DecodedBlock& val) { return val.bytes_processed < 0; }
+};
+
+namespace csv {
+namespace {
+
+// A function object that takes in a buffer of CSV data and returns a parsed batch of CSV
+// data (CSVBlock -> ParsedBlock) for use with MakeMappedGenerator.
+// The parsed batch contains a list of offsets for each of the columns so that columns
+// can be individually scanned
+//
+// This operator is not re-entrant
+class BlockParsingOperator {
+ public:
+ BlockParsingOperator(io::IOContext io_context, ParseOptions parse_options,
+ int num_csv_cols, int64_t first_row)
+ : io_context_(io_context),
+ parse_options_(parse_options),
+ num_csv_cols_(num_csv_cols),
+ count_rows_(first_row >= 0),
+ num_rows_seen_(first_row) {}
+
+ Result<ParsedBlock> operator()(const CSVBlock& block) {
+ constexpr int32_t max_num_rows = std::numeric_limits<int32_t>::max();
+ auto parser = std::make_shared<BlockParser>(
+ io_context_.pool(), parse_options_, num_csv_cols_, num_rows_seen_, max_num_rows);
+
+ std::shared_ptr<Buffer> straddling;
+ std::vector<util::string_view> views;
+ if (block.partial->size() != 0 || block.completion->size() != 0) {
+ if (block.partial->size() == 0) {
+ straddling = block.completion;
+ } else if (block.completion->size() == 0) {
+ straddling = block.partial;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ straddling,
+ ConcatenateBuffers({block.partial, block.completion}, io_context_.pool()));
+ }
+ views = {util::string_view(*straddling), util::string_view(*block.buffer)};
+ } else {
+ views = {util::string_view(*block.buffer)};
+ }
+ uint32_t parsed_size;
+ if (block.is_final) {
+ RETURN_NOT_OK(parser->ParseFinal(views, &parsed_size));
+ } else {
+ RETURN_NOT_OK(parser->Parse(views, &parsed_size));
+ }
+ if (count_rows_) {
+ num_rows_seen_ += parser->num_rows();
+ }
+ RETURN_NOT_OK(block.consume_bytes(parsed_size));
+ return ParsedBlock{std::move(parser), block.block_index,
+ static_cast<int64_t>(parsed_size) + block.bytes_skipped};
+ }
+
+ private:
+ io::IOContext io_context_;
+ ParseOptions parse_options_;
+ int num_csv_cols_;
+ bool count_rows_;
+ int64_t num_rows_seen_;
+};
+
+// A function object that takes in a parsed batch of CSV data and decodes it to an Arrow
+// record batch (ParsedBlock -> DecodedBlock) for use with MakeMappedGenerator.
+class BlockDecodingOperator {
+ public:
+ Future<DecodedBlock> operator()(const ParsedBlock& block) {
+ DCHECK(!state_->column_decoders.empty());
+ std::vector<Future<std::shared_ptr<Array>>> decoded_array_futs;
+ for (auto& decoder : state_->column_decoders) {
+ decoded_array_futs.push_back(decoder->Decode(block.parser));
+ }
+ auto bytes_parsed_or_skipped = block.bytes_parsed_or_skipped;
+ auto decoded_arrays_fut = All(std::move(decoded_array_futs));
+ auto state = state_;
+ return decoded_arrays_fut.Then(
+ [state, bytes_parsed_or_skipped](
+ const std::vector<Result<std::shared_ptr<Array>>>& maybe_decoded_arrays)
+ -> Result<DecodedBlock> {
+ ARROW_ASSIGN_OR_RAISE(auto decoded_arrays,
+ internal::UnwrapOrRaise(maybe_decoded_arrays));
+
+ ARROW_ASSIGN_OR_RAISE(auto batch,
+ state->DecodedArraysToBatch(std::move(decoded_arrays)));
+ return DecodedBlock{std::move(batch), bytes_parsed_or_skipped};
+ });
+ }
+
+ static Result<BlockDecodingOperator> Make(io::IOContext io_context,
+ ConvertOptions convert_options,
+ ConversionSchema conversion_schema) {
+ BlockDecodingOperator op(std::move(io_context), std::move(convert_options),
+ std::move(conversion_schema));
+ RETURN_NOT_OK(op.state_->MakeColumnDecoders(io_context));
+ return op;
+ }
+
+ private:
+ BlockDecodingOperator(io::IOContext io_context, ConvertOptions convert_options,
+ ConversionSchema conversion_schema)
+ : state_(std::make_shared<State>(std::move(io_context), std::move(convert_options),
+ std::move(conversion_schema))) {}
+
+ struct State {
+ State(io::IOContext io_context, ConvertOptions convert_options,
+ ConversionSchema conversion_schema)
+ : convert_options(std::move(convert_options)),
+ conversion_schema(std::move(conversion_schema)) {}
+
+ Result<std::shared_ptr<RecordBatch>> DecodedArraysToBatch(
+ std::vector<std::shared_ptr<Array>> arrays) {
+ if (schema == nullptr) {
+ FieldVector fields(arrays.size());
+ for (size_t i = 0; i < arrays.size(); ++i) {
+ fields[i] = field(conversion_schema.columns[i].name, arrays[i]->type());
+ }
+ schema = arrow::schema(std::move(fields));
+ }
+ const auto n_rows = arrays[0]->length();
+ return RecordBatch::Make(schema, n_rows, std::move(arrays));
+ }
+
+ // Make column decoders from conversion schema
+ Status MakeColumnDecoders(io::IOContext io_context) {
+ for (const auto& column : conversion_schema.columns) {
+ std::shared_ptr<ColumnDecoder> decoder;
+ if (column.is_missing) {
+ ARROW_ASSIGN_OR_RAISE(decoder,
+ ColumnDecoder::MakeNull(io_context.pool(), column.type));
+ } else if (column.type != nullptr) {
+ ARROW_ASSIGN_OR_RAISE(
+ decoder, ColumnDecoder::Make(io_context.pool(), column.type, column.index,
+ convert_options));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ decoder,
+ ColumnDecoder::Make(io_context.pool(), column.index, convert_options));
+ }
+ column_decoders.push_back(std::move(decoder));
+ }
+ return Status::OK();
+ }
+
+ ConvertOptions convert_options;
+ ConversionSchema conversion_schema;
+ std::vector<std::shared_ptr<ColumnDecoder>> column_decoders;
+ std::shared_ptr<Schema> schema;
+ };
+
+ std::shared_ptr<State> state_;
+};
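+
+// Illustrative sketch (not part of the original sources): the two operators
+// above are meant to be chained with MakeMappedGenerator, mirroring the wiring
+// done in StreamingReaderImpl::InitAfterFirstBuffer below; `block_gen`,
+// `parser_op` and `decoder_op` are assumed to be constructed as shown there.
+//
+//   AsyncGenerator<CSVBlock> block_gen = /* from SerialBlockReader */;
+//   auto parsed_gen = MakeMappedGenerator(std::move(block_gen), parser_op);
+//   auto decoded_gen = MakeMappedGenerator(std::move(parsed_gen), decoder_op);
+//   // decoded_gen yields DecodedBlock{record_batch, bytes_processed}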
+
+/////////////////////////////////////////////////////////////////////////
+// Base class for common functionality
+
+class ReaderMixin {
+ public:
+ ReaderMixin(io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ const ReadOptions& read_options, const ParseOptions& parse_options,
+ const ConvertOptions& convert_options, bool count_rows)
+ : io_context_(std::move(io_context)),
+ read_options_(read_options),
+ parse_options_(parse_options),
+ convert_options_(convert_options),
+ count_rows_(count_rows),
+ num_rows_seen_(count_rows_ ? 1 : -1),
+ input_(std::move(input)) {}
+
+ protected:
+ // Read header and column names from buffer, create column builders
+ // Returns the # of bytes consumed
+ Result<int64_t> ProcessHeader(const std::shared_ptr<Buffer>& buf,
+ std::shared_ptr<Buffer>* rest) {
+ const uint8_t* data = buf->data();
+ const auto data_end = data + buf->size();
+ DCHECK_GT(data_end - data, 0);
+
+ if (read_options_.skip_rows) {
+ // Skip initial rows (potentially invalid CSV data)
+ auto num_skipped_rows = SkipRows(data, static_cast<uint32_t>(data_end - data),
+ read_options_.skip_rows, &data);
+ if (num_skipped_rows < read_options_.skip_rows) {
+ return Status::Invalid(
+ "Could not skip initial ", read_options_.skip_rows,
+ " rows from CSV file, "
+ "either file is too short or header is larger than block size");
+ }
+ if (count_rows_) {
+ num_rows_seen_ += num_skipped_rows;
+ }
+ }
+
+ if (read_options_.column_names.empty()) {
+ // Parse one row (either to read column names or to know the number of columns)
+ BlockParser parser(io_context_.pool(), parse_options_, num_csv_cols_,
+ num_rows_seen_, 1);
+ uint32_t parsed_size = 0;
+ RETURN_NOT_OK(parser.Parse(
+ util::string_view(reinterpret_cast<const char*>(data), data_end - data),
+ &parsed_size));
+ if (parser.num_rows() != 1) {
+ return Status::Invalid(
+ "Could not read first row from CSV file, either "
+ "file is too short or header is larger than block size");
+ }
+ if (parser.num_cols() == 0) {
+ return Status::Invalid("No columns in CSV file");
+ }
+
+ if (read_options_.autogenerate_column_names) {
+ column_names_ = GenerateColumnNames(parser.num_cols());
+ } else {
+ // Read column names from header row
+ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ column_names_.emplace_back(reinterpret_cast<const char*>(data), size);
+ return Status::OK();
+ };
+ RETURN_NOT_OK(parser.VisitLastRow(visit));
+ DCHECK_EQ(static_cast<size_t>(parser.num_cols()), column_names_.size());
+ // Skip parsed header row
+ data += parsed_size;
+ if (count_rows_) {
+ ++num_rows_seen_;
+ }
+ }
+ } else {
+ column_names_ = read_options_.column_names;
+ }
+
+ if (count_rows_) {
+      // Increase the rows-seen count to account for rows which will be skipped
+ num_rows_seen_ += read_options_.skip_rows_after_names;
+ }
+
+ auto bytes_consumed = data - buf->data();
+ *rest = SliceBuffer(buf, bytes_consumed);
+
+ num_csv_cols_ = static_cast<int32_t>(column_names_.size());
+ DCHECK_GT(num_csv_cols_, 0);
+
+ RETURN_NOT_OK(MakeConversionSchema());
+ return bytes_consumed;
+ }
+
+ std::vector<std::string> GenerateColumnNames(int32_t num_cols) {
+ std::vector<std::string> res;
+ res.reserve(num_cols);
+ for (int32_t i = 0; i < num_cols; ++i) {
+ std::stringstream ss;
+ ss << "f" << i;
+ res.push_back(ss.str());
+ }
+ return res;
+ }
+
+ // Make conversion schema from options and parsed CSV header
+ Status MakeConversionSchema() {
+ // Append a column converted from CSV data
+ auto append_csv_column = [&](std::string col_name, int32_t col_index) {
+ // Does the named column have a fixed type?
+ auto it = convert_options_.column_types.find(col_name);
+ if (it == convert_options_.column_types.end()) {
+ conversion_schema_.columns.push_back(
+ ConversionSchema::InferredColumn(std::move(col_name), col_index));
+ } else {
+ conversion_schema_.columns.push_back(
+ ConversionSchema::TypedColumn(std::move(col_name), col_index, it->second));
+ }
+ };
+
+ // Append a column of nulls
+ auto append_null_column = [&](std::string col_name) {
+ // If the named column has a fixed type, use it, otherwise use null()
+ std::shared_ptr<DataType> type;
+ auto it = convert_options_.column_types.find(col_name);
+ if (it == convert_options_.column_types.end()) {
+ type = null();
+ } else {
+ type = it->second;
+ }
+ conversion_schema_.columns.push_back(
+ ConversionSchema::NullColumn(std::move(col_name), std::move(type)));
+ };
+
+ if (convert_options_.include_columns.empty()) {
+ // Include all columns in CSV file order
+ for (int32_t col_index = 0; col_index < num_csv_cols_; ++col_index) {
+ append_csv_column(column_names_[col_index], col_index);
+ }
+ } else {
+ // Include columns from `include_columns` (in that order)
+ // Compute indices of columns in the CSV file
+ std::unordered_map<std::string, int32_t> col_indices;
+ col_indices.reserve(column_names_.size());
+ for (int32_t i = 0; i < static_cast<int32_t>(column_names_.size()); ++i) {
+ col_indices.emplace(column_names_[i], i);
+ }
+
+ for (const auto& col_name : convert_options_.include_columns) {
+ auto it = col_indices.find(col_name);
+ if (it != col_indices.end()) {
+ append_csv_column(col_name, it->second);
+ } else if (convert_options_.include_missing_columns) {
+ append_null_column(col_name);
+ } else {
+ return Status::KeyError("Column '", col_name,
+ "' in include_columns "
+ "does not exist in CSV file");
+ }
+ }
+ }
+ return Status::OK();
+ }
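+
+  // Illustrative sketch (assumed values, not from the original sources): for a
+  // CSV whose header row is "a,b,c", the options below produce a conversion
+  // schema of [b (typed int64), d (null column)]; "d" is not in the file, but
+  // include_missing_columns lets it through as a column of nulls.
+  //
+  //   ConvertOptions opts = ConvertOptions::Defaults();
+  //   opts.include_columns = {"b", "d"};
+  //   opts.include_missing_columns = true;
+  //   opts.column_types["b"] = int64();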
+
+ struct ParseResult {
+ std::shared_ptr<BlockParser> parser;
+ int64_t parsed_bytes;
+ };
+
+ Result<ParseResult> Parse(const std::shared_ptr<Buffer>& partial,
+ const std::shared_ptr<Buffer>& completion,
+ const std::shared_ptr<Buffer>& block, int64_t block_index,
+ bool is_final) {
+ static constexpr int32_t max_num_rows = std::numeric_limits<int32_t>::max();
+ auto parser = std::make_shared<BlockParser>(
+ io_context_.pool(), parse_options_, num_csv_cols_, num_rows_seen_, max_num_rows);
+
+ std::shared_ptr<Buffer> straddling;
+ std::vector<util::string_view> views;
+ if (partial->size() != 0 || completion->size() != 0) {
+ if (partial->size() == 0) {
+ straddling = completion;
+ } else if (completion->size() == 0) {
+ straddling = partial;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ straddling, ConcatenateBuffers({partial, completion}, io_context_.pool()));
+ }
+ views = {util::string_view(*straddling), util::string_view(*block)};
+ } else {
+ views = {util::string_view(*block)};
+ }
+ uint32_t parsed_size;
+ if (is_final) {
+ RETURN_NOT_OK(parser->ParseFinal(views, &parsed_size));
+ } else {
+ RETURN_NOT_OK(parser->Parse(views, &parsed_size));
+ }
+ if (count_rows_) {
+ num_rows_seen_ += parser->num_rows();
+ }
+ return ParseResult{std::move(parser), static_cast<int64_t>(parsed_size)};
+ }
+
+ io::IOContext io_context_;
+ ReadOptions read_options_;
+ ParseOptions parse_options_;
+ ConvertOptions convert_options_;
+
+ // Number of columns in the CSV file
+ int32_t num_csv_cols_ = -1;
+ // Whether num_rows_seen_ tracks the number of rows seen in the CSV being parsed
+ bool count_rows_;
+  // Number of rows seen in the CSV. Not used if count_rows_ is false
+ int64_t num_rows_seen_;
+ // Column names in the CSV file
+ std::vector<std::string> column_names_;
+ ConversionSchema conversion_schema_;
+
+ std::shared_ptr<io::InputStream> input_;
+ std::shared_ptr<internal::TaskGroup> task_group_;
+};
+
+/////////////////////////////////////////////////////////////////////////
+// Base class for one-shot table readers
+
+class BaseTableReader : public ReaderMixin, public csv::TableReader {
+ public:
+ using ReaderMixin::ReaderMixin;
+
+ virtual Status Init() = 0;
+
+ Future<std::shared_ptr<Table>> ReadAsync() override {
+ return Future<std::shared_ptr<Table>>::MakeFinished(Read());
+ }
+
+ protected:
+ // Make column builders from conversion schema
+ Status MakeColumnBuilders() {
+ for (const auto& column : conversion_schema_.columns) {
+ std::shared_ptr<ColumnBuilder> builder;
+ if (column.is_missing) {
+ ARROW_ASSIGN_OR_RAISE(builder, ColumnBuilder::MakeNull(io_context_.pool(),
+ column.type, task_group_));
+ } else if (column.type != nullptr) {
+ ARROW_ASSIGN_OR_RAISE(
+ builder, ColumnBuilder::Make(io_context_.pool(), column.type, column.index,
+ convert_options_, task_group_));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(builder,
+ ColumnBuilder::Make(io_context_.pool(), column.index,
+ convert_options_, task_group_));
+ }
+ column_builders_.push_back(std::move(builder));
+ }
+ return Status::OK();
+ }
+
+ Result<int64_t> ParseAndInsert(const std::shared_ptr<Buffer>& partial,
+ const std::shared_ptr<Buffer>& completion,
+ const std::shared_ptr<Buffer>& block,
+ int64_t block_index, bool is_final) {
+ ARROW_ASSIGN_OR_RAISE(auto result,
+ Parse(partial, completion, block, block_index, is_final));
+ RETURN_NOT_OK(ProcessData(result.parser, block_index));
+ return result.parsed_bytes;
+ }
+
+ // Trigger conversion of parsed block data
+ Status ProcessData(const std::shared_ptr<BlockParser>& parser, int64_t block_index) {
+ for (auto& builder : column_builders_) {
+ builder->Insert(block_index, parser);
+ }
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<Table>> MakeTable() {
+ DCHECK_EQ(column_builders_.size(), conversion_schema_.columns.size());
+
+ std::vector<std::shared_ptr<Field>> fields;
+ std::vector<std::shared_ptr<ChunkedArray>> columns;
+
+ for (int32_t i = 0; i < static_cast<int32_t>(column_builders_.size()); ++i) {
+ const auto& column = conversion_schema_.columns[i];
+ ARROW_ASSIGN_OR_RAISE(auto array, column_builders_[i]->Finish());
+ fields.push_back(::arrow::field(column.name, array->type()));
+ columns.emplace_back(std::move(array));
+ }
+ return Table::Make(schema(std::move(fields)), std::move(columns));
+ }
+
+ // Column builders for target Table (in ConversionSchema order)
+ std::vector<std::shared_ptr<ColumnBuilder>> column_builders_;
+};
+
+/////////////////////////////////////////////////////////////////////////
+// Base class for streaming readers
+
+class StreamingReaderImpl : public ReaderMixin,
+ public csv::StreamingReader,
+ public std::enable_shared_from_this<StreamingReaderImpl> {
+ public:
+ StreamingReaderImpl(io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ const ReadOptions& read_options, const ParseOptions& parse_options,
+ const ConvertOptions& convert_options, bool count_rows)
+ : ReaderMixin(io_context, std::move(input), read_options, parse_options,
+ convert_options, count_rows),
+ bytes_decoded_(std::make_shared<std::atomic<int64_t>>(0)) {}
+
+ Future<> Init(Executor* cpu_executor) {
+ ARROW_ASSIGN_OR_RAISE(auto istream_it,
+ io::MakeInputStreamIterator(input_, read_options_.block_size));
+
+ // TODO Consider exposing readahead as a read option (ARROW-12090)
+ ARROW_ASSIGN_OR_RAISE(auto bg_it, MakeBackgroundGenerator(std::move(istream_it),
+ io_context_.executor()));
+
+ auto transferred_it = MakeTransferredGenerator(bg_it, cpu_executor);
+
+ auto buffer_generator = CSVBufferIterator::MakeAsync(std::move(transferred_it));
+
+ int max_readahead = cpu_executor->GetCapacity();
+ auto self = shared_from_this();
+
+ return buffer_generator().Then([self, buffer_generator, max_readahead](
+ const std::shared_ptr<Buffer>& first_buffer) {
+ return self->InitAfterFirstBuffer(first_buffer, buffer_generator, max_readahead);
+ });
+ }
+
+ std::shared_ptr<Schema> schema() const override { return schema_; }
+
+ int64_t bytes_read() const override { return bytes_decoded_->load(); }
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* batch) override {
+ auto next_fut = ReadNextAsync();
+ auto next_result = next_fut.result();
+ return std::move(next_result).Value(batch);
+ }
+
+ Future<std::shared_ptr<RecordBatch>> ReadNextAsync() override {
+ return record_batch_gen_();
+ }
+
+ protected:
+ Future<> InitAfterFirstBuffer(const std::shared_ptr<Buffer>& first_buffer,
+ AsyncGenerator<std::shared_ptr<Buffer>> buffer_generator,
+ int max_readahead) {
+ if (first_buffer == nullptr) {
+ return Status::Invalid("Empty CSV file");
+ }
+
+ std::shared_ptr<Buffer> after_header;
+ ARROW_ASSIGN_OR_RAISE(auto header_bytes_consumed,
+ ProcessHeader(first_buffer, &after_header));
+ bytes_decoded_->fetch_add(header_bytes_consumed);
+
+ auto parser_op =
+ BlockParsingOperator(io_context_, parse_options_, num_csv_cols_, num_rows_seen_);
+ ARROW_ASSIGN_OR_RAISE(
+ auto decoder_op,
+ BlockDecodingOperator::Make(io_context_, convert_options_, conversion_schema_));
+
+ auto block_gen = SerialBlockReader::MakeAsyncIterator(
+ std::move(buffer_generator), MakeChunker(parse_options_), std::move(after_header),
+ read_options_.skip_rows_after_names);
+ auto parsed_block_gen =
+ MakeMappedGenerator(std::move(block_gen), std::move(parser_op));
+ auto rb_gen = MakeMappedGenerator(std::move(parsed_block_gen), std::move(decoder_op));
+
+ auto self = shared_from_this();
+ return rb_gen().Then([self, rb_gen, max_readahead](const DecodedBlock& first_block) {
+ return self->InitAfterFirstBatch(first_block, std::move(rb_gen), max_readahead);
+ });
+ }
+
+ Status InitAfterFirstBatch(const DecodedBlock& first_block,
+ AsyncGenerator<DecodedBlock> batch_gen, int max_readahead) {
+ schema_ = first_block.record_batch->schema();
+
+ AsyncGenerator<DecodedBlock> readahead_gen;
+ if (read_options_.use_threads) {
+ readahead_gen = MakeReadaheadGenerator(std::move(batch_gen), max_readahead);
+ } else {
+ readahead_gen = std::move(batch_gen);
+ }
+
+ AsyncGenerator<DecodedBlock> restarted_gen;
+ // Streaming reader should not emit empty record batches
+ if (first_block.record_batch->num_rows() > 0) {
+ restarted_gen = MakeGeneratorStartsWith({first_block}, std::move(readahead_gen));
+ } else {
+ restarted_gen = std::move(readahead_gen);
+ }
+
+ auto bytes_decoded = bytes_decoded_;
+ auto unwrap_and_record_bytes =
+ [bytes_decoded](
+ const DecodedBlock& block) -> Result<std::shared_ptr<RecordBatch>> {
+ bytes_decoded->fetch_add(block.bytes_processed);
+ return block.record_batch;
+ };
+
+ auto unwrapped =
+ MakeMappedGenerator(std::move(restarted_gen), std::move(unwrap_and_record_bytes));
+
+ record_batch_gen_ = MakeCancellable(std::move(unwrapped), io_context_.stop_token());
+ return Status::OK();
+ }
+
+ std::shared_ptr<Schema> schema_;
+ AsyncGenerator<std::shared_ptr<RecordBatch>> record_batch_gen_;
+ // bytes which have been decoded and asked for by the caller
+ std::shared_ptr<std::atomic<int64_t>> bytes_decoded_;
+};
+
+/////////////////////////////////////////////////////////////////////////
+// Serial TableReader implementation
+
+class SerialTableReader : public BaseTableReader {
+ public:
+ using BaseTableReader::BaseTableReader;
+
+ Status Init() override {
+ ARROW_ASSIGN_OR_RAISE(auto istream_it,
+ io::MakeInputStreamIterator(input_, read_options_.block_size));
+
+ // Since we're converting serially, no need to readahead more than one block
+ int32_t block_queue_size = 1;
+ ARROW_ASSIGN_OR_RAISE(auto rh_it,
+ MakeReadaheadIterator(std::move(istream_it), block_queue_size));
+ buffer_iterator_ = CSVBufferIterator::Make(std::move(rh_it));
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<Table>> Read() override {
+ task_group_ = internal::TaskGroup::MakeSerial(io_context_.stop_token());
+
+ // First block
+ ARROW_ASSIGN_OR_RAISE(auto first_buffer, buffer_iterator_.Next());
+ if (first_buffer == nullptr) {
+ return Status::Invalid("Empty CSV file");
+ }
+ RETURN_NOT_OK(ProcessHeader(first_buffer, &first_buffer));
+ RETURN_NOT_OK(MakeColumnBuilders());
+
+ auto block_iterator = SerialBlockReader::MakeIterator(
+ std::move(buffer_iterator_), MakeChunker(parse_options_), std::move(first_buffer),
+ read_options_.skip_rows_after_names);
+ while (true) {
+ RETURN_NOT_OK(io_context_.stop_token().Poll());
+
+ ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_iterator.Next());
+ if (IsIterationEnd(maybe_block)) {
+ // EOF
+ break;
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ int64_t parsed_bytes,
+ ParseAndInsert(maybe_block.partial, maybe_block.completion, maybe_block.buffer,
+ maybe_block.block_index, maybe_block.is_final));
+ RETURN_NOT_OK(maybe_block.consume_bytes(parsed_bytes));
+ }
+ // Finish conversion, create schema and table
+ RETURN_NOT_OK(task_group_->Finish());
+ return MakeTable();
+ }
+
+ protected:
+ Iterator<std::shared_ptr<Buffer>> buffer_iterator_;
+};
+
+class AsyncThreadedTableReader
+ : public BaseTableReader,
+ public std::enable_shared_from_this<AsyncThreadedTableReader> {
+ public:
+ using BaseTableReader::BaseTableReader;
+
+ AsyncThreadedTableReader(io::IOContext io_context,
+ std::shared_ptr<io::InputStream> input,
+ const ReadOptions& read_options,
+ const ParseOptions& parse_options,
+ const ConvertOptions& convert_options, Executor* cpu_executor)
+      // Counting rows is currently not supported during parallel reads
+ : BaseTableReader(std::move(io_context), input, read_options, parse_options,
+ convert_options, /*count_rows=*/false),
+ cpu_executor_(cpu_executor) {}
+
+ ~AsyncThreadedTableReader() override {
+ if (task_group_) {
+ // In case of error, make sure all pending tasks are finished before
+ // we start destroying BaseTableReader members
+ ARROW_UNUSED(task_group_->Finish());
+ }
+ }
+
+ Status Init() override {
+ ARROW_ASSIGN_OR_RAISE(auto istream_it,
+ io::MakeInputStreamIterator(input_, read_options_.block_size));
+
+ int max_readahead = cpu_executor_->GetCapacity();
+ int readahead_restart = std::max(1, max_readahead / 2);
+
+ ARROW_ASSIGN_OR_RAISE(
+ auto bg_it, MakeBackgroundGenerator(std::move(istream_it), io_context_.executor(),
+ max_readahead, readahead_restart));
+
+ auto transferred_it = MakeTransferredGenerator(bg_it, cpu_executor_);
+ buffer_generator_ = CSVBufferIterator::MakeAsync(std::move(transferred_it));
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<Table>> Read() override { return ReadAsync().result(); }
+
+ Future<std::shared_ptr<Table>> ReadAsync() override {
+ task_group_ =
+ internal::TaskGroup::MakeThreaded(cpu_executor_, io_context_.stop_token());
+
+ auto self = shared_from_this();
+ return ProcessFirstBuffer().Then([self](const std::shared_ptr<Buffer>& first_buffer) {
+ auto block_generator = ThreadedBlockReader::MakeAsyncIterator(
+ self->buffer_generator_, MakeChunker(self->parse_options_),
+ std::move(first_buffer), self->read_options_.skip_rows_after_names);
+
+ std::function<Status(CSVBlock)> block_visitor =
+ [self](CSVBlock maybe_block) -> Status {
+ // The logic in VisitAsyncGenerator ensures that we will never be
+ // passed an empty block (visit does not call with the end token) so
+ // we can be assured maybe_block has a value.
+ DCHECK_GE(maybe_block.block_index, 0);
+ DCHECK(!maybe_block.consume_bytes);
+
+ // Launch parse task
+ self->task_group_->Append([self, maybe_block] {
+ return self
+ ->ParseAndInsert(maybe_block.partial, maybe_block.completion,
+ maybe_block.buffer, maybe_block.block_index,
+ maybe_block.is_final)
+ .status();
+ });
+ return Status::OK();
+ };
+
+ return VisitAsyncGenerator(std::move(block_generator), block_visitor)
+ .Then([self]() -> Future<> {
+ // By this point we've added all top level tasks so it is safe to call
+ // FinishAsync
+ return self->task_group_->FinishAsync();
+ })
+ .Then([self]() -> Result<std::shared_ptr<Table>> {
+ // Finish conversion, create schema and table
+ return self->MakeTable();
+ });
+ });
+ }
+
+ protected:
+ Future<std::shared_ptr<Buffer>> ProcessFirstBuffer() {
+ // First block
+ auto first_buffer_future = buffer_generator_();
+ return first_buffer_future.Then([this](const std::shared_ptr<Buffer>& first_buffer)
+ -> Result<std::shared_ptr<Buffer>> {
+ if (first_buffer == nullptr) {
+ return Status::Invalid("Empty CSV file");
+ }
+ std::shared_ptr<Buffer> first_buffer_processed;
+ RETURN_NOT_OK(ProcessHeader(first_buffer, &first_buffer_processed));
+ RETURN_NOT_OK(MakeColumnBuilders());
+ return first_buffer_processed;
+ });
+ }
+
+ Executor* cpu_executor_;
+ AsyncGenerator<std::shared_ptr<Buffer>> buffer_generator_;
+};
+
+Result<std::shared_ptr<TableReader>> MakeTableReader(
+ MemoryPool* pool, io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ const ReadOptions& read_options, const ParseOptions& parse_options,
+ const ConvertOptions& convert_options) {
+ RETURN_NOT_OK(parse_options.Validate());
+ RETURN_NOT_OK(read_options.Validate());
+ RETURN_NOT_OK(convert_options.Validate());
+ std::shared_ptr<BaseTableReader> reader;
+ if (read_options.use_threads) {
+ auto cpu_executor = internal::GetCpuThreadPool();
+ reader = std::make_shared<AsyncThreadedTableReader>(
+ io_context, input, read_options, parse_options, convert_options, cpu_executor);
+ } else {
+ reader = std::make_shared<SerialTableReader>(io_context, input, read_options,
+ parse_options, convert_options,
+ /*count_rows=*/true);
+ }
+ RETURN_NOT_OK(reader->Init());
+ return reader;
+}
+
+Future<std::shared_ptr<StreamingReader>> MakeStreamingReader(
+ io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ internal::Executor* cpu_executor, const ReadOptions& read_options,
+ const ParseOptions& parse_options, const ConvertOptions& convert_options) {
+ RETURN_NOT_OK(parse_options.Validate());
+ RETURN_NOT_OK(read_options.Validate());
+ RETURN_NOT_OK(convert_options.Validate());
+ std::shared_ptr<StreamingReaderImpl> reader;
+ reader = std::make_shared<StreamingReaderImpl>(
+ io_context, input, read_options, parse_options, convert_options,
+ /*count_rows=*/!read_options.use_threads || cpu_executor->GetCapacity() == 1);
+ return reader->Init(cpu_executor).Then([reader] {
+ return std::dynamic_pointer_cast<StreamingReader>(reader);
+ });
+}
+
+/////////////////////////////////////////////////////////////////////////
+// Row count implementation
+
+class CSVRowCounter : public ReaderMixin,
+ public std::enable_shared_from_this<CSVRowCounter> {
+ public:
+ CSVRowCounter(io::IOContext io_context, Executor* cpu_executor,
+ std::shared_ptr<io::InputStream> input, const ReadOptions& read_options,
+ const ParseOptions& parse_options)
+ : ReaderMixin(io_context, std::move(input), read_options, parse_options,
+ ConvertOptions::Defaults(), /*count_rows=*/true),
+ cpu_executor_(cpu_executor),
+ row_count_(0) {}
+
+ Future<int64_t> Count() {
+ auto self = shared_from_this();
+ return Init(self).Then([self]() { return self->DoCount(self); });
+ }
+
+ private:
+ Future<> Init(const std::shared_ptr<CSVRowCounter>& self) {
+ ARROW_ASSIGN_OR_RAISE(auto istream_it,
+ io::MakeInputStreamIterator(input_, read_options_.block_size));
+ // TODO Consider exposing readahead as a read option (ARROW-12090)
+ ARROW_ASSIGN_OR_RAISE(auto bg_it, MakeBackgroundGenerator(std::move(istream_it),
+ io_context_.executor()));
+ auto transferred_it = MakeTransferredGenerator(bg_it, cpu_executor_);
+ auto buffer_generator = CSVBufferIterator::MakeAsync(std::move(transferred_it));
+
+ return buffer_generator().Then(
+ [self, buffer_generator](std::shared_ptr<Buffer> first_buffer) {
+ if (!first_buffer) {
+ return Status::Invalid("Empty CSV file");
+ }
+ RETURN_NOT_OK(self->ProcessHeader(first_buffer, &first_buffer));
+ self->block_generator_ = SerialBlockReader::MakeAsyncIterator(
+ buffer_generator, MakeChunker(self->parse_options_),
+ std::move(first_buffer), 0);
+ return Status::OK();
+ });
+ }
+
+ Future<int64_t> DoCount(const std::shared_ptr<CSVRowCounter>& self) {
+ // count_cb must return a value instead of Status/Future<> to work with
+ // MakeMappedGenerator, and it must use a type with a valid end value to work with
+ // IterationEnd.
+ std::function<Result<util::optional<int64_t>>(const CSVBlock&)> count_cb =
+ [self](const CSVBlock& maybe_block) -> Result<util::optional<int64_t>> {
+ ARROW_ASSIGN_OR_RAISE(
+ auto parser,
+ self->Parse(maybe_block.partial, maybe_block.completion, maybe_block.buffer,
+ maybe_block.block_index, maybe_block.is_final));
+ RETURN_NOT_OK(maybe_block.consume_bytes(parser.parsed_bytes));
+ self->row_count_ += parser.parser->num_rows();
+ return parser.parser->num_rows();
+ };
+ auto count_gen = MakeMappedGenerator(block_generator_, std::move(count_cb));
+ return DiscardAllFromAsyncGenerator(count_gen).Then(
+ [self]() { return self->row_count_; });
+ }
+
+ Executor* cpu_executor_;
+ AsyncGenerator<CSVBlock> block_generator_;
+ int64_t row_count_;
+};
+
+} // namespace
+
+/////////////////////////////////////////////////////////////////////////
+// Factory functions
+
+Result<std::shared_ptr<TableReader>> TableReader::Make(
+ io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ const ReadOptions& read_options, const ParseOptions& parse_options,
+ const ConvertOptions& convert_options) {
+ return MakeTableReader(io_context.pool(), io_context, std::move(input), read_options,
+ parse_options, convert_options);
+}
+
+Result<std::shared_ptr<TableReader>> TableReader::Make(
+ MemoryPool* pool, io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ const ReadOptions& read_options, const ParseOptions& parse_options,
+ const ConvertOptions& convert_options) {
+ return MakeTableReader(pool, io_context, std::move(input), read_options, parse_options,
+ convert_options);
+}
+
+Result<std::shared_ptr<StreamingReader>> StreamingReader::Make(
+ MemoryPool* pool, std::shared_ptr<io::InputStream> input,
+ const ReadOptions& read_options, const ParseOptions& parse_options,
+ const ConvertOptions& convert_options) {
+ auto io_context = io::IOContext(pool);
+ auto cpu_executor = internal::GetCpuThreadPool();
+ auto reader_fut = MakeStreamingReader(io_context, std::move(input), cpu_executor,
+ read_options, parse_options, convert_options);
+ auto reader_result = reader_fut.result();
+ ARROW_ASSIGN_OR_RAISE(auto reader, reader_result);
+ return reader;
+}
+
+Result<std::shared_ptr<StreamingReader>> StreamingReader::Make(
+ io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ const ReadOptions& read_options, const ParseOptions& parse_options,
+ const ConvertOptions& convert_options) {
+ auto cpu_executor = internal::GetCpuThreadPool();
+ auto reader_fut = MakeStreamingReader(io_context, std::move(input), cpu_executor,
+ read_options, parse_options, convert_options);
+ auto reader_result = reader_fut.result();
+ ARROW_ASSIGN_OR_RAISE(auto reader, reader_result);
+ return reader;
+}
+
+Future<std::shared_ptr<StreamingReader>> StreamingReader::MakeAsync(
+ io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ internal::Executor* cpu_executor, const ReadOptions& read_options,
+ const ParseOptions& parse_options, const ConvertOptions& convert_options) {
+ return MakeStreamingReader(io_context, std::move(input), cpu_executor, read_options,
+ parse_options, convert_options);
+}
+
+Future<int64_t> CountRowsAsync(io::IOContext io_context,
+ std::shared_ptr<io::InputStream> input,
+ internal::Executor* cpu_executor,
+ const ReadOptions& read_options,
+ const ParseOptions& parse_options) {
+ RETURN_NOT_OK(parse_options.Validate());
+ RETURN_NOT_OK(read_options.Validate());
+ auto counter = std::make_shared<CSVRowCounter>(
+ io_context, cpu_executor, std::move(input), read_options, parse_options);
+ return counter->Count();
+}
+
+} // namespace csv
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/reader.h
new file mode 100644
index 00000000000..48f02882b10
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/reader.h
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/csv/options.h" // IWYU pragma: keep
+#include "arrow/io/interfaces.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/future.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+class InputStream;
+} // namespace io
+
+namespace csv {
+
+/// A class that reads an entire CSV file into an Arrow Table
+class ARROW_EXPORT TableReader {
+ public:
+ virtual ~TableReader() = default;
+
+  /// Read the entire CSV file and convert it to an Arrow Table
+  virtual Result<std::shared_ptr<Table>> Read() = 0;
+  /// Read the entire CSV file and convert it to an Arrow Table
+ virtual Future<std::shared_ptr<Table>> ReadAsync() = 0;
+
+ /// Create a TableReader instance
+ static Result<std::shared_ptr<TableReader>> Make(io::IOContext io_context,
+ std::shared_ptr<io::InputStream> input,
+ const ReadOptions&,
+ const ParseOptions&,
+ const ConvertOptions&);
+
+ ARROW_DEPRECATED("Use MemoryPool-less variant (the IOContext holds a pool already)")
+ static Result<std::shared_ptr<TableReader>> Make(
+ MemoryPool* pool, io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ const ReadOptions&, const ParseOptions&, const ConvertOptions&);
+};
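+
+/// Example usage (a minimal sketch; the file name and the use of the default
+/// IO context are illustrative assumptions):
+///
+///   ARROW_ASSIGN_OR_RAISE(auto input, io::ReadableFile::Open("data.csv"));
+///   ARROW_ASSIGN_OR_RAISE(
+///       auto reader,
+///       TableReader::Make(io::default_io_context(), input,
+///                         ReadOptions::Defaults(), ParseOptions::Defaults(),
+///                         ConvertOptions::Defaults()));
+///   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Table> table, reader->Read());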
+
+/// \brief A class that reads a CSV file incrementally
+///
+/// Caveats:
+/// - For now, this is always single-threaded (regardless of `ReadOptions::use_threads`).
+/// - Type inference is done on the first block and types are frozen afterwards;
+/// to make sure the right data types are inferred, either set
+/// `ReadOptions::block_size` to a large enough value, or use
+/// `ConvertOptions::column_types` to set the desired data types explicitly.
+class ARROW_EXPORT StreamingReader : public RecordBatchReader {
+ public:
+ virtual ~StreamingReader() = default;
+
+ virtual Future<std::shared_ptr<RecordBatch>> ReadNextAsync() = 0;
+
+ /// \brief Return the number of bytes which have been read and processed
+ ///
+ /// The returned number includes CSV bytes which the StreamingReader has
+ /// finished processing, but not bytes for which some processing (e.g.
+ /// CSV parsing or conversion to Arrow layout) is still ongoing.
+ ///
+ /// Furthermore, the following rules apply:
+ /// - bytes skipped by `ReadOptions.skip_rows` are counted as being read before
+ /// any records are returned.
+ /// - bytes read while parsing the header are counted as being read before any
+ /// records are returned.
+ /// - bytes skipped by `ReadOptions.skip_rows_after_names` are counted after the
+ /// first batch is returned.
+ virtual int64_t bytes_read() const = 0;
+
+ /// Create a StreamingReader instance
+ ///
+  /// This involves some I/O, as the first batch must be loaded during the
+  /// creation process, so it is returned as a future
+ ///
+ /// Currently, the StreamingReader is not async-reentrant and does not do any fan-out
+ /// parsing (see ARROW-11889)
+ static Future<std::shared_ptr<StreamingReader>> MakeAsync(
+ io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ internal::Executor* cpu_executor, const ReadOptions&, const ParseOptions&,
+ const ConvertOptions&);
+
+ static Result<std::shared_ptr<StreamingReader>> Make(
+ io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+ const ReadOptions&, const ParseOptions&, const ConvertOptions&);
+
+ ARROW_DEPRECATED("Use IOContext-based overload")
+ static Result<std::shared_ptr<StreamingReader>> Make(
+ MemoryPool* pool, std::shared_ptr<io::InputStream> input,
+ const ReadOptions& read_options, const ParseOptions& parse_options,
+ const ConvertOptions& convert_options);
+};
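+
+/// Example usage (a minimal sketch under the same assumptions as above; fixing
+/// column types up front sidesteps the first-block type-inference caveat; the
+/// column name "ts" is hypothetical):
+///
+///   ConvertOptions convert_options = ConvertOptions::Defaults();
+///   convert_options.column_types["ts"] = timestamp(TimeUnit::SECOND);
+///   ARROW_ASSIGN_OR_RAISE(
+///       auto reader,
+///       StreamingReader::Make(io::default_io_context(), input,
+///                             ReadOptions::Defaults(), ParseOptions::Defaults(),
+///                             convert_options));
+///   std::shared_ptr<RecordBatch> batch;
+///   while (true) {
+///     ARROW_RETURN_NOT_OK(reader->ReadNext(&batch));
+///     if (batch == nullptr) break;  // end of stream
+///     // ... process batch ...
+///   }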
+
+/// \brief Count the logical rows of data in a CSV file (i.e. the
+/// number of rows you would get if you read the file into a table).
+ARROW_EXPORT
+Future<int64_t> CountRowsAsync(io::IOContext io_context,
+ std::shared_ptr<io::InputStream> input,
+ internal::Executor* cpu_executor, const ReadOptions&,
+ const ParseOptions&);
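+
+/// Example usage (sketch; assumes the same `input` as in the examples above):
+///
+///   Future<int64_t> n_rows_fut =
+///       CountRowsAsync(io::default_io_context(), input,
+///                      ::arrow::internal::GetCpuThreadPool(),
+///                      ReadOptions::Defaults(), ParseOptions::Defaults());
+///   ARROW_ASSIGN_OR_RAISE(int64_t n_rows, n_rows_fut.result());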
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/type_fwd.h
new file mode 100644
index 00000000000..c0a53847a90
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/type_fwd.h
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+namespace arrow {
+namespace csv {
+
+class TableReader;
+struct ConvertOptions;
+struct ReadOptions;
+struct ParseOptions;
+struct WriteOptions;
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/writer.cc b/contrib/libs/apache/arrow/cpp/src/arrow/csv/writer.cc
new file mode 100644
index 00000000000..1b782cae7dc
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/writer.cc
@@ -0,0 +1,460 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/csv/writer.h"
+#include "arrow/array.h"
+#include "arrow/compute/cast.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/result_internal.h"
+#include "arrow/stl_allocator.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace csv {
+// This implementation is intentionally light on configurability to minimize the size of
+// the initial PR. Additional features can be added as there is demand and interest to
+// implement them.
+//
+// The algorithm used here at a high level is to break RecordBatches/Tables into slices
+// and convert each slice independently. A slice is then converted to CSV by first
+// scanning each column to determine the size of its contents when rendered as a string in
+// CSV. For non-string types this requires casting the value to string (which is cached).
+// This data is used to compute the precise length of each row and to make a single
+// allocation for the final CSV data buffer. Once the final size is known, each column is
+// iterated over again to place its contents into the CSV data buffer. The rationale for
+// choosing this approach is that it allows reuse of the cast functionality in the compute
+// module and of the inline data visiting functionality in the core library. A performance
+// comparison against a naive single-pass approach has not been done; the two-pass
+// approach might still be competitive because it avoids many of the per-row branches a
+// single-pass approach would need. Profiling would likely yield further opportunities for
+// optimization with this approach.
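+//
+// Worked example (illustrative numbers, not from the original sources): for a
+// batch with a string column and an int64 column holding rows ("a", 1) and
+// ("bb", 22), the measuring pass records per-row content lengths of
+// {1 + 2 quotes + 1, 2 + 2 quotes + 2} = {4, 6}; adding one delimiter per
+// column and accumulating yields end-of-row offsets {6, 14}, so a single
+// 14-byte buffer holds the final text:
+//
+//   "a",1\n"bb",22\n
+//
+// which is then populated backwards, one column at a time.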
+
+namespace {
+
+struct SliceIteratorFunctor {
+ Result<std::shared_ptr<RecordBatch>> Next() {
+ if (current_offset < batch->num_rows()) {
+ std::shared_ptr<RecordBatch> next = batch->Slice(current_offset, slice_size);
+ current_offset += slice_size;
+ return next;
+ }
+ return IterationTraits<std::shared_ptr<RecordBatch>>::End();
+ }
+ const RecordBatch* const batch;
+ const int64_t slice_size;
+ int64_t current_offset;
+};
+
+RecordBatchIterator RecordBatchSliceIterator(const RecordBatch& batch,
+ int64_t slice_size) {
+ SliceIteratorFunctor functor = {&batch, slice_size, /*offset=*/static_cast<int64_t>(0)};
+ return RecordBatchIterator(std::move(functor));
+}
+
+// Counts the number of characters that need escaping in s.
+int64_t CountEscapes(util::string_view s) {
+ return static_cast<int64_t>(std::count(s.begin(), s.end(), '"'));
+}
+
+// Matching quote pair character length.
+constexpr int64_t kQuoteCount = 2;
+constexpr int64_t kQuoteDelimiterCount = kQuoteCount + /*end_char*/ 1;
+
+// Interface for generating CSV data per column.
+// The intended usage is to iteratively call UpdateRowLengths for a column and
+// then PopulateColumns. PopulateColumns must be called in the reverse order of the
+// populators (it populates data backwards).
+class ColumnPopulator {
+ public:
+ ColumnPopulator(MemoryPool* pool, char end_char) : end_char_(end_char), pool_(pool) {}
+
+ virtual ~ColumnPopulator() = default;
+
+  // Adds the number of characters each entry in data will add to the elements
+  // in row_lengths.
+ Status UpdateRowLengths(const Array& data, int32_t* row_lengths) {
+ compute::ExecContext ctx(pool_);
+    // Populators are intended to be applied to reasonably small data. In most cases
+ // threading overhead would not be justified.
+ ctx.set_use_threads(false);
+ ASSIGN_OR_RAISE(
+ std::shared_ptr<Array> casted,
+ compute::Cast(data, /*to_type=*/utf8(), compute::CastOptions(), &ctx));
+ casted_array_ = internal::checked_pointer_cast<StringArray>(casted);
+ return UpdateRowLengths(row_lengths);
+ }
+
+  // Places string data onto each row in output and updates the corresponding
+  // row pointers in preparation for calls to other (preceding) ColumnPopulators.
+  // Args:
+  //   output: character buffer to write to.
+  //   offsets: an array of end-of-row-column positions within the output buffer
+  //   (values are one past the end of the position to write to).
+ virtual void PopulateColumns(char* output, int32_t* offsets) const = 0;
+
+ protected:
+ virtual Status UpdateRowLengths(int32_t* row_lengths) = 0;
+ std::shared_ptr<StringArray> casted_array_;
+ const char end_char_;
+
+ private:
+ MemoryPool* const pool_;
+};
+
+// Copies the contents of s to out_end (writing backwards), escaping any necessary
+// characters by doubling quotes (e.g. a"b becomes a""b).
+// Returns the position prior to the last copied character (out_end is decremented).
+char* EscapeReverse(arrow::util::string_view s, char* out_end) {
+ for (const char* val = s.data() + s.length() - 1; val >= s.data(); val--, out_end--) {
+ if (*val == '"') {
+ *out_end = *val;
+ out_end--;
+ }
+ *out_end = *val;
+ }
+ return out_end;
+}
+
+// Populator for non-string types. This populator relies on compute Cast functionality to
+// cast to String; if no such cast exists it will be an error. It also assumes the
+// resulting string from a cast does not require quoting or escaping.
+class UnquotedColumnPopulator : public ColumnPopulator {
+ public:
+ explicit UnquotedColumnPopulator(MemoryPool* memory_pool, char end_char)
+ : ColumnPopulator(memory_pool, end_char) {}
+
+ Status UpdateRowLengths(int32_t* row_lengths) override {
+ for (int x = 0; x < casted_array_->length(); x++) {
+ row_lengths[x] += casted_array_->value_length(x);
+ }
+ return Status::OK();
+ }
+
+ void PopulateColumns(char* output, int32_t* offsets) const override {
+ VisitArrayDataInline<StringType>(
+ *casted_array_->data(),
+ [&](arrow::util::string_view s) {
+ int64_t next_column_offset = s.length() + /*end_char*/ 1;
+ memcpy((output + *offsets - next_column_offset), s.data(), s.length());
+ *(output + *offsets - 1) = end_char_;
+ *offsets -= static_cast<int32_t>(next_column_offset);
+ offsets++;
+ },
+ [&]() {
+        // Nulls are empty (unquoted) to distinguish them from the empty string.
+ *(output + *offsets - 1) = end_char_;
+ *offsets -= 1;
+ offsets++;
+ });
+ }
+};
+
+// Strings need special handling to ensure they are escaped properly.
+// This class handles escaping assuming that all strings will be quoted,
+// that the only character within the string that needs to be escaped is
+// a quote character ("), and that escaping is done by adding another quote.
+class QuotedColumnPopulator : public ColumnPopulator {
+ public:
+ QuotedColumnPopulator(MemoryPool* pool, char end_char)
+ : ColumnPopulator(pool, end_char) {}
+
+ Status UpdateRowLengths(int32_t* row_lengths) override {
+ const StringArray& input = *casted_array_;
+ int row_number = 0;
+ row_needs_escaping_.resize(casted_array_->length());
+ VisitArrayDataInline<StringType>(
+ *input.data(),
+ [&](arrow::util::string_view s) {
+ int64_t escaped_count = CountEscapes(s);
+ // TODO: Maybe use 64 bit row lengths or safe cast?
+ row_needs_escaping_[row_number] = escaped_count > 0;
+ row_lengths[row_number] += static_cast<int32_t>(s.length()) +
+ static_cast<int32_t>(escaped_count + kQuoteCount);
+ row_number++;
+ },
+ [&]() {
+ row_needs_escaping_[row_number] = false;
+ row_number++;
+ });
+ return Status::OK();
+ }
+
+ void PopulateColumns(char* output, int32_t* offsets) const override {
+ auto needs_escaping = row_needs_escaping_.begin();
+ VisitArrayDataInline<StringType>(
+ *(casted_array_->data()),
+ [&](arrow::util::string_view s) {
+ // still needs string content length to be added
+ char* row_end = output + *offsets;
+ int32_t next_column_offset = 0;
+ if (!*needs_escaping) {
+ next_column_offset = static_cast<int32_t>(s.length() + kQuoteDelimiterCount);
+ memcpy(row_end - next_column_offset + /*quote_offset=*/1, s.data(),
+ s.length());
+ } else {
+          // Adjust row_end by 3: 1 quote char, 1 end char, and 1 to land on the
+          // first position to write to.
+ next_column_offset =
+ static_cast<int32_t>(row_end - EscapeReverse(s, row_end - 3));
+ }
+ *(row_end - next_column_offset) = '"';
+ *(row_end - 2) = '"';
+ *(row_end - 1) = end_char_;
+ *offsets -= next_column_offset;
+ offsets++;
+ needs_escaping++;
+ },
+ [&]() {
+        // Nulls are empty (unquoted) to distinguish them from the empty string.
+ *(output + *offsets - 1) = end_char_;
+ *offsets -= 1;
+ offsets++;
+ needs_escaping++;
+ });
+ }
+
+ private:
+  // Older versions of GCC don't support custom allocators;
+  // at some point we should change this to use a
+  // memory_pool-backed allocator.
+ std::vector<bool> row_needs_escaping_;
+};
+
+struct PopulatorFactory {
+ template <typename TypeClass>
+ enable_if_t<is_base_binary_type<TypeClass>::value ||
+ std::is_same<FixedSizeBinaryType, TypeClass>::value,
+ Status>
+ Visit(const TypeClass& type) {
+ populator = new QuotedColumnPopulator(pool, end_char);
+ return Status::OK();
+ }
+
+ template <typename TypeClass>
+ enable_if_dictionary<TypeClass, Status> Visit(const TypeClass& type) {
+ return VisitTypeInline(*type.value_type(), this);
+ }
+
+ template <typename TypeClass>
+ enable_if_t<is_nested_type<TypeClass>::value || is_extension_type<TypeClass>::value,
+ Status>
+ Visit(const TypeClass& type) {
+ return Status::Invalid("Unsupported Type:", type.ToString());
+ }
+
+ template <typename TypeClass>
+ enable_if_t<is_primitive_ctype<TypeClass>::value || is_decimal_type<TypeClass>::value ||
+ is_null_type<TypeClass>::value || is_temporal_type<TypeClass>::value,
+ Status>
+ Visit(const TypeClass& type) {
+ populator = new UnquotedColumnPopulator(pool, end_char);
+ return Status::OK();
+ }
+
+ char end_char;
+ MemoryPool* pool;
+ ColumnPopulator* populator;
+};
+
+Result<std::unique_ptr<ColumnPopulator>> MakePopulator(const Field& field, char end_char,
+ MemoryPool* pool) {
+ PopulatorFactory factory{end_char, pool, nullptr};
+ RETURN_NOT_OK(VisitTypeInline(*field.type(), &factory));
+ return std::unique_ptr<ColumnPopulator>(factory.populator);
+}
+
+class CSVWriterImpl : public ipc::RecordBatchWriter {
+ public:
+ static Result<std::shared_ptr<CSVWriterImpl>> Make(
+ io::OutputStream* sink, std::shared_ptr<io::OutputStream> owned_sink,
+ std::shared_ptr<Schema> schema, const WriteOptions& options) {
+ RETURN_NOT_OK(options.Validate());
+ std::vector<std::unique_ptr<ColumnPopulator>> populators(schema->num_fields());
+ for (int col = 0; col < schema->num_fields(); col++) {
+ char end_char = col < schema->num_fields() - 1 ? ',' : '\n';
+ ASSIGN_OR_RAISE(populators[col], MakePopulator(*schema->field(col), end_char,
+ options.io_context.pool()));
+ }
+ auto writer = std::make_shared<CSVWriterImpl>(
+ sink, std::move(owned_sink), std::move(schema), std::move(populators), options);
+ RETURN_NOT_OK(writer->PrepareForContentsWrite());
+ if (options.include_header) {
+ RETURN_NOT_OK(writer->WriteHeader());
+ }
+ return writer;
+ }
+
+ Status WriteRecordBatch(const RecordBatch& batch) override {
+ RecordBatchIterator iterator = RecordBatchSliceIterator(batch, options_.batch_size);
+ for (auto maybe_slice : iterator) {
+ ASSIGN_OR_RAISE(std::shared_ptr<RecordBatch> slice, maybe_slice);
+ RETURN_NOT_OK(TranslateMinimalBatch(*slice));
+ RETURN_NOT_OK(sink_->Write(data_buffer_));
+ stats_.num_record_batches++;
+ }
+ return Status::OK();
+ }
+
+ Status WriteTable(const Table& table, int64_t max_chunksize) override {
+ TableBatchReader reader(table);
+ reader.set_chunksize(max_chunksize > 0 ? max_chunksize : options_.batch_size);
+ std::shared_ptr<RecordBatch> batch;
+ RETURN_NOT_OK(reader.ReadNext(&batch));
+ while (batch != nullptr) {
+ RETURN_NOT_OK(TranslateMinimalBatch(*batch));
+ RETURN_NOT_OK(sink_->Write(data_buffer_));
+ RETURN_NOT_OK(reader.ReadNext(&batch));
+ stats_.num_record_batches++;
+ }
+
+ return Status::OK();
+ }
+
+ Status Close() override { return Status::OK(); }
+
+ ipc::WriteStats stats() const override { return stats_; }
+
+ CSVWriterImpl(io::OutputStream* sink, std::shared_ptr<io::OutputStream> owned_sink,
+ std::shared_ptr<Schema> schema,
+ std::vector<std::unique_ptr<ColumnPopulator>> populators,
+ const WriteOptions& options)
+ : sink_(sink),
+ owned_sink_(std::move(owned_sink)),
+ column_populators_(std::move(populators)),
+ offsets_(0, 0, ::arrow::stl::allocator<char*>(options.io_context.pool())),
+ schema_(std::move(schema)),
+ options_(options) {}
+
+ private:
+ Status PrepareForContentsWrite() {
+ // Only called once, as part of initialization
+ if (data_buffer_ == nullptr) {
+ ASSIGN_OR_RAISE(data_buffer_,
+ AllocateResizableBuffer(
+ options_.batch_size * schema_->num_fields() * kColumnSizeGuess,
+ options_.io_context.pool()));
+ }
+ return Status::OK();
+ }
+
+ int64_t CalculateHeaderSize() const {
+ int64_t header_length = 0;
+ for (int col = 0; col < schema_->num_fields(); col++) {
+ const std::string& col_name = schema_->field(col)->name();
+ header_length += col_name.size();
+ header_length += CountEscapes(col_name);
+ }
+ return header_length + (kQuoteDelimiterCount * schema_->num_fields());
+ }
+
+ Status WriteHeader() {
+ // Only called once, as part of initialization
+ RETURN_NOT_OK(data_buffer_->Resize(CalculateHeaderSize(), /*shrink_to_fit=*/false));
+ char* next =
+ reinterpret_cast<char*>(data_buffer_->mutable_data() + data_buffer_->size() - 1);
+ for (int col = schema_->num_fields() - 1; col >= 0; col--) {
+ *next-- = ',';
+ *next-- = '"';
+ next = EscapeReverse(schema_->field(col)->name(), next);
+ *next-- = '"';
+ }
+ *(data_buffer_->mutable_data() + data_buffer_->size() - 1) = '\n';
+ DCHECK_EQ(reinterpret_cast<uint8_t*>(next + 1), data_buffer_->data());
+ return sink_->Write(data_buffer_);
+ }
+
+ Status TranslateMinimalBatch(const RecordBatch& batch) {
+ if (batch.num_rows() == 0) {
+ return Status::OK();
+ }
+ offsets_.resize(batch.num_rows());
+ std::fill(offsets_.begin(), offsets_.end(), 0);
+
+ // Calculate relative offsets for each row (excluding delimiters)
+ for (int32_t col = 0; col < static_cast<int32_t>(column_populators_.size()); col++) {
+ RETURN_NOT_OK(
+ column_populators_[col]->UpdateRowLengths(*batch.column(col), offsets_.data()));
+ }
+    // Calculate cumulative offsets for each row (including delimiters).
+ offsets_[0] += batch.num_columns();
+ for (int64_t row = 1; row < batch.num_rows(); row++) {
+ offsets_[row] += offsets_[row - 1] + /*delimiter lengths*/ batch.num_columns();
+ }
+    // Resize the target buffer to the required size. We assume batch-to-batch sizes
+    // will be pretty close, so don't shrink the buffer, to avoid allocation churn.
+ RETURN_NOT_OK(data_buffer_->Resize(offsets_.back(), /*shrink_to_fit=*/false));
+
+ // Use the offsets to populate contents.
+ for (auto populator = column_populators_.rbegin();
+ populator != column_populators_.rend(); populator++) {
+ (*populator)
+ ->PopulateColumns(reinterpret_cast<char*>(data_buffer_->mutable_data()),
+ offsets_.data());
+ }
+ DCHECK_EQ(0, offsets_[0]);
+ return Status::OK();
+ }
+
+ static constexpr int64_t kColumnSizeGuess = 8;
+ io::OutputStream* sink_;
+ std::shared_ptr<io::OutputStream> owned_sink_;
+ std::vector<std::unique_ptr<ColumnPopulator>> column_populators_;
+ std::vector<int32_t, arrow::stl::allocator<int32_t>> offsets_;
+ std::shared_ptr<ResizableBuffer> data_buffer_;
+ const std::shared_ptr<Schema> schema_;
+ const WriteOptions options_;
+ ipc::WriteStats stats_;
+};
+
+} // namespace
+
+Status WriteCSV(const Table& table, const WriteOptions& options,
+ arrow::io::OutputStream* output) {
+ ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(output, table.schema(), options));
+ RETURN_NOT_OK(writer->WriteTable(table));
+ return writer->Close();
+}
+
+Status WriteCSV(const RecordBatch& batch, const WriteOptions& options,
+ arrow::io::OutputStream* output) {
+ ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(output, batch.schema(), options));
+ RETURN_NOT_OK(writer->WriteRecordBatch(batch));
+ return writer->Close();
+}
+
+ARROW_EXPORT
+Result<std::shared_ptr<ipc::RecordBatchWriter>> MakeCSVWriter(
+ std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
+ const WriteOptions& options) {
+ return CSVWriterImpl::Make(sink.get(), sink, schema, options);
+}
+
+ARROW_EXPORT
+Result<std::shared_ptr<ipc::RecordBatchWriter>> MakeCSVWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const WriteOptions& options) {
+ return CSVWriterImpl::Make(sink, nullptr, schema, options);
+}
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/csv/writer.h b/contrib/libs/apache/arrow/cpp/src/arrow/csv/writer.h
new file mode 100644
index 00000000000..2f1442ae0af
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/csv/writer.h
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/csv/options.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/ipc/type_fwd.h"
+#include "arrow/record_batch.h"
+#include "arrow/table.h"
+
+namespace arrow {
+namespace csv {
+// Functionality for converting Arrow data to Comma Separated Value (CSV) text.
+// This library supports all primitive types that can be cast to a StringArray.
+// It applies the following formatting rules:
+// - For non-binary types no quotes surround values. Nulls are represented as the empty
+//   string.
+// - For binary types all non-null data is quoted (and quotes within data are escaped
+//   with an additional quote). Null values are empty and unquoted.
+// - LF (\n) is always used as a line ending.
+
+/// \brief Converts the table to CSV and writes the result to output.
+/// Experimental
+ARROW_EXPORT Status WriteCSV(const Table& table, const WriteOptions& options,
+ arrow::io::OutputStream* output);
+/// \brief Converts the batch to CSV and writes the result to output.
+/// Experimental
+ARROW_EXPORT Status WriteCSV(const RecordBatch& batch, const WriteOptions& options,
+ arrow::io::OutputStream* output);
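+
+/// Example usage (a minimal sketch; the output file name is an illustrative
+/// assumption):
+///
+///   ARROW_ASSIGN_OR_RAISE(auto out, io::FileOutputStream::Open("out.csv"));
+///   ARROW_RETURN_NOT_OK(WriteCSV(*table, WriteOptions::Defaults(), out.get()));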
+
+/// \brief Create a new CSV writer. User is responsible for closing the
+/// actual OutputStream.
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<ipc::RecordBatchWriter>> MakeCSVWriter(
+ std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
+ const WriteOptions& options = WriteOptions::Defaults());
+
+/// \brief Create a new CSV writer.
+///
+/// \param[in] sink output stream to write to (does not take ownership)
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<ipc::RecordBatchWriter>> MakeCSVWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const WriteOptions& options = WriteOptions::Defaults());
+
+} // namespace csv
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc b/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc
new file mode 100644
index 00000000000..dd10fce3e4d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc
@@ -0,0 +1,284 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/datum.h"
+
+#include <cstddef>
+#include <memory>
+#include <sstream>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/util.h"
+#include "arrow/chunked_array.h"
+#include "arrow/record_batch.h"
+#include "arrow/scalar.h"
+#include "arrow/table.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/memory.h"
+
+namespace arrow {
+
+static bool CollectionEquals(const std::vector<Datum>& left,
+ const std::vector<Datum>& right) {
+ if (left.size() != right.size()) {
+ return false;
+ }
+
+ for (size_t i = 0; i < left.size(); i++) {
+ if (!left[i].Equals(right[i])) {
+ return false;
+ }
+ }
+ return true;
+}
+
+Datum::Datum(const Array& value) : Datum(value.data()) {}
+
+Datum::Datum(const std::shared_ptr<Array>& value)
+ : Datum(value ? value->data() : NULLPTR) {}
+
+Datum::Datum(std::shared_ptr<ChunkedArray> value) : value(std::move(value)) {}
+Datum::Datum(std::shared_ptr<RecordBatch> value) : value(std::move(value)) {}
+Datum::Datum(std::shared_ptr<Table> value) : value(std::move(value)) {}
+Datum::Datum(std::vector<Datum> value) : value(std::move(value)) {}
+
+Datum::Datum(bool value) : value(std::make_shared<BooleanScalar>(value)) {}
+Datum::Datum(int8_t value) : value(std::make_shared<Int8Scalar>(value)) {}
+Datum::Datum(uint8_t value) : value(std::make_shared<UInt8Scalar>(value)) {}
+Datum::Datum(int16_t value) : value(std::make_shared<Int16Scalar>(value)) {}
+Datum::Datum(uint16_t value) : value(std::make_shared<UInt16Scalar>(value)) {}
+Datum::Datum(int32_t value) : value(std::make_shared<Int32Scalar>(value)) {}
+Datum::Datum(uint32_t value) : value(std::make_shared<UInt32Scalar>(value)) {}
+Datum::Datum(int64_t value) : value(std::make_shared<Int64Scalar>(value)) {}
+Datum::Datum(uint64_t value) : value(std::make_shared<UInt64Scalar>(value)) {}
+Datum::Datum(float value) : value(std::make_shared<FloatScalar>(value)) {}
+Datum::Datum(double value) : value(std::make_shared<DoubleScalar>(value)) {}
+Datum::Datum(std::string value)
+ : value(std::make_shared<StringScalar>(std::move(value))) {}
+Datum::Datum(const char* value) : value(std::make_shared<StringScalar>(value)) {}
+
+Datum::Datum(const ChunkedArray& value)
+ : value(std::make_shared<ChunkedArray>(value.chunks(), value.type())) {}
+
+Datum::Datum(const Table& value)
+ : value(Table::Make(value.schema(), value.columns(), value.num_rows())) {}
+
+Datum::Datum(const RecordBatch& value)
+ : value(RecordBatch::Make(value.schema(), value.num_rows(), value.columns())) {}
+
+std::shared_ptr<Array> Datum::make_array() const {
+ DCHECK_EQ(Datum::ARRAY, this->kind());
+ return MakeArray(util::get<std::shared_ptr<ArrayData>>(this->value));
+}
+
+std::shared_ptr<DataType> Datum::type() const {
+ if (this->kind() == Datum::ARRAY) {
+ return util::get<std::shared_ptr<ArrayData>>(this->value)->type;
+ }
+ if (this->kind() == Datum::CHUNKED_ARRAY) {
+ return util::get<std::shared_ptr<ChunkedArray>>(this->value)->type();
+ }
+ if (this->kind() == Datum::SCALAR) {
+ return util::get<std::shared_ptr<Scalar>>(this->value)->type;
+ }
+ return nullptr;
+}
+
+std::shared_ptr<Schema> Datum::schema() const {
+ if (this->kind() == Datum::RECORD_BATCH) {
+ return util::get<std::shared_ptr<RecordBatch>>(this->value)->schema();
+ }
+ if (this->kind() == Datum::TABLE) {
+ return util::get<std::shared_ptr<Table>>(this->value)->schema();
+ }
+ return nullptr;
+}
+
+int64_t Datum::length() const {
+ if (this->kind() == Datum::ARRAY) {
+ return util::get<std::shared_ptr<ArrayData>>(this->value)->length;
+ } else if (this->kind() == Datum::CHUNKED_ARRAY) {
+ return util::get<std::shared_ptr<ChunkedArray>>(this->value)->length();
+ } else if (this->kind() == Datum::SCALAR) {
+ return 1;
+ }
+ return kUnknownLength;
+}
+
+int64_t Datum::null_count() const {
+ if (this->kind() == Datum::ARRAY) {
+ return util::get<std::shared_ptr<ArrayData>>(this->value)->GetNullCount();
+ } else if (this->kind() == Datum::CHUNKED_ARRAY) {
+ return util::get<std::shared_ptr<ChunkedArray>>(this->value)->null_count();
+ } else if (this->kind() == Datum::SCALAR) {
+ const auto& val = *util::get<std::shared_ptr<Scalar>>(this->value);
+ return val.is_valid ? 0 : 1;
+ } else {
+ DCHECK(false) << "This function only valid for array-like values";
+ return 0;
+ }
+}
+
+ArrayVector Datum::chunks() const {
+ if (!this->is_arraylike()) {
+ return {};
+ }
+ if (this->is_array()) {
+ return {this->make_array()};
+ }
+ return this->chunked_array()->chunks();
+}
+
+bool Datum::Equals(const Datum& other) const {
+ if (this->kind() != other.kind()) return false;
+
+ switch (this->kind()) {
+ case Datum::NONE:
+ return true;
+ case Datum::SCALAR:
+ return internal::SharedPtrEquals(this->scalar(), other.scalar());
+ case Datum::ARRAY:
+ return internal::SharedPtrEquals(this->make_array(), other.make_array());
+ case Datum::CHUNKED_ARRAY:
+ return internal::SharedPtrEquals(this->chunked_array(), other.chunked_array());
+ case Datum::RECORD_BATCH:
+ return internal::SharedPtrEquals(this->record_batch(), other.record_batch());
+ case Datum::TABLE:
+ return internal::SharedPtrEquals(this->table(), other.table());
+ case Datum::COLLECTION:
+ return CollectionEquals(this->collection(), other.collection());
+ default:
+ return false;
+ }
+}
+
+ValueDescr Datum::descr() const {
+ if (this->is_arraylike()) {
+ return ValueDescr(this->type(), ValueDescr::ARRAY);
+ } else if (this->is_scalar()) {
+ return ValueDescr(this->type(), ValueDescr::SCALAR);
+ } else {
+ DCHECK(false) << "Datum is not value-like, this method should not be called";
+ return ValueDescr();
+ }
+}
+
+ValueDescr::Shape Datum::shape() const {
+ if (this->is_arraylike()) {
+ return ValueDescr::ARRAY;
+ } else if (this->is_scalar()) {
+ return ValueDescr::SCALAR;
+ } else {
+ DCHECK(false) << "Datum is not value-like, this method should not be called";
+ return ValueDescr::ANY;
+ }
+}
+
+static std::string FormatValueDescr(const ValueDescr& descr) {
+ std::stringstream ss;
+ switch (descr.shape) {
+ case ValueDescr::ANY:
+ ss << "any";
+ break;
+ case ValueDescr::ARRAY:
+ ss << "array";
+ break;
+ case ValueDescr::SCALAR:
+ ss << "scalar";
+ break;
+ default:
+ DCHECK(false);
+ break;
+ }
+ ss << "[" << descr.type->ToString() << "]";
+ return ss.str();
+}
+
+std::string ValueDescr::ToString() const { return FormatValueDescr(*this); }
+
+std::string ValueDescr::ToString(const std::vector<ValueDescr>& descrs) {
+ std::stringstream ss;
+ ss << "(";
+ for (size_t i = 0; i < descrs.size(); ++i) {
+ if (i > 0) {
+ ss << ", ";
+ }
+ ss << descrs[i].ToString();
+ }
+ ss << ")";
+ return ss.str();
+}
+
+void PrintTo(const ValueDescr& descr, std::ostream* os) { *os << descr.ToString(); }
+
+std::string Datum::ToString() const {
+ switch (this->kind()) {
+ case Datum::NONE:
+ return "nullptr";
+ case Datum::SCALAR:
+ return "Scalar";
+ case Datum::ARRAY:
+ return "Array";
+ case Datum::CHUNKED_ARRAY:
+ return "ChunkedArray";
+ case Datum::RECORD_BATCH:
+ return "RecordBatch";
+ case Datum::TABLE:
+ return "Table";
+ case Datum::COLLECTION: {
+ std::stringstream ss;
+ ss << "Collection(";
+ const auto& values = this->collection();
+ for (size_t i = 0; i < values.size(); ++i) {
+ if (i > 0) {
+ ss << ", ";
+ }
+ ss << values[i].ToString();
+ }
+ ss << ')';
+ return ss.str();
+ }
+ default:
+ DCHECK(false);
+ return "";
+ }
+}
+
+ValueDescr::Shape GetBroadcastShape(const std::vector<ValueDescr>& args) {
+ for (const auto& descr : args) {
+ if (descr.shape == ValueDescr::ARRAY) {
+ return ValueDescr::ARRAY;
+ }
+ }
+ return ValueDescr::SCALAR;
+}
+
+void PrintTo(const Datum& datum, std::ostream* os) {
+ switch (datum.kind()) {
+ case Datum::SCALAR:
+ *os << datum.scalar()->ToString();
+ break;
+ case Datum::ARRAY:
+ *os << datum.make_array()->ToString();
+ break;
+ default:
+ *os << datum.ToString();
+ }
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/datum.h b/contrib/libs/apache/arrow/cpp/src/arrow/datum.h
new file mode 100644
index 00000000000..6ba6af7f79e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/datum.h
@@ -0,0 +1,281 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/scalar.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/variant.h" // IWYU pragma: export
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class RecordBatch;
+class Table;
+
+/// \brief A descriptor type that gives the shape (array or scalar) and
+/// DataType of a Value, but without the data
+struct ARROW_EXPORT ValueDescr {
+ std::shared_ptr<DataType> type;
+ enum Shape {
+ /// \brief Either Array or Scalar
+ ANY,
+
+ /// \brief Array type
+ ARRAY,
+
+ /// \brief Only Scalar arguments supported
+ SCALAR
+ };
+
+ Shape shape;
+
+ ValueDescr() : shape(ANY) {}
+
+ ValueDescr(std::shared_ptr<DataType> type, ValueDescr::Shape shape)
+ : type(std::move(type)), shape(shape) {}
+
+ ValueDescr(std::shared_ptr<DataType> type) // NOLINT implicit conversion
+ : type(std::move(type)), shape(ValueDescr::ANY) {}
+
+ /// \brief Convenience constructor for ANY descr
+ static ValueDescr Any(std::shared_ptr<DataType> type) {
+ return ValueDescr(std::move(type), ANY);
+ }
+
+ /// \brief Convenience constructor for Value::ARRAY descr
+ static ValueDescr Array(std::shared_ptr<DataType> type) {
+ return ValueDescr(std::move(type), ARRAY);
+ }
+
+ /// \brief Convenience constructor for Value::SCALAR descr
+ static ValueDescr Scalar(std::shared_ptr<DataType> type) {
+ return ValueDescr(std::move(type), SCALAR);
+ }
+
+ bool operator==(const ValueDescr& other) const {
+ if (shape != other.shape) return false;
+ if (type == other.type) return true;
+ return type && type->Equals(other.type);
+ }
+
+ bool operator!=(const ValueDescr& other) const { return !(*this == other); }
+
+ std::string ToString() const;
+ static std::string ToString(const std::vector<ValueDescr>&);
+
+ ARROW_EXPORT friend void PrintTo(const ValueDescr&, std::ostream*);
+};
+
+/// \brief For use with scalar functions, returns the broadcasted Value::Shape
+/// given a vector of value descriptors. Return SCALAR unless any value is
+/// ARRAY
+ARROW_EXPORT
+ValueDescr::Shape GetBroadcastShape(const std::vector<ValueDescr>& args);
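+
+// Sketch of the broadcasting rule above (assumes the standard int32() type
+// factory): mixing scalar and array arguments yields ARRAY.
+//
+//   std::vector<ValueDescr> args = {ValueDescr::Scalar(int32()),
+//                                   ValueDescr::Array(int32())};
+//   assert(GetBroadcastShape(args) == ValueDescr::ARRAY);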
+
+/// \class Datum
+/// \brief Variant type for various Arrow C++ data structures
+struct ARROW_EXPORT Datum {
+ enum Kind { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE, COLLECTION };
+
+ struct Empty {};
+
+  // Datum variants may have a length. This special value indicates that the
+  // current variant does not have a length.
+ static constexpr int64_t kUnknownLength = -1;
+
+ util::Variant<Empty, std::shared_ptr<Scalar>, std::shared_ptr<ArrayData>,
+ std::shared_ptr<ChunkedArray>, std::shared_ptr<RecordBatch>,
+ std::shared_ptr<Table>, std::vector<Datum>>
+ value;
+
+ /// \brief Empty datum, to be populated elsewhere
+ Datum() = default;
+
+ Datum(const Datum& other) = default;
+ Datum& operator=(const Datum& other) = default;
+ Datum(Datum&& other) = default;
+ Datum& operator=(Datum&& other) = default;
+
+ Datum(std::shared_ptr<Scalar> value) // NOLINT implicit conversion
+ : value(std::move(value)) {}
+
+ Datum(std::shared_ptr<ArrayData> value) // NOLINT implicit conversion
+ : value(std::move(value)) {}
+
+ Datum(ArrayData arg) // NOLINT implicit conversion
+ : value(std::make_shared<ArrayData>(std::move(arg))) {}
+
+ Datum(const Array& value); // NOLINT implicit conversion
+ Datum(const std::shared_ptr<Array>& value); // NOLINT implicit conversion
+ Datum(std::shared_ptr<ChunkedArray> value); // NOLINT implicit conversion
+ Datum(std::shared_ptr<RecordBatch> value); // NOLINT implicit conversion
+ Datum(std::shared_ptr<Table> value); // NOLINT implicit conversion
+ Datum(std::vector<Datum> value); // NOLINT implicit conversion
+
+ // Explicit constructors from const-refs. Can be expensive, prefer the
+ // shared_ptr constructors
+ explicit Datum(const ChunkedArray& value);
+ explicit Datum(const RecordBatch& value);
+ explicit Datum(const Table& value);
+
+ // Cast from subtypes of Array to Datum
+ template <typename T, typename = enable_if_t<std::is_base_of<Array, T>::value>>
+ Datum(const std::shared_ptr<T>& value) // NOLINT implicit conversion
+ : Datum(std::shared_ptr<Array>(value)) {}
+
+ // Convenience constructors
+ explicit Datum(bool value);
+ explicit Datum(int8_t value);
+ explicit Datum(uint8_t value);
+ explicit Datum(int16_t value);
+ explicit Datum(uint16_t value);
+ explicit Datum(int32_t value);
+ explicit Datum(uint32_t value);
+ explicit Datum(int64_t value);
+ explicit Datum(uint64_t value);
+ explicit Datum(float value);
+ explicit Datum(double value);
+ explicit Datum(std::string value);
+ explicit Datum(const char* value);
+
+ Datum::Kind kind() const {
+ switch (this->value.index()) {
+ case 0:
+ return Datum::NONE;
+ case 1:
+ return Datum::SCALAR;
+ case 2:
+ return Datum::ARRAY;
+ case 3:
+ return Datum::CHUNKED_ARRAY;
+ case 4:
+ return Datum::RECORD_BATCH;
+ case 5:
+ return Datum::TABLE;
+ case 6:
+ return Datum::COLLECTION;
+ default:
+ return Datum::NONE;
+ }
+ }
+
+ const std::shared_ptr<ArrayData>& array() const {
+ return util::get<std::shared_ptr<ArrayData>>(this->value);
+ }
+
+ ArrayData* mutable_array() const { return this->array().get(); }
+
+ std::shared_ptr<Array> make_array() const;
+
+ const std::shared_ptr<ChunkedArray>& chunked_array() const {
+ return util::get<std::shared_ptr<ChunkedArray>>(this->value);
+ }
+
+ const std::shared_ptr<RecordBatch>& record_batch() const {
+ return util::get<std::shared_ptr<RecordBatch>>(this->value);
+ }
+
+ const std::shared_ptr<Table>& table() const {
+ return util::get<std::shared_ptr<Table>>(this->value);
+ }
+
+ const std::vector<Datum>& collection() const {
+ return util::get<std::vector<Datum>>(this->value);
+ }
+
+ const std::shared_ptr<Scalar>& scalar() const {
+ return util::get<std::shared_ptr<Scalar>>(this->value);
+ }
+
+ template <typename ExactType>
+ std::shared_ptr<ExactType> array_as() const {
+ return internal::checked_pointer_cast<ExactType>(this->make_array());
+ }
+
+ template <typename ExactType>
+ const ExactType& scalar_as() const {
+ return internal::checked_cast<const ExactType&>(*this->scalar());
+ }
+
+ bool is_array() const { return this->kind() == Datum::ARRAY; }
+
+ bool is_arraylike() const {
+ return this->kind() == Datum::ARRAY || this->kind() == Datum::CHUNKED_ARRAY;
+ }
+
+ bool is_scalar() const { return this->kind() == Datum::SCALAR; }
+
+ /// \brief True if Datum contains a scalar or array-like data
+ bool is_value() const { return this->is_arraylike() || this->is_scalar(); }
+
+ bool is_collection() const { return this->kind() == Datum::COLLECTION; }
+
+ int64_t null_count() const;
+
+ /// \brief Return the shape (array or scalar) and type for supported kinds
+ /// (ARRAY, CHUNKED_ARRAY, and SCALAR). Debug asserts otherwise
+ ValueDescr descr() const;
+
+ /// \brief Return the shape (array or scalar) for supported kinds (ARRAY,
+ /// CHUNKED_ARRAY, and SCALAR). Debug asserts otherwise
+ ValueDescr::Shape shape() const;
+
+ /// \brief The value type of the variant, if any
+ ///
+ /// \return nullptr if no type
+ std::shared_ptr<DataType> type() const;
+
+ /// \brief The schema of the variant, if any
+ ///
+ /// \return nullptr if no schema
+ std::shared_ptr<Schema> schema() const;
+
+ /// \brief The value length of the variant, if any
+ ///
+ /// \return kUnknownLength if no type
+ int64_t length() const;
+
+ /// \brief The array chunks of the variant, if any
+ ///
+ /// \return empty if not arraylike
+ ArrayVector chunks() const;
+
+ bool Equals(const Datum& other) const;
+
+ bool operator==(const Datum& other) const { return Equals(other); }
+ bool operator!=(const Datum& other) const { return !Equals(other); }
+
+ std::string ToString() const;
+
+ ARROW_EXPORT friend void PrintTo(const Datum&, std::ostream*);
+};
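+
+// Usage sketch (assumes `arr` is an existing std::shared_ptr<arrow::Array>
+// holding int32 values):
+//
+//   Datum d(arr);                        // implicit conversion from Array
+//   assert(d.is_array());
+//   assert(d.type()->id() == Type::INT32);
+//
+//   Datum s(int64_t(42));                // wraps an Int64Scalar
+//   assert(s.is_scalar() && s.length() == 1);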
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/device.cc b/contrib/libs/apache/arrow/cpp/src/arrow/device.cc
new file mode 100644
index 00000000000..1aead49bfb1
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/device.cc
@@ -0,0 +1,209 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/device.h"
+
+#include <cstring>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/io/memory.h"
+#include "arrow/result.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+MemoryManager::~MemoryManager() {}
+
+Device::~Device() {}
+
+#define COPY_BUFFER_SUCCESS(maybe_buffer) \
+ ((maybe_buffer).ok() && *(maybe_buffer) != nullptr)
+
+#define COPY_BUFFER_RETURN(maybe_buffer, to) \
+ if (!maybe_buffer.ok()) { \
+ return maybe_buffer; \
+ } \
+ if (COPY_BUFFER_SUCCESS(maybe_buffer)) { \
+ DCHECK_EQ(*(**maybe_buffer).device(), *to->device()); \
+ return maybe_buffer; \
+ }
+
+Result<std::shared_ptr<Buffer>> MemoryManager::CopyBuffer(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to) {
+ const auto& from = buf->memory_manager();
+ auto maybe_buffer = to->CopyBufferFrom(buf, from);
+ COPY_BUFFER_RETURN(maybe_buffer, to);
+ // `to` doesn't support copying from `from`, try the other way
+ maybe_buffer = from->CopyBufferTo(buf, to);
+ COPY_BUFFER_RETURN(maybe_buffer, to);
+ if (!from->is_cpu() && !to->is_cpu()) {
+ // Try an intermediate view on the CPU
+ auto cpu_mm = default_cpu_memory_manager();
+ maybe_buffer = from->ViewBufferTo(buf, cpu_mm);
+ if (!COPY_BUFFER_SUCCESS(maybe_buffer)) {
+ // View failed, try a copy instead
+ // XXX should we have a MemoryManager::IsCopySupportedTo(MemoryManager)
+ // to avoid copying to CPU if copy from CPU to dest is unsupported?
+ maybe_buffer = from->CopyBufferTo(buf, cpu_mm);
+ }
+ if (COPY_BUFFER_SUCCESS(maybe_buffer)) {
+ // Copy from source to CPU succeeded, now try to copy from CPU into dest
+ maybe_buffer = to->CopyBufferFrom(*maybe_buffer, cpu_mm);
+ if (COPY_BUFFER_SUCCESS(maybe_buffer)) {
+ return maybe_buffer;
+ }
+ }
+ }
+
+ return Status::NotImplemented("Copying buffer from ", from->device()->ToString(),
+ " to ", to->device()->ToString(), " not supported");
+}
+
+Result<std::shared_ptr<Buffer>> MemoryManager::ViewBuffer(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to) {
+ if (buf->memory_manager() == to) {
+ return buf;
+ }
+ const auto& from = buf->memory_manager();
+ auto maybe_buffer = to->ViewBufferFrom(buf, from);
+ COPY_BUFFER_RETURN(maybe_buffer, to);
+ // `to` doesn't support viewing from `from`, try the other way
+ maybe_buffer = from->ViewBufferTo(buf, to);
+ COPY_BUFFER_RETURN(maybe_buffer, to);
+
+ return Status::NotImplemented("Viewing buffer from ", from->device()->ToString(),
+ " on ", to->device()->ToString(), " not supported");
+}
+
+#undef COPY_BUFFER_RETURN
+#undef COPY_BUFFER_SUCCESS
+
+Result<std::shared_ptr<Buffer>> MemoryManager::CopyBufferFrom(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from) {
+ return nullptr;
+}
+
+Result<std::shared_ptr<Buffer>> MemoryManager::CopyBufferTo(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to) {
+ return nullptr;
+}
+
+Result<std::shared_ptr<Buffer>> MemoryManager::ViewBufferFrom(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from) {
+ return nullptr;
+}
+
+Result<std::shared_ptr<Buffer>> MemoryManager::ViewBufferTo(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to) {
+ return nullptr;
+}
+
+// ----------------------------------------------------------------------
+// CPU backend implementation
+
+namespace {
+const char kCPUDeviceTypeName[] = "arrow::CPUDevice";
+}  // namespace
+
+std::shared_ptr<MemoryManager> CPUMemoryManager::Make(
+ const std::shared_ptr<Device>& device, MemoryPool* pool) {
+ return std::shared_ptr<MemoryManager>(new CPUMemoryManager(device, pool));
+}
+
+Result<std::shared_ptr<io::RandomAccessFile>> CPUMemoryManager::GetBufferReader(
+ std::shared_ptr<Buffer> buf) {
+ return std::make_shared<io::BufferReader>(std::move(buf));
+}
+
+Result<std::shared_ptr<io::OutputStream>> CPUMemoryManager::GetBufferWriter(
+ std::shared_ptr<Buffer> buf) {
+ return std::make_shared<io::FixedSizeBufferWriter>(std::move(buf));
+}
+
+Result<std::shared_ptr<Buffer>> CPUMemoryManager::AllocateBuffer(int64_t size) {
+ return ::arrow::AllocateBuffer(size, pool_);
+}
+
+Result<std::shared_ptr<Buffer>> CPUMemoryManager::CopyBufferFrom(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from) {
+ if (!from->is_cpu()) {
+ return nullptr;
+ }
+ ARROW_ASSIGN_OR_RAISE(auto dest, ::arrow::AllocateBuffer(buf->size(), pool_));
+ if (buf->size() > 0) {
+ memcpy(dest->mutable_data(), buf->data(), static_cast<size_t>(buf->size()));
+ }
+ return std::move(dest);
+}
+
+Result<std::shared_ptr<Buffer>> CPUMemoryManager::ViewBufferFrom(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from) {
+ if (!from->is_cpu()) {
+ return nullptr;
+ }
+ return buf;
+}
+
+Result<std::shared_ptr<Buffer>> CPUMemoryManager::CopyBufferTo(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to) {
+ if (!to->is_cpu()) {
+ return nullptr;
+ }
+ ARROW_ASSIGN_OR_RAISE(auto dest, ::arrow::AllocateBuffer(buf->size(), pool_));
+ if (buf->size() > 0) {
+ memcpy(dest->mutable_data(), buf->data(), static_cast<size_t>(buf->size()));
+ }
+ return std::move(dest);
+}
+
+Result<std::shared_ptr<Buffer>> CPUMemoryManager::ViewBufferTo(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to) {
+ if (!to->is_cpu()) {
+ return nullptr;
+ }
+ return buf;
+}
+
+std::shared_ptr<MemoryManager> default_cpu_memory_manager() {
+ static auto instance =
+ CPUMemoryManager::Make(CPUDevice::Instance(), default_memory_pool());
+ return instance;
+}
+
+std::shared_ptr<Device> CPUDevice::Instance() {
+ static auto instance = std::shared_ptr<Device>(new CPUDevice());
+ return instance;
+}
+
+const char* CPUDevice::type_name() const { return kCPUDeviceTypeName; }
+
+std::string CPUDevice::ToString() const { return "CPUDevice()"; }
+
+bool CPUDevice::Equals(const Device& other) const {
+ return other.type_name() == kCPUDeviceTypeName;
+}
+
+std::shared_ptr<MemoryManager> CPUDevice::memory_manager(MemoryPool* pool) {
+ return CPUMemoryManager::Make(Instance(), pool);
+}
+
+std::shared_ptr<MemoryManager> CPUDevice::default_memory_manager() {
+ return default_cpu_memory_manager();
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/device.h b/contrib/libs/apache/arrow/cpp/src/arrow/device.h
new file mode 100644
index 00000000000..068be483e98
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/device.h
@@ -0,0 +1,226 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "arrow/io/type_fwd.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryManager;
+
+/// \brief EXPERIMENTAL: Abstract interface for hardware devices
+///
+/// This object represents a device with access to some memory spaces.
+/// When handling a Buffer or raw memory address, it allows deciding in which
+/// context the raw memory address should be interpreted
+/// (e.g. CPU-accessible memory, or embedded memory on some particular GPU).
+class ARROW_EXPORT Device : public std::enable_shared_from_this<Device>,
+ public util::EqualityComparable<Device> {
+ public:
+ virtual ~Device();
+
+ /// \brief A shorthand for this device's type.
+ ///
+ /// The returned value is different for each device class, but is the
+ /// same for all instances of a given class. It can be used as a replacement
+ /// for RTTI.
+ virtual const char* type_name() const = 0;
+
+ /// \brief A human-readable description of the device.
+ ///
+ /// The returned value should be detailed enough to distinguish between
+ /// different instances, where necessary.
+ virtual std::string ToString() const = 0;
+
+ /// \brief Whether this instance points to the same device as another one.
+ virtual bool Equals(const Device&) const = 0;
+
+ /// \brief Whether this device is the main CPU device.
+ ///
+ /// This shorthand method is very useful when deciding whether a memory address
+ /// is CPU-accessible.
+ bool is_cpu() const { return is_cpu_; }
+
+ /// \brief Return a MemoryManager instance tied to this device
+ ///
+ /// The returned instance uses default parameters for this device type's
+ /// MemoryManager implementation. Some devices also allow constructing
+ /// MemoryManager instances with non-default parameters.
+ virtual std::shared_ptr<MemoryManager> default_memory_manager() = 0;
+
+ protected:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Device);
+ explicit Device(bool is_cpu = false) : is_cpu_(is_cpu) {}
+
+ bool is_cpu_;
+};
+
+/// \brief EXPERIMENTAL: An object that provides memory management primitives
+///
+/// A MemoryManager is always tied to a particular Device instance.
+/// It can also have additional parameters (such as a MemoryPool to
+/// allocate CPU memory).
+class ARROW_EXPORT MemoryManager : public std::enable_shared_from_this<MemoryManager> {
+ public:
+ virtual ~MemoryManager();
+
+ /// \brief The device this MemoryManager is tied to
+ const std::shared_ptr<Device>& device() const { return device_; }
+
+ /// \brief Whether this MemoryManager is tied to the main CPU device.
+ ///
+ /// This shorthand method is very useful when deciding whether a memory address
+ /// is CPU-accessible.
+ bool is_cpu() const { return device_->is_cpu(); }
+
+ /// \brief Create a RandomAccessFile to read a particular buffer.
+ ///
+ /// The given buffer must be tied to this MemoryManager.
+ ///
+ /// See also the Buffer::GetReader shorthand.
+ virtual Result<std::shared_ptr<io::RandomAccessFile>> GetBufferReader(
+ std::shared_ptr<Buffer> buf) = 0;
+
+  /// \brief Create an OutputStream to write to a particular buffer.
+ ///
+ /// The given buffer must be mutable and tied to this MemoryManager.
+ /// The returned stream object writes into the buffer's underlying memory
+ /// (but it won't resize it).
+ ///
+ /// See also the Buffer::GetWriter shorthand.
+ virtual Result<std::shared_ptr<io::OutputStream>> GetBufferWriter(
+ std::shared_ptr<Buffer> buf) = 0;
+
+ /// \brief Allocate a (mutable) Buffer
+ ///
+ /// The buffer will be allocated in the device's memory.
+ virtual Result<std::shared_ptr<Buffer>> AllocateBuffer(int64_t size) = 0;
+
+ // XXX Should this take a `const Buffer&` instead
+ /// \brief Copy a Buffer to a destination MemoryManager
+ ///
+ /// See also the Buffer::Copy shorthand.
+ static Result<std::shared_ptr<Buffer>> CopyBuffer(
+ const std::shared_ptr<Buffer>& source, const std::shared_ptr<MemoryManager>& to);
+
+ /// \brief Make a no-copy Buffer view in a destination MemoryManager
+ ///
+ /// See also the Buffer::View shorthand.
+ static Result<std::shared_ptr<Buffer>> ViewBuffer(
+ const std::shared_ptr<Buffer>& source, const std::shared_ptr<MemoryManager>& to);
+
+ protected:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(MemoryManager);
+
+ explicit MemoryManager(const std::shared_ptr<Device>& device) : device_(device) {}
+
+  // Default implementations always return nullptr; they should be overridden
+  // by subclasses that support data transfer.
+  // (Returning nullptr means the copy / view is unsupported.)
+ // In CopyBufferFrom and ViewBufferFrom, the `from` parameter is guaranteed to
+ // be equal to `buf->memory_manager()`.
+ virtual Result<std::shared_ptr<Buffer>> CopyBufferFrom(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from);
+ virtual Result<std::shared_ptr<Buffer>> CopyBufferTo(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to);
+ virtual Result<std::shared_ptr<Buffer>> ViewBufferFrom(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from);
+ virtual Result<std::shared_ptr<Buffer>> ViewBufferTo(
+ const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to);
+
+ std::shared_ptr<Device> device_;
+};
+
+// ----------------------------------------------------------------------
+// CPU backend implementation
+
+class ARROW_EXPORT CPUDevice : public Device {
+ public:
+ const char* type_name() const override;
+ std::string ToString() const override;
+ bool Equals(const Device&) const override;
+
+ std::shared_ptr<MemoryManager> default_memory_manager() override;
+
+ /// \brief Return the global CPUDevice instance
+ static std::shared_ptr<Device> Instance();
+
+ /// \brief Create a MemoryManager
+ ///
+ /// The returned MemoryManager will use the given MemoryPool for allocations.
+ static std::shared_ptr<MemoryManager> memory_manager(MemoryPool* pool);
+
+ protected:
+ CPUDevice() : Device(true) {}
+};
+
+class ARROW_EXPORT CPUMemoryManager : public MemoryManager {
+ public:
+ Result<std::shared_ptr<io::RandomAccessFile>> GetBufferReader(
+ std::shared_ptr<Buffer> buf) override;
+ Result<std::shared_ptr<io::OutputStream>> GetBufferWriter(
+ std::shared_ptr<Buffer> buf) override;
+
+ Result<std::shared_ptr<Buffer>> AllocateBuffer(int64_t size) override;
+
+ /// \brief Return the MemoryPool associated with this MemoryManager.
+ MemoryPool* pool() const { return pool_; }
+
+ protected:
+ CPUMemoryManager(const std::shared_ptr<Device>& device, MemoryPool* pool)
+ : MemoryManager(device), pool_(pool) {}
+
+ static std::shared_ptr<MemoryManager> Make(const std::shared_ptr<Device>& device,
+ MemoryPool* pool = default_memory_pool());
+
+ Result<std::shared_ptr<Buffer>> CopyBufferFrom(
+ const std::shared_ptr<Buffer>& buf,
+ const std::shared_ptr<MemoryManager>& from) override;
+ Result<std::shared_ptr<Buffer>> CopyBufferTo(
+ const std::shared_ptr<Buffer>& buf,
+ const std::shared_ptr<MemoryManager>& to) override;
+ Result<std::shared_ptr<Buffer>> ViewBufferFrom(
+ const std::shared_ptr<Buffer>& buf,
+ const std::shared_ptr<MemoryManager>& from) override;
+ Result<std::shared_ptr<Buffer>> ViewBufferTo(
+ const std::shared_ptr<Buffer>& buf,
+ const std::shared_ptr<MemoryManager>& to) override;
+
+ MemoryPool* pool_;
+
+ friend std::shared_ptr<MemoryManager> CPUDevice::memory_manager(MemoryPool* pool);
+ friend ARROW_EXPORT std::shared_ptr<MemoryManager> default_cpu_memory_manager();
+};
+
+/// \brief Return the default CPU MemoryManager instance
+///
+/// The returned singleton instance uses the default MemoryPool.
+/// This function is a faster spelling of
+/// `CPUDevice::Instance()->default_memory_manager()`.
+ARROW_EXPORT
+std::shared_ptr<MemoryManager> default_cpu_memory_manager();
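+
+// Usage sketch (assumes `src` is an existing std::shared_ptr<arrow::Buffer>;
+// with both endpoints on the CPU, the copy and view paths below are trivially
+// supported):
+//
+//   auto mm = arrow::default_cpu_memory_manager();
+//   ARROW_ASSIGN_OR_RAISE(auto copied, MemoryManager::CopyBuffer(src, mm));
+//   ARROW_ASSIGN_OR_RAISE(auto viewed, MemoryManager::ViewBuffer(src, mm));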
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/extension_type.cc b/contrib/libs/apache/arrow/cpp/src/arrow/extension_type.cc
new file mode 100644
index 00000000000..e579b691023
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/extension_type.cc
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension_type.h"
+
+#include <memory>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "arrow/array/util.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+DataTypeLayout ExtensionType::layout() const { return storage_type_->layout(); }
+
+std::string ExtensionType::ToString() const {
+ std::stringstream ss;
+ ss << "extension<" << this->extension_name() << ">";
+ return ss.str();
+}
+
+std::shared_ptr<Array> ExtensionType::WrapArray(const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& storage) {
+ DCHECK_EQ(type->id(), Type::EXTENSION);
+ const auto& ext_type = checked_cast<const ExtensionType&>(*type);
+ DCHECK_EQ(storage->type_id(), ext_type.storage_type()->id());
+ auto data = storage->data()->Copy();
+ data->type = type;
+ return ext_type.MakeArray(std::move(data));
+}
+
+std::shared_ptr<ChunkedArray> ExtensionType::WrapArray(
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<ChunkedArray>& storage) {
+ DCHECK_EQ(type->id(), Type::EXTENSION);
+ const auto& ext_type = checked_cast<const ExtensionType&>(*type);
+ DCHECK_EQ(storage->type()->id(), ext_type.storage_type()->id());
+
+ ArrayVector out_chunks(storage->num_chunks());
+ for (int i = 0; i < storage->num_chunks(); i++) {
+ auto data = storage->chunk(i)->data()->Copy();
+ data->type = type;
+ out_chunks[i] = ext_type.MakeArray(std::move(data));
+ }
+ return std::make_shared<ChunkedArray>(std::move(out_chunks));
+}
+
+ExtensionArray::ExtensionArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+
+ExtensionArray::ExtensionArray(const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& storage) {
+ ARROW_CHECK_EQ(type->id(), Type::EXTENSION);
+ ARROW_CHECK(
+ storage->type()->Equals(*checked_cast<const ExtensionType&>(*type).storage_type()));
+ auto data = storage->data()->Copy();
+ // XXX This pointer is reverted below in SetData()...
+ data->type = type;
+ SetData(data);
+}
+
+void ExtensionArray::SetData(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::EXTENSION);
+ this->Array::SetData(data);
+
+ auto storage_data = data->Copy();
+ storage_data->type = (static_cast<const ExtensionType&>(*data->type).storage_type());
+ storage_ = MakeArray(storage_data);
+}
+
+class ExtensionTypeRegistryImpl : public ExtensionTypeRegistry {
+ public:
+ ExtensionTypeRegistryImpl() {}
+
+ Status RegisterType(std::shared_ptr<ExtensionType> type) override {
+ std::lock_guard<std::mutex> lock(lock_);
+ std::string type_name = type->extension_name();
+ auto it = name_to_type_.find(type_name);
+ if (it != name_to_type_.end()) {
+ return Status::KeyError("A type extension with name ", type_name,
+ " already defined");
+ }
+ name_to_type_[type_name] = std::move(type);
+ return Status::OK();
+ }
+
+ Status UnregisterType(const std::string& type_name) override {
+ std::lock_guard<std::mutex> lock(lock_);
+ auto it = name_to_type_.find(type_name);
+ if (it == name_to_type_.end()) {
+ return Status::KeyError("No type extension with name ", type_name, " found");
+ }
+ name_to_type_.erase(it);
+ return Status::OK();
+ }
+
+ std::shared_ptr<ExtensionType> GetType(const std::string& type_name) override {
+ std::lock_guard<std::mutex> lock(lock_);
+ auto it = name_to_type_.find(type_name);
+    if (it == name_to_type_.end()) {
+      return nullptr;
+    }
+    return it->second;
+  }
+
+ private:
+ std::mutex lock_;
+ std::unordered_map<std::string, std::shared_ptr<ExtensionType>> name_to_type_;
+};
+
+static std::shared_ptr<ExtensionTypeRegistry> g_registry;
+static std::once_flag registry_initialized;
+
+namespace internal {
+
+static void CreateGlobalRegistry() {
+ g_registry = std::make_shared<ExtensionTypeRegistryImpl>();
+}
+
+} // namespace internal
+
+std::shared_ptr<ExtensionTypeRegistry> ExtensionTypeRegistry::GetGlobalRegistry() {
+ std::call_once(registry_initialized, internal::CreateGlobalRegistry);
+ return g_registry;
+}
+
+Status RegisterExtensionType(std::shared_ptr<ExtensionType> type) {
+ auto registry = ExtensionTypeRegistry::GetGlobalRegistry();
+ return registry->RegisterType(type);
+}
+
+Status UnregisterExtensionType(const std::string& type_name) {
+ auto registry = ExtensionTypeRegistry::GetGlobalRegistry();
+ return registry->UnregisterType(type_name);
+}
+
+std::shared_ptr<ExtensionType> GetExtensionType(const std::string& type_name) {
+ auto registry = ExtensionTypeRegistry::GetGlobalRegistry();
+ return registry->GetType(type_name);
+}
+
+extern const char kExtensionTypeKeyName[] = "ARROW:extension:name";
+extern const char kExtensionMetadataKeyName[] = "ARROW:extension:metadata";
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/extension_type.h b/contrib/libs/apache/arrow/cpp/src/arrow/extension_type.h
new file mode 100644
index 00000000000..a22d015195d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/extension_type.h
@@ -0,0 +1,161 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// User-defined extension types. EXPERIMENTAL in 0.13.0
+/// \since 0.13.0
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief The base class for custom / user-defined types.
+class ARROW_EXPORT ExtensionType : public DataType {
+ public:
+ static constexpr Type::type type_id = Type::EXTENSION;
+
+ static constexpr const char* type_name() { return "extension"; }
+
+ /// \brief The type of array used to represent this extension type's data
+ std::shared_ptr<DataType> storage_type() const { return storage_type_; }
+
+ DataTypeLayout layout() const override;
+
+ std::string ToString() const override;
+
+ std::string name() const override { return "extension"; }
+
+ /// \brief Unique name of extension type used to identify type for
+ /// serialization
+ /// \return the string name of the extension
+ virtual std::string extension_name() const = 0;
+
+ /// \brief Determine if two instances of the same extension types are
+ /// equal. Invoked from ExtensionType::Equals
+ /// \param[in] other the type to compare this type with
+ /// \return bool true if type instances are equal
+ virtual bool ExtensionEquals(const ExtensionType& other) const = 0;
+
+ /// \brief Wrap built-in Array type in a user-defined ExtensionArray instance
+ /// \param[in] data the physical storage for the extension type
+ virtual std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const = 0;
+
+ /// \brief Create an instance of the ExtensionType given the actual storage
+ /// type and the serialized representation
+ /// \param[in] storage_type the physical storage type of the extension
+ /// \param[in] serialized_data the serialized representation produced by
+ /// Serialize
+ virtual Result<std::shared_ptr<DataType>> Deserialize(
+ std::shared_ptr<DataType> storage_type,
+ const std::string& serialized_data) const = 0;
+
+ /// \brief Create a serialized representation of the extension type's
+ /// metadata. The storage type will be handled automatically in IPC code
+ /// paths
+ /// \return the serialized representation
+ virtual std::string Serialize() const = 0;
+
+ /// \brief Wrap the given storage array as an extension array
+ static std::shared_ptr<Array> WrapArray(const std::shared_ptr<DataType>& ext_type,
+ const std::shared_ptr<Array>& storage);
+
+ /// \brief Wrap the given chunked storage array as a chunked extension array
+ static std::shared_ptr<ChunkedArray> WrapArray(
+ const std::shared_ptr<DataType>& ext_type,
+ const std::shared_ptr<ChunkedArray>& storage);
+
+ protected:
+ explicit ExtensionType(std::shared_ptr<DataType> storage_type)
+ : DataType(Type::EXTENSION), storage_type_(storage_type) {}
+
+ std::shared_ptr<DataType> storage_type_;
+};
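+
+// Sketch of wrapping existing storage (`uuid_type` is a hypothetical
+// ExtensionType instance backed by fixed_size_binary(16) storage, and
+// `storage` is an Array of that storage type):
+//
+//   std::shared_ptr<Array> ext_array =
+//       ExtensionType::WrapArray(uuid_type, storage);
+//   assert(ext_array->type_id() == Type::EXTENSION);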
+
+/// \brief Base array class for user-defined extension types
+class ARROW_EXPORT ExtensionArray : public Array {
+ public:
+ /// \brief Construct an ExtensionArray from an ArrayData.
+ ///
+ /// The ArrayData must have the right ExtensionType.
+ explicit ExtensionArray(const std::shared_ptr<ArrayData>& data);
+
+ /// \brief Construct an ExtensionArray from a type and the underlying storage.
+ ExtensionArray(const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& storage);
+
+ const ExtensionType* extension_type() const {
+ return internal::checked_cast<const ExtensionType*>(data_->type.get());
+ }
+
+ /// \brief The physical storage for the extension array
+ std::shared_ptr<Array> storage() const { return storage_; }
+
+ protected:
+ void SetData(const std::shared_ptr<ArrayData>& data);
+ std::shared_ptr<Array> storage_;
+};
+
+class ARROW_EXPORT ExtensionTypeRegistry {
+ public:
+  /// \brief Provide access to the global registry, so that code can guard
+  /// against race conditions in registry teardown when some types need to be
+  /// unregistered and destroyed first.
+ static std::shared_ptr<ExtensionTypeRegistry> GetGlobalRegistry();
+
+ virtual ~ExtensionTypeRegistry() = default;
+
+ virtual Status RegisterType(std::shared_ptr<ExtensionType> type) = 0;
+ virtual Status UnregisterType(const std::string& type_name) = 0;
+ virtual std::shared_ptr<ExtensionType> GetType(const std::string& type_name) = 0;
+};
+
+/// \brief Register an extension type globally. The name returned by the type's
+/// extension_name() method should be unique. This method is thread-safe.
+/// \param[in] type an instance of the extension type
+/// \return Status
+ARROW_EXPORT
+Status RegisterExtensionType(std::shared_ptr<ExtensionType> type);
+
+/// \brief Delete an extension type from the global registry. This method is
+/// thread-safe.
+/// \param[in] type_name the unique name of a registered extension type
+/// \return Status error if the type name is unknown
+ARROW_EXPORT
+Status UnregisterExtensionType(const std::string& type_name);
+
+/// \brief Retrieve an extension type from the global registry. Returns nullptr
+/// if not found. This method is thread-safe.
+/// \return the globally-registered extension type
+ARROW_EXPORT
+std::shared_ptr<ExtensionType> GetExtensionType(const std::string& type_name);
+
+ARROW_EXPORT extern const char kExtensionTypeKeyName[];
+ARROW_EXPORT extern const char kExtensionMetadataKeyName[];
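+
+// Registry usage sketch (`UuidType` is a hypothetical ExtensionType subclass
+// whose extension_name() returns "uuid"):
+//
+//   RETURN_NOT_OK(arrow::RegisterExtensionType(std::make_shared<UuidType>()));
+//   auto type = arrow::GetExtensionType("uuid");  // nullptr if not registered
+//   RETURN_NOT_OK(arrow::UnregisterExtensionType("uuid"));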
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/api.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/api.h
new file mode 100644
index 00000000000..3bfde6de452
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/api.h
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/buffered.h"
+#include "arrow/io/compressed.h"
+#include "arrow/io/file.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/memory.h"
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc
new file mode 100644
index 00000000000..7804c130ca1
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc
@@ -0,0 +1,489 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/io/buffered.h"
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <mutex>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/io/util_internal.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+
+namespace arrow {
+namespace io {
+
+// ----------------------------------------------------------------------
+// BufferedOutputStream implementation
+
+class BufferedBase {
+ public:
+ explicit BufferedBase(MemoryPool* pool)
+ : pool_(pool),
+ is_open_(true),
+ buffer_data_(nullptr),
+ buffer_pos_(0),
+ buffer_size_(0),
+ raw_pos_(-1) {}
+
+ bool closed() const {
+ std::lock_guard<std::mutex> guard(lock_);
+ return !is_open_;
+ }
+
+ Status ResetBuffer() {
+ if (!buffer_) {
+ // On first invocation, or if the buffer has been released, we allocate a
+ // new buffer
+ ARROW_ASSIGN_OR_RAISE(buffer_, AllocateResizableBuffer(buffer_size_, pool_));
+ } else if (buffer_->size() != buffer_size_) {
+ RETURN_NOT_OK(buffer_->Resize(buffer_size_));
+ }
+ buffer_data_ = buffer_->mutable_data();
+ return Status::OK();
+ }
+
+ Status ResizeBuffer(int64_t new_buffer_size) {
+ buffer_size_ = new_buffer_size;
+ return ResetBuffer();
+ }
+
+ void AppendToBuffer(const void* data, int64_t nbytes) {
+ DCHECK_LE(buffer_pos_ + nbytes, buffer_size_);
+ std::memcpy(buffer_data_ + buffer_pos_, data, nbytes);
+ buffer_pos_ += nbytes;
+ }
+
+ int64_t buffer_size() const { return buffer_size_; }
+
+ int64_t buffer_pos() const { return buffer_pos_; }
+
+ protected:
+ MemoryPool* pool_;
+ bool is_open_;
+
+ std::shared_ptr<ResizableBuffer> buffer_;
+ uint8_t* buffer_data_;
+ int64_t buffer_pos_;
+ int64_t buffer_size_;
+
+ mutable int64_t raw_pos_;
+ mutable std::mutex lock_;
+};
+
+class BufferedOutputStream::Impl : public BufferedBase {
+ public:
+ explicit Impl(std::shared_ptr<OutputStream> raw, MemoryPool* pool)
+ : BufferedBase(pool), raw_(std::move(raw)) {}
+
+ Status Close() {
+ std::lock_guard<std::mutex> guard(lock_);
+ if (is_open_) {
+ Status st = FlushUnlocked();
+ is_open_ = false;
+ RETURN_NOT_OK(raw_->Close());
+ return st;
+ }
+ return Status::OK();
+ }
+
+ Status Abort() {
+ std::lock_guard<std::mutex> guard(lock_);
+ if (is_open_) {
+ is_open_ = false;
+ return raw_->Abort();
+ }
+ return Status::OK();
+ }
+
+ Result<int64_t> Tell() const {
+ std::lock_guard<std::mutex> guard(lock_);
+ if (raw_pos_ == -1) {
+ ARROW_ASSIGN_OR_RAISE(raw_pos_, raw_->Tell());
+ DCHECK_GE(raw_pos_, 0);
+ }
+ return raw_pos_ + buffer_pos_;
+ }
+
+ Status Write(const void* data, int64_t nbytes) { return DoWrite(data, nbytes); }
+
+ Status Write(const std::shared_ptr<Buffer>& buffer) {
+ return DoWrite(buffer->data(), buffer->size(), buffer);
+ }
+
+ Status DoWrite(const void* data, int64_t nbytes,
+ const std::shared_ptr<Buffer>& buffer = nullptr) {
+ std::lock_guard<std::mutex> guard(lock_);
+ if (nbytes < 0) {
+ return Status::Invalid("write count should be >= 0");
+ }
+ if (nbytes == 0) {
+ return Status::OK();
+ }
+ if (nbytes + buffer_pos_ >= buffer_size_) {
+ RETURN_NOT_OK(FlushUnlocked());
+ DCHECK_EQ(buffer_pos_, 0);
+ if (nbytes >= buffer_size_) {
+ // Direct write
+ if (buffer) {
+ return raw_->Write(buffer);
+ } else {
+ return raw_->Write(data, nbytes);
+ }
+ }
+ }
+ AppendToBuffer(data, nbytes);
+ return Status::OK();
+ }
+
+ Status FlushUnlocked() {
+ if (buffer_pos_ > 0) {
+ // Invalidate cached raw pos
+ raw_pos_ = -1;
+ RETURN_NOT_OK(raw_->Write(buffer_data_, buffer_pos_));
+ buffer_pos_ = 0;
+ }
+ return Status::OK();
+ }
+
+ Status Flush() {
+ std::lock_guard<std::mutex> guard(lock_);
+ return FlushUnlocked();
+ }
+
+ Result<std::shared_ptr<OutputStream>> Detach() {
+ std::lock_guard<std::mutex> guard(lock_);
+ RETURN_NOT_OK(FlushUnlocked());
+ is_open_ = false;
+ return std::move(raw_);
+ }
+
+ Status SetBufferSize(int64_t new_buffer_size) {
+ std::lock_guard<std::mutex> guard(lock_);
+ if (new_buffer_size <= 0) {
+ return Status::Invalid("Buffer size should be positive");
+ }
+ if (buffer_pos_ >= new_buffer_size) {
+ // If the buffer is shrinking, first flush to the raw OutputStream
+ RETURN_NOT_OK(FlushUnlocked());
+ }
+ return ResizeBuffer(new_buffer_size);
+ }
+
+ std::shared_ptr<OutputStream> raw() const { return raw_; }
+
+ private:
+ std::shared_ptr<OutputStream> raw_;
+};
+
+BufferedOutputStream::BufferedOutputStream(std::shared_ptr<OutputStream> raw,
+ MemoryPool* pool) {
+ impl_.reset(new Impl(std::move(raw), pool));
+}
+
+Result<std::shared_ptr<BufferedOutputStream>> BufferedOutputStream::Create(
+ int64_t buffer_size, MemoryPool* pool, std::shared_ptr<OutputStream> raw) {
+ auto result = std::shared_ptr<BufferedOutputStream>(
+ new BufferedOutputStream(std::move(raw), pool));
+ RETURN_NOT_OK(result->SetBufferSize(buffer_size));
+ return result;
+}
+
+BufferedOutputStream::~BufferedOutputStream() { internal::CloseFromDestructor(this); }
+
+Status BufferedOutputStream::SetBufferSize(int64_t new_buffer_size) {
+ return impl_->SetBufferSize(new_buffer_size);
+}
+
+int64_t BufferedOutputStream::buffer_size() const { return impl_->buffer_size(); }
+
+int64_t BufferedOutputStream::bytes_buffered() const { return impl_->buffer_pos(); }
+
+Result<std::shared_ptr<OutputStream>> BufferedOutputStream::Detach() {
+ return impl_->Detach();
+}
+
+Status BufferedOutputStream::Close() { return impl_->Close(); }
+
+Status BufferedOutputStream::Abort() { return impl_->Abort(); }
+
+bool BufferedOutputStream::closed() const { return impl_->closed(); }
+
+Result<int64_t> BufferedOutputStream::Tell() const { return impl_->Tell(); }
+
+Status BufferedOutputStream::Write(const void* data, int64_t nbytes) {
+ return impl_->Write(data, nbytes);
+}
+
+Status BufferedOutputStream::Write(const std::shared_ptr<Buffer>& data) {
+ return impl_->Write(data);
+}
+
+Status BufferedOutputStream::Flush() { return impl_->Flush(); }
+
+std::shared_ptr<OutputStream> BufferedOutputStream::raw() const { return impl_->raw(); }
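+
+// Usage sketch (assumes `raw` is an open std::shared_ptr<OutputStream>; the
+// 8192-byte buffer size is an arbitrary choice for illustration):
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto buffered,
+//       BufferedOutputStream::Create(8192, default_memory_pool(), raw));
+//   RETURN_NOT_OK(buffered->Write("hello", 5));
+//   RETURN_NOT_OK(buffered->Flush());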
+
+// ----------------------------------------------------------------------
+// BufferedInputStream implementation
+
+class BufferedInputStream::Impl : public BufferedBase {
+ public:
+ Impl(std::shared_ptr<InputStream> raw, MemoryPool* pool, int64_t raw_total_bytes_bound)
+ : BufferedBase(pool),
+ raw_(std::move(raw)),
+ raw_read_total_(0),
+ raw_read_bound_(raw_total_bytes_bound),
+ bytes_buffered_(0) {}
+
+ Status Close() {
+ if (is_open_) {
+ is_open_ = false;
+ return raw_->Close();
+ }
+ return Status::OK();
+ }
+
+ Status Abort() {
+ if (is_open_) {
+ is_open_ = false;
+ return raw_->Abort();
+ }
+ return Status::OK();
+ }
+
+ Result<int64_t> Tell() const {
+ if (raw_pos_ == -1) {
+ ARROW_ASSIGN_OR_RAISE(raw_pos_, raw_->Tell());
+ DCHECK_GE(raw_pos_, 0);
+ }
+    // Shift by bytes_buffered_ to return the semantic stream position
+ return raw_pos_ - bytes_buffered_;
+ }
+
+ Status SetBufferSize(int64_t new_buffer_size) {
+ if (new_buffer_size <= 0) {
+ return Status::Invalid("Buffer size should be positive");
+ }
+ if ((buffer_pos_ + bytes_buffered_) >= new_buffer_size) {
+ return Status::Invalid("Cannot shrink read buffer if buffered data remains");
+ }
+ return ResizeBuffer(new_buffer_size);
+ }
+
+ Result<util::string_view> Peek(int64_t nbytes) {
+ if (raw_read_bound_ >= 0) {
+ // Do not try to peek more than the total remaining number of bytes.
+ nbytes = std::min(nbytes, bytes_buffered_ + (raw_read_bound_ - raw_read_total_));
+ }
+
+ if (bytes_buffered_ == 0 && nbytes < buffer_size_) {
+ // Pre-buffer for small reads
+ RETURN_NOT_OK(BufferIfNeeded());
+ }
+
+ // Increase the buffer size if needed.
+ if (nbytes > buffer_->size() - buffer_pos_) {
+ RETURN_NOT_OK(SetBufferSize(nbytes + buffer_pos_));
+ DCHECK(buffer_->size() - buffer_pos_ >= nbytes);
+ }
+    // Read more data when the buffer has insufficient bytes left
+ if (nbytes > bytes_buffered_) {
+ int64_t additional_bytes_to_read = nbytes - bytes_buffered_;
+ if (raw_read_bound_ >= 0) {
+ additional_bytes_to_read =
+ std::min(additional_bytes_to_read, raw_read_bound_ - raw_read_total_);
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ int64_t bytes_read,
+ raw_->Read(additional_bytes_to_read,
+ buffer_->mutable_data() + buffer_pos_ + bytes_buffered_));
+ bytes_buffered_ += bytes_read;
+ raw_read_total_ += bytes_read;
+ nbytes = bytes_buffered_;
+ }
+ DCHECK(nbytes <= bytes_buffered_); // Enough bytes available
+ return util::string_view(reinterpret_cast<const char*>(buffer_data_ + buffer_pos_),
+ static_cast<size_t>(nbytes));
+ }
+
+ int64_t bytes_buffered() const { return bytes_buffered_; }
+
+ int64_t buffer_size() const { return buffer_size_; }
+
+ std::shared_ptr<InputStream> Detach() {
+ is_open_ = false;
+ return std::move(raw_);
+ }
+
+ void RewindBuffer() {
+ // Invalidate buffered data, as with a Seek or large Read
+ buffer_pos_ = bytes_buffered_ = 0;
+ }
+
+ Status BufferIfNeeded() {
+ if (bytes_buffered_ == 0) {
+ // Fill buffer
+ if (!buffer_) {
+ RETURN_NOT_OK(ResetBuffer());
+ }
+
+ int64_t bytes_to_buffer = buffer_size_;
+ if (raw_read_bound_ >= 0) {
+ bytes_to_buffer = std::min(buffer_size_, raw_read_bound_ - raw_read_total_);
+ }
+ ARROW_ASSIGN_OR_RAISE(bytes_buffered_, raw_->Read(bytes_to_buffer, buffer_data_));
+ buffer_pos_ = 0;
+ raw_read_total_ += bytes_buffered_;
+
+ // Do not make assumptions about the raw stream position
+ raw_pos_ = -1;
+ }
+ return Status::OK();
+ }
+
+ void ConsumeBuffer(int64_t nbytes) {
+ buffer_pos_ += nbytes;
+ bytes_buffered_ -= nbytes;
+ }
+
+ Result<int64_t> Read(int64_t nbytes, void* out) {
+ if (ARROW_PREDICT_FALSE(nbytes < 0)) {
+ return Status::Invalid("Bytes to read must be positive. Received:", nbytes);
+ }
+
+ if (nbytes < buffer_size_) {
+ // Pre-buffer for small reads
+ RETURN_NOT_OK(BufferIfNeeded());
+ }
+
+ if (nbytes > bytes_buffered_) {
+ // Copy buffered bytes into out, then read rest
+ memcpy(out, buffer_data_ + buffer_pos_, bytes_buffered_);
+
+ int64_t bytes_to_read = nbytes - bytes_buffered_;
+ if (raw_read_bound_ >= 0) {
+ bytes_to_read = std::min(bytes_to_read, raw_read_bound_ - raw_read_total_);
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ int64_t bytes_read,
+ raw_->Read(bytes_to_read, reinterpret_cast<uint8_t*>(out) + bytes_buffered_));
+ raw_read_total_ += bytes_read;
+
+ // Do not make assumptions about the raw stream position
+ raw_pos_ = -1;
+ bytes_read += bytes_buffered_;
+ RewindBuffer();
+ return bytes_read;
+ } else {
+ memcpy(out, buffer_data_ + buffer_pos_, nbytes);
+ ConsumeBuffer(nbytes);
+ return nbytes;
+ }
+ }
+
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_));
+
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data()));
+
+ if (bytes_read < nbytes) {
+ // Change size but do not reallocate internal capacity
+ RETURN_NOT_OK(buffer->Resize(bytes_read, false /* shrink_to_fit */));
+ buffer->ZeroPadding();
+ }
+ return std::move(buffer);
+ }
+
+ // For providing access to the raw file handles
+ std::shared_ptr<InputStream> raw() const { return raw_; }
+
+ private:
+ std::shared_ptr<InputStream> raw_;
+ int64_t raw_read_total_;
+ int64_t raw_read_bound_;
+
+ // Number of remaining bytes in the buffer, to be reduced on each read from
+ // the buffer
+ int64_t bytes_buffered_;
+};
+
+BufferedInputStream::BufferedInputStream(std::shared_ptr<InputStream> raw,
+ MemoryPool* pool,
+ int64_t raw_total_bytes_bound) {
+ impl_.reset(new Impl(std::move(raw), pool, raw_total_bytes_bound));
+}
+
+BufferedInputStream::~BufferedInputStream() { internal::CloseFromDestructor(this); }
+
+Result<std::shared_ptr<BufferedInputStream>> BufferedInputStream::Create(
+ int64_t buffer_size, MemoryPool* pool, std::shared_ptr<InputStream> raw,
+ int64_t raw_total_bytes_bound) {
+ auto result = std::shared_ptr<BufferedInputStream>(
+ new BufferedInputStream(std::move(raw), pool, raw_total_bytes_bound));
+ RETURN_NOT_OK(result->SetBufferSize(buffer_size));
+ return result;
+}
+
+Status BufferedInputStream::DoClose() { return impl_->Close(); }
+
+Status BufferedInputStream::DoAbort() { return impl_->Abort(); }
+
+bool BufferedInputStream::closed() const { return impl_->closed(); }
+
+std::shared_ptr<InputStream> BufferedInputStream::Detach() { return impl_->Detach(); }
+
+std::shared_ptr<InputStream> BufferedInputStream::raw() const { return impl_->raw(); }
+
+Result<int64_t> BufferedInputStream::DoTell() const { return impl_->Tell(); }
+
+Result<util::string_view> BufferedInputStream::DoPeek(int64_t nbytes) {
+ return impl_->Peek(nbytes);
+}
+
+Status BufferedInputStream::SetBufferSize(int64_t new_buffer_size) {
+ return impl_->SetBufferSize(new_buffer_size);
+}
+
+int64_t BufferedInputStream::bytes_buffered() const { return impl_->bytes_buffered(); }
+
+int64_t BufferedInputStream::buffer_size() const { return impl_->buffer_size(); }
+
+Result<int64_t> BufferedInputStream::DoRead(int64_t nbytes, void* out) {
+ return impl_->Read(nbytes, out);
+}
+
+Result<std::shared_ptr<Buffer>> BufferedInputStream::DoRead(int64_t nbytes) {
+ return impl_->Read(nbytes);
+}
+
+Result<std::shared_ptr<const KeyValueMetadata>> BufferedInputStream::ReadMetadata() {
+ return impl_->raw()->ReadMetadata();
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> BufferedInputStream::ReadMetadataAsync(
+ const IOContext& io_context) {
+ return impl_->raw()->ReadMetadataAsync(io_context);
+}
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h
new file mode 100644
index 00000000000..8116613fa4e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h
@@ -0,0 +1,167 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Buffered stream implementations
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/io/concurrency.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+class Status;
+
+namespace io {
+
+class ARROW_EXPORT BufferedOutputStream : public OutputStream {
+ public:
+ ~BufferedOutputStream() override;
+
+ /// \brief Create a buffered output stream wrapping the given output stream.
+ /// \param[in] buffer_size the size of the temporary write buffer
+ /// \param[in] pool a MemoryPool to use for allocations
+ /// \param[in] raw another OutputStream
+ /// \return the created BufferedOutputStream
+ static Result<std::shared_ptr<BufferedOutputStream>> Create(
+ int64_t buffer_size, MemoryPool* pool, std::shared_ptr<OutputStream> raw);
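+  ///
+  /// A minimal usage sketch (`sink` is a hypothetical existing OutputStream):
+  ///
+  ///   ARROW_ASSIGN_OR_RAISE(auto buffered,
+  ///       BufferedOutputStream::Create(4096, default_memory_pool(), sink));
+  ///   RETURN_NOT_OK(buffered->Write("hello", 5));  // held in the buffer
+  ///   RETURN_NOT_OK(buffered->Flush());            // forwarded to `sink`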
+
+ /// \brief Resize internal buffer
+ /// \param[in] new_buffer_size the new buffer size
+ /// \return Status
+ Status SetBufferSize(int64_t new_buffer_size);
+
+ /// \brief Return the current size of the internal buffer
+ int64_t buffer_size() const;
+
+ /// \brief Return the number of remaining bytes that have not been flushed to
+ /// the raw OutputStream
+ int64_t bytes_buffered() const;
+
+ /// \brief Flush any buffered writes and release the raw
+ /// OutputStream. Further operations on this object are invalid
+ /// \return the underlying OutputStream
+ Result<std::shared_ptr<OutputStream>> Detach();
+
+ // OutputStream interface
+
+ /// \brief Close the buffered output stream. This implicitly closes the
+ /// underlying raw output stream.
+ Status Close() override;
+ Status Abort() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+ // Write bytes to the stream. Thread-safe
+ Status Write(const void* data, int64_t nbytes) override;
+ Status Write(const std::shared_ptr<Buffer>& data) override;
+
+ Status Flush() override;
+
+ /// \brief Return the underlying raw output stream.
+ std::shared_ptr<OutputStream> raw() const;
+
+ private:
+ explicit BufferedOutputStream(std::shared_ptr<OutputStream> raw, MemoryPool* pool);
+
+ class ARROW_NO_EXPORT Impl;
+ std::unique_ptr<Impl> impl_;
+};
+
+/// \class BufferedInputStream
+/// \brief An InputStream that performs buffered reads from an unbuffered
+/// InputStream, which can mitigate the overhead of many small reads in some
+/// cases
+class ARROW_EXPORT BufferedInputStream
+ : public internal::InputStreamConcurrencyWrapper<BufferedInputStream> {
+ public:
+ ~BufferedInputStream() override;
+
+ /// \brief Create a BufferedInputStream from a raw InputStream
+ /// \param[in] buffer_size the size of the temporary read buffer
+ /// \param[in] pool a MemoryPool to use for allocations
+ /// \param[in] raw a raw InputStream
+ /// \param[in] raw_read_bound a bound on the maximum number of bytes
+ /// to read from the raw input stream. The default -1 indicates that
+ /// it is unbounded
+ /// \return the created BufferedInputStream
+ static Result<std::shared_ptr<BufferedInputStream>> Create(
+ int64_t buffer_size, MemoryPool* pool, std::shared_ptr<InputStream> raw,
+ int64_t raw_read_bound = -1);
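+  ///
+  /// For example (a sketch; `raw` is a hypothetical existing InputStream):
+  ///
+  ///   ARROW_ASSIGN_OR_RAISE(auto buffered,
+  ///       BufferedInputStream::Create(8192, default_memory_pool(), raw));
+  ///   // Small reads are now served from an 8 KiB in-memory buffer
+  ///   ARROW_ASSIGN_OR_RAISE(auto buf, buffered->Read(16));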
+
+  /// \brief Resize internal read buffer; reads smaller than the buffer size
+  /// will continue to be served from the buffer
+ /// \param[in] new_buffer_size the new read buffer size
+ /// \return Status
+ Status SetBufferSize(int64_t new_buffer_size);
+
+ /// \brief Return the number of remaining bytes in the read buffer
+ int64_t bytes_buffered() const;
+
+ /// \brief Return the current size of the internal buffer
+ int64_t buffer_size() const;
+
+ /// \brief Release the raw InputStream. Any data buffered will be
+ /// discarded. Further operations on this object are invalid
+  /// \return the underlying InputStream
+ std::shared_ptr<InputStream> Detach();
+
+ /// \brief Return the unbuffered InputStream
+ std::shared_ptr<InputStream> raw() const;
+
+ // InputStream APIs
+
+ bool closed() const override;
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context) override;
+
+ private:
+ friend InputStreamConcurrencyWrapper<BufferedInputStream>;
+
+ explicit BufferedInputStream(std::shared_ptr<InputStream> raw, MemoryPool* pool,
+ int64_t raw_total_bytes_bound);
+
+ Status DoClose();
+ Status DoAbort() override;
+
+ /// \brief Returns the position of the buffered stream, though the position
+ /// of the unbuffered stream may be further advanced.
+ Result<int64_t> DoTell() const;
+
+ Result<int64_t> DoRead(int64_t nbytes, void* out);
+
+ /// \brief Read into buffer.
+ Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+
+ /// \brief Return a zero-copy string view referencing buffered data,
+ /// but do not advance the position of the stream. Buffers data and
+ /// expands the buffer size if necessary
+ Result<util::string_view> DoPeek(int64_t nbytes) override;
+
+ class ARROW_NO_EXPORT Impl;
+ std::unique_ptr<Impl> impl_;
+};
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc
new file mode 100644
index 00000000000..722026ccd9b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc
@@ -0,0 +1,318 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <atomic>
+#include <cmath>
+#include <mutex>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/io/caching.h"
+#include "arrow/io/util_internal.h"
+#include "arrow/result.h"
+#include "arrow/util/future.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace io {
+
+CacheOptions CacheOptions::Defaults() {
+ return CacheOptions{internal::ReadRangeCache::kDefaultHoleSizeLimit,
+ internal::ReadRangeCache::kDefaultRangeSizeLimit,
+ /*lazy=*/false};
+}
+
+CacheOptions CacheOptions::LazyDefaults() {
+ return CacheOptions{internal::ReadRangeCache::kDefaultHoleSizeLimit,
+ internal::ReadRangeCache::kDefaultRangeSizeLimit,
+ /*lazy=*/true};
+}
+
+CacheOptions CacheOptions::MakeFromNetworkMetrics(int64_t time_to_first_byte_millis,
+ int64_t transfer_bandwidth_mib_per_sec,
+ double ideal_bandwidth_utilization_frac,
+ int64_t max_ideal_request_size_mib) {
+ //
+ // The I/O coalescing algorithm uses two parameters:
+ // 1. hole_size_limit (a.k.a max_io_gap): Max I/O gap/hole size in bytes
+ // 2. range_size_limit (a.k.a ideal_request_size): Ideal I/O Request size in bytes
+ //
+ // These parameters can be derived from network metrics (e.g. S3) as described below:
+ //
+ // In an S3 compatible storage, there are two main metrics:
+ // 1. Seek-time or Time-To-First-Byte (TTFB) in seconds: call setup latency of a new
+ // S3 request
+ // 2. Transfer Bandwidth (BW) for data in bytes/sec
+ //
+ // 1. Computing hole_size_limit:
+ //
+ // hole_size_limit = TTFB * BW
+ //
+ // This is also called Bandwidth-Delay-Product (BDP).
+ // Two byte ranges that have a gap can still be mapped to the same read
+ // if the gap is less than the bandwidth-delay product [TTFB * TransferBandwidth],
+ // i.e. if the Time-To-First-Byte (or call setup latency of a new S3 request) is
+ // expected to be greater than just reading and discarding the extra bytes on an
+ // existing HTTP request.
+ //
+ // 2. Computing range_size_limit:
+ //
+  // We want to have high bandwidth utilization per S3 connection,
+ // i.e. transfer large amounts of data to amortize the seek overhead.
+ // But, we also want to leverage parallelism by slicing very large IO chunks.
+ // We define two more config parameters with suggested default values to control
+ // the slice size and seek to balance the two effects with the goal of maximizing
+ // net data load performance.
+ //
+ // BW_util_frac (ideal bandwidth utilization): Transfer bandwidth utilization fraction
+ // (per connection) to maximize the net data load. 90% is a good default number for
+ // an effective transfer bandwidth.
+ //
+ // MAX_IDEAL_REQUEST_SIZE: The maximum single data request size (in MiB) to maximize
+ // the net data load. 64 MiB is a good default number for the ideal request size.
+ //
+ // The amount of data that needs to be transferred in a single S3 get_object
+ // request to achieve effective bandwidth eff_BW = BW_util_frac * BW is as follows:
+ // eff_BW = range_size_limit / (TTFB + range_size_limit / BW)
+ //
+ // Substituting TTFB = hole_size_limit / BW and eff_BW = BW_util_frac * BW, we get the
+ // following result:
+ // range_size_limit = hole_size_limit * BW_util_frac / (1 - BW_util_frac)
+ //
+ // Applying the MAX_IDEAL_REQUEST_SIZE, we get the following:
+ // range_size_limit = min(MAX_IDEAL_REQUEST_SIZE,
+ // hole_size_limit * BW_util_frac / (1 - BW_util_frac))
+ //
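+  // Worked example (illustrative numbers, not defaults): with TTFB = 100 ms
+  // and BW = 100 MiB/s,
+  //   hole_size_limit  = 0.1 s * 100 MiB/s = 10 MiB, and
+  //   range_size_limit = min(64 MiB, 10 MiB * 0.9 / (1 - 0.9)) = 64 MiB.
+  //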
+ DCHECK_GT(time_to_first_byte_millis, 0) << "TTFB must be > 0";
+ DCHECK_GT(transfer_bandwidth_mib_per_sec, 0) << "Transfer bandwidth must be > 0";
+ DCHECK_GT(ideal_bandwidth_utilization_frac, 0)
+ << "Ideal bandwidth utilization fraction must be > 0";
+ DCHECK_LT(ideal_bandwidth_utilization_frac, 1.0)
+ << "Ideal bandwidth utilization fraction must be < 1";
+ DCHECK_GT(max_ideal_request_size_mib, 0) << "Max Ideal request size must be > 0";
+
+ const double time_to_first_byte_sec = time_to_first_byte_millis / 1000.0;
+ const int64_t transfer_bandwidth_bytes_per_sec =
+ transfer_bandwidth_mib_per_sec * 1024 * 1024;
+ const int64_t max_ideal_request_size_bytes = max_ideal_request_size_mib * 1024 * 1024;
+
+ // hole_size_limit = TTFB * BW
+ const auto hole_size_limit = static_cast<int64_t>(
+ std::round(time_to_first_byte_sec * transfer_bandwidth_bytes_per_sec));
+ DCHECK_GT(hole_size_limit, 0) << "Computed hole_size_limit must be > 0";
+
+ // range_size_limit = min(MAX_IDEAL_REQUEST_SIZE,
+ // hole_size_limit * BW_util_frac / (1 - BW_util_frac))
+ const int64_t range_size_limit = std::min(
+ max_ideal_request_size_bytes,
+ static_cast<int64_t>(std::round(hole_size_limit * ideal_bandwidth_utilization_frac /
+ (1 - ideal_bandwidth_utilization_frac))));
+ DCHECK_GT(range_size_limit, 0) << "Computed range_size_limit must be > 0";
+
+ return {hole_size_limit, range_size_limit, false};
+}
+
+namespace internal {
+
+struct RangeCacheEntry {
+ ReadRange range;
+ Future<std::shared_ptr<Buffer>> future;
+
+ RangeCacheEntry() = default;
+ RangeCacheEntry(const ReadRange& range_, Future<std::shared_ptr<Buffer>> future_)
+ : range(range_), future(std::move(future_)) {}
+
+ friend bool operator<(const RangeCacheEntry& left, const RangeCacheEntry& right) {
+ return left.range.offset < right.range.offset;
+ }
+};
+
+struct ReadRangeCache::Impl {
+ std::shared_ptr<RandomAccessFile> file;
+ IOContext ctx;
+ CacheOptions options;
+
+ // Ordered by offset (so as to find a matching region by binary search)
+ std::vector<RangeCacheEntry> entries;
+
+ virtual ~Impl() = default;
+
+ // Get the future corresponding to a range
+ virtual Future<std::shared_ptr<Buffer>> MaybeRead(RangeCacheEntry* entry) {
+ return entry->future;
+ }
+
+ // Make cache entries for ranges
+ virtual std::vector<RangeCacheEntry> MakeCacheEntries(
+ const std::vector<ReadRange>& ranges) {
+ std::vector<RangeCacheEntry> new_entries;
+ new_entries.reserve(ranges.size());
+ for (const auto& range : ranges) {
+ new_entries.emplace_back(range, file->ReadAsync(ctx, range.offset, range.length));
+ }
+ return new_entries;
+ }
+
+ // Add the given ranges to the cache, coalescing them where possible
+ virtual Status Cache(std::vector<ReadRange> ranges) {
+ ranges = internal::CoalesceReadRanges(std::move(ranges), options.hole_size_limit,
+ options.range_size_limit);
+ std::vector<RangeCacheEntry> new_entries = MakeCacheEntries(ranges);
+ // Add new entries, themselves ordered by offset
+ if (entries.size() > 0) {
+ std::vector<RangeCacheEntry> merged(entries.size() + new_entries.size());
+ std::merge(entries.begin(), entries.end(), new_entries.begin(), new_entries.end(),
+ merged.begin());
+ entries = std::move(merged);
+ } else {
+ entries = std::move(new_entries);
+ }
+ // Prefetch immediately, regardless of executor availability, if possible
+ return file->WillNeed(ranges);
+ }
+
+ // Read the given range from the cache, blocking if needed. Cannot read a range
+ // that spans cache entries.
+ virtual Result<std::shared_ptr<Buffer>> Read(ReadRange range) {
+ if (range.length == 0) {
+ static const uint8_t byte = 0;
+ return std::make_shared<Buffer>(&byte, 0);
+ }
+
+ const auto it = std::lower_bound(
+ entries.begin(), entries.end(), range,
+ [](const RangeCacheEntry& entry, const ReadRange& range) {
+ return entry.range.offset + entry.range.length < range.offset + range.length;
+ });
+ if (it != entries.end() && it->range.Contains(range)) {
+ auto fut = MaybeRead(&*it);
+ ARROW_ASSIGN_OR_RAISE(auto buf, fut.result());
+ return SliceBuffer(std::move(buf), range.offset - it->range.offset, range.length);
+ }
+ return Status::Invalid("ReadRangeCache did not find matching cache entry");
+ }
+
+ virtual Future<> Wait() {
+ std::vector<Future<>> futures;
+ for (auto& entry : entries) {
+ futures.emplace_back(MaybeRead(&entry));
+ }
+ return AllComplete(futures);
+ }
+
+ // Return a Future that completes when the given ranges have been read.
+ virtual Future<> WaitFor(std::vector<ReadRange> ranges) {
+ auto end = std::remove_if(ranges.begin(), ranges.end(),
+ [](const ReadRange& range) { return range.length == 0; });
+ ranges.resize(end - ranges.begin());
+ std::vector<Future<>> futures;
+ futures.reserve(ranges.size());
+ for (auto& range : ranges) {
+ const auto it = std::lower_bound(
+ entries.begin(), entries.end(), range,
+ [](const RangeCacheEntry& entry, const ReadRange& range) {
+ return entry.range.offset + entry.range.length < range.offset + range.length;
+ });
+ if (it != entries.end() && it->range.Contains(range)) {
+ futures.push_back(Future<>(MaybeRead(&*it)));
+ } else {
+ return Status::Invalid("Range was not requested for caching: offset=",
+ range.offset, " length=", range.length);
+ }
+ }
+ return AllComplete(futures);
+ }
+};
+
+// Don't read ranges when they're first added. Instead, wait until they're requested
+// (either through Read or WaitFor).
+struct ReadRangeCache::LazyImpl : public ReadRangeCache::Impl {
+ // Protect against concurrent modification of entries[i]->future
+ std::mutex entry_mutex;
+
+ virtual ~LazyImpl() = default;
+
+ Future<std::shared_ptr<Buffer>> MaybeRead(RangeCacheEntry* entry) override {
+ // Called by superclass Read()/WaitFor() so we have the lock
+ if (!entry->future.is_valid()) {
+ entry->future = file->ReadAsync(ctx, entry->range.offset, entry->range.length);
+ }
+ return entry->future;
+ }
+
+ std::vector<RangeCacheEntry> MakeCacheEntries(
+ const std::vector<ReadRange>& ranges) override {
+ std::vector<RangeCacheEntry> new_entries;
+ new_entries.reserve(ranges.size());
+ for (const auto& range : ranges) {
+ // In the lazy variant, don't read data here - later, a call to Read or WaitFor
+ // will call back to MaybeRead (under the lock) which will fill the future.
+ new_entries.emplace_back(range, Future<std::shared_ptr<Buffer>>());
+ }
+ return new_entries;
+ }
+
+ Status Cache(std::vector<ReadRange> ranges) override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::Cache(std::move(ranges));
+ }
+
+ Result<std::shared_ptr<Buffer>> Read(ReadRange range) override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::Read(range);
+ }
+
+ Future<> Wait() override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::Wait();
+ }
+
+ Future<> WaitFor(std::vector<ReadRange> ranges) override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::WaitFor(std::move(ranges));
+ }
+};
+
+ReadRangeCache::ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
+ CacheOptions options)
+ : impl_(options.lazy ? new LazyImpl() : new Impl()) {
+ impl_->file = std::move(file);
+ impl_->ctx = std::move(ctx);
+ impl_->options = options;
+}
+
+ReadRangeCache::~ReadRangeCache() = default;
+
+Status ReadRangeCache::Cache(std::vector<ReadRange> ranges) {
+ return impl_->Cache(std::move(ranges));
+}
+
+Result<std::shared_ptr<Buffer>> ReadRangeCache::Read(ReadRange range) {
+ return impl_->Read(range);
+}
+
+Future<> ReadRangeCache::Wait() { return impl_->Wait(); }
+
+Future<> ReadRangeCache::WaitFor(std::vector<ReadRange> ranges) {
+ return impl_->WaitFor(std::move(ranges));
+}
+
+} // namespace internal
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h
new file mode 100644
index 00000000000..59a9b60e82f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+struct ARROW_EXPORT CacheOptions {
+ static constexpr double kDefaultIdealBandwidthUtilizationFrac = 0.9;
+ static constexpr int64_t kDefaultMaxIdealRequestSizeMib = 64;
+
+ /// \brief The maximum distance in bytes between two consecutive
+ /// ranges; beyond this value, ranges are not combined
+ int64_t hole_size_limit;
+ /// \brief The maximum size in bytes of a combined range; if
+ /// combining two consecutive ranges would produce a range of a
+ /// size greater than this, they are not combined
+ int64_t range_size_limit;
+ /// \brief A lazy cache does not perform any I/O until requested.
+ bool lazy;
+
+ bool operator==(const CacheOptions& other) const {
+ return hole_size_limit == other.hole_size_limit &&
+ range_size_limit == other.range_size_limit && lazy == other.lazy;
+ }
+
+ /// \brief Construct CacheOptions from network storage metrics (e.g. S3).
+ ///
+ /// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in
+ /// milliseconds, also called call setup latency of a new S3 request.
+ /// The value is a positive integer.
+ /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec.
+ /// The value is a positive integer.
+ /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction
+ /// (per connection) to maximize the net data load.
+ /// The value is a positive double precision number less than 1.
+ /// \param[in] max_ideal_request_size_mib The maximum single data request size (in MiB)
+ /// to maximize the net data load.
+ /// The value is a positive integer.
+ /// \return A new instance of CacheOptions.
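+  ///
+  /// For instance (an illustrative sketch): with time_to_first_byte_millis =
+  /// 100 and transfer_bandwidth_mib_per_sec = 100, this yields
+  /// hole_size_limit = 10 MiB and range_size_limit = 64 MiB (capped by
+  /// max_ideal_request_size_mib):
+  ///
+  ///   auto options = CacheOptions::MakeFromNetworkMetrics(100, 100);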
+ static CacheOptions MakeFromNetworkMetrics(
+ int64_t time_to_first_byte_millis, int64_t transfer_bandwidth_mib_per_sec,
+ double ideal_bandwidth_utilization_frac = kDefaultIdealBandwidthUtilizationFrac,
+ int64_t max_ideal_request_size_mib = kDefaultMaxIdealRequestSizeMib);
+
+ static CacheOptions Defaults();
+ static CacheOptions LazyDefaults();
+};
+
+namespace internal {
+
+/// \brief A read cache designed to hide IO latencies when reading.
+///
+/// This class takes multiple byte ranges that an application expects to read, and
+/// coalesces them into fewer, larger read requests, which benefits performance on some
+/// filesystems, particularly remote ones like Amazon S3. By default, it also issues
+/// these read requests in parallel up front.
+///
+/// To use:
+/// 1. Cache() the ranges you expect to read in the future. Ideally, these ranges have
+/// the exact offset and length that will later be read. The cache will combine those
+/// ranges according to parameters (see constructor).
+///
+/// By default, the cache will also start fetching the combined ranges in parallel in
+/// the background, unless CacheOptions.lazy is set.
+///
+/// 2. Call WaitFor() to be notified when the given ranges have been read. If
+/// CacheOptions.lazy is set, I/O will be triggered in the background here instead.
+/// This can be done in parallel (e.g. if parsing a file, call WaitFor() for each
+/// chunk of the file that can be parsed in parallel).
+///
+/// 3. Call Read() to retrieve the actual data for the given ranges.
+/// A synchronous application may skip WaitFor() and just call Read() - it will still
+/// benefit from coalescing and parallel fetching.
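+///
+/// A usage sketch (assuming `file` is an open RandomAccessFile):
+///
+///   internal::ReadRangeCache cache(file, IOContext(),
+///                                  CacheOptions::Defaults());
+///   RETURN_NOT_OK(cache.Cache({{0, 100}, {4000, 100}}));  // coalesced, prefetched
+///   ARROW_ASSIGN_OR_RAISE(auto buf, cache.Read({4000, 100}));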
+class ARROW_EXPORT ReadRangeCache {
+ public:
+ static constexpr int64_t kDefaultHoleSizeLimit = 8192;
+ static constexpr int64_t kDefaultRangeSizeLimit = 32 * 1024 * 1024;
+
+  /// Construct a read cache with default options
+ explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx)
+      : ReadRangeCache(std::move(file), std::move(ctx), CacheOptions::Defaults()) {}
+
+ /// Construct a read cache with given options
+ explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
+ CacheOptions options);
+ ~ReadRangeCache();
+
+ /// \brief Cache the given ranges in the background.
+ ///
+ /// The caller must ensure that the ranges do not overlap with each other,
+ /// nor with previously cached ranges. Otherwise, behaviour will be undefined.
+ Status Cache(std::vector<ReadRange> ranges);
+
+ /// \brief Read a range previously given to Cache().
+ Result<std::shared_ptr<Buffer>> Read(ReadRange range);
+
+ /// \brief Wait until all ranges added so far have been cached.
+ Future<> Wait();
+
+ /// \brief Wait until all given ranges have been cached.
+ Future<> WaitFor(std::vector<ReadRange> ranges);
+
+ protected:
+ struct Impl;
+ struct LazyImpl;
+
+ std::unique_ptr<Impl> impl_;
+};
+
+} // namespace internal
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc
new file mode 100644
index 00000000000..72977f0f297
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc
@@ -0,0 +1,450 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/io/compressed.h"
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/io/util_internal.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using util::Codec;
+using util::Compressor;
+using util::Decompressor;
+
+namespace io {
+
+// ----------------------------------------------------------------------
+// CompressedOutputStream implementation
+
+class CompressedOutputStream::Impl {
+ public:
+ Impl(MemoryPool* pool, const std::shared_ptr<OutputStream>& raw)
+ : pool_(pool), raw_(raw), is_open_(false), compressed_pos_(0), total_pos_(0) {}
+
+ Status Init(Codec* codec) {
+ ARROW_ASSIGN_OR_RAISE(compressor_, codec->MakeCompressor());
+ ARROW_ASSIGN_OR_RAISE(compressed_, AllocateResizableBuffer(kChunkSize, pool_));
+ compressed_pos_ = 0;
+ is_open_ = true;
+ return Status::OK();
+ }
+
+ Result<int64_t> Tell() const {
+ std::lock_guard<std::mutex> guard(lock_);
+ return total_pos_;
+ }
+
+ std::shared_ptr<OutputStream> raw() const { return raw_; }
+
+ Status FlushCompressed() {
+ if (compressed_pos_ > 0) {
+ RETURN_NOT_OK(raw_->Write(compressed_->data(), compressed_pos_));
+ compressed_pos_ = 0;
+ }
+ return Status::OK();
+ }
+
+ Status Write(const void* data, int64_t nbytes) {
+ std::lock_guard<std::mutex> guard(lock_);
+
+ auto input = reinterpret_cast<const uint8_t*>(data);
+ while (nbytes > 0) {
+ int64_t input_len = nbytes;
+ int64_t output_len = compressed_->size() - compressed_pos_;
+ uint8_t* output = compressed_->mutable_data() + compressed_pos_;
+ ARROW_ASSIGN_OR_RAISE(auto result,
+ compressor_->Compress(input_len, input, output_len, output));
+ compressed_pos_ += result.bytes_written;
+
+ if (result.bytes_read == 0) {
+ // Not enough output, try to flush it and retry
+ if (compressed_pos_ > 0) {
+ RETURN_NOT_OK(FlushCompressed());
+ output_len = compressed_->size() - compressed_pos_;
+ output = compressed_->mutable_data() + compressed_pos_;
+ ARROW_ASSIGN_OR_RAISE(
+ result, compressor_->Compress(input_len, input, output_len, output));
+ compressed_pos_ += result.bytes_written;
+ }
+ }
+ input += result.bytes_read;
+ nbytes -= result.bytes_read;
+ total_pos_ += result.bytes_read;
+ if (compressed_pos_ == compressed_->size()) {
+ // Output buffer full, flush it
+ RETURN_NOT_OK(FlushCompressed());
+ }
+ if (result.bytes_read == 0) {
+ // Need to enlarge output buffer
+ RETURN_NOT_OK(compressed_->Resize(compressed_->size() * 2));
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Flush() {
+ std::lock_guard<std::mutex> guard(lock_);
+
+ while (true) {
+ // Flush compressor
+ int64_t output_len = compressed_->size() - compressed_pos_;
+ uint8_t* output = compressed_->mutable_data() + compressed_pos_;
+ ARROW_ASSIGN_OR_RAISE(auto result, compressor_->Flush(output_len, output));
+ compressed_pos_ += result.bytes_written;
+
+ // Flush compressed output
+ RETURN_NOT_OK(FlushCompressed());
+
+ if (result.should_retry) {
+ // Need to enlarge output buffer
+ RETURN_NOT_OK(compressed_->Resize(compressed_->size() * 2));
+ } else {
+ break;
+ }
+ }
+ return Status::OK();
+ }
+
+ Status FinalizeCompression() {
+ while (true) {
+ // Try to end compressor
+ int64_t output_len = compressed_->size() - compressed_pos_;
+ uint8_t* output = compressed_->mutable_data() + compressed_pos_;
+ ARROW_ASSIGN_OR_RAISE(auto result, compressor_->End(output_len, output));
+ compressed_pos_ += result.bytes_written;
+
+ // Flush compressed output
+ RETURN_NOT_OK(FlushCompressed());
+
+ if (result.should_retry) {
+ // Need to enlarge output buffer
+ RETURN_NOT_OK(compressed_->Resize(compressed_->size() * 2));
+ } else {
+ // Done
+ break;
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Close() {
+ std::lock_guard<std::mutex> guard(lock_);
+
+ if (is_open_) {
+ is_open_ = false;
+ RETURN_NOT_OK(FinalizeCompression());
+ return raw_->Close();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ Status Abort() {
+ std::lock_guard<std::mutex> guard(lock_);
+
+ if (is_open_) {
+ is_open_ = false;
+ return raw_->Abort();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ bool closed() {
+ std::lock_guard<std::mutex> guard(lock_);
+ return !is_open_;
+ }
+
+ private:
+ // Write 64 KB compressed data at a time
+ static const int64_t kChunkSize = 64 * 1024;
+
+ MemoryPool* pool_;
+ std::shared_ptr<OutputStream> raw_;
+ bool is_open_;
+ std::shared_ptr<Compressor> compressor_;
+ std::shared_ptr<ResizableBuffer> compressed_;
+ int64_t compressed_pos_;
+ // Total number of bytes compressed
+ int64_t total_pos_;
+
+ mutable std::mutex lock_;
+};
+
+Result<std::shared_ptr<CompressedOutputStream>> CompressedOutputStream::Make(
+ util::Codec* codec, const std::shared_ptr<OutputStream>& raw, MemoryPool* pool) {
+ // CAUTION: codec is not owned
+ std::shared_ptr<CompressedOutputStream> res(new CompressedOutputStream);
+ res->impl_.reset(new Impl(pool, std::move(raw)));
+ RETURN_NOT_OK(res->impl_->Init(codec));
+ return res;
+}
+
+CompressedOutputStream::~CompressedOutputStream() { internal::CloseFromDestructor(this); }
+
+Status CompressedOutputStream::Close() { return impl_->Close(); }
+
+Status CompressedOutputStream::Abort() { return impl_->Abort(); }
+
+bool CompressedOutputStream::closed() const { return impl_->closed(); }
+
+Result<int64_t> CompressedOutputStream::Tell() const { return impl_->Tell(); }
+
+Status CompressedOutputStream::Write(const void* data, int64_t nbytes) {
+ return impl_->Write(data, nbytes);
+}
+
+Status CompressedOutputStream::Flush() { return impl_->Flush(); }
+
+std::shared_ptr<OutputStream> CompressedOutputStream::raw() const { return impl_->raw(); }
+
+// ----------------------------------------------------------------------
+// CompressedInputStream implementation
+
+class CompressedInputStream::Impl {
+ public:
+ Impl(MemoryPool* pool, const std::shared_ptr<InputStream>& raw)
+ : pool_(pool),
+ raw_(raw),
+ is_open_(true),
+ compressed_pos_(0),
+ decompressed_pos_(0),
+ total_pos_(0) {}
+
+ Status Init(Codec* codec) {
+ ARROW_ASSIGN_OR_RAISE(decompressor_, codec->MakeDecompressor());
+ fresh_decompressor_ = true;
+ return Status::OK();
+ }
+
+ Status Close() {
+ if (is_open_) {
+ is_open_ = false;
+ return raw_->Close();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ Status Abort() {
+ if (is_open_) {
+ is_open_ = false;
+ return raw_->Abort();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ bool closed() { return !is_open_; }
+
+ Result<int64_t> Tell() const { return total_pos_; }
+
+ // Read compressed data if necessary
+ Status EnsureCompressedData() {
+ int64_t compressed_avail = compressed_ ? compressed_->size() - compressed_pos_ : 0;
+ if (compressed_avail == 0) {
+ // No compressed data available, read a full chunk
+ ARROW_ASSIGN_OR_RAISE(compressed_, raw_->Read(kChunkSize));
+ compressed_pos_ = 0;
+ }
+ return Status::OK();
+ }
+
+ // Decompress some data from the compressed_ buffer.
+ // Call this function only if the decompressed_ buffer is empty.
+ Status DecompressData() {
+ int64_t decompress_size = kDecompressSize;
+
+ while (true) {
+ ARROW_ASSIGN_OR_RAISE(decompressed_,
+ AllocateResizableBuffer(decompress_size, pool_));
+ decompressed_pos_ = 0;
+
+ int64_t input_len = compressed_->size() - compressed_pos_;
+ const uint8_t* input = compressed_->data() + compressed_pos_;
+ int64_t output_len = decompressed_->size();
+ uint8_t* output = decompressed_->mutable_data();
+
+ ARROW_ASSIGN_OR_RAISE(
+ auto result, decompressor_->Decompress(input_len, input, output_len, output));
+ compressed_pos_ += result.bytes_read;
+ if (result.bytes_read > 0) {
+ fresh_decompressor_ = false;
+ }
+ if (result.bytes_written > 0 || !result.need_more_output || input_len == 0) {
+ RETURN_NOT_OK(decompressed_->Resize(result.bytes_written));
+ break;
+ }
+ DCHECK_EQ(result.bytes_written, 0);
+ // Need to enlarge output buffer
+ decompress_size *= 2;
+ }
+ return Status::OK();
+ }
+
+ // Read a given number of bytes from the decompressed_ buffer.
+ int64_t ReadFromDecompressed(int64_t nbytes, uint8_t* out) {
+ int64_t readable = decompressed_ ? (decompressed_->size() - decompressed_pos_) : 0;
+ int64_t read_bytes = std::min(readable, nbytes);
+
+ if (read_bytes > 0) {
+ memcpy(out, decompressed_->data() + decompressed_pos_, read_bytes);
+ decompressed_pos_ += read_bytes;
+
+ if (decompressed_pos_ == decompressed_->size()) {
+ // Decompressed data is exhausted, release buffer
+ decompressed_.reset();
+ }
+ }
+
+ return read_bytes;
+ }
+
+ // Try to feed more data into the decompressed_ buffer.
+ Status RefillDecompressed(bool* has_data) {
+ // First try to read data from the decompressor
+ if (compressed_) {
+ if (decompressor_->IsFinished()) {
+ // We just went over the end of a previous compressed stream.
+ RETURN_NOT_OK(decompressor_->Reset());
+ fresh_decompressor_ = true;
+ }
+ RETURN_NOT_OK(DecompressData());
+ }
+ if (!decompressed_ || decompressed_->size() == 0) {
+ // Got nothing, need to read more compressed data
+ RETURN_NOT_OK(EnsureCompressedData());
+ if (compressed_pos_ == compressed_->size()) {
+ // No more data to decompress
+ if (!fresh_decompressor_ && !decompressor_->IsFinished()) {
+ return Status::IOError("Truncated compressed stream");
+ }
+ *has_data = false;
+ return Status::OK();
+ }
+ RETURN_NOT_OK(DecompressData());
+ }
+ *has_data = true;
+ return Status::OK();
+ }
+
+ Result<int64_t> Read(int64_t nbytes, void* out) {
+ auto out_data = reinterpret_cast<uint8_t*>(out);
+
+ int64_t total_read = 0;
+ bool decompressor_has_data = true;
+
+ while (nbytes - total_read > 0 && decompressor_has_data) {
+ total_read += ReadFromDecompressed(nbytes - total_read, out_data + total_read);
+
+ if (nbytes == total_read) {
+ break;
+ }
+
+ // At this point, no more decompressed data remains, so we need to
+ // decompress more
+ RETURN_NOT_OK(RefillDecompressed(&decompressor_has_data));
+ }
+
+ total_pos_ += total_read;
+ return total_read;
+ }
+
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) {
+ ARROW_ASSIGN_OR_RAISE(auto buf, AllocateResizableBuffer(nbytes, pool_));
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buf->mutable_data()));
+ RETURN_NOT_OK(buf->Resize(bytes_read));
+ return std::move(buf);
+ }
+
+ std::shared_ptr<InputStream> raw() const { return raw_; }
+
+ private:
+ // Read 64 KB compressed data at a time
+ static const int64_t kChunkSize = 64 * 1024;
+ // Decompress 1 MB at a time
+ static const int64_t kDecompressSize = 1024 * 1024;
+
+ MemoryPool* pool_;
+ std::shared_ptr<InputStream> raw_;
+ bool is_open_;
+ std::shared_ptr<Decompressor> decompressor_;
+ std::shared_ptr<Buffer> compressed_;
+ // Position in compressed buffer
+ int64_t compressed_pos_;
+ std::shared_ptr<ResizableBuffer> decompressed_;
+ // Position in decompressed buffer
+ int64_t decompressed_pos_;
+ // True if the decompressor hasn't read any data yet.
+ bool fresh_decompressor_;
+ // Total number of bytes decompressed
+ int64_t total_pos_;
+};
+
+Result<std::shared_ptr<CompressedInputStream>> CompressedInputStream::Make(
+ Codec* codec, const std::shared_ptr<InputStream>& raw, MemoryPool* pool) {
+ // CAUTION: codec is not owned
+ std::shared_ptr<CompressedInputStream> res(new CompressedInputStream);
+ res->impl_.reset(new Impl(pool, std::move(raw)));
+ RETURN_NOT_OK(res->impl_->Init(codec));
+ return res;
+}
+
+CompressedInputStream::~CompressedInputStream() { internal::CloseFromDestructor(this); }
+
+Status CompressedInputStream::DoClose() { return impl_->Close(); }
+
+Status CompressedInputStream::DoAbort() { return impl_->Abort(); }
+
+bool CompressedInputStream::closed() const { return impl_->closed(); }
+
+Result<int64_t> CompressedInputStream::DoTell() const { return impl_->Tell(); }
+
+Result<int64_t> CompressedInputStream::DoRead(int64_t nbytes, void* out) {
+ return impl_->Read(nbytes, out);
+}
+
+Result<std::shared_ptr<Buffer>> CompressedInputStream::DoRead(int64_t nbytes) {
+ return impl_->Read(nbytes);
+}
+
+std::shared_ptr<InputStream> CompressedInputStream::raw() const { return impl_->raw(); }
+
+Result<std::shared_ptr<const KeyValueMetadata>> CompressedInputStream::ReadMetadata() {
+ return impl_->raw()->ReadMetadata();
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> CompressedInputStream::ReadMetadataAsync(
+ const IOContext& io_context) {
+ return impl_->raw()->ReadMetadataAsync(io_context);
+}
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h
new file mode 100644
index 00000000000..cd1a7f673ce
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Compressed stream implementations
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/io/concurrency.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+class Status;
+
+namespace util {
+
+class Codec;
+
+} // namespace util
+
+namespace io {
+
+class ARROW_EXPORT CompressedOutputStream : public OutputStream {
+ public:
+ ~CompressedOutputStream() override;
+
+ /// \brief Create a compressed output stream wrapping the given output stream.
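+  ///
+  /// A minimal sketch (assumes GZIP support is compiled in; `sink` is a
+  /// hypothetical OutputStream, and the codec must outlive the stream):
+  ///
+  ///   ARROW_ASSIGN_OR_RAISE(auto codec, util::Codec::Create(Compression::GZIP));
+  ///   ARROW_ASSIGN_OR_RAISE(auto out,
+  ///       CompressedOutputStream::Make(codec.get(), sink));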
+ static Result<std::shared_ptr<CompressedOutputStream>> Make(
+ util::Codec* codec, const std::shared_ptr<OutputStream>& raw,
+ MemoryPool* pool = default_memory_pool());
+
+ // OutputStream interface
+
+ /// \brief Close the compressed output stream. This implicitly closes the
+ /// underlying raw output stream.
+ Status Close() override;
+ Status Abort() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Status Write(const void* data, int64_t nbytes) override;
+ /// \cond FALSE
+ using Writable::Write;
+ /// \endcond
+ Status Flush() override;
+
+ /// \brief Return the underlying raw output stream.
+ std::shared_ptr<OutputStream> raw() const;
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(CompressedOutputStream);
+
+ CompressedOutputStream() = default;
+
+ class ARROW_NO_EXPORT Impl;
+ std::unique_ptr<Impl> impl_;
+};
+
+class ARROW_EXPORT CompressedInputStream
+ : public internal::InputStreamConcurrencyWrapper<CompressedInputStream> {
+ public:
+ ~CompressedInputStream() override;
+
+ /// \brief Create a compressed input stream wrapping the given input stream.
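+  ///
+  /// A matching read-side sketch (`raw` is a hypothetical InputStream over
+  /// GZIP-compressed bytes; the codec must outlive the stream):
+  ///
+  ///   ARROW_ASSIGN_OR_RAISE(auto codec, util::Codec::Create(Compression::GZIP));
+  ///   ARROW_ASSIGN_OR_RAISE(auto in, CompressedInputStream::Make(codec.get(), raw));
+  ///   ARROW_ASSIGN_OR_RAISE(auto buf, in->Read(1024));  // decompressed bytes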
+ static Result<std::shared_ptr<CompressedInputStream>> Make(
+ util::Codec* codec, const std::shared_ptr<InputStream>& raw,
+ MemoryPool* pool = default_memory_pool());
+
+ // InputStream interface
+
+ bool closed() const override;
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context) override;
+
+ /// \brief Return the underlying raw input stream.
+ std::shared_ptr<InputStream> raw() const;
+
+ private:
+ friend InputStreamConcurrencyWrapper<CompressedInputStream>;
+ ARROW_DISALLOW_COPY_AND_ASSIGN(CompressedInputStream);
+
+ CompressedInputStream() = default;
+
+ /// \brief Close the compressed input stream. This implicitly closes the
+ /// underlying raw input stream.
+ Status DoClose();
+ Status DoAbort() override;
+ Result<int64_t> DoTell() const;
+ Result<int64_t> DoRead(int64_t nbytes, void* out);
+ Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+
+ class ARROW_NO_EXPORT Impl;
+ std::unique_ptr<Impl> impl_;
+};
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/concurrency.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/concurrency.h
new file mode 100644
index 00000000000..b41ad2c1350
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/concurrency.h
@@ -0,0 +1,263 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+namespace internal {
+
+template <class LockType>
+class SharedLockGuard {
+ public:
+ explicit SharedLockGuard(LockType* lock) : lock_(lock) { lock_->LockShared(); }
+
+ ~SharedLockGuard() { lock_->UnlockShared(); }
+
+ protected:
+ LockType* lock_;
+};
+
+template <class LockType>
+class ExclusiveLockGuard {
+ public:
+ explicit ExclusiveLockGuard(LockType* lock) : lock_(lock) { lock_->LockExclusive(); }
+
+ ~ExclusiveLockGuard() { lock_->UnlockExclusive(); }
+
+ protected:
+ LockType* lock_;
+};
+
+// Debug concurrency checker that marks "shared" and "exclusive" code sections,
+// aborting if the concurrency rules get violated. Does nothing in release mode.
+// Note that we intentionally use the same class declaration in debug and
+// release builds in order to avoid runtime failures when e.g. loading a
+// release-built DLL with a debug-built application, or the reverse.
+
+class ARROW_EXPORT SharedExclusiveChecker {
+ public:
+ SharedExclusiveChecker();
+ void LockShared();
+ void UnlockShared();
+ void LockExclusive();
+ void UnlockExclusive();
+
+ SharedLockGuard<SharedExclusiveChecker> shared_guard() {
+ return SharedLockGuard<SharedExclusiveChecker>(this);
+ }
+
+ ExclusiveLockGuard<SharedExclusiveChecker> exclusive_guard() {
+ return ExclusiveLockGuard<SharedExclusiveChecker>(this);
+ }
+
+ protected:
+ struct Impl;
+ std::shared_ptr<Impl> impl_;
+};
+
+// Concurrency wrappers for IO classes that check the correctness of
+// concurrent calls to various methods. It is not necessary to wrap all
+// IO classes with these, only a few core classes that get used in tests.
+//
+// We're not using virtual inheritance here as virtual bases have poorly
+// understood semantic overhead which we'd be passing on to implementers
+// and users of these interfaces. Instead, we just duplicate the method
+// wrappers between those two classes.
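+//
+// A hypothetical sketch of a wrapped stream (mirroring the "methods to
+// implement" contract documented inside the wrappers below):
+//
+//   class MyStream : public InputStreamConcurrencyWrapper<MyStream> {
+//     friend InputStreamConcurrencyWrapper<MyStream>;
+//     Status DoClose();
+//     Result<int64_t> DoTell() const;
+//     Result<int64_t> DoRead(int64_t nbytes, void* out);
+//     Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+//   };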
+
+template <class Derived>
+class ARROW_EXPORT InputStreamConcurrencyWrapper : public InputStream {
+ public:
+ Status Close() final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoClose();
+ }
+
+ Status Abort() final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoAbort();
+ }
+
+ Result<int64_t> Tell() const final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoTell();
+ }
+
+ Result<int64_t> Read(int64_t nbytes, void* out) final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoRead(nbytes, out);
+ }
+
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoRead(nbytes);
+ }
+
+ Result<util::string_view> Peek(int64_t nbytes) final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoPeek(nbytes);
+ }
+
+ /*
+ Methods to implement in derived class:
+
+ Status DoClose();
+ Result<int64_t> DoTell() const;
+ Result<int64_t> DoRead(int64_t nbytes, void* out);
+ Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+
+ And optionally:
+
+ Status DoAbort() override;
+ Result<util::string_view> DoPeek(int64_t nbytes) override;
+
+ These methods should be protected in the derived class and
+ InputStreamConcurrencyWrapper declared as a friend with
+
+ friend InputStreamConcurrencyWrapper<derived>;
+ */
+
+ protected:
+ // Default implementations. They are virtual because the derived class may
+ // have derived classes itself.
+ virtual Status DoAbort() { return derived()->DoClose(); }
+
+ virtual Result<util::string_view> DoPeek(int64_t ARROW_ARG_UNUSED(nbytes)) {
+ return Status::NotImplemented("Peek not implemented");
+ }
+
+ Derived* derived() { return ::arrow::internal::checked_cast<Derived*>(this); }
+
+ const Derived* derived() const {
+ return ::arrow::internal::checked_cast<const Derived*>(this);
+ }
+
+ mutable SharedExclusiveChecker lock_;
+};
+
+template <class Derived>
+class ARROW_EXPORT RandomAccessFileConcurrencyWrapper : public RandomAccessFile {
+ public:
+ Status Close() final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoClose();
+ }
+
+ Status Abort() final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoAbort();
+ }
+
+ Result<int64_t> Tell() const final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoTell();
+ }
+
+ Result<int64_t> Read(int64_t nbytes, void* out) final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoRead(nbytes, out);
+ }
+
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoRead(nbytes);
+ }
+
+ Result<util::string_view> Peek(int64_t nbytes) final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoPeek(nbytes);
+ }
+
+ Status Seek(int64_t position) final {
+ auto guard = lock_.exclusive_guard();
+ return derived()->DoSeek(position);
+ }
+
+ Result<int64_t> GetSize() final {
+ auto guard = lock_.shared_guard();
+ return derived()->DoGetSize();
+ }
+
+ // NOTE: ReadAt doesn't use stream pointer, but it is allowed to update it
+ // (it's the case on Windows when using ReadFileEx).
+ // So any method that relies on the current position (even if it doesn't
+ // update it, such as Peek) cannot run in parallel with ReadAt and has
+ // to use the exclusive_guard.
+
+ Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) final {
+ auto guard = lock_.shared_guard();
+ return derived()->DoReadAt(position, nbytes, out);
+ }
+
+ Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) final {
+ auto guard = lock_.shared_guard();
+ return derived()->DoReadAt(position, nbytes);
+ }
+
+ /*
+ Methods to implement in derived class:
+
+ Status DoClose();
+ Result<int64_t> DoTell() const;
+ Result<int64_t> DoRead(int64_t nbytes, void* out);
+ Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+ Status DoSeek(int64_t position);
+ Result<int64_t> DoGetSize()
+ Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out);
+ Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes);
+
+ And optionally:
+
+ Status DoAbort() override;
+ Result<util::string_view> DoPeek(int64_t nbytes) override;
+
+ These methods should be protected in the derived class and
+ RandomAccessFileConcurrencyWrapper declared as a friend with
+
+ friend RandomAccessFileConcurrencyWrapper<derived>;
+ */
+
+ protected:
+ // Default implementations. They are virtual because the derived class may
+ // have derived classes itself.
+ virtual Status DoAbort() { return derived()->DoClose(); }
+
+ virtual Result<util::string_view> DoPeek(int64_t ARROW_ARG_UNUSED(nbytes)) {
+ return Status::NotImplemented("Peek not implemented");
+ }
+
+ Derived* derived() { return ::arrow::internal::checked_cast<Derived*>(this); }
+
+ const Derived* derived() const {
+ return ::arrow::internal::checked_cast<const Derived*>(this);
+ }
+
+ mutable SharedExclusiveChecker lock_;
+};
+
+} // namespace internal
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc
new file mode 100644
index 00000000000..70e15335af2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc
@@ -0,0 +1,772 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/windows_compatibility.h" // IWYU pragma: keep
+
+// sys/mman.h not present in Visual Studio or Cygwin
+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include "arrow/io/mman.h"
+#undef Realloc
+#undef Free
+#else
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h> // IWYU pragma: keep
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <utility>
+
+// ----------------------------------------------------------------------
+// Other Arrow includes
+
+#include "arrow/io/file.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/util_internal.h"
+
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/future.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::IOErrorFromErrno;
+
+namespace io {
+
+class OSFile {
+ public:
+ OSFile() : fd_(-1), is_open_(false), size_(-1), need_seeking_(false) {}
+
+ ~OSFile() {}
+
+ // Note: only one of the Open* methods below may be called on a given instance
+
+ Status OpenWritable(const std::string& path, bool truncate, bool append,
+ bool write_only) {
+ RETURN_NOT_OK(SetFileName(path));
+
+ ARROW_ASSIGN_OR_RAISE(fd_, ::arrow::internal::FileOpenWritable(file_name_, write_only,
+ truncate, append));
+ is_open_ = true;
+ mode_ = write_only ? FileMode::WRITE : FileMode::READWRITE;
+
+ if (!truncate) {
+ ARROW_ASSIGN_OR_RAISE(size_, ::arrow::internal::FileGetSize(fd_));
+ } else {
+ size_ = 0;
+ }
+ return Status::OK();
+ }
+
+ // This is different from OpenWritable(string, ...) in that it neither
+ // truncates the file nor requires it to be seekable
+ Status OpenWritable(int fd) {
+ auto result = ::arrow::internal::FileGetSize(fd);
+ if (result.ok()) {
+ size_ = *result;
+ } else {
+ // Non-seekable file
+ size_ = -1;
+ }
+ RETURN_NOT_OK(SetFileName(fd));
+ is_open_ = true;
+ mode_ = FileMode::WRITE;
+ fd_ = fd;
+ return Status::OK();
+ }
+
+ Status OpenReadable(const std::string& path) {
+ RETURN_NOT_OK(SetFileName(path));
+
+ ARROW_ASSIGN_OR_RAISE(fd_, ::arrow::internal::FileOpenReadable(file_name_));
+ ARROW_ASSIGN_OR_RAISE(size_, ::arrow::internal::FileGetSize(fd_));
+
+ is_open_ = true;
+ mode_ = FileMode::READ;
+ return Status::OK();
+ }
+
+ Status OpenReadable(int fd) {
+ ARROW_ASSIGN_OR_RAISE(size_, ::arrow::internal::FileGetSize(fd));
+ RETURN_NOT_OK(SetFileName(fd));
+ is_open_ = true;
+ mode_ = FileMode::READ;
+ fd_ = fd;
+ return Status::OK();
+ }
+
+ Status CheckClosed() const {
+ if (!is_open_) {
+ return Status::Invalid("Invalid operation on closed file");
+ }
+ return Status::OK();
+ }
+
+ Status Close() {
+ if (is_open_) {
+ // Even if closing fails, the fd will likely be closed (perhaps it's
+ // already closed).
+ is_open_ = false;
+ int fd = fd_;
+ fd_ = -1;
+ RETURN_NOT_OK(::arrow::internal::FileClose(fd));
+ }
+ return Status::OK();
+ }
+
+ Result<int64_t> Read(int64_t nbytes, void* out) {
+ RETURN_NOT_OK(CheckClosed());
+ RETURN_NOT_OK(CheckPositioned());
+ return ::arrow::internal::FileRead(fd_, reinterpret_cast<uint8_t*>(out), nbytes);
+ }
+
+ Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) {
+ RETURN_NOT_OK(CheckClosed());
+ RETURN_NOT_OK(internal::ValidateRange(position, nbytes));
+ // ReadAt() leaves the file position undefined, so require that we seek
+ // before calling Read() or Write().
+ need_seeking_.store(true);
+ return ::arrow::internal::FileReadAt(fd_, reinterpret_cast<uint8_t*>(out), position,
+ nbytes);
+ }
+
+ Status Seek(int64_t pos) {
+ RETURN_NOT_OK(CheckClosed());
+ if (pos < 0) {
+ return Status::Invalid("Invalid position");
+ }
+ Status st = ::arrow::internal::FileSeek(fd_, pos);
+ if (st.ok()) {
+ need_seeking_.store(false);
+ }
+ return st;
+ }
+
+ Result<int64_t> Tell() const {
+ RETURN_NOT_OK(CheckClosed());
+ return ::arrow::internal::FileTell(fd_);
+ }
+
+ Status Write(const void* data, int64_t length) {
+ RETURN_NOT_OK(CheckClosed());
+
+ std::lock_guard<std::mutex> guard(lock_);
+ RETURN_NOT_OK(CheckPositioned());
+ if (length < 0) {
+ return Status::IOError("Length must be non-negative");
+ }
+ return ::arrow::internal::FileWrite(fd_, reinterpret_cast<const uint8_t*>(data),
+ length);
+ }
+
+ int fd() const { return fd_; }
+
+ bool is_open() const { return is_open_; }
+
+ int64_t size() const { return size_; }
+
+ FileMode::type mode() const { return mode_; }
+
+ std::mutex& lock() { return lock_; }
+
+ protected:
+ Status SetFileName(const std::string& file_name) {
+ return ::arrow::internal::PlatformFilename::FromString(file_name).Value(&file_name_);
+ }
+
+ Status SetFileName(int fd) {
+ std::stringstream ss;
+ ss << "<fd " << fd << ">";
+ return SetFileName(ss.str());
+ }
+
+ Status CheckPositioned() {
+ if (need_seeking_.load()) {
+ return Status::Invalid(
+ "Need seeking after ReadAt() before "
+ "calling implicitly-positioned operation");
+ }
+ return Status::OK();
+ }
+
+ ::arrow::internal::PlatformFilename file_name_;
+
+ std::mutex lock_;
+
+ // File descriptor
+ int fd_;
+
+ FileMode::type mode_;
+
+ bool is_open_;
+ int64_t size_;
+ // Whether ReadAt made the file position non-deterministic.
+ std::atomic<bool> need_seeking_;
+};
+
+// ----------------------------------------------------------------------
+// ReadableFile implementation
+
+class ReadableFile::ReadableFileImpl : public OSFile {
+ public:
+ explicit ReadableFileImpl(MemoryPool* pool) : OSFile(), pool_(pool) {}
+
+ Status Open(const std::string& path) { return OpenReadable(path); }
+ Status Open(int fd) { return OpenReadable(fd); }
+
+ Result<std::shared_ptr<Buffer>> ReadBuffer(int64_t nbytes) {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_));
+
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data()));
+ if (bytes_read < nbytes) {
+ RETURN_NOT_OK(buffer->Resize(bytes_read));
+ buffer->ZeroPadding();
+ }
+ return std::move(buffer);
+ }
+
+ Result<std::shared_ptr<Buffer>> ReadBufferAt(int64_t position, int64_t nbytes) {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_));
+
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read,
+ ReadAt(position, nbytes, buffer->mutable_data()));
+ if (bytes_read < nbytes) {
+ RETURN_NOT_OK(buffer->Resize(bytes_read));
+ buffer->ZeroPadding();
+ }
+ return std::move(buffer);
+ }
+
+ Status WillNeed(const std::vector<ReadRange>& ranges) {
+ RETURN_NOT_OK(CheckClosed());
+ for (const auto& range : ranges) {
+ RETURN_NOT_OK(internal::ValidateRange(range.offset, range.length));
+#if defined(POSIX_FADV_WILLNEED)
+ if (posix_fadvise(fd_, range.offset, range.length, POSIX_FADV_WILLNEED)) {
+ return IOErrorFromErrno(errno, "posix_fadvise failed");
+ }
+#elif defined(F_RDADVISE) // macOS, BSD?
+ struct {
+ off_t ra_offset;
+ int ra_count;
+ } radvisory{range.offset, static_cast<int>(range.length)};
+ if (radvisory.ra_count > 0 && fcntl(fd_, F_RDADVISE, &radvisory) == -1) {
+ return IOErrorFromErrno(errno, "fcntl(fd, F_RDADVISE, ...) failed");
+ }
+#endif
+ }
+ return Status::OK();
+ }
+
+ private:
+ MemoryPool* pool_;
+};
+
+ReadableFile::ReadableFile(MemoryPool* pool) { impl_.reset(new ReadableFileImpl(pool)); }
+
+ReadableFile::~ReadableFile() { internal::CloseFromDestructor(this); }
+
+Result<std::shared_ptr<ReadableFile>> ReadableFile::Open(const std::string& path,
+ MemoryPool* pool) {
+ auto file = std::shared_ptr<ReadableFile>(new ReadableFile(pool));
+ RETURN_NOT_OK(file->impl_->Open(path));
+ return file;
+}
+
+Result<std::shared_ptr<ReadableFile>> ReadableFile::Open(int fd, MemoryPool* pool) {
+ auto file = std::shared_ptr<ReadableFile>(new ReadableFile(pool));
+ RETURN_NOT_OK(file->impl_->Open(fd));
+ return file;
+}
+
+Status ReadableFile::DoClose() { return impl_->Close(); }
+
+bool ReadableFile::closed() const { return !impl_->is_open(); }
+
+Status ReadableFile::WillNeed(const std::vector<ReadRange>& ranges) {
+ return impl_->WillNeed(ranges);
+}
+
+Result<int64_t> ReadableFile::DoTell() const { return impl_->Tell(); }
+
+Result<int64_t> ReadableFile::DoRead(int64_t nbytes, void* out) {
+ return impl_->Read(nbytes, out);
+}
+
+Result<int64_t> ReadableFile::DoReadAt(int64_t position, int64_t nbytes, void* out) {
+ return impl_->ReadAt(position, nbytes, out);
+}
+
+Result<std::shared_ptr<Buffer>> ReadableFile::DoReadAt(int64_t position, int64_t nbytes) {
+ return impl_->ReadBufferAt(position, nbytes);
+}
+
+Result<std::shared_ptr<Buffer>> ReadableFile::DoRead(int64_t nbytes) {
+ return impl_->ReadBuffer(nbytes);
+}
+
+Result<int64_t> ReadableFile::DoGetSize() { return impl_->size(); }
+
+Status ReadableFile::DoSeek(int64_t pos) { return impl_->Seek(pos); }
+
+int ReadableFile::file_descriptor() const { return impl_->fd(); }
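+
+// Example usage (a minimal sketch; "example.dat" is a placeholder path, and
+// the snippet assumes a surrounding function returning arrow::Status):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto file, ReadableFile::Open("example.dat"));
+//   // Optionally hint the OS that the first 4096 bytes will be read soon
+//   RETURN_NOT_OK(file->WillNeed({{0, 4096}}));
+//   // Positioned read; safe to call from multiple threads concurrently
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> buf, file->ReadAt(0, 64));
+//   RETURN_NOT_OK(file->Close());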
+
+// ----------------------------------------------------------------------
+// FileOutputStream
+
+class FileOutputStream::FileOutputStreamImpl : public OSFile {
+ public:
+ Status Open(const std::string& path, bool append) {
+ const bool truncate = !append;
+ return OpenWritable(path, truncate, append, true /* write_only */);
+ }
+ Status Open(int fd) { return OpenWritable(fd); }
+};
+
+FileOutputStream::FileOutputStream() { impl_.reset(new FileOutputStreamImpl()); }
+
+FileOutputStream::~FileOutputStream() { internal::CloseFromDestructor(this); }
+
+Result<std::shared_ptr<FileOutputStream>> FileOutputStream::Open(const std::string& path,
+ bool append) {
+ auto stream = std::shared_ptr<FileOutputStream>(new FileOutputStream());
+ RETURN_NOT_OK(stream->impl_->Open(path, append));
+ return stream;
+}
+
+Result<std::shared_ptr<FileOutputStream>> FileOutputStream::Open(int fd) {
+ auto stream = std::shared_ptr<FileOutputStream>(new FileOutputStream());
+ RETURN_NOT_OK(stream->impl_->Open(fd));
+ return stream;
+}
+
+Status FileOutputStream::Close() { return impl_->Close(); }
+
+bool FileOutputStream::closed() const { return !impl_->is_open(); }
+
+Result<int64_t> FileOutputStream::Tell() const { return impl_->Tell(); }
+
+Status FileOutputStream::Write(const void* data, int64_t length) {
+ return impl_->Write(data, length);
+}
+
+int FileOutputStream::file_descriptor() const { return impl_->fd(); }
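+
+// Example usage (a minimal sketch; "example.out" is a placeholder path, and
+// the snippet assumes a surrounding function returning arrow::Status):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto stream, FileOutputStream::Open("example.out"));
+//   RETURN_NOT_OK(stream->Write("hello", 5));
+//   RETURN_NOT_OK(stream->Close());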
+
+// ----------------------------------------------------------------------
+// Implement MemoryMappedFile
+
+class MemoryMappedFile::MemoryMap
+ : public std::enable_shared_from_this<MemoryMappedFile::MemoryMap> {
+ public:
+ // An object representing the entire memory-mapped region.
+ // It can be sliced in order to return individual subregions, which
+ // will then keep the original region alive as long as necessary.
+ class Region : public Buffer {
+ public:
+ Region(std::shared_ptr<MemoryMappedFile::MemoryMap> memory_map, uint8_t* data,
+ int64_t size)
+ : Buffer(data, size) {
+ is_mutable_ = memory_map->writable();
+ }
+
+ ~Region() {
+ if (data_ != nullptr) {
+ int result = munmap(data(), static_cast<size_t>(size_));
+ ARROW_CHECK_EQ(result, 0) << "munmap failed";
+ }
+ }
+
+ // For convenience
+ uint8_t* data() { return const_cast<uint8_t*>(data_); }
+
+ void Detach() { data_ = nullptr; }
+ };
+
+ MemoryMap() : file_size_(0), map_len_(0) {}
+
+ ~MemoryMap() { ARROW_CHECK_OK(Close()); }
+
+ Status Close() {
+ if (file_->is_open()) {
+ // Lose our reference to the mapped Region, so that munmap()
+ // is called as soon as all buffer exports are released.
+ region_.reset();
+ return file_->Close();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ bool closed() const { return !file_->is_open(); }
+
+ Status CheckClosed() const {
+ if (closed()) {
+ return Status::Invalid("Invalid operation on closed file");
+ }
+ return Status::OK();
+ }
+
+ Status Open(const std::string& path, FileMode::type mode, const int64_t offset = 0,
+ const int64_t length = -1) {
+ file_.reset(new OSFile());
+
+ if (mode != FileMode::READ) {
+ // Memory mapping has permission failures if PROT_READ not set
+ prot_flags_ = PROT_READ | PROT_WRITE;
+ map_mode_ = MAP_SHARED;
+ constexpr bool append = false;
+ constexpr bool truncate = false;
+ constexpr bool write_only = false;
+ RETURN_NOT_OK(file_->OpenWritable(path, truncate, append, write_only));
+ } else {
+ prot_flags_ = PROT_READ;
+ map_mode_ = MAP_PRIVATE; // Changes are not to be committed back to the file
+ RETURN_NOT_OK(file_->OpenReadable(path));
+ }
+ map_len_ = offset_ = 0;
+
+ // Memory mapping fails when the file size is 0, so delay creating
+ // the mapping until the first resize
+ if (file_->size() > 0) {
+ RETURN_NOT_OK(InitMMap(file_->size(), false, offset, length));
+ }
+
+ position_ = 0;
+
+ return Status::OK();
+ }
+
+ // Resize the mmap and file to the specified size.
+ // Resizing a partial memory map (a mapped file region) is not supported.
+ Status Resize(const int64_t new_size) {
+ if (!writable()) {
+ return Status::IOError("Cannot resize a readonly memory map");
+ }
+ if (map_len_ != file_size_) {
+ return Status::IOError("Cannot resize a partial memory map");
+ }
+ if (region_.use_count() > 1) {
+ // There are currently exported buffers; the MemoryMapRemap() call
+ // would invalidate them
+ return Status::IOError("Cannot resize memory map while there are active readers");
+ }
+
+ if (new_size == 0) {
+ if (map_len_ > 0) {
+ // Just unmap the mmap and truncate the file to 0 size
+ region_.reset();
+ RETURN_NOT_OK(::arrow::internal::FileTruncate(file_->fd(), 0));
+ map_len_ = offset_ = file_size_ = 0;
+ }
+ position_ = 0;
+ return Status::OK();
+ }
+
+ if (map_len_ > 0) {
+ void* result;
+ auto data = region_->data();
+ RETURN_NOT_OK(::arrow::internal::MemoryMapRemap(data, map_len_, new_size,
+ file_->fd(), &result));
+ region_->Detach(); // avoid munmap() on destruction
+ region_ = std::make_shared<Region>(shared_from_this(),
+ static_cast<uint8_t*>(result), new_size);
+ map_len_ = file_size_ = new_size;
+ offset_ = 0;
+ if (position_ > map_len_) {
+ position_ = map_len_;
+ }
+ } else {
+ DCHECK_EQ(position_, 0);
+ // The mmap is not yet initialized; resize the underlying
+ // file, since it might have been 0-sized
+ RETURN_NOT_OK(InitMMap(new_size, /*resize_file*/ true));
+ }
+ return Status::OK();
+ }
+
+ Status Seek(int64_t position) {
+ if (position < 0) {
+ return Status::Invalid("position is out of bounds");
+ }
+ position_ = position;
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<Buffer>> Slice(int64_t offset, int64_t length) {
+ length = std::max<int64_t>(0, std::min(length, map_len_ - offset));
+
+ if (length > 0) {
+ DCHECK_NE(region_, nullptr);
+ return SliceBuffer(region_, offset, length);
+ } else {
+ return std::make_shared<Buffer>(nullptr, 0);
+ }
+ }
+
+ // map_len_ == file_size_ if memory mapping on the whole file
+ int64_t size() const { return map_len_; }
+
+ int64_t position() { return position_; }
+
+ void advance(int64_t nbytes) { position_ = position_ + nbytes; }
+
+ uint8_t* data() { return region_ ? region_->data() : nullptr; }
+
+ uint8_t* head() { return data() + position_; }
+
+ bool writable() { return file_->mode() != FileMode::READ; }
+
+ bool opened() { return file_->is_open(); }
+
+ int fd() const { return file_->fd(); }
+
+ std::mutex& write_lock() { return file_->lock(); }
+
+ std::mutex& resize_lock() { return resize_lock_; }
+
+ private:
+ // Initialize the mmap and set size, capacity and the data pointers
+ Status InitMMap(int64_t initial_size, bool resize_file = false,
+ const int64_t offset = 0, const int64_t length = -1) {
+ DCHECK(!region_);
+
+ if (resize_file) {
+ RETURN_NOT_OK(::arrow::internal::FileTruncate(file_->fd(), initial_size));
+ }
+
+ size_t mmap_length = static_cast<size_t>(initial_size);
+ if (length > initial_size) {
+ return Status::Invalid("mapping length is beyond file size");
+ }
+ if (length >= 0 && length < initial_size) {
+ // memory mapping a file region
+ mmap_length = static_cast<size_t>(length);
+ }
+
+ void* result = mmap(nullptr, mmap_length, prot_flags_, map_mode_, file_->fd(),
+ static_cast<off_t>(offset));
+ if (result == MAP_FAILED) {
+ return Status::IOError("Memory mapping file failed: ",
+ ::arrow::internal::ErrnoMessage(errno));
+ }
+ map_len_ = mmap_length;
+ offset_ = offset;
+ region_ = std::make_shared<Region>(shared_from_this(), static_cast<uint8_t*>(result),
+ map_len_);
+ file_size_ = initial_size;
+
+ return Status::OK();
+ }
+
+ std::unique_ptr<OSFile> file_;
+ int prot_flags_;
+ int map_mode_;
+
+ std::shared_ptr<Region> region_;
+ int64_t file_size_;
+ int64_t position_;
+ int64_t offset_;
+ int64_t map_len_;
+ std::mutex resize_lock_;
+};
+
+MemoryMappedFile::MemoryMappedFile() {}
+
+MemoryMappedFile::~MemoryMappedFile() { internal::CloseFromDestructor(this); }
+
+Result<std::shared_ptr<MemoryMappedFile>> MemoryMappedFile::Create(
+ const std::string& path, int64_t size) {
+ ARROW_ASSIGN_OR_RAISE(auto file, FileOutputStream::Open(path));
+ RETURN_NOT_OK(::arrow::internal::FileTruncate(file->file_descriptor(), size));
+ RETURN_NOT_OK(file->Close());
+ return MemoryMappedFile::Open(path, FileMode::READWRITE);
+}
+
+Result<std::shared_ptr<MemoryMappedFile>> MemoryMappedFile::Open(const std::string& path,
+ FileMode::type mode) {
+ std::shared_ptr<MemoryMappedFile> result(new MemoryMappedFile());
+
+ result->memory_map_.reset(new MemoryMap());
+ RETURN_NOT_OK(result->memory_map_->Open(path, mode));
+ return result;
+}
+
+Result<std::shared_ptr<MemoryMappedFile>> MemoryMappedFile::Open(const std::string& path,
+ FileMode::type mode,
+ const int64_t offset,
+ const int64_t length) {
+ std::shared_ptr<MemoryMappedFile> result(new MemoryMappedFile());
+
+ result->memory_map_.reset(new MemoryMap());
+ RETURN_NOT_OK(result->memory_map_->Open(path, mode, offset, length));
+ return result;
+}
+
+Result<int64_t> MemoryMappedFile::GetSize() {
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ return memory_map_->size();
+}
+
+Result<int64_t> MemoryMappedFile::Tell() const {
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ return memory_map_->position();
+}
+
+Status MemoryMappedFile::Seek(int64_t position) {
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ return memory_map_->Seek(position);
+}
+
+Status MemoryMappedFile::Close() { return memory_map_->Close(); }
+
+bool MemoryMappedFile::closed() const { return memory_map_->closed(); }
+
+Result<std::shared_ptr<Buffer>> MemoryMappedFile::ReadAt(int64_t position,
+ int64_t nbytes) {
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ // if the file is writable, we acquire the lock before creating any slices
+ // in case a resize is triggered concurrently, otherwise we wouldn't detect
+ // a change in the use count
+ auto guard_resize = memory_map_->writable()
+ ? std::unique_lock<std::mutex>(memory_map_->resize_lock())
+ : std::unique_lock<std::mutex>();
+
+ ARROW_ASSIGN_OR_RAISE(
+ nbytes, internal::ValidateReadRange(position, nbytes, memory_map_->size()));
+ // Arrange to page data in
+ RETURN_NOT_OK(::arrow::internal::MemoryAdviseWillNeed(
+ {{memory_map_->data() + position, static_cast<size_t>(nbytes)}}));
+ return memory_map_->Slice(position, nbytes);
+}
+
+Result<int64_t> MemoryMappedFile::ReadAt(int64_t position, int64_t nbytes, void* out) {
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ auto guard_resize = memory_map_->writable()
+ ? std::unique_lock<std::mutex>(memory_map_->resize_lock())
+ : std::unique_lock<std::mutex>();
+
+ ARROW_ASSIGN_OR_RAISE(
+ nbytes, internal::ValidateReadRange(position, nbytes, memory_map_->size()));
+ if (nbytes > 0) {
+ memcpy(out, memory_map_->data() + position, static_cast<size_t>(nbytes));
+ }
+ return nbytes;
+}
+
+Result<int64_t> MemoryMappedFile::Read(int64_t nbytes, void* out) {
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, ReadAt(memory_map_->position(), nbytes, out));
+ memory_map_->advance(bytes_read);
+ return bytes_read;
+}
+
+Result<std::shared_ptr<Buffer>> MemoryMappedFile::Read(int64_t nbytes) {
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ ARROW_ASSIGN_OR_RAISE(auto buffer, ReadAt(memory_map_->position(), nbytes));
+ memory_map_->advance(buffer->size());
+ return buffer;
+}
+
+Future<std::shared_ptr<Buffer>> MemoryMappedFile::ReadAsync(const IOContext&,
+ int64_t position,
+ int64_t nbytes) {
+ return Future<std::shared_ptr<Buffer>>::MakeFinished(ReadAt(position, nbytes));
+}
+
+Status MemoryMappedFile::WillNeed(const std::vector<ReadRange>& ranges) {
+ using ::arrow::internal::MemoryRegion;
+
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ auto guard_resize = memory_map_->writable()
+ ? std::unique_lock<std::mutex>(memory_map_->resize_lock())
+ : std::unique_lock<std::mutex>();
+
+ std::vector<MemoryRegion> regions(ranges.size());
+ for (size_t i = 0; i < ranges.size(); ++i) {
+ const auto& range = ranges[i];
+ ARROW_ASSIGN_OR_RAISE(
+ auto size,
+ internal::ValidateReadRange(range.offset, range.length, memory_map_->size()));
+ DCHECK_NE(memory_map_->data(), nullptr);
+ regions[i] = {const_cast<uint8_t*>(memory_map_->data() + range.offset),
+ static_cast<size_t>(size)};
+ }
+ return ::arrow::internal::MemoryAdviseWillNeed(regions);
+}
+
+bool MemoryMappedFile::supports_zero_copy() const { return true; }
+
+Status MemoryMappedFile::WriteAt(int64_t position, const void* data, int64_t nbytes) {
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ std::lock_guard<std::mutex> guard(memory_map_->write_lock());
+
+ if (!memory_map_->opened() || !memory_map_->writable()) {
+ return Status::IOError("Unable to write");
+ }
+ RETURN_NOT_OK(internal::ValidateWriteRange(position, nbytes, memory_map_->size()));
+
+ RETURN_NOT_OK(memory_map_->Seek(position));
+ return WriteInternal(data, nbytes);
+}
+
+Status MemoryMappedFile::Write(const void* data, int64_t nbytes) {
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ std::lock_guard<std::mutex> guard(memory_map_->write_lock());
+
+ if (!memory_map_->opened() || !memory_map_->writable()) {
+ return Status::IOError("Unable to write");
+ }
+ RETURN_NOT_OK(
+ internal::ValidateWriteRange(memory_map_->position(), nbytes, memory_map_->size()));
+
+ return WriteInternal(data, nbytes);
+}
+
+Status MemoryMappedFile::WriteInternal(const void* data, int64_t nbytes) {
+ memcpy(memory_map_->head(), data, static_cast<size_t>(nbytes));
+ memory_map_->advance(nbytes);
+ return Status::OK();
+}
+
+Status MemoryMappedFile::Resize(int64_t new_size) {
+ RETURN_NOT_OK(memory_map_->CheckClosed());
+ std::unique_lock<std::mutex> write_guard(memory_map_->write_lock(), std::defer_lock);
+ std::unique_lock<std::mutex> resize_guard(memory_map_->resize_lock(), std::defer_lock);
+ std::lock(write_guard, resize_guard);
+ RETURN_NOT_OK(memory_map_->Resize(new_size));
+ return Status::OK();
+}
+
+int MemoryMappedFile::file_descriptor() const { return memory_map_->fd(); }
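+
+// Example usage (a minimal sketch; "example.map" is a placeholder path, and
+// the snippet assumes a surrounding function returning arrow::Status):
+//
+//   // Create a 4096-byte file and open it memory-mapped in read/write mode
+//   ARROW_ASSIGN_OR_RAISE(auto mmf, MemoryMappedFile::Create("example.map", 4096));
+//   RETURN_NOT_OK(mmf->WriteAt(0, "hello", 5));
+//   // Zero-copy read: the returned Buffer aliases the mapping and keeps the
+//   // mapped region alive even after Close()
+//   ARROW_ASSIGN_OR_RAISE(auto buf, mmf->ReadAt(0, 5));
+//   RETURN_NOT_OK(mmf->Close());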
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h
new file mode 100644
index 00000000000..50d4f2c4dfc
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// IO interface implementations for OS files
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/io/concurrency.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+class Status;
+
+namespace io {
+
+/// \brief An operating system file open in write-only mode.
+class ARROW_EXPORT FileOutputStream : public OutputStream {
+ public:
+ ~FileOutputStream() override;
+
+ /// \brief Open a local file for writing
+ /// \param[in] path the file path, with UTF8 encoding
+ /// \param[in] append append to existing file, otherwise truncate to 0 bytes
+ /// \return an open FileOutputStream
+ ///
+ /// Unless append is true, any existing file at the indicated path is
+ /// truncated to 0 bytes, deleting any existing data
+ static Result<std::shared_ptr<FileOutputStream>> Open(const std::string& path,
+ bool append = false);
+
+ /// \brief Open a file descriptor for writing. The underlying file isn't
+ /// truncated.
+ /// \param[in] fd file descriptor
+ /// \return an open FileOutputStream
+ ///
+ /// The file descriptor becomes owned by the OutputStream, and will be closed
+ /// on Close() or destruction.
+ static Result<std::shared_ptr<FileOutputStream>> Open(int fd);
+
+ // OutputStream interface
+ Status Close() override;
+ bool closed() const override;
+ Result<int64_t> Tell() const override;
+
+ // Write bytes to the stream. Thread-safe
+ Status Write(const void* data, int64_t nbytes) override;
+ /// \cond FALSE
+ using Writable::Write;
+ /// \endcond
+
+ int file_descriptor() const;
+
+ private:
+ FileOutputStream();
+
+ class ARROW_NO_EXPORT FileOutputStreamImpl;
+ std::unique_ptr<FileOutputStreamImpl> impl_;
+};
+
+/// \brief An operating system file open in read-only mode.
+///
+/// Reads through this implementation are unbuffered. If many small reads
+/// need to be issued, it is recommended to use a buffering layer for good
+/// performance.
+class ARROW_EXPORT ReadableFile
+ : public internal::RandomAccessFileConcurrencyWrapper<ReadableFile> {
+ public:
+ ~ReadableFile() override;
+
+ /// \brief Open a local file for reading
+ /// \param[in] path the file path, with UTF8 encoding
+ /// \param[in] pool a MemoryPool for memory allocations
+ /// \return ReadableFile instance
+ static Result<std::shared_ptr<ReadableFile>> Open(
+ const std::string& path, MemoryPool* pool = default_memory_pool());
+
+ /// \brief Open a local file for reading
+ /// \param[in] fd file descriptor
+ /// \param[in] pool a MemoryPool for memory allocations
+ /// \return ReadableFile instance
+ ///
+ /// The file descriptor becomes owned by the ReadableFile, and will be closed
+ /// on Close() or destruction.
+ static Result<std::shared_ptr<ReadableFile>> Open(
+ int fd, MemoryPool* pool = default_memory_pool());
+
+ bool closed() const override;
+
+ int file_descriptor() const;
+
+ Status WillNeed(const std::vector<ReadRange>& ranges) override;
+
+ private:
+ friend RandomAccessFileConcurrencyWrapper<ReadableFile>;
+
+ explicit ReadableFile(MemoryPool* pool);
+
+ Status DoClose();
+ Result<int64_t> DoTell() const;
+ Result<int64_t> DoRead(int64_t nbytes, void* buffer);
+ Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+
+ /// \brief Thread-safe implementation of ReadAt
+ Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out);
+
+ /// \brief Thread-safe implementation of ReadAt
+ Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes);
+
+ Result<int64_t> DoGetSize();
+ Status DoSeek(int64_t position);
+
+ class ARROW_NO_EXPORT ReadableFileImpl;
+ std::unique_ptr<ReadableFileImpl> impl_;
+};
+
+/// \brief A file interface that uses memory-mapped files for memory interactions
+///
+/// This implementation supports zero-copy reads. The same class is used
+/// for both reading and writing.
+///
+/// If opening a file in a writable mode, it is not truncated first as with
+/// FileOutputStream.
+class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface {
+ public:
+ ~MemoryMappedFile() override;
+
+ /// Create new file with indicated size, return in read/write mode
+ static Result<std::shared_ptr<MemoryMappedFile>> Create(const std::string& path,
+ int64_t size);
+
+ // mmap() with whole file
+ static Result<std::shared_ptr<MemoryMappedFile>> Open(const std::string& path,
+ FileMode::type mode);
+
+ // mmap() with a region of the file; the offset must be a multiple of the page size
+ static Result<std::shared_ptr<MemoryMappedFile>> Open(const std::string& path,
+ FileMode::type mode,
+ const int64_t offset,
+ const int64_t length);
+
+ Status Close() override;
+
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Status Seek(int64_t position) override;
+
+ // Required by RandomAccessFile; copies memory into out. Not thread-safe
+ Result<int64_t> Read(int64_t nbytes, void* out) override;
+
+ // Zero-copy read; moves the position pointer. Not thread-safe
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+
+ // Zero-copy read, leaves position unchanged. Acquires a reader lock
+ // for the duration of slice creation (typically very short). Is thread-safe.
+ Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) override;
+
+ // Raw copy of the memory at the specified position. Thread-safe, but
+ // locks out other readers for the duration of the memcpy. Prefer the
+ // zero-copy method above
+ Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
+
+ // Synchronous ReadAsync override
+ Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+ int64_t nbytes) override;
+
+ Status WillNeed(const std::vector<ReadRange>& ranges) override;
+
+ bool supports_zero_copy() const override;
+
+ /// Write data at the current position in the file. Thread-safe
+ Status Write(const void* data, int64_t nbytes) override;
+ /// \cond FALSE
+ using Writable::Write;
+ /// \endcond
+
+ /// Set the size of the map to new_size.
+ Status Resize(int64_t new_size);
+
+ /// Write data at a particular position in the file. Thread-safe
+ Status WriteAt(int64_t position, const void* data, int64_t nbytes) override;
+
+ Result<int64_t> GetSize() override;
+
+ int file_descriptor() const;
+
+ private:
+ MemoryMappedFile();
+
+ Status WriteInternal(const void* data, int64_t nbytes);
+
+ class ARROW_NO_EXPORT MemoryMap;
+ std::shared_ptr<MemoryMap> memory_map_;
+};
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc
new file mode 100644
index 00000000000..954c0f37b2d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc
@@ -0,0 +1,469 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/io/interfaces.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <sstream>
+#include <typeinfo>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/io/concurrency.h"
+#include "arrow/io/type_fwd.h"
+#include "arrow/io/util_internal.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/future.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/thread_pool.h"
+
+namespace arrow {
+
+using internal::checked_pointer_cast;
+using internal::Executor;
+using internal::TaskHints;
+using internal::ThreadPool;
+
+namespace io {
+
+static IOContext g_default_io_context{};
+
+IOContext::IOContext(MemoryPool* pool, StopToken stop_token)
+ : IOContext(pool, internal::GetIOThreadPool(), std::move(stop_token)) {}
+
+const IOContext& default_io_context() { return g_default_io_context; }
+
+int GetIOThreadPoolCapacity() { return internal::GetIOThreadPool()->GetCapacity(); }
+
+Status SetIOThreadPoolCapacity(int threads) {
+ return internal::GetIOThreadPool()->SetCapacity(threads);
+}
+
+FileInterface::~FileInterface() = default;
+
+Status FileInterface::Abort() { return Close(); }
+
+namespace {
+
+class InputStreamBlockIterator {
+ public:
+ InputStreamBlockIterator(std::shared_ptr<InputStream> stream, int64_t block_size)
+ : stream_(std::move(stream)), block_size_(block_size) {}
+
+ Result<std::shared_ptr<Buffer>> Next() {
+ if (done_) {
+ return nullptr;
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto out, stream_->Read(block_size_));
+
+ if (out->size() == 0) {
+ done_ = true;
+ stream_.reset();
+ out.reset();
+ }
+
+ return out;
+ }
+
+ protected:
+ std::shared_ptr<InputStream> stream_;
+ int64_t block_size_;
+ bool done_ = false;
+};
+
+} // namespace
+
+const IOContext& Readable::io_context() const { return g_default_io_context; }
+
+Status InputStream::Advance(int64_t nbytes) { return Read(nbytes).status(); }
+
+Result<util::string_view> InputStream::Peek(int64_t ARROW_ARG_UNUSED(nbytes)) {
+ return Status::NotImplemented("Peek not implemented");
+}
+
+bool InputStream::supports_zero_copy() const { return false; }
+
+Result<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadata() {
+ return std::shared_ptr<const KeyValueMetadata>{};
+}
+
+// Default ReadMetadataAsync() implementation: simply issue the read on the context's
+// executor
+Future<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadataAsync(
+ const IOContext& ctx) {
+ auto self = shared_from_this();
+ return DeferNotOk(internal::SubmitIO(ctx, [self] { return self->ReadMetadata(); }));
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadataAsync() {
+ return ReadMetadataAsync(io_context());
+}
+
+Result<Iterator<std::shared_ptr<Buffer>>> MakeInputStreamIterator(
+ std::shared_ptr<InputStream> stream, int64_t block_size) {
+ if (stream->closed()) {
+ return Status::Invalid("Cannot take iterator on closed stream");
+ }
+ DCHECK_GT(block_size, 0);
+ return Iterator<std::shared_ptr<Buffer>>(InputStreamBlockIterator(stream, block_size));
+}
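+
+// Example usage (a minimal sketch, assuming "stream" is an open
+// std::shared_ptr<InputStream> and a surrounding function returning
+// arrow::Status):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto it,
+//                         MakeInputStreamIterator(stream, /*block_size=*/4096));
+//   while (true) {
+//     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> block, it.Next());
+//     if (block == nullptr) break;  // end of stream
+//     // ... process `block` ...
+//   }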
+
+struct RandomAccessFile::Impl {
+ std::mutex lock_;
+};
+
+RandomAccessFile::~RandomAccessFile() = default;
+
+RandomAccessFile::RandomAccessFile() : interface_impl_(new Impl()) {}
+
+Result<int64_t> RandomAccessFile::ReadAt(int64_t position, int64_t nbytes, void* out) {
+ std::lock_guard<std::mutex> lock(interface_impl_->lock_);
+ RETURN_NOT_OK(Seek(position));
+ return Read(nbytes, out);
+}
+
+Result<std::shared_ptr<Buffer>> RandomAccessFile::ReadAt(int64_t position,
+ int64_t nbytes) {
+ std::lock_guard<std::mutex> lock(interface_impl_->lock_);
+ RETURN_NOT_OK(Seek(position));
+ return Read(nbytes);
+}
+
+// Default ReadAsync() implementation: simply issue the read on the context's executor
+Future<std::shared_ptr<Buffer>> RandomAccessFile::ReadAsync(const IOContext& ctx,
+ int64_t position,
+ int64_t nbytes) {
+ auto self = checked_pointer_cast<RandomAccessFile>(shared_from_this());
+ return DeferNotOk(internal::SubmitIO(
+ ctx, [self, position, nbytes] { return self->ReadAt(position, nbytes); }));
+}
+
+Future<std::shared_ptr<Buffer>> RandomAccessFile::ReadAsync(int64_t position,
+ int64_t nbytes) {
+ return ReadAsync(io_context(), position, nbytes);
+}
+
+// Default WillNeed() implementation: no-op
+Status RandomAccessFile::WillNeed(const std::vector<ReadRange>& ranges) {
+ return Status::OK();
+}
+
+Status Writable::Write(util::string_view data) {
+ return Write(data.data(), static_cast<int64_t>(data.size()));
+}
+
+Status Writable::Write(const std::shared_ptr<Buffer>& data) {
+ return Write(data->data(), data->size());
+}
+
+Status Writable::Flush() { return Status::OK(); }
+
+// An InputStream that reads from a delimited range of a RandomAccessFile
+class FileSegmentReader
+ : public internal::InputStreamConcurrencyWrapper<FileSegmentReader> {
+ public:
+ FileSegmentReader(std::shared_ptr<RandomAccessFile> file, int64_t file_offset,
+ int64_t nbytes)
+ : file_(std::move(file)),
+ closed_(false),
+ position_(0),
+ file_offset_(file_offset),
+ nbytes_(nbytes) {
+ FileInterface::set_mode(FileMode::READ);
+ }
+
+ Status CheckOpen() const {
+ if (closed_) {
+ return Status::IOError("Stream is closed");
+ }
+ return Status::OK();
+ }
+
+ Status DoClose() {
+ closed_ = true;
+ return Status::OK();
+ }
+
+ Result<int64_t> DoTell() const {
+ RETURN_NOT_OK(CheckOpen());
+ return position_;
+ }
+
+ bool closed() const override { return closed_; }
+
+ Result<int64_t> DoRead(int64_t nbytes, void* out) {
+ RETURN_NOT_OK(CheckOpen());
+ int64_t bytes_to_read = std::min(nbytes, nbytes_ - position_);
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read,
+ file_->ReadAt(file_offset_ + position_, bytes_to_read, out));
+ position_ += bytes_read;
+ return bytes_read;
+ }
+
+ Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes) {
+ RETURN_NOT_OK(CheckOpen());
+ int64_t bytes_to_read = std::min(nbytes, nbytes_ - position_);
+ ARROW_ASSIGN_OR_RAISE(auto buffer,
+ file_->ReadAt(file_offset_ + position_, bytes_to_read));
+ position_ += buffer->size();
+ return buffer;
+ }
+
+ private:
+ std::shared_ptr<RandomAccessFile> file_;
+ bool closed_;
+ int64_t position_;
+ int64_t file_offset_;
+ int64_t nbytes_;
+};
+
+std::shared_ptr<InputStream> RandomAccessFile::GetStream(
+ std::shared_ptr<RandomAccessFile> file, int64_t file_offset, int64_t nbytes) {
+ return std::make_shared<FileSegmentReader>(std::move(file), file_offset, nbytes);
+}
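+
+// Example usage (a minimal sketch, assuming "file" is an open
+// std::shared_ptr<RandomAccessFile> holding at least 1280 bytes):
+//
+//   // An InputStream over bytes [1024, 1280) of `file`; several such streams
+//   // can be used independently since they only issue ReadAt() calls
+//   auto segment = RandomAccessFile::GetStream(file, /*file_offset=*/1024,
+//                                              /*nbytes=*/256);
+//   ARROW_ASSIGN_OR_RAISE(auto buf, segment->Read(256));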
+
+// -----------------------------------------------------------------------
+// Implement utilities exported from concurrency.h and util_internal.h
+
+namespace internal {
+
+void CloseFromDestructor(FileInterface* file) {
+ Status st = file->Close();
+ if (!st.ok()) {
+ auto file_type = typeid(*file).name();
+#ifdef NDEBUG
+ ARROW_LOG(ERROR) << "Error ignored when destroying file of type " << file_type << ": "
+ << st;
+#else
+ std::stringstream ss;
+ ss << "When destroying file of type " << file_type << ": " << st.message();
+ ARROW_LOG(FATAL) << st.WithMessage(ss.str());
+#endif
+ }
+}
+
+Result<int64_t> ValidateReadRange(int64_t offset, int64_t size, int64_t file_size) {
+ if (offset < 0 || size < 0) {
+ return Status::Invalid("Invalid read (offset = ", offset, ", size = ", size, ")");
+ }
+ if (offset > file_size) {
+ return Status::IOError("Read out of bounds (offset = ", offset, ", size = ", size,
+ ") in file of size ", file_size);
+ }
+ return std::min(size, file_size - offset);
+}
+
+Status ValidateWriteRange(int64_t offset, int64_t size, int64_t file_size) {
+ if (offset < 0 || size < 0) {
+ return Status::Invalid("Invalid write (offset = ", offset, ", size = ", size, ")");
+ }
+ if (offset + size > file_size) {
+ return Status::IOError("Write out of bounds (offset = ", offset, ", size = ", size,
+ ") in file of size ", file_size);
+ }
+ return Status::OK();
+}
+
+Status ValidateRange(int64_t offset, int64_t size) {
+ if (offset < 0 || size < 0) {
+ return Status::Invalid("Invalid IO range (offset = ", offset, ", size = ", size, ")");
+ }
+ return Status::OK();
+}
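+
+// For example, with a 100-byte file, ValidateReadRange(90, 50, 100) succeeds
+// and returns 10 (the read is clamped at EOF), while ValidateReadRange(120, 1,
+// 100) is an IOError because the offset lies past the end of the file. By
+// contrast, ValidateWriteRange(90, 50, 100) is an IOError: a write must fit
+// entirely within the file.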
+
+#ifndef NDEBUG
+
+// Debug mode concurrency checking
+
+struct SharedExclusiveChecker::Impl {
+ std::mutex mutex;
+ int64_t n_shared = 0;
+ int64_t n_exclusive = 0;
+};
+
+SharedExclusiveChecker::SharedExclusiveChecker() : impl_(new Impl) {}
+
+void SharedExclusiveChecker::LockShared() {
+ std::lock_guard<std::mutex> lock(impl_->mutex);
+ // XXX The error message doesn't really describe the actual situation
+ // (e.g. ReadAt() called while a Read() call is in progress)
+ ARROW_CHECK_EQ(impl_->n_exclusive, 0)
+ << "Attempted to take shared lock while locked exclusive";
+ ++impl_->n_shared;
+}
+
+void SharedExclusiveChecker::UnlockShared() {
+ std::lock_guard<std::mutex> lock(impl_->mutex);
+ ARROW_CHECK_GT(impl_->n_shared, 0);
+ --impl_->n_shared;
+}
+
+void SharedExclusiveChecker::LockExclusive() {
+ std::lock_guard<std::mutex> lock(impl_->mutex);
+ ARROW_CHECK_EQ(impl_->n_shared, 0)
+ << "Attempted to take exclusive lock while locked shared";
+ ARROW_CHECK_EQ(impl_->n_exclusive, 0)
+ << "Attempted to take exclusive lock while already locked exclusive";
+ ++impl_->n_exclusive;
+}
+
+void SharedExclusiveChecker::UnlockExclusive() {
+ std::lock_guard<std::mutex> lock(impl_->mutex);
+ ARROW_CHECK_EQ(impl_->n_exclusive, 1);
+ --impl_->n_exclusive;
+}
+
+#else
+
+// Release mode no-op concurrency checking
+
+struct SharedExclusiveChecker::Impl {};
+
+SharedExclusiveChecker::SharedExclusiveChecker() {}
+
+void SharedExclusiveChecker::LockShared() {}
+void SharedExclusiveChecker::UnlockShared() {}
+void SharedExclusiveChecker::LockExclusive() {}
+void SharedExclusiveChecker::UnlockExclusive() {}
+
+#endif
+
+static std::shared_ptr<ThreadPool> MakeIOThreadPool() {
+ auto maybe_pool = ThreadPool::MakeEternal(/*threads=*/8);
+ if (!maybe_pool.ok()) {
+ maybe_pool.status().Abort("Failed to create global IO thread pool");
+ }
+ return *std::move(maybe_pool);
+}
+
+ThreadPool* GetIOThreadPool() {
+ static std::shared_ptr<ThreadPool> pool = MakeIOThreadPool();
+ return pool.get();
+}
+
+// -----------------------------------------------------------------------
+// CoalesceReadRanges
+
+namespace {
+
+struct ReadRangeCombiner {
+ std::vector<ReadRange> Coalesce(std::vector<ReadRange> ranges) {
+ if (ranges.empty()) {
+ return ranges;
+ }
+
+ // Remove zero-sized ranges
+ auto end = std::remove_if(ranges.begin(), ranges.end(),
+ [](const ReadRange& range) { return range.length == 0; });
+ // Sort in position order
+ std::sort(ranges.begin(), end,
+ [](const ReadRange& a, const ReadRange& b) { return a.offset < b.offset; });
+ // Remove ranges that overlap 100%
+ end = std::unique(ranges.begin(), end,
+ [](const ReadRange& left, const ReadRange& right) {
+ return right.offset >= left.offset &&
+ right.offset + right.length <= left.offset + left.length;
+ });
+ ranges.resize(end - ranges.begin());
+
+ // Skip further processing if ranges is empty after removing zero-sized ranges.
+ if (ranges.empty()) {
+ return ranges;
+ }
+
+#ifndef NDEBUG
+ for (size_t i = 0; i < ranges.size() - 1; ++i) {
+ const auto& left = ranges[i];
+ const auto& right = ranges[i + 1];
+ DCHECK_LE(left.offset, right.offset);
+ DCHECK_LE(left.offset + left.length, right.offset) << "Some read ranges overlap";
+ }
+#endif
+
+ std::vector<ReadRange> coalesced;
+
+ auto itr = ranges.begin();
+ // ranges is non-empty here (the empty case returned above).
+ DCHECK_LE(itr, ranges.end());
+ // Start of the current coalesced range and end (exclusive) of the previous
+ // range. Both are initialized with the start of the first range as a
+ // placeholder value.
+ int64_t coalesced_start = itr->offset;
+ int64_t prev_range_end = coalesced_start;
+
+ for (; itr < ranges.end(); ++itr) {
+ const int64_t current_range_start = itr->offset;
+ const int64_t current_range_end = current_range_start + itr->length;
+ // We don't expect to have 0-sized ranges.
+ DCHECK_LT(current_range_start, current_range_end);
+
+ // At this point, the coalesced range is [coalesced_start, prev_range_end).
+ // Stop coalescing if:
+ // - coalesced range is too large, or
+ // - distance (hole/gap) between consecutive ranges is too large.
+ if (current_range_end - coalesced_start > range_size_limit_ ||
+ current_range_start - prev_range_end > hole_size_limit_) {
+ DCHECK_LE(coalesced_start, prev_range_end);
+ // Append the coalesced range only if coalesced range size > 0.
+ if (prev_range_end > coalesced_start) {
+ coalesced.push_back({coalesced_start, prev_range_end - coalesced_start});
+ }
+ // Start a new coalesced range.
+ coalesced_start = current_range_start;
+ }
+
+ // Update the prev_range_end with the current range.
+ prev_range_end = current_range_end;
+ }
+ // Append the coalesced range only if coalesced range size > 0.
+ if (prev_range_end > coalesced_start) {
+ coalesced.push_back({coalesced_start, prev_range_end - coalesced_start});
+ }
+
+ DCHECK_EQ(coalesced.front().offset, ranges.front().offset);
+ DCHECK_EQ(coalesced.back().offset + coalesced.back().length,
+ ranges.back().offset + ranges.back().length);
+ return coalesced;
+ }
+
+ const int64_t hole_size_limit_;
+ const int64_t range_size_limit_;
+};
+
+} // namespace
+
+std::vector<ReadRange> CoalesceReadRanges(std::vector<ReadRange> ranges,
+ int64_t hole_size_limit,
+ int64_t range_size_limit) {
+ DCHECK_GT(range_size_limit, hole_size_limit);
+
+ ReadRangeCombiner combiner{hole_size_limit, range_size_limit};
+ return combiner.Coalesce(std::move(ranges));
+}
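+
+// For example, with hole_size_limit = 32 and range_size_limit = 1024, the
+// ranges {0, 100}, {110, 100}, {500, 100} coalesce to {0, 210} and
+// {500, 100}: the 10-byte hole between the first two ranges is within the
+// limit and is simply read over, while the 290-byte gap before offset 500
+// starts a new coalesced range.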
+
+} // namespace internal
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h
new file mode 100644
index 00000000000..e524afa99a3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h
@@ -0,0 +1,340 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/io/type_fwd.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+struct ReadRange {
+ int64_t offset;
+ int64_t length;
+
+ friend bool operator==(const ReadRange& left, const ReadRange& right) {
+ return (left.offset == right.offset && left.length == right.length);
+ }
+ friend bool operator!=(const ReadRange& left, const ReadRange& right) {
+ return !(left == right);
+ }
+
+ bool Contains(const ReadRange& other) const {
+ return (offset <= other.offset && offset + length >= other.offset + other.length);
+ }
+};
+
+/// EXPERIMENTAL: options provider for IO tasks
+///
+/// Includes an Executor (which will be used to execute asynchronous reads),
+/// a MemoryPool (which will be used to allocate buffers when zero copy reads
+/// are not possible), and an external id (in case the executor receives tasks from
+/// multiple sources and must distinguish tasks associated with this IOContext).
+struct ARROW_EXPORT IOContext {
+ // No specified executor: will use a global IO thread pool
+ IOContext() : IOContext(default_memory_pool(), StopToken::Unstoppable()) {}
+
+ explicit IOContext(StopToken stop_token)
+ : IOContext(default_memory_pool(), std::move(stop_token)) {}
+
+ explicit IOContext(MemoryPool* pool, StopToken stop_token = StopToken::Unstoppable());
+
+ explicit IOContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
+ StopToken stop_token = StopToken::Unstoppable(),
+ int64_t external_id = -1)
+ : pool_(pool),
+ executor_(executor),
+ external_id_(external_id),
+ stop_token_(std::move(stop_token)) {}
+
+ explicit IOContext(::arrow::internal::Executor* executor,
+ StopToken stop_token = StopToken::Unstoppable(),
+ int64_t external_id = -1)
+ : pool_(default_memory_pool()),
+ executor_(executor),
+ external_id_(external_id),
+ stop_token_(std::move(stop_token)) {}
+
+ MemoryPool* pool() const { return pool_; }
+
+ ::arrow::internal::Executor* executor() const { return executor_; }
+
+ // An application-specific ID, forwarded to executor task submissions
+ int64_t external_id() const { return external_id_; }
+
+ StopToken stop_token() const { return stop_token_; }
+
+ private:
+ MemoryPool* pool_;
+ ::arrow::internal::Executor* executor_;
+ int64_t external_id_;
+ StopToken stop_token_;
+};
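+
+// Example (a minimal sketch): an IOContext carrying a cancellation token, so
+// that IO tasks submitted through it can observe a stop request:
+//
+//   StopSource stop_source;
+//   IOContext ctx(default_memory_pool(), stop_source.token());
+//   // e.g. pass `ctx` to RandomAccessFile::ReadAsync(ctx, position, nbytes);
+//   // a later stop_source.RequestStop() requests cancellation of pending work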
+
+struct ARROW_DEPRECATED("renamed to IOContext in 4.0.0") AsyncContext : public IOContext {
+ using IOContext::IOContext;
+};
+
+class ARROW_EXPORT FileInterface {
+ public:
+ virtual ~FileInterface() = 0;
+
+ /// \brief Close the stream cleanly
+ ///
+ /// For writable streams, this will attempt to flush any pending data
+ /// before releasing the underlying resource.
+ ///
+ /// After Close() is called, closed() returns true and the stream is not
+ /// available for further operations.
+ virtual Status Close() = 0;
+
+ /// \brief Close the stream abruptly
+ ///
+ /// This method does not guarantee that any pending data is flushed.
+ /// It merely releases any underlying resource used by the stream for
+ /// its operation.
+ ///
+ /// After Abort() is called, closed() returns true and the stream is not
+ /// available for further operations.
+ virtual Status Abort();
+
+ /// \brief Return the position in this stream
+ virtual Result<int64_t> Tell() const = 0;
+
+ /// \brief Return whether the stream is closed
+ virtual bool closed() const = 0;
+
+ FileMode::type mode() const { return mode_; }
+
+ protected:
+ FileInterface() : mode_(FileMode::READ) {}
+ FileMode::type mode_;
+ void set_mode(FileMode::type mode) { mode_ = mode; }
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(FileInterface);
+};
+
+class ARROW_EXPORT Seekable {
+ public:
+ virtual ~Seekable() = default;
+ virtual Status Seek(int64_t position) = 0;
+};
+
+class ARROW_EXPORT Writable {
+ public:
+ virtual ~Writable() = default;
+
+ /// \brief Write the given data to the stream
+ ///
+ /// This method always processes the bytes in full. Depending on the
+ /// semantics of the stream, the data may be written out immediately,
+ /// held in a buffer, or written asynchronously. In the case where
+ /// the stream buffers the data, it will be copied. To avoid potentially
+ /// large copies, use the Write variant that takes an owned Buffer.
+ virtual Status Write(const void* data, int64_t nbytes) = 0;
+
+ /// \brief Write the given data to the stream
+ ///
+ /// Since the Buffer owns its memory, this method can avoid a copy if
+ /// buffering is required. See Write(const void*, int64_t) for details.
+ virtual Status Write(const std::shared_ptr<Buffer>& data);
+
+ /// \brief Flush buffered bytes, if any
+ virtual Status Flush();
+
+ Status Write(util::string_view data);
+};
+
+class ARROW_EXPORT Readable {
+ public:
+ virtual ~Readable() = default;
+
+ /// \brief Read data from current file position.
+ ///
+ /// Read at most `nbytes` from the current file position into `out`.
+ /// The number of bytes read is returned.
+ virtual Result<int64_t> Read(int64_t nbytes, void* out) = 0;
+
+ /// \brief Read data from current file position.
+ ///
+ /// Read at most `nbytes` from the current file position. Fewer bytes may
+ /// be read if EOF is reached. This method updates the current file position.
+ ///
+ /// In some cases (e.g. a memory-mapped file), this method may avoid a
+ /// memory copy.
+ virtual Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) = 0;
+
+ /// EXPERIMENTAL: The IOContext associated with this file.
+ ///
+ /// By default, this is the same as default_io_context(), but it may be
+ /// overridden by subclasses.
+ virtual const IOContext& io_context() const;
+};
+
+class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writable {
+ protected:
+ OutputStream() = default;
+};
+
+class ARROW_EXPORT InputStream : virtual public FileInterface,
+ virtual public Readable,
+ public std::enable_shared_from_this<InputStream> {
+ public:
+ /// \brief Advance or skip the stream by the indicated number of bytes
+ /// \param[in] nbytes the number of bytes to move forward
+ /// \return Status
+ Status Advance(int64_t nbytes);
+
+ /// \brief Return zero-copy string_view to upcoming bytes.
+ ///
+ /// Do not modify the stream position. The view becomes invalid after
+ /// any operation on the stream. May trigger buffering if the requested
+ /// size is larger than the number of buffered bytes.
+ ///
+ /// May return NotImplemented on streams that don't support it.
+ ///
+ /// \param[in] nbytes the maximum number of bytes to peek at
+ virtual Result<util::string_view> Peek(int64_t nbytes);
+
+ /// \brief Return true if InputStream is capable of zero copy Buffer reads
+ ///
+ /// Zero copy reads imply the use of Buffer-returning Read() overloads.
+ virtual bool supports_zero_copy() const;
+
+ /// \brief Read and return stream metadata
+ ///
+ /// If the stream implementation doesn't support metadata, empty metadata
+ /// is returned. Note that it is allowed to return a null pointer rather
+ /// than an allocated empty metadata.
+ virtual Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
+
+ /// \brief Read stream metadata asynchronously
+ virtual Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context);
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync();
+
+ protected:
+ InputStream() = default;
+};
+
+class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
+ public:
+ /// Necessary because we hold a std::unique_ptr
+ ~RandomAccessFile() override;
+
+ /// \brief Create an isolated InputStream that reads a segment of a
+ /// RandomAccessFile. Multiple such streams can be created and used
+ /// independently without interference
+ /// \param[in] file a file instance
+ /// \param[in] file_offset the starting position in the file
+ /// \param[in] nbytes the extent of bytes to read. The file should have
+ /// sufficient bytes available
+ static std::shared_ptr<InputStream> GetStream(std::shared_ptr<RandomAccessFile> file,
+ int64_t file_offset, int64_t nbytes);
+
+ /// \brief Return the total file size in bytes.
+ ///
+ /// This method does not read or move the current file position, so is safe
+ /// to call concurrently with e.g. ReadAt().
+ virtual Result<int64_t> GetSize() = 0;
+
+ /// \brief Read data from given file position.
+ ///
+ /// At most `nbytes` bytes are read. The number of bytes read is returned
+ /// (it can be less than `nbytes` if EOF is reached).
+ ///
+ /// This method can be safely called from multiple threads concurrently.
+ /// It is unspecified whether this method updates the file position or not.
+ ///
+ /// The default RandomAccessFile-provided implementation uses Seek() and Read(),
+ /// but subclasses may override it with a more efficient implementation
+ /// that doesn't depend on implicit file positioning.
+ ///
+ /// \param[in] position Where to read bytes from
+ /// \param[in] nbytes The number of bytes to read
+ /// \param[out] out The buffer to read bytes into
+ /// \return The number of bytes read, or an error
+ virtual Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out);
+
+ /// \brief Read data from given file position.
+ ///
+ /// At most `nbytes` bytes are read, but it can be less if EOF is reached.
+ ///
+ /// \param[in] position Where to read bytes from
+ /// \param[in] nbytes The number of bytes to read
+ /// \return A buffer containing the bytes read, or an error
+ virtual Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes);
+
+ /// EXPERIMENTAL: Read data asynchronously.
+ virtual Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+ int64_t nbytes);
+
+ /// EXPERIMENTAL: Read data asynchronously, using the file's IOContext.
+ Future<std::shared_ptr<Buffer>> ReadAsync(int64_t position, int64_t nbytes);
+
+ /// EXPERIMENTAL: Inform that the given ranges may be read soon.
+ ///
+ /// Some implementations might arrange to prefetch some of the data.
+ /// However, no guarantee is made and the default implementation does nothing.
+ /// For robust prefetching, use ReadAt() or ReadAsync().
+ virtual Status WillNeed(const std::vector<ReadRange>& ranges);
+
+ protected:
+ RandomAccessFile();
+
+ private:
+ struct ARROW_NO_EXPORT Impl;
+ std::unique_ptr<Impl> interface_impl_;
+};
+
+class ARROW_EXPORT WritableFile : public OutputStream, public Seekable {
+ public:
+ virtual Status WriteAt(int64_t position, const void* data, int64_t nbytes) = 0;
+
+ protected:
+ WritableFile() = default;
+};
+
+class ARROW_EXPORT ReadWriteFileInterface : public RandomAccessFile, public WritableFile {
+ protected:
+ ReadWriteFileInterface() { RandomAccessFile::set_mode(FileMode::READWRITE); }
+};
+
+/// \brief Return an iterator on an input stream
+///
+/// The iterator yields a fixed-size block on each Next() call, except the
+/// last block in the stream which may be smaller.
+/// Once the end of stream is reached, Next() returns nullptr
+/// (unlike InputStream::Read() which returns an empty buffer).
+ARROW_EXPORT
+Result<Iterator<std::shared_ptr<Buffer>>> MakeInputStreamIterator(
+ std::shared_ptr<InputStream> stream, int64_t block_size);
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc
new file mode 100644
index 00000000000..6495242e63b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc
@@ -0,0 +1,388 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/io/memory.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <mutex>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/io/util_internal.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/future.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/memory.h"
+
+namespace arrow {
+namespace io {
+
+// ----------------------------------------------------------------------
+// OutputStream that writes to resizable buffer
+
+static constexpr int64_t kBufferMinimumSize = 256;
+
+BufferOutputStream::BufferOutputStream()
+ : is_open_(false), capacity_(0), position_(0), mutable_data_(nullptr) {}
+
+BufferOutputStream::BufferOutputStream(const std::shared_ptr<ResizableBuffer>& buffer)
+ : buffer_(buffer),
+ is_open_(true),
+ capacity_(buffer->size()),
+ position_(0),
+ mutable_data_(buffer->mutable_data()) {}
+
+Result<std::shared_ptr<BufferOutputStream>> BufferOutputStream::Create(
+ int64_t initial_capacity, MemoryPool* pool) {
+ // ctor is private, so cannot use make_shared
+ auto ptr = std::shared_ptr<BufferOutputStream>(new BufferOutputStream);
+ RETURN_NOT_OK(ptr->Reset(initial_capacity, pool));
+ return ptr;
+}
+
+Status BufferOutputStream::Reset(int64_t initial_capacity, MemoryPool* pool) {
+ ARROW_ASSIGN_OR_RAISE(buffer_, AllocateResizableBuffer(initial_capacity, pool));
+ is_open_ = true;
+ capacity_ = initial_capacity;
+ position_ = 0;
+ mutable_data_ = buffer_->mutable_data();
+ return Status::OK();
+}
+
+BufferOutputStream::~BufferOutputStream() {
+ if (buffer_) {
+ internal::CloseFromDestructor(this);
+ }
+}
+
+Status BufferOutputStream::Close() {
+ if (is_open_) {
+ is_open_ = false;
+ if (position_ < capacity_) {
+ RETURN_NOT_OK(buffer_->Resize(position_, false));
+ }
+ }
+ return Status::OK();
+}
+
+bool BufferOutputStream::closed() const { return !is_open_; }
+
+Result<std::shared_ptr<Buffer>> BufferOutputStream::Finish() {
+ RETURN_NOT_OK(Close());
+ buffer_->ZeroPadding();
+ is_open_ = false;
+ return std::move(buffer_);
+}
+
+Result<int64_t> BufferOutputStream::Tell() const { return position_; }
+
+Status BufferOutputStream::Write(const void* data, int64_t nbytes) {
+ if (ARROW_PREDICT_FALSE(!is_open_)) {
+ return Status::IOError("OutputStream is closed");
+ }
+ DCHECK(buffer_);
+ if (ARROW_PREDICT_TRUE(nbytes > 0)) {
+ if (ARROW_PREDICT_FALSE(position_ + nbytes >= capacity_)) {
+ RETURN_NOT_OK(Reserve(nbytes));
+ }
+ memcpy(mutable_data_ + position_, data, nbytes);
+ position_ += nbytes;
+ }
+ return Status::OK();
+}
+
+Status BufferOutputStream::Reserve(int64_t nbytes) {
+  // Always overallocate by doubling. This appears to be a better growth
+  // strategy, at least for memory_benchmark.cc, perhaps because it matches
+  // the allocator's allocation buckets more closely, or hits a sweet spot
+  // in jemalloc.
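+  // For example, with capacity_ == 256 and position_ + nbytes == 1500, the
+  // loop below doubles 256 -> 512 -> 1024 -> 2048 and resizes once to 2048.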
+ int64_t new_capacity = std::max(kBufferMinimumSize, capacity_);
+ while (new_capacity < position_ + nbytes) {
+ new_capacity = new_capacity * 2;
+ }
+ if (new_capacity > capacity_) {
+ RETURN_NOT_OK(buffer_->Resize(new_capacity));
+ capacity_ = new_capacity;
+ mutable_data_ = buffer_->mutable_data();
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// OutputStream that doesn't write anything
+
+Status MockOutputStream::Close() {
+ is_open_ = false;
+ return Status::OK();
+}
+
+bool MockOutputStream::closed() const { return !is_open_; }
+
+Result<int64_t> MockOutputStream::Tell() const { return extent_bytes_written_; }
+
+Status MockOutputStream::Write(const void* data, int64_t nbytes) {
+ extent_bytes_written_ += nbytes;
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// In-memory buffer writer
+
+static constexpr int kMemcopyDefaultNumThreads = 1;
+static constexpr int64_t kMemcopyDefaultBlocksize = 64;
+static constexpr int64_t kMemcopyDefaultThreshold = 1024 * 1024;
+
+class FixedSizeBufferWriter::FixedSizeBufferWriterImpl {
+ public:
+  /// Input buffer must be mutable, will abort if not
+ explicit FixedSizeBufferWriterImpl(const std::shared_ptr<Buffer>& buffer)
+ : is_open_(true),
+ memcopy_num_threads_(kMemcopyDefaultNumThreads),
+ memcopy_blocksize_(kMemcopyDefaultBlocksize),
+ memcopy_threshold_(kMemcopyDefaultThreshold) {
+ buffer_ = buffer;
+ ARROW_CHECK(buffer->is_mutable()) << "Must pass mutable buffer";
+ mutable_data_ = buffer->mutable_data();
+ size_ = buffer->size();
+ position_ = 0;
+ }
+
+ Status Close() {
+ is_open_ = false;
+ return Status::OK();
+ }
+
+ bool closed() const { return !is_open_; }
+
+ Status Seek(int64_t position) {
+ if (position < 0 || position > size_) {
+ return Status::IOError("Seek out of bounds");
+ }
+ position_ = position;
+ return Status::OK();
+ }
+
+ Result<int64_t> Tell() { return position_; }
+
+ Status Write(const void* data, int64_t nbytes) {
+ RETURN_NOT_OK(internal::ValidateWriteRange(position_, nbytes, size_));
+ if (nbytes > memcopy_threshold_ && memcopy_num_threads_ > 1) {
+ ::arrow::internal::parallel_memcopy(mutable_data_ + position_,
+ reinterpret_cast<const uint8_t*>(data), nbytes,
+ memcopy_blocksize_, memcopy_num_threads_);
+ } else {
+ memcpy(mutable_data_ + position_, data, nbytes);
+ }
+ position_ += nbytes;
+ return Status::OK();
+ }
+
+ Status WriteAt(int64_t position, const void* data, int64_t nbytes) {
+ std::lock_guard<std::mutex> guard(lock_);
+ RETURN_NOT_OK(internal::ValidateWriteRange(position, nbytes, size_));
+ RETURN_NOT_OK(Seek(position));
+ return Write(data, nbytes);
+ }
+
+ void set_memcopy_threads(int num_threads) { memcopy_num_threads_ = num_threads; }
+
+ void set_memcopy_blocksize(int64_t blocksize) { memcopy_blocksize_ = blocksize; }
+
+ void set_memcopy_threshold(int64_t threshold) { memcopy_threshold_ = threshold; }
+
+ private:
+ std::mutex lock_;
+ std::shared_ptr<Buffer> buffer_;
+ uint8_t* mutable_data_;
+ int64_t size_;
+ int64_t position_;
+ bool is_open_;
+
+ int memcopy_num_threads_;
+ int64_t memcopy_blocksize_;
+ int64_t memcopy_threshold_;
+};
+
+FixedSizeBufferWriter::FixedSizeBufferWriter(const std::shared_ptr<Buffer>& buffer)
+ : impl_(new FixedSizeBufferWriterImpl(buffer)) {}
+
+FixedSizeBufferWriter::~FixedSizeBufferWriter() = default;
+
+Status FixedSizeBufferWriter::Close() { return impl_->Close(); }
+
+bool FixedSizeBufferWriter::closed() const { return impl_->closed(); }
+
+Status FixedSizeBufferWriter::Seek(int64_t position) { return impl_->Seek(position); }
+
+Result<int64_t> FixedSizeBufferWriter::Tell() const { return impl_->Tell(); }
+
+Status FixedSizeBufferWriter::Write(const void* data, int64_t nbytes) {
+ return impl_->Write(data, nbytes);
+}
+
+Status FixedSizeBufferWriter::WriteAt(int64_t position, const void* data,
+ int64_t nbytes) {
+ return impl_->WriteAt(position, data, nbytes);
+}
+
+void FixedSizeBufferWriter::set_memcopy_threads(int num_threads) {
+ impl_->set_memcopy_threads(num_threads);
+}
+
+void FixedSizeBufferWriter::set_memcopy_blocksize(int64_t blocksize) {
+ impl_->set_memcopy_blocksize(blocksize);
+}
+
+void FixedSizeBufferWriter::set_memcopy_threshold(int64_t threshold) {
+ impl_->set_memcopy_threshold(threshold);
+}
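+
+// A usage sketch, assuming `buffer` is a mutable std::shared_ptr<Buffer> and
+// `data`/`nbytes` describe a payload: writes larger than the threshold are
+// copied with several threads in parallel:
+//
+//   FixedSizeBufferWriter writer(buffer);
+//   writer.set_memcopy_threads(4);
+//   writer.set_memcopy_threshold(1024 * 1024);  // parallel copy above 1 MiB
+//   ARROW_RETURN_NOT_OK(writer.Write(data, nbytes));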
+
+// ----------------------------------------------------------------------
+// In-memory buffer reader
+
+BufferReader::BufferReader(std::shared_ptr<Buffer> buffer)
+ : buffer_(std::move(buffer)),
+ data_(buffer_ ? buffer_->data() : reinterpret_cast<const uint8_t*>("")),
+ size_(buffer_ ? buffer_->size() : 0),
+ position_(0),
+ is_open_(true) {}
+
+BufferReader::BufferReader(const uint8_t* data, int64_t size)
+ : buffer_(nullptr), data_(data), size_(size), position_(0), is_open_(true) {}
+
+BufferReader::BufferReader(const Buffer& buffer)
+ : BufferReader(buffer.data(), buffer.size()) {}
+
+BufferReader::BufferReader(const util::string_view& data)
+ : BufferReader(reinterpret_cast<const uint8_t*>(data.data()),
+ static_cast<int64_t>(data.size())) {}
+
+Status BufferReader::DoClose() {
+ is_open_ = false;
+ return Status::OK();
+}
+
+bool BufferReader::closed() const { return !is_open_; }
+
+Result<int64_t> BufferReader::DoTell() const {
+ RETURN_NOT_OK(CheckClosed());
+ return position_;
+}
+
+Result<util::string_view> BufferReader::DoPeek(int64_t nbytes) {
+ RETURN_NOT_OK(CheckClosed());
+
+ const int64_t bytes_available = std::min(nbytes, size_ - position_);
+ return util::string_view(reinterpret_cast<const char*>(data_) + position_,
+ static_cast<size_t>(bytes_available));
+}
+
+bool BufferReader::supports_zero_copy() const { return true; }
+
+Status BufferReader::WillNeed(const std::vector<ReadRange>& ranges) {
+ using ::arrow::internal::MemoryRegion;
+
+ RETURN_NOT_OK(CheckClosed());
+
+ std::vector<MemoryRegion> regions(ranges.size());
+ for (size_t i = 0; i < ranges.size(); ++i) {
+ const auto& range = ranges[i];
+ ARROW_ASSIGN_OR_RAISE(auto size,
+ internal::ValidateReadRange(range.offset, range.length, size_));
+ regions[i] = {const_cast<uint8_t*>(data_ + range.offset), static_cast<size_t>(size)};
+ }
+ const auto st = ::arrow::internal::MemoryAdviseWillNeed(regions);
+ if (st.IsIOError()) {
+ // Ignore any system-level errors, in case the memory area isn't madvise()-able
+ return Status::OK();
+ }
+ return st;
+}
+
+Future<std::shared_ptr<Buffer>> BufferReader::ReadAsync(const IOContext&,
+ int64_t position,
+ int64_t nbytes) {
+ return Future<std::shared_ptr<Buffer>>::MakeFinished(DoReadAt(position, nbytes));
+}
+
+Result<int64_t> BufferReader::DoReadAt(int64_t position, int64_t nbytes, void* buffer) {
+ RETURN_NOT_OK(CheckClosed());
+
+ ARROW_ASSIGN_OR_RAISE(nbytes, internal::ValidateReadRange(position, nbytes, size_));
+ DCHECK_GE(nbytes, 0);
+ if (nbytes) {
+ memcpy(buffer, data_ + position, nbytes);
+ }
+ return nbytes;
+}
+
+Result<std::shared_ptr<Buffer>> BufferReader::DoReadAt(int64_t position, int64_t nbytes) {
+ RETURN_NOT_OK(CheckClosed());
+
+ ARROW_ASSIGN_OR_RAISE(nbytes, internal::ValidateReadRange(position, nbytes, size_));
+ DCHECK_GE(nbytes, 0);
+
+ // Arrange for data to be paged in
+ // RETURN_NOT_OK(::arrow::internal::MemoryAdviseWillNeed(
+ // {{const_cast<uint8_t*>(data_ + position), static_cast<size_t>(nbytes)}}));
+
+ if (nbytes > 0 && buffer_ != nullptr) {
+ return SliceBuffer(buffer_, position, nbytes);
+ } else {
+ return std::make_shared<Buffer>(data_ + position, nbytes);
+ }
+}
+
+Result<int64_t> BufferReader::DoRead(int64_t nbytes, void* out) {
+ RETURN_NOT_OK(CheckClosed());
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, DoReadAt(position_, nbytes, out));
+ position_ += bytes_read;
+ return bytes_read;
+}
+
+Result<std::shared_ptr<Buffer>> BufferReader::DoRead(int64_t nbytes) {
+ RETURN_NOT_OK(CheckClosed());
+ ARROW_ASSIGN_OR_RAISE(auto buffer, DoReadAt(position_, nbytes));
+ position_ += buffer->size();
+ return buffer;
+}
+
+Result<int64_t> BufferReader::DoGetSize() {
+ RETURN_NOT_OK(CheckClosed());
+ return size_;
+}
+
+Status BufferReader::DoSeek(int64_t position) {
+ RETURN_NOT_OK(CheckClosed());
+
+ if (position < 0 || position > size_) {
+ return Status::IOError("Seek out of bounds");
+ }
+
+ position_ = position;
+ return Status::OK();
+}
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h
new file mode 100644
index 00000000000..8213439ef74
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h
@@ -0,0 +1,197 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Public API for different memory sharing / IO mechanisms
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/io/concurrency.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Status;
+
+namespace io {
+
+/// \brief An output stream that writes to a resizable buffer
+class ARROW_EXPORT BufferOutputStream : public OutputStream {
+ public:
+ explicit BufferOutputStream(const std::shared_ptr<ResizableBuffer>& buffer);
+
+ /// \brief Create in-memory output stream with indicated capacity using a
+ /// memory pool
+ /// \param[in] initial_capacity the initial allocated internal capacity of
+ /// the OutputStream
+ /// \param[in,out] pool a MemoryPool to use for allocations
+ /// \return the created stream
+ static Result<std::shared_ptr<BufferOutputStream>> Create(
+ int64_t initial_capacity = 4096, MemoryPool* pool = default_memory_pool());
+
+ ~BufferOutputStream() override;
+
+ // Implement the OutputStream interface
+
+ /// Close the stream, preserving the buffer (retrieve it with Finish()).
+ Status Close() override;
+ bool closed() const override;
+ Result<int64_t> Tell() const override;
+ Status Write(const void* data, int64_t nbytes) override;
+
+ /// \cond FALSE
+ using OutputStream::Write;
+ /// \endcond
+
+ /// Close the stream and return the buffer
+ Result<std::shared_ptr<Buffer>> Finish();
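+
+  // A minimal lifecycle sketch (error handling via the usual Result macros):
+  //
+  //   ARROW_ASSIGN_OR_RAISE(auto sink, BufferOutputStream::Create(4096));
+  //   ARROW_RETURN_NOT_OK(sink->Write("abc", 3));
+  //   ARROW_ASSIGN_OR_RAISE(auto buf, sink->Finish());  // buf->size() == 3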
+
+ /// \brief Initialize state of OutputStream with newly allocated memory and
+ /// set position to 0
+ /// \param[in] initial_capacity the starting allocated capacity
+ /// \param[in,out] pool the memory pool to use for allocations
+ /// \return Status
+ Status Reset(int64_t initial_capacity = 1024, MemoryPool* pool = default_memory_pool());
+
+ int64_t capacity() const { return capacity_; }
+
+ private:
+ BufferOutputStream();
+
+ // Ensures there is sufficient space available to write nbytes
+ Status Reserve(int64_t nbytes);
+
+ std::shared_ptr<ResizableBuffer> buffer_;
+ bool is_open_;
+ int64_t capacity_;
+ int64_t position_;
+ uint8_t* mutable_data_;
+};
+
+/// \brief A helper class to track the size of allocations
+///
+/// Writes to this stream do not copy or retain any data; they simply advance
+/// a size counter that can later be used to know exactly what size needs to
+/// be allocated for actual writing.
+class ARROW_EXPORT MockOutputStream : public OutputStream {
+ public:
+ MockOutputStream() : extent_bytes_written_(0), is_open_(true) {}
+
+ // Implement the OutputStream interface
+ Status Close() override;
+ bool closed() const override;
+ Result<int64_t> Tell() const override;
+ Status Write(const void* data, int64_t nbytes) override;
+ /// \cond FALSE
+ using Writable::Write;
+ /// \endcond
+
+ int64_t GetExtentBytesWritten() const { return extent_bytes_written_; }
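+
+  // Sketch: size a real allocation by first "writing" to the mock
+  // (WriteData is a hypothetical serialization routine):
+  //
+  //   MockOutputStream mock;
+  //   ARROW_RETURN_NOT_OK(WriteData(&mock));
+  //   ARROW_ASSIGN_OR_RAISE(
+  //       auto out, BufferOutputStream::Create(mock.GetExtentBytesWritten()));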
+
+ private:
+ int64_t extent_bytes_written_;
+ bool is_open_;
+};
+
+/// \brief An output stream that writes into a fixed-size mutable buffer
+class ARROW_EXPORT FixedSizeBufferWriter : public WritableFile {
+ public:
+ /// Input buffer must be mutable, will abort if not
+ explicit FixedSizeBufferWriter(const std::shared_ptr<Buffer>& buffer);
+ ~FixedSizeBufferWriter() override;
+
+ Status Close() override;
+ bool closed() const override;
+ Status Seek(int64_t position) override;
+ Result<int64_t> Tell() const override;
+ Status Write(const void* data, int64_t nbytes) override;
+ /// \cond FALSE
+ using Writable::Write;
+ /// \endcond
+
+ Status WriteAt(int64_t position, const void* data, int64_t nbytes) override;
+
+ void set_memcopy_threads(int num_threads);
+ void set_memcopy_blocksize(int64_t blocksize);
+ void set_memcopy_threshold(int64_t threshold);
+
+ protected:
+ class FixedSizeBufferWriterImpl;
+ std::unique_ptr<FixedSizeBufferWriterImpl> impl_;
+};
+
+/// \class BufferReader
+/// \brief Random access zero-copy reads on an arrow::Buffer
+class ARROW_EXPORT BufferReader
+ : public internal::RandomAccessFileConcurrencyWrapper<BufferReader> {
+ public:
+ explicit BufferReader(std::shared_ptr<Buffer> buffer);
+ explicit BufferReader(const Buffer& buffer);
+ BufferReader(const uint8_t* data, int64_t size);
+
+ /// \brief Instantiate from std::string or arrow::util::string_view. Does not
+ /// own data
+ explicit BufferReader(const util::string_view& data);
+
+ bool closed() const override;
+
+ bool supports_zero_copy() const override;
+
+ std::shared_ptr<Buffer> buffer() const { return buffer_; }
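+
+  // Sketch: when constructed from a shared_ptr<Buffer>, reads are zero-copy
+  // slices of the underlying buffer:
+  //
+  //   BufferReader reader(buffer);
+  //   ARROW_ASSIGN_OR_RAISE(auto slice, reader.ReadAt(0, 16));  // shares memory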
+
+  // ReadAsync() completes synchronously here, since the data is already in memory
+ Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+ int64_t nbytes) override;
+ Status WillNeed(const std::vector<ReadRange>& ranges) override;
+
+ protected:
+ friend RandomAccessFileConcurrencyWrapper<BufferReader>;
+
+ Status DoClose();
+
+ Result<int64_t> DoRead(int64_t nbytes, void* buffer);
+ Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+ Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out);
+ Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes);
+ Result<util::string_view> DoPeek(int64_t nbytes) override;
+
+ Result<int64_t> DoTell() const;
+ Status DoSeek(int64_t position);
+ Result<int64_t> DoGetSize();
+
+ Status CheckClosed() const {
+ if (!is_open_) {
+ return Status::Invalid("Operation forbidden on closed BufferReader");
+ }
+ return Status::OK();
+ }
+
+ std::shared_ptr<Buffer> buffer_;
+ const uint8_t* data_;
+ int64_t size_;
+ int64_t position_;
+ bool is_open_;
+};
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/mman.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/mman.h
new file mode 100644
index 00000000000..9b06ac8e7b5
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/mman.h
@@ -0,0 +1,169 @@
+// Copyright https://code.google.com/p/mman-win32/
+//
+// Licensed under the MIT License;
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/MIT
+
+#pragma once
+
+#include "arrow/util/windows_compatibility.h"
+
+#include <errno.h>
+#include <io.h>
+#include <sys/types.h>
+
+#include <cstdint>
+
+#define PROT_NONE 0
+#define PROT_READ 1
+#define PROT_WRITE 2
+#define PROT_EXEC 4
+
+#define MAP_FILE 0
+#define MAP_SHARED 1
+#define MAP_PRIVATE 2
+#define MAP_TYPE 0xf
+#define MAP_FIXED 0x10
+#define MAP_ANONYMOUS 0x20
+#define MAP_ANON MAP_ANONYMOUS
+
+#define MAP_FAILED ((void*)-1)
+
+/* Flags for msync. */
+#define MS_ASYNC 1
+#define MS_SYNC 2
+#define MS_INVALIDATE 4
+
+#ifndef FILE_MAP_EXECUTE
+#define FILE_MAP_EXECUTE 0x0020
+#endif
+
+static inline int __map_mman_error(const DWORD err, const int deferr) {
+ if (err == 0) return 0;
+ // TODO: implement
+ return err;
+}
+
+static inline DWORD __map_mmap_prot_page(const int prot) {
+ DWORD protect = 0;
+
+ if (prot == PROT_NONE) return protect;
+
+ if ((prot & PROT_EXEC) != 0) {
+ protect = ((prot & PROT_WRITE) != 0) ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ;
+ } else {
+ protect = ((prot & PROT_WRITE) != 0) ? PAGE_READWRITE : PAGE_READONLY;
+ }
+
+ return protect;
+}
+
+static inline DWORD __map_mmap_prot_file(const int prot) {
+ DWORD desiredAccess = 0;
+
+ if (prot == PROT_NONE) return desiredAccess;
+
+ if ((prot & PROT_READ) != 0) desiredAccess |= FILE_MAP_READ;
+ if ((prot & PROT_WRITE) != 0) desiredAccess |= FILE_MAP_WRITE;
+ if ((prot & PROT_EXEC) != 0) desiredAccess |= FILE_MAP_EXECUTE;
+
+ return desiredAccess;
+}
+
+static inline void* mmap(void* addr, size_t len, int prot, int flags, int fildes,
+ off_t off) {
+ HANDLE fm, h;
+
+ void* map = MAP_FAILED;
+ const uint64_t off64 = static_cast<uint64_t>(off);
+ const uint64_t maxSize = off64 + len;
+
+ const DWORD dwFileOffsetLow = static_cast<DWORD>(off64 & 0xFFFFFFFFUL);
+ const DWORD dwFileOffsetHigh = static_cast<DWORD>((off64 >> 32) & 0xFFFFFFFFUL);
+ const DWORD dwMaxSizeLow = static_cast<DWORD>(maxSize & 0xFFFFFFFFUL);
+ const DWORD dwMaxSizeHigh = static_cast<DWORD>((maxSize >> 32) & 0xFFFFFFFFUL);
+
+ const DWORD protect = __map_mmap_prot_page(prot);
+ const DWORD desiredAccess = __map_mmap_prot_file(prot);
+
+ errno = 0;
+
+ if (len == 0
+ /* Unsupported flag combinations */
+ || (flags & MAP_FIXED) != 0
+ /* Unsupported protection combinations */
+ || prot == PROT_EXEC) {
+ errno = EINVAL;
+ return MAP_FAILED;
+ }
+
+ h = ((flags & MAP_ANONYMOUS) == 0) ? (HANDLE)_get_osfhandle(fildes)
+ : INVALID_HANDLE_VALUE;
+
+ if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) {
+ errno = EBADF;
+ return MAP_FAILED;
+ }
+
+ fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL);
+
+ if (fm == NULL) {
+ errno = __map_mman_error(GetLastError(), EPERM);
+ return MAP_FAILED;
+ }
+
+ map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len);
+
+ CloseHandle(fm);
+
+ if (map == NULL) {
+ errno = __map_mman_error(GetLastError(), EPERM);
+ return MAP_FAILED;
+ }
+
+ return map;
+}
+
+static inline int munmap(void* addr, size_t len) {
+ if (UnmapViewOfFile(addr)) return 0;
+
+ errno = __map_mman_error(GetLastError(), EPERM);
+
+ return -1;
+}
+
+static inline int mprotect(void* addr, size_t len, int prot) {
+ DWORD newProtect = __map_mmap_prot_page(prot);
+ DWORD oldProtect = 0;
+
+ if (VirtualProtect(addr, len, newProtect, &oldProtect)) return 0;
+
+ errno = __map_mman_error(GetLastError(), EPERM);
+
+ return -1;
+}
+
+static inline int msync(void* addr, size_t len, int flags) {
+ if (FlushViewOfFile(addr, len)) return 0;
+
+ errno = __map_mman_error(GetLastError(), EPERM);
+
+ return -1;
+}
+
+static inline int mlock(const void* addr, size_t len) {
+ if (VirtualLock((LPVOID)addr, len)) return 0;
+
+ errno = __map_mman_error(GetLastError(), EPERM);
+
+ return -1;
+}
+
+static inline int munlock(const void* addr, size_t len) {
+ if (VirtualUnlock((LPVOID)addr, len)) return 0;
+
+ errno = __map_mman_error(GetLastError(), EPERM);
+
+ return -1;
+}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/slow.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/slow.cc
new file mode 100644
index 00000000000..1042691fa59
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/slow.cc
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/io/slow.h"
+
+#include <algorithm>
+#include <cstring>
+#include <mutex>
+#include <random>
+#include <thread>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/io/util_internal.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace io {
+
+// Multiply the average by this ratio to get the intended standard deviation
+static constexpr double kStandardDeviationRatio = 0.1;
+
+class LatencyGeneratorImpl : public LatencyGenerator {
+ public:
+ ~LatencyGeneratorImpl() override = default;
+
+ LatencyGeneratorImpl(double average_latency, int32_t seed)
+ : gen_(static_cast<decltype(gen_)::result_type>(seed)),
+ latency_dist_(average_latency, average_latency * kStandardDeviationRatio) {}
+
+ double NextLatency() override {
+ // std::random distributions are unlikely to be thread-safe, and
+ // a RandomAccessFile may be called from multiple threads
+ std::lock_guard<std::mutex> lock(mutex_);
+ return std::max<double>(0.0, latency_dist_(gen_));
+ }
+
+ private:
+ std::default_random_engine gen_;
+ std::normal_distribution<double> latency_dist_;
+ std::mutex mutex_;
+};
+
+LatencyGenerator::~LatencyGenerator() {}
+
+void LatencyGenerator::Sleep() {
+ std::this_thread::sleep_for(std::chrono::duration<double>(NextLatency()));
+}
+
+std::shared_ptr<LatencyGenerator> LatencyGenerator::Make(double average_latency) {
+ return std::make_shared<LatencyGeneratorImpl>(
+ average_latency, static_cast<int32_t>(::arrow::internal::GetRandomSeed()));
+}
+
+std::shared_ptr<LatencyGenerator> LatencyGenerator::Make(double average_latency,
+ int32_t seed) {
+ return std::make_shared<LatencyGeneratorImpl>(average_latency, seed);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// SlowInputStream implementation
+
+SlowInputStream::~SlowInputStream() { internal::CloseFromDestructor(this); }
+
+Status SlowInputStream::Close() { return stream_->Close(); }
+
+Status SlowInputStream::Abort() { return stream_->Abort(); }
+
+bool SlowInputStream::closed() const { return stream_->closed(); }
+
+Result<int64_t> SlowInputStream::Tell() const { return stream_->Tell(); }
+
+Result<int64_t> SlowInputStream::Read(int64_t nbytes, void* out) {
+ latencies_->Sleep();
+ return stream_->Read(nbytes, out);
+}
+
+Result<std::shared_ptr<Buffer>> SlowInputStream::Read(int64_t nbytes) {
+ latencies_->Sleep();
+ return stream_->Read(nbytes);
+}
+
+Result<util::string_view> SlowInputStream::Peek(int64_t nbytes) {
+ return stream_->Peek(nbytes);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// SlowRandomAccessFile implementation
+
+SlowRandomAccessFile::~SlowRandomAccessFile() { internal::CloseFromDestructor(this); }
+
+Status SlowRandomAccessFile::Close() { return stream_->Close(); }
+
+Status SlowRandomAccessFile::Abort() { return stream_->Abort(); }
+
+bool SlowRandomAccessFile::closed() const { return stream_->closed(); }
+
+Result<int64_t> SlowRandomAccessFile::GetSize() { return stream_->GetSize(); }
+
+Status SlowRandomAccessFile::Seek(int64_t position) { return stream_->Seek(position); }
+
+Result<int64_t> SlowRandomAccessFile::Tell() const { return stream_->Tell(); }
+
+Result<int64_t> SlowRandomAccessFile::Read(int64_t nbytes, void* out) {
+ latencies_->Sleep();
+ return stream_->Read(nbytes, out);
+}
+
+Result<std::shared_ptr<Buffer>> SlowRandomAccessFile::Read(int64_t nbytes) {
+ latencies_->Sleep();
+ return stream_->Read(nbytes);
+}
+
+Result<int64_t> SlowRandomAccessFile::ReadAt(int64_t position, int64_t nbytes,
+ void* out) {
+ latencies_->Sleep();
+ return stream_->ReadAt(position, nbytes, out);
+}
+
+Result<std::shared_ptr<Buffer>> SlowRandomAccessFile::ReadAt(int64_t position,
+ int64_t nbytes) {
+ latencies_->Sleep();
+ return stream_->ReadAt(position, nbytes);
+}
+
+Result<util::string_view> SlowRandomAccessFile::Peek(int64_t nbytes) {
+ return stream_->Peek(nbytes);
+}
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/slow.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/slow.h
new file mode 100644
index 00000000000..b0c02a85ac6
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/slow.h
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Slow stream implementations, mainly for testing and benchmarking
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+class Status;
+
+namespace io {
+
+class ARROW_EXPORT LatencyGenerator {
+ public:
+ virtual ~LatencyGenerator();
+
+ void Sleep();
+
+ virtual double NextLatency() = 0;
+
+ static std::shared_ptr<LatencyGenerator> Make(double average_latency);
+ static std::shared_ptr<LatencyGenerator> Make(double average_latency, int32_t seed);
+};
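+
+// A usage sketch, assuming `file` is a std::shared_ptr<RandomAccessFile>:
+// reads through the wrapper sleep ~10 ms on average (seeded for
+// reproducibility):
+//
+//   auto latencies = LatencyGenerator::Make(/*average_latency=*/0.01, /*seed=*/42);
+//   auto slow = std::make_shared<SlowRandomAccessFile>(file, latencies);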
+
+// XXX use ConcurrencyWrapper? It could increase chances of finding a race.
+
+template <class StreamType>
+class ARROW_EXPORT SlowInputStreamBase : public StreamType {
+ public:
+ SlowInputStreamBase(std::shared_ptr<StreamType> stream,
+ std::shared_ptr<LatencyGenerator> latencies)
+ : stream_(std::move(stream)), latencies_(std::move(latencies)) {}
+
+ SlowInputStreamBase(std::shared_ptr<StreamType> stream, double average_latency)
+ : stream_(std::move(stream)), latencies_(LatencyGenerator::Make(average_latency)) {}
+
+ SlowInputStreamBase(std::shared_ptr<StreamType> stream, double average_latency,
+ int32_t seed)
+ : stream_(std::move(stream)),
+ latencies_(LatencyGenerator::Make(average_latency, seed)) {}
+
+ protected:
+ std::shared_ptr<StreamType> stream_;
+ std::shared_ptr<LatencyGenerator> latencies_;
+};
+
+/// \brief An InputStream wrapper that makes reads slower.
+///
+/// Read() calls are made slower by an average latency (in seconds).
+/// Actual latencies are drawn from a normal distribution centered on the
+/// average latency, with a standard deviation of 10% of the average.
+/// Other calls are forwarded directly.
+class ARROW_EXPORT SlowInputStream : public SlowInputStreamBase<InputStream> {
+ public:
+ ~SlowInputStream() override;
+
+ using SlowInputStreamBase<InputStream>::SlowInputStreamBase;
+
+ Status Close() override;
+ Status Abort() override;
+ bool closed() const override;
+
+ Result<int64_t> Read(int64_t nbytes, void* out) override;
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+ Result<util::string_view> Peek(int64_t nbytes) override;
+
+ Result<int64_t> Tell() const override;
+};
+
+/// \brief A RandomAccessFile wrapper that makes reads slower.
+///
+/// Similar to SlowInputStream, but allows random access and seeking.
+class ARROW_EXPORT SlowRandomAccessFile : public SlowInputStreamBase<RandomAccessFile> {
+ public:
+ ~SlowRandomAccessFile() override;
+
+ using SlowInputStreamBase<RandomAccessFile>::SlowInputStreamBase;
+
+ Status Close() override;
+ Status Abort() override;
+ bool closed() const override;
+
+ Result<int64_t> Read(int64_t nbytes, void* out) override;
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+ Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
+ Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) override;
+ Result<util::string_view> Peek(int64_t nbytes) override;
+
+ Result<int64_t> GetSize() override;
+ Status Seek(int64_t position) override;
+ Result<int64_t> Tell() const override;
+};
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc
new file mode 100644
index 00000000000..7ef4843a224
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/io/stdio.h"
+
+#include <iostream>
+
+#include "arrow/buffer.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace io {
+
+//
+// StdoutStream implementation
+//
+
+StdoutStream::StdoutStream() : pos_(0) { set_mode(FileMode::WRITE); }
+
+Status StdoutStream::Close() { return Status::OK(); }
+
+bool StdoutStream::closed() const { return false; }
+
+Result<int64_t> StdoutStream::Tell() const { return pos_; }
+
+Status StdoutStream::Write(const void* data, int64_t nbytes) {
+ pos_ += nbytes;
+ std::cout.write(reinterpret_cast<const char*>(data), nbytes);
+ return Status::OK();
+}
+
+//
+// StderrStream implementation
+//
+
+StderrStream::StderrStream() : pos_(0) { set_mode(FileMode::WRITE); }
+
+Status StderrStream::Close() { return Status::OK(); }
+
+bool StderrStream::closed() const { return false; }
+
+Result<int64_t> StderrStream::Tell() const { return pos_; }
+
+Status StderrStream::Write(const void* data, int64_t nbytes) {
+ pos_ += nbytes;
+ std::cerr.write(reinterpret_cast<const char*>(data), nbytes);
+ return Status::OK();
+}
+
+//
+// StdinStream implementation
+//
+
+StdinStream::StdinStream() : pos_(0) { set_mode(FileMode::READ); }
+
+Status StdinStream::Close() { return Status::OK(); }
+
+bool StdinStream::closed() const { return false; }
+
+Result<int64_t> StdinStream::Tell() const { return pos_; }
+
+Result<int64_t> StdinStream::Read(int64_t nbytes, void* out) {
+  std::cin.read(reinterpret_cast<char*>(out), nbytes);
+  // A short read at EOF sets failbit on std::cin, but gcount() still
+  // reports how many bytes were actually extracted, so partial reads
+  // are not silently dropped.
+  const int64_t bytes_read = static_cast<int64_t>(std::cin.gcount());
+  pos_ += bytes_read;
+  return bytes_read;
+}
+
+Result<std::shared_ptr<Buffer>> StdinStream::Read(int64_t nbytes) {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes));
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data()));
+ ARROW_RETURN_NOT_OK(buffer->Resize(bytes_read, false));
+ buffer->ZeroPadding();
+ return std::move(buffer);
+}
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h
new file mode 100644
index 00000000000..9484ac77124
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+// Output stream that just writes to stdout.
+class ARROW_EXPORT StdoutStream : public OutputStream {
+ public:
+ StdoutStream();
+ ~StdoutStream() override {}
+
+ Status Close() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Status Write(const void* data, int64_t nbytes) override;
+
+ private:
+ int64_t pos_;
+};
+
+// Output stream that just writes to stderr.
+class ARROW_EXPORT StderrStream : public OutputStream {
+ public:
+ StderrStream();
+ ~StderrStream() override {}
+
+ Status Close() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Status Write(const void* data, int64_t nbytes) override;
+
+ private:
+ int64_t pos_;
+};
+
+// Input stream that just reads from stdin.
+class ARROW_EXPORT StdinStream : public InputStream {
+ public:
+ StdinStream();
+ ~StdinStream() override {}
+
+ Status Close() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Result<int64_t> Read(int64_t nbytes, void* out) override;
+
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+
+ private:
+ int64_t pos_;
+};
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc
new file mode 100644
index 00000000000..3fdf5a7a9ba
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/io/transform.h"
+
+#include <algorithm>
+#include <cstring>
+#include <mutex>
+#include <random>
+#include <thread>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/io/util_internal.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace io {
+
+struct TransformInputStream::Impl {
+ std::shared_ptr<InputStream> wrapped_;
+ TransformInputStream::TransformFunc transform_;
+ std::shared_ptr<Buffer> pending_;
+ int64_t pos_ = 0;
+ bool closed_ = false;
+
+ Impl(std::shared_ptr<InputStream> wrapped,
+ TransformInputStream::TransformFunc transform)
+ : wrapped_(std::move(wrapped)), transform_(std::move(transform)) {}
+
+ void Close() {
+ closed_ = true;
+ pending_.reset();
+ }
+
+ Status CheckClosed() const {
+ if (closed_) {
+ return Status::Invalid("Operation on closed file");
+ }
+ return Status::OK();
+ }
+};
+
+TransformInputStream::TransformInputStream(std::shared_ptr<InputStream> wrapped,
+ TransformInputStream::TransformFunc transform)
+ : impl_(new Impl{std::move(wrapped), std::move(transform)}) {}
+
+TransformInputStream::~TransformInputStream() {}
+
+Status TransformInputStream::Close() {
+ impl_->Close();
+ return impl_->wrapped_->Close();
+}
+
+Status TransformInputStream::Abort() { return impl_->wrapped_->Abort(); }
+
+bool TransformInputStream::closed() const { return impl_->closed_; }
+
+Result<std::shared_ptr<Buffer>> TransformInputStream::Read(int64_t nbytes) {
+ RETURN_NOT_OK(impl_->CheckClosed());
+
+ ARROW_ASSIGN_OR_RAISE(auto buf, AllocateResizableBuffer(nbytes));
+ ARROW_ASSIGN_OR_RAISE(auto bytes_read, this->Read(nbytes, buf->mutable_data()));
+ if (bytes_read < nbytes) {
+ RETURN_NOT_OK(buf->Resize(bytes_read, /*shrink_to_fit=*/true));
+ }
+ return std::shared_ptr<Buffer>(std::move(buf));
+}
+
+Result<int64_t> TransformInputStream::Read(int64_t nbytes, void* out) {
+ RETURN_NOT_OK(impl_->CheckClosed());
+
+ if (nbytes == 0) {
+ return 0;
+ }
+
+ int64_t avail_size = 0;
+ std::vector<std::shared_ptr<Buffer>> avail;
+ if (impl_->pending_) {
+ avail.push_back(impl_->pending_);
+ avail_size += impl_->pending_->size();
+ }
+ // Accumulate enough transformed data to satisfy read
+ while (avail_size < nbytes) {
+ ARROW_ASSIGN_OR_RAISE(auto buf, impl_->wrapped_->Read(nbytes));
+ const bool have_eof = (buf->size() == 0);
+ // Even if EOF is met, let the transform function run a last time
+ // (for example to flush internal buffers)
+ ARROW_ASSIGN_OR_RAISE(buf, impl_->transform_(std::move(buf)));
+ avail_size += buf->size();
+ avail.push_back(std::move(buf));
+ if (have_eof) {
+ break;
+ }
+ }
+ DCHECK(!avail.empty());
+
+ // Coalesce buffer data
+ uint8_t* out_data = reinterpret_cast<uint8_t*>(out);
+ int64_t copied_bytes = 0;
+ for (size_t i = 0; i < avail.size() - 1; ++i) {
+ // All buffers except the last fit fully into `nbytes`
+ const auto buf = std::move(avail[i]);
+ DCHECK_LE(buf->size(), nbytes);
+ memcpy(out_data, buf->data(), static_cast<size_t>(buf->size()));
+ out_data += buf->size();
+ nbytes -= buf->size();
+ copied_bytes += buf->size();
+ }
+ {
+ // Last buffer: splice into `out` and `pending_`
+ const auto buf = std::move(avail.back());
+ const int64_t to_copy = std::min(buf->size(), nbytes);
+ memcpy(out_data, buf->data(), static_cast<size_t>(to_copy));
+ copied_bytes += to_copy;
+ if (buf->size() > to_copy) {
+ impl_->pending_ = SliceBuffer(buf, to_copy);
+ } else {
+ impl_->pending_.reset();
+ }
+ }
+ impl_->pos_ += copied_bytes;
+ return copied_bytes;
+}
+
+Result<int64_t> TransformInputStream::Tell() const {
+ RETURN_NOT_OK(impl_->CheckClosed());
+
+ return impl_->pos_;
+}
+
+Result<std::shared_ptr<const KeyValueMetadata>> TransformInputStream::ReadMetadata() {
+ RETURN_NOT_OK(impl_->CheckClosed());
+
+ return impl_->wrapped_->ReadMetadata();
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> TransformInputStream::ReadMetadataAsync(
+ const IOContext& io_context) {
+ RETURN_NOT_OK(impl_->CheckClosed());
+
+ return impl_->wrapped_->ReadMetadataAsync(io_context);
+}
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h
new file mode 100644
index 00000000000..c117f275929
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Input stream wrappers that apply a transformation to the data they read
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+class ARROW_EXPORT TransformInputStream : public InputStream {
+ public:
+ using TransformFunc =
+ std::function<Result<std::shared_ptr<Buffer>>(const std::shared_ptr<Buffer>&)>;
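+
+  // Sketch of a transform that upper-cases ASCII bytes. The transform is also
+  // invoked one last time with an empty buffer at EOF, so it may flush any
+  // internal state:
+  //
+  //   TransformInputStream::TransformFunc to_upper =
+  //       [](const std::shared_ptr<Buffer>& buf) -> Result<std::shared_ptr<Buffer>> {
+  //     ARROW_ASSIGN_OR_RAISE(auto out, AllocateBuffer(buf->size()));
+  //     uint8_t* dst = out->mutable_data();
+  //     for (int64_t i = 0; i < buf->size(); ++i) {
+  //       dst[i] = static_cast<uint8_t>(std::toupper(buf->data()[i]));
+  //     }
+  //     return std::shared_ptr<Buffer>(std::move(out));
+  //   };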
+
+ TransformInputStream(std::shared_ptr<InputStream> wrapped, TransformFunc transform);
+ ~TransformInputStream() override;
+
+ Status Close() override;
+ Status Abort() override;
+ bool closed() const override;
+
+ Result<int64_t> Read(int64_t nbytes, void* out) override;
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context) override;
+
+ Result<int64_t> Tell() const override;
+
+ protected:
+ struct Impl;
+ std::unique_ptr<Impl> impl_;
+};
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h
new file mode 100644
index 00000000000..a2fd33bf360
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+struct FileMode {
+ enum type { READ, WRITE, READWRITE };
+};
+
+struct IOContext;
+struct CacheOptions;
+
+/// EXPERIMENTAL: convenience global singleton for default IOContext settings
+ARROW_EXPORT
+const IOContext& default_io_context();
+
+/// \brief Get the capacity of the global I/O thread pool
+///
+/// Return the number of worker threads in the thread pool to which
+/// Arrow dispatches various I/O-bound tasks. This is an ideal number,
+/// not necessarily the exact number of threads at a given point in time.
+///
+/// You can change this number using SetIOThreadPoolCapacity().
+ARROW_EXPORT int GetIOThreadPoolCapacity();
+
+/// \brief Set the capacity of the global I/O thread pool
+///
+/// Set the number of worker threads in the thread pool to which
+/// Arrow dispatches various I/O-bound tasks.
+///
+/// The current number is returned by GetIOThreadPoolCapacity().
+ARROW_EXPORT Status SetIOThreadPoolCapacity(int threads);
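+
+// Sketch: enlarge the I/O pool before issuing many concurrent reads
+// (16 is an arbitrary example value):
+//
+//   ARROW_RETURN_NOT_OK(SetIOThreadPoolCapacity(16));
+//   int capacity = GetIOThreadPoolCapacity();  // == 16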
+
+class FileInterface;
+class Seekable;
+class Writable;
+class Readable;
+class OutputStream;
+class FileOutputStream;
+class InputStream;
+class ReadableFile;
+class RandomAccessFile;
+class MemoryMappedFile;
+class WritableFile;
+class ReadWriteFileInterface;
+
+class LatencyGenerator;
+
+class BufferReader;
+
+class BufferInputStream;
+class BufferOutputStream;
+class CompressedInputStream;
+class CompressedOutputStream;
+class BufferedInputStream;
+class BufferedOutputStream;
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h
new file mode 100644
index 00000000000..b1d75d1d0bd
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+namespace internal {
+
+ARROW_EXPORT void CloseFromDestructor(FileInterface* file);
+
+// Validate a (offset, size) region (as given to ReadAt) against
+// the file size. Return the actual read size.
+ARROW_EXPORT Result<int64_t> ValidateReadRange(int64_t offset, int64_t size,
+ int64_t file_size);
+// Validate a (offset, size) region (as given to WriteAt) against
+// the file size. Short writes are not allowed.
+ARROW_EXPORT Status ValidateWriteRange(int64_t offset, int64_t size, int64_t file_size);
+
+// Validate a (offset, size) region (as given to ReadAt or WriteAt), without
+// knowing the file size.
+ARROW_EXPORT Status ValidateRange(int64_t offset, int64_t size);
+
+ARROW_EXPORT
+std::vector<ReadRange> CoalesceReadRanges(std::vector<ReadRange> ranges,
+ int64_t hole_size_limit,
+ int64_t range_size_limit);
+
+ARROW_EXPORT
+::arrow::internal::ThreadPool* GetIOThreadPool();
+
+template <typename... SubmitArgs>
+auto SubmitIO(IOContext io_context, SubmitArgs&&... submit_args)
+ -> decltype(std::declval<::arrow::internal::Executor*>()->Submit(submit_args...)) {
+ ::arrow::internal::TaskHints hints;
+ hints.external_id = io_context.external_id();
+ return io_context.executor()->Submit(hints, io_context.stop_token(),
+ std::forward<SubmitArgs>(submit_args)...);
+}
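+
+// A minimal SubmitIO sketch, assuming `file`, `pos` and `n` are in scope:
+// the lambda runs on the I/O thread pool and its result arrives via a Future:
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto fut, SubmitIO(io_context, [=] { return file->ReadAt(pos, n); }));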
+
+} // namespace internal
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/api.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/api.h
new file mode 100644
index 00000000000..b5690aed8da
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/api.h
@@ -0,0 +1,25 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/ipc/dictionary.h"
+#include "arrow/ipc/feather.h"
+#include "arrow/ipc/json_simple.h"
+#include "arrow/ipc/message.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc
new file mode 100644
index 00000000000..3ab2c8b3847
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc
@@ -0,0 +1,412 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/dictionary.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/concatenate.h"
+#include "arrow/array/validate.h"
+#include "arrow/extension_type.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace std {
+template <>
+struct hash<arrow::FieldPath> {
+ size_t operator()(const arrow::FieldPath& path) const { return path.hash(); }
+};
+} // namespace std
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace ipc {
+
+using internal::FieldPosition;
+
+// ----------------------------------------------------------------------
+// DictionaryFieldMapper implementation
+
+struct DictionaryFieldMapper::Impl {
+ using FieldPathMap = std::unordered_map<FieldPath, int64_t>;
+
+ FieldPathMap field_path_to_id;
+
+ void ImportSchema(const Schema& schema) {
+ ImportFields(FieldPosition(), schema.fields());
+ }
+
+ Status AddSchemaFields(const Schema& schema) {
+ if (!field_path_to_id.empty()) {
+ return Status::Invalid("Non-empty DictionaryFieldMapper");
+ }
+ ImportSchema(schema);
+ return Status::OK();
+ }
+
+ Status AddField(int64_t id, std::vector<int> field_path) {
+ const auto pair = field_path_to_id.emplace(FieldPath(std::move(field_path)), id);
+ if (!pair.second) {
+ return Status::KeyError("Field already mapped to id");
+ }
+ return Status::OK();
+ }
+
+ Result<int64_t> GetFieldId(std::vector<int> field_path) const {
+ const auto it = field_path_to_id.find(FieldPath(std::move(field_path)));
+ if (it == field_path_to_id.end()) {
+ return Status::KeyError("Dictionary field not found");
+ }
+ return it->second;
+ }
+
+ int num_fields() const { return static_cast<int>(field_path_to_id.size()); }
+
+  int num_dicts() const {
+    std::set<int64_t> unique_ids;
+
+    for (const auto& kv : field_path_to_id) {
+      unique_ids.insert(kv.second);
+    }
+
+    return static_cast<int>(unique_ids.size());
+  }
+
+ private:
+ void ImportFields(const FieldPosition& pos,
+ const std::vector<std::shared_ptr<Field>>& fields) {
+ for (int i = 0; i < static_cast<int>(fields.size()); ++i) {
+ ImportField(pos.child(i), *fields[i]);
+ }
+ }
+
+ void ImportField(const FieldPosition& pos, const Field& field) {
+ const DataType* type = field.type().get();
+ if (type->id() == Type::EXTENSION) {
+ type = checked_cast<const ExtensionType&>(*type).storage_type().get();
+ }
+ if (type->id() == Type::DICTIONARY) {
+ InsertPath(pos);
+ // Import nested dictionaries
+ ImportFields(pos,
+ checked_cast<const DictionaryType&>(*type).value_type()->fields());
+ } else {
+ ImportFields(pos, type->fields());
+ }
+ }
+
+ void InsertPath(const FieldPosition& pos) {
+ const int64_t id = field_path_to_id.size();
+    const int64_t id = static_cast<int64_t>(field_path_to_id.size());
+ DCHECK(pair.second); // was inserted
+ ARROW_UNUSED(pair);
+ }
+};
+
+DictionaryFieldMapper::DictionaryFieldMapper() : impl_(new Impl) {}
+
+DictionaryFieldMapper::DictionaryFieldMapper(const Schema& schema) : impl_(new Impl) {
+ impl_->ImportSchema(schema);
+}
+
+DictionaryFieldMapper::~DictionaryFieldMapper() {}
+
+Status DictionaryFieldMapper::AddSchemaFields(const Schema& schema) {
+ return impl_->AddSchemaFields(schema);
+}
+
+Status DictionaryFieldMapper::AddField(int64_t id, std::vector<int> field_path) {
+ return impl_->AddField(id, std::move(field_path));
+}
+
+Result<int64_t> DictionaryFieldMapper::GetFieldId(std::vector<int> field_path) const {
+ return impl_->GetFieldId(std::move(field_path));
+}
+
+int DictionaryFieldMapper::num_fields() const { return impl_->num_fields(); }
+
+int DictionaryFieldMapper::num_dicts() const { return impl_->num_dicts(); }
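+
+// A usage sketch: map a schema's dictionary fields to ids, then look one up
+// (assuming `schema` has a dictionary-typed field at top-level index 0):
+//
+//   DictionaryFieldMapper mapper(*schema);
+//   ARROW_ASSIGN_OR_RAISE(int64_t id, mapper.GetFieldId({0}));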
+
+// ----------------------------------------------------------------------
+// DictionaryMemo implementation
+
+namespace {
+
+bool HasUnresolvedNestedDict(const ArrayData& data) {
+ if (data.type->id() == Type::DICTIONARY) {
+ if (data.dictionary == nullptr) {
+ return true;
+ }
+ if (HasUnresolvedNestedDict(*data.dictionary)) {
+ return true;
+ }
+ }
+ for (const auto& child : data.child_data) {
+ if (HasUnresolvedNestedDict(*child)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+} // namespace
+
+struct DictionaryMemo::Impl {
+ // Map of dictionary id to dictionary array(s) (several in case of deltas)
+ std::unordered_map<int64_t, ArrayDataVector> id_to_dictionary_;
+ std::unordered_map<int64_t, std::shared_ptr<DataType>> id_to_type_;
+ DictionaryFieldMapper mapper_;
+
+ Result<decltype(id_to_dictionary_)::iterator> FindDictionary(int64_t id) {
+ auto it = id_to_dictionary_.find(id);
+ if (it == id_to_dictionary_.end()) {
+ return Status::KeyError("Dictionary with id ", id, " not found");
+ }
+ return it;
+ }
+
+ Result<std::shared_ptr<ArrayData>> ReifyDictionary(int64_t id, MemoryPool* pool) {
+ ARROW_ASSIGN_OR_RAISE(auto it, FindDictionary(id));
+ ArrayDataVector* data_vector = &it->second;
+
+ DCHECK(!data_vector->empty());
+ if (data_vector->size() > 1) {
+ // There are deltas, we need to concatenate them to the first dictionary.
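+      // For example, a stream may first send ["a", "b"] for a given id and
+      // later a delta ["c"]; the reified dictionary is then ["a", "b", "c"].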
+ ArrayVector to_combine;
+ to_combine.reserve(data_vector->size());
+ // IMPORTANT: At this point, the dictionary data may be untrusted.
+ // We need to validate it, as concatenation can crash on invalid or
+ // corrupted data. Full validation is necessary for certain types
+ // (for example nested dictionaries).
+ for (const auto& data : *data_vector) {
+ if (HasUnresolvedNestedDict(*data)) {
+ return Status::NotImplemented(
+ "Encountered delta dictionary with an unresolved nested dictionary");
+ }
+ RETURN_NOT_OK(::arrow::internal::ValidateArray(*data));
+ RETURN_NOT_OK(::arrow::internal::ValidateArrayFull(*data));
+ to_combine.push_back(MakeArray(data));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto combined_dict, Concatenate(to_combine, pool));
+ *data_vector = {combined_dict->data()};
+ }
+
+ return data_vector->back();
+ }
+};
+
+DictionaryMemo::DictionaryMemo() : impl_(new Impl()) {}
+
+DictionaryMemo::~DictionaryMemo() {}
+
+DictionaryFieldMapper& DictionaryMemo::fields() { return impl_->mapper_; }
+
+const DictionaryFieldMapper& DictionaryMemo::fields() const { return impl_->mapper_; }
+
+Result<std::shared_ptr<DataType>> DictionaryMemo::GetDictionaryType(int64_t id) const {
+ const auto it = impl_->id_to_type_.find(id);
+ if (it == impl_->id_to_type_.end()) {
+ return Status::KeyError("No record of dictionary type with id ", id);
+ }
+ return it->second;
+}
+
+// Returns KeyError if dictionary not found
+Result<std::shared_ptr<ArrayData>> DictionaryMemo::GetDictionary(int64_t id,
+ MemoryPool* pool) const {
+ return impl_->ReifyDictionary(id, pool);
+}
+
+Status DictionaryMemo::AddDictionaryType(int64_t id,
+ const std::shared_ptr<DataType>& type) {
+ // AddDictionaryType expects the dict value type
+ DCHECK_NE(type->id(), Type::DICTIONARY);
+ const auto pair = impl_->id_to_type_.emplace(id, type);
+ if (!pair.second && !pair.first->second->Equals(*type)) {
+ return Status::KeyError("Conflicting dictionary types for id ", id);
+ }
+ return Status::OK();
+}
+
+bool DictionaryMemo::HasDictionary(int64_t id) const {
+ const auto it = impl_->id_to_dictionary_.find(id);
+ return it != impl_->id_to_dictionary_.end();
+}
+
+Status DictionaryMemo::AddDictionary(int64_t id,
+ const std::shared_ptr<ArrayData>& dictionary) {
+ const auto pair = impl_->id_to_dictionary_.emplace(id, ArrayDataVector{dictionary});
+ if (!pair.second) {
+ return Status::KeyError("Dictionary with id ", id, " already exists");
+ }
+ return Status::OK();
+}
+
+Status DictionaryMemo::AddDictionaryDelta(int64_t id,
+ const std::shared_ptr<ArrayData>& dictionary) {
+ ARROW_ASSIGN_OR_RAISE(auto it, impl_->FindDictionary(id));
+ it->second.push_back(dictionary);
+ return Status::OK();
+}
+
+Result<bool> DictionaryMemo::AddOrReplaceDictionary(
+ int64_t id, const std::shared_ptr<ArrayData>& dictionary) {
+ ArrayDataVector value{dictionary};
+
+ auto pair = impl_->id_to_dictionary_.emplace(id, value);
+ if (pair.second) {
+ // Inserted
+ return true;
+ } else {
+ // Update existing value
+ pair.first->second = std::move(value);
+ return false;
+ }
+}
+
+// ----------------------------------------------------------------------
+// CollectDictionaries implementation
+
+namespace {
+
+struct DictionaryCollector {
+ const DictionaryFieldMapper& mapper_;
+ DictionaryVector dictionaries_;
+
+ Status WalkChildren(const FieldPosition& position, const DataType& type,
+ const Array& array) {
+ for (int i = 0; i < type.num_fields(); ++i) {
+ auto boxed_child = MakeArray(array.data()->child_data[i]);
+ RETURN_NOT_OK(Visit(position.child(i), type.field(i), boxed_child.get()));
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const FieldPosition& position, const std::shared_ptr<Field>& field,
+ const Array* array) {
+ const DataType* type = array->type().get();
+
+ if (type->id() == Type::EXTENSION) {
+ type = checked_cast<const ExtensionType&>(*type).storage_type().get();
+ array = checked_cast<const ExtensionArray&>(*array).storage().get();
+ }
+ if (type->id() == Type::DICTIONARY) {
+ const auto& dict_array = checked_cast<const DictionaryArray&>(*array);
+ auto dictionary = dict_array.dictionary();
+
+ // Traverse the dictionary to first gather any nested dictionaries
+ // (so that they appear in the output before their parent)
+ const auto& dict_type = checked_cast<const DictionaryType&>(*type);
+ RETURN_NOT_OK(WalkChildren(position, *dict_type.value_type(), *dictionary));
+
+ // Then record the dictionary itself
+ ARROW_ASSIGN_OR_RAISE(int64_t id, mapper_.GetFieldId(position.path()));
+ dictionaries_.emplace_back(id, dictionary);
+ } else {
+ RETURN_NOT_OK(WalkChildren(position, *type, *array));
+ }
+ return Status::OK();
+ }
+
+ Status Collect(const RecordBatch& batch) {
+ FieldPosition position;
+ const Schema& schema = *batch.schema();
+ dictionaries_.reserve(mapper_.num_fields());
+
+ for (int i = 0; i < schema.num_fields(); ++i) {
+ RETURN_NOT_OK(Visit(position.child(i), schema.field(i), batch.column(i).get()));
+ }
+ return Status::OK();
+ }
+};
+
+struct DictionaryResolver {
+ const DictionaryMemo& memo_;
+ MemoryPool* pool_;
+
+ Status VisitChildren(const ArrayDataVector& data_vector, FieldPosition parent_pos) {
+ int i = 0;
+ for (const auto& data : data_vector) {
+ // Some data entries may be missing if reading only a subset of the schema
+ if (data != nullptr) {
+ RETURN_NOT_OK(VisitField(parent_pos.child(i), data.get()));
+ }
+ ++i;
+ }
+ return Status::OK();
+ }
+
+ Status VisitField(FieldPosition field_pos, ArrayData* data) {
+ const DataType* type = data->type.get();
+ if (type->id() == Type::EXTENSION) {
+ type = checked_cast<const ExtensionType&>(*type).storage_type().get();
+ }
+ if (type->id() == Type::DICTIONARY) {
+ ARROW_ASSIGN_OR_RAISE(const int64_t id,
+ memo_.fields().GetFieldId(field_pos.path()));
+ ARROW_ASSIGN_OR_RAISE(data->dictionary, memo_.GetDictionary(id, pool_));
+ // Resolve nested dictionary data
+ RETURN_NOT_OK(VisitField(field_pos, data->dictionary.get()));
+ }
+ // Resolve child data
+ return VisitChildren(data->child_data, field_pos);
+ }
+};
+
+} // namespace
+
+Result<DictionaryVector> CollectDictionaries(const RecordBatch& batch,
+ const DictionaryFieldMapper& mapper) {
+ DictionaryCollector collector{mapper, {}};
+ RETURN_NOT_OK(collector.Collect(batch));
+ return std::move(collector.dictionaries_);
+}
+
+namespace internal {
+
+Status CollectDictionaries(const RecordBatch& batch, DictionaryMemo* memo) {
+ RETURN_NOT_OK(memo->fields().AddSchemaFields(*batch.schema()));
+ ARROW_ASSIGN_OR_RAISE(const auto dictionaries,
+ CollectDictionaries(batch, memo->fields()));
+ for (const auto& pair : dictionaries) {
+ RETURN_NOT_OK(memo->AddDictionary(pair.first, pair.second->data()));
+ }
+ return Status::OK();
+}
+
+} // namespace internal
+
+Status ResolveDictionaries(const ArrayDataVector& columns, const DictionaryMemo& memo,
+ MemoryPool* pool) {
+ DictionaryResolver resolver{memo, pool};
+ return resolver.VisitChildren(columns, FieldPosition());
+}
+
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h
new file mode 100644
index 00000000000..e4287cb1974
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h
@@ -0,0 +1,177 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Tools for dictionaries in IPC context
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace ipc {
+
+namespace internal {
+
+class FieldPosition {
+ public:
+ FieldPosition() : parent_(NULLPTR), index_(-1), depth_(0) {}
+
+ FieldPosition child(int index) const { return {this, index}; }
+
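+  // Reconstruct the path of child indices from the root by walking the
+  // parent links backwards, e.g. FieldPosition().child(1).child(0).path()
+  // returns {1, 0}.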
+ std::vector<int> path() const {
+ std::vector<int> path(depth_);
+ const FieldPosition* cur = this;
+ for (int i = depth_ - 1; i >= 0; --i) {
+ path[i] = cur->index_;
+ cur = cur->parent_;
+ }
+ return path;
+ }
+
+ protected:
+ FieldPosition(const FieldPosition* parent, int index)
+ : parent_(parent), index_(index), depth_(parent->depth_ + 1) {}
+
+ const FieldPosition* parent_;
+ int index_;
+ int depth_;
+};
+
+} // namespace internal
+
+/// \brief Map fields in a schema to dictionary ids
+///
+/// The mapping is structural, i.e. the field path (as a vector of indices)
+/// is associated with the dictionary id. A dictionary id may be associated
+/// with multiple fields.
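+///
+/// A minimal usage sketch (the schema and field name are illustrative):
+/// \code
+/// auto schema = ::arrow::schema(
+///     {::arrow::field("tags", ::arrow::dictionary(::arrow::int32(), ::arrow::utf8()))});
+/// DictionaryFieldMapper mapper(*schema);
+/// // The dictionary field at path {0} was assigned id 0 during import
+/// int64_t id = mapper.GetFieldId({0}).ValueOrDie();
+/// \endcode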
+class ARROW_EXPORT DictionaryFieldMapper {
+ public:
+ DictionaryFieldMapper();
+ explicit DictionaryFieldMapper(const Schema& schema);
+ ~DictionaryFieldMapper();
+
+ Status AddSchemaFields(const Schema& schema);
+ Status AddField(int64_t id, std::vector<int> field_path);
+
+ Result<int64_t> GetFieldId(std::vector<int> field_path) const;
+
+ int num_fields() const;
+
+  /// \brief Return the number of unique dictionaries, taking into
+  /// account that different fields can share the same dictionary.
+ int num_dicts() const;
+
+ private:
+ struct Impl;
+ std::unique_ptr<Impl> impl_;
+};
+
+using DictionaryVector = std::vector<std::pair<int64_t, std::shared_ptr<Array>>>;
+
+/// \brief Memoization data structure for reading dictionaries from IPC streams
+///
+/// This structure tracks the following associations:
+/// - field position (structural) -> dictionary id
+/// - dictionary id -> value type
+/// - dictionary id -> dictionary (value) data
+///
+/// Together, they allow resolving dictionary data when reading an IPC stream,
+/// using metadata recorded in the schema message and data recorded in the
+/// dictionary batch messages (see ResolveDictionaries).
+///
+/// This structure isn't useful for writing an IPC stream, where only
+/// DictionaryFieldMapper is necessary.
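+///
+/// A minimal sketch of the read-side bookkeeping (the id, dictionary data
+/// and memory pool are illustrative):
+/// \code
+/// DictionaryMemo memo;
+/// ARROW_RETURN_NOT_OK(memo.fields().AddSchemaFields(*schema));
+/// // Typically populated while decoding dictionary batch messages:
+/// ARROW_RETURN_NOT_OK(memo.AddDictionary(0, dict_data));
+/// ARROW_ASSIGN_OR_RAISE(auto dict, memo.GetDictionary(0, pool));
+/// \endcode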
+class ARROW_EXPORT DictionaryMemo {
+ public:
+ DictionaryMemo();
+ ~DictionaryMemo();
+
+ DictionaryFieldMapper& fields();
+ const DictionaryFieldMapper& fields() const;
+
+ /// \brief Return current dictionary corresponding to a particular
+ /// id. Returns KeyError if id not found
+ Result<std::shared_ptr<ArrayData>> GetDictionary(int64_t id, MemoryPool* pool) const;
+
+ /// \brief Return dictionary value type corresponding to a
+ /// particular dictionary id.
+ Result<std::shared_ptr<DataType>> GetDictionaryType(int64_t id) const;
+
+ /// \brief Return true if we have a dictionary for the input id
+ bool HasDictionary(int64_t id) const;
+
+ /// \brief Add a dictionary value type to the memo with a particular id.
+ /// Returns KeyError if a different type is already registered with the same id.
+ Status AddDictionaryType(int64_t id, const std::shared_ptr<DataType>& type);
+
+ /// \brief Add a dictionary to the memo with a particular id. Returns
+ /// KeyError if that dictionary already exists
+ Status AddDictionary(int64_t id, const std::shared_ptr<ArrayData>& dictionary);
+
+ /// \brief Append a dictionary delta to the memo with a particular id. Returns
+  /// KeyError if that dictionary does not exist
+ Status AddDictionaryDelta(int64_t id, const std::shared_ptr<ArrayData>& dictionary);
+
+  /// \brief Add a dictionary to the memo if none exists with the given id;
+  /// otherwise replace the existing dictionary with the new one.
+ ///
+ /// Return true if the dictionary was added, false if replaced.
+ Result<bool> AddOrReplaceDictionary(int64_t id,
+ const std::shared_ptr<ArrayData>& dictionary);
+
+ private:
+ struct Impl;
+ std::unique_ptr<Impl> impl_;
+};
+
+// For writing: collect dictionary entries to write to the IPC stream, in order
+// (i.e. inner dictionaries before dependent outer dictionaries).
+ARROW_EXPORT
+Result<DictionaryVector> CollectDictionaries(const RecordBatch& batch,
+ const DictionaryFieldMapper& mapper);
+
+// For reading: resolve all dictionaries in columns, according to the field
+// mapping and dictionary arrays stored in memo.
+// Columns may be sparse, i.e. some entries may be left null
+// (e.g. if an inclusion mask was used).
+ARROW_EXPORT
+Status ResolveDictionaries(const ArrayDataVector& columns, const DictionaryMemo& memo,
+ MemoryPool* pool);
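+
+// A hedged end-to-end sketch (variable names are illustrative). On the write
+// side, dictionaries are gathered from a record batch:
+//
+//   DictionaryFieldMapper mapper(*batch->schema());
+//   ARROW_ASSIGN_OR_RAISE(auto dicts, CollectDictionaries(*batch, mapper));
+//
+// On the read side, once the dictionary batches have been registered in a
+// DictionaryMemo, the decoded columns are patched up in place:
+//
+//   ARROW_RETURN_NOT_OK(ResolveDictionaries(columns, memo, pool));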
+
+namespace internal {
+
+// Like CollectDictionaries above, but uses the memo's DictionaryFieldMapper
+// and all collected dictionaries are added to the memo using AddDictionary.
+//
+// This is used as a shortcut in some roundtripping tests (to avoid emitting
+// any actual dictionary batches).
+ARROW_EXPORT
+Status CollectDictionaries(const RecordBatch& batch, DictionaryMemo* memo);
+
+} // namespace internal
+
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc
new file mode 100644
index 00000000000..b1c30eec0b3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc
@@ -0,0 +1,819 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/feather.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <sstream> // IWYU pragma: keep
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <flatbuffers/flatbuffers.h>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/chunked_array.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/ipc/metadata_internal.h"
+#include "arrow/ipc/options.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/util.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+#include "generated/feather_generated.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::make_unique;
+
+class ExtensionType;
+
+namespace ipc {
+namespace feather {
+
+namespace {
+
+using FBB = flatbuffers::FlatBufferBuilder;
+
+constexpr const char* kFeatherV1MagicBytes = "FEA1";
+constexpr const int kFeatherDefaultAlignment = 8;
+const uint8_t kPaddingBytes[kFeatherDefaultAlignment] = {0};
+
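+// Round nbytes up to the next multiple of the default 8-byte alignment,
+// e.g. PaddedLength(5) == 8 and PaddedLength(16) == 16.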
+inline int64_t PaddedLength(int64_t nbytes) {
+ static const int64_t alignment = kFeatherDefaultAlignment;
+ return ((nbytes + alignment - 1) / alignment) * alignment;
+}
+
+Status WritePaddedWithOffset(io::OutputStream* stream, const uint8_t* data,
+ int64_t bit_offset, const int64_t length,
+ int64_t* bytes_written) {
+ data = data + bit_offset / 8;
+ uint8_t bit_shift = static_cast<uint8_t>(bit_offset % 8);
+ if (bit_offset == 0) {
+ RETURN_NOT_OK(stream->Write(data, length));
+ } else {
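+    // The data does not start on a byte boundary: rebuild each output byte
+    // by combining bits from two adjacent input bytes, staging the shifted
+    // result in a small buffer before flushing it to the stream.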
+ constexpr int64_t buffersize = 256;
+ uint8_t buffer[buffersize];
+ const uint8_t lshift = static_cast<uint8_t>(8 - bit_shift);
+ const uint8_t* buffer_end = buffer + buffersize;
+ uint8_t* buffer_it = buffer;
+
+ for (const uint8_t* end = data + length; data != end;) {
+ uint8_t r = static_cast<uint8_t>(*data++ >> bit_shift);
+ uint8_t l = static_cast<uint8_t>(*data << lshift);
+ uint8_t value = l | r;
+ *buffer_it++ = value;
+ if (buffer_it == buffer_end) {
+ RETURN_NOT_OK(stream->Write(buffer, buffersize));
+ buffer_it = buffer;
+ }
+ }
+ if (buffer_it != buffer) {
+ RETURN_NOT_OK(stream->Write(buffer, buffer_it - buffer));
+ }
+ }
+
+ int64_t remainder = PaddedLength(length) - length;
+ if (remainder != 0) {
+ RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder));
+ }
+ *bytes_written = length + remainder;
+ return Status::OK();
+}
+
+Status WritePadded(io::OutputStream* stream, const uint8_t* data, int64_t length,
+ int64_t* bytes_written) {
+ return WritePaddedWithOffset(stream, data, /*bit_offset=*/0, length, bytes_written);
+}
+
+struct ColumnType {
+ enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME };
+};
+
+inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) {
+ return static_cast<TimeUnit::type>(static_cast<int>(unit));
+}
+
+/// For compatibility, we sometimes need to write placeholder data just to
+/// keep producing files that older readers can still parse.
+Status WritePaddedBlank(io::OutputStream* stream, int64_t length,
+ int64_t* bytes_written) {
+ const uint8_t null = 0;
+ for (int64_t i = 0; i < length; i++) {
+ RETURN_NOT_OK(stream->Write(&null, 1));
+ }
+ int64_t remainder = PaddedLength(length) - length;
+ if (remainder != 0) {
+ RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder));
+ }
+ *bytes_written = length + remainder;
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// ReaderV1
+
+class ReaderV1 : public Reader {
+ public:
+ Status Open(const std::shared_ptr<io::RandomAccessFile>& source) {
+ source_ = source;
+
+ ARROW_ASSIGN_OR_RAISE(int64_t size, source->GetSize());
+ int magic_size = static_cast<int>(strlen(kFeatherV1MagicBytes));
+ int footer_size = magic_size + static_cast<int>(sizeof(uint32_t));
+
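+    // A Feather V1 file ends with <metadata> <uint32: metadata length>
+    // <magic "FEA1">, so the last footer_size bytes hold the length + magic.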
+ // Now get the footer and verify
+ ARROW_ASSIGN_OR_RAISE(auto buffer, source->ReadAt(size - footer_size, footer_size));
+
+ if (memcmp(buffer->data() + sizeof(uint32_t), kFeatherV1MagicBytes, magic_size)) {
+ return Status::Invalid("Feather file footer incomplete");
+ }
+
+ uint32_t metadata_length = *reinterpret_cast<const uint32_t*>(buffer->data());
+ if (size < magic_size + footer_size + metadata_length) {
+ return Status::Invalid("File is smaller than indicated metadata size");
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ metadata_buffer_,
+ source->ReadAt(size - footer_size - metadata_length, metadata_length));
+
+ metadata_ = fbs::GetCTable(metadata_buffer_->data());
+ return ReadSchema();
+ }
+
+ Status ReadSchema() {
+ std::vector<std::shared_ptr<Field>> fields;
+ for (int i = 0; i < static_cast<int>(metadata_->columns()->size()); ++i) {
+ const fbs::Column* col = metadata_->columns()->Get(i);
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(
+ GetDataType(col->values(), col->metadata_type(), col->metadata(), &type));
+ fields.push_back(::arrow::field(col->name()->str(), type));
+ }
+ schema_ = ::arrow::schema(std::move(fields));
+ return Status::OK();
+ }
+
+ Status GetDataType(const fbs::PrimitiveArray* values, fbs::TypeMetadata metadata_type,
+ const void* metadata, std::shared_ptr<DataType>* out) {
+#define PRIMITIVE_CASE(CAP_TYPE, FACTORY_FUNC) \
+ case fbs::Type::CAP_TYPE: \
+ *out = FACTORY_FUNC(); \
+ break;
+
+ switch (metadata_type) {
+ case fbs::TypeMetadata::CategoryMetadata: {
+ auto meta = static_cast<const fbs::CategoryMetadata*>(metadata);
+
+ std::shared_ptr<DataType> index_type, dict_type;
+ RETURN_NOT_OK(GetDataType(values, fbs::TypeMetadata::NONE, nullptr, &index_type));
+ RETURN_NOT_OK(
+ GetDataType(meta->levels(), fbs::TypeMetadata::NONE, nullptr, &dict_type));
+ *out = dictionary(index_type, dict_type, meta->ordered());
+ break;
+ }
+ case fbs::TypeMetadata::TimestampMetadata: {
+ auto meta = static_cast<const fbs::TimestampMetadata*>(metadata);
+ TimeUnit::type unit = FromFlatbufferEnum(meta->unit());
+ std::string tz;
+        // A non-zero flatbuffer offset means the timezone string is present
+ if (meta->timezone() != 0) {
+ tz = meta->timezone()->str();
+ } else {
+ tz = "";
+ }
+ *out = timestamp(unit, tz);
+ } break;
+ case fbs::TypeMetadata::DateMetadata:
+ *out = date32();
+ break;
+ case fbs::TypeMetadata::TimeMetadata: {
+ auto meta = static_cast<const fbs::TimeMetadata*>(metadata);
+ *out = time32(FromFlatbufferEnum(meta->unit()));
+ } break;
+ default:
+ switch (values->type()) {
+ PRIMITIVE_CASE(BOOL, boolean);
+ PRIMITIVE_CASE(INT8, int8);
+ PRIMITIVE_CASE(INT16, int16);
+ PRIMITIVE_CASE(INT32, int32);
+ PRIMITIVE_CASE(INT64, int64);
+ PRIMITIVE_CASE(UINT8, uint8);
+ PRIMITIVE_CASE(UINT16, uint16);
+ PRIMITIVE_CASE(UINT32, uint32);
+ PRIMITIVE_CASE(UINT64, uint64);
+ PRIMITIVE_CASE(FLOAT, float32);
+ PRIMITIVE_CASE(DOUBLE, float64);
+ PRIMITIVE_CASE(UTF8, utf8);
+ PRIMITIVE_CASE(BINARY, binary);
+ PRIMITIVE_CASE(LARGE_UTF8, large_utf8);
+ PRIMITIVE_CASE(LARGE_BINARY, large_binary);
+ default:
+ return Status::Invalid("Unrecognized type");
+ }
+ break;
+ }
+
+#undef PRIMITIVE_CASE
+
+ return Status::OK();
+ }
+
+ int64_t GetOutputLength(int64_t nbytes) {
+ // XXX: Hack for Feather 0.3.0 for backwards compatibility with old files
+ // Size in-file of written byte buffer
+ if (version() < 2) {
+ // Feather files < 0.3.0
+ return nbytes;
+ } else {
+ return PaddedLength(nbytes);
+ }
+ }
+
+  // Retrieve a primitive array from the data source and store it in *out.
+  //
+  // The underlying buffers are obtained via source_->ReadAt, so whether the
+  // bytes are copied depends on the kind of input data source (which may or
+  // may not have memory-map-like semantics)
+ Status LoadValues(std::shared_ptr<DataType> type, const fbs::PrimitiveArray* meta,
+ fbs::TypeMetadata metadata_type, const void* metadata,
+ std::shared_ptr<ArrayData>* out) {
+ std::vector<std::shared_ptr<Buffer>> buffers;
+
+ // Buffer data from the source (may or may not perform a copy depending on
+ // input source)
+ ARROW_ASSIGN_OR_RAISE(auto buffer,
+ source_->ReadAt(meta->offset(), meta->total_bytes()));
+
+ int64_t offset = 0;
+
+ if (type->id() == Type::DICTIONARY) {
+ // Load the index type values
+ type = checked_cast<const DictionaryType&>(*type).index_type();
+ }
+
+ // If there are nulls, the null bitmask is first
+ if (meta->null_count() > 0) {
+ int64_t null_bitmap_size = GetOutputLength(BitUtil::BytesForBits(meta->length()));
+ buffers.push_back(SliceBuffer(buffer, offset, null_bitmap_size));
+ offset += null_bitmap_size;
+ } else {
+ buffers.push_back(nullptr);
+ }
+
+ if (is_binary_like(type->id())) {
+ int64_t offsets_size = GetOutputLength((meta->length() + 1) * sizeof(int32_t));
+ buffers.push_back(SliceBuffer(buffer, offset, offsets_size));
+ offset += offsets_size;
+ } else if (is_large_binary_like(type->id())) {
+ int64_t offsets_size = GetOutputLength((meta->length() + 1) * sizeof(int64_t));
+ buffers.push_back(SliceBuffer(buffer, offset, offsets_size));
+ offset += offsets_size;
+ }
+
+ buffers.push_back(SliceBuffer(buffer, offset, buffer->size() - offset));
+
+ *out = ArrayData::Make(type, meta->length(), std::move(buffers), meta->null_count());
+ return Status::OK();
+ }
+
+ int version() const override { return metadata_->version(); }
+ int64_t num_rows() const { return metadata_->num_rows(); }
+
+ std::shared_ptr<Schema> schema() const override { return schema_; }
+
+ Status GetDictionary(int field_index, std::shared_ptr<ArrayData>* out) {
+ const fbs::Column* col_meta = metadata_->columns()->Get(field_index);
+ auto dict_meta = col_meta->metadata_as<fbs::CategoryMetadata>();
+ const auto& dict_type =
+ checked_cast<const DictionaryType&>(*schema_->field(field_index)->type());
+
+ return LoadValues(dict_type.value_type(), dict_meta->levels(),
+ fbs::TypeMetadata::NONE, nullptr, out);
+ }
+
+ Status GetColumn(int field_index, std::shared_ptr<ChunkedArray>* out) {
+ const fbs::Column* col_meta = metadata_->columns()->Get(field_index);
+ std::shared_ptr<ArrayData> data;
+
+ auto type = schema_->field(field_index)->type();
+ RETURN_NOT_OK(LoadValues(type, col_meta->values(), col_meta->metadata_type(),
+ col_meta->metadata(), &data));
+
+ if (type->id() == Type::DICTIONARY) {
+ RETURN_NOT_OK(GetDictionary(field_index, &data->dictionary));
+ data->type = type;
+ }
+ *out = std::make_shared<ChunkedArray>(MakeArray(data));
+ return Status::OK();
+ }
+
+ Status Read(std::shared_ptr<Table>* out) override {
+ std::vector<std::shared_ptr<ChunkedArray>> columns;
+ for (int i = 0; i < static_cast<int>(metadata_->columns()->size()); ++i) {
+ columns.emplace_back();
+ RETURN_NOT_OK(GetColumn(i, &columns.back()));
+ }
+ *out = Table::Make(this->schema(), std::move(columns), this->num_rows());
+ return Status::OK();
+ }
+
+ Status Read(const std::vector<int>& indices, std::shared_ptr<Table>* out) override {
+ std::vector<std::shared_ptr<Field>> fields;
+ std::vector<std::shared_ptr<ChunkedArray>> columns;
+
+ auto my_schema = this->schema();
+ for (auto field_index : indices) {
+ if (field_index < 0 || field_index >= my_schema->num_fields()) {
+ return Status::Invalid("Field index ", field_index, " is out of bounds");
+ }
+ columns.emplace_back();
+ RETURN_NOT_OK(GetColumn(field_index, &columns.back()));
+ fields.push_back(my_schema->field(field_index));
+ }
+ *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns),
+ this->num_rows());
+ return Status::OK();
+ }
+
+ Status Read(const std::vector<std::string>& names,
+ std::shared_ptr<Table>* out) override {
+ std::vector<std::shared_ptr<Field>> fields;
+ std::vector<std::shared_ptr<ChunkedArray>> columns;
+
+ std::shared_ptr<Schema> sch = this->schema();
+ for (auto name : names) {
+ int field_index = sch->GetFieldIndex(name);
+ if (field_index == -1) {
+ return Status::Invalid("Field named ", name, " is not found");
+ }
+ columns.emplace_back();
+ RETURN_NOT_OK(GetColumn(field_index, &columns.back()));
+ fields.push_back(sch->field(field_index));
+ }
+ *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns),
+ this->num_rows());
+ return Status::OK();
+ }
+
+ private:
+ std::shared_ptr<io::RandomAccessFile> source_;
+ std::shared_ptr<Buffer> metadata_buffer_;
+ const fbs::CTable* metadata_;
+ std::shared_ptr<Schema> schema_;
+};
+
+// ----------------------------------------------------------------------
+// WriterV1
+
+struct ArrayMetadata {
+ fbs::Type type;
+ int64_t offset;
+ int64_t length;
+ int64_t null_count;
+ int64_t total_bytes;
+};
+
+#define TO_FLATBUFFER_CASE(TYPE) \
+ case Type::TYPE: \
+ return fbs::Type::TYPE;
+
+Result<fbs::Type> ToFlatbufferType(const DataType& type) {
+ switch (type.id()) {
+ TO_FLATBUFFER_CASE(BOOL);
+ TO_FLATBUFFER_CASE(INT8);
+ TO_FLATBUFFER_CASE(INT16);
+ TO_FLATBUFFER_CASE(INT32);
+ TO_FLATBUFFER_CASE(INT64);
+ TO_FLATBUFFER_CASE(UINT8);
+ TO_FLATBUFFER_CASE(UINT16);
+ TO_FLATBUFFER_CASE(UINT32);
+ TO_FLATBUFFER_CASE(UINT64);
+ TO_FLATBUFFER_CASE(FLOAT);
+ TO_FLATBUFFER_CASE(DOUBLE);
+ TO_FLATBUFFER_CASE(LARGE_BINARY);
+ TO_FLATBUFFER_CASE(BINARY);
+ case Type::STRING:
+ return fbs::Type::UTF8;
+ case Type::LARGE_STRING:
+ return fbs::Type::LARGE_UTF8;
+ case Type::DATE32:
+ return fbs::Type::INT32;
+ case Type::TIMESTAMP:
+ return fbs::Type::INT64;
+ case Type::TIME32:
+ return fbs::Type::INT32;
+ case Type::TIME64:
+ return fbs::Type::INT64;
+ default:
+ return Status::TypeError("Unsupported Feather V1 type: ", type.ToString(),
+ ". Use V2 format to serialize all Arrow types.");
+ }
+}
+
+inline flatbuffers::Offset<fbs::PrimitiveArray> GetPrimitiveArray(
+ FBB& fbb, const ArrayMetadata& array) {
+ return fbs::CreatePrimitiveArray(fbb, array.type, fbs::Encoding::PLAIN, array.offset,
+ array.length, array.null_count, array.total_bytes);
+}
+
+// Convert Feather enums to Flatbuffer enums
+inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) {
+ return static_cast<fbs::TimeUnit>(static_cast<int>(unit));
+}
+
+const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = {
+ fbs::TypeMetadata::NONE, // PRIMITIVE
+ fbs::TypeMetadata::CategoryMetadata, // CATEGORY
+ fbs::TypeMetadata::TimestampMetadata, // TIMESTAMP
+ fbs::TypeMetadata::DateMetadata, // DATE
+ fbs::TypeMetadata::TimeMetadata // TIME
+};
+
+inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) {
+ return COLUMN_TYPE_ENUM_MAPPING[column_type];
+}
+
+struct ColumnMetadata {
+ flatbuffers::Offset<void> WriteMetadata(FBB& fbb) { // NOLINT
+ switch (this->meta_type) {
+ case ColumnType::PRIMITIVE:
+ // flatbuffer void
+ return 0;
+ case ColumnType::CATEGORY: {
+ auto cat_meta = fbs::CreateCategoryMetadata(
+ fbb, GetPrimitiveArray(fbb, this->category_levels), this->category_ordered);
+ return cat_meta.Union();
+ }
+ case ColumnType::TIMESTAMP: {
+ // flatbuffer void
+ flatbuffers::Offset<flatbuffers::String> tz = 0;
+ if (!this->timezone.empty()) {
+ tz = fbb.CreateString(this->timezone);
+ }
+
+ auto ts_meta =
+ fbs::CreateTimestampMetadata(fbb, ToFlatbufferEnum(this->temporal_unit), tz);
+ return ts_meta.Union();
+ }
+ case ColumnType::DATE: {
+ auto date_meta = fbs::CreateDateMetadata(fbb);
+ return date_meta.Union();
+ }
+ case ColumnType::TIME: {
+ auto time_meta =
+ fbs::CreateTimeMetadata(fbb, ToFlatbufferEnum(this->temporal_unit));
+ return time_meta.Union();
+ }
+ default:
+ // null
+ DCHECK(false);
+ return 0;
+ }
+ }
+
+ ArrayMetadata values;
+ ColumnType::type meta_type;
+
+ ArrayMetadata category_levels;
+ bool category_ordered;
+
+ TimeUnit::type temporal_unit;
+
+  // A timezone name known to the Olson timezone database. For display
+  // purposes only, since the actual data is all stored in UTC
+ std::string timezone;
+};
+
+Status WriteArrayV1(const Array& values, io::OutputStream* dst, ArrayMetadata* meta);
+
+struct ArrayWriterV1 {
+ const Array& values;
+ io::OutputStream* dst;
+ ArrayMetadata* meta;
+
+ Status WriteBuffer(const uint8_t* buffer, int64_t length, int64_t bit_offset) {
+ int64_t bytes_written = 0;
+ if (buffer) {
+ RETURN_NOT_OK(
+ WritePaddedWithOffset(dst, buffer, bit_offset, length, &bytes_written));
+ } else {
+ RETURN_NOT_OK(WritePaddedBlank(dst, length, &bytes_written));
+ }
+ meta->total_bytes += bytes_written;
+ return Status::OK();
+ }
+
+ template <typename T>
+ typename std::enable_if<
+ is_nested_type<T>::value || is_null_type<T>::value || is_decimal_type<T>::value ||
+ std::is_same<DictionaryType, T>::value || is_duration_type<T>::value ||
+ is_interval_type<T>::value || is_fixed_size_binary_type<T>::value ||
+ std::is_same<Date64Type, T>::value || std::is_same<Time64Type, T>::value ||
+ std::is_same<ExtensionType, T>::value,
+ Status>::type
+ Visit(const T& type) {
+ return Status::NotImplemented(type.ToString());
+ }
+
+ template <typename T>
+ typename std::enable_if<is_number_type<T>::value ||
+ std::is_same<Date32Type, T>::value ||
+ std::is_same<Time32Type, T>::value ||
+ is_timestamp_type<T>::value || is_boolean_type<T>::value,
+ Status>::type
+ Visit(const T&) {
+ const auto& prim_values = checked_cast<const PrimitiveArray&>(values);
+ const auto& fw_type = checked_cast<const FixedWidthType&>(*values.type());
+
+ if (prim_values.values()) {
+ const uint8_t* buffer =
+ prim_values.values()->data() + (prim_values.offset() * fw_type.bit_width() / 8);
+ int64_t bit_offset = (prim_values.offset() * fw_type.bit_width()) % 8;
+ return WriteBuffer(buffer,
+ BitUtil::BytesForBits(values.length() * fw_type.bit_width()),
+ bit_offset);
+    } else {
+      return Status::OK();
+    }
+ }
+
+ template <typename T>
+ enable_if_base_binary<T, Status> Visit(const T&) {
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+ const auto& ty_values = checked_cast<const ArrayType&>(values);
+
+ using offset_type = typename T::offset_type;
+ const offset_type* offsets_data = nullptr;
+ int64_t values_bytes = 0;
+ if (ty_values.value_offsets()) {
+ offsets_data = ty_values.raw_value_offsets();
+      // All of the data has to be written because offset shifting, as done
+      // in the IPC format, is not implemented here
+ values_bytes = offsets_data[values.length()];
+ }
+ RETURN_NOT_OK(WriteBuffer(reinterpret_cast<const uint8_t*>(offsets_data),
+ sizeof(offset_type) * (values.length() + 1),
+ /*bit_offset=*/0));
+
+ const uint8_t* values_buffer = nullptr;
+ if (ty_values.value_data()) {
+ values_buffer = ty_values.value_data()->data();
+ }
+ return WriteBuffer(values_buffer, values_bytes, /*bit_offset=*/0);
+ }
+
+ Status Write() {
+ if (values.type_id() == Type::DICTIONARY) {
+ return WriteArrayV1(*(checked_cast<const DictionaryArray&>(values).indices()), dst,
+ meta);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(meta->type, ToFlatbufferType(*values.type()));
+ ARROW_ASSIGN_OR_RAISE(meta->offset, dst->Tell());
+ meta->length = values.length();
+ meta->null_count = values.null_count();
+ meta->total_bytes = 0;
+
+ // Write the null bitmask
+ if (values.null_count() > 0) {
+ RETURN_NOT_OK(WriteBuffer(values.null_bitmap_data(),
+ BitUtil::BytesForBits(values.length()), values.offset()));
+ }
+ // Write data buffer(s)
+ return VisitTypeInline(*values.type(), this);
+ }
+};
+
+Status WriteArrayV1(const Array& values, io::OutputStream* dst, ArrayMetadata* meta) {
+ std::shared_ptr<Array> sanitized;
+ if (values.type_id() == Type::NA) {
+ // As long as R doesn't support NA, we write this as a StringColumn
+ // to ensure stable roundtrips.
+ sanitized = std::make_shared<StringArray>(values.length(), nullptr, nullptr,
+ values.null_bitmap(), values.null_count());
+ } else {
+ sanitized = MakeArray(values.data());
+ }
+ ArrayWriterV1 visitor{*sanitized, dst, meta};
+ return visitor.Write();
+}
+
+Status WriteColumnV1(const ChunkedArray& values, io::OutputStream* dst,
+ ColumnMetadata* out) {
+ if (values.num_chunks() > 1) {
+ return Status::Invalid("Writing chunked arrays not supported in Feather V1");
+ }
+ const Array& chunk = *values.chunk(0);
+ RETURN_NOT_OK(WriteArrayV1(chunk, dst, &out->values));
+ switch (chunk.type_id()) {
+ case Type::DICTIONARY: {
+ out->meta_type = ColumnType::CATEGORY;
+ auto dictionary = checked_cast<const DictionaryArray&>(chunk).dictionary();
+ RETURN_NOT_OK(WriteArrayV1(*dictionary, dst, &out->category_levels));
+ out->category_ordered =
+ checked_cast<const DictionaryType&>(*chunk.type()).ordered();
+ } break;
+ case Type::DATE32:
+ out->meta_type = ColumnType::DATE;
+ break;
+ case Type::TIME32: {
+ out->meta_type = ColumnType::TIME;
+ out->temporal_unit = checked_cast<const Time32Type&>(*chunk.type()).unit();
+ } break;
+ case Type::TIMESTAMP: {
+ const auto& ts_type = checked_cast<const TimestampType&>(*chunk.type());
+ out->meta_type = ColumnType::TIMESTAMP;
+ out->temporal_unit = ts_type.unit();
+ out->timezone = ts_type.timezone();
+ } break;
+ default:
+ out->meta_type = ColumnType::PRIMITIVE;
+ break;
+ }
+ return Status::OK();
+}
+
+Status WriteFeatherV1(const Table& table, io::OutputStream* dst) {
+ // Preamble
+ int64_t bytes_written;
+ RETURN_NOT_OK(WritePadded(dst, reinterpret_cast<const uint8_t*>(kFeatherV1MagicBytes),
+ strlen(kFeatherV1MagicBytes), &bytes_written));
+
+ // Write columns
+ flatbuffers::FlatBufferBuilder fbb;
+ std::vector<flatbuffers::Offset<fbs::Column>> fb_columns;
+ for (int i = 0; i < table.num_columns(); ++i) {
+ ColumnMetadata col;
+ RETURN_NOT_OK(WriteColumnV1(*table.column(i), dst, &col));
+ auto fb_column = fbs::CreateColumn(
+ fbb, fbb.CreateString(table.field(i)->name()), GetPrimitiveArray(fbb, col.values),
+ ToFlatbufferEnum(col.meta_type), col.WriteMetadata(fbb),
+ /*user_metadata=*/0);
+ fb_columns.push_back(fb_column);
+ }
+
+ // Finalize file footer
+ auto root = fbs::CreateCTable(fbb, /*description=*/0, table.num_rows(),
+ fbb.CreateVector(fb_columns), kFeatherV1Version,
+ /*metadata=*/0);
+ fbb.Finish(root);
+ auto buffer = std::make_shared<Buffer>(fbb.GetBufferPointer(),
+ static_cast<int64_t>(fbb.GetSize()));
+
+  // Write the file metadata
+ RETURN_NOT_OK(WritePadded(dst, buffer->data(), buffer->size(), &bytes_written));
+ uint32_t metadata_size = static_cast<uint32_t>(bytes_written);
+
+ // Footer: metadata length, magic bytes
+ RETURN_NOT_OK(dst->Write(&metadata_size, sizeof(uint32_t)));
+ return dst->Write(kFeatherV1MagicBytes, strlen(kFeatherV1MagicBytes));
+}
+
+// ----------------------------------------------------------------------
+// Reader V2
+
+class ReaderV2 : public Reader {
+ public:
+ Status Open(const std::shared_ptr<io::RandomAccessFile>& source) {
+ source_ = source;
+ ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchFileReader::Open(source_));
+ schema_ = reader->schema();
+ return Status::OK();
+ }
+
+ int version() const override { return kFeatherV2Version; }
+
+ std::shared_ptr<Schema> schema() const override { return schema_; }
+
+ Status Read(const IpcReadOptions& options, std::shared_ptr<Table>* out) {
+ ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchFileReader::Open(source_, options));
+ RecordBatchVector batches(reader->num_record_batches());
+ for (int i = 0; i < reader->num_record_batches(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(batches[i], reader->ReadRecordBatch(i));
+ }
+
+ return Table::FromRecordBatches(reader->schema(), batches).Value(out);
+ }
+
+ Status Read(std::shared_ptr<Table>* out) override {
+ return Read(IpcReadOptions::Defaults(), out);
+ }
+
+ Status Read(const std::vector<int>& indices, std::shared_ptr<Table>* out) override {
+ auto options = IpcReadOptions::Defaults();
+ options.included_fields = indices;
+ return Read(options, out);
+ }
+
+ Status Read(const std::vector<std::string>& names,
+ std::shared_ptr<Table>* out) override {
+ std::vector<int> indices;
+ std::shared_ptr<Schema> sch = this->schema();
+ for (auto name : names) {
+ int field_index = sch->GetFieldIndex(name);
+ if (field_index == -1) {
+ return Status::Invalid("Field named ", name, " is not found");
+ }
+ indices.push_back(field_index);
+ }
+ return Read(indices, out);
+ }
+
+ private:
+ std::shared_ptr<io::RandomAccessFile> source_;
+ std::shared_ptr<Schema> schema_;
+};
+
+} // namespace
+
+Result<std::shared_ptr<Reader>> Reader::Open(
+ const std::shared_ptr<io::RandomAccessFile>& source) {
+  // Guard against the pathological case where the file is smaller than the
+  // header and footer combined
+ ARROW_ASSIGN_OR_RAISE(int64_t size, source->GetSize());
+ if (size < /* 2 * 4 + 4 */ 12) {
+ return Status::Invalid("File is too small to be a well-formed file");
+ }
+
+ // Determine what kind of file we have. 6 is the max of len(FEA1) and
+ // len(ARROW1)
+ constexpr int magic_size = 6;
+ ARROW_ASSIGN_OR_RAISE(auto buffer, source->ReadAt(0, magic_size));
+
+ if (memcmp(buffer->data(), kFeatherV1MagicBytes, strlen(kFeatherV1MagicBytes)) == 0) {
+ std::shared_ptr<ReaderV1> result = std::make_shared<ReaderV1>();
+ RETURN_NOT_OK(result->Open(source));
+ return result;
+ } else if (memcmp(buffer->data(), internal::kArrowMagicBytes,
+ strlen(internal::kArrowMagicBytes)) == 0) {
+ std::shared_ptr<ReaderV2> result = std::make_shared<ReaderV2>();
+ RETURN_NOT_OK(result->Open(source));
+ return result;
+ } else {
+ return Status::Invalid("Not a Feather V1 or Arrow IPC file");
+ }
+}
+
+WriteProperties WriteProperties::Defaults() {
+ WriteProperties result;
+#ifdef ARROW_WITH_LZ4
+ result.compression = Compression::LZ4_FRAME;
+#else
+ result.compression = Compression::UNCOMPRESSED;
+#endif
+ return result;
+}
+
+Status WriteTable(const Table& table, io::OutputStream* dst,
+ const WriteProperties& properties) {
+ if (properties.version == kFeatherV1Version) {
+ return WriteFeatherV1(table, dst);
+ } else {
+ IpcWriteOptions ipc_options = IpcWriteOptions::Defaults();
+ ipc_options.unify_dictionaries = true;
+ ipc_options.allow_64bit = true;
+ ARROW_ASSIGN_OR_RAISE(
+ ipc_options.codec,
+ util::Codec::Create(properties.compression, properties.compression_level));
+
+ std::shared_ptr<RecordBatchWriter> writer;
+ ARROW_ASSIGN_OR_RAISE(writer, MakeFileWriter(dst, table.schema(), ipc_options));
+ RETURN_NOT_OK(writer->WriteTable(table, properties.chunksize));
+ return writer->Close();
+ }
+}
+
+} // namespace feather
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h
new file mode 100644
index 00000000000..a32ff6d0a5a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h
@@ -0,0 +1,140 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Public API for the "Feather" file format, originally created at
+// http://github.com/wesm/feather
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Schema;
+class Status;
+class Table;
+
+namespace io {
+
+class OutputStream;
+class RandomAccessFile;
+
+} // namespace io
+
+namespace ipc {
+namespace feather {
+
+static constexpr const int kFeatherV1Version = 2;
+static constexpr const int kFeatherV2Version = 3;
+
+// ----------------------------------------------------------------------
+// Metadata accessor classes
+
+/// \class Reader
+/// \brief An interface for reading columns from Feather files
+class ARROW_EXPORT Reader {
+ public:
+ virtual ~Reader() = default;
+
+ /// \brief Open a Feather file from a RandomAccessFile interface
+ ///
+ /// \param[in] source a RandomAccessFile instance
+ /// \return the table reader
+ static Result<std::shared_ptr<Reader>> Open(
+ const std::shared_ptr<io::RandomAccessFile>& source);
+
+ /// \brief Return the version number of the Feather file
+ virtual int version() const = 0;
+
+ virtual std::shared_ptr<Schema> schema() const = 0;
+
+ /// \brief Read all columns from the file as an arrow::Table.
+ ///
+ /// \param[out] out the returned table
+ /// \return Status
+ ///
+ /// This function is zero-copy if the file source supports zero-copy reads
+ virtual Status Read(std::shared_ptr<Table>* out) = 0;
+
+ /// \brief Read only the specified columns from the file as an arrow::Table.
+ ///
+ /// \param[in] indices the column indices to read
+ /// \param[out] out the returned table
+ /// \return Status
+ ///
+ /// This function is zero-copy if the file source supports zero-copy reads
+ virtual Status Read(const std::vector<int>& indices, std::shared_ptr<Table>* out) = 0;
+
+ /// \brief Read only the specified columns from the file as an arrow::Table.
+ ///
+ /// \param[in] names the column names to read
+ /// \param[out] out the returned table
+ /// \return Status
+ ///
+ /// This function is zero-copy if the file source supports zero-copy reads
+ virtual Status Read(const std::vector<std::string>& names,
+ std::shared_ptr<Table>* out) = 0;
+};
+
+struct ARROW_EXPORT WriteProperties {
+ static WriteProperties Defaults();
+
+ static WriteProperties DefaultsV1() {
+ WriteProperties props = Defaults();
+ props.version = kFeatherV1Version;
+ return props;
+ }
+
+ /// Feather file version number
+ ///
+ /// version 2: "Feather V1" Apache Arrow <= 0.16.0
+ /// version 3: "Feather V2" Apache Arrow > 0.16.0
+ int version = kFeatherV2Version;
+
+ // Parameters for Feather V2 only
+
+ /// Number of rows per intra-file chunk. Use smaller chunksize when you need
+ /// faster random row access
+ int64_t chunksize = 1LL << 16;
+
+ /// Compression type to use. Only UNCOMPRESSED, LZ4_FRAME, and ZSTD are
+ /// supported. The default compression returned by Defaults() is LZ4 if the
+ /// project is built with support for it, otherwise
+ /// UNCOMPRESSED. UNCOMPRESSED is set as the object default here so that if
+ /// WriteProperties::Defaults() is not used, the default constructor for
+ /// WriteProperties will work regardless of the options used to build the C++
+ /// project.
+ Compression::type compression = Compression::UNCOMPRESSED;
+
+ /// Compressor-specific compression level
+ int compression_level = ::arrow::util::kUseDefaultCompressionLevel;
+};
+
+ARROW_EXPORT
+Status WriteTable(const Table& table, io::OutputStream* dst,
+ const WriteProperties& properties = WriteProperties::Defaults());
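+
+// A minimal write/read sketch (streams, file handles and the table contents
+// are illustrative):
+//
+//   auto props = WriteProperties::Defaults();
+//   props.compression = Compression::UNCOMPRESSED;
+//   ARROW_RETURN_NOT_OK(WriteTable(*table, output_stream.get(), props));
+//
+//   ARROW_ASSIGN_OR_RAISE(auto reader, Reader::Open(input_file));
+//   std::shared_ptr<Table> read_table;
+//   ARROW_RETURN_NOT_OK(reader->Read(&read_table));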
+
+} // namespace feather
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h
new file mode 100644
index 00000000000..4dd3a664aa6
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implement a simple JSON representation format for arrays
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/status.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class DataType;
+
+namespace ipc {
+namespace internal {
+namespace json {
+
+ARROW_EXPORT
+Status ArrayFromJSON(const std::shared_ptr<DataType>&, const std::string& json,
+ std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status ArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view json,
+ std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status ArrayFromJSON(const std::shared_ptr<DataType>&, const char* json,
+ std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status DictArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view indices_json,
+ util::string_view dictionary_json, std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status ScalarFromJSON(const std::shared_ptr<DataType>&, util::string_view json,
+ std::shared_ptr<Scalar>* out);
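+
+// A minimal sketch (the type and JSON payload are illustrative):
+//
+//   std::shared_ptr<Array> arr;
+//   ARROW_RETURN_NOT_OK(ArrayFromJSON(::arrow::int32(), "[1, 2, null]", &arr));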
+
+} // namespace json
+} // namespace internal
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc
new file mode 100644
index 00000000000..197556efcea
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc
@@ -0,0 +1,931 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/message.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/device.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/ipc/metadata_internal.h"
+#include "arrow/ipc/options.h"
+#include "arrow/ipc/util.h"
+#include "arrow/status.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/future.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ubsan.h"
+
+#include "generated/Message_generated.h"
+
+namespace arrow {
+
+class KeyValueMetadata;
+class MemoryPool;
+
+namespace ipc {
+
+class Message::MessageImpl {
+ public:
+ explicit MessageImpl(std::shared_ptr<Buffer> metadata, std::shared_ptr<Buffer> body)
+ : metadata_(std::move(metadata)), message_(nullptr), body_(std::move(body)) {}
+
+ Status Open() {
+ RETURN_NOT_OK(
+ internal::VerifyMessage(metadata_->data(), metadata_->size(), &message_));
+
+ // Check that the metadata version is supported
+ if (message_->version() < internal::kMinMetadataVersion) {
+ return Status::Invalid("Old metadata version not supported");
+ }
+
+ if (message_->version() > flatbuf::MetadataVersion::MAX) {
+ return Status::Invalid("Unsupported future MetadataVersion: ",
+ static_cast<int16_t>(message_->version()));
+ }
+
+ if (message_->custom_metadata() != nullptr) {
+      // Deserialize the custom metadata key/value pairs from Flatbuffers
+ std::shared_ptr<KeyValueMetadata> md;
+ RETURN_NOT_OK(internal::GetKeyValueMetadata(message_->custom_metadata(), &md));
+ custom_metadata_ = std::move(md); // const-ify
+ }
+
+ return Status::OK();
+ }
+
+ MessageType type() const {
+ switch (message_->header_type()) {
+ case flatbuf::MessageHeader::Schema:
+ return MessageType::SCHEMA;
+ case flatbuf::MessageHeader::DictionaryBatch:
+ return MessageType::DICTIONARY_BATCH;
+ case flatbuf::MessageHeader::RecordBatch:
+ return MessageType::RECORD_BATCH;
+ case flatbuf::MessageHeader::Tensor:
+ return MessageType::TENSOR;
+ case flatbuf::MessageHeader::SparseTensor:
+ return MessageType::SPARSE_TENSOR;
+ default:
+ return MessageType::NONE;
+ }
+ }
+
+ MetadataVersion version() const {
+ return internal::GetMetadataVersion(message_->version());
+ }
+
+ const void* header() const { return message_->header(); }
+
+ int64_t body_length() const { return message_->bodyLength(); }
+
+ std::shared_ptr<Buffer> body() const { return body_; }
+
+ std::shared_ptr<Buffer> metadata() const { return metadata_; }
+
+ const std::shared_ptr<const KeyValueMetadata>& custom_metadata() const {
+ return custom_metadata_;
+ }
+
+ private:
+ // The Flatbuffer metadata
+ std::shared_ptr<Buffer> metadata_;
+ const flatbuf::Message* message_;
+
+ // The reconstructed custom_metadata field from the Message Flatbuffer
+ std::shared_ptr<const KeyValueMetadata> custom_metadata_;
+
+ // The message body, if any
+ std::shared_ptr<Buffer> body_;
+};
+
+Message::Message(std::shared_ptr<Buffer> metadata, std::shared_ptr<Buffer> body) {
+ impl_.reset(new MessageImpl(std::move(metadata), std::move(body)));
+}
+
+Result<std::unique_ptr<Message>> Message::Open(std::shared_ptr<Buffer> metadata,
+ std::shared_ptr<Buffer> body) {
+ std::unique_ptr<Message> result(new Message(std::move(metadata), std::move(body)));
+ RETURN_NOT_OK(result->impl_->Open());
+ return std::move(result);
+}
+
+Message::~Message() {}
+
+std::shared_ptr<Buffer> Message::body() const { return impl_->body(); }
+
+int64_t Message::body_length() const { return impl_->body_length(); }
+
+std::shared_ptr<Buffer> Message::metadata() const { return impl_->metadata(); }
+
+MessageType Message::type() const { return impl_->type(); }
+
+MetadataVersion Message::metadata_version() const { return impl_->version(); }
+
+const void* Message::header() const { return impl_->header(); }
+
+const std::shared_ptr<const KeyValueMetadata>& Message::custom_metadata() const {
+ return impl_->custom_metadata();
+}
+
+bool Message::Equals(const Message& other) const {
+ int64_t metadata_bytes = std::min(metadata()->size(), other.metadata()->size());
+
+ if (!metadata()->Equals(*other.metadata(), metadata_bytes)) {
+ return false;
+ }
+
+ // Compare bodies, if they have them
+ auto this_body = body();
+ auto other_body = other.body();
+
+ const bool this_has_body = (this_body != nullptr) && (this_body->size() > 0);
+ const bool other_has_body = (other_body != nullptr) && (other_body->size() > 0);
+
+ if (this_has_body && other_has_body) {
+ return this_body->Equals(*other_body);
+ } else if (this_has_body ^ other_has_body) {
+ // One has a body but not the other
+ return false;
+ } else {
+ // Neither has a body
+ return true;
+ }
+}
+
+Status MaybeAlignMetadata(std::shared_ptr<Buffer>* metadata) {
+ if (reinterpret_cast<uintptr_t>((*metadata)->data()) % 8 != 0) {
+ // If the metadata memory is not aligned, we copy it here to avoid
+ // potential UBSAN issues from Flatbuffers
+ ARROW_ASSIGN_OR_RAISE(*metadata, (*metadata)->CopySlice(0, (*metadata)->size()));
+ }
+ return Status::OK();
+}
+
+Status CheckMetadataAndGetBodyLength(const Buffer& metadata, int64_t* body_length) {
+ const flatbuf::Message* fb_message = nullptr;
+ RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &fb_message));
+ *body_length = fb_message->bodyLength();
+ if (*body_length < 0) {
+ return Status::IOError("Invalid IPC message: negative bodyLength");
+ }
+ return Status::OK();
+}
+
+Result<std::unique_ptr<Message>> Message::ReadFrom(std::shared_ptr<Buffer> metadata,
+ io::InputStream* stream) {
+ std::unique_ptr<Message> result;
+ auto listener = std::make_shared<AssignMessageDecoderListener>(&result);
+ MessageDecoder decoder(listener, MessageDecoder::State::METADATA, metadata->size());
+ ARROW_RETURN_NOT_OK(decoder.Consume(metadata));
+
+ ARROW_ASSIGN_OR_RAISE(auto body, stream->Read(decoder.next_required_size()));
+ if (body->size() < decoder.next_required_size()) {
+ return Status::IOError("Expected to be able to read ", decoder.next_required_size(),
+ " bytes for message body, got ", body->size());
+ }
+ RETURN_NOT_OK(decoder.Consume(body));
+ return std::move(result);
+}
+
+Result<std::unique_ptr<Message>> Message::ReadFrom(const int64_t offset,
+ std::shared_ptr<Buffer> metadata,
+ io::RandomAccessFile* file) {
+ std::unique_ptr<Message> result;
+ auto listener = std::make_shared<AssignMessageDecoderListener>(&result);
+ MessageDecoder decoder(listener, MessageDecoder::State::METADATA, metadata->size());
+ ARROW_RETURN_NOT_OK(decoder.Consume(metadata));
+
+ ARROW_ASSIGN_OR_RAISE(auto body, file->ReadAt(offset, decoder.next_required_size()));
+ if (body->size() < decoder.next_required_size()) {
+ return Status::IOError("Expected to be able to read ", decoder.next_required_size(),
+ " bytes for message body, got ", body->size());
+ }
+ RETURN_NOT_OK(decoder.Consume(body));
+ return std::move(result);
+}
+
+Status WritePadding(io::OutputStream* stream, int64_t nbytes) {
+ while (nbytes > 0) {
+ const int64_t bytes_to_write = std::min<int64_t>(nbytes, kArrowAlignment);
+ RETURN_NOT_OK(stream->Write(kPaddingBytes, bytes_to_write));
+ nbytes -= bytes_to_write;
+ }
+ return Status::OK();
+}
+
+Status Message::SerializeTo(io::OutputStream* stream, const IpcWriteOptions& options,
+ int64_t* output_length) const {
+ int32_t metadata_length = 0;
+ RETURN_NOT_OK(WriteMessage(*metadata(), options, stream, &metadata_length));
+
+ *output_length = metadata_length;
+
+ auto body_buffer = body();
+ if (body_buffer) {
+ RETURN_NOT_OK(stream->Write(body_buffer));
+ *output_length += body_buffer->size();
+
+ DCHECK_GE(this->body_length(), body_buffer->size());
+
+ int64_t remainder = this->body_length() - body_buffer->size();
+ RETURN_NOT_OK(WritePadding(stream, remainder));
+ *output_length += remainder;
+ }
+ return Status::OK();
+}
+
+bool Message::Verify() const {
+ const flatbuf::Message* unused;
+ return internal::VerifyMessage(metadata()->data(), metadata()->size(), &unused).ok();
+}
+
+std::string FormatMessageType(MessageType type) {
+ switch (type) {
+ case MessageType::SCHEMA:
+ return "schema";
+ case MessageType::RECORD_BATCH:
+ return "record batch";
+ case MessageType::DICTIONARY_BATCH:
+ return "dictionary";
+ case MessageType::TENSOR:
+ return "tensor";
+ case MessageType::SPARSE_TENSOR:
+ return "sparse tensor";
+ default:
+ break;
+ }
+ return "unknown";
+}
+
+Result<std::unique_ptr<Message>> ReadMessage(int64_t offset, int32_t metadata_length,
+ io::RandomAccessFile* file) {
+ std::unique_ptr<Message> result;
+ auto listener = std::make_shared<AssignMessageDecoderListener>(&result);
+ MessageDecoder decoder(listener);
+
+ if (metadata_length < decoder.next_required_size()) {
+ return Status::Invalid("metadata_length should be at least ",
+ decoder.next_required_size());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto metadata, file->ReadAt(offset, metadata_length));
+ if (metadata->size() < metadata_length) {
+ return Status::Invalid("Expected to read ", metadata_length,
+ " metadata bytes but got ", metadata->size());
+ }
+ ARROW_RETURN_NOT_OK(decoder.Consume(metadata));
+
+ switch (decoder.state()) {
+ case MessageDecoder::State::INITIAL:
+ return std::move(result);
+ case MessageDecoder::State::METADATA_LENGTH:
+ return Status::Invalid("metadata length is missing. File offset: ", offset,
+ ", metadata length: ", metadata_length);
+ case MessageDecoder::State::METADATA:
+ return Status::Invalid("flatbuffer size ", decoder.next_required_size(),
+ " invalid. File offset: ", offset,
+ ", metadata length: ", metadata_length);
+ case MessageDecoder::State::BODY: {
+ ARROW_ASSIGN_OR_RAISE(auto body, file->ReadAt(offset + metadata_length,
+ decoder.next_required_size()));
+ if (body->size() < decoder.next_required_size()) {
+ return Status::IOError("Expected to be able to read ",
+ decoder.next_required_size(),
+ " bytes for message body, got ", body->size());
+ }
+ RETURN_NOT_OK(decoder.Consume(body));
+ return std::move(result);
+ }
+ case MessageDecoder::State::EOS:
+ return Status::Invalid("Unexpected empty message in IPC file format");
+ default:
+ return Status::Invalid("Unexpected state: ", decoder.state());
+ }
+}
+
+Future<std::shared_ptr<Message>> ReadMessageAsync(int64_t offset, int32_t metadata_length,
+ int64_t body_length,
+ io::RandomAccessFile* file,
+ const io::IOContext& context) {
+ struct State {
+ std::unique_ptr<Message> result;
+ std::shared_ptr<MessageDecoderListener> listener;
+ std::shared_ptr<MessageDecoder> decoder;
+ };
+ auto state = std::make_shared<State>();
+ state->listener = std::make_shared<AssignMessageDecoderListener>(&state->result);
+ state->decoder = std::make_shared<MessageDecoder>(state->listener);
+
+ if (metadata_length < state->decoder->next_required_size()) {
+ return Status::Invalid("metadata_length should be at least ",
+ state->decoder->next_required_size());
+ }
+ return file->ReadAsync(context, offset, metadata_length + body_length)
+ .Then([=](std::shared_ptr<Buffer> metadata) -> Result<std::shared_ptr<Message>> {
+ if (metadata->size() < metadata_length) {
+ return Status::Invalid("Expected to read ", metadata_length,
+ " metadata bytes but got ", metadata->size());
+ }
+ ARROW_RETURN_NOT_OK(
+ state->decoder->Consume(SliceBuffer(metadata, 0, metadata_length)));
+ switch (state->decoder->state()) {
+ case MessageDecoder::State::INITIAL:
+ return std::move(state->result);
+ case MessageDecoder::State::METADATA_LENGTH:
+ return Status::Invalid("metadata length is missing. File offset: ", offset,
+ ", metadata length: ", metadata_length);
+ case MessageDecoder::State::METADATA:
+ return Status::Invalid("flatbuffer size ",
+ state->decoder->next_required_size(),
+ " invalid. File offset: ", offset,
+ ", metadata length: ", metadata_length);
+ case MessageDecoder::State::BODY: {
+ auto body = SliceBuffer(metadata, metadata_length, body_length);
+ if (body->size() < state->decoder->next_required_size()) {
+ return Status::IOError("Expected to be able to read ",
+ state->decoder->next_required_size(),
+ " bytes for message body, got ", body->size());
+ }
+ RETURN_NOT_OK(state->decoder->Consume(body));
+ return std::move(state->result);
+ }
+ case MessageDecoder::State::EOS:
+ return Status::Invalid("Unexpected empty message in IPC file format");
+ default:
+ return Status::Invalid("Unexpected state: ", state->decoder->state());
+ }
+ });
+}
+
+Status AlignStream(io::InputStream* stream, int32_t alignment) {
+ ARROW_ASSIGN_OR_RAISE(int64_t position, stream->Tell());
+ return stream->Advance(PaddedLength(position, alignment) - position);
+}
+
+Status AlignStream(io::OutputStream* stream, int32_t alignment) {
+ ARROW_ASSIGN_OR_RAISE(int64_t position, stream->Tell());
+ int64_t remainder = PaddedLength(position, alignment) - position;
+ if (remainder > 0) {
+ return stream->Write(kPaddingBytes, remainder);
+ }
+ return Status::OK();
+}
+
+Status CheckAligned(io::FileInterface* stream, int32_t alignment) {
+ ARROW_ASSIGN_OR_RAISE(int64_t position, stream->Tell());
+ if (position % alignment != 0) {
+ return Status::Invalid("Stream is not aligned pos: ", position,
+ " alignment: ", alignment);
+ } else {
+ return Status::OK();
+ }
+}
+
+Status DecodeMessage(MessageDecoder* decoder, io::InputStream* file) {
+ if (decoder->state() == MessageDecoder::State::INITIAL) {
+ uint8_t continuation[sizeof(int32_t)];
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, file->Read(sizeof(int32_t), &continuation));
+ if (bytes_read == 0) {
+ // EOS without indication
+ return Status::OK();
+ } else if (bytes_read != decoder->next_required_size()) {
+ return Status::Invalid("Corrupted message, only ", bytes_read, " bytes available");
+ }
+ ARROW_RETURN_NOT_OK(decoder->Consume(continuation, bytes_read));
+ }
+
+ if (decoder->state() == MessageDecoder::State::METADATA_LENGTH) {
+ // Valid IPC message, read the message length now
+ uint8_t metadata_length[sizeof(int32_t)];
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read,
+ file->Read(sizeof(int32_t), &metadata_length));
+ if (bytes_read != decoder->next_required_size()) {
+ return Status::Invalid("Corrupted metadata length, only ", bytes_read,
+ " bytes available");
+ }
+ ARROW_RETURN_NOT_OK(decoder->Consume(metadata_length, bytes_read));
+ }
+
+ if (decoder->state() == MessageDecoder::State::EOS) {
+ return Status::OK();
+ }
+
+ auto metadata_length = decoder->next_required_size();
+ ARROW_ASSIGN_OR_RAISE(auto metadata, file->Read(metadata_length));
+ if (metadata->size() != metadata_length) {
+ return Status::Invalid("Expected to read ", metadata_length, " metadata bytes, but ",
+ "only read ", metadata->size());
+ }
+ ARROW_RETURN_NOT_OK(decoder->Consume(metadata));
+
+ if (decoder->state() == MessageDecoder::State::BODY) {
+ ARROW_ASSIGN_OR_RAISE(auto body, file->Read(decoder->next_required_size()));
+ if (body->size() < decoder->next_required_size()) {
+ return Status::IOError("Expected to be able to read ",
+ decoder->next_required_size(),
+ " bytes for message body, got ", body->size());
+ }
+ ARROW_RETURN_NOT_OK(decoder->Consume(body));
+ }
+
+ if (decoder->state() == MessageDecoder::State::INITIAL ||
+ decoder->state() == MessageDecoder::State::EOS) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("Failed to decode message");
+ }
+}
+
+Result<std::unique_ptr<Message>> ReadMessage(io::InputStream* file, MemoryPool* pool) {
+ std::unique_ptr<Message> message;
+ auto listener = std::make_shared<AssignMessageDecoderListener>(&message);
+ MessageDecoder decoder(listener, pool);
+ ARROW_RETURN_NOT_OK(DecodeMessage(&decoder, file));
+ if (!message) {
+ return nullptr;
+ } else {
+ return std::move(message);
+ }
+}
+
+Status WriteMessage(const Buffer& message, const IpcWriteOptions& options,
+ io::OutputStream* file, int32_t* message_length) {
+ const int32_t prefix_size = options.write_legacy_ipc_format ? 4 : 8;
+ const int32_t flatbuffer_size = static_cast<int32_t>(message.size());
+
+ int32_t padded_message_length = static_cast<int32_t>(
+ PaddedLength(flatbuffer_size + prefix_size, options.alignment));
+
+ int32_t padding = padded_message_length - flatbuffer_size - prefix_size;
+
+ // The returned message size includes the length prefix, the flatbuffer,
+ // plus padding
+ *message_length = padded_message_length;
+
+ // ARROW-6314: Write continuation / padding token
+ if (!options.write_legacy_ipc_format) {
+ RETURN_NOT_OK(file->Write(&internal::kIpcContinuationToken, sizeof(int32_t)));
+ }
+
+ // Write the flatbuffer size prefix including padding in little endian
+ int32_t padded_flatbuffer_size =
+ BitUtil::ToLittleEndian(padded_message_length - prefix_size);
+ RETURN_NOT_OK(file->Write(&padded_flatbuffer_size, sizeof(int32_t)));
+
+ // Write the flatbuffer
+ RETURN_NOT_OK(file->Write(message.data(), flatbuffer_size));
+ if (padding > 0) {
+ RETURN_NOT_OK(file->Write(kPaddingBytes, padding));
+ }
+
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Implement MessageDecoder
+
+Status MessageDecoderListener::OnInitial() { return Status::OK(); }
+Status MessageDecoderListener::OnMetadataLength() { return Status::OK(); }
+Status MessageDecoderListener::OnMetadata() { return Status::OK(); }
+Status MessageDecoderListener::OnBody() { return Status::OK(); }
+Status MessageDecoderListener::OnEOS() { return Status::OK(); }
+
+static constexpr auto kMessageDecoderNextRequiredSizeInitial = sizeof(int32_t);
+static constexpr auto kMessageDecoderNextRequiredSizeMetadataLength = sizeof(int32_t);
+
+class MessageDecoder::MessageDecoderImpl {
+ public:
+ explicit MessageDecoderImpl(std::shared_ptr<MessageDecoderListener> listener,
+ State initial_state, int64_t initial_next_required_size,
+ MemoryPool* pool)
+ : listener_(std::move(listener)),
+ pool_(pool),
+ state_(initial_state),
+ next_required_size_(initial_next_required_size),
+ chunks_(),
+ buffered_size_(0),
+ metadata_(nullptr) {}
+
+ Status ConsumeData(const uint8_t* data, int64_t size) {
+ if (buffered_size_ == 0) {
+ while (size > 0 && size >= next_required_size_) {
+ auto used_size = next_required_size_;
+ switch (state_) {
+ case State::INITIAL:
+ RETURN_NOT_OK(ConsumeInitialData(data, next_required_size_));
+ break;
+ case State::METADATA_LENGTH:
+ RETURN_NOT_OK(ConsumeMetadataLengthData(data, next_required_size_));
+ break;
+ case State::METADATA: {
+ auto buffer = std::make_shared<Buffer>(data, next_required_size_);
+ RETURN_NOT_OK(ConsumeMetadataBuffer(buffer));
+ } break;
+ case State::BODY: {
+ auto buffer = std::make_shared<Buffer>(data, next_required_size_);
+ RETURN_NOT_OK(ConsumeBodyBuffer(buffer));
+ } break;
+ case State::EOS:
+ return Status::OK();
+ }
+ data += used_size;
+ size -= used_size;
+ }
+ }
+
+ if (size == 0) {
+ return Status::OK();
+ }
+
+ chunks_.push_back(std::make_shared<Buffer>(data, size));
+ buffered_size_ += size;
+ return ConsumeChunks();
+ }
+
+ Status ConsumeBuffer(std::shared_ptr<Buffer> buffer) {
+ if (buffered_size_ == 0) {
+ while (buffer->size() >= next_required_size_) {
+ auto used_size = next_required_size_;
+ switch (state_) {
+ case State::INITIAL:
+ RETURN_NOT_OK(ConsumeInitialBuffer(buffer));
+ break;
+ case State::METADATA_LENGTH:
+ RETURN_NOT_OK(ConsumeMetadataLengthBuffer(buffer));
+ break;
+ case State::METADATA:
+ if (buffer->size() == next_required_size_) {
+ return ConsumeMetadataBuffer(buffer);
+ } else {
+ auto sliced_buffer = SliceBuffer(buffer, 0, next_required_size_);
+ RETURN_NOT_OK(ConsumeMetadataBuffer(sliced_buffer));
+ }
+ break;
+ case State::BODY:
+ if (buffer->size() == next_required_size_) {
+ return ConsumeBodyBuffer(buffer);
+ } else {
+ auto sliced_buffer = SliceBuffer(buffer, 0, next_required_size_);
+ RETURN_NOT_OK(ConsumeBodyBuffer(sliced_buffer));
+ }
+ break;
+ case State::EOS:
+ return Status::OK();
+ }
+ if (buffer->size() == used_size) {
+ return Status::OK();
+ }
+ buffer = SliceBuffer(buffer, used_size);
+ }
+ }
+
+ if (buffer->size() == 0) {
+ return Status::OK();
+ }
+
+ buffered_size_ += buffer->size();
+ chunks_.push_back(std::move(buffer));
+ return ConsumeChunks();
+ }
+
+ int64_t next_required_size() const { return next_required_size_ - buffered_size_; }
+
+ MessageDecoder::State state() const { return state_; }
+
+ private:
+ Status ConsumeChunks() {
+ while (state_ != State::EOS) {
+ if (buffered_size_ < next_required_size_) {
+ return Status::OK();
+ }
+
+ switch (state_) {
+ case State::INITIAL:
+ RETURN_NOT_OK(ConsumeInitialChunks());
+ break;
+ case State::METADATA_LENGTH:
+ RETURN_NOT_OK(ConsumeMetadataLengthChunks());
+ break;
+ case State::METADATA:
+ RETURN_NOT_OK(ConsumeMetadataChunks());
+ break;
+ case State::BODY:
+ RETURN_NOT_OK(ConsumeBodyChunks());
+ break;
+ case State::EOS:
+ return Status::OK();
+ }
+ }
+
+ return Status::OK();
+ }
+
+ Status ConsumeInitialData(const uint8_t* data, int64_t size) {
+ return ConsumeInitial(BitUtil::FromLittleEndian(util::SafeLoadAs<int32_t>(data)));
+ }
+
+ Status ConsumeInitialBuffer(const std::shared_ptr<Buffer>& buffer) {
+ ARROW_ASSIGN_OR_RAISE(auto continuation, ConsumeDataBufferInt32(buffer));
+ return ConsumeInitial(BitUtil::FromLittleEndian(continuation));
+ }
+
+ Status ConsumeInitialChunks() {
+ int32_t continuation = 0;
+ RETURN_NOT_OK(ConsumeDataChunks(sizeof(int32_t), &continuation));
+ return ConsumeInitial(BitUtil::FromLittleEndian(continuation));
+ }
+
+ Status ConsumeInitial(int32_t continuation) {
+ if (continuation == internal::kIpcContinuationToken) {
+ state_ = State::METADATA_LENGTH;
+ next_required_size_ = kMessageDecoderNextRequiredSizeMetadataLength;
+ RETURN_NOT_OK(listener_->OnMetadataLength());
+ // Valid IPC message, read the message length now
+ return Status::OK();
+ } else if (continuation == 0) {
+ state_ = State::EOS;
+ next_required_size_ = 0;
+ RETURN_NOT_OK(listener_->OnEOS());
+ return Status::OK();
+ } else if (continuation > 0) {
+ state_ = State::METADATA;
+ // ARROW-6314: Backwards compatibility for reading old IPC
+ // messages produced prior to version 0.15.0
+ next_required_size_ = continuation;
+ RETURN_NOT_OK(listener_->OnMetadata());
+ return Status::OK();
+ } else {
+ return Status::IOError("Invalid IPC stream: negative continuation token");
+ }
+ }
+
+ Status ConsumeMetadataLengthData(const uint8_t* data, int64_t size) {
+ return ConsumeMetadataLength(
+ BitUtil::FromLittleEndian(util::SafeLoadAs<int32_t>(data)));
+ }
+
+ Status ConsumeMetadataLengthBuffer(const std::shared_ptr<Buffer>& buffer) {
+ ARROW_ASSIGN_OR_RAISE(auto metadata_length, ConsumeDataBufferInt32(buffer));
+ return ConsumeMetadataLength(BitUtil::FromLittleEndian(metadata_length));
+ }
+
+ Status ConsumeMetadataLengthChunks() {
+ int32_t metadata_length = 0;
+ RETURN_NOT_OK(ConsumeDataChunks(sizeof(int32_t), &metadata_length));
+ return ConsumeMetadataLength(BitUtil::FromLittleEndian(metadata_length));
+ }
+
+ Status ConsumeMetadataLength(int32_t metadata_length) {
+ if (metadata_length == 0) {
+ state_ = State::EOS;
+ next_required_size_ = 0;
+ RETURN_NOT_OK(listener_->OnEOS());
+ return Status::OK();
+ } else if (metadata_length > 0) {
+ state_ = State::METADATA;
+ next_required_size_ = metadata_length;
+ RETURN_NOT_OK(listener_->OnMetadata());
+ return Status::OK();
+ } else {
+ return Status::IOError("Invalid IPC message: negative metadata length");
+ }
+ }
+
+ Status ConsumeMetadataBuffer(const std::shared_ptr<Buffer>& buffer) {
+ if (buffer->is_cpu()) {
+ metadata_ = buffer;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(metadata_,
+ Buffer::ViewOrCopy(buffer, CPUDevice::memory_manager(pool_)));
+ }
+ return ConsumeMetadata();
+ }
+
+ Status ConsumeMetadataChunks() {
+ if (chunks_[0]->size() >= next_required_size_) {
+ if (chunks_[0]->size() == next_required_size_) {
+ if (chunks_[0]->is_cpu()) {
+ metadata_ = std::move(chunks_[0]);
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ metadata_,
+ Buffer::ViewOrCopy(chunks_[0], CPUDevice::memory_manager(pool_)));
+ }
+ chunks_.erase(chunks_.begin());
+ } else {
+ metadata_ = SliceBuffer(chunks_[0], 0, next_required_size_);
+ if (!chunks_[0]->is_cpu()) {
+ ARROW_ASSIGN_OR_RAISE(
+ metadata_, Buffer::ViewOrCopy(metadata_, CPUDevice::memory_manager(pool_)));
+ }
+ chunks_[0] = SliceBuffer(chunks_[0], next_required_size_);
+ }
+ buffered_size_ -= next_required_size_;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto metadata, AllocateBuffer(next_required_size_, pool_));
+ metadata_ = std::shared_ptr<Buffer>(metadata.release());
+ RETURN_NOT_OK(ConsumeDataChunks(next_required_size_, metadata_->mutable_data()));
+ }
+ return ConsumeMetadata();
+ }
+
+ Status ConsumeMetadata() {
+ RETURN_NOT_OK(MaybeAlignMetadata(&metadata_));
+ int64_t body_length = -1;
+ RETURN_NOT_OK(CheckMetadataAndGetBodyLength(*metadata_, &body_length));
+
+ state_ = State::BODY;
+ next_required_size_ = body_length;
+ RETURN_NOT_OK(listener_->OnBody());
+ if (next_required_size_ == 0) {
+ ARROW_ASSIGN_OR_RAISE(auto body, AllocateBuffer(0, pool_));
+ std::shared_ptr<Buffer> shared_body(body.release());
+ return ConsumeBody(&shared_body);
+ } else {
+ return Status::OK();
+ }
+ }
+
+ Status ConsumeBodyBuffer(std::shared_ptr<Buffer> buffer) {
+ return ConsumeBody(&buffer);
+ }
+
+ Status ConsumeBodyChunks() {
+ if (chunks_[0]->size() >= next_required_size_) {
+ auto used_size = next_required_size_;
+ if (chunks_[0]->size() == next_required_size_) {
+ RETURN_NOT_OK(ConsumeBody(&chunks_[0]));
+ chunks_.erase(chunks_.begin());
+ } else {
+ auto body = SliceBuffer(chunks_[0], 0, next_required_size_);
+ RETURN_NOT_OK(ConsumeBody(&body));
+ chunks_[0] = SliceBuffer(chunks_[0], used_size);
+ }
+ buffered_size_ -= used_size;
+ return Status::OK();
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto body, AllocateBuffer(next_required_size_, pool_));
+ RETURN_NOT_OK(ConsumeDataChunks(next_required_size_, body->mutable_data()));
+ std::shared_ptr<Buffer> shared_body(body.release());
+ return ConsumeBody(&shared_body);
+ }
+ }
+
+ Status ConsumeBody(std::shared_ptr<Buffer>* buffer) {
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<Message> message,
+ Message::Open(metadata_, *buffer));
+
+ RETURN_NOT_OK(listener_->OnMessageDecoded(std::move(message)));
+ state_ = State::INITIAL;
+ next_required_size_ = kMessageDecoderNextRequiredSizeInitial;
+ RETURN_NOT_OK(listener_->OnInitial());
+ return Status::OK();
+ }
+
+ Result<int32_t> ConsumeDataBufferInt32(const std::shared_ptr<Buffer>& buffer) {
+ if (buffer->is_cpu()) {
+ return util::SafeLoadAs<int32_t>(buffer->data());
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto cpu_buffer,
+ Buffer::ViewOrCopy(buffer, CPUDevice::memory_manager(pool_)));
+ return util::SafeLoadAs<int32_t>(cpu_buffer->data());
+ }
+ }
+
+ Status ConsumeDataChunks(int64_t nbytes, void* out) {
+ size_t offset = 0;
+ size_t n_used_chunks = 0;
+ auto required_size = nbytes;
+ std::shared_ptr<Buffer> last_chunk;
+ for (auto& chunk : chunks_) {
+ if (!chunk->is_cpu()) {
+ ARROW_ASSIGN_OR_RAISE(
+ chunk, Buffer::ViewOrCopy(chunk, CPUDevice::memory_manager(pool_)));
+ }
+ auto data = chunk->data();
+ auto data_size = chunk->size();
+ auto copy_size = std::min(required_size, data_size);
+ memcpy(static_cast<uint8_t*>(out) + offset, data, copy_size);
+ n_used_chunks++;
+ offset += copy_size;
+ required_size -= copy_size;
+ if (required_size == 0) {
+ if (data_size != copy_size) {
+ last_chunk = SliceBuffer(chunk, copy_size);
+ }
+ break;
+ }
+ }
+ chunks_.erase(chunks_.begin(), chunks_.begin() + n_used_chunks);
+ if (last_chunk.get() != nullptr) {
+ chunks_.insert(chunks_.begin(), std::move(last_chunk));
+ }
+ buffered_size_ -= offset;
+ return Status::OK();
+ }
+
+ std::shared_ptr<MessageDecoderListener> listener_;
+ MemoryPool* pool_;
+ State state_;
+ int64_t next_required_size_;
+ std::vector<std::shared_ptr<Buffer>> chunks_;
+ int64_t buffered_size_;
+ std::shared_ptr<Buffer> metadata_; // Must be CPU buffer
+};
+
+MessageDecoder::MessageDecoder(std::shared_ptr<MessageDecoderListener> listener,
+ MemoryPool* pool) {
+ impl_.reset(new MessageDecoderImpl(std::move(listener), State::INITIAL,
+ kMessageDecoderNextRequiredSizeInitial, pool));
+}
+
+MessageDecoder::MessageDecoder(std::shared_ptr<MessageDecoderListener> listener,
+ State initial_state, int64_t initial_next_required_size,
+ MemoryPool* pool) {
+ impl_.reset(new MessageDecoderImpl(std::move(listener), initial_state,
+ initial_next_required_size, pool));
+}
+
+MessageDecoder::~MessageDecoder() {}
+
+Status MessageDecoder::Consume(const uint8_t* data, int64_t size) {
+ return impl_->ConsumeData(data, size);
+}
+
+Status MessageDecoder::Consume(std::shared_ptr<Buffer> buffer) {
+ return impl_->ConsumeBuffer(buffer);
+}
+
+int64_t MessageDecoder::next_required_size() const { return impl_->next_required_size(); }
+
+MessageDecoder::State MessageDecoder::state() const { return impl_->state(); }
+
+// ----------------------------------------------------------------------
+// Implement InputStream message reader
+
+/// \brief Implementation of MessageReader that reads from InputStream
+class InputStreamMessageReader : public MessageReader, public MessageDecoderListener {
+ public:
+ explicit InputStreamMessageReader(io::InputStream* stream)
+ : stream_(stream),
+ owned_stream_(),
+ message_(),
+ decoder_(std::shared_ptr<InputStreamMessageReader>(this, [](void*) {})) {}
+
+ explicit InputStreamMessageReader(const std::shared_ptr<io::InputStream>& owned_stream)
+ : InputStreamMessageReader(owned_stream.get()) {
+ owned_stream_ = owned_stream;
+ }
+
+ ~InputStreamMessageReader() {}
+
+ Status OnMessageDecoded(std::unique_ptr<Message> message) override {
+ message_ = std::move(message);
+ return Status::OK();
+ }
+
+ Result<std::unique_ptr<Message>> ReadNextMessage() override {
+ ARROW_RETURN_NOT_OK(DecodeMessage(&decoder_, stream_));
+ return std::move(message_);
+ }
+
+ private:
+ io::InputStream* stream_;
+ std::shared_ptr<io::InputStream> owned_stream_;
+ std::unique_ptr<Message> message_;
+ MessageDecoder decoder_;
+};
+
+std::unique_ptr<MessageReader> MessageReader::Open(io::InputStream* stream) {
+ return std::unique_ptr<MessageReader>(new InputStreamMessageReader(stream));
+}
+
+std::unique_ptr<MessageReader> MessageReader::Open(
+ const std::shared_ptr<io::InputStream>& owned_stream) {
+ return std::unique_ptr<MessageReader>(new InputStreamMessageReader(owned_stream));
+}
+
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h
new file mode 100644
index 00000000000..b2683259cb4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h
@@ -0,0 +1,536 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// C++ object model and user API for interprocess schema messaging
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/io/type_fwd.h"
+#include "arrow/ipc/type_fwd.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace ipc {
+
+struct IpcWriteOptions;
+
+// Read interface classes. We do not fully deserialize the flatbuffers so that
+// individual fields' metadata can be retrieved from a very large schema
+// without deserializing the whole schema.
+
+/// \class Message
+/// \brief An IPC message including metadata and body
+class ARROW_EXPORT Message {
+ public:
+ /// \brief Construct message, but do not validate
+ ///
+ /// Use at your own risk; Message::Open has more metadata validation
+ Message(std::shared_ptr<Buffer> metadata, std::shared_ptr<Buffer> body);
+
+ ~Message();
+
+ /// \brief Create and validate a Message instance from two buffers
+ ///
+ /// \param[in] metadata a buffer containing the Flatbuffer metadata
+ /// \param[in] body a buffer containing the message body, which may be null
+ /// \return the created message
+ static Result<std::unique_ptr<Message>> Open(std::shared_ptr<Buffer> metadata,
+ std::shared_ptr<Buffer> body);
+
+ /// \brief Read message body and create Message given Flatbuffer metadata
+  /// \param[in] metadata a buffer containing a serialized Message flatbuffer
+ /// \param[in] stream an InputStream
+ /// \return the created Message
+ ///
+ /// \note If stream supports zero-copy, this is zero-copy
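+  ///
+  /// A minimal usage sketch (illustrative only: it assumes `metadata` is a
+  /// std::shared_ptr<Buffer> holding the serialized Message flatbuffer and
+  /// `stream` is positioned at the start of the body):
+  ///
+  /// ~~~{.cpp}
+  /// ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::ipc::Message> message,
+  ///                       arrow::ipc::Message::ReadFrom(metadata, stream));
+  /// ~~~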
+ static Result<std::unique_ptr<Message>> ReadFrom(std::shared_ptr<Buffer> metadata,
+ io::InputStream* stream);
+
+ /// \brief Read message body from position in file, and create Message given
+ /// the Flatbuffer metadata
+ /// \param[in] offset the position in the file where the message body starts.
+  /// \param[in] metadata a buffer containing a serialized Message flatbuffer
+ /// \param[in] file the seekable file interface to read from
+ /// \return the created Message
+ ///
+ /// \note If file supports zero-copy, this is zero-copy
+ static Result<std::unique_ptr<Message>> ReadFrom(const int64_t offset,
+ std::shared_ptr<Buffer> metadata,
+ io::RandomAccessFile* file);
+
+ /// \brief Return true if message type and contents are equal
+ ///
+ /// \param other another message
+ /// \return true if contents equal
+ bool Equals(const Message& other) const;
+
+ /// \brief the Message metadata
+ ///
+ /// \return buffer
+ std::shared_ptr<Buffer> metadata() const;
+
+  /// \brief Custom metadata serialized in the metadata Flatbuffer. Returns
+  /// nullptr when none is set
+ const std::shared_ptr<const KeyValueMetadata>& custom_metadata() const;
+
+ /// \brief the Message body, if any
+ ///
+  /// \return the body buffer, or null if the message has no body
+ std::shared_ptr<Buffer> body() const;
+
+ /// \brief The expected body length according to the metadata, for
+ /// verification purposes
+ int64_t body_length() const;
+
+ /// \brief The Message type
+ MessageType type() const;
+
+ /// \brief The Message metadata version
+ MetadataVersion metadata_version() const;
+
+ const void* header() const;
+
+ /// \brief Write length-prefixed metadata and body to output stream
+ ///
+ /// \param[in] file output stream to write to
+ /// \param[in] options IPC writing options including alignment
+ /// \param[out] output_length the number of bytes written
+ /// \return Status
+ Status SerializeTo(io::OutputStream* file, const IpcWriteOptions& options,
+ int64_t* output_length) const;
+
+ /// \brief Return true if the Message metadata passes Flatbuffer validation
+ bool Verify() const;
+
+ /// \brief Whether a given message type needs a body.
+ static bool HasBody(MessageType type) {
+ return type != MessageType::NONE && type != MessageType::SCHEMA;
+ }
+
+ private:
+ // Hide serialization details from user API
+ class MessageImpl;
+ std::unique_ptr<MessageImpl> impl_;
+
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Message);
+};
+
+ARROW_EXPORT std::string FormatMessageType(MessageType type);
+
+/// \class MessageDecoderListener
+/// \brief An abstract class to listen to events from MessageDecoder.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \since 0.17.0
+class ARROW_EXPORT MessageDecoderListener {
+ public:
+ virtual ~MessageDecoderListener() = default;
+
+ /// \brief Called when a message is decoded.
+ ///
+ /// MessageDecoder calls this method when it decodes a message. This
+ /// method is called multiple times when the target stream has
+ /// multiple messages.
+ ///
+ /// \param[in] message a decoded message
+ /// \return Status
+ virtual Status OnMessageDecoded(std::unique_ptr<Message> message) = 0;
+
+ /// \brief Called when the decoder state is changed to
+ /// MessageDecoder::State::INITIAL.
+ ///
+ /// The default implementation just returns arrow::Status::OK().
+ ///
+ /// \return Status
+ virtual Status OnInitial();
+
+ /// \brief Called when the decoder state is changed to
+ /// MessageDecoder::State::METADATA_LENGTH.
+ ///
+ /// The default implementation just returns arrow::Status::OK().
+ ///
+ /// \return Status
+ virtual Status OnMetadataLength();
+
+ /// \brief Called when the decoder state is changed to
+ /// MessageDecoder::State::METADATA.
+ ///
+ /// The default implementation just returns arrow::Status::OK().
+ ///
+ /// \return Status
+ virtual Status OnMetadata();
+
+ /// \brief Called when the decoder state is changed to
+ /// MessageDecoder::State::BODY.
+ ///
+ /// The default implementation just returns arrow::Status::OK().
+ ///
+ /// \return Status
+ virtual Status OnBody();
+
+ /// \brief Called when the decoder state is changed to
+ /// MessageDecoder::State::EOS.
+ ///
+ /// The default implementation just returns arrow::Status::OK().
+ ///
+ /// \return Status
+ virtual Status OnEOS();
+};
+
+/// \class AssignMessageDecoderListener
+/// \brief A listener that assigns a message decoded by MessageDecoder
+/// to a caller-provided location.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \since 0.17.0
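+///
+/// A minimal pairing sketch (assuming `buffer` is a
+/// std::shared_ptr<arrow::Buffer> that holds one complete encapsulated
+/// IPC message):
+///
+/// ~~~{.cpp}
+/// std::unique_ptr<arrow::ipc::Message> message;
+/// auto listener =
+///     std::make_shared<arrow::ipc::AssignMessageDecoderListener>(&message);
+/// arrow::ipc::MessageDecoder decoder(listener);
+/// ARROW_RETURN_NOT_OK(decoder.Consume(buffer));
+/// // `message` now owns the decoded message, if one was completed.
+/// ~~~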
+class ARROW_EXPORT AssignMessageDecoderListener : public MessageDecoderListener {
+ public:
+ /// \brief Construct a listener that assigns a decoded message to the
+ /// specified location.
+ ///
+ /// \param[in] message a location to store the received message
+ explicit AssignMessageDecoderListener(std::unique_ptr<Message>* message)
+ : message_(message) {}
+
+ virtual ~AssignMessageDecoderListener() = default;
+
+ Status OnMessageDecoded(std::unique_ptr<Message> message) override {
+ *message_ = std::move(message);
+ return Status::OK();
+ }
+
+ private:
+ std::unique_ptr<Message>* message_;
+
+ ARROW_DISALLOW_COPY_AND_ASSIGN(AssignMessageDecoderListener);
+};
+
+/// \class MessageDecoder
+/// \brief Push-style message decoder that receives data from the user.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \since 0.17.0
+class ARROW_EXPORT MessageDecoder {
+ public:
+ /// \brief State for reading a message
+ enum State {
+    /// The initial state. It requires one of the following as the next data:
+ ///
+ /// * int32_t continuation token
+ /// * int32_t end-of-stream mark (== 0)
+ /// * int32_t metadata length (backward compatibility for
+    ///   reading old IPC messages produced prior to version 0.15.0)
+ INITIAL,
+
+ /// It requires int32_t metadata length.
+ METADATA_LENGTH,
+
+ /// It requires metadata.
+ METADATA,
+
+ /// It requires message body.
+ BODY,
+
+ /// The end-of-stream state. No more data is processed.
+ EOS,
+ };
+
+ /// \brief Construct a message decoder.
+ ///
+  /// \param[in] listener a MessageDecoderListener that responds to events
+  /// from the decoder
+ /// \param[in] pool an optional MemoryPool to copy metadata on the
+ /// CPU, if required
+ explicit MessageDecoder(std::shared_ptr<MessageDecoderListener> listener,
+ MemoryPool* pool = default_memory_pool());
+
+ /// \brief Construct a message decoder with the specified state.
+ ///
+  /// This constructor is for advanced users who know how to decode
+  /// a Message.
+ ///
+  /// \param[in] listener a MessageDecoderListener that responds to events
+  /// from the decoder
+  /// \param[in] initial_state the initial state of the decoder
+ /// \param[in] initial_next_required_size the number of bytes needed
+ /// to run the next action
+ /// \param[in] pool an optional MemoryPool to copy metadata on the
+ /// CPU, if required
+ MessageDecoder(std::shared_ptr<MessageDecoderListener> listener, State initial_state,
+ int64_t initial_next_required_size,
+ MemoryPool* pool = default_memory_pool());
+
+ virtual ~MessageDecoder();
+
+  /// \brief Feed data to the decoder as raw data.
+ ///
+  /// If the decoder can decode one or more messages from the data, it
+  /// calls listener->OnMessageDecoded() once per decoded message.
+ ///
+  /// Whenever the state of the decoder changes, the corresponding
+  /// callback on the listener is called:
+ ///
+ /// * MessageDecoder::State::INITIAL: listener->OnInitial()
+ /// * MessageDecoder::State::METADATA_LENGTH: listener->OnMetadataLength()
+ /// * MessageDecoder::State::METADATA: listener->OnMetadata()
+ /// * MessageDecoder::State::BODY: listener->OnBody()
+ /// * MessageDecoder::State::EOS: listener->OnEOS()
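+  ///
+  /// A small sketch for this overload (illustrative; `ReadAllBytes()` is a
+  /// hypothetical helper, not part of Arrow):
+  ///
+  /// ~~~{.cpp}
+  /// std::vector<uint8_t> input = ReadAllBytes();
+  /// // `input` must outlive decoding: Consume() does not copy the data.
+  /// ARROW_RETURN_NOT_OK(decoder.Consume(input.data(), input.size()));
+  /// ~~~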
+ ///
+  /// \param[in] data raw data to be processed. The data is not copied;
+  /// the passed memory must be kept alive until message processing
+  /// completes.
+  /// \param[in] size the raw data size.
+ /// \return Status
+ Status Consume(const uint8_t* data, int64_t size);
+
+ /// \brief Feed data to the decoder as a Buffer.
+ ///
+  /// If the decoder can decode one or more messages from the Buffer,
+  /// it calls listener->OnMessageDecoded() once per decoded message.
+ ///
+ /// \param[in] buffer a Buffer to be processed.
+ /// \return Status
+ Status Consume(std::shared_ptr<Buffer> buffer);
+
+ /// \brief Return the number of bytes needed to advance the state of
+ /// the decoder.
+ ///
+ /// This method is provided for users who want to optimize performance.
+ /// Normal users don't need to use this method.
+ ///
+ /// Here is an example usage for normal users:
+ ///
+ /// ~~~{.cpp}
+ /// decoder.Consume(buffer1);
+ /// decoder.Consume(buffer2);
+ /// decoder.Consume(buffer3);
+ /// ~~~
+ ///
+  /// The decoder has an internal buffer. If the consumed data isn't
+  /// enough to advance the state of the decoder, it is accumulated in
+  /// the internal buffer, which causes performance overhead.
+  ///
+  /// If you pass exactly next_required_size() bytes to each Consume()
+  /// call, the decoder doesn't use its internal buffer, which improves
+  /// performance.
+ ///
+ /// Here is an example usage to avoid using internal buffer:
+ ///
+ /// ~~~{.cpp}
+ /// buffer1 = get_data(decoder.next_required_size());
+ /// decoder.Consume(buffer1);
+ /// buffer2 = get_data(decoder.next_required_size());
+ /// decoder.Consume(buffer2);
+ /// ~~~
+ ///
+  /// Users can also use this method to avoid creating small chunks.
+  /// The message body must be contiguous data, so if users pass small
+  /// chunks to the decoder, the decoder needs to concatenate them
+  /// internally, which causes performance overhead.
+ ///
+ /// Here is an example usage to reduce small chunks:
+ ///
+ /// ~~~{.cpp}
+ /// buffer = AllocateResizableBuffer();
+ /// while ((small_chunk = get_data(&small_chunk_size))) {
+ /// auto current_buffer_size = buffer->size();
+ /// buffer->Resize(current_buffer_size + small_chunk_size);
+ /// memcpy(buffer->mutable_data() + current_buffer_size,
+ /// small_chunk,
+ /// small_chunk_size);
+ /// if (buffer->size() < decoder.next_required_size()) {
+ /// continue;
+ /// }
+ /// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
+ /// decoder.Consume(chunk);
+ /// buffer = AllocateResizableBuffer();
+ /// }
+ /// if (buffer->size() > 0) {
+ /// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
+ /// decoder.Consume(chunk);
+ /// }
+ /// ~~~
+ ///
+ /// \return the number of bytes needed to advance the state of the
+ /// decoder
+ int64_t next_required_size() const;
+
+ /// \brief Return the current state of the decoder.
+ ///
+ /// This method is provided for users who want to optimize performance.
+ /// Normal users don't need to use this method.
+ ///
+  /// The decoder doesn't need a Buffer to process data in the
+  /// MessageDecoder::State::INITIAL and
+  /// MessageDecoder::State::METADATA_LENGTH states. Creating a Buffer
+  /// has performance overhead, so advanced users can avoid it by
+  /// checking the current state of the decoder:
+ ///
+ /// ~~~{.cpp}
+ /// switch (decoder.state()) {
+  ///   case MessageDecoder::State::INITIAL:
+  ///   case MessageDecoder::State::METADATA_LENGTH:
+ /// {
+ /// uint8_t data[sizeof(int32_t)];
+ /// auto data_size = input->Read(decoder.next_required_size(), data);
+ /// decoder.Consume(data, data_size);
+ /// }
+ /// break;
+ /// default:
+ /// {
+ /// auto buffer = input->Read(decoder.next_required_size());
+ /// decoder.Consume(buffer);
+ /// }
+ /// break;
+ /// }
+ /// ~~~
+ ///
+ /// \return the current state
+ State state() const;
+
+ private:
+ class MessageDecoderImpl;
+ std::unique_ptr<MessageDecoderImpl> impl_;
+
+ ARROW_DISALLOW_COPY_AND_ASSIGN(MessageDecoder);
+};
+
+/// \brief Abstract interface for a sequence of messages
+/// \since 0.5.0
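+///
+/// A minimal read-loop sketch (assuming `stream` is an open
+/// arrow::io::InputStream carrying IPC stream data):
+///
+/// ~~~{.cpp}
+/// auto reader = arrow::ipc::MessageReader::Open(stream);
+/// while (true) {
+///   ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::ipc::Message> message,
+///                         reader->ReadNextMessage());
+///   if (message == nullptr) break;  // end of stream
+///   // ... process *message ...
+/// }
+/// ~~~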
+class ARROW_EXPORT MessageReader {
+ public:
+ virtual ~MessageReader() = default;
+
+ /// \brief Create MessageReader that reads from InputStream
+ static std::unique_ptr<MessageReader> Open(io::InputStream* stream);
+
+ /// \brief Create MessageReader that reads from owned InputStream
+ static std::unique_ptr<MessageReader> Open(
+ const std::shared_ptr<io::InputStream>& owned_stream);
+
+ /// \brief Read next Message from the interface
+ ///
+ /// \return an arrow::ipc::Message instance
+ virtual Result<std::unique_ptr<Message>> ReadNextMessage() = 0;
+};
+
+/// \brief Read encapsulated RPC message from position in file
+///
+/// Read a length-prefixed message flatbuffer starting at the indicated file
+/// offset. If the message has a body with non-zero length, it will also be
+/// read
+///
+/// The metadata_length includes at least the length prefix and the flatbuffer
+///
+/// \param[in] offset the position in the file where the message starts. The
+/// first 4 bytes after the offset are the message length
+/// \param[in] metadata_length the total number of bytes to read from file
+/// \param[in] file the seekable file interface to read from
+/// \return the message read
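+///
+/// A usage sketch (illustrative: `file` is an open RandomAccessFile, and
+/// `block_offset`/`block_metadata_length` come from an index such as the
+/// IPC file footer):
+///
+/// ~~~{.cpp}
+/// ARROW_ASSIGN_OR_RAISE(
+///     std::unique_ptr<arrow::ipc::Message> message,
+///     arrow::ipc::ReadMessage(block_offset, block_metadata_length, file));
+/// ~~~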
+ARROW_EXPORT
+Result<std::unique_ptr<Message>> ReadMessage(const int64_t offset,
+ const int32_t metadata_length,
+ io::RandomAccessFile* file);
+
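+/// \brief Read an encapsulated IPC message asynchronously from a position
+/// in a file
+///
+/// Asynchronous counterpart of ReadMessage() above; the metadata and body
+/// are fetched with a single read of metadata_length + body_length bytes,
+/// so the body length must be known up front.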
+ARROW_EXPORT
+Future<std::shared_ptr<Message>> ReadMessageAsync(
+ const int64_t offset, const int32_t metadata_length, const int64_t body_length,
+ io::RandomAccessFile* file, const io::IOContext& context = io::default_io_context());
+
+/// \brief Advance the input stream to the next multiple of the given
+/// alignment (8 bytes by default) if its position is not aligned already
+/// \param[in] stream an input stream
+/// \param[in] alignment the byte multiple for the metadata prefix, usually 8
+/// or 64, to ensure the body starts on a multiple of that alignment
+/// \return Status
+ARROW_EXPORT
+Status AlignStream(io::InputStream* stream, int32_t alignment = 8);
+
+/// \brief Advance the output stream to the next multiple of the given
+/// alignment (8 bytes by default) if its position is not aligned already
+/// \param[in] stream an output stream
+/// \param[in] alignment the byte multiple for the metadata prefix, usually 8
+/// or 64, to ensure the body starts on a multiple of that alignment
+/// \return Status
+ARROW_EXPORT
+Status AlignStream(io::OutputStream* stream, int32_t alignment = 8);
+
+/// \brief Return error Status if file position is not a multiple of the
+/// indicated alignment
+ARROW_EXPORT
+Status CheckAligned(io::FileInterface* stream, int32_t alignment = 8);
+
+/// \brief Read encapsulated IPC message (metadata and body) from InputStream
+///
+/// Returns null if there are not enough bytes available or the
+/// message length is 0 (e.g. EOS in a stream)
+///
+/// \param[in] stream an input stream
+/// \param[in] pool an optional MemoryPool to copy metadata on the CPU, if required
+/// \return Message
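+///
+/// A stream-draining sketch (assuming `stream` is an open InputStream):
+///
+/// ~~~{.cpp}
+/// while (true) {
+///   ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::ipc::Message> message,
+///                         arrow::ipc::ReadMessage(stream));
+///   if (message == nullptr) break;  // EOS
+///   // ... handle *message ...
+/// }
+/// ~~~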
+ARROW_EXPORT
+Result<std::unique_ptr<Message>> ReadMessage(io::InputStream* stream,
+ MemoryPool* pool = default_memory_pool());
+
+/// \brief Feed data from InputStream to MessageDecoder to decode an
+/// encapsulated IPC message (metadata and body)
+///
+/// This API is EXPERIMENTAL.
+///
+/// \param[in] decoder a decoder
+/// \param[in] stream an input stream
+/// \return Status
+///
+/// \since 0.17.0
+ARROW_EXPORT
+Status DecodeMessage(MessageDecoder* decoder, io::InputStream* stream);
+
+/// Write an encapsulated IPC message. Does not make assumptions about
+/// whether the stream is aligned already. Can write a legacy (pre
+/// version 0.15.0) IPC message if the option is set.
+///
+/// continuation: 0xFFFFFFFF
+/// message_size: int32
+/// message: const void*
+/// padding
+///
+/// \param[in] message a buffer containing the metadata to write
+/// \param[in] options IPC writing options, including alignment and
+/// legacy message support
+/// \param[in,out] file the OutputStream to write to
+/// \param[out] message_length the total size of the payload written including
+/// padding
+/// \return Status
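+///
+/// A usage sketch (illustrative: `metadata` is a Buffer holding a serialized
+/// Message flatbuffer and `sink` is an open OutputStream):
+///
+/// ~~~{.cpp}
+/// int32_t message_length = 0;
+/// auto options = arrow::ipc::IpcWriteOptions::Defaults();
+/// ARROW_RETURN_NOT_OK(
+///     arrow::ipc::WriteMessage(*metadata, options, sink, &message_length));
+/// // message_length now holds the padded total, length prefix included.
+/// ~~~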
+Status WriteMessage(const Buffer& message, const IpcWriteOptions& options,
+ io::OutputStream* file, int32_t* message_length);
+
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc
new file mode 100644
index 00000000000..4b332bd9e1e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc
@@ -0,0 +1,1486 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/metadata_internal.h"
+
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <unordered_map>
+#include <utility>
+
+#include <flatbuffers/flatbuffers.h>
+
+#include "arrow/extension_type.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/ipc/dictionary.h"
+#include "arrow/ipc/message.h"
+#include "arrow/ipc/options.h"
+#include "arrow/ipc/util.h"
+#include "arrow/sparse_tensor.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+
+#include "generated/File_generated.h"
+#include "generated/Message_generated.h"
+#include "generated/Schema_generated.h"
+#include "generated/SparseTensor_generated.h"
+#include "generated/Tensor_generated.h"
+
+namespace arrow {
+
+namespace flatbuf = org::apache::arrow::flatbuf;
+using internal::checked_cast;
+using internal::GetByteWidth;
+
+namespace ipc {
+namespace internal {
+
+using FBB = flatbuffers::FlatBufferBuilder;
+using DictionaryOffset = flatbuffers::Offset<flatbuf::DictionaryEncoding>;
+using FieldOffset = flatbuffers::Offset<flatbuf::Field>;
+using RecordBatchOffset = flatbuffers::Offset<flatbuf::RecordBatch>;
+using SparseTensorOffset = flatbuffers::Offset<flatbuf::SparseTensor>;
+using Offset = flatbuffers::Offset<void>;
+using FBString = flatbuffers::Offset<flatbuffers::String>;
+
+MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version) {
+ switch (version) {
+ case flatbuf::MetadataVersion::V1:
+ // Arrow 0.1
+ return MetadataVersion::V1;
+ case flatbuf::MetadataVersion::V2:
+ // Arrow 0.2
+ return MetadataVersion::V2;
+ case flatbuf::MetadataVersion::V3:
+ // Arrow 0.3 to 0.7.1
+ return MetadataVersion::V4;
+ case flatbuf::MetadataVersion::V4:
+ // Arrow 0.8 to 0.17
+ return MetadataVersion::V4;
+ case flatbuf::MetadataVersion::V5:
+ // Arrow >= 1.0
+ return MetadataVersion::V5;
+ // Add cases as other versions become available
+ default:
+ return MetadataVersion::V5;
+ }
+}
+
+flatbuf::MetadataVersion MetadataVersionToFlatbuffer(MetadataVersion version) {
+ switch (version) {
+ case MetadataVersion::V1:
+ return flatbuf::MetadataVersion::V1;
+ case MetadataVersion::V2:
+ return flatbuf::MetadataVersion::V2;
+ case MetadataVersion::V3:
+ return flatbuf::MetadataVersion::V3;
+ case MetadataVersion::V4:
+ return flatbuf::MetadataVersion::V4;
+ case MetadataVersion::V5:
+ return flatbuf::MetadataVersion::V5;
+ // Add cases as other versions become available
+ default:
+ return flatbuf::MetadataVersion::V5;
+ }
+}
+
+bool HasValidityBitmap(Type::type type_id, MetadataVersion version) {
+ // In V4, null types have no validity bitmap
+ // In V5 and later, null and union types have no validity bitmap
+ return (version < MetadataVersion::V5) ? (type_id != Type::NA)
+ : ::arrow::internal::HasValidityBitmap(type_id);
+}
+
+namespace {
+
+Status IntFromFlatbuffer(const flatbuf::Int* int_data, std::shared_ptr<DataType>* out) {
+ if (int_data->bitWidth() > 64) {
+ return Status::NotImplemented("Integers with more than 64 bits not implemented");
+ }
+ if (int_data->bitWidth() < 8) {
+ return Status::NotImplemented("Integers with less than 8 bits not implemented");
+ }
+
+ switch (int_data->bitWidth()) {
+ case 8:
+ *out = int_data->is_signed() ? int8() : uint8();
+ break;
+ case 16:
+ *out = int_data->is_signed() ? int16() : uint16();
+ break;
+ case 32:
+ *out = int_data->is_signed() ? int32() : uint32();
+ break;
+ case 64:
+ *out = int_data->is_signed() ? int64() : uint64();
+ break;
+ default:
+ return Status::NotImplemented("Integers not in cstdint are not implemented");
+ }
+ return Status::OK();
+}
+
+Status FloatFromFlatbuffer(const flatbuf::FloatingPoint* float_data,
+ std::shared_ptr<DataType>* out) {
+ if (float_data->precision() == flatbuf::Precision::HALF) {
+ *out = float16();
+ } else if (float_data->precision() == flatbuf::Precision::SINGLE) {
+ *out = float32();
+ } else {
+ *out = float64();
+ }
+ return Status::OK();
+}
+
+Offset IntToFlatbuffer(FBB& fbb, int bitWidth, bool is_signed) {
+ return flatbuf::CreateInt(fbb, bitWidth, is_signed).Union();
+}
+
+Offset FloatToFlatbuffer(FBB& fbb, flatbuf::Precision precision) {
+ return flatbuf::CreateFloatingPoint(fbb, precision).Union();
+}
+
+// ----------------------------------------------------------------------
+// Union implementation
+
+Status UnionFromFlatbuffer(const flatbuf::Union* union_data,
+ const std::vector<std::shared_ptr<Field>>& children,
+ std::shared_ptr<DataType>* out) {
+ UnionMode::type mode =
+ (union_data->mode() == flatbuf::UnionMode::Sparse ? UnionMode::SPARSE
+ : UnionMode::DENSE);
+
+ std::vector<int8_t> type_codes;
+
+ const flatbuffers::Vector<int32_t>* fb_type_ids = union_data->typeIds();
+ if (fb_type_ids == nullptr) {
+ for (int8_t i = 0; i < static_cast<int8_t>(children.size()); ++i) {
+ type_codes.push_back(i);
+ }
+ } else {
+ for (int32_t id : (*fb_type_ids)) {
+ const auto type_code = static_cast<int8_t>(id);
+ if (id != type_code) {
+ return Status::Invalid("union type id out of bounds");
+ }
+ type_codes.push_back(type_code);
+ }
+ }
+
+ if (mode == UnionMode::SPARSE) {
+ ARROW_ASSIGN_OR_RAISE(
+ *out, SparseUnionType::Make(std::move(children), std::move(type_codes)));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ *out, DenseUnionType::Make(std::move(children), std::move(type_codes)));
+ }
+ return Status::OK();
+}
+
+#define INT_TO_FB_CASE(BIT_WIDTH, IS_SIGNED) \
+ *out_type = flatbuf::Type::Int; \
+ *offset = IntToFlatbuffer(fbb, BIT_WIDTH, IS_SIGNED); \
+ break;
+
+static inline flatbuf::TimeUnit ToFlatbufferUnit(TimeUnit::type unit) {
+ switch (unit) {
+ case TimeUnit::SECOND:
+ return flatbuf::TimeUnit::SECOND;
+ case TimeUnit::MILLI:
+ return flatbuf::TimeUnit::MILLISECOND;
+ case TimeUnit::MICRO:
+ return flatbuf::TimeUnit::MICROSECOND;
+ case TimeUnit::NANO:
+ return flatbuf::TimeUnit::NANOSECOND;
+ default:
+ break;
+ }
+ return flatbuf::TimeUnit::MIN;
+}
+
+static inline TimeUnit::type FromFlatbufferUnit(flatbuf::TimeUnit unit) {
+ switch (unit) {
+ case flatbuf::TimeUnit::SECOND:
+ return TimeUnit::SECOND;
+ case flatbuf::TimeUnit::MILLISECOND:
+ return TimeUnit::MILLI;
+ case flatbuf::TimeUnit::MICROSECOND:
+ return TimeUnit::MICRO;
+ case flatbuf::TimeUnit::NANOSECOND:
+ return TimeUnit::NANO;
+ default:
+ break;
+ }
+ // cannot reach
+ return TimeUnit::SECOND;
+}
+
+Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data,
+ const std::vector<std::shared_ptr<Field>>& children,
+ std::shared_ptr<DataType>* out) {
+ switch (type) {
+ case flatbuf::Type::NONE:
+ return Status::Invalid("Type metadata cannot be none");
+ case flatbuf::Type::Null:
+ *out = null();
+ return Status::OK();
+ case flatbuf::Type::Int:
+ return IntFromFlatbuffer(static_cast<const flatbuf::Int*>(type_data), out);
+ case flatbuf::Type::FloatingPoint:
+ return FloatFromFlatbuffer(static_cast<const flatbuf::FloatingPoint*>(type_data),
+ out);
+ case flatbuf::Type::Binary:
+ *out = binary();
+ return Status::OK();
+ case flatbuf::Type::LargeBinary:
+ *out = large_binary();
+ return Status::OK();
+ case flatbuf::Type::FixedSizeBinary: {
+ auto fw_binary = static_cast<const flatbuf::FixedSizeBinary*>(type_data);
+ return FixedSizeBinaryType::Make(fw_binary->byteWidth()).Value(out);
+ }
+ case flatbuf::Type::Utf8:
+ *out = utf8();
+ return Status::OK();
+ case flatbuf::Type::LargeUtf8:
+ *out = large_utf8();
+ return Status::OK();
+ case flatbuf::Type::Bool:
+ *out = boolean();
+ return Status::OK();
+ case flatbuf::Type::Decimal: {
+ auto dec_type = static_cast<const flatbuf::Decimal*>(type_data);
+ if (dec_type->bitWidth() == 128) {
+ return Decimal128Type::Make(dec_type->precision(), dec_type->scale()).Value(out);
+ } else if (dec_type->bitWidth() == 256) {
+ return Decimal256Type::Make(dec_type->precision(), dec_type->scale()).Value(out);
+ } else {
+ return Status::Invalid("Library only supports 128-bit or 256-bit decimal values");
+ }
+ }
+ case flatbuf::Type::Date: {
+ auto date_type = static_cast<const flatbuf::Date*>(type_data);
+ if (date_type->unit() == flatbuf::DateUnit::DAY) {
+ *out = date32();
+ } else {
+ *out = date64();
+ }
+ return Status::OK();
+ }
+ case flatbuf::Type::Time: {
+ auto time_type = static_cast<const flatbuf::Time*>(type_data);
+ TimeUnit::type unit = FromFlatbufferUnit(time_type->unit());
+ int32_t bit_width = time_type->bitWidth();
+ switch (unit) {
+ case TimeUnit::SECOND:
+ case TimeUnit::MILLI:
+ if (bit_width != 32) {
+ return Status::Invalid("Time is 32 bits for second/milli unit");
+ }
+ *out = time32(unit);
+ break;
+ default:
+ if (bit_width != 64) {
+ return Status::Invalid("Time is 64 bits for micro/nano unit");
+ }
+ *out = time64(unit);
+ break;
+ }
+ return Status::OK();
+ }
+ case flatbuf::Type::Timestamp: {
+ auto ts_type = static_cast<const flatbuf::Timestamp*>(type_data);
+ TimeUnit::type unit = FromFlatbufferUnit(ts_type->unit());
+ *out = timestamp(unit, StringFromFlatbuffers(ts_type->timezone()));
+ return Status::OK();
+ }
+ case flatbuf::Type::Duration: {
+ auto duration = static_cast<const flatbuf::Duration*>(type_data);
+ TimeUnit::type unit = FromFlatbufferUnit(duration->unit());
+ *out = arrow::duration(unit);
+ return Status::OK();
+ }
+
+ case flatbuf::Type::Interval: {
+ auto i_type = static_cast<const flatbuf::Interval*>(type_data);
+ switch (i_type->unit()) {
+ case flatbuf::IntervalUnit::YEAR_MONTH: {
+ *out = month_interval();
+ return Status::OK();
+ }
+ case flatbuf::IntervalUnit::DAY_TIME: {
+ *out = day_time_interval();
+ return Status::OK();
+ }
+ }
+ return Status::NotImplemented("Unrecognized interval type.");
+ }
+
+ case flatbuf::Type::List:
+ if (children.size() != 1) {
+ return Status::Invalid("List must have exactly 1 child field");
+ }
+ *out = std::make_shared<ListType>(children[0]);
+ return Status::OK();
+ case flatbuf::Type::LargeList:
+ if (children.size() != 1) {
+ return Status::Invalid("LargeList must have exactly 1 child field");
+ }
+ *out = std::make_shared<LargeListType>(children[0]);
+ return Status::OK();
+ case flatbuf::Type::Map:
+ if (children.size() != 1) {
+ return Status::Invalid("Map must have exactly 1 child field");
+ }
+ if (children[0]->nullable() || children[0]->type()->id() != Type::STRUCT ||
+ children[0]->type()->num_fields() != 2) {
+ return Status::Invalid("Map's key-item pairs must be non-nullable structs");
+ }
+ if (children[0]->type()->field(0)->nullable()) {
+ return Status::Invalid("Map's keys must be non-nullable");
+ } else {
+ auto map = static_cast<const flatbuf::Map*>(type_data);
+ *out = std::make_shared<MapType>(children[0]->type()->field(0)->type(),
+ children[0]->type()->field(1)->type(),
+ map->keysSorted());
+ }
+ return Status::OK();
+ case flatbuf::Type::FixedSizeList:
+ if (children.size() != 1) {
+ return Status::Invalid("FixedSizeList must have exactly 1 child field");
+ } else {
+ auto fs_list = static_cast<const flatbuf::FixedSizeList*>(type_data);
+ *out = std::make_shared<FixedSizeListType>(children[0], fs_list->listSize());
+ }
+ return Status::OK();
+ case flatbuf::Type::Struct_:
+ *out = std::make_shared<StructType>(children);
+ return Status::OK();
+ case flatbuf::Type::Union:
+ return UnionFromFlatbuffer(static_cast<const flatbuf::Union*>(type_data), children,
+ out);
+ default:
+ return Status::Invalid("Unrecognized type:" +
+ std::to_string(static_cast<int>(type)));
+ }
+}
+
+Status TensorTypeToFlatbuffer(FBB& fbb, const DataType& type, flatbuf::Type* out_type,
+ Offset* offset) {
+ switch (type.id()) {
+ case Type::UINT8:
+ INT_TO_FB_CASE(8, false);
+ case Type::INT8:
+ INT_TO_FB_CASE(8, true);
+ case Type::UINT16:
+ INT_TO_FB_CASE(16, false);
+ case Type::INT16:
+ INT_TO_FB_CASE(16, true);
+ case Type::UINT32:
+ INT_TO_FB_CASE(32, false);
+ case Type::INT32:
+ INT_TO_FB_CASE(32, true);
+ case Type::UINT64:
+ INT_TO_FB_CASE(64, false);
+ case Type::INT64:
+ INT_TO_FB_CASE(64, true);
+ case Type::HALF_FLOAT:
+ *out_type = flatbuf::Type::FloatingPoint;
+ *offset = FloatToFlatbuffer(fbb, flatbuf::Precision::HALF);
+ break;
+ case Type::FLOAT:
+ *out_type = flatbuf::Type::FloatingPoint;
+ *offset = FloatToFlatbuffer(fbb, flatbuf::Precision::SINGLE);
+ break;
+ case Type::DOUBLE:
+ *out_type = flatbuf::Type::FloatingPoint;
+ *offset = FloatToFlatbuffer(fbb, flatbuf::Precision::DOUBLE);
+ break;
+ default:
+ *out_type = flatbuf::Type::NONE; // Make clang-tidy happy
+ return Status::NotImplemented("Unable to convert type: ", type.ToString());
+ }
+ return Status::OK();
+}
+
+static Status GetDictionaryEncoding(FBB& fbb, const std::shared_ptr<Field>& field,
+ const DictionaryType& type, int64_t dictionary_id,
+ DictionaryOffset* out) {
+ // We assume that the dictionary index type (as an integer) has already been
+ // validated elsewhere, and can safely assume we are dealing with integers
+ const auto& index_type = checked_cast<const IntegerType&>(*type.index_type());
+
+ auto index_type_offset =
+ flatbuf::CreateInt(fbb, index_type.bit_width(), index_type.is_signed());
+
+ *out = flatbuf::CreateDictionaryEncoding(fbb, dictionary_id, index_type_offset,
+ type.ordered());
+ return Status::OK();
+}
+
+static KeyValueOffset AppendKeyValue(FBB& fbb, const std::string& key,
+ const std::string& value) {
+ return flatbuf::CreateKeyValue(fbb, fbb.CreateString(key), fbb.CreateString(value));
+}
+
+static void AppendKeyValueMetadata(FBB& fbb, const KeyValueMetadata& metadata,
+ std::vector<KeyValueOffset>* key_values) {
+ key_values->reserve(metadata.size());
+ for (int i = 0; i < metadata.size(); ++i) {
+ key_values->push_back(AppendKeyValue(fbb, metadata.key(i), metadata.value(i)));
+ }
+}
+
+class FieldToFlatbufferVisitor {
+ public:
+ FieldToFlatbufferVisitor(FBB& fbb, const DictionaryFieldMapper& mapper,
+ const FieldPosition& field_pos)
+ : fbb_(fbb), mapper_(mapper), field_pos_(field_pos) {}
+
+ Status VisitType(const DataType& type) { return VisitTypeInline(type, this); }
+
+ Status Visit(const NullType& type) {
+ fb_type_ = flatbuf::Type::Null;
+ type_offset_ = flatbuf::CreateNull(fbb_).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const BooleanType& type) {
+ fb_type_ = flatbuf::Type::Bool;
+ type_offset_ = flatbuf::CreateBool(fbb_).Union();
+ return Status::OK();
+ }
+
+ template <int BitWidth, bool IsSigned, typename T>
+ Status Visit(const T& type) {
+ fb_type_ = flatbuf::Type::Int;
+ type_offset_ = IntToFlatbuffer(fbb_, BitWidth, IsSigned);
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_integer<T, Status> Visit(const T& type) {
+ constexpr bool is_signed = is_signed_integer_type<T>::value;
+ return Visit<sizeof(typename T::c_type) * 8, is_signed>(type);
+ }
+
+ Status Visit(const HalfFloatType& type) {
+ fb_type_ = flatbuf::Type::FloatingPoint;
+ type_offset_ = FloatToFlatbuffer(fbb_, flatbuf::Precision::HALF);
+ return Status::OK();
+ }
+
+ Status Visit(const FloatType& type) {
+ fb_type_ = flatbuf::Type::FloatingPoint;
+ type_offset_ = FloatToFlatbuffer(fbb_, flatbuf::Precision::SINGLE);
+ return Status::OK();
+ }
+
+ Status Visit(const DoubleType& type) {
+ fb_type_ = flatbuf::Type::FloatingPoint;
+ type_offset_ = FloatToFlatbuffer(fbb_, flatbuf::Precision::DOUBLE);
+ return Status::OK();
+ }
+
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto& fw_type = checked_cast<const FixedSizeBinaryType&>(type);
+ fb_type_ = flatbuf::Type::FixedSizeBinary;
+ type_offset_ = flatbuf::CreateFixedSizeBinary(fbb_, fw_type.byte_width()).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const BinaryType& type) {
+ fb_type_ = flatbuf::Type::Binary;
+ type_offset_ = flatbuf::CreateBinary(fbb_).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const LargeBinaryType& type) {
+ fb_type_ = flatbuf::Type::LargeBinary;
+ type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const StringType& type) {
+ fb_type_ = flatbuf::Type::Utf8;
+ type_offset_ = flatbuf::CreateUtf8(fbb_).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const LargeStringType& type) {
+ fb_type_ = flatbuf::Type::LargeUtf8;
+ type_offset_ = flatbuf::CreateLargeUtf8(fbb_).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const Date32Type& type) {
+ fb_type_ = flatbuf::Type::Date;
+ type_offset_ = flatbuf::CreateDate(fbb_, flatbuf::DateUnit::DAY).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const Date64Type& type) {
+ fb_type_ = flatbuf::Type::Date;
+ type_offset_ = flatbuf::CreateDate(fbb_, flatbuf::DateUnit::MILLISECOND).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const Time32Type& type) {
+ const auto& time_type = checked_cast<const Time32Type&>(type);
+ fb_type_ = flatbuf::Type::Time;
+ type_offset_ =
+ flatbuf::CreateTime(fbb_, ToFlatbufferUnit(time_type.unit()), 32).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const Time64Type& type) {
+ const auto& time_type = checked_cast<const Time64Type&>(type);
+ fb_type_ = flatbuf::Type::Time;
+ type_offset_ =
+ flatbuf::CreateTime(fbb_, ToFlatbufferUnit(time_type.unit()), 64).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const TimestampType& type) {
+ const auto& ts_type = checked_cast<const TimestampType&>(type);
+ fb_type_ = flatbuf::Type::Timestamp;
+ flatbuf::TimeUnit fb_unit = ToFlatbufferUnit(ts_type.unit());
+ FBString fb_timezone = 0;
+ if (ts_type.timezone().size() > 0) {
+ fb_timezone = fbb_.CreateString(ts_type.timezone());
+ }
+ type_offset_ = flatbuf::CreateTimestamp(fbb_, fb_unit, fb_timezone).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const DurationType& type) {
+ fb_type_ = flatbuf::Type::Duration;
+ flatbuf::TimeUnit fb_unit = ToFlatbufferUnit(type.unit());
+ type_offset_ = flatbuf::CreateDuration(fbb_, fb_unit).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const DayTimeIntervalType& type) {
+ fb_type_ = flatbuf::Type::Interval;
+ type_offset_ = flatbuf::CreateInterval(fbb_, flatbuf::IntervalUnit::DAY_TIME).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const MonthIntervalType& type) {
+ fb_type_ = flatbuf::Type::Interval;
+ type_offset_ =
+ flatbuf::CreateInterval(fbb_, flatbuf::IntervalUnit::YEAR_MONTH).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal128Type& type) {
+ const auto& dec_type = checked_cast<const Decimal128Type&>(type);
+ fb_type_ = flatbuf::Type::Decimal;
+ type_offset_ = flatbuf::CreateDecimal(fbb_, dec_type.precision(), dec_type.scale(),
+ /*bitWidth=*/128)
+ .Union();
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal256Type& type) {
+ const auto& dec_type = checked_cast<const Decimal256Type&>(type);
+ fb_type_ = flatbuf::Type::Decimal;
+    type_offset_ = flatbuf::CreateDecimal(fbb_, dec_type.precision(), dec_type.scale(),
+                                          /*bitWidth=*/256)
+                       .Union();
+ return Status::OK();
+ }
+
+ Status Visit(const ListType& type) {
+ fb_type_ = flatbuf::Type::List;
+ RETURN_NOT_OK(VisitChildFields(type));
+ type_offset_ = flatbuf::CreateList(fbb_).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const LargeListType& type) {
+ fb_type_ = flatbuf::Type::LargeList;
+ RETURN_NOT_OK(VisitChildFields(type));
+ type_offset_ = flatbuf::CreateLargeList(fbb_).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const MapType& type) {
+ fb_type_ = flatbuf::Type::Map;
+ RETURN_NOT_OK(VisitChildFields(type));
+ type_offset_ = flatbuf::CreateMap(fbb_, type.keys_sorted()).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const FixedSizeListType& type) {
+ fb_type_ = flatbuf::Type::FixedSizeList;
+ RETURN_NOT_OK(VisitChildFields(type));
+ type_offset_ = flatbuf::CreateFixedSizeList(fbb_, type.list_size()).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ fb_type_ = flatbuf::Type::Struct_;
+ RETURN_NOT_OK(VisitChildFields(type));
+ type_offset_ = flatbuf::CreateStruct_(fbb_).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ fb_type_ = flatbuf::Type::Union;
+ RETURN_NOT_OK(VisitChildFields(type));
+
+ const auto& union_type = checked_cast<const UnionType&>(type);
+
+ flatbuf::UnionMode mode = union_type.mode() == UnionMode::SPARSE
+ ? flatbuf::UnionMode::Sparse
+ : flatbuf::UnionMode::Dense;
+
+ std::vector<int32_t> type_ids;
+ type_ids.reserve(union_type.type_codes().size());
+ for (uint8_t code : union_type.type_codes()) {
+ type_ids.push_back(code);
+ }
+
+ auto fb_type_ids = fbb_.CreateVector(type_ids.data(), type_ids.size());
+
+ type_offset_ = flatbuf::CreateUnion(fbb_, mode, fb_type_ids).Union();
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& type) {
+ // In this library, the dictionary "type" is a logical construct. Here we
+ // pass through to the value type, as we've already captured the index
+ // type in the DictionaryEncoding metadata in the parent field
+ return VisitType(*checked_cast<const DictionaryType&>(type).value_type());
+ }
+
+ Status Visit(const ExtensionType& type) {
+ RETURN_NOT_OK(VisitType(*type.storage_type()));
+ extra_type_metadata_[kExtensionTypeKeyName] = type.extension_name();
+ extra_type_metadata_[kExtensionMetadataKeyName] = type.Serialize();
+ return Status::OK();
+ }
+
+ Status VisitChildFields(const DataType& type) {
+ for (int i = 0; i < type.num_fields(); ++i) {
+ FieldOffset child_offset;
+ FieldToFlatbufferVisitor child_visitor(fbb_, mapper_, field_pos_.child(i));
+ RETURN_NOT_OK(child_visitor.GetResult(type.field(i), &child_offset));
+ children_.push_back(child_offset);
+ }
+ return Status::OK();
+ }
+
+ Status GetResult(const std::shared_ptr<Field>& field, FieldOffset* offset) {
+ RETURN_NOT_OK(VisitType(*field->type()));
+
+ DictionaryOffset dictionary = 0;
+ const DataType* storage_type = field->type().get();
+ if (storage_type->id() == Type::EXTENSION) {
+ storage_type =
+ checked_cast<const ExtensionType&>(*storage_type).storage_type().get();
+ }
+ if (storage_type->id() == Type::DICTIONARY) {
+ ARROW_ASSIGN_OR_RAISE(const auto dictionary_id,
+ mapper_.GetFieldId(field_pos_.path()));
+ RETURN_NOT_OK(GetDictionaryEncoding(
+ fbb_, field, checked_cast<const DictionaryType&>(*storage_type), dictionary_id,
+ &dictionary));
+ }
+
+ auto metadata = field->metadata();
+
+ flatbuffers::Offset<KVVector> fb_custom_metadata;
+ std::vector<KeyValueOffset> key_values;
+ if (metadata != nullptr) {
+ AppendKeyValueMetadata(fbb_, *metadata, &key_values);
+ }
+
+ for (const auto& pair : extra_type_metadata_) {
+ key_values.push_back(AppendKeyValue(fbb_, pair.first, pair.second));
+ }
+
+ if (key_values.size() > 0) {
+ fb_custom_metadata = fbb_.CreateVector(key_values);
+ }
+
+ auto fb_name = fbb_.CreateString(field->name());
+ auto fb_children = fbb_.CreateVector(children_.data(), children_.size());
+ *offset =
+ flatbuf::CreateField(fbb_, fb_name, field->nullable(), fb_type_, type_offset_,
+ dictionary, fb_children, fb_custom_metadata);
+ return Status::OK();
+ }
+
+ private:
+ FBB& fbb_;
+ const DictionaryFieldMapper& mapper_;
+ FieldPosition field_pos_;
+ flatbuf::Type fb_type_;
+ Offset type_offset_;
+ std::vector<FieldOffset> children_;
+ std::unordered_map<std::string, std::string> extra_type_metadata_;
+};
+
+Status FieldFromFlatbuffer(const flatbuf::Field* field, FieldPosition field_pos,
+ DictionaryMemo* dictionary_memo, std::shared_ptr<Field>* out) {
+ std::shared_ptr<DataType> type;
+
+ std::shared_ptr<KeyValueMetadata> metadata;
+ RETURN_NOT_OK(internal::GetKeyValueMetadata(field->custom_metadata(), &metadata));
+
+ // Reconstruct the data type
+ // 1. Data type children
+ FieldVector child_fields;
+ const auto& children = field->children();
+ // As a tolerance, allow for a null children field meaning "no children" (ARROW-12100)
+ if (children != nullptr) {
+ child_fields.resize(children->size());
+ for (int i = 0; i < static_cast<int>(children->size()); ++i) {
+ RETURN_NOT_OK(FieldFromFlatbuffer(children->Get(i), field_pos.child(i),
+ dictionary_memo, &child_fields[i]));
+ }
+ }
+
+ // 2. Top-level concrete data type
+ auto type_data = field->type();
+ CHECK_FLATBUFFERS_NOT_NULL(type_data, "Field.type");
+ RETURN_NOT_OK(
+ ConcreteTypeFromFlatbuffer(field->type_type(), type_data, child_fields, &type));
+
+ // 3. Is it a dictionary type?
+ int64_t dictionary_id = -1;
+ std::shared_ptr<DataType> dict_value_type;
+ const flatbuf::DictionaryEncoding* encoding = field->dictionary();
+ if (encoding != nullptr) {
+ // The field is dictionary-encoded. Construct the DictionaryType
+ // based on the DictionaryEncoding metadata and record in the
+ // dictionary_memo
+ std::shared_ptr<DataType> index_type;
+ auto int_data = encoding->indexType();
+ CHECK_FLATBUFFERS_NOT_NULL(int_data, "DictionaryEncoding.indexType");
+ RETURN_NOT_OK(IntFromFlatbuffer(int_data, &index_type));
+ dict_value_type = type;
+ ARROW_ASSIGN_OR_RAISE(type,
+ DictionaryType::Make(index_type, type, encoding->isOrdered()));
+ dictionary_id = encoding->id();
+ }
+
+ // 4. Is it an extension type?
+ if (metadata != nullptr) {
+ // Look for extension metadata in custom_metadata field
+ int name_index = metadata->FindKey(kExtensionTypeKeyName);
+ if (name_index != -1) {
+ std::shared_ptr<ExtensionType> ext_type =
+ GetExtensionType(metadata->value(name_index));
+ if (ext_type != nullptr) {
+ int data_index = metadata->FindKey(kExtensionMetadataKeyName);
+ std::string type_data = data_index == -1 ? "" : metadata->value(data_index);
+
+ ARROW_ASSIGN_OR_RAISE(type, ext_type->Deserialize(type, type_data));
+ // Remove the metadata, for faithful roundtripping
+ if (data_index != -1) {
+ RETURN_NOT_OK(metadata->DeleteMany({name_index, data_index}));
+ } else {
+ RETURN_NOT_OK(metadata->Delete(name_index));
+ }
+ }
+ // NOTE: if extension type is unknown, we do not raise here and
+ // simply return the storage type.
+ }
+ }
+
+ // Reconstruct field
+ auto field_name = StringFromFlatbuffers(field->name());
+ *out =
+ ::arrow::field(std::move(field_name), type, field->nullable(), std::move(metadata));
+ if (dictionary_id != -1) {
+ // We need both the id -> type mapping (to find the value type when
+ // reading a dictionary batch)
+ // and the field path -> id mapping (to find the dictionary when
+ // reading a record batch)
+ RETURN_NOT_OK(dictionary_memo->fields().AddField(dictionary_id, field_pos.path()));
+ RETURN_NOT_OK(dictionary_memo->AddDictionaryType(dictionary_id, dict_value_type));
+ }
+ return Status::OK();
+}
+
+// Returns the endianness of the system we are running on,
+// based on the NUMPY_API function. See NOTICE.txt
+flatbuf::Endianness endianness() {
+ union {
+ uint32_t i;
+ char c[4];
+ } bint = {0x01020304};
+
+ return bint.c[0] == 1 ? flatbuf::Endianness::Big : flatbuf::Endianness::Little;
+}
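+
+// Note (added for clarity): on a little-endian machine such as x86-64, the
+// lowest-addressed byte of bint holds the least significant byte of
+// 0x01020304, so bint.c[0] == 0x04 and Endianness::Little is returned; on a
+// big-endian machine bint.c[0] == 0x01 and Endianness::Big is returned.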
+
+flatbuffers::Offset<KVVector> SerializeCustomMetadata(
+ FBB& fbb, const std::shared_ptr<const KeyValueMetadata>& metadata) {
+ std::vector<KeyValueOffset> key_values;
+ if (metadata != nullptr) {
+ AppendKeyValueMetadata(fbb, *metadata, &key_values);
+ return fbb.CreateVector(key_values);
+ } else {
+ // null
+ return 0;
+ }
+}
+
+Status SchemaToFlatbuffer(FBB& fbb, const Schema& schema,
+ const DictionaryFieldMapper& mapper,
+ flatbuffers::Offset<flatbuf::Schema>* out) {
+ std::vector<FieldOffset> field_offsets;
+ FieldPosition pos;
+ for (int i = 0; i < schema.num_fields(); ++i) {
+ FieldOffset offset;
+ FieldToFlatbufferVisitor field_visitor(fbb, mapper, pos.child(i));
+ RETURN_NOT_OK(field_visitor.GetResult(schema.field(i), &offset));
+ field_offsets.push_back(offset);
+ }
+
+ auto fb_offsets = fbb.CreateVector(field_offsets);
+ *out = flatbuf::CreateSchema(fbb, endianness(), fb_offsets,
+ SerializeCustomMetadata(fbb, schema.metadata()));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Buffer>> WriteFBMessage(
+ FBB& fbb, flatbuf::MessageHeader header_type, flatbuffers::Offset<void> header,
+ int64_t body_length, MetadataVersion version,
+ const std::shared_ptr<const KeyValueMetadata>& custom_metadata, MemoryPool* pool) {
+ auto message = flatbuf::CreateMessage(fbb, MetadataVersionToFlatbuffer(version),
+ header_type, header, body_length,
+ SerializeCustomMetadata(fbb, custom_metadata));
+ fbb.Finish(message);
+ return WriteFlatbufferBuilder(fbb, pool);
+}
+
+using FieldNodeVector =
+ flatbuffers::Offset<flatbuffers::Vector<const flatbuf::FieldNode*>>;
+using BufferVector = flatbuffers::Offset<flatbuffers::Vector<const flatbuf::Buffer*>>;
+using BodyCompressionOffset = flatbuffers::Offset<flatbuf::BodyCompression>;
+
+static Status WriteFieldNodes(FBB& fbb, const std::vector<FieldMetadata>& nodes,
+ FieldNodeVector* out) {
+ std::vector<flatbuf::FieldNode> fb_nodes;
+ fb_nodes.reserve(nodes.size());
+
+ for (size_t i = 0; i < nodes.size(); ++i) {
+ const FieldMetadata& node = nodes[i];
+ if (node.offset != 0) {
+ return Status::Invalid("Field metadata for IPC must have offset 0");
+ }
+ fb_nodes.emplace_back(node.length, node.null_count);
+ }
+ *out = fbb.CreateVectorOfStructs(fb_nodes.data(), fb_nodes.size());
+ return Status::OK();
+}
+
+static Status WriteBuffers(FBB& fbb, const std::vector<BufferMetadata>& buffers,
+ BufferVector* out) {
+ std::vector<flatbuf::Buffer> fb_buffers;
+ fb_buffers.reserve(buffers.size());
+
+ for (size_t i = 0; i < buffers.size(); ++i) {
+ const BufferMetadata& buffer = buffers[i];
+ fb_buffers.emplace_back(buffer.offset, buffer.length);
+ }
+ *out = fbb.CreateVectorOfStructs(fb_buffers.data(), fb_buffers.size());
+
+ return Status::OK();
+}
+
+static Status GetBodyCompression(FBB& fbb, const IpcWriteOptions& options,
+ BodyCompressionOffset* out) {
+ if (options.codec != nullptr) {
+ flatbuf::CompressionType codec;
+ if (options.codec->compression_type() == Compression::LZ4_FRAME) {
+ codec = flatbuf::CompressionType::LZ4_FRAME;
+ } else if (options.codec->compression_type() == Compression::ZSTD) {
+ codec = flatbuf::CompressionType::ZSTD;
+ } else {
+ return Status::Invalid("Unsupported IPC compression codec: ",
+ options.codec->name());
+ }
+ *out = flatbuf::CreateBodyCompression(fbb, codec,
+ flatbuf::BodyCompressionMethod::BUFFER);
+ }
+ return Status::OK();
+}
+
+static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length,
+ const std::vector<FieldMetadata>& nodes,
+ const std::vector<BufferMetadata>& buffers,
+ const IpcWriteOptions& options, RecordBatchOffset* offset) {
+ FieldNodeVector fb_nodes;
+ RETURN_NOT_OK(WriteFieldNodes(fbb, nodes, &fb_nodes));
+
+ BufferVector fb_buffers;
+ RETURN_NOT_OK(WriteBuffers(fbb, buffers, &fb_buffers));
+
+ BodyCompressionOffset fb_compression;
+ RETURN_NOT_OK(GetBodyCompression(fbb, options, &fb_compression));
+
+ *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression);
+ return Status::OK();
+}
+
+Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index,
+ const std::vector<BufferMetadata>& buffers,
+ flatbuf::SparseTensorIndex* fb_sparse_index_type,
+ Offset* fb_sparse_index, size_t* num_buffers) {
+ *fb_sparse_index_type = flatbuf::SparseTensorIndex::SparseTensorIndexCOO;
+
+ // We assume that the value type of indices tensor is an integer.
+ const auto& index_value_type =
+ checked_cast<const IntegerType&>(*sparse_index.indices()->type());
+ auto indices_type_offset =
+ flatbuf::CreateInt(fbb, index_value_type.bit_width(), index_value_type.is_signed());
+
+ auto fb_strides = fbb.CreateVector(sparse_index.indices()->strides().data(),
+ sparse_index.indices()->strides().size());
+
+ const BufferMetadata& indices_metadata = buffers[0];
+ flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length);
+
+ *fb_sparse_index =
+ flatbuf::CreateSparseTensorIndexCOO(fbb, indices_type_offset, fb_strides, &indices,
+ sparse_index.is_canonical())
+ .Union();
+ *num_buffers = 1;
+ return Status::OK();
+}
+
+template <typename SparseIndexType>
+struct SparseMatrixCompressedAxis {};
+
+template <>
+struct SparseMatrixCompressedAxis<SparseCSRIndex> {
+ constexpr static const auto value = flatbuf::SparseMatrixCompressedAxis::Row;
+};
+
+template <>
+struct SparseMatrixCompressedAxis<SparseCSCIndex> {
+ constexpr static const auto value = flatbuf::SparseMatrixCompressedAxis::Column;
+};
+
+template <typename SparseIndexType>
+Status MakeSparseMatrixIndexCSX(FBB& fbb, const SparseIndexType& sparse_index,
+ const std::vector<BufferMetadata>& buffers,
+ flatbuf::SparseTensorIndex* fb_sparse_index_type,
+ Offset* fb_sparse_index, size_t* num_buffers) {
+ *fb_sparse_index_type = flatbuf::SparseTensorIndex::SparseMatrixIndexCSX;
+
+ // We assume that the value type of indptr tensor is an integer.
+ const auto& indptr_value_type =
+ checked_cast<const IntegerType&>(*sparse_index.indptr()->type());
+ auto indptr_type_offset = flatbuf::CreateInt(fbb, indptr_value_type.bit_width(),
+ indptr_value_type.is_signed());
+
+ const BufferMetadata& indptr_metadata = buffers[0];
+ flatbuf::Buffer indptr(indptr_metadata.offset, indptr_metadata.length);
+
+ // We assume that the value type of indices tensor is an integer.
+ const auto& indices_value_type =
+ checked_cast<const IntegerType&>(*sparse_index.indices()->type());
+ auto indices_type_offset = flatbuf::CreateInt(fbb, indices_value_type.bit_width(),
+ indices_value_type.is_signed());
+
+ const BufferMetadata& indices_metadata = buffers[1];
+ flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length);
+
+ auto compressedAxis = SparseMatrixCompressedAxis<SparseIndexType>::value;
+ *fb_sparse_index =
+ flatbuf::CreateSparseMatrixIndexCSX(fbb, compressedAxis, indptr_type_offset,
+ &indptr, indices_type_offset, &indices)
+ .Union();
+ *num_buffers = 2;
+ return Status::OK();
+}
+
+Status MakeSparseTensorIndexCSF(FBB& fbb, const SparseCSFIndex& sparse_index,
+ const std::vector<BufferMetadata>& buffers,
+ flatbuf::SparseTensorIndex* fb_sparse_index_type,
+ Offset* fb_sparse_index, size_t* num_buffers) {
+ *fb_sparse_index_type = flatbuf::SparseTensorIndex::SparseTensorIndexCSF;
+ const int ndim = static_cast<int>(sparse_index.axis_order().size());
+
+ // We assume that the value type of indptr tensor is an integer.
+ const auto& indptr_value_type =
+ checked_cast<const IntegerType&>(*sparse_index.indptr()[0]->type());
+ auto indptr_type_offset = flatbuf::CreateInt(fbb, indptr_value_type.bit_width(),
+ indptr_value_type.is_signed());
+
+ // We assume that the value type of indices tensor is an integer.
+ const auto& indices_value_type =
+ checked_cast<const IntegerType&>(*sparse_index.indices()[0]->type());
+ auto indices_type_offset = flatbuf::CreateInt(fbb, indices_value_type.bit_width(),
+ indices_value_type.is_signed());
+
+ const int64_t indptr_elem_size = GetByteWidth(indptr_value_type);
+ const int64_t indices_elem_size = GetByteWidth(indices_value_type);
+
+ int64_t offset = 0;
+ std::vector<flatbuf::Buffer> indptr, indices;
+
+ for (const std::shared_ptr<arrow::Tensor>& tensor : sparse_index.indptr()) {
+ const int64_t size = tensor->data()->size() / indptr_elem_size;
+ const int64_t padded_size = PaddedLength(tensor->data()->size(), kArrowIpcAlignment);
+
+ indptr.push_back({offset, size});
+ offset += padded_size;
+ }
+ for (const std::shared_ptr<arrow::Tensor>& tensor : sparse_index.indices()) {
+ const int64_t size = tensor->data()->size() / indices_elem_size;
+ const int64_t padded_size = PaddedLength(tensor->data()->size(), kArrowIpcAlignment);
+
+ indices.push_back({offset, size});
+ offset += padded_size;
+ }
+
+ auto fb_indices = fbb.CreateVectorOfStructs(indices);
+ auto fb_indptr = fbb.CreateVectorOfStructs(indptr);
+
+ std::vector<int> axis_order;
+ for (int i = 0; i < ndim; ++i) {
+ axis_order.emplace_back(static_cast<int>(sparse_index.axis_order()[i]));
+ }
+ auto fb_axis_order =
+ fbb.CreateVector(arrow::util::MakeNonNull(axis_order.data()), axis_order.size());
+
+ *fb_sparse_index =
+ flatbuf::CreateSparseTensorIndexCSF(fbb, indptr_type_offset, fb_indptr,
+ indices_type_offset, fb_indices, fb_axis_order)
+ .Union();
+ *num_buffers = 2 * ndim - 1;
+ return Status::OK();
+}
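+
+// Note (added for clarity): a SparseCSFIndex holds ndim - 1 indptr tensors
+// and ndim indices tensors, which is why 2 * ndim - 1 buffers are accounted
+// for above.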
+
+Status MakeSparseTensorIndex(FBB& fbb, const SparseIndex& sparse_index,
+ const std::vector<BufferMetadata>& buffers,
+ flatbuf::SparseTensorIndex* fb_sparse_index_type,
+ Offset* fb_sparse_index, size_t* num_buffers) {
+ switch (sparse_index.format_id()) {
+ case SparseTensorFormat::COO:
+ RETURN_NOT_OK(MakeSparseTensorIndexCOO(
+ fbb, checked_cast<const SparseCOOIndex&>(sparse_index), buffers,
+ fb_sparse_index_type, fb_sparse_index, num_buffers));
+ break;
+
+ case SparseTensorFormat::CSR:
+ RETURN_NOT_OK(MakeSparseMatrixIndexCSX(
+ fbb, checked_cast<const SparseCSRIndex&>(sparse_index), buffers,
+ fb_sparse_index_type, fb_sparse_index, num_buffers));
+ break;
+
+ case SparseTensorFormat::CSC:
+ RETURN_NOT_OK(MakeSparseMatrixIndexCSX(
+ fbb, checked_cast<const SparseCSCIndex&>(sparse_index), buffers,
+ fb_sparse_index_type, fb_sparse_index, num_buffers));
+ break;
+
+ case SparseTensorFormat::CSF:
+ RETURN_NOT_OK(MakeSparseTensorIndexCSF(
+ fbb, checked_cast<const SparseCSFIndex&>(sparse_index), buffers,
+ fb_sparse_index_type, fb_sparse_index, num_buffers));
+ break;
+
+ default:
+ *fb_sparse_index_type = flatbuf::SparseTensorIndex::NONE; // Silence warnings
+ std::stringstream ss;
+ ss << "Unsupported sparse tensor format:: " << sparse_index.ToString() << std::endl;
+ return Status::NotImplemented(ss.str());
+ }
+
+ return Status::OK();
+}
+
+Status MakeSparseTensor(FBB& fbb, const SparseTensor& sparse_tensor, int64_t body_length,
+ const std::vector<BufferMetadata>& buffers,
+ SparseTensorOffset* offset) {
+ flatbuf::Type fb_type_type;
+ Offset fb_type;
+ RETURN_NOT_OK(
+ TensorTypeToFlatbuffer(fbb, *sparse_tensor.type(), &fb_type_type, &fb_type));
+
+ using TensorDimOffset = flatbuffers::Offset<flatbuf::TensorDim>;
+ std::vector<TensorDimOffset> dims;
+ for (int i = 0; i < sparse_tensor.ndim(); ++i) {
+ FBString name = fbb.CreateString(sparse_tensor.dim_name(i));
+ dims.push_back(flatbuf::CreateTensorDim(fbb, sparse_tensor.shape()[i], name));
+ }
+
+ auto fb_shape = fbb.CreateVector(dims);
+
+ flatbuf::SparseTensorIndex fb_sparse_index_type;
+ Offset fb_sparse_index;
+ size_t num_index_buffers = 0;
+ RETURN_NOT_OK(MakeSparseTensorIndex(fbb, *sparse_tensor.sparse_index(), buffers,
+ &fb_sparse_index_type, &fb_sparse_index,
+ &num_index_buffers));
+
+ const BufferMetadata& data_metadata = buffers[num_index_buffers];
+ flatbuf::Buffer data(data_metadata.offset, data_metadata.length);
+
+ const int64_t non_zero_length = sparse_tensor.non_zero_length();
+
+ *offset =
+ flatbuf::CreateSparseTensor(fbb, fb_type_type, fb_type, fb_shape, non_zero_length,
+ fb_sparse_index_type, fb_sparse_index, &data);
+
+ return Status::OK();
+}
+
+} // namespace
+
+Status GetKeyValueMetadata(const KVVector* fb_metadata,
+ std::shared_ptr<KeyValueMetadata>* out) {
+ if (fb_metadata == nullptr) {
+ *out = nullptr;
+ return Status::OK();
+ }
+
+ auto metadata = std::make_shared<KeyValueMetadata>();
+
+ metadata->reserve(fb_metadata->size());
+ for (const auto pair : *fb_metadata) {
+ CHECK_FLATBUFFERS_NOT_NULL(pair->key(), "custom_metadata.key");
+ CHECK_FLATBUFFERS_NOT_NULL(pair->value(), "custom_metadata.value");
+ metadata->Append(pair->key()->str(), pair->value()->str());
+ }
+
+ *out = std::move(metadata);
+ return Status::OK();
+}
+
+Status WriteSchemaMessage(const Schema& schema, const DictionaryFieldMapper& mapper,
+ const IpcWriteOptions& options, std::shared_ptr<Buffer>* out) {
+ FBB fbb;
+ flatbuffers::Offset<flatbuf::Schema> fb_schema;
+ RETURN_NOT_OK(SchemaToFlatbuffer(fbb, schema, mapper, &fb_schema));
+ return WriteFBMessage(fbb, flatbuf::MessageHeader::Schema, fb_schema.Union(),
+ /*body_length=*/0, options.metadata_version,
+ /*custom_metadata=*/nullptr, options.memory_pool)
+ .Value(out);
+}
+
+Status WriteRecordBatchMessage(
+ int64_t length, int64_t body_length,
+ const std::shared_ptr<const KeyValueMetadata>& custom_metadata,
+ const std::vector<FieldMetadata>& nodes, const std::vector<BufferMetadata>& buffers,
+ const IpcWriteOptions& options, std::shared_ptr<Buffer>* out) {
+ FBB fbb;
+ RecordBatchOffset record_batch;
+ RETURN_NOT_OK(
+ MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch));
+ return WriteFBMessage(fbb, flatbuf::MessageHeader::RecordBatch, record_batch.Union(),
+ body_length, options.metadata_version, custom_metadata,
+ options.memory_pool)
+ .Value(out);
+}
+
+Result<std::shared_ptr<Buffer>> WriteTensorMessage(const Tensor& tensor,
+ int64_t buffer_start_offset,
+ const IpcWriteOptions& options) {
+ using TensorDimOffset = flatbuffers::Offset<flatbuf::TensorDim>;
+ using TensorOffset = flatbuffers::Offset<flatbuf::Tensor>;
+
+ FBB fbb;
+ const int elem_size = GetByteWidth(*tensor.type());
+
+ flatbuf::Type fb_type_type;
+ Offset fb_type;
+ RETURN_NOT_OK(TensorTypeToFlatbuffer(fbb, *tensor.type(), &fb_type_type, &fb_type));
+
+ std::vector<TensorDimOffset> dims;
+ for (int i = 0; i < tensor.ndim(); ++i) {
+ FBString name = fbb.CreateString(tensor.dim_name(i));
+ dims.push_back(flatbuf::CreateTensorDim(fbb, tensor.shape()[i], name));
+ }
+
+ auto fb_shape = fbb.CreateVector(dims.data(), dims.size());
+
+ flatbuffers::Offset<flatbuffers::Vector<int64_t>> fb_strides;
+ fb_strides = fbb.CreateVector(tensor.strides().data(), tensor.strides().size());
+ int64_t body_length = tensor.size() * elem_size;
+ flatbuf::Buffer buffer(buffer_start_offset, body_length);
+
+ TensorOffset fb_tensor =
+ flatbuf::CreateTensor(fbb, fb_type_type, fb_type, fb_shape, fb_strides, &buffer);
+
+ return WriteFBMessage(fbb, flatbuf::MessageHeader::Tensor, fb_tensor.Union(),
+ body_length, options.metadata_version,
+ /*custom_metadata=*/nullptr, options.memory_pool);
+}
+
+Result<std::shared_ptr<Buffer>> WriteSparseTensorMessage(
+ const SparseTensor& sparse_tensor, int64_t body_length,
+ const std::vector<BufferMetadata>& buffers, const IpcWriteOptions& options) {
+ FBB fbb;
+ SparseTensorOffset fb_sparse_tensor;
+ RETURN_NOT_OK(
+ MakeSparseTensor(fbb, sparse_tensor, body_length, buffers, &fb_sparse_tensor));
+ return WriteFBMessage(fbb, flatbuf::MessageHeader::SparseTensor,
+ fb_sparse_tensor.Union(), body_length, options.metadata_version,
+ /*custom_metadata=*/nullptr, options.memory_pool);
+}
+
+Status WriteDictionaryMessage(
+ int64_t id, bool is_delta, int64_t length, int64_t body_length,
+ const std::shared_ptr<const KeyValueMetadata>& custom_metadata,
+ const std::vector<FieldMetadata>& nodes, const std::vector<BufferMetadata>& buffers,
+ const IpcWriteOptions& options, std::shared_ptr<Buffer>* out) {
+ FBB fbb;
+ RecordBatchOffset record_batch;
+ RETURN_NOT_OK(
+ MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch));
+ auto dictionary_batch =
+ flatbuf::CreateDictionaryBatch(fbb, id, record_batch, is_delta).Union();
+ return WriteFBMessage(fbb, flatbuf::MessageHeader::DictionaryBatch, dictionary_batch,
+ body_length, options.metadata_version, custom_metadata,
+ options.memory_pool)
+ .Value(out);
+}
+
+static flatbuffers::Offset<flatbuffers::Vector<const flatbuf::Block*>>
+FileBlocksToFlatbuffer(FBB& fbb, const std::vector<FileBlock>& blocks) {
+ std::vector<flatbuf::Block> fb_blocks;
+
+ for (const FileBlock& block : blocks) {
+ fb_blocks.emplace_back(block.offset, block.metadata_length, block.body_length);
+ }
+
+ return fbb.CreateVectorOfStructs(fb_blocks.data(), fb_blocks.size());
+}
+
+Status WriteFileFooter(const Schema& schema, const std::vector<FileBlock>& dictionaries,
+ const std::vector<FileBlock>& record_batches,
+ const std::shared_ptr<const KeyValueMetadata>& metadata,
+ io::OutputStream* out) {
+ FBB fbb;
+
+ flatbuffers::Offset<flatbuf::Schema> fb_schema;
+ DictionaryFieldMapper mapper(schema);
+ RETURN_NOT_OK(SchemaToFlatbuffer(fbb, schema, mapper, &fb_schema));
+
+#ifndef NDEBUG
+ for (size_t i = 0; i < dictionaries.size(); ++i) {
+ DCHECK(BitUtil::IsMultipleOf8(dictionaries[i].offset)) << i;
+ DCHECK(BitUtil::IsMultipleOf8(dictionaries[i].metadata_length)) << i;
+ DCHECK(BitUtil::IsMultipleOf8(dictionaries[i].body_length)) << i;
+ }
+
+ for (size_t i = 0; i < record_batches.size(); ++i) {
+ DCHECK(BitUtil::IsMultipleOf8(record_batches[i].offset)) << i;
+ DCHECK(BitUtil::IsMultipleOf8(record_batches[i].metadata_length)) << i;
+ DCHECK(BitUtil::IsMultipleOf8(record_batches[i].body_length)) << i;
+ }
+#endif
+
+ auto fb_dictionaries = FileBlocksToFlatbuffer(fbb, dictionaries);
+ auto fb_record_batches = FileBlocksToFlatbuffer(fbb, record_batches);
+
+ auto fb_custom_metadata = SerializeCustomMetadata(fbb, metadata);
+
+ auto footer =
+ flatbuf::CreateFooter(fbb, kCurrentMetadataVersion, fb_schema, fb_dictionaries,
+ fb_record_batches, fb_custom_metadata);
+ fbb.Finish(footer);
+
+ int32_t size = fbb.GetSize();
+
+ return out->Write(fbb.GetBufferPointer(), size);
+}
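+
+// Note (added for orientation; see the Arrow IPC file format spec): the
+// footer written above is the trailing section of an IPC file, which is
+// laid out roughly as
+//
+//   <magic "ARROW1"> <padding>
+//   <dictionary and record batch messages in the streaming format>
+//   <FOOTER flatbuffer> <footer size: int32> <magic "ARROW1">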
+
+// ----------------------------------------------------------------------
+
+Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo,
+ std::shared_ptr<Schema>* out) {
+ auto schema = static_cast<const flatbuf::Schema*>(opaque_schema);
+ CHECK_FLATBUFFERS_NOT_NULL(schema, "schema");
+ CHECK_FLATBUFFERS_NOT_NULL(schema->fields(), "Schema.fields");
+ int num_fields = static_cast<int>(schema->fields()->size());
+
+ FieldPosition field_pos;
+
+ std::vector<std::shared_ptr<Field>> fields(num_fields);
+ for (int i = 0; i < num_fields; ++i) {
+ const flatbuf::Field* field = schema->fields()->Get(i);
+    // XXX I don't think this check is necessary (AP)
+    CHECK_FLATBUFFERS_NOT_NULL(field, "Schema.fields entry");
+ RETURN_NOT_OK(
+ FieldFromFlatbuffer(field, field_pos.child(i), dictionary_memo, &fields[i]));
+ }
+
+ std::shared_ptr<KeyValueMetadata> metadata;
+ RETURN_NOT_OK(internal::GetKeyValueMetadata(schema->custom_metadata(), &metadata));
+  // Set endianness using the value in the flatbuffer schema
+ auto endianness = schema->endianness() == flatbuf::Endianness::Little
+ ? Endianness::Little
+ : Endianness::Big;
+ *out = ::arrow::schema(std::move(fields), endianness, metadata);
+ return Status::OK();
+}
+
+Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
+ std::vector<int64_t>* shape, std::vector<int64_t>* strides,
+ std::vector<std::string>* dim_names) {
+ const flatbuf::Message* message = nullptr;
+ RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message));
+ auto tensor = message->header_as_Tensor();
+ if (tensor == nullptr) {
+ return Status::IOError("Header-type of flatbuffer-encoded Message is not Tensor.");
+ }
+
+ flatbuffers::uoffset_t ndim = tensor->shape()->size();
+
+ for (flatbuffers::uoffset_t i = 0; i < ndim; ++i) {
+ auto dim = tensor->shape()->Get(i);
+
+ shape->push_back(dim->size());
+ dim_names->push_back(StringFromFlatbuffers(dim->name()));
+ }
+
+ if (tensor->strides() && tensor->strides()->size() > 0) {
+ if (tensor->strides()->size() != ndim) {
+ return Status::IOError(
+ "The sizes of shape and strides in a tensor are mismatched.");
+ }
+
+ for (decltype(ndim) i = 0; i < ndim; ++i) {
+ strides->push_back(tensor->strides()->Get(i));
+ }
+ }
+
+ auto type_data = tensor->type(); // Required
+ return ConcreteTypeFromFlatbuffer(tensor->type_type(), type_data, {}, type);
+}
+
+Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_index,
+ std::shared_ptr<DataType>* indices_type) {
+ return IntFromFlatbuffer(sparse_index->indicesType(), indices_type);
+}
+
+Status GetSparseCSXIndexMetadata(const flatbuf::SparseMatrixIndexCSX* sparse_index,
+ std::shared_ptr<DataType>* indptr_type,
+ std::shared_ptr<DataType>* indices_type) {
+ RETURN_NOT_OK(IntFromFlatbuffer(sparse_index->indptrType(), indptr_type));
+ RETURN_NOT_OK(IntFromFlatbuffer(sparse_index->indicesType(), indices_type));
+ return Status::OK();
+}
+
+Status GetSparseCSFIndexMetadata(const flatbuf::SparseTensorIndexCSF* sparse_index,
+ std::vector<int64_t>* axis_order,
+ std::vector<int64_t>* indices_size,
+ std::shared_ptr<DataType>* indptr_type,
+ std::shared_ptr<DataType>* indices_type) {
+ RETURN_NOT_OK(IntFromFlatbuffer(sparse_index->indptrType(), indptr_type));
+ RETURN_NOT_OK(IntFromFlatbuffer(sparse_index->indicesType(), indices_type));
+
+ const int ndim = static_cast<int>(sparse_index->axisOrder()->size());
+ for (int i = 0; i < ndim; ++i) {
+ axis_order->push_back(sparse_index->axisOrder()->Get(i));
+ indices_size->push_back(sparse_index->indicesBuffers()->Get(i)->length());
+ }
+
+ return Status::OK();
+}
+
+Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
+ std::vector<int64_t>* shape,
+ std::vector<std::string>* dim_names,
+ int64_t* non_zero_length,
+ SparseTensorFormat::type* sparse_tensor_format_id) {
+ const flatbuf::Message* message = nullptr;
+ RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message));
+ auto sparse_tensor = message->header_as_SparseTensor();
+ if (sparse_tensor == nullptr) {
+ return Status::IOError(
+ "Header-type of flatbuffer-encoded Message is not SparseTensor.");
+ }
+ int ndim = static_cast<int>(sparse_tensor->shape()->size());
+
+ if (shape || dim_names) {
+ for (int i = 0; i < ndim; ++i) {
+ auto dim = sparse_tensor->shape()->Get(i);
+
+ if (shape) {
+ shape->push_back(dim->size());
+ }
+
+ if (dim_names) {
+ dim_names->push_back(StringFromFlatbuffers(dim->name()));
+ }
+ }
+ }
+
+ if (non_zero_length) {
+ *non_zero_length = sparse_tensor->non_zero_length();
+ }
+
+ if (sparse_tensor_format_id) {
+ switch (sparse_tensor->sparseIndex_type()) {
+ case flatbuf::SparseTensorIndex::SparseTensorIndexCOO:
+ *sparse_tensor_format_id = SparseTensorFormat::COO;
+ break;
+
+ case flatbuf::SparseTensorIndex::SparseMatrixIndexCSX: {
+ auto cs = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX();
+ switch (cs->compressedAxis()) {
+ case flatbuf::SparseMatrixCompressedAxis::Row:
+ *sparse_tensor_format_id = SparseTensorFormat::CSR;
+ break;
+
+ case flatbuf::SparseMatrixCompressedAxis::Column:
+ *sparse_tensor_format_id = SparseTensorFormat::CSC;
+ break;
+
+ default:
+ return Status::Invalid("Invalid value of SparseMatrixCompressedAxis");
+ }
+ } break;
+
+ case flatbuf::SparseTensorIndex::SparseTensorIndexCSF:
+ *sparse_tensor_format_id = SparseTensorFormat::CSF;
+ break;
+
+ default:
+ return Status::Invalid("Unrecognized sparse index type");
+ }
+ }
+
+ auto type_data = sparse_tensor->type(); // Required
+ if (type) {
+ return ConcreteTypeFromFlatbuffer(sparse_tensor->type_type(), type_data, {}, type);
+ } else {
+ return Status::OK();
+ }
+}
+
+} // namespace internal
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h
new file mode 100644
index 00000000000..9cf489dd668
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h
@@ -0,0 +1,227 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Internal metadata serialization matters
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <flatbuffers/flatbuffers.h>
+
+#include "arrow/buffer.h"
+#include "arrow/io/type_fwd.h"
+#include "arrow/ipc/message.h"
+#include "arrow/result.h"
+#include "arrow/sparse_tensor.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+#include "generated/Message_generated.h"
+#include "generated/Schema_generated.h"
+#include "generated/SparseTensor_generated.h" // IWYU pragma: keep
+
+namespace arrow {
+
+namespace flatbuf = org::apache::arrow::flatbuf;
+
+namespace ipc {
+
+class DictionaryFieldMapper;
+class DictionaryMemo;
+
+namespace internal {
+
+using KeyValueOffset = flatbuffers::Offset<flatbuf::KeyValue>;
+using KVVector = flatbuffers::Vector<KeyValueOffset>;
+
+// This 0xFFFFFFFF value is the first 4 bytes of a valid IPC message
+constexpr int32_t kIpcContinuationToken = -1;
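+
+// Note (added for orientation; see the Arrow IPC spec): each encapsulated
+// message is framed as
+//
+//   <continuation token: 0xFFFFFFFF>
+//   <metadata size: int32>
+//   <metadata flatbuffer, padded to an 8-byte boundary>
+//   <message body>
+//
+// The legacy (pre-0.15.0) format omits the 4-byte continuation token.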
+
+static constexpr flatbuf::MetadataVersion kCurrentMetadataVersion =
+ flatbuf::MetadataVersion::V5;
+
+static constexpr flatbuf::MetadataVersion kLatestMetadataVersion =
+ flatbuf::MetadataVersion::V5;
+
+static constexpr flatbuf::MetadataVersion kMinMetadataVersion =
+ flatbuf::MetadataVersion::V4;
+
+MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version);
+
+// This function is used in a unit test
+ARROW_EXPORT
+flatbuf::MetadataVersion MetadataVersionToFlatbuffer(MetadataVersion version);
+
+// Whether the type has a validity bitmap in the given IPC version
+bool HasValidityBitmap(Type::type type_id, MetadataVersion version);
+
+static constexpr const char* kArrowMagicBytes = "ARROW1";
+
+struct FieldMetadata {
+ int64_t length;
+ int64_t null_count;
+ int64_t offset;
+};
+
+struct BufferMetadata {
+ /// The relative offset into the memory page to the starting byte of the buffer
+ int64_t offset;
+
+ /// Absolute length in bytes of the buffer
+ int64_t length;
+};
+
+struct FileBlock {
+ int64_t offset;
+ int32_t metadata_length;
+ int64_t body_length;
+};
+
+// Low-level utilities to help with reading Flatbuffers data.
+
+#define CHECK_FLATBUFFERS_NOT_NULL(fb_value, name) \
+ if ((fb_value) == NULLPTR) { \
+ return Status::IOError("Unexpected null field ", name, \
+ " in flatbuffer-encoded metadata"); \
+ }
+
+template <typename T>
+inline uint32_t FlatBuffersVectorSize(const flatbuffers::Vector<T>* vec) {
+ return (vec == NULLPTR) ? 0 : vec->size();
+}
+
+inline std::string StringFromFlatbuffers(const flatbuffers::String* s) {
+ return (s == NULLPTR) ? "" : s->str();
+}
+
+// Read interface classes. We do not fully deserialize the flatbuffers so
+// that individual field metadata can be retrieved from a very large schema
+// without deserializing the entire message.
+
+// Construct a complete Schema from the message and add
+// dictionary-encoded fields to a DictionaryMemo instance. May be
+// expensive for very large schemas if you are only interested in a
+// few fields
+Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo,
+ std::shared_ptr<Schema>* out);
+
+Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
+ std::vector<int64_t>* shape, std::vector<int64_t>* strides,
+ std::vector<std::string>* dim_names);
+
+// EXPERIMENTAL: Extracting metadata of a SparseCOOIndex from the message
+Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_index,
+ std::shared_ptr<DataType>* indices_type);
+
+// EXPERIMENTAL: Extracting metadata of a SparseCSXIndex from the message
+Status GetSparseCSXIndexMetadata(const flatbuf::SparseMatrixIndexCSX* sparse_index,
+ std::shared_ptr<DataType>* indptr_type,
+ std::shared_ptr<DataType>* indices_type);
+
+// EXPERIMENTAL: Extracting metadata of a SparseCSFIndex from the message
+Status GetSparseCSFIndexMetadata(const flatbuf::SparseTensorIndexCSF* sparse_index,
+ std::vector<int64_t>* axis_order,
+ std::vector<int64_t>* indices_size,
+ std::shared_ptr<DataType>* indptr_type,
+ std::shared_ptr<DataType>* indices_type);
+
+// EXPERIMENTAL: Extracting metadata of a sparse tensor from the message
+Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
+ std::vector<int64_t>* shape,
+ std::vector<std::string>* dim_names, int64_t* length,
+ SparseTensorFormat::type* sparse_tensor_format_id);
+
+Status GetKeyValueMetadata(const KVVector* fb_metadata,
+ std::shared_ptr<KeyValueMetadata>* out);
+
+template <typename RootType>
+bool VerifyFlatbuffers(const uint8_t* data, int64_t size) {
+  // Heuristic: tables in an Arrow flatbuffers buffer must take at least 1 bit
+  // each on average (ARROW-11559).
+ // Especially, the only recursive table (the `Field` table in Schema.fbs)
+ // must have a non-empty `type` member.
+ flatbuffers::Verifier verifier(
+ data, static_cast<size_t>(size),
+ /*max_depth=*/128,
+ /*max_tables=*/static_cast<flatbuffers::uoffset_t>(8 * size));
+ return verifier.VerifyBuffer<RootType>(nullptr);
+}
+
+static inline Status VerifyMessage(const uint8_t* data, int64_t size,
+ const flatbuf::Message** out) {
+ if (!VerifyFlatbuffers<flatbuf::Message>(data, size)) {
+ return Status::IOError("Invalid flatbuffers message.");
+ }
+ *out = flatbuf::GetMessage(data);
+ return Status::OK();
+}
+
+// Serialize arrow::Schema as a Flatbuffer
+Status WriteSchemaMessage(const Schema& schema, const DictionaryFieldMapper& mapper,
+ const IpcWriteOptions& options, std::shared_ptr<Buffer>* out);
+
+// This function is used in a unit test
+ARROW_EXPORT
+Status WriteRecordBatchMessage(
+ const int64_t length, const int64_t body_length,
+ const std::shared_ptr<const KeyValueMetadata>& custom_metadata,
+ const std::vector<FieldMetadata>& nodes, const std::vector<BufferMetadata>& buffers,
+ const IpcWriteOptions& options, std::shared_ptr<Buffer>* out);
+
+Result<std::shared_ptr<Buffer>> WriteTensorMessage(const Tensor& tensor,
+ const int64_t buffer_start_offset,
+ const IpcWriteOptions& options);
+
+Result<std::shared_ptr<Buffer>> WriteSparseTensorMessage(
+ const SparseTensor& sparse_tensor, int64_t body_length,
+ const std::vector<BufferMetadata>& buffers, const IpcWriteOptions& options);
+
+Status WriteFileFooter(const Schema& schema, const std::vector<FileBlock>& dictionaries,
+ const std::vector<FileBlock>& record_batches,
+ const std::shared_ptr<const KeyValueMetadata>& metadata,
+ io::OutputStream* out);
+
+Status WriteDictionaryMessage(
+ const int64_t id, const bool is_delta, const int64_t length,
+ const int64_t body_length,
+ const std::shared_ptr<const KeyValueMetadata>& custom_metadata,
+ const std::vector<FieldMetadata>& nodes, const std::vector<BufferMetadata>& buffers,
+ const IpcWriteOptions& options, std::shared_ptr<Buffer>* out);
+
+static inline Result<std::shared_ptr<Buffer>> WriteFlatbufferBuilder(
+ flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference
+ MemoryPool* pool = default_memory_pool()) {
+ int32_t size = fbb.GetSize();
+
+ ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(size, pool));
+
+ uint8_t* dst = result->mutable_data();
+ memcpy(dst, fbb.GetBufferPointer(), size);
+ return std::move(result);
+}
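+
+// Example usage (illustrative sketch, not part of the original header),
+// assuming a builder that has already been finished:
+//
+//   flatbuffers::FlatBufferBuilder fbb;
+//   ...  // build a root table, then call fbb.Finish(root)
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> buf,
+//                         WriteFlatbufferBuilder(fbb));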
+
+} // namespace internal
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.cc
new file mode 100644
index 00000000000..e5b14a47fac
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.cc
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/options.h"
+
+#include "arrow/status.h"
+
+namespace arrow {
+namespace ipc {
+
+IpcWriteOptions IpcWriteOptions::Defaults() { return IpcWriteOptions(); }
+
+IpcReadOptions IpcReadOptions::Defaults() { return IpcReadOptions(); }
+
+namespace internal {
+
+Status CheckCompressionSupported(Compression::type codec) {
+ if (!(codec == Compression::LZ4_FRAME || codec == Compression::ZSTD)) {
+ return Status::Invalid("Only LZ4_FRAME and ZSTD compression allowed");
+ }
+ return Status::OK();
+}
+
+} // namespace internal
+
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h
new file mode 100644
index 00000000000..2e0f800b5ad
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h
@@ -0,0 +1,161 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/ipc/type_fwd.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace ipc {
+
+// ARROW-109: We set this number arbitrarily to help catch user mistakes. For
+// deeply nested schemas, it is expected the user will indicate explicitly the
+// maximum allowed recursion depth
+constexpr int kMaxNestingDepth = 64;
+
+/// \brief Options for writing Arrow IPC messages
+struct ARROW_EXPORT IpcWriteOptions {
+ /// \brief If true, allow field lengths that don't fit in a signed 32-bit int.
+ ///
+ /// Some implementations may not be able to parse streams created with this option.
+ bool allow_64bit = false;
+
+ /// \brief The maximum permitted schema nesting depth.
+ int max_recursion_depth = kMaxNestingDepth;
+
+  /// \brief Pad each memory buffer to a multiple of this many bytes.
+ int32_t alignment = 8;
+
+ /// \brief Write the pre-0.15.0 IPC message format
+ ///
+  /// This legacy format uses a 4-byte message prefix instead of an 8-byte one.
+ bool write_legacy_ipc_format = false;
+
+ /// \brief The memory pool to use for allocations made during IPC writing
+ ///
+ /// While Arrow IPC is predominantly zero-copy, it may have to allocate
+ /// memory in some cases (for example if compression is enabled).
+ MemoryPool* memory_pool = default_memory_pool();
+
+ /// \brief Compression codec to use for record batch body buffers
+ ///
+  /// May only be UNCOMPRESSED, LZ4_FRAME or ZSTD.
+ std::shared_ptr<util::Codec> codec;
+
+ /// \brief Use global CPU thread pool to parallelize any computational tasks
+ /// like compression
+ bool use_threads = true;
+
+ /// \brief Whether to emit dictionary deltas
+ ///
+ /// If false, a changed dictionary for a given field will emit a full
+ /// dictionary replacement.
+ /// If true, a changed dictionary will be compared against the previous
+  /// version. If possible, a dictionary delta will be emitted, otherwise
+ /// a full dictionary replacement.
+ ///
+ /// Default is false to maximize stream compatibility.
+ ///
+ /// Also, note that if a changed dictionary is a nested dictionary,
+ /// then a delta is never emitted, for compatibility with the read path.
+ bool emit_dictionary_deltas = false;
+
+ /// \brief Whether to unify dictionaries for the IPC file format
+ ///
+ /// The IPC file format doesn't support dictionary replacements or deltas.
+ /// Therefore, chunks of a column with a dictionary type must have the same
+ /// dictionary in each record batch.
+ ///
+ /// If this option is true, RecordBatchWriter::WriteTable will attempt
+ /// to unify dictionaries across each table column. If this option is
+ /// false, unequal dictionaries across a table column will simply raise
+ /// an error.
+ ///
+ /// Note that enabling this option has a runtime cost. Also, not all types
+ /// currently support dictionary unification.
+ ///
+ /// This option is ignored for IPC streams, which support dictionary replacement
+ /// and deltas.
+ bool unify_dictionaries = false;
+
+ /// \brief Format version to use for IPC messages and their metadata.
+ ///
+ /// Presently using V5 version (readable by 1.0.0 and later).
+ /// V4 is also available (readable by 0.8.0 and later).
+ MetadataVersion metadata_version = MetadataVersion::V5;
+
+ static IpcWriteOptions Defaults();
+};
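+
+// Example (illustrative sketch, not part of the original header): enabling
+// ZSTD body compression and dictionary deltas on the default write options,
+// inside a function that returns Status or Result:
+//
+//   IpcWriteOptions options = IpcWriteOptions::Defaults();
+//   ARROW_ASSIGN_OR_RAISE(options.codec,
+//                         util::Codec::Create(Compression::ZSTD));
+//   options.emit_dictionary_deltas = true;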
+
+#ifndef ARROW_NO_DEPRECATED_API
+using IpcOptions = IpcWriteOptions;
+#endif
+
+/// \brief Options for reading Arrow IPC messages
+struct ARROW_EXPORT IpcReadOptions {
+ /// \brief The maximum permitted schema nesting depth.
+ int max_recursion_depth = kMaxNestingDepth;
+
+ /// \brief The memory pool to use for allocations made during IPC reading
+ ///
+ /// While Arrow IPC is predominantly zero-copy, it may have to allocate
+ /// memory in some cases (for example if compression is enabled).
+ MemoryPool* memory_pool = default_memory_pool();
+
+ /// \brief EXPERIMENTAL: Top-level schema fields to include when
+ /// deserializing RecordBatch.
+ ///
+ /// If empty (the default), return all deserialized fields.
+ /// If non-empty, the values are the indices of fields in the top-level schema.
+ std::vector<int> included_fields;
+
+ /// \brief Use global CPU thread pool to parallelize any computational tasks
+ /// like decompression
+ bool use_threads = true;
+
+ /// \brief EXPERIMENTAL: Convert incoming data to platform-native endianness
+ ///
+ /// If the endianness of the received schema is not equal to platform-native
+ /// endianness, then all buffers with endian-sensitive data will be byte-swapped.
+ /// This includes the value buffers of numeric types, temporal types, decimal
+ /// types, as well as the offset buffers of variable-sized binary and list-like
+ /// types.
+ ///
+ /// Endianness conversion is achieved by the RecordBatchFileReader,
+ /// RecordBatchStreamReader and StreamDecoder classes.
+ bool ensure_native_endian = true;
+
+ static IpcReadOptions Defaults();
+};
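+
+// Example (illustrative sketch): deserializing only the first two top-level
+// columns and disabling threaded decompression:
+//
+//   IpcReadOptions options = IpcReadOptions::Defaults();
+//   options.included_fields = {0, 1};
+//   options.use_threads = false;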
+
+namespace internal {
+
+Status CheckCompressionSupported(Compression::type codec);
+
+} // namespace internal
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc
new file mode 100644
index 00000000000..a3c345cc440
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc
@@ -0,0 +1,2081 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/reader.h"
+
+#include <algorithm>
+#include <climits>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <flatbuffers/flatbuffers.h> // IWYU pragma: export
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/extension_type.h"
+#include "arrow/io/caching.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/message.h"
+#include "arrow/ipc/metadata_internal.h"
+#include "arrow/ipc/util.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/record_batch.h"
+#include "arrow/sparse_tensor.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/parallel.h"
+#include "arrow/util/string.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/vector.h"
+#include "arrow/visitor_inline.h"
+
+#include "generated/File_generated.h" // IWYU pragma: export
+#include "generated/Message_generated.h"
+#include "generated/Schema_generated.h"
+#include "generated/SparseTensor_generated.h"
+
+namespace arrow {
+
+namespace flatbuf = org::apache::arrow::flatbuf;
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+using internal::GetByteWidth;
+
+namespace ipc {
+
+using internal::FileBlock;
+using internal::kArrowMagicBytes;
+
+namespace {
+
+enum class DictionaryKind { New, Delta, Replacement };
+
+Status InvalidMessageType(MessageType expected, MessageType actual) {
+ return Status::IOError("Expected IPC message of type ", FormatMessageType(expected),
+ " but got ", FormatMessageType(actual));
+}
+
+#define CHECK_MESSAGE_TYPE(expected, actual) \
+ do { \
+ if ((actual) != (expected)) { \
+ return InvalidMessageType((expected), (actual)); \
+ } \
+ } while (0)
+
+#define CHECK_HAS_BODY(message) \
+ do { \
+ if ((message).body() == nullptr) { \
+ return Status::IOError("Expected body in IPC message of type ", \
+ FormatMessageType((message).type())); \
+ } \
+ } while (0)
+
+#define CHECK_HAS_NO_BODY(message) \
+ do { \
+ if ((message).body_length() != 0) { \
+ return Status::IOError("Unexpected body in IPC message of type ", \
+ FormatMessageType((message).type())); \
+ } \
+ } while (0)
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// Record batch read path
+
+/// \brief Structure to keep common arguments to be passed around during
+/// record batch reading
+struct IpcReadContext {
+ IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, bool swap,
+ MetadataVersion version = MetadataVersion::V5,
+ Compression::type kind = Compression::UNCOMPRESSED)
+ : dictionary_memo(memo),
+ options(option),
+ metadata_version(version),
+ compression(kind),
+ swap_endian(swap) {}
+
+ DictionaryMemo* dictionary_memo;
+
+ const IpcReadOptions& options;
+
+ MetadataVersion metadata_version;
+
+ Compression::type compression;
+
+ /// \brief LoadRecordBatch() or LoadRecordBatchSubset() swaps endianness of elements
+ /// if this flag is true
+ const bool swap_endian;
+};
+
+/// The field_index and buffer_index are incremented based on how much of the
+/// batch is "consumed" (through nested data reconstruction, for example)
+class ArrayLoader {
+ public:
+ explicit ArrayLoader(const flatbuf::RecordBatch* metadata,
+ MetadataVersion metadata_version, const IpcReadOptions& options,
+ io::RandomAccessFile* file)
+ : metadata_(metadata),
+ metadata_version_(metadata_version),
+ file_(file),
+ max_recursion_depth_(options.max_recursion_depth) {}
+
+ Status ReadBuffer(int64_t offset, int64_t length, std::shared_ptr<Buffer>* out) {
+ if (skip_io_) {
+ return Status::OK();
+ }
+ if (offset < 0) {
+ return Status::Invalid("Negative offset for reading buffer ", buffer_index_);
+ }
+ if (length < 0) {
+ return Status::Invalid("Negative length for reading buffer ", buffer_index_);
+ }
+    // Buffers in the IPC body must start at 8-byte aligned offsets
+ if (!BitUtil::IsMultipleOf8(offset)) {
+ return Status::Invalid("Buffer ", buffer_index_,
+ " did not start on 8-byte aligned offset: ", offset);
+ }
+ return file_->ReadAt(offset, length).Value(out);
+ }
+
+ Status LoadType(const DataType& type) { return VisitTypeInline(type, this); }
+
+ Status Load(const Field* field, ArrayData* out) {
+ if (max_recursion_depth_ <= 0) {
+ return Status::Invalid("Max recursion depth reached");
+ }
+
+ field_ = field;
+ out_ = out;
+ out_->type = field_->type();
+ return LoadType(*field_->type());
+ }
+
+ Status SkipField(const Field* field) {
+ ArrayData dummy;
+ skip_io_ = true;
+ Status status = Load(field, &dummy);
+ skip_io_ = false;
+ return status;
+ }
+
+ Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) {
+ auto buffers = metadata_->buffers();
+ CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers");
+ if (buffer_index >= static_cast<int>(buffers->size())) {
+ return Status::IOError("buffer_index out of range.");
+ }
+ const flatbuf::Buffer* buffer = buffers->Get(buffer_index);
+ if (buffer->length() == 0) {
+ // Should never return a null buffer here.
+ // (zero-sized buffer allocations are cheap)
+ return AllocateBuffer(0).Value(out);
+ } else {
+ return ReadBuffer(buffer->offset(), buffer->length(), out);
+ }
+ }
+
+ Status GetFieldMetadata(int field_index, ArrayData* out) {
+ auto nodes = metadata_->nodes();
+ CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes");
+ // pop off a field
+ if (field_index >= static_cast<int>(nodes->size())) {
+ return Status::Invalid("Ran out of field metadata, likely malformed");
+ }
+ const flatbuf::FieldNode* node = nodes->Get(field_index);
+
+ out->length = node->length();
+ out->null_count = node->null_count();
+ out->offset = 0;
+ return Status::OK();
+ }
+
+ Status LoadCommon(Type::type type_id) {
+    // The field metadata only contains the length and null count, which we
+    // use to decide what to do with the buffers. For example, if
+    // null_count == 0, we can skip the validity bitmap without reading it
+    // from shared memory
+ RETURN_NOT_OK(GetFieldMetadata(field_index_++, out_));
+
+ if (internal::HasValidityBitmap(type_id, metadata_version_)) {
+ // Extract null_bitmap which is common to all arrays except for unions
+ // and nulls.
+ if (out_->null_count != 0) {
+ RETURN_NOT_OK(GetBuffer(buffer_index_, &out_->buffers[0]));
+ }
+ buffer_index_++;
+ }
+ return Status::OK();
+ }
+
+ template <typename TYPE>
+ Status LoadPrimitive(Type::type type_id) {
+ out_->buffers.resize(2);
+
+ RETURN_NOT_OK(LoadCommon(type_id));
+ if (out_->length > 0) {
+ RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1]));
+ } else {
+ buffer_index_++;
+ out_->buffers[1].reset(new Buffer(nullptr, 0));
+ }
+ return Status::OK();
+ }
+
+ template <typename TYPE>
+ Status LoadBinary(Type::type type_id) {
+ out_->buffers.resize(3);
+
+ RETURN_NOT_OK(LoadCommon(type_id));
+ RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1]));
+ return GetBuffer(buffer_index_++, &out_->buffers[2]);
+ }
+
+ template <typename TYPE>
+ Status LoadList(const TYPE& type) {
+ out_->buffers.resize(2);
+
+ RETURN_NOT_OK(LoadCommon(type.id()));
+ RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1]));
+
+ const int num_children = type.num_fields();
+ if (num_children != 1) {
+ return Status::Invalid("Wrong number of children: ", num_children);
+ }
+
+ return LoadChildren(type.fields());
+ }
+
+ Status LoadChildren(const std::vector<std::shared_ptr<Field>>& child_fields) {
+ ArrayData* parent = out_;
+
+ parent->child_data.resize(child_fields.size());
+ for (int i = 0; i < static_cast<int>(child_fields.size()); ++i) {
+ parent->child_data[i] = std::make_shared<ArrayData>();
+ --max_recursion_depth_;
+ RETURN_NOT_OK(Load(child_fields[i].get(), parent->child_data[i].get()));
+ ++max_recursion_depth_;
+ }
+ out_ = parent;
+ return Status::OK();
+ }
+
+ Status Visit(const NullType& type) {
+ out_->buffers.resize(1);
+
+ // ARROW-6379: NullType has no buffers in the IPC payload
+ return GetFieldMetadata(field_index_++, out_);
+ }
+
+ template <typename T>
+ enable_if_t<std::is_base_of<FixedWidthType, T>::value &&
+ !std::is_base_of<FixedSizeBinaryType, T>::value &&
+ !std::is_base_of<DictionaryType, T>::value,
+ Status>
+ Visit(const T& type) {
+ return LoadPrimitive<T>(type.id());
+ }
+
+ template <typename T>
+ enable_if_base_binary<T, Status> Visit(const T& type) {
+ return LoadBinary<T>(type.id());
+ }
+
+ Status Visit(const FixedSizeBinaryType& type) {
+ out_->buffers.resize(2);
+ RETURN_NOT_OK(LoadCommon(type.id()));
+ return GetBuffer(buffer_index_++, &out_->buffers[1]);
+ }
+
+ template <typename T>
+ enable_if_var_size_list<T, Status> Visit(const T& type) {
+ return LoadList(type);
+ }
+
+ Status Visit(const MapType& type) {
+ RETURN_NOT_OK(LoadList(type));
+ return MapArray::ValidateChildData(out_->child_data);
+ }
+
+ Status Visit(const FixedSizeListType& type) {
+ out_->buffers.resize(1);
+
+ RETURN_NOT_OK(LoadCommon(type.id()));
+
+ const int num_children = type.num_fields();
+ if (num_children != 1) {
+ return Status::Invalid("Wrong number of children: ", num_children);
+ }
+
+ return LoadChildren(type.fields());
+ }
+
+ Status Visit(const StructType& type) {
+ out_->buffers.resize(1);
+ RETURN_NOT_OK(LoadCommon(type.id()));
+ return LoadChildren(type.fields());
+ }
+
+ Status Visit(const UnionType& type) {
+ int n_buffers = type.mode() == UnionMode::SPARSE ? 2 : 3;
+ out_->buffers.resize(n_buffers);
+
+ RETURN_NOT_OK(LoadCommon(type.id()));
+
+ // With metadata V4, we can get a validity bitmap.
+ // Trying to fix up union data to do without the top-level validity bitmap
+ // is hairy:
+ // - type ids must be rewritten to all have valid values (even for former
+ // null slots)
+ // - sparse union children must have their validity bitmaps rewritten
+ // by ANDing the top-level validity bitmap
+ // - dense union children must be rewritten (at least one of them)
+ // to insert the required null slots that were formerly omitted
+ // So instead we bail out.
+ if (out_->null_count != 0 && out_->buffers[0] != nullptr) {
+ return Status::Invalid(
+ "Cannot read pre-1.0.0 Union array with top-level validity bitmap");
+ }
+ out_->buffers[0] = nullptr;
+ out_->null_count = 0;
+
+ if (out_->length > 0) {
+ RETURN_NOT_OK(GetBuffer(buffer_index_, &out_->buffers[1]));
+ if (type.mode() == UnionMode::DENSE) {
+ RETURN_NOT_OK(GetBuffer(buffer_index_ + 1, &out_->buffers[2]));
+ }
+ }
+ buffer_index_ += n_buffers - 1;
+ return LoadChildren(type.fields());
+ }
+
+ Status Visit(const DictionaryType& type) {
+ // out_->dictionary will be filled later in ResolveDictionaries()
+ return LoadType(*type.index_type());
+ }
+
+ Status Visit(const ExtensionType& type) { return LoadType(*type.storage_type()); }
+
+ private:
+ const flatbuf::RecordBatch* metadata_;
+ const MetadataVersion metadata_version_;
+ io::RandomAccessFile* file_;
+ int max_recursion_depth_;
+ int buffer_index_ = 0;
+ int field_index_ = 0;
+ bool skip_io_ = false;
+
+ const Field* field_;
+ ArrayData* out_;
+};
+
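+// Each compressed body buffer is framed as a little-endian int64 giving the
+// uncompressed length, followed by the codec-compressed payload:
+//
+//   [int64: uncompressed length][compressed bytes]
+//
+// hence a valid compressed buffer is always at least 8 bytes long.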
+Result<std::shared_ptr<Buffer>> DecompressBuffer(const std::shared_ptr<Buffer>& buf,
+ const IpcReadOptions& options,
+ util::Codec* codec) {
+ if (buf == nullptr || buf->size() == 0) {
+ return buf;
+ }
+
+ if (buf->size() < 8) {
+ return Status::Invalid(
+ "Likely corrupted message, compressed buffers "
+ "are larger than 8 bytes by construction");
+ }
+
+ const uint8_t* data = buf->data();
+ int64_t compressed_size = buf->size() - sizeof(int64_t);
+ int64_t uncompressed_size = BitUtil::FromLittleEndian(util::SafeLoadAs<int64_t>(data));
+
+ ARROW_ASSIGN_OR_RAISE(auto uncompressed,
+ AllocateBuffer(uncompressed_size, options.memory_pool));
+
+ ARROW_ASSIGN_OR_RAISE(
+ int64_t actual_decompressed,
+ codec->Decompress(compressed_size, data + sizeof(int64_t), uncompressed_size,
+ uncompressed->mutable_data()));
+ if (actual_decompressed != uncompressed_size) {
+ return Status::Invalid("Failed to fully decompress buffer, expected ",
+ uncompressed_size, " bytes but decompressed ",
+ actual_decompressed);
+ }
+
+ return std::move(uncompressed);
+}
+
+Status DecompressBuffers(Compression::type compression, const IpcReadOptions& options,
+ ArrayDataVector* fields) {
+ struct BufferAccumulator {
+ using BufferPtrVector = std::vector<std::shared_ptr<Buffer>*>;
+
+ void AppendFrom(const ArrayDataVector& fields) {
+ for (const auto& field : fields) {
+ for (auto& buffer : field->buffers) {
+ buffers_.push_back(&buffer);
+ }
+ AppendFrom(field->child_data);
+ }
+ }
+
+ BufferPtrVector Get(const ArrayDataVector& fields) && {
+ AppendFrom(fields);
+ return std::move(buffers_);
+ }
+
+ BufferPtrVector buffers_;
+ };
+
+ // Flatten all buffers
+ auto buffers = BufferAccumulator{}.Get(*fields);
+
+ std::unique_ptr<util::Codec> codec;
+ ARROW_ASSIGN_OR_RAISE(codec, util::Codec::Create(compression));
+
+ return ::arrow::internal::OptionalParallelFor(
+ options.use_threads, static_cast<int>(buffers.size()), [&](int i) {
+ ARROW_ASSIGN_OR_RAISE(*buffers[i],
+ DecompressBuffer(*buffers[i], options, codec.get()));
+ return Status::OK();
+ });
+}
+
+Result<std::shared_ptr<RecordBatch>> LoadRecordBatchSubset(
+ const flatbuf::RecordBatch* metadata, const std::shared_ptr<Schema>& schema,
+ const std::vector<bool>* inclusion_mask, const IpcReadContext& context,
+ io::RandomAccessFile* file) {
+ ArrayLoader loader(metadata, context.metadata_version, context.options, file);
+
+ ArrayDataVector columns(schema->num_fields());
+ ArrayDataVector filtered_columns;
+ FieldVector filtered_fields;
+ std::shared_ptr<Schema> filtered_schema;
+
+ for (int i = 0; i < schema->num_fields(); ++i) {
+ const Field& field = *schema->field(i);
+ if (!inclusion_mask || (*inclusion_mask)[i]) {
+ // Read field
+ auto column = std::make_shared<ArrayData>();
+ RETURN_NOT_OK(loader.Load(&field, column.get()));
+ if (metadata->length() != column->length) {
+ return Status::IOError("Array length did not match record batch length");
+ }
+ columns[i] = std::move(column);
+ if (inclusion_mask) {
+ filtered_columns.push_back(columns[i]);
+ filtered_fields.push_back(schema->field(i));
+ }
+ } else {
+ // Skip field. This logic must be executed to advance the state of the
+ // loader to the next field
+ RETURN_NOT_OK(loader.SkipField(&field));
+ }
+ }
+
+ // Dictionary resolution needs to happen on the unfiltered columns,
+ // because fields are mapped structurally (by path in the original schema).
+ RETURN_NOT_OK(ResolveDictionaries(columns, *context.dictionary_memo,
+ context.options.memory_pool));
+
+ if (inclusion_mask) {
+ filtered_schema = ::arrow::schema(std::move(filtered_fields), schema->metadata());
+ columns.clear();
+ } else {
+ filtered_schema = schema;
+ filtered_columns = std::move(columns);
+ }
+ if (context.compression != Compression::UNCOMPRESSED) {
+ RETURN_NOT_OK(
+ DecompressBuffers(context.compression, context.options, &filtered_columns));
+ }
+
+ // swap endian in a set of ArrayData if necessary (swap_endian == true)
+ if (context.swap_endian) {
+ for (int i = 0; i < static_cast<int>(filtered_columns.size()); ++i) {
+ ARROW_ASSIGN_OR_RAISE(filtered_columns[i],
+ arrow::internal::SwapEndianArrayData(filtered_columns[i]));
+ }
+ }
+ return RecordBatch::Make(std::move(filtered_schema), metadata->length(),
+ std::move(filtered_columns));
+}
+
+Result<std::shared_ptr<RecordBatch>> LoadRecordBatch(
+ const flatbuf::RecordBatch* metadata, const std::shared_ptr<Schema>& schema,
+ const std::vector<bool>& inclusion_mask, const IpcReadContext& context,
+ io::RandomAccessFile* file) {
+ if (inclusion_mask.size() > 0) {
+ return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, file);
+ } else {
+    return LoadRecordBatchSubset(metadata, schema, /*inclusion_mask=*/nullptr, context,
+                                 file);
+ }
+}
+
+// ----------------------------------------------------------------------
+// Array loading
+
+Status GetCompression(const flatbuf::RecordBatch* batch, Compression::type* out) {
+ *out = Compression::UNCOMPRESSED;
+ const flatbuf::BodyCompression* compression = batch->compression();
+ if (compression != nullptr) {
+ if (compression->method() != flatbuf::BodyCompressionMethod::BUFFER) {
+ // Forward compatibility
+ return Status::Invalid("This library only supports BUFFER compression method");
+ }
+
+ if (compression->codec() == flatbuf::CompressionType::LZ4_FRAME) {
+ *out = Compression::LZ4_FRAME;
+ } else if (compression->codec() == flatbuf::CompressionType::ZSTD) {
+ *out = Compression::ZSTD;
+ } else {
+ return Status::Invalid("Unsupported codec in RecordBatch::compression metadata");
+ }
+ return Status::OK();
+ }
+ return Status::OK();
+}
+
+Status GetCompressionExperimental(const flatbuf::Message* message,
+ Compression::type* out) {
+ *out = Compression::UNCOMPRESSED;
+ if (message->custom_metadata() != nullptr) {
+ // TODO: Ensure this deserialization only ever happens once
+ std::shared_ptr<KeyValueMetadata> metadata;
+ RETURN_NOT_OK(internal::GetKeyValueMetadata(message->custom_metadata(), &metadata));
+ int index = metadata->FindKey("ARROW:experimental_compression");
+ if (index != -1) {
+ // Arrow 0.17 stored string in upper case, internal utils now require lower case
+ auto name = arrow::internal::AsciiToLower(metadata->value(index));
+ ARROW_ASSIGN_OR_RAISE(*out, util::Codec::GetCompressionType(name));
+ }
+ return internal::CheckCompressionSupported(*out);
+ }
+ return Status::OK();
+}
+
+static Status ReadContiguousPayload(io::InputStream* file,
+ std::unique_ptr<Message>* message) {
+ ARROW_ASSIGN_OR_RAISE(*message, ReadMessage(file));
+ if (*message == nullptr) {
+ return Status::Invalid("Unable to read metadata at offset");
+ }
+ return Status::OK();
+}
+
+Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+ const std::shared_ptr<Schema>& schema, const DictionaryMemo* dictionary_memo,
+ const IpcReadOptions& options, io::InputStream* file) {
+ std::unique_ptr<Message> message;
+ RETURN_NOT_OK(ReadContiguousPayload(file, &message));
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ return ReadRecordBatch(*message->metadata(), schema, dictionary_memo, options,
+ reader.get());
+}
+
+Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+ const Message& message, const std::shared_ptr<Schema>& schema,
+ const DictionaryMemo* dictionary_memo, const IpcReadOptions& options) {
+ CHECK_MESSAGE_TYPE(MessageType::RECORD_BATCH, message.type());
+ CHECK_HAS_BODY(message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body()));
+ return ReadRecordBatch(*message.metadata(), schema, dictionary_memo, options,
+ reader.get());
+}
+
+Result<std::shared_ptr<RecordBatch>> ReadRecordBatchInternal(
+ const Buffer& metadata, const std::shared_ptr<Schema>& schema,
+ const std::vector<bool>& inclusion_mask, IpcReadContext& context,
+ io::RandomAccessFile* file) {
+ const flatbuf::Message* message = nullptr;
+ RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message));
+ auto batch = message->header_as_RecordBatch();
+ if (batch == nullptr) {
+ return Status::IOError(
+ "Header-type of flatbuffer-encoded Message is not RecordBatch.");
+ }
+
+ Compression::type compression;
+ RETURN_NOT_OK(GetCompression(batch, &compression));
+ if (context.compression == Compression::UNCOMPRESSED &&
+ message->version() == flatbuf::MetadataVersion::V4) {
+ // Possibly obtain codec information from experimental serialization format
+ // in 0.17.x
+ RETURN_NOT_OK(GetCompressionExperimental(message, &compression));
+ }
+ context.compression = compression;
+ context.metadata_version = internal::GetMetadataVersion(message->version());
+ return LoadRecordBatch(batch, schema, inclusion_mask, context, file);
+}
+
+// If we are selecting only certain fields, populate an inclusion mask for fast lookups.
+// Additionally, drop deselected fields from the reader's schema.
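+// For example, with a four-field schema, included_indices = {2, 0, 2} yields
+// the mask {true, false, true, false} and an out schema holding fields 0 and
+// 2 (indices are sorted and de-duplicated, preserving schema order).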
+Status GetInclusionMaskAndOutSchema(const std::shared_ptr<Schema>& full_schema,
+ const std::vector<int>& included_indices,
+ std::vector<bool>* inclusion_mask,
+ std::shared_ptr<Schema>* out_schema) {
+ inclusion_mask->clear();
+ if (included_indices.empty()) {
+ *out_schema = full_schema;
+ return Status::OK();
+ }
+
+ inclusion_mask->resize(full_schema->num_fields(), false);
+
+ auto included_indices_sorted = included_indices;
+ std::sort(included_indices_sorted.begin(), included_indices_sorted.end());
+
+ FieldVector included_fields;
+ for (int i : included_indices_sorted) {
+    // Out-of-bounds indices are rejected
+ if (i < 0 || i >= full_schema->num_fields()) {
+ return Status::Invalid("Out of bounds field index: ", i);
+ }
+
+ if (inclusion_mask->at(i)) continue;
+
+ inclusion_mask->at(i) = true;
+ included_fields.push_back(full_schema->field(i));
+ }
+
+ *out_schema = schema(std::move(included_fields), full_schema->endianness(),
+ full_schema->metadata());
+ return Status::OK();
+}
+
+Status UnpackSchemaMessage(const void* opaque_schema, const IpcReadOptions& options,
+ DictionaryMemo* dictionary_memo,
+ std::shared_ptr<Schema>* schema,
+ std::shared_ptr<Schema>* out_schema,
+ std::vector<bool>* field_inclusion_mask, bool* swap_endian) {
+ RETURN_NOT_OK(internal::GetSchema(opaque_schema, dictionary_memo, schema));
+
+ // If we are selecting only certain fields, populate the inclusion mask now
+ // for fast lookups
+ RETURN_NOT_OK(GetInclusionMaskAndOutSchema(*schema, options.included_fields,
+ field_inclusion_mask, out_schema));
+ *swap_endian = options.ensure_native_endian && !out_schema->get()->is_native_endian();
+ if (*swap_endian) {
+ // create a new schema with native endianness before swapping endian in ArrayData
+ *schema = schema->get()->WithEndianness(Endianness::Native);
+ *out_schema = out_schema->get()->WithEndianness(Endianness::Native);
+ }
+ return Status::OK();
+}
+
+Status UnpackSchemaMessage(const Message& message, const IpcReadOptions& options,
+ DictionaryMemo* dictionary_memo,
+ std::shared_ptr<Schema>* schema,
+ std::shared_ptr<Schema>* out_schema,
+ std::vector<bool>* field_inclusion_mask, bool* swap_endian) {
+ CHECK_MESSAGE_TYPE(MessageType::SCHEMA, message.type());
+ CHECK_HAS_NO_BODY(message);
+
+ return UnpackSchemaMessage(message.header(), options, dictionary_memo, schema,
+ out_schema, field_inclusion_mask, swap_endian);
+}
+
+Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+ const Buffer& metadata, const std::shared_ptr<Schema>& schema,
+ const DictionaryMemo* dictionary_memo, const IpcReadOptions& options,
+ io::RandomAccessFile* file) {
+ std::shared_ptr<Schema> out_schema;
+  // An empty inclusion mask means "include all fields"
+ std::vector<bool> inclusion_mask;
+ IpcReadContext context(const_cast<DictionaryMemo*>(dictionary_memo), options, false);
+ RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, context.options.included_fields,
+ &inclusion_mask, &out_schema));
+ return ReadRecordBatchInternal(metadata, schema, inclusion_mask, context, file);
+}
+
+Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context,
+ DictionaryKind* kind, io::RandomAccessFile* file) {
+ const flatbuf::Message* message = nullptr;
+ RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message));
+ const auto dictionary_batch = message->header_as_DictionaryBatch();
+ if (dictionary_batch == nullptr) {
+ return Status::IOError(
+ "Header-type of flatbuffer-encoded Message is not DictionaryBatch.");
+ }
+
+ // The dictionary is embedded in a record batch with a single column
+ const auto batch_meta = dictionary_batch->data();
+
+ CHECK_FLATBUFFERS_NOT_NULL(batch_meta, "DictionaryBatch.data");
+
+ Compression::type compression;
+ RETURN_NOT_OK(GetCompression(batch_meta, &compression));
+ if (compression == Compression::UNCOMPRESSED &&
+ message->version() == flatbuf::MetadataVersion::V4) {
+ // Possibly obtain codec information from experimental serialization format
+ // in 0.17.x
+ RETURN_NOT_OK(GetCompressionExperimental(message, &compression));
+ }
+
+ const int64_t id = dictionary_batch->id();
+
+ // Look up the dictionary value type, which must have been added to the
+ // DictionaryMemo already prior to invoking this function
+ ARROW_ASSIGN_OR_RAISE(auto value_type, context.dictionary_memo->GetDictionaryType(id));
+
+ // Load the dictionary data from the dictionary batch
+ ArrayLoader loader(batch_meta, internal::GetMetadataVersion(message->version()),
+ context.options, file);
+ auto dict_data = std::make_shared<ArrayData>();
+ const Field dummy_field("", value_type);
+ RETURN_NOT_OK(loader.Load(&dummy_field, dict_data.get()));
+
+ if (compression != Compression::UNCOMPRESSED) {
+ ArrayDataVector dict_fields{dict_data};
+ RETURN_NOT_OK(DecompressBuffers(compression, context.options, &dict_fields));
+ }
+
+ // swap endian in dict_data if necessary (swap_endian == true)
+ if (context.swap_endian) {
+ ARROW_ASSIGN_OR_RAISE(dict_data, ::arrow::internal::SwapEndianArrayData(dict_data));
+ }
+
+ if (dictionary_batch->isDelta()) {
+ if (kind != nullptr) {
+ *kind = DictionaryKind::Delta;
+ }
+ return context.dictionary_memo->AddDictionaryDelta(id, dict_data);
+ }
+ ARROW_ASSIGN_OR_RAISE(bool inserted,
+ context.dictionary_memo->AddOrReplaceDictionary(id, dict_data));
+ if (kind != nullptr) {
+ *kind = inserted ? DictionaryKind::New : DictionaryKind::Replacement;
+ }
+ return Status::OK();
+}
+
+Status ReadDictionary(const Message& message, const IpcReadContext& context,
+ DictionaryKind* kind) {
+ // Only invoke this method if we already know we have a dictionary message
+ DCHECK_EQ(message.type(), MessageType::DICTIONARY_BATCH);
+ CHECK_HAS_BODY(message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body()));
+ return ReadDictionary(*message.metadata(), context, kind, reader.get());
+}
+
+// ----------------------------------------------------------------------
+// RecordBatchStreamReader implementation
+
+class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
+ public:
+ Status Open(std::unique_ptr<MessageReader> message_reader,
+ const IpcReadOptions& options) {
+ message_reader_ = std::move(message_reader);
+ options_ = options;
+
+ // Read schema
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<Message> message, ReadNextMessage());
+ if (!message) {
+ return Status::Invalid("Tried reading schema message, was null or length 0");
+ }
+
+ RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_,
+ &out_schema_, &field_inclusion_mask_,
+ &swap_endian_));
+ return Status::OK();
+ }
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* batch) override {
+ if (!have_read_initial_dictionaries_) {
+ RETURN_NOT_OK(ReadInitialDictionaries());
+ }
+
+ if (empty_stream_) {
+ // ARROW-6006: Degenerate case where stream contains no data, we do not
+ // bother trying to read a RecordBatch message from the stream
+ *batch = nullptr;
+ return Status::OK();
+ }
+
+ // Continue to read other dictionaries, if any
+ std::unique_ptr<Message> message;
+ ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage());
+
+ while (message != nullptr && message->type() == MessageType::DICTIONARY_BATCH) {
+ RETURN_NOT_OK(ReadDictionary(*message));
+ ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage());
+ }
+
+ if (message == nullptr) {
+ // End of stream
+ *batch = nullptr;
+ return Status::OK();
+ }
+
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ return ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_,
+ context, reader.get())
+ .Value(batch);
+ }
+
+ std::shared_ptr<Schema> schema() const override { return out_schema_; }
+
+ ReadStats stats() const override { return stats_; }
+
+ private:
+ Result<std::unique_ptr<Message>> ReadNextMessage() {
+ ARROW_ASSIGN_OR_RAISE(auto message, message_reader_->ReadNextMessage());
+ if (message) {
+ ++stats_.num_messages;
+ switch (message->type()) {
+ case MessageType::RECORD_BATCH:
+ ++stats_.num_record_batches;
+ break;
+ case MessageType::DICTIONARY_BATCH:
+ ++stats_.num_dictionary_batches;
+ break;
+ default:
+ break;
+ }
+ }
+ return std::move(message);
+ }
+
+ // Read dictionary from dictionary batch
+ Status ReadDictionary(const Message& message) {
+ DictionaryKind kind;
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
+ switch (kind) {
+ case DictionaryKind::New:
+ break;
+ case DictionaryKind::Delta:
+ ++stats_.num_dictionary_deltas;
+ break;
+ case DictionaryKind::Replacement:
+ ++stats_.num_replaced_dictionaries;
+ break;
+ }
+ return Status::OK();
+ }
+
+ Status ReadInitialDictionaries() {
+ // We must receive all dictionaries before reconstructing the
+ // first record batch. Subsequent dictionary deltas modify the memo
+ std::unique_ptr<Message> message;
+
+ // TODO(wesm): In future, we may want to reconcile the ids in the stream with
+ // those found in the schema
+    const auto num_dicts = dictionary_memo_.fields().num_fields();
+ for (int i = 0; i < num_dicts; ++i) {
+ ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage());
+ if (!message) {
+ if (i == 0) {
+          // ARROW-6006: If we fail to find any dictionaries in the stream, then
+          // it may be that the stream has a schema but no actual data. In that
+          // case we communicate that we were unable to find the dictionaries
+          // (but there was no failure otherwise), so the caller can decide what
+          // to do
+ empty_stream_ = true;
+ break;
+ } else {
+ // ARROW-6126, the stream terminated before receiving the expected
+ // number of dictionaries
+ return Status::Invalid("IPC stream ended without reading the expected number (",
+ num_dicts, ") of dictionaries");
+ }
+ }
+
+ if (message->type() != MessageType::DICTIONARY_BATCH) {
+ return Status::Invalid("IPC stream did not have the expected number (", num_dicts,
+ ") of dictionaries at the start of the stream");
+ }
+ RETURN_NOT_OK(ReadDictionary(*message));
+ }
+
+ have_read_initial_dictionaries_ = true;
+ return Status::OK();
+ }
+
+ std::unique_ptr<MessageReader> message_reader_;
+ IpcReadOptions options_;
+ std::vector<bool> field_inclusion_mask_;
+
+ bool have_read_initial_dictionaries_ = false;
+
+ // Flag to set in case where we fail to observe all dictionaries in a stream,
+ // and so the reader should not attempt to parse any messages
+ bool empty_stream_ = false;
+
+ ReadStats stats_;
+
+ DictionaryMemo dictionary_memo_;
+ std::shared_ptr<Schema> schema_, out_schema_;
+
+ bool swap_endian_;
+};
+
+// ----------------------------------------------------------------------
+// Stream reader constructors
+
+Result<std::shared_ptr<RecordBatchStreamReader>> RecordBatchStreamReader::Open(
+ std::unique_ptr<MessageReader> message_reader, const IpcReadOptions& options) {
+ // Private ctor
+ auto result = std::make_shared<RecordBatchStreamReaderImpl>();
+ RETURN_NOT_OK(result->Open(std::move(message_reader), options));
+ return result;
+}
+
+Result<std::shared_ptr<RecordBatchStreamReader>> RecordBatchStreamReader::Open(
+ io::InputStream* stream, const IpcReadOptions& options) {
+ return Open(MessageReader::Open(stream), options);
+}
+
+Result<std::shared_ptr<RecordBatchStreamReader>> RecordBatchStreamReader::Open(
+ const std::shared_ptr<io::InputStream>& stream, const IpcReadOptions& options) {
+ return Open(MessageReader::Open(stream), options);
+}
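+
+// A minimal read-loop sketch (assuming `stream` is an open io::InputStream*):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchStreamReader::Open(stream));
+//   std::shared_ptr<RecordBatch> batch;
+//   do {
+//     RETURN_NOT_OK(reader->ReadNext(&batch));
+//   } while (batch != nullptr);  // nullptr signals end of stream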
+
+// ----------------------------------------------------------------------
+// Reader implementation
+
+// Common functions used in both the random-access file reader and the
+// asynchronous generator
+static inline FileBlock FileBlockFromFlatbuffer(const flatbuf::Block* block) {
+ return FileBlock{block->offset(), block->metaDataLength(), block->bodyLength()};
+}
+
+static Result<std::unique_ptr<Message>> ReadMessageFromBlock(const FileBlock& block,
+ io::RandomAccessFile* file) {
+ if (!BitUtil::IsMultipleOf8(block.offset) ||
+ !BitUtil::IsMultipleOf8(block.metadata_length) ||
+ !BitUtil::IsMultipleOf8(block.body_length)) {
+ return Status::Invalid("Unaligned block in IPC file");
+ }
+
+ // TODO(wesm): this breaks integration tests, see ARROW-3256
+ // DCHECK_EQ((*out)->body_length(), block.body_length);
+
+ ARROW_ASSIGN_OR_RAISE(auto message,
+ ReadMessage(block.offset, block.metadata_length, file));
+ return std::move(message);
+}
+
+static Future<std::shared_ptr<Message>> ReadMessageFromBlockAsync(
+ const FileBlock& block, io::RandomAccessFile* file, const io::IOContext& io_context) {
+ if (!BitUtil::IsMultipleOf8(block.offset) ||
+ !BitUtil::IsMultipleOf8(block.metadata_length) ||
+ !BitUtil::IsMultipleOf8(block.body_length)) {
+ return Status::Invalid("Unaligned block in IPC file");
+ }
+
+ // TODO(wesm): this breaks integration tests, see ARROW-3256
+ // DCHECK_EQ((*out)->body_length(), block.body_length);
+
+ return ReadMessageAsync(block.offset, block.metadata_length, block.body_length, file,
+ io_context);
+}
+
+static Status ReadOneDictionary(Message* message, const IpcReadContext& context) {
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ DictionaryKind kind;
+ RETURN_NOT_OK(ReadDictionary(*message->metadata(), context, &kind, reader.get()));
+ if (kind != DictionaryKind::New) {
+ return Status::Invalid(
+ "Unsupported dictionary replacement or "
+ "dictionary delta in IPC file");
+ }
+ return Status::OK();
+}
+
+class RecordBatchFileReaderImpl;
+
+/// A generator of record batches.
+///
+/// All batches are yielded in order.
+class ARROW_EXPORT IpcFileRecordBatchGenerator {
+ public:
+ using Item = std::shared_ptr<RecordBatch>;
+
+ explicit IpcFileRecordBatchGenerator(
+ std::shared_ptr<RecordBatchFileReaderImpl> state,
+ std::shared_ptr<io::internal::ReadRangeCache> cached_source,
+ const io::IOContext& io_context, arrow::internal::Executor* executor)
+ : state_(std::move(state)),
+ cached_source_(std::move(cached_source)),
+ io_context_(io_context),
+ executor_(executor),
+ index_(0) {}
+
+ Future<Item> operator()();
+ Future<std::shared_ptr<Message>> ReadBlock(const FileBlock& block);
+
+ static Status ReadDictionaries(
+ RecordBatchFileReaderImpl* state,
+ std::vector<std::shared_ptr<Message>> dictionary_messages);
+ static Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+ RecordBatchFileReaderImpl* state, Message* message);
+
+ private:
+ std::shared_ptr<RecordBatchFileReaderImpl> state_;
+ std::shared_ptr<io::internal::ReadRangeCache> cached_source_;
+ io::IOContext io_context_;
+ arrow::internal::Executor* executor_;
+ int index_;
+ // Odd Future type, but this lets us use All() easily
+ Future<> read_dictionaries_;
+};
+
+class RecordBatchFileReaderImpl : public RecordBatchFileReader {
+ public:
+ RecordBatchFileReaderImpl() : file_(NULLPTR), footer_offset_(0), footer_(NULLPTR) {}
+
+ int num_record_batches() const override {
+ return static_cast<int>(internal::FlatBuffersVectorSize(footer_->recordBatches()));
+ }
+
+ MetadataVersion version() const override {
+ return internal::GetMetadataVersion(footer_->version());
+ }
+
+ Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(int i) override {
+ DCHECK_GE(i, 0);
+ DCHECK_LT(i, num_record_batches());
+
+ if (!read_dictionaries_) {
+ RETURN_NOT_OK(ReadDictionaries());
+ read_dictionaries_ = true;
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto message, ReadMessageFromBlock(GetRecordBatchBlock(i)));
+
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ ARROW_ASSIGN_OR_RAISE(auto batch, ReadRecordBatchInternal(
+ *message->metadata(), schema_,
+ field_inclusion_mask_, context, reader.get()));
+ ++stats_.num_record_batches;
+ return batch;
+ }
+
+ Result<int64_t> CountRows() override {
+ int64_t total = 0;
+ for (int i = 0; i < num_record_batches(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto outer_message,
+ ReadMessageFromBlock(GetRecordBatchBlock(i)));
+ auto metadata = outer_message->metadata();
+ const flatbuf::Message* message = nullptr;
+ RETURN_NOT_OK(
+ internal::VerifyMessage(metadata->data(), metadata->size(), &message));
+ auto batch = message->header_as_RecordBatch();
+ if (batch == nullptr) {
+ return Status::IOError(
+ "Header-type of flatbuffer-encoded Message is not RecordBatch.");
+ }
+ total += batch->length();
+ }
+ return total;
+ }
+
+ Status Open(const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+ const IpcReadOptions& options) {
+ owned_file_ = file;
+ return Open(file.get(), footer_offset, options);
+ }
+
+ Status Open(io::RandomAccessFile* file, int64_t footer_offset,
+ const IpcReadOptions& options) {
+ file_ = file;
+ options_ = options;
+ footer_offset_ = footer_offset;
+ RETURN_NOT_OK(ReadFooter());
+
+ // Get the schema and record any observed dictionaries
+ RETURN_NOT_OK(UnpackSchemaMessage(footer_->schema(), options, &dictionary_memo_,
+ &schema_, &out_schema_, &field_inclusion_mask_,
+ &swap_endian_));
+ ++stats_.num_messages;
+ return Status::OK();
+ }
+
+ Future<> OpenAsync(const std::shared_ptr<io::RandomAccessFile>& file,
+ int64_t footer_offset, const IpcReadOptions& options) {
+ owned_file_ = file;
+ return OpenAsync(file.get(), footer_offset, options);
+ }
+
+ Future<> OpenAsync(io::RandomAccessFile* file, int64_t footer_offset,
+ const IpcReadOptions& options) {
+ file_ = file;
+ options_ = options;
+ footer_offset_ = footer_offset;
+ auto cpu_executor = ::arrow::internal::GetCpuThreadPool();
+ auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
+ return ReadFooterAsync(cpu_executor).Then([self, options]() -> Status {
+ // Get the schema and record any observed dictionaries
+ RETURN_NOT_OK(UnpackSchemaMessage(
+ self->footer_->schema(), options, &self->dictionary_memo_, &self->schema_,
+ &self->out_schema_, &self->field_inclusion_mask_, &self->swap_endian_));
+ ++self->stats_.num_messages;
+ return Status::OK();
+ });
+ }
+
+ std::shared_ptr<Schema> schema() const override { return out_schema_; }
+
+ std::shared_ptr<const KeyValueMetadata> metadata() const override { return metadata_; }
+
+ ReadStats stats() const override { return stats_; }
+
+ Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
+ const bool coalesce, const io::IOContext& io_context,
+ const io::CacheOptions cache_options,
+ arrow::internal::Executor* executor) override {
+ auto state = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
+ std::shared_ptr<io::internal::ReadRangeCache> cached_source;
+ if (coalesce) {
+ if (!owned_file_) return Status::Invalid("Cannot coalesce without an owned file");
+ cached_source = std::make_shared<io::internal::ReadRangeCache>(
+ owned_file_, io_context, cache_options);
+ auto num_dictionaries = this->num_dictionaries();
+ auto num_record_batches = this->num_record_batches();
+ std::vector<io::ReadRange> ranges(num_dictionaries + num_record_batches);
+ for (int i = 0; i < num_dictionaries; i++) {
+ auto block = FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i));
+ ranges[i].offset = block.offset;
+ ranges[i].length = block.metadata_length + block.body_length;
+ }
+ for (int i = 0; i < num_record_batches; i++) {
+ auto block = FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i));
+ ranges[num_dictionaries + i].offset = block.offset;
+ ranges[num_dictionaries + i].length = block.metadata_length + block.body_length;
+ }
+ RETURN_NOT_OK(cached_source->Cache(std::move(ranges)));
+ }
+ return IpcFileRecordBatchGenerator(std::move(state), std::move(cached_source),
+ io_context, executor);
+ }
+
+ private:
+ friend AsyncGenerator<std::shared_ptr<Message>> MakeMessageGenerator(
+ std::shared_ptr<RecordBatchFileReaderImpl>, const io::IOContext&);
+ friend class IpcFileRecordBatchGenerator;
+
+ FileBlock GetRecordBatchBlock(int i) const {
+ return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i));
+ }
+
+ FileBlock GetDictionaryBlock(int i) const {
+ return FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i));
+ }
+
+ Result<std::unique_ptr<Message>> ReadMessageFromBlock(const FileBlock& block) {
+ ARROW_ASSIGN_OR_RAISE(auto message, arrow::ipc::ReadMessageFromBlock(block, file_));
+ ++stats_.num_messages;
+ return std::move(message);
+ }
+
+ Status ReadDictionaries() {
+ // Read all the dictionaries
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ for (int i = 0; i < num_dictionaries(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto message, ReadMessageFromBlock(GetDictionaryBlock(i)));
+ RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
+ ++stats_.num_dictionary_batches;
+ }
+ return Status::OK();
+ }
+
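+  // Per the Arrow file format, the file ends with:
+  //   <footer flatbuffer> <int32: footer length> <6-byte magic "ARROW1">
+  // ReadFooterAsync below first reads and validates the trailing length and
+  // magic, then reads the footer flatbuffer itself.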
+ Status ReadFooter() {
+ auto fut = ReadFooterAsync(/*executor=*/nullptr);
+ return fut.status();
+ }
+
+ Future<> ReadFooterAsync(arrow::internal::Executor* executor) {
+ const int32_t magic_size = static_cast<int>(strlen(kArrowMagicBytes));
+
+ if (footer_offset_ <= magic_size * 2 + 4) {
+ return Status::Invalid("File is too small: ", footer_offset_);
+ }
+
+ int file_end_size = static_cast<int>(magic_size + sizeof(int32_t));
+ auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
+ auto read_magic = file_->ReadAsync(footer_offset_ - file_end_size, file_end_size);
+ if (executor) read_magic = executor->Transfer(std::move(read_magic));
+ return read_magic
+ .Then([=](const std::shared_ptr<Buffer>& buffer)
+ -> Future<std::shared_ptr<Buffer>> {
+ const int64_t expected_footer_size = magic_size + sizeof(int32_t);
+ if (buffer->size() < expected_footer_size) {
+ return Status::Invalid("Unable to read ", expected_footer_size,
+ "from end of file");
+ }
+
+ if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) {
+ return Status::Invalid("Not an Arrow file");
+ }
+
+ int32_t footer_length = BitUtil::FromLittleEndian(
+ *reinterpret_cast<const int32_t*>(buffer->data()));
+
+ if (footer_length <= 0 ||
+ footer_length > self->footer_offset_ - magic_size * 2 - 4) {
+ return Status::Invalid("File is smaller than indicated metadata size");
+ }
+
+ // Now read the footer
+ auto read_footer = self->file_->ReadAsync(
+ self->footer_offset_ - footer_length - file_end_size, footer_length);
+ if (executor) read_footer = executor->Transfer(std::move(read_footer));
+ return read_footer;
+ })
+ .Then([=](const std::shared_ptr<Buffer>& buffer) -> Status {
+ self->footer_buffer_ = buffer;
+ const auto data = self->footer_buffer_->data();
+ const auto size = self->footer_buffer_->size();
+ if (!internal::VerifyFlatbuffers<flatbuf::Footer>(data, size)) {
+ return Status::IOError("Verification of flatbuffer-encoded Footer failed.");
+ }
+ self->footer_ = flatbuf::GetFooter(data);
+
+ auto fb_metadata = self->footer_->custom_metadata();
+ if (fb_metadata != nullptr) {
+ std::shared_ptr<KeyValueMetadata> md;
+ RETURN_NOT_OK(internal::GetKeyValueMetadata(fb_metadata, &md));
+ self->metadata_ = std::move(md); // const-ify
+ }
+ return Status::OK();
+ });
+ }
+
+ int num_dictionaries() const {
+ return static_cast<int>(internal::FlatBuffersVectorSize(footer_->dictionaries()));
+ }
+
+ io::RandomAccessFile* file_;
+ IpcReadOptions options_;
+ std::vector<bool> field_inclusion_mask_;
+
+ std::shared_ptr<io::RandomAccessFile> owned_file_;
+
+ // The location where the Arrow file layout ends. May be the end of the file
+ // or some other location if embedded in a larger file.
+ int64_t footer_offset_;
+
+ // Footer metadata
+ std::shared_ptr<Buffer> footer_buffer_;
+ const flatbuf::Footer* footer_;
+ std::shared_ptr<const KeyValueMetadata> metadata_;
+
+ bool read_dictionaries_ = false;
+ DictionaryMemo dictionary_memo_;
+
+ // Reconstructed schema, including any read dictionaries
+ std::shared_ptr<Schema> schema_;
+ // Schema with deselected fields dropped
+ std::shared_ptr<Schema> out_schema_;
+
+ ReadStats stats_;
+
+ bool swap_endian_;
+};
+
+Result<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::Open(
+ io::RandomAccessFile* file, const IpcReadOptions& options) {
+ ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
+ return Open(file, footer_offset, options);
+}
+
+Result<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::Open(
+ io::RandomAccessFile* file, int64_t footer_offset, const IpcReadOptions& options) {
+ auto result = std::make_shared<RecordBatchFileReaderImpl>();
+ RETURN_NOT_OK(result->Open(file, footer_offset, options));
+ return result;
+}
+
+Result<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::Open(
+ const std::shared_ptr<io::RandomAccessFile>& file, const IpcReadOptions& options) {
+ ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
+ return Open(file, footer_offset, options);
+}
+
+Result<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::Open(
+ const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+ const IpcReadOptions& options) {
+ auto result = std::make_shared<RecordBatchFileReaderImpl>();
+ RETURN_NOT_OK(result->Open(file, footer_offset, options));
+ return result;
+}
+
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file, const IpcReadOptions& options) {
+ ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
+ return OpenAsync(std::move(file), footer_offset, options);
+}
+
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ io::RandomAccessFile* file, const IpcReadOptions& options) {
+ ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
+ return OpenAsync(file, footer_offset, options);
+}
+
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+ const IpcReadOptions& options) {
+ auto result = std::make_shared<RecordBatchFileReaderImpl>();
+ return result->OpenAsync(file, footer_offset, options)
+ .Then([=]() -> Result<std::shared_ptr<RecordBatchFileReader>> { return result; });
+}
+
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ io::RandomAccessFile* file, int64_t footer_offset, const IpcReadOptions& options) {
+ auto result = std::make_shared<RecordBatchFileReaderImpl>();
+ return result->OpenAsync(file, footer_offset, options)
+ .Then([=]() -> Result<std::shared_ptr<RecordBatchFileReader>> { return result; });
+}
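+
+// A minimal random-access usage sketch (assuming `file` is a
+// std::shared_ptr<io::RandomAccessFile>):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchFileReader::Open(file));
+//   for (int i = 0; i < reader->num_record_batches(); ++i) {
+//     ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(i));
+//   }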
+
+Future<IpcFileRecordBatchGenerator::Item> IpcFileRecordBatchGenerator::operator()() {
+ auto state = state_;
+ if (!read_dictionaries_.is_valid()) {
+ std::vector<Future<std::shared_ptr<Message>>> messages(state->num_dictionaries());
+ for (int i = 0; i < state->num_dictionaries(); i++) {
+ auto block = FileBlockFromFlatbuffer(state->footer_->dictionaries()->Get(i));
+ messages[i] = ReadBlock(block);
+ }
+ auto read_messages = All(std::move(messages));
+ if (executor_) read_messages = executor_->Transfer(read_messages);
+ read_dictionaries_ = read_messages.Then(
+ [=](const std::vector<Result<std::shared_ptr<Message>>>& maybe_messages)
+ -> Status {
+ ARROW_ASSIGN_OR_RAISE(auto messages,
+ arrow::internal::UnwrapOrRaise(maybe_messages));
+ return ReadDictionaries(state.get(), std::move(messages));
+ });
+ }
+ if (index_ >= state_->num_record_batches()) {
+ return Future<Item>::MakeFinished(IterationTraits<Item>::End());
+ }
+ auto block = FileBlockFromFlatbuffer(state->footer_->recordBatches()->Get(index_++));
+ auto read_message = ReadBlock(block);
+ auto read_messages = read_dictionaries_.Then([read_message]() { return read_message; });
+ // Force transfer. This may be wasteful in some cases, but ensures we get off the
+ // I/O threads as soon as possible, and ensures we don't decode record batches
+ // synchronously in the case that the message read has already finished.
+ if (executor_) {
+ auto executor = executor_;
+ return read_messages.Then(
+ [=](const std::shared_ptr<Message>& message) -> Future<Item> {
+ return DeferNotOk(executor->Submit(
+ [=]() { return ReadRecordBatch(state.get(), message.get()); }));
+ });
+ }
+ return read_messages.Then([=](const std::shared_ptr<Message>& message) -> Result<Item> {
+ return ReadRecordBatch(state.get(), message.get());
+ });
+}
+
+Future<std::shared_ptr<Message>> IpcFileRecordBatchGenerator::ReadBlock(
+ const FileBlock& block) {
+ if (cached_source_) {
+ auto cached_source = cached_source_;
+ io::ReadRange range{block.offset, block.metadata_length + block.body_length};
+ auto pool = state_->options_.memory_pool;
+ return cached_source->WaitFor({range}).Then(
+ [cached_source, pool, range]() -> Result<std::shared_ptr<Message>> {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, cached_source->Read(range));
+ io::BufferReader stream(std::move(buffer));
+ return ReadMessage(&stream, pool);
+ });
+ } else {
+ return ReadMessageFromBlockAsync(block, state_->file_, io_context_);
+ }
+}
+
+Status IpcFileRecordBatchGenerator::ReadDictionaries(
+ RecordBatchFileReaderImpl* state,
+ std::vector<std::shared_ptr<Message>> dictionary_messages) {
+ IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
+ for (const auto& message : dictionary_messages) {
+ RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
+ }
+ return Status::OK();
+}
+
+Result<std::shared_ptr<RecordBatch>> IpcFileRecordBatchGenerator::ReadRecordBatch(
+ RecordBatchFileReaderImpl* state, Message* message) {
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
+ return ReadRecordBatchInternal(*message->metadata(), state->schema_,
+ state->field_inclusion_mask_, context, reader.get());
+}
+
+Status Listener::OnEOS() { return Status::OK(); }
+
+Status Listener::OnSchemaDecoded(std::shared_ptr<Schema> schema) { return Status::OK(); }
+
+Status Listener::OnRecordBatchDecoded(std::shared_ptr<RecordBatch> record_batch) {
+ return Status::NotImplemented("OnRecordBatchDecoded() callback isn't implemented");
+}
+
+class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener {
+ private:
+ enum State {
+ SCHEMA,
+ INITIAL_DICTIONARIES,
+ RECORD_BATCHES,
+ EOS,
+ };
+
+ public:
+ explicit StreamDecoderImpl(std::shared_ptr<Listener> listener, IpcReadOptions options)
+ : listener_(std::move(listener)),
+ options_(std::move(options)),
+ state_(State::SCHEMA),
+ message_decoder_(std::shared_ptr<StreamDecoderImpl>(this, [](void*) {}),
+ options_.memory_pool),
+ n_required_dictionaries_(0) {}
+
+ Status OnMessageDecoded(std::unique_ptr<Message> message) override {
+ ++stats_.num_messages;
+ switch (state_) {
+ case State::SCHEMA:
+ ARROW_RETURN_NOT_OK(OnSchemaMessageDecoded(std::move(message)));
+ break;
+ case State::INITIAL_DICTIONARIES:
+ ARROW_RETURN_NOT_OK(OnInitialDictionaryMessageDecoded(std::move(message)));
+ break;
+ case State::RECORD_BATCHES:
+ ARROW_RETURN_NOT_OK(OnRecordBatchMessageDecoded(std::move(message)));
+ break;
+ case State::EOS:
+ break;
+ }
+ return Status::OK();
+ }
+
+ Status OnEOS() override {
+ state_ = State::EOS;
+ return listener_->OnEOS();
+ }
+
+ Status Consume(const uint8_t* data, int64_t size) {
+ return message_decoder_.Consume(data, size);
+ }
+
+ Status Consume(std::shared_ptr<Buffer> buffer) {
+ return message_decoder_.Consume(std::move(buffer));
+ }
+
+ std::shared_ptr<Schema> schema() const { return out_schema_; }
+
+ int64_t next_required_size() const { return message_decoder_.next_required_size(); }
+
+ ReadStats stats() const { return stats_; }
+
+ private:
+ Status OnSchemaMessageDecoded(std::unique_ptr<Message> message) {
+ RETURN_NOT_OK(UnpackSchemaMessage(*message, options_, &dictionary_memo_, &schema_,
+ &out_schema_, &field_inclusion_mask_,
+ &swap_endian_));
+
+ n_required_dictionaries_ = dictionary_memo_.fields().num_fields();
+ if (n_required_dictionaries_ == 0) {
+ state_ = State::RECORD_BATCHES;
+ RETURN_NOT_OK(listener_->OnSchemaDecoded(schema_));
+ } else {
+ state_ = State::INITIAL_DICTIONARIES;
+ }
+ return Status::OK();
+ }
+
+ Status OnInitialDictionaryMessageDecoded(std::unique_ptr<Message> message) {
+ if (message->type() != MessageType::DICTIONARY_BATCH) {
+ return Status::Invalid("IPC stream did not have the expected number (",
+ dictionary_memo_.fields().num_fields(),
+ ") of dictionaries at the start of the stream");
+ }
+ RETURN_NOT_OK(ReadDictionary(*message));
+ n_required_dictionaries_--;
+ if (n_required_dictionaries_ == 0) {
+ state_ = State::RECORD_BATCHES;
+ ARROW_RETURN_NOT_OK(listener_->OnSchemaDecoded(schema_));
+ }
+ return Status::OK();
+ }
+
+ Status OnRecordBatchMessageDecoded(std::unique_ptr<Message> message) {
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ if (message->type() == MessageType::DICTIONARY_BATCH) {
+ return ReadDictionary(*message);
+ } else {
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ ARROW_ASSIGN_OR_RAISE(
+ auto batch,
+ ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_,
+ context, reader.get()));
+ ++stats_.num_record_batches;
+ return listener_->OnRecordBatchDecoded(std::move(batch));
+ }
+ }
+
+ // Read dictionary from dictionary batch
+ Status ReadDictionary(const Message& message) {
+ DictionaryKind kind;
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
+ ++stats_.num_dictionary_batches;
+ switch (kind) {
+ case DictionaryKind::New:
+ break;
+ case DictionaryKind::Delta:
+ ++stats_.num_dictionary_deltas;
+ break;
+ case DictionaryKind::Replacement:
+ ++stats_.num_replaced_dictionaries;
+ break;
+ }
+ return Status::OK();
+ }
+
+ std::shared_ptr<Listener> listener_;
+ const IpcReadOptions options_;
+ State state_;
+ MessageDecoder message_decoder_;
+ std::vector<bool> field_inclusion_mask_;
+ int n_required_dictionaries_;
+ DictionaryMemo dictionary_memo_;
+ std::shared_ptr<Schema> schema_, out_schema_;
+ ReadStats stats_;
+ bool swap_endian_;
+};
+
+StreamDecoder::StreamDecoder(std::shared_ptr<Listener> listener, IpcReadOptions options) {
+ impl_.reset(new StreamDecoderImpl(std::move(listener), options));
+}
+
+StreamDecoder::~StreamDecoder() {}
+
+Status StreamDecoder::Consume(const uint8_t* data, int64_t size) {
+ return impl_->Consume(data, size);
+}
+Status StreamDecoder::Consume(std::shared_ptr<Buffer> buffer) {
+ return impl_->Consume(std::move(buffer));
+}
+
+std::shared_ptr<Schema> StreamDecoder::schema() const { return impl_->schema(); }
+
+int64_t StreamDecoder::next_required_size() const { return impl_->next_required_size(); }
+
+ReadStats StreamDecoder::stats() const { return impl_->stats(); }
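+
+// A minimal push-style decoding sketch; `source` and its NextChunk() are
+// hypothetical, and any Listener overriding OnRecordBatchDecoded() works in
+// place of CollectListener:
+//
+//   auto listener = std::make_shared<CollectListener>();
+//   StreamDecoder decoder(listener);
+//   while (auto chunk = source.NextChunk()) {  // chunk: std::shared_ptr<Buffer>
+//     RETURN_NOT_OK(decoder.Consume(std::move(chunk)));
+//   }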
+
+Result<std::shared_ptr<Schema>> ReadSchema(io::InputStream* stream,
+ DictionaryMemo* dictionary_memo) {
+ std::unique_ptr<MessageReader> reader = MessageReader::Open(stream);
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<Message> message, reader->ReadNextMessage());
+ if (!message) {
+ return Status::Invalid("Tried reading schema message, was null or length 0");
+ }
+ CHECK_MESSAGE_TYPE(MessageType::SCHEMA, message->type());
+ return ReadSchema(*message, dictionary_memo);
+}
+
+Result<std::shared_ptr<Schema>> ReadSchema(const Message& message,
+ DictionaryMemo* dictionary_memo) {
+ std::shared_ptr<Schema> result;
+ RETURN_NOT_OK(internal::GetSchema(message.header(), dictionary_memo, &result));
+ return result;
+}
+
+Result<std::shared_ptr<Tensor>> ReadTensor(io::InputStream* file) {
+ std::unique_ptr<Message> message;
+ RETURN_NOT_OK(ReadContiguousPayload(file, &message));
+ return ReadTensor(*message);
+}
+
+Result<std::shared_ptr<Tensor>> ReadTensor(const Message& message) {
+ std::shared_ptr<DataType> type;
+ std::vector<int64_t> shape;
+ std::vector<int64_t> strides;
+ std::vector<std::string> dim_names;
+ CHECK_HAS_BODY(message);
+ RETURN_NOT_OK(internal::GetTensorMetadata(*message.metadata(), &type, &shape, &strides,
+ &dim_names));
+ return Tensor::Make(type, message.body(), shape, strides, dim_names);
+}
+
+namespace {
+
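+// A SparseCOOIndex stores the coordinates of non-zero values as a
+// (non_zero_length x ndim) integer tensor; row-major strides are assumed when
+// the flatbuffer omits indicesStrides (see below).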
+Result<std::shared_ptr<SparseIndex>> ReadSparseCOOIndex(
+ const flatbuf::SparseTensor* sparse_tensor, const std::vector<int64_t>& shape,
+ int64_t non_zero_length, io::RandomAccessFile* file) {
+ auto* sparse_index = sparse_tensor->sparseIndex_as_SparseTensorIndexCOO();
+ const auto ndim = static_cast<int64_t>(shape.size());
+
+ std::shared_ptr<DataType> indices_type;
+ RETURN_NOT_OK(internal::GetSparseCOOIndexMetadata(sparse_index, &indices_type));
+ const int64_t indices_elsize = GetByteWidth(*indices_type);
+
+ auto* indices_buffer = sparse_index->indicesBuffer();
+ ARROW_ASSIGN_OR_RAISE(auto indices_data,
+ file->ReadAt(indices_buffer->offset(), indices_buffer->length()));
+ std::vector<int64_t> indices_shape({non_zero_length, ndim});
+ auto* indices_strides = sparse_index->indicesStrides();
+ std::vector<int64_t> strides(2);
+ if (indices_strides && indices_strides->size() > 0) {
+ if (indices_strides->size() != 2) {
+ return Status::Invalid("Wrong size for indicesStrides in SparseCOOIndex");
+ }
+ strides[0] = indices_strides->Get(0);
+ strides[1] = indices_strides->Get(1);
+ } else {
+ // Row-major by default
+ strides[0] = indices_elsize * ndim;
+ strides[1] = indices_elsize;
+ }
+ return SparseCOOIndex::Make(
+ std::make_shared<Tensor>(indices_type, indices_data, indices_shape, strides),
+ sparse_index->isCanonical());
+}
+
+Result<std::shared_ptr<SparseIndex>> ReadSparseCSXIndex(
+ const flatbuf::SparseTensor* sparse_tensor, const std::vector<int64_t>& shape,
+ int64_t non_zero_length, io::RandomAccessFile* file) {
+ if (shape.size() != 2) {
+ return Status::Invalid("Invalid shape length for a sparse matrix");
+ }
+
+ auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX();
+
+ std::shared_ptr<DataType> indptr_type, indices_type;
+ RETURN_NOT_OK(
+ internal::GetSparseCSXIndexMetadata(sparse_index, &indptr_type, &indices_type));
+ const int indptr_byte_width = GetByteWidth(*indptr_type);
+
+ auto* indptr_buffer = sparse_index->indptrBuffer();
+ ARROW_ASSIGN_OR_RAISE(auto indptr_data,
+ file->ReadAt(indptr_buffer->offset(), indptr_buffer->length()));
+
+ auto* indices_buffer = sparse_index->indicesBuffer();
+ ARROW_ASSIGN_OR_RAISE(auto indices_data,
+ file->ReadAt(indices_buffer->offset(), indices_buffer->length()));
+
+ std::vector<int64_t> indices_shape({non_zero_length});
+ const auto indices_minimum_bytes = indices_shape[0] * GetByteWidth(*indices_type);
+ if (indices_minimum_bytes > indices_buffer->length()) {
+ return Status::Invalid("shape is inconsistent to the size of indices buffer");
+ }
+
+ switch (sparse_index->compressedAxis()) {
+ case flatbuf::SparseMatrixCompressedAxis::Row: {
+ std::vector<int64_t> indptr_shape({shape[0] + 1});
+ const int64_t indptr_minimum_bytes = indptr_shape[0] * indptr_byte_width;
+ if (indptr_minimum_bytes > indptr_buffer->length()) {
+ return Status::Invalid("shape is inconsistent to the size of indptr buffer");
+ }
+ return std::make_shared<SparseCSRIndex>(
+ std::make_shared<Tensor>(indptr_type, indptr_data, indptr_shape),
+ std::make_shared<Tensor>(indices_type, indices_data, indices_shape));
+ }
+ case flatbuf::SparseMatrixCompressedAxis::Column: {
+ std::vector<int64_t> indptr_shape({shape[1] + 1});
+ const int64_t indptr_minimum_bytes = indptr_shape[0] * indptr_byte_width;
+ if (indptr_minimum_bytes > indptr_buffer->length()) {
+ return Status::Invalid("shape is inconsistent to the size of indptr buffer");
+ }
+ return std::make_shared<SparseCSCIndex>(
+ std::make_shared<Tensor>(indptr_type, indptr_data, indptr_shape),
+ std::make_shared<Tensor>(indices_type, indices_data, indices_shape));
+ }
+ default:
+ return Status::Invalid("Invalid value of SparseMatrixCompressedAxis");
+ }
+}
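+
+// A worked CSR example with assumed data (not taken from any input): the
+// 2x3 matrix [[0, 5, 0], [7, 0, 9]] is encoded as
+//
+//   indptr  = {0, 1, 3}  // shape[0] + 1 entries; row i spans [indptr[i], indptr[i+1])
+//   indices = {1, 0, 2}  // column of each non-zero; non_zero_length entries
+//   data    = {5, 7, 9}
+//
+// which is why the indptr buffer must hold at least shape[0] + 1 entries for
+// the Row axis (respectively shape[1] + 1 for Column).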
+
+Result<std::shared_ptr<SparseIndex>> ReadSparseCSFIndex(
+ const flatbuf::SparseTensor* sparse_tensor, const std::vector<int64_t>& shape,
+ io::RandomAccessFile* file) {
+ auto* sparse_index = sparse_tensor->sparseIndex_as_SparseTensorIndexCSF();
+ const auto ndim = static_cast<int64_t>(shape.size());
+ auto* indptr_buffers = sparse_index->indptrBuffers();
+ auto* indices_buffers = sparse_index->indicesBuffers();
+ std::vector<std::shared_ptr<Buffer>> indptr_data(ndim - 1);
+ std::vector<std::shared_ptr<Buffer>> indices_data(ndim);
+
+ std::shared_ptr<DataType> indptr_type, indices_type;
+ std::vector<int64_t> axis_order, indices_size;
+
+ RETURN_NOT_OK(internal::GetSparseCSFIndexMetadata(
+ sparse_index, &axis_order, &indices_size, &indptr_type, &indices_type));
+ for (int i = 0; i < static_cast<int>(indptr_buffers->size()); ++i) {
+ ARROW_ASSIGN_OR_RAISE(indptr_data[i], file->ReadAt(indptr_buffers->Get(i)->offset(),
+ indptr_buffers->Get(i)->length()));
+ }
+ for (int i = 0; i < static_cast<int>(indices_buffers->size()); ++i) {
+ ARROW_ASSIGN_OR_RAISE(indices_data[i],
+ file->ReadAt(indices_buffers->Get(i)->offset(),
+ indices_buffers->Get(i)->length()));
+ }
+
+ return SparseCSFIndex::Make(indptr_type, indices_type, indices_size, axis_order,
+ indptr_data, indices_data);
+}
+
+Result<std::shared_ptr<SparseTensor>> MakeSparseTensorWithSparseCOOIndex(
+ const std::shared_ptr<DataType>& type, const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names,
+ const std::shared_ptr<SparseCOOIndex>& sparse_index, int64_t non_zero_length,
+ const std::shared_ptr<Buffer>& data) {
+ return SparseCOOTensor::Make(sparse_index, type, data, shape, dim_names);
+}
+
+Result<std::shared_ptr<SparseTensor>> MakeSparseTensorWithSparseCSRIndex(
+ const std::shared_ptr<DataType>& type, const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names,
+ const std::shared_ptr<SparseCSRIndex>& sparse_index, int64_t non_zero_length,
+ const std::shared_ptr<Buffer>& data) {
+ return SparseCSRMatrix::Make(sparse_index, type, data, shape, dim_names);
+}
+
+Result<std::shared_ptr<SparseTensor>> MakeSparseTensorWithSparseCSCIndex(
+ const std::shared_ptr<DataType>& type, const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names,
+ const std::shared_ptr<SparseCSCIndex>& sparse_index, int64_t non_zero_length,
+ const std::shared_ptr<Buffer>& data) {
+ return SparseCSCMatrix::Make(sparse_index, type, data, shape, dim_names);
+}
+
+Result<std::shared_ptr<SparseTensor>> MakeSparseTensorWithSparseCSFIndex(
+ const std::shared_ptr<DataType>& type, const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names,
+ const std::shared_ptr<SparseCSFIndex>& sparse_index,
+ const std::shared_ptr<Buffer>& data) {
+ return SparseCSFTensor::Make(sparse_index, type, data, shape, dim_names);
+}
+
+Status ReadSparseTensorMetadata(const Buffer& metadata,
+ std::shared_ptr<DataType>* out_type,
+ std::vector<int64_t>* out_shape,
+ std::vector<std::string>* out_dim_names,
+ int64_t* out_non_zero_length,
+ SparseTensorFormat::type* out_format_id,
+ const flatbuf::SparseTensor** out_fb_sparse_tensor,
+ const flatbuf::Buffer** out_buffer) {
+ RETURN_NOT_OK(internal::GetSparseTensorMetadata(
+ metadata, out_type, out_shape, out_dim_names, out_non_zero_length, out_format_id));
+
+ const flatbuf::Message* message = nullptr;
+ RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message));
+
+ auto sparse_tensor = message->header_as_SparseTensor();
+ if (sparse_tensor == nullptr) {
+ return Status::IOError(
+ "Header-type of flatbuffer-encoded Message is not SparseTensor.");
+ }
+ *out_fb_sparse_tensor = sparse_tensor;
+
+ auto buffer = sparse_tensor->data();
+ if (!BitUtil::IsMultipleOf8(buffer->offset())) {
+ return Status::Invalid(
+ "Buffer of sparse index data did not start on 8-byte aligned offset: ",
+ buffer->offset());
+ }
+ *out_buffer = buffer;
+
+ return Status::OK();
+}
+
+} // namespace
+
+namespace internal {
+
+namespace {
+
+Result<size_t> GetSparseTensorBodyBufferCount(SparseTensorFormat::type format_id,
+ const size_t ndim) {
+ switch (format_id) {
+ case SparseTensorFormat::COO:
+ return 2;
+
+ case SparseTensorFormat::CSR:
+ return 3;
+
+ case SparseTensorFormat::CSC:
+ return 3;
+
+ case SparseTensorFormat::CSF:
+ return 2 * ndim;
+
+ default:
+ return Status::Invalid("Unrecognized sparse tensor format");
+ }
+}
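+
+// The counts above follow from the index layouts: COO ships indices + data
+// (2 buffers), CSR/CSC ship indptr + indices + data (3 buffers), and CSF
+// ships (ndim - 1) indptr buffers + ndim indices buffers + 1 data buffer,
+// i.e. 2 * ndim buffers in total.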
+
+Status CheckSparseTensorBodyBufferCount(const IpcPayload& payload,
+ SparseTensorFormat::type sparse_tensor_format_id,
+ const size_t ndim) {
+ size_t expected_body_buffer_count = 0;
+ ARROW_ASSIGN_OR_RAISE(expected_body_buffer_count,
+ GetSparseTensorBodyBufferCount(sparse_tensor_format_id, ndim));
+ if (payload.body_buffers.size() != expected_body_buffer_count) {
+ return Status::Invalid("Invalid body buffer count for a sparse tensor");
+ }
+
+ return Status::OK();
+}
+
+} // namespace
+
+Result<size_t> ReadSparseTensorBodyBufferCount(const Buffer& metadata) {
+ SparseTensorFormat::type format_id;
+ std::vector<int64_t> shape;
+
+ RETURN_NOT_OK(internal::GetSparseTensorMetadata(metadata, nullptr, &shape, nullptr,
+ nullptr, &format_id));
+
+ return GetSparseTensorBodyBufferCount(format_id, static_cast<size_t>(shape.size()));
+}
+
+Result<std::shared_ptr<SparseTensor>> ReadSparseTensorPayload(const IpcPayload& payload) {
+ std::shared_ptr<DataType> type;
+ std::vector<int64_t> shape;
+ std::vector<std::string> dim_names;
+ int64_t non_zero_length;
+ SparseTensorFormat::type sparse_tensor_format_id;
+ const flatbuf::SparseTensor* sparse_tensor;
+ const flatbuf::Buffer* buffer;
+
+ RETURN_NOT_OK(ReadSparseTensorMetadata(*payload.metadata, &type, &shape, &dim_names,
+ &non_zero_length, &sparse_tensor_format_id,
+ &sparse_tensor, &buffer));
+
+ RETURN_NOT_OK(CheckSparseTensorBodyBufferCount(payload, sparse_tensor_format_id,
+ static_cast<size_t>(shape.size())));
+
+ switch (sparse_tensor_format_id) {
+ case SparseTensorFormat::COO: {
+ std::shared_ptr<SparseCOOIndex> sparse_index;
+ std::shared_ptr<DataType> indices_type;
+ RETURN_NOT_OK(internal::GetSparseCOOIndexMetadata(
+ sparse_tensor->sparseIndex_as_SparseTensorIndexCOO(), &indices_type));
+ ARROW_ASSIGN_OR_RAISE(sparse_index,
+ SparseCOOIndex::Make(indices_type, shape, non_zero_length,
+ payload.body_buffers[0]));
+ return MakeSparseTensorWithSparseCOOIndex(type, shape, dim_names, sparse_index,
+ non_zero_length, payload.body_buffers[1]);
+ }
+ case SparseTensorFormat::CSR: {
+ std::shared_ptr<SparseCSRIndex> sparse_index;
+ std::shared_ptr<DataType> indptr_type;
+ std::shared_ptr<DataType> indices_type;
+ RETURN_NOT_OK(internal::GetSparseCSXIndexMetadata(
+ sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX(), &indptr_type,
+ &indices_type));
+ ARROW_CHECK_EQ(indptr_type, indices_type);
+ ARROW_ASSIGN_OR_RAISE(
+ sparse_index,
+ SparseCSRIndex::Make(indices_type, shape, non_zero_length,
+ payload.body_buffers[0], payload.body_buffers[1]));
+ return MakeSparseTensorWithSparseCSRIndex(type, shape, dim_names, sparse_index,
+ non_zero_length, payload.body_buffers[2]);
+ }
+ case SparseTensorFormat::CSC: {
+ std::shared_ptr<SparseCSCIndex> sparse_index;
+ std::shared_ptr<DataType> indptr_type;
+ std::shared_ptr<DataType> indices_type;
+ RETURN_NOT_OK(internal::GetSparseCSXIndexMetadata(
+ sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX(), &indptr_type,
+ &indices_type));
+ ARROW_CHECK_EQ(indptr_type, indices_type);
+ ARROW_ASSIGN_OR_RAISE(
+ sparse_index,
+ SparseCSCIndex::Make(indices_type, shape, non_zero_length,
+ payload.body_buffers[0], payload.body_buffers[1]));
+ return MakeSparseTensorWithSparseCSCIndex(type, shape, dim_names, sparse_index,
+ non_zero_length, payload.body_buffers[2]);
+ }
+ case SparseTensorFormat::CSF: {
+ std::shared_ptr<SparseCSFIndex> sparse_index;
+ std::shared_ptr<DataType> indptr_type, indices_type;
+ std::vector<int64_t> axis_order, indices_size;
+
+ RETURN_NOT_OK(internal::GetSparseCSFIndexMetadata(
+ sparse_tensor->sparseIndex_as_SparseTensorIndexCSF(), &axis_order,
+ &indices_size, &indptr_type, &indices_type));
+ ARROW_CHECK_EQ(indptr_type, indices_type);
+
+ const int64_t ndim = shape.size();
+ std::vector<std::shared_ptr<Buffer>> indptr_data(ndim - 1);
+ std::vector<std::shared_ptr<Buffer>> indices_data(ndim);
+
+ for (int64_t i = 0; i < ndim - 1; ++i) {
+ indptr_data[i] = payload.body_buffers[i];
+ }
+ for (int64_t i = 0; i < ndim; ++i) {
+ indices_data[i] = payload.body_buffers[i + ndim - 1];
+ }
+
+ ARROW_ASSIGN_OR_RAISE(sparse_index,
+ SparseCSFIndex::Make(indptr_type, indices_type, indices_size,
+ axis_order, indptr_data, indices_data));
+ return MakeSparseTensorWithSparseCSFIndex(type, shape, dim_names, sparse_index,
+ payload.body_buffers[2 * ndim - 1]);
+ }
+ default:
+ return Status::Invalid("Unsupported sparse index format");
+ }
+}
+
+} // namespace internal
+
+Result<std::shared_ptr<SparseTensor>> ReadSparseTensor(const Buffer& metadata,
+ io::RandomAccessFile* file) {
+ std::shared_ptr<DataType> type;
+ std::vector<int64_t> shape;
+ std::vector<std::string> dim_names;
+ int64_t non_zero_length;
+ SparseTensorFormat::type sparse_tensor_format_id;
+ const flatbuf::SparseTensor* sparse_tensor;
+ const flatbuf::Buffer* buffer;
+
+ RETURN_NOT_OK(ReadSparseTensorMetadata(metadata, &type, &shape, &dim_names,
+ &non_zero_length, &sparse_tensor_format_id,
+ &sparse_tensor, &buffer));
+
+ ARROW_ASSIGN_OR_RAISE(auto data, file->ReadAt(buffer->offset(), buffer->length()));
+
+ std::shared_ptr<SparseIndex> sparse_index;
+ switch (sparse_tensor_format_id) {
+ case SparseTensorFormat::COO: {
+ ARROW_ASSIGN_OR_RAISE(
+ sparse_index, ReadSparseCOOIndex(sparse_tensor, shape, non_zero_length, file));
+ return MakeSparseTensorWithSparseCOOIndex(
+ type, shape, dim_names, checked_pointer_cast<SparseCOOIndex>(sparse_index),
+ non_zero_length, data);
+ }
+ case SparseTensorFormat::CSR: {
+ ARROW_ASSIGN_OR_RAISE(
+ sparse_index, ReadSparseCSXIndex(sparse_tensor, shape, non_zero_length, file));
+ return MakeSparseTensorWithSparseCSRIndex(
+ type, shape, dim_names, checked_pointer_cast<SparseCSRIndex>(sparse_index),
+ non_zero_length, data);
+ }
+ case SparseTensorFormat::CSC: {
+ ARROW_ASSIGN_OR_RAISE(
+ sparse_index, ReadSparseCSXIndex(sparse_tensor, shape, non_zero_length, file));
+ return MakeSparseTensorWithSparseCSCIndex(
+ type, shape, dim_names, checked_pointer_cast<SparseCSCIndex>(sparse_index),
+ non_zero_length, data);
+ }
+ case SparseTensorFormat::CSF: {
+ ARROW_ASSIGN_OR_RAISE(sparse_index, ReadSparseCSFIndex(sparse_tensor, shape, file));
+ return MakeSparseTensorWithSparseCSFIndex(
+ type, shape, dim_names, checked_pointer_cast<SparseCSFIndex>(sparse_index),
+ data);
+ }
+ default:
+ return Status::Invalid("Unsupported sparse index format");
+ }
+}
+
+Result<std::shared_ptr<SparseTensor>> ReadSparseTensor(const Message& message) {
+ CHECK_HAS_BODY(message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body()));
+ return ReadSparseTensor(*message.metadata(), reader.get());
+}
+
+Result<std::shared_ptr<SparseTensor>> ReadSparseTensor(io::InputStream* file) {
+ std::unique_ptr<Message> message;
+ RETURN_NOT_OK(ReadContiguousPayload(file, &message));
+ CHECK_MESSAGE_TYPE(MessageType::SPARSE_TENSOR, message->type());
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ return ReadSparseTensor(*message->metadata(), reader.get());
+}
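+
+// A minimal usage sketch (illustrative; `input` is an assumed io::InputStream
+// positioned at the start of an encapsulated SPARSE_TENSOR message):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto sparse_tensor, ReadSparseTensor(input));
+//   if (sparse_tensor->format_id() == SparseTensorFormat::COO) {
+//     // ... handle the COO case ...
+//   }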
+
+///////////////////////////////////////////////////////////////////////////
+// Helpers for fuzzing
+
+namespace internal {
+
+Status FuzzIpcStream(const uint8_t* data, int64_t size) {
+ auto buffer = std::make_shared<Buffer>(data, size);
+ io::BufferReader buffer_reader(buffer);
+
+ std::shared_ptr<RecordBatchReader> batch_reader;
+ ARROW_ASSIGN_OR_RAISE(batch_reader, RecordBatchStreamReader::Open(&buffer_reader));
+
+ while (true) {
+ std::shared_ptr<arrow::RecordBatch> batch;
+ RETURN_NOT_OK(batch_reader->ReadNext(&batch));
+ if (batch == nullptr) {
+ break;
+ }
+ RETURN_NOT_OK(batch->ValidateFull());
+ }
+
+ return Status::OK();
+}
+
+Status FuzzIpcFile(const uint8_t* data, int64_t size) {
+ auto buffer = std::make_shared<Buffer>(data, size);
+ io::BufferReader buffer_reader(buffer);
+
+ std::shared_ptr<RecordBatchFileReader> batch_reader;
+ ARROW_ASSIGN_OR_RAISE(batch_reader, RecordBatchFileReader::Open(&buffer_reader));
+
+ const int n_batches = batch_reader->num_record_batches();
+ for (int i = 0; i < n_batches; ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto batch, batch_reader->ReadRecordBatch(i));
+ RETURN_NOT_OK(batch->ValidateFull());
+ }
+
+ return Status::OK();
+}
+
+Status FuzzIpcTensorStream(const uint8_t* data, int64_t size) {
+ auto buffer = std::make_shared<Buffer>(data, size);
+ io::BufferReader buffer_reader(buffer);
+
+ std::shared_ptr<Tensor> tensor;
+
+ while (true) {
+ ARROW_ASSIGN_OR_RAISE(tensor, ReadTensor(&buffer_reader));
+ if (tensor == nullptr) {
+ break;
+ }
+ RETURN_NOT_OK(tensor->Validate());
+ }
+
+ return Status::OK();
+}
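+
+// A minimal libFuzzer-style harness sketch built on the helper above
+// (LLVMFuzzerTestOneInput is the standard libFuzzer entry point, not a name
+// defined in this file):
+//
+//   extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+//     auto status =
+//         arrow::ipc::internal::FuzzIpcStream(data, static_cast<int64_t>(size));
+//     ARROW_UNUSED(status);  // malformed input is expected to fail cleanly
+//     return 0;
+//   }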
+
+} // namespace internal
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h
new file mode 100644
index 00000000000..6f2157557f3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h
@@ -0,0 +1,536 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Read Arrow files and streams
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/io/caching.h"
+#include "arrow/io/type_fwd.h"
+#include "arrow/ipc/message.h"
+#include "arrow/ipc/options.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace ipc {
+
+class DictionaryMemo;
+struct IpcPayload;
+
+using RecordBatchReader = ::arrow::RecordBatchReader;
+
+struct ReadStats {
+ /// Number of IPC messages read.
+ int64_t num_messages = 0;
+ /// Number of record batches read.
+ int64_t num_record_batches = 0;
+ /// Number of dictionary batches read.
+ ///
+ /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
+ int64_t num_dictionary_batches = 0;
+
+ /// Number of dictionary deltas read.
+ int64_t num_dictionary_deltas = 0;
+ /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
+ /// an existing dictionary with an unrelated new dictionary).
+ int64_t num_replaced_dictionaries = 0;
+};
+
+/// \brief Synchronous batch stream reader that reads from io::InputStream
+///
+/// This class reads the schema (plus any dictionaries) as the first messages
+/// in the stream, followed by record batches. For more granular zero-copy
+/// reads, see the ReadRecordBatch functions.
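+///
+/// A minimal usage sketch (illustrative; error handling elided), assuming
+/// `stream` carries Arrow IPC stream-format data:
+///
+/// ~~~{.cpp}
+/// ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchStreamReader::Open(stream));
+/// std::shared_ptr<RecordBatch> batch;
+/// while (true) {
+///   ARROW_RETURN_NOT_OK(reader->ReadNext(&batch));
+///   if (!batch) break;  // a null batch signals end-of-stream
+/// }
+/// ~~~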
+class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader {
+ public:
+ /// Create batch reader from generic MessageReader.
+ /// This will take ownership of the given MessageReader.
+ ///
+ /// \param[in] message_reader a MessageReader implementation
+ /// \param[in] options any IPC reading options (optional)
+ /// \return the created batch reader
+ static Result<std::shared_ptr<RecordBatchStreamReader>> Open(
+ std::unique_ptr<MessageReader> message_reader,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Record batch stream reader from InputStream
+ ///
+ /// \param[in] stream an input stream instance. Must stay alive throughout
+ /// lifetime of stream reader
+ /// \param[in] options any IPC reading options (optional)
+ /// \return the created batch reader
+ static Result<std::shared_ptr<RecordBatchStreamReader>> Open(
+ io::InputStream* stream,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open stream and retain ownership of stream object
+ /// \param[in] stream the input stream
+ /// \param[in] options any IPC reading options (optional)
+ /// \return the created batch reader
+ static Result<std::shared_ptr<RecordBatchStreamReader>> Open(
+ const std::shared_ptr<io::InputStream>& stream,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Return current read statistics
+ virtual ReadStats stats() const = 0;
+};
+
+/// \brief Reads the record batch file format
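+///
+/// A minimal usage sketch (illustrative), assuming `file` is a
+/// std::shared_ptr<io::RandomAccessFile> holding a complete Arrow IPC file:
+///
+/// ~~~{.cpp}
+/// ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchFileReader::Open(file));
+/// for (int i = 0; i < reader->num_record_batches(); ++i) {
+///   ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(i));
+///   // ... use *batch ...
+/// }
+/// ~~~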
+class ARROW_EXPORT RecordBatchFileReader
+ : public std::enable_shared_from_this<RecordBatchFileReader> {
+ public:
+ virtual ~RecordBatchFileReader() = default;
+
+ /// \brief Open a RecordBatchFileReader
+ ///
+ /// Open a file-like object that is assumed to be self-contained; i.e., the
+ /// end of the file interface is the end of the Arrow file. Note that there
+ /// can be any amount of data preceding the Arrow-formatted data, because we
+ /// need only locate the end of the Arrow file stream to discover the metadata
+ /// and then proceed to read the data into memory.
+ static Result<std::shared_ptr<RecordBatchFileReader>> Open(
+ io::RandomAccessFile* file,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Open a RecordBatchFileReader
+  ///
+  /// If the file is embedded within some larger file or memory region, you can
+  /// pass the absolute memory offset to the end of the file (which contains the
+  /// metadata footer). The metadata must have been written with memory offsets
+  /// relative to the start of the containing file.
+ ///
+ /// \param[in] file the data source
+ /// \param[in] footer_offset the position of the end of the Arrow file
+ /// \param[in] options options for IPC reading
+  /// \return the opened reader
+ static Result<std::shared_ptr<RecordBatchFileReader>> Open(
+ io::RandomAccessFile* file, int64_t footer_offset,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Version of Open that retains ownership of file
+ ///
+ /// \param[in] file the data source
+ /// \param[in] options options for IPC reading
+  /// \return the opened reader
+ static Result<std::shared_ptr<RecordBatchFileReader>> Open(
+ const std::shared_ptr<io::RandomAccessFile>& file,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Version of Open that retains ownership of file
+ ///
+ /// \param[in] file the data source
+ /// \param[in] footer_offset the position of the end of the Arrow file
+ /// \param[in] options options for IPC reading
+  /// \return the opened reader
+ static Result<std::shared_ptr<RecordBatchFileReader>> Open(
+ const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open a file asynchronously (owns the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open a file asynchronously (borrows the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ io::RandomAccessFile* file,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open a file asynchronously (owns the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open a file asynchronously (borrows the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ io::RandomAccessFile* file, int64_t footer_offset,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief The schema read from the file
+ virtual std::shared_ptr<Schema> schema() const = 0;
+
+ /// \brief Returns the number of record batches in the file
+ virtual int num_record_batches() const = 0;
+
+ /// \brief Return the metadata version from the file metadata
+ virtual MetadataVersion version() const = 0;
+
+ /// \brief Return the contents of the custom_metadata field from the file's
+ /// Footer
+ virtual std::shared_ptr<const KeyValueMetadata> metadata() const = 0;
+
+ /// \brief Read a particular record batch from the file. Does not copy memory
+ /// if the input source supports zero-copy.
+ ///
+ /// \param[in] i the index of the record batch to return
+ /// \return the read batch
+ virtual Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(int i) = 0;
+
+ /// \brief Return current read statistics
+ virtual ReadStats stats() const = 0;
+
+ /// \brief Computes the total number of rows in the file.
+ virtual Result<int64_t> CountRows() = 0;
+
+ /// \brief Get a reentrant generator of record batches.
+ ///
+ /// \param[in] coalesce If true, enable I/O coalescing.
+ /// \param[in] io_context The IOContext to use (controls which thread pool
+ /// is used for I/O).
+ /// \param[in] cache_options Options for coalescing (if enabled).
+ /// \param[in] executor Optionally, an executor to use for decoding record
+ /// batches. This is generally only a benefit for very wide and/or
+ /// compressed batches.
+ virtual Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
+ const bool coalesce = false,
+ const io::IOContext& io_context = io::default_io_context(),
+ const io::CacheOptions cache_options = io::CacheOptions::LazyDefaults(),
+ arrow::internal::Executor* executor = NULLPTR) = 0;
+};
+
+/// \brief A general listener class to receive events.
+///
+/// You must implement callback methods for the events you are interested in.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \since 0.17.0
+class ARROW_EXPORT Listener {
+ public:
+ virtual ~Listener() = default;
+
+ /// \brief Called when end-of-stream is received.
+ ///
+ /// The default implementation just returns arrow::Status::OK().
+ ///
+ /// \return Status
+ ///
+ /// \see StreamDecoder
+ virtual Status OnEOS();
+
+ /// \brief Called when a record batch is decoded.
+ ///
+ /// The default implementation just returns
+ /// arrow::Status::NotImplemented().
+ ///
+ /// \param[in] record_batch a record batch decoded
+ /// \return Status
+ ///
+ /// \see StreamDecoder
+ virtual Status OnRecordBatchDecoded(std::shared_ptr<RecordBatch> record_batch);
+
+ /// \brief Called when a schema is decoded.
+ ///
+ /// The default implementation just returns arrow::Status::OK().
+ ///
+ /// \param[in] schema a schema decoded
+ /// \return Status
+ ///
+ /// \see StreamDecoder
+ virtual Status OnSchemaDecoded(std::shared_ptr<Schema> schema);
+};
+
+/// \brief Collect schema and record batches decoded by StreamDecoder.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \since 0.17.0
+class ARROW_EXPORT CollectListener : public Listener {
+ public:
+ CollectListener() : schema_(), record_batches_() {}
+ virtual ~CollectListener() = default;
+
+ Status OnSchemaDecoded(std::shared_ptr<Schema> schema) override {
+ schema_ = std::move(schema);
+ return Status::OK();
+ }
+
+ Status OnRecordBatchDecoded(std::shared_ptr<RecordBatch> record_batch) override {
+ record_batches_.push_back(std::move(record_batch));
+ return Status::OK();
+ }
+
+ /// \return the decoded schema
+ std::shared_ptr<Schema> schema() const { return schema_; }
+
+  /// \return all the decoded record batches
+ std::vector<std::shared_ptr<RecordBatch>> record_batches() const {
+ return record_batches_;
+ }
+
+ private:
+ std::shared_ptr<Schema> schema_;
+ std::vector<std::shared_ptr<RecordBatch>> record_batches_;
+};
+
+/// \brief Push style stream decoder that receives data from user.
+///
+/// This class decodes the Apache Arrow IPC streaming format data.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \see https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
+///
+/// \since 0.17.0
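+///
+/// A minimal usage sketch with CollectListener (illustrative; `chunk` stands
+/// for an incoming std::shared_ptr<Buffer> and is an assumed name):
+///
+/// ~~~{.cpp}
+/// auto listener = std::make_shared<CollectListener>();
+/// StreamDecoder decoder(listener);
+/// while (/* more data */) {
+///   ARROW_RETURN_NOT_OK(decoder.Consume(chunk));
+/// }
+/// auto schema = listener->schema();
+/// auto batches = listener->record_batches();
+/// ~~~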
+class ARROW_EXPORT StreamDecoder {
+ public:
+ /// \brief Construct a stream decoder.
+ ///
+ /// \param[in] listener a Listener that must implement
+ /// Listener::OnRecordBatchDecoded() to receive decoded record batches
+ /// \param[in] options any IPC reading options (optional)
+ StreamDecoder(std::shared_ptr<Listener> listener,
+ IpcReadOptions options = IpcReadOptions::Defaults());
+
+ virtual ~StreamDecoder();
+
+  /// \brief Feed data to the decoder as raw bytes.
+  ///
+  /// If the data is enough to decode one or more record batches, the
+  /// decoder calls listener->OnRecordBatchDecoded() once per decoded
+  /// record batch.
+  ///
+  /// \param[in] data the raw data to be processed. This data isn't
+  /// copied. The passed memory must be kept alive through record
+  /// batch processing.
+  /// \param[in] size raw data size.
+ /// \return Status
+ Status Consume(const uint8_t* data, int64_t size);
+
+ /// \brief Feed data to the decoder as a Buffer.
+ ///
+  /// If the buffer is enough to decode one or more record batches, the
+  /// decoder calls listener->OnRecordBatchDecoded() once per decoded
+  /// record batch.
+ ///
+ /// \param[in] buffer a Buffer to be processed.
+ /// \return Status
+ Status Consume(std::shared_ptr<Buffer> buffer);
+
+ /// \return the shared schema of the record batches in the stream
+ std::shared_ptr<Schema> schema() const;
+
+ /// \brief Return the number of bytes needed to advance the state of
+ /// the decoder.
+ ///
+ /// This method is provided for users who want to optimize performance.
+ /// Normal users don't need to use this method.
+ ///
+ /// Here is an example usage for normal users:
+ ///
+ /// ~~~{.cpp}
+ /// decoder.Consume(buffer1);
+ /// decoder.Consume(buffer2);
+ /// decoder.Consume(buffer3);
+ /// ~~~
+ ///
+  /// The decoder has an internal buffer. If the consumed data isn't
+  /// enough to advance the state of the decoder, it is accumulated in
+  /// the internal buffer, which causes a performance overhead.
+  ///
+  /// If you pass exactly next_required_size() bytes of data to each
+  /// Consume() call, the decoder doesn't use its internal buffer and
+  /// performance improves.
+ ///
+ /// Here is an example usage to avoid using internal buffer:
+ ///
+ /// ~~~{.cpp}
+ /// buffer1 = get_data(decoder.next_required_size());
+ /// decoder.Consume(buffer1);
+ /// buffer2 = get_data(decoder.next_required_size());
+ /// decoder.Consume(buffer2);
+ /// ~~~
+ ///
+  /// You can also use this method to avoid creating small chunks.
+  /// Record batch data must be contiguous, so if you pass small chunks
+  /// to the decoder, it needs to concatenate them internally, which
+  /// causes a performance overhead.
+ ///
+ /// Here is an example usage to reduce small chunks:
+ ///
+ /// ~~~{.cpp}
+ /// buffer = AllocateResizableBuffer();
+ /// while ((small_chunk = get_data(&small_chunk_size))) {
+ /// auto current_buffer_size = buffer->size();
+ /// buffer->Resize(current_buffer_size + small_chunk_size);
+ /// memcpy(buffer->mutable_data() + current_buffer_size,
+ /// small_chunk,
+ /// small_chunk_size);
+ /// if (buffer->size() < decoder.next_required_size()) {
+ /// continue;
+ /// }
+ /// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
+ /// decoder.Consume(chunk);
+ /// buffer = AllocateResizableBuffer();
+ /// }
+ /// if (buffer->size() > 0) {
+ /// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
+ /// decoder.Consume(chunk);
+ /// }
+ /// ~~~
+ ///
+ /// \return the number of bytes needed to advance the state of the
+ /// decoder
+ int64_t next_required_size() const;
+
+ /// \brief Return current read statistics
+ ReadStats stats() const;
+
+ private:
+ class StreamDecoderImpl;
+ std::unique_ptr<StreamDecoderImpl> impl_;
+
+ ARROW_DISALLOW_COPY_AND_ASSIGN(StreamDecoder);
+};
+
+// Generic read functions; these do not copy data if the input supports zero-copy reads
+
+/// \brief Read Schema from stream serialized as a single IPC message
+/// and populate any dictionary-encoded fields into a DictionaryMemo
+///
+/// \param[in] stream an InputStream
+/// \param[in] dictionary_memo for recording dictionary-encoded fields
+/// \return the output Schema
+///
+/// If record batches follow the schema, it is better to use
+/// RecordBatchStreamReader
+ARROW_EXPORT
+Result<std::shared_ptr<Schema>> ReadSchema(io::InputStream* stream,
+ DictionaryMemo* dictionary_memo);
+
+/// \brief Read Schema from encapsulated Message
+///
+/// \param[in] message the message containing the Schema IPC metadata
+/// \param[in] dictionary_memo DictionaryMemo for recording dictionary-encoded
+/// fields. Can be nullptr if you are sure there are no
+/// dictionary-encoded fields
+/// \return the resulting Schema
+ARROW_EXPORT
+Result<std::shared_ptr<Schema>> ReadSchema(const Message& message,
+ DictionaryMemo* dictionary_memo);
+
+/// Read record batch as encapsulated IPC message with metadata size prefix and
+/// header
+///
+/// \param[in] schema the record batch schema
+/// \param[in] dictionary_memo DictionaryMemo which has any
+/// dictionaries. Can be nullptr if you are sure there are no
+/// dictionary-encoded fields
+/// \param[in] options IPC options for reading
+/// \param[in] stream the stream to read the batch from
+/// \return the read record batch
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+ const std::shared_ptr<Schema>& schema, const DictionaryMemo* dictionary_memo,
+ const IpcReadOptions& options, io::InputStream* stream);
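+
+// A minimal sketch pairing ReadSchema() with ReadRecordBatch() on the same
+// stream (illustrative; assumes `stream` holds a schema message followed by
+// exactly one record batch):
+//
+//   DictionaryMemo memo;
+//   ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(stream, &memo));
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto batch,
+//       ReadRecordBatch(schema, &memo, IpcReadOptions::Defaults(), stream));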
+
+/// \brief Read record batch from message
+///
+/// \param[in] message a Message containing the record batch metadata
+/// \param[in] schema the record batch schema
+/// \param[in] dictionary_memo DictionaryMemo which has any
+/// dictionaries. Can be nullptr if you are sure there are no
+/// dictionary-encoded fields
+/// \param[in] options IPC options for reading
+/// \return the read record batch
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+ const Message& message, const std::shared_ptr<Schema>& schema,
+ const DictionaryMemo* dictionary_memo, const IpcReadOptions& options);
+
+/// Read record batch from file given metadata and schema
+///
+/// \param[in] metadata a Message containing the record batch metadata
+/// \param[in] schema the record batch schema
+/// \param[in] dictionary_memo DictionaryMemo which has any
+/// dictionaries. Can be nullptr if you are sure there are no
+/// dictionary-encoded fields
+/// \param[in] file a random access file
+/// \param[in] options options for deserialization
+/// \return the read record batch
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+ const Buffer& metadata, const std::shared_ptr<Schema>& schema,
+ const DictionaryMemo* dictionary_memo, const IpcReadOptions& options,
+ io::RandomAccessFile* file);
+
+/// \brief Read arrow::Tensor as encapsulated IPC message in file
+///
+/// \param[in] file an InputStream pointed at the start of the message
+/// \return the read tensor
+ARROW_EXPORT
+Result<std::shared_ptr<Tensor>> ReadTensor(io::InputStream* file);
+
+/// \brief EXPERIMENTAL: Read arrow::Tensor from IPC message
+///
+/// \param[in] message a Message containing the tensor metadata and body
+/// \return the read tensor
+ARROW_EXPORT
+Result<std::shared_ptr<Tensor>> ReadTensor(const Message& message);
+
+/// \brief EXPERIMENTAL: Read arrow::SparseTensor as encapsulated IPC message in file
+///
+/// \param[in] file an InputStream pointed at the start of the message
+/// \return the read sparse tensor
+ARROW_EXPORT
+Result<std::shared_ptr<SparseTensor>> ReadSparseTensor(io::InputStream* file);
+
+/// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message
+///
+/// \param[in] message a Message containing the tensor metadata and body
+/// \return the read sparse tensor
+ARROW_EXPORT
+Result<std::shared_ptr<SparseTensor>> ReadSparseTensor(const Message& message);
+
+namespace internal {
+
+// These internal APIs may change without warning or deprecation
+
+/// \brief EXPERIMENTAL: Read the number of body buffers of a sparse tensor
+/// from its metadata
+/// \param[in] metadata a Buffer containing the sparse tensor metadata
+/// \return the number of body buffers
+ARROW_EXPORT
+Result<size_t> ReadSparseTensorBodyBufferCount(const Buffer& metadata);
+
+/// \brief EXPERIMENTAL: Read arrow::SparseTensor from an IpcPayload
+/// \param[in] payload an IpcPayload containing a serialized SparseTensor
+/// \return the read sparse tensor
+ARROW_EXPORT
+Result<std::shared_ptr<SparseTensor>> ReadSparseTensorPayload(const IpcPayload& payload);
+
+// For fuzzing targets
+ARROW_EXPORT
+Status FuzzIpcStream(const uint8_t* data, int64_t size);
+ARROW_EXPORT
+Status FuzzIpcTensorStream(const uint8_t* data, int64_t size);
+ARROW_EXPORT
+Status FuzzIpcFile(const uint8_t* data, int64_t size);
+
+} // namespace internal
+
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h
new file mode 100644
index 00000000000..3493c4f1409
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+namespace ipc {
+
+enum class MetadataVersion : char {
+ /// 0.1.0
+ V1,
+
+ /// 0.2.0
+ V2,
+
+ /// 0.3.0 to 0.7.1
+ V3,
+
+ /// 0.8.0 to 0.17.0
+ V4,
+
+ /// >= 1.0.0
+ V5
+};
+
+class Message;
+enum class MessageType {
+ NONE,
+ SCHEMA,
+ DICTIONARY_BATCH,
+ RECORD_BATCH,
+ TENSOR,
+ SPARSE_TENSOR
+};
+
+struct IpcReadOptions;
+struct IpcWriteOptions;
+
+class MessageReader;
+
+class RecordBatchStreamReader;
+class RecordBatchFileReader;
+class RecordBatchWriter;
+
+namespace feather {
+
+class Reader;
+
+} // namespace feather
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/util.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/util.h
new file mode 100644
index 00000000000..709fedbf31b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/util.h
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+namespace arrow {
+namespace ipc {
+
+// Buffers are padded to 64-byte boundaries (for SIMD)
+static constexpr int32_t kArrowAlignment = 64;
+
+// Tensors are padded to 64-byte boundaries
+static constexpr int32_t kTensorAlignment = 64;
+
+// Align on 8-byte boundaries in IPC
+static constexpr int32_t kArrowIpcAlignment = 8;
+
+static constexpr uint8_t kPaddingBytes[kArrowAlignment] = {0};
+
+static inline int64_t PaddedLength(int64_t nbytes, int32_t alignment = kArrowAlignment) {
+ return ((nbytes + alignment - 1) / alignment) * alignment;
+}
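+
+// Worked examples of the rounding above (values only, for illustration):
+//   PaddedLength(1)     == 64   // default 64-byte alignment
+//   PaddedLength(65)    == 128
+//   PaddedLength(10, 8) == 16   // e.g. the 8-byte IPC alignment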
+
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc
new file mode 100644
index 00000000000..7b9254b7e59
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc
@@ -0,0 +1,1429 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/writer.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/device.h"
+#include "arrow/extension_type.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/dictionary.h"
+#include "arrow/ipc/message.h"
+#include "arrow/ipc/metadata_internal.h"
+#include "arrow/ipc/util.h"
+#include "arrow/record_batch.h"
+#include "arrow/result_internal.h"
+#include "arrow/sparse_tensor.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/parallel.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+using internal::CopyBitmap;
+using internal::GetByteWidth;
+
+namespace ipc {
+
+using internal::FileBlock;
+using internal::kArrowMagicBytes;
+
+namespace {
+
+bool HasNestedDict(const ArrayData& data) {
+ if (data.type->id() == Type::DICTIONARY) {
+ return true;
+ }
+ for (const auto& child : data.child_data) {
+ if (HasNestedDict(*child)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+Status GetTruncatedBitmap(int64_t offset, int64_t length,
+ const std::shared_ptr<Buffer> input, MemoryPool* pool,
+ std::shared_ptr<Buffer>* buffer) {
+ if (!input) {
+ *buffer = input;
+ return Status::OK();
+ }
+ int64_t min_length = PaddedLength(BitUtil::BytesForBits(length));
+ if (offset != 0 || min_length < input->size()) {
+ // With a sliced array / non-zero offset, we must copy the bitmap
+ ARROW_ASSIGN_OR_RAISE(*buffer, CopyBitmap(pool, input->data(), offset, length));
+ } else {
+ *buffer = input;
+ }
+ return Status::OK();
+}
+
+Status GetTruncatedBuffer(int64_t offset, int64_t length, int32_t byte_width,
+ const std::shared_ptr<Buffer> input, MemoryPool* pool,
+ std::shared_ptr<Buffer>* buffer) {
+ if (!input) {
+ *buffer = input;
+ return Status::OK();
+ }
+ int64_t padded_length = PaddedLength(length * byte_width);
+ if (offset != 0 || padded_length < input->size()) {
+ *buffer =
+ SliceBuffer(input, offset * byte_width, std::min(padded_length, input->size()));
+ } else {
+ *buffer = input;
+ }
+ return Status::OK();
+}
+
+static inline bool NeedTruncate(int64_t offset, const Buffer* buffer,
+ int64_t min_length) {
+ // buffer can be NULL
+ if (buffer == nullptr) {
+ return false;
+ }
+ return offset != 0 || min_length < buffer->size();
+}
+
+class RecordBatchSerializer {
+ public:
+ RecordBatchSerializer(int64_t buffer_start_offset, const IpcWriteOptions& options,
+ IpcPayload* out)
+ : out_(out),
+ options_(options),
+ max_recursion_depth_(options.max_recursion_depth),
+ buffer_start_offset_(buffer_start_offset) {
+ DCHECK_GT(max_recursion_depth_, 0);
+ }
+
+ virtual ~RecordBatchSerializer() = default;
+
+ Status VisitArray(const Array& arr) {
+ static std::shared_ptr<Buffer> kNullBuffer = std::make_shared<Buffer>(nullptr, 0);
+
+ if (max_recursion_depth_ <= 0) {
+ return Status::Invalid("Max recursion depth reached");
+ }
+
+ if (!options_.allow_64bit && arr.length() > std::numeric_limits<int32_t>::max()) {
+ return Status::CapacityError("Cannot write arrays larger than 2^31 - 1 in length");
+ }
+
+ // push back all common elements
+ field_nodes_.push_back({arr.length(), arr.null_count(), 0});
+
+ // In V4, null types have no validity bitmap
+ // In V5 and later, null and union types have no validity bitmap
+ if (internal::HasValidityBitmap(arr.type_id(), options_.metadata_version)) {
+ if (arr.null_count() > 0) {
+ std::shared_ptr<Buffer> bitmap;
+ RETURN_NOT_OK(GetTruncatedBitmap(arr.offset(), arr.length(), arr.null_bitmap(),
+ options_.memory_pool, &bitmap));
+ out_->body_buffers.emplace_back(bitmap);
+ } else {
+ // Push a dummy zero-length buffer, not to be copied
+ out_->body_buffers.emplace_back(kNullBuffer);
+ }
+ }
+ return VisitType(arr);
+ }
+
+ // Override this for writing dictionary metadata
+ virtual Status SerializeMetadata(int64_t num_rows) {
+ return WriteRecordBatchMessage(num_rows, out_->body_length, custom_metadata_,
+ field_nodes_, buffer_meta_, options_, &out_->metadata);
+ }
+
+ void AppendCustomMetadata(const std::string& key, const std::string& value) {
+ if (!custom_metadata_) {
+ custom_metadata_ = std::make_shared<KeyValueMetadata>();
+ }
+ custom_metadata_->Append(key, value);
+ }
+
+ Status CompressBuffer(const Buffer& buffer, util::Codec* codec,
+ std::shared_ptr<Buffer>* out) {
+ // Convert buffer to uncompressed-length-prefixed compressed buffer
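+    // Resulting layout: | int64 little-endian uncompressed length | compressed bytes |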
+ int64_t maximum_length = codec->MaxCompressedLen(buffer.size(), buffer.data());
+ ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(maximum_length + sizeof(int64_t)));
+
+ int64_t actual_length;
+ ARROW_ASSIGN_OR_RAISE(actual_length,
+ codec->Compress(buffer.size(), buffer.data(), maximum_length,
+ result->mutable_data() + sizeof(int64_t)));
+ *reinterpret_cast<int64_t*>(result->mutable_data()) =
+ BitUtil::ToLittleEndian(buffer.size());
+ *out = SliceBuffer(std::move(result), /*offset=*/0, actual_length + sizeof(int64_t));
+ return Status::OK();
+ }
+
+ Status CompressBodyBuffers() {
+ RETURN_NOT_OK(
+ internal::CheckCompressionSupported(options_.codec->compression_type()));
+
+ auto CompressOne = [&](size_t i) {
+ if (out_->body_buffers[i]->size() > 0) {
+ RETURN_NOT_OK(CompressBuffer(*out_->body_buffers[i], options_.codec.get(),
+ &out_->body_buffers[i]));
+ }
+ return Status::OK();
+ };
+
+ return ::arrow::internal::OptionalParallelFor(
+ options_.use_threads, static_cast<int>(out_->body_buffers.size()), CompressOne);
+ }
+
+ Status Assemble(const RecordBatch& batch) {
+ if (field_nodes_.size() > 0) {
+ field_nodes_.clear();
+ buffer_meta_.clear();
+ out_->body_buffers.clear();
+ }
+
+    // Perform a depth-first traversal of the record batch's columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ RETURN_NOT_OK(VisitArray(*batch.column(i)));
+ }
+
+ if (options_.codec != nullptr) {
+ RETURN_NOT_OK(CompressBodyBuffers());
+ }
+
+ // The position for the start of a buffer relative to the passed frame of
+ // reference. May be 0 or some other position in an address space
+ int64_t offset = buffer_start_offset_;
+
+ buffer_meta_.reserve(out_->body_buffers.size());
+
+ // Construct the buffer metadata for the record batch header
+ for (const auto& buffer : out_->body_buffers) {
+ int64_t size = 0;
+ int64_t padding = 0;
+
+ // The buffer might be null if we are handling zero row lengths.
+ if (buffer) {
+ size = buffer->size();
+ padding = BitUtil::RoundUpToMultipleOf8(size) - size;
+ }
+
+ buffer_meta_.push_back({offset, size});
+ offset += size + padding;
+ }
+
+ out_->body_length = offset - buffer_start_offset_;
+ DCHECK(BitUtil::IsMultipleOf8(out_->body_length));
+
+ // Now that we have computed the locations of all of the buffers in shared
+ // memory, the data header can be converted to a flatbuffer and written out
+ //
+ // Note: The memory written here is prefixed by the size of the flatbuffer
+ // itself as an int32_t.
+ return SerializeMetadata(batch.num_rows());
+ }
+
+ template <typename ArrayType>
+ Status GetZeroBasedValueOffsets(const ArrayType& array,
+ std::shared_ptr<Buffer>* value_offsets) {
+ // Share slicing logic between ListArray, BinaryArray and LargeBinaryArray
+ using offset_type = typename ArrayType::offset_type;
+
+ auto offsets = array.value_offsets();
+
+ int64_t required_bytes = sizeof(offset_type) * (array.length() + 1);
+ if (array.offset() != 0) {
+ // If we have a non-zero offset, then the value offsets do not start at
+ // zero. We must a) create a new offsets array with shifted offsets and
+ // b) slice the values array accordingly
+
+ ARROW_ASSIGN_OR_RAISE(auto shifted_offsets,
+ AllocateBuffer(required_bytes, options_.memory_pool));
+
+ offset_type* dest_offsets =
+ reinterpret_cast<offset_type*>(shifted_offsets->mutable_data());
+ const offset_type start_offset = array.value_offset(0);
+
+      for (int64_t i = 0; i < array.length(); ++i) {
+ dest_offsets[i] = array.value_offset(i) - start_offset;
+ }
+ // Final offset
+ dest_offsets[array.length()] = array.value_offset(array.length()) - start_offset;
+ offsets = std::move(shifted_offsets);
+ } else {
+ // ARROW-6046: Slice offsets to used extent, in case we have a truncated
+ // slice
+ if (offsets != nullptr && offsets->size() > required_bytes) {
+ offsets = SliceBuffer(offsets, 0, required_bytes);
+ }
+ }
+ *value_offsets = std::move(offsets);
+ return Status::OK();
+ }
+
+ Status Visit(const BooleanArray& array) {
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(GetTruncatedBitmap(array.offset(), array.length(), array.values(),
+ options_.memory_pool, &data));
+ out_->body_buffers.emplace_back(data);
+ return Status::OK();
+ }
+
+ Status Visit(const NullArray& array) { return Status::OK(); }
+
+ template <typename T>
+ typename std::enable_if<is_number_type<typename T::TypeClass>::value ||
+ is_temporal_type<typename T::TypeClass>::value ||
+ is_fixed_size_binary_type<typename T::TypeClass>::value,
+ Status>::type
+ Visit(const T& array) {
+ std::shared_ptr<Buffer> data = array.values();
+
+ const int64_t type_width = GetByteWidth(*array.type());
+ int64_t min_length = PaddedLength(array.length() * type_width);
+
+ if (NeedTruncate(array.offset(), data.get(), min_length)) {
+ // Non-zero offset, slice the buffer
+ const int64_t byte_offset = array.offset() * type_width;
+
+ // Send padding if it's available
+ const int64_t buffer_length =
+ std::min(BitUtil::RoundUpToMultipleOf8(array.length() * type_width),
+ data->size() - byte_offset);
+ data = SliceBuffer(data, byte_offset, buffer_length);
+ }
+ out_->body_buffers.emplace_back(data);
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_base_binary<typename T::TypeClass, Status> Visit(const T& array) {
+ std::shared_ptr<Buffer> value_offsets;
+ RETURN_NOT_OK(GetZeroBasedValueOffsets<T>(array, &value_offsets));
+ auto data = array.value_data();
+
+ int64_t total_data_bytes = 0;
+ if (value_offsets) {
+ total_data_bytes = array.value_offset(array.length()) - array.value_offset(0);
+ }
+ if (NeedTruncate(array.offset(), data.get(), total_data_bytes)) {
+ // Slice the data buffer to include only the range we need now
+ const int64_t start_offset = array.value_offset(0);
+ const int64_t slice_length =
+ std::min(PaddedLength(total_data_bytes), data->size() - start_offset);
+ data = SliceBuffer(data, start_offset, slice_length);
+ }
+
+ out_->body_buffers.emplace_back(value_offsets);
+ out_->body_buffers.emplace_back(data);
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_base_list<typename T::TypeClass, Status> Visit(const T& array) {
+ using offset_type = typename T::offset_type;
+
+ std::shared_ptr<Buffer> value_offsets;
+ RETURN_NOT_OK(GetZeroBasedValueOffsets<T>(array, &value_offsets));
+ out_->body_buffers.emplace_back(value_offsets);
+
+ --max_recursion_depth_;
+ std::shared_ptr<Array> values = array.values();
+
+ offset_type values_offset = 0;
+ offset_type values_length = 0;
+ if (value_offsets) {
+ values_offset = array.value_offset(0);
+ values_length = array.value_offset(array.length()) - values_offset;
+ }
+
+ if (array.offset() != 0 || values_length < values->length()) {
+ // Must also slice the values
+ values = values->Slice(values_offset, values_length);
+ }
+ RETURN_NOT_OK(VisitArray(*values));
+ ++max_recursion_depth_;
+ return Status::OK();
+ }
+
+ Status Visit(const FixedSizeListArray& array) {
+ --max_recursion_depth_;
+ auto size = array.list_type()->list_size();
+ auto values = array.values()->Slice(array.offset() * size, array.length() * size);
+
+ RETURN_NOT_OK(VisitArray(*values));
+ ++max_recursion_depth_;
+ return Status::OK();
+ }
+
+ Status Visit(const StructArray& array) {
+ --max_recursion_depth_;
+ for (int i = 0; i < array.num_fields(); ++i) {
+ std::shared_ptr<Array> field = array.field(i);
+ RETURN_NOT_OK(VisitArray(*field));
+ }
+ ++max_recursion_depth_;
+ return Status::OK();
+ }
+
+ Status Visit(const SparseUnionArray& array) {
+ const int64_t offset = array.offset();
+ const int64_t length = array.length();
+
+ std::shared_ptr<Buffer> type_codes;
+ RETURN_NOT_OK(GetTruncatedBuffer(
+ offset, length, static_cast<int32_t>(sizeof(UnionArray::type_code_t)),
+ array.type_codes(), options_.memory_pool, &type_codes));
+ out_->body_buffers.emplace_back(type_codes);
+
+ --max_recursion_depth_;
+ for (int i = 0; i < array.num_fields(); ++i) {
+ // Sparse union, slicing is done for us by field()
+ RETURN_NOT_OK(VisitArray(*array.field(i)));
+ }
+ ++max_recursion_depth_;
+ return Status::OK();
+ }
+
+ Status Visit(const DenseUnionArray& array) {
+ const int64_t offset = array.offset();
+ const int64_t length = array.length();
+
+ std::shared_ptr<Buffer> type_codes;
+ RETURN_NOT_OK(GetTruncatedBuffer(
+ offset, length, static_cast<int32_t>(sizeof(UnionArray::type_code_t)),
+ array.type_codes(), options_.memory_pool, &type_codes));
+ out_->body_buffers.emplace_back(type_codes);
+
+ --max_recursion_depth_;
+ const auto& type = checked_cast<const UnionType&>(*array.type());
+
+ std::shared_ptr<Buffer> value_offsets;
+ RETURN_NOT_OK(
+ GetTruncatedBuffer(offset, length, static_cast<int32_t>(sizeof(int32_t)),
+ array.value_offsets(), options_.memory_pool, &value_offsets));
+
+    // The Union type codes are not necessarily 0-indexed
+ int8_t max_code = 0;
+ for (int8_t code : type.type_codes()) {
+ if (code > max_code) {
+ max_code = code;
+ }
+ }
+
+ // Allocate an array of child offsets. Set all to -1 to indicate that we
+ // haven't observed a first occurrence of a particular child yet
+ std::vector<int32_t> child_offsets(max_code + 1, -1);
+ std::vector<int32_t> child_lengths(max_code + 1, 0);
+
+ if (offset != 0) {
+ // This is an unpleasant case. Because the offsets are different for
+ // each child array, when we have a sliced array, we need to "rebase"
+ // the value_offsets for each array
+
+ const int32_t* unshifted_offsets = array.raw_value_offsets();
+ const int8_t* type_codes = array.raw_type_codes();
+
+ // Allocate the shifted offsets
+ ARROW_ASSIGN_OR_RAISE(
+ auto shifted_offsets_buffer,
+ AllocateBuffer(length * sizeof(int32_t), options_.memory_pool));
+ int32_t* shifted_offsets =
+ reinterpret_cast<int32_t*>(shifted_offsets_buffer->mutable_data());
+
+ // Offsets may not be ascending, so we need to find out the start offset
+ // for each child
+ for (int64_t i = 0; i < length; ++i) {
+ const uint8_t code = type_codes[i];
+ if (child_offsets[code] == -1) {
+ child_offsets[code] = unshifted_offsets[i];
+ } else {
+ child_offsets[code] = std::min(child_offsets[code], unshifted_offsets[i]);
+ }
+ }
+
+ // Now compute shifted offsets by subtracting child offset
+ for (int64_t i = 0; i < length; ++i) {
+ const int8_t code = type_codes[i];
+ shifted_offsets[i] = unshifted_offsets[i] - child_offsets[code];
+        // Update the child length to account for the observed value
+ child_lengths[code] = std::max(child_lengths[code], shifted_offsets[i] + 1);
+ }
+
+ value_offsets = std::move(shifted_offsets_buffer);
+ }
+ out_->body_buffers.emplace_back(value_offsets);
+
+ // Visit children and slice accordingly
+ for (int i = 0; i < type.num_fields(); ++i) {
+ std::shared_ptr<Array> child = array.field(i);
+
+ // TODO: ARROW-809, for sliced unions, tricky to know how much to
+ // truncate the children. For now, we are truncating the children to be
+ // no longer than the parent union.
+ if (offset != 0) {
+ const int8_t code = type.type_codes()[i];
+ const int64_t child_offset = child_offsets[code];
+ const int64_t child_length = child_lengths[code];
+
+ if (child_offset > 0) {
+ child = child->Slice(child_offset, child_length);
+ } else if (child_length < child->length()) {
+          // This case also covers a child that is not encountered at all
+ child = child->Slice(0, child_length);
+ }
+ }
+ RETURN_NOT_OK(VisitArray(*child));
+ }
+ ++max_recursion_depth_;
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryArray& array) {
+ // Dictionary written out separately. Slice offset contained in the indices
+ return VisitType(*array.indices());
+ }
+
+ Status Visit(const ExtensionArray& array) { return VisitType(*array.storage()); }
+
+ Status VisitType(const Array& values) { return VisitArrayInline(values, this); }
+
+ protected:
+ // Destination for output buffers
+ IpcPayload* out_;
+
+ std::shared_ptr<KeyValueMetadata> custom_metadata_;
+
+ std::vector<internal::FieldMetadata> field_nodes_;
+ std::vector<internal::BufferMetadata> buffer_meta_;
+
+ const IpcWriteOptions& options_;
+ int64_t max_recursion_depth_;
+ int64_t buffer_start_offset_;
+};
+
+class DictionarySerializer : public RecordBatchSerializer {
+ public:
+ DictionarySerializer(int64_t dictionary_id, bool is_delta, int64_t buffer_start_offset,
+ const IpcWriteOptions& options, IpcPayload* out)
+ : RecordBatchSerializer(buffer_start_offset, options, out),
+ dictionary_id_(dictionary_id),
+ is_delta_(is_delta) {}
+
+ Status SerializeMetadata(int64_t num_rows) override {
+ return WriteDictionaryMessage(dictionary_id_, is_delta_, num_rows, out_->body_length,
+ custom_metadata_, field_nodes_, buffer_meta_, options_,
+ &out_->metadata);
+ }
+
+ Status Assemble(const std::shared_ptr<Array>& dictionary) {
+ // Make a dummy record batch. A bit tedious as we have to make a schema
+ auto schema = arrow::schema({arrow::field("dictionary", dictionary->type())});
+ auto batch = RecordBatch::Make(std::move(schema), dictionary->length(), {dictionary});
+ return RecordBatchSerializer::Assemble(*batch);
+ }
+
+ private:
+ int64_t dictionary_id_;
+ bool is_delta_;
+};
+
+} // namespace
+
+Status WriteIpcPayload(const IpcPayload& payload, const IpcWriteOptions& options,
+ io::OutputStream* dst, int32_t* metadata_length) {
+ RETURN_NOT_OK(WriteMessage(*payload.metadata, options, dst, metadata_length));
+
+#ifndef NDEBUG
+ RETURN_NOT_OK(CheckAligned(dst));
+#endif
+
+ // Now write the buffers
+ for (size_t i = 0; i < payload.body_buffers.size(); ++i) {
+ const std::shared_ptr<Buffer>& buffer = payload.body_buffers[i];
+ int64_t size = 0;
+ int64_t padding = 0;
+
+ // The buffer might be null if we are handling zero row lengths.
+ if (buffer) {
+ size = buffer->size();
+ padding = BitUtil::RoundUpToMultipleOf8(size) - size;
+ }
+
+ if (size > 0) {
+ RETURN_NOT_OK(dst->Write(buffer));
+ }
+
+ if (padding > 0) {
+ RETURN_NOT_OK(dst->Write(kPaddingBytes, padding));
+ }
+ }
+
+#ifndef NDEBUG
+ RETURN_NOT_OK(CheckAligned(dst));
+#endif
+
+ return Status::OK();
+}
+
+Status GetSchemaPayload(const Schema& schema, const IpcWriteOptions& options,
+ const DictionaryFieldMapper& mapper, IpcPayload* out) {
+ out->type = MessageType::SCHEMA;
+ return internal::WriteSchemaMessage(schema, mapper, options, &out->metadata);
+}
+
+Status GetDictionaryPayload(int64_t id, const std::shared_ptr<Array>& dictionary,
+ const IpcWriteOptions& options, IpcPayload* out) {
+ return GetDictionaryPayload(id, false, dictionary, options, out);
+}
+
+Status GetDictionaryPayload(int64_t id, bool is_delta,
+ const std::shared_ptr<Array>& dictionary,
+ const IpcWriteOptions& options, IpcPayload* out) {
+ out->type = MessageType::DICTIONARY_BATCH;
+ // Frame of reference is 0, see ARROW-384
+ DictionarySerializer assembler(id, is_delta, /*buffer_start_offset=*/0, options, out);
+ return assembler.Assemble(dictionary);
+}
+
+Status GetRecordBatchPayload(const RecordBatch& batch, const IpcWriteOptions& options,
+ IpcPayload* out) {
+ out->type = MessageType::RECORD_BATCH;
+ RecordBatchSerializer assembler(/*buffer_start_offset=*/0, options, out);
+ return assembler.Assemble(batch);
+}
+
+Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset,
+ io::OutputStream* dst, int32_t* metadata_length,
+ int64_t* body_length, const IpcWriteOptions& options) {
+ IpcPayload payload;
+ RecordBatchSerializer assembler(buffer_start_offset, options, &payload);
+ RETURN_NOT_OK(assembler.Assemble(batch));
+
+ // TODO: it's a rough edge that the metadata and body length here are
+ // computed separately
+
+ // The body size is computed in the payload
+ *body_length = payload.body_length;
+
+ return WriteIpcPayload(payload, options, dst, metadata_length);
+}
+
+Status WriteRecordBatchStream(const std::vector<std::shared_ptr<RecordBatch>>& batches,
+ const IpcWriteOptions& options, io::OutputStream* dst) {
+ ASSIGN_OR_RAISE(std::shared_ptr<RecordBatchWriter> writer,
+ MakeStreamWriter(dst, batches[0]->schema(), options));
+ for (const auto& batch : batches) {
+ DCHECK(batch->schema()->Equals(*batches[0]->schema())) << "Schemas unequal";
+ RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
+ }
+ RETURN_NOT_OK(writer->Close());
+ return Status::OK();
+}
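+
+// Usage sketch (illustrative only): serializing a vector of batches sharing
+// one schema into an in-memory IPC stream. Assumes `batches` is non-empty.
+//
+//   ARROW_ASSIGN_OR_RAISE(auto sink, io::BufferOutputStream::Create());
+//   RETURN_NOT_OK(WriteRecordBatchStream(batches, IpcWriteOptions::Defaults(),
+//                                        sink.get()));
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> ipc_stream, sink->Finish());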
+
+namespace {
+
+Status WriteTensorHeader(const Tensor& tensor, io::OutputStream* dst,
+ int32_t* metadata_length) {
+ IpcWriteOptions options;
+ options.alignment = kTensorAlignment;
+ std::shared_ptr<Buffer> metadata;
+ ARROW_ASSIGN_OR_RAISE(metadata, internal::WriteTensorMessage(tensor, 0, options));
+ return WriteMessage(*metadata, options, dst, metadata_length);
+}
+
+Status WriteStridedTensorData(int dim_index, int64_t offset, int elem_size,
+ const Tensor& tensor, uint8_t* scratch_space,
+ io::OutputStream* dst) {
+ if (dim_index == tensor.ndim() - 1) {
+ const uint8_t* data_ptr = tensor.raw_data() + offset;
+ const int64_t stride = tensor.strides()[dim_index];
+ for (int64_t i = 0; i < tensor.shape()[dim_index]; ++i) {
+ memcpy(scratch_space + i * elem_size, data_ptr, elem_size);
+ data_ptr += stride;
+ }
+ return dst->Write(scratch_space, elem_size * tensor.shape()[dim_index]);
+ }
+ for (int64_t i = 0; i < tensor.shape()[dim_index]; ++i) {
+ RETURN_NOT_OK(WriteStridedTensorData(dim_index + 1, offset, elem_size, tensor,
+ scratch_space, dst));
+ offset += tensor.strides()[dim_index];
+ }
+ return Status::OK();
+}
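+
+// Worked example for the recursion above (illustrative): an int32 tensor of
+// shape {2, 3} stored column-major has byte strides {4, 8}. The outer call at
+// dim 0 loops twice; each inner call at dim 1 gathers 3 elements spaced
+// 8 bytes apart into scratch_space and writes them as one contiguous 12-byte
+// row, producing row-major output without materializing the whole tensor.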
+
+Status GetContiguousTensor(const Tensor& tensor, MemoryPool* pool,
+ std::unique_ptr<Tensor>* out) {
+ const int elem_size = GetByteWidth(*tensor.type());
+
+ ARROW_ASSIGN_OR_RAISE(
+ auto scratch_space,
+ AllocateBuffer(tensor.shape()[tensor.ndim() - 1] * elem_size, pool));
+
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> contiguous_data,
+ AllocateResizableBuffer(tensor.size() * elem_size, pool));
+
+ io::BufferOutputStream stream(contiguous_data);
+ RETURN_NOT_OK(WriteStridedTensorData(0, 0, elem_size, tensor,
+ scratch_space->mutable_data(), &stream));
+
+ out->reset(new Tensor(tensor.type(), contiguous_data, tensor.shape()));
+
+ return Status::OK();
+}
+
+} // namespace
+
+Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length,
+ int64_t* body_length) {
+ const int elem_size = GetByteWidth(*tensor.type());
+
+ *body_length = tensor.size() * elem_size;
+
+ // Tensor metadata accounts for padding
+ if (tensor.is_contiguous()) {
+ RETURN_NOT_OK(WriteTensorHeader(tensor, dst, metadata_length));
+ auto data = tensor.data();
+ if (data && data->data()) {
+ RETURN_NOT_OK(dst->Write(data->data(), *body_length));
+ } else {
+ *body_length = 0;
+ }
+ } else {
+ // The tensor written is made contiguous
+ Tensor dummy(tensor.type(), nullptr, tensor.shape());
+ RETURN_NOT_OK(WriteTensorHeader(dummy, dst, metadata_length));
+
+ // TODO: Do we care enough about this temporary allocation to pass in a
+ // MemoryPool to this function?
+ ARROW_ASSIGN_OR_RAISE(auto scratch_space,
+ AllocateBuffer(tensor.shape()[tensor.ndim() - 1] * elem_size));
+
+ RETURN_NOT_OK(WriteStridedTensorData(0, 0, elem_size, tensor,
+ scratch_space->mutable_data(), dst));
+ }
+
+ return Status::OK();
+}
+
+Result<std::unique_ptr<Message>> GetTensorMessage(const Tensor& tensor,
+ MemoryPool* pool) {
+ const Tensor* tensor_to_write = &tensor;
+ std::unique_ptr<Tensor> temp_tensor;
+
+ if (!tensor.is_contiguous()) {
+ RETURN_NOT_OK(GetContiguousTensor(tensor, pool, &temp_tensor));
+ tensor_to_write = temp_tensor.get();
+ }
+
+ IpcWriteOptions options;
+ options.alignment = kTensorAlignment;
+ std::shared_ptr<Buffer> metadata;
+ ARROW_ASSIGN_OR_RAISE(metadata,
+ internal::WriteTensorMessage(*tensor_to_write, 0, options));
+ return std::unique_ptr<Message>(new Message(metadata, tensor_to_write->data()));
+}
+
+namespace internal {
+
+class SparseTensorSerializer {
+ public:
+ SparseTensorSerializer(int64_t buffer_start_offset, IpcPayload* out)
+ : out_(out),
+ buffer_start_offset_(buffer_start_offset),
+ options_(IpcWriteOptions::Defaults()) {}
+
+ ~SparseTensorSerializer() = default;
+
+ Status VisitSparseIndex(const SparseIndex& sparse_index) {
+ switch (sparse_index.format_id()) {
+ case SparseTensorFormat::COO:
+ RETURN_NOT_OK(
+ VisitSparseCOOIndex(checked_cast<const SparseCOOIndex&>(sparse_index)));
+ break;
+
+ case SparseTensorFormat::CSR:
+ RETURN_NOT_OK(
+ VisitSparseCSRIndex(checked_cast<const SparseCSRIndex&>(sparse_index)));
+ break;
+
+ case SparseTensorFormat::CSC:
+ RETURN_NOT_OK(
+ VisitSparseCSCIndex(checked_cast<const SparseCSCIndex&>(sparse_index)));
+ break;
+
+ case SparseTensorFormat::CSF:
+ RETURN_NOT_OK(
+ VisitSparseCSFIndex(checked_cast<const SparseCSFIndex&>(sparse_index)));
+ break;
+
+ default:
+ std::stringstream ss;
+ ss << "Unable to convert type: " << sparse_index.ToString() << std::endl;
+ return Status::NotImplemented(ss.str());
+ }
+
+ return Status::OK();
+ }
+
+ Status SerializeMetadata(const SparseTensor& sparse_tensor) {
+ return WriteSparseTensorMessage(sparse_tensor, out_->body_length, buffer_meta_,
+ options_)
+ .Value(&out_->metadata);
+ }
+
+ Status Assemble(const SparseTensor& sparse_tensor) {
+ if (buffer_meta_.size() > 0) {
+ buffer_meta_.clear();
+ out_->body_buffers.clear();
+ }
+
+ RETURN_NOT_OK(VisitSparseIndex(*sparse_tensor.sparse_index()));
+ out_->body_buffers.emplace_back(sparse_tensor.data());
+
+ int64_t offset = buffer_start_offset_;
+ buffer_meta_.reserve(out_->body_buffers.size());
+
+ for (size_t i = 0; i < out_->body_buffers.size(); ++i) {
+ const Buffer* buffer = out_->body_buffers[i].get();
+ int64_t size = buffer->size();
+ int64_t padding = BitUtil::RoundUpToMultipleOf8(size) - size;
+ buffer_meta_.push_back({offset, size + padding});
+ offset += size + padding;
+ }
+
+ out_->body_length = offset - buffer_start_offset_;
+ DCHECK(BitUtil::IsMultipleOf8(out_->body_length));
+
+ return SerializeMetadata(sparse_tensor);
+ }
+
+ private:
+ Status VisitSparseCOOIndex(const SparseCOOIndex& sparse_index) {
+ out_->body_buffers.emplace_back(sparse_index.indices()->data());
+ return Status::OK();
+ }
+
+ Status VisitSparseCSRIndex(const SparseCSRIndex& sparse_index) {
+ out_->body_buffers.emplace_back(sparse_index.indptr()->data());
+ out_->body_buffers.emplace_back(sparse_index.indices()->data());
+ return Status::OK();
+ }
+
+ Status VisitSparseCSCIndex(const SparseCSCIndex& sparse_index) {
+ out_->body_buffers.emplace_back(sparse_index.indptr()->data());
+ out_->body_buffers.emplace_back(sparse_index.indices()->data());
+ return Status::OK();
+ }
+
+ Status VisitSparseCSFIndex(const SparseCSFIndex& sparse_index) {
+ for (const std::shared_ptr<arrow::Tensor>& indptr : sparse_index.indptr()) {
+ out_->body_buffers.emplace_back(indptr->data());
+ }
+ for (const std::shared_ptr<arrow::Tensor>& indices : sparse_index.indices()) {
+ out_->body_buffers.emplace_back(indices->data());
+ }
+ return Status::OK();
+ }
+
+ IpcPayload* out_;
+
+ std::vector<internal::BufferMetadata> buffer_meta_;
+ int64_t buffer_start_offset_;
+ IpcWriteOptions options_;
+};
+
+} // namespace internal
+
+Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst,
+ int32_t* metadata_length, int64_t* body_length) {
+ IpcPayload payload;
+ internal::SparseTensorSerializer writer(0, &payload);
+ RETURN_NOT_OK(writer.Assemble(sparse_tensor));
+
+ *body_length = payload.body_length;
+ return WriteIpcPayload(payload, IpcWriteOptions::Defaults(), dst, metadata_length);
+}
+
+Status GetSparseTensorPayload(const SparseTensor& sparse_tensor, MemoryPool* pool,
+ IpcPayload* out) {
+ internal::SparseTensorSerializer writer(0, out);
+ return writer.Assemble(sparse_tensor);
+}
+
+Result<std::unique_ptr<Message>> GetSparseTensorMessage(const SparseTensor& sparse_tensor,
+ MemoryPool* pool) {
+ IpcPayload payload;
+ RETURN_NOT_OK(GetSparseTensorPayload(sparse_tensor, pool, &payload));
+ return std::unique_ptr<Message>(
+ new Message(std::move(payload.metadata), std::move(payload.body_buffers[0])));
+}
+
+int64_t GetPayloadSize(const IpcPayload& payload, const IpcWriteOptions& options) {
+ const int32_t prefix_size = options.write_legacy_ipc_format ? 4 : 8;
+ const int32_t flatbuffer_size = static_cast<int32_t>(payload.metadata->size());
+ const int32_t padded_message_length = static_cast<int32_t>(
+ PaddedLength(flatbuffer_size + prefix_size, options.alignment));
+ // body_length already accounts for padding
+ return payload.body_length + padded_message_length;
+}
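+
+// Worked example (illustrative): with the current (non-legacy) format the
+// prefix is 8 bytes (continuation marker plus length). For a 116-byte
+// metadata flatbuffer and the default 8-byte alignment,
+// PaddedLength(116 + 8, 8) == 128, so the total is payload.body_length + 128.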
+
+Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size) {
+ return GetRecordBatchSize(batch, IpcWriteOptions::Defaults(), size);
+}
+
+Status GetRecordBatchSize(const RecordBatch& batch, const IpcWriteOptions& options,
+ int64_t* size) {
+ // emulates the behavior of Write without actually writing
+ int32_t metadata_length = 0;
+ int64_t body_length = 0;
+ io::MockOutputStream dst;
+ RETURN_NOT_OK(
+ WriteRecordBatch(batch, 0, &dst, &metadata_length, &body_length, options));
+ *size = dst.GetExtentBytesWritten();
+ return Status::OK();
+}
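+
+// Usage sketch (illustrative): pre-computing the exact serialized size so a
+// batch can be written into pre-allocated memory.
+//
+//   int64_t size = 0;
+//   RETURN_NOT_OK(GetRecordBatchSize(batch, &size));
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> buffer, AllocateBuffer(size));
+//   io::FixedSizeBufferWriter stream(buffer);
+//   RETURN_NOT_OK(SerializeRecordBatch(batch, IpcWriteOptions::Defaults(),
+//                                      &stream));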
+
+Status GetTensorSize(const Tensor& tensor, int64_t* size) {
+ // emulates the behavior of Write without actually writing
+ int32_t metadata_length = 0;
+ int64_t body_length = 0;
+ io::MockOutputStream dst;
+ RETURN_NOT_OK(WriteTensor(tensor, &dst, &metadata_length, &body_length));
+ *size = dst.GetExtentBytesWritten();
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+
+RecordBatchWriter::~RecordBatchWriter() {}
+
+Status RecordBatchWriter::WriteTable(const Table& table, int64_t max_chunksize) {
+ TableBatchReader reader(table);
+
+ if (max_chunksize > 0) {
+ reader.set_chunksize(max_chunksize);
+ }
+
+ std::shared_ptr<RecordBatch> batch;
+ while (true) {
+ RETURN_NOT_OK(reader.ReadNext(&batch));
+ if (batch == nullptr) {
+ break;
+ }
+ RETURN_NOT_OK(WriteRecordBatch(*batch));
+ }
+
+ return Status::OK();
+}
+
+Status RecordBatchWriter::WriteTable(const Table& table) { return WriteTable(table, -1); }
+
+// ----------------------------------------------------------------------
+// Payload writer implementation
+
+namespace internal {
+
+IpcPayloadWriter::~IpcPayloadWriter() {}
+
+Status IpcPayloadWriter::Start() { return Status::OK(); }
+
+// A RecordBatchWriter implementation that writes to an IpcPayloadWriter.
+class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
+ public:
+ IpcFormatWriter(std::unique_ptr<internal::IpcPayloadWriter> payload_writer,
+ const Schema& schema, const IpcWriteOptions& options,
+ bool is_file_format)
+ : payload_writer_(std::move(payload_writer)),
+ schema_(schema),
+ mapper_(schema),
+ is_file_format_(is_file_format),
+ options_(options) {}
+
+ // A Schema-owning constructor variant
+ IpcFormatWriter(std::unique_ptr<internal::IpcPayloadWriter> payload_writer,
+ const std::shared_ptr<Schema>& schema, const IpcWriteOptions& options,
+ bool is_file_format)
+ : IpcFormatWriter(std::move(payload_writer), *schema, options, is_file_format) {
+ shared_schema_ = schema;
+ }
+
+ Status WriteRecordBatch(const RecordBatch& batch) override {
+ if (!batch.schema()->Equals(schema_, false /* check_metadata */)) {
+ return Status::Invalid("Tried to write record batch with different schema");
+ }
+
+ RETURN_NOT_OK(CheckStarted());
+
+ RETURN_NOT_OK(WriteDictionaries(batch));
+
+ IpcPayload payload;
+ RETURN_NOT_OK(GetRecordBatchPayload(batch, options_, &payload));
+ RETURN_NOT_OK(WritePayload(payload));
+ ++stats_.num_record_batches;
+ return Status::OK();
+ }
+
+ Status WriteTable(const Table& table, int64_t max_chunksize) override {
+ if (is_file_format_ && options_.unify_dictionaries) {
+ ARROW_ASSIGN_OR_RAISE(auto unified_table,
+ DictionaryUnifier::UnifyTable(table, options_.memory_pool));
+ return RecordBatchWriter::WriteTable(*unified_table, max_chunksize);
+ } else {
+ return RecordBatchWriter::WriteTable(table, max_chunksize);
+ }
+ }
+
+ Status Close() override {
+ RETURN_NOT_OK(CheckStarted());
+ return payload_writer_->Close();
+ }
+
+ Status Start() {
+ started_ = true;
+ RETURN_NOT_OK(payload_writer_->Start());
+
+ IpcPayload payload;
+ RETURN_NOT_OK(GetSchemaPayload(schema_, options_, mapper_, &payload));
+ return WritePayload(payload);
+ }
+
+ WriteStats stats() const override { return stats_; }
+
+ protected:
+ Status CheckStarted() {
+ if (!started_) {
+ return Start();
+ }
+ return Status::OK();
+ }
+
+ Status WriteDictionaries(const RecordBatch& batch) {
+ ARROW_ASSIGN_OR_RAISE(const auto dictionaries, CollectDictionaries(batch, mapper_));
+ const auto equal_options = EqualOptions().nans_equal(true);
+
+ for (const auto& pair : dictionaries) {
+ int64_t dictionary_id = pair.first;
+ const auto& dictionary = pair.second;
+
+ // If a dictionary with this id was already emitted, check if it was the same.
+ auto* last_dictionary = &last_dictionaries_[dictionary_id];
+ const bool dictionary_exists = (*last_dictionary != nullptr);
+ int64_t delta_start = 0;
+ if (dictionary_exists) {
+ if ((*last_dictionary)->data() == dictionary->data()) {
+ // Fast shortcut for a common case.
+ // Same dictionary data by pointer => no need to emit it again
+ continue;
+ }
+ const int64_t last_length = (*last_dictionary)->length();
+ const int64_t new_length = dictionary->length();
+ if (new_length == last_length &&
+ ((*last_dictionary)->Equals(dictionary, equal_options))) {
+ // Same dictionary by value => no need to emit it again
+ // (while this can have a CPU cost, this code path is required
+ // for the IPC file format)
+ continue;
+ }
+ if (is_file_format_) {
+ return Status::Invalid(
+ "Dictionary replacement detected when writing IPC file format. "
+ "Arrow IPC files only support a single dictionary for a given field "
+ "across all batches.");
+ }
+
+      // (deltas are not emitted for dictionaries containing nested
+      // dictionaries, as the read path doesn't support them)
+ if (new_length > last_length && options_.emit_dictionary_deltas &&
+ !HasNestedDict(*dictionary->data()) &&
+ ((*last_dictionary)
+ ->RangeEquals(dictionary, 0, last_length, 0, equal_options))) {
+ // New dictionary starts with the current dictionary
+ delta_start = last_length;
+ }
+ }
+
+ IpcPayload payload;
+ if (delta_start) {
+ RETURN_NOT_OK(GetDictionaryPayload(dictionary_id, /*is_delta=*/true,
+ dictionary->Slice(delta_start), options_,
+ &payload));
+ } else {
+ RETURN_NOT_OK(
+ GetDictionaryPayload(dictionary_id, dictionary, options_, &payload));
+ }
+ RETURN_NOT_OK(WritePayload(payload));
+ ++stats_.num_dictionary_batches;
+ if (dictionary_exists) {
+ if (delta_start) {
+ ++stats_.num_dictionary_deltas;
+ } else {
+ ++stats_.num_replaced_dictionaries;
+ }
+ }
+
+ // Remember dictionary for next batches
+ *last_dictionary = dictionary;
+ }
+ return Status::OK();
+ }
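+
+  // Example of the delta path above (illustrative): if one batch used the
+  // dictionary ["a", "b"] and the next uses ["a", "b", "c"], the RangeEquals
+  // prefix check succeeds and, with emit_dictionary_deltas enabled, only the
+  // slice ["c"] is emitted as a delta batch instead of a full replacement.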
+
+ Status WritePayload(const IpcPayload& payload) {
+ RETURN_NOT_OK(payload_writer_->WritePayload(payload));
+ ++stats_.num_messages;
+ return Status::OK();
+ }
+
+ std::unique_ptr<IpcPayloadWriter> payload_writer_;
+ std::shared_ptr<Schema> shared_schema_;
+ const Schema& schema_;
+ const DictionaryFieldMapper mapper_;
+ const bool is_file_format_;
+
+ // A map of last-written dictionaries by id.
+  // This is required to avoid writing the same dictionary again and again,
+ // and also for correctness when writing the IPC file format
+ // (where replacements and deltas are unsupported).
+ // The latter is also why we can't use weak_ptr.
+ std::unordered_map<int64_t, std::shared_ptr<Array>> last_dictionaries_;
+
+ bool started_ = false;
+ IpcWriteOptions options_;
+ WriteStats stats_;
+};
+
+class StreamBookKeeper {
+ public:
+ StreamBookKeeper(const IpcWriteOptions& options, io::OutputStream* sink)
+ : options_(options), sink_(sink), position_(-1) {}
+ StreamBookKeeper(const IpcWriteOptions& options, std::shared_ptr<io::OutputStream> sink)
+ : options_(options),
+ sink_(sink.get()),
+ owned_sink_(std::move(sink)),
+ position_(-1) {}
+
+ Status UpdatePosition() { return sink_->Tell().Value(&position_); }
+
+ Status UpdatePositionCheckAligned() {
+ RETURN_NOT_OK(UpdatePosition());
+ DCHECK_EQ(0, position_ % 8) << "Stream is not aligned";
+ return Status::OK();
+ }
+
+ Status Align(int32_t alignment = kArrowIpcAlignment) {
+ // Adds padding bytes if necessary to ensure all memory blocks are written on
+ // 8-byte (or other alignment) boundaries.
+ int64_t remainder = PaddedLength(position_, alignment) - position_;
+ if (remainder > 0) {
+ return Write(kPaddingBytes, remainder);
+ }
+ return Status::OK();
+ }
+
+ // Write data and update position
+ Status Write(const void* data, int64_t nbytes) {
+ RETURN_NOT_OK(sink_->Write(data, nbytes));
+ position_ += nbytes;
+ return Status::OK();
+ }
+
+ Status WriteEOS() {
+ // End of stream marker
+ constexpr int32_t kZeroLength = 0;
+ if (!options_.write_legacy_ipc_format) {
+ RETURN_NOT_OK(Write(&kIpcContinuationToken, sizeof(int32_t)));
+ }
+ return Write(&kZeroLength, sizeof(int32_t));
+ }
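+
+  // For reference (illustrative): with the current format this emits the
+  // 4-byte continuation marker 0xFFFFFFFF followed by a 4-byte zero length;
+  // legacy mode emits only the four zero bytes.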
+
+ protected:
+ IpcWriteOptions options_;
+ io::OutputStream* sink_;
+ std::shared_ptr<io::OutputStream> owned_sink_;
+ int64_t position_;
+};
+
+/// An IpcPayloadWriter implementation that writes to an IPC stream
+/// (with an end-of-stream marker)
+class PayloadStreamWriter : public IpcPayloadWriter, protected StreamBookKeeper {
+ public:
+ PayloadStreamWriter(io::OutputStream* sink,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults())
+ : StreamBookKeeper(options, sink) {}
+ PayloadStreamWriter(std::shared_ptr<io::OutputStream> sink,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults())
+ : StreamBookKeeper(options, std::move(sink)) {}
+
+ ~PayloadStreamWriter() override = default;
+
+ Status WritePayload(const IpcPayload& payload) override {
+#ifndef NDEBUG
+ // Catch bug fixed in ARROW-3236
+ RETURN_NOT_OK(UpdatePositionCheckAligned());
+#endif
+
+ int32_t metadata_length = 0; // unused
+ RETURN_NOT_OK(WriteIpcPayload(payload, options_, sink_, &metadata_length));
+ RETURN_NOT_OK(UpdatePositionCheckAligned());
+ return Status::OK();
+ }
+
+ Status Close() override { return WriteEOS(); }
+};
+
+/// An IpcPayloadWriter implementation that writes to an IPC file
+/// (with a footer as defined in File.fbs)
+class PayloadFileWriter : public internal::IpcPayloadWriter, protected StreamBookKeeper {
+ public:
+ PayloadFileWriter(const IpcWriteOptions& options, const std::shared_ptr<Schema>& schema,
+ const std::shared_ptr<const KeyValueMetadata>& metadata,
+ io::OutputStream* sink)
+ : StreamBookKeeper(options, sink), schema_(schema), metadata_(metadata) {}
+ PayloadFileWriter(const IpcWriteOptions& options, const std::shared_ptr<Schema>& schema,
+ const std::shared_ptr<const KeyValueMetadata>& metadata,
+ std::shared_ptr<io::OutputStream> sink)
+ : StreamBookKeeper(options, std::move(sink)),
+ schema_(schema),
+ metadata_(metadata) {}
+
+ ~PayloadFileWriter() override = default;
+
+ Status WritePayload(const IpcPayload& payload) override {
+#ifndef NDEBUG
+ // Catch bug fixed in ARROW-3236
+ RETURN_NOT_OK(UpdatePositionCheckAligned());
+#endif
+
+    // The metadata length must include padding; it is computed by WriteIpcPayload()
+ FileBlock block = {position_, 0, payload.body_length};
+ RETURN_NOT_OK(WriteIpcPayload(payload, options_, sink_, &block.metadata_length));
+ RETURN_NOT_OK(UpdatePositionCheckAligned());
+
+ // Record position and size of some message types, to list them in the footer
+ switch (payload.type) {
+ case MessageType::DICTIONARY_BATCH:
+ dictionaries_.push_back(block);
+ break;
+ case MessageType::RECORD_BATCH:
+ record_batches_.push_back(block);
+ break;
+ default:
+ break;
+ }
+
+ return Status::OK();
+ }
+
+ Status Start() override {
+ // ARROW-3236: The initial position -1 needs to be updated to the stream's
+ // current position otherwise an incorrect amount of padding will be
+ // written to new files.
+ RETURN_NOT_OK(UpdatePosition());
+
+    // It is only necessary to align to an 8-byte boundary at the start of the file
+ RETURN_NOT_OK(Write(kArrowMagicBytes, strlen(kArrowMagicBytes)));
+ RETURN_NOT_OK(Align());
+
+ return Status::OK();
+ }
+
+ Status Close() override {
+    // Write a 0-length EOS marker for compatibility with sequential readers
+ RETURN_NOT_OK(WriteEOS());
+
+ // Write file footer
+ RETURN_NOT_OK(UpdatePosition());
+ int64_t initial_position = position_;
+ RETURN_NOT_OK(
+ WriteFileFooter(*schema_, dictionaries_, record_batches_, metadata_, sink_));
+
+ // Write footer length
+ RETURN_NOT_OK(UpdatePosition());
+ int32_t footer_length = static_cast<int32_t>(position_ - initial_position);
+ if (footer_length <= 0) {
+ return Status::Invalid("Invalid file footer");
+ }
+
+    // Write the footer length in little endian
+ footer_length = BitUtil::ToLittleEndian(footer_length);
+ RETURN_NOT_OK(Write(&footer_length, sizeof(int32_t)));
+
+ // Write magic bytes to end file
+ return Write(kArrowMagicBytes, strlen(kArrowMagicBytes));
+ }
+
+ protected:
+ std::shared_ptr<Schema> schema_;
+ std::shared_ptr<const KeyValueMetadata> metadata_;
+ std::vector<FileBlock> dictionaries_;
+ std::vector<FileBlock> record_batches_;
+};
+
+} // namespace internal
+
+Result<std::shared_ptr<RecordBatchWriter>> MakeStreamWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options) {
+ return std::make_shared<internal::IpcFormatWriter>(
+ ::arrow::internal::make_unique<internal::PayloadStreamWriter>(sink, options),
+ schema, options, /*is_file_format=*/false);
+}
+
+Result<std::shared_ptr<RecordBatchWriter>> MakeStreamWriter(
+ std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options) {
+ return std::make_shared<internal::IpcFormatWriter>(
+ ::arrow::internal::make_unique<internal::PayloadStreamWriter>(std::move(sink),
+ options),
+ schema, options, /*is_file_format=*/false);
+}
+
+Result<std::shared_ptr<RecordBatchWriter>> NewStreamWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options) {
+ return MakeStreamWriter(sink, schema, options);
+}
+
+Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options,
+ const std::shared_ptr<const KeyValueMetadata>& metadata) {
+ return std::make_shared<internal::IpcFormatWriter>(
+ ::arrow::internal::make_unique<internal::PayloadFileWriter>(options, schema,
+ metadata, sink),
+ schema, options, /*is_file_format=*/true);
+}
+
+Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
+ std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options,
+ const std::shared_ptr<const KeyValueMetadata>& metadata) {
+ return std::make_shared<internal::IpcFormatWriter>(
+ ::arrow::internal::make_unique<internal::PayloadFileWriter>(
+ options, schema, metadata, std::move(sink)),
+ schema, options, /*is_file_format=*/true);
+}
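+
+// Usage sketch (illustrative): writing a table to the IPC file format.
+// Assumes `table` and an open `sink` OutputStream.
+//
+//   ARROW_ASSIGN_OR_RAISE(auto writer, MakeFileWriter(sink, table->schema()));
+//   RETURN_NOT_OK(writer->WriteTable(*table));
+//   RETURN_NOT_OK(writer->Close());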
+
+Result<std::shared_ptr<RecordBatchWriter>> NewFileWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options,
+ const std::shared_ptr<const KeyValueMetadata>& metadata) {
+ return MakeFileWriter(sink, schema, options, metadata);
+}
+
+namespace internal {
+
+Result<std::unique_ptr<RecordBatchWriter>> OpenRecordBatchWriter(
+ std::unique_ptr<IpcPayloadWriter> sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options) {
+ // XXX should we call Start()?
+ return ::arrow::internal::make_unique<internal::IpcFormatWriter>(
+ std::move(sink), schema, options, /*is_file_format=*/false);
+}
+
+Result<std::unique_ptr<IpcPayloadWriter>> MakePayloadStreamWriter(
+ io::OutputStream* sink, const IpcWriteOptions& options) {
+ return ::arrow::internal::make_unique<internal::PayloadStreamWriter>(sink, options);
+}
+
+Result<std::unique_ptr<IpcPayloadWriter>> MakePayloadFileWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options,
+ const std::shared_ptr<const KeyValueMetadata>& metadata) {
+ return ::arrow::internal::make_unique<internal::PayloadFileWriter>(options, schema,
+ metadata, sink);
+}
+
+} // namespace internal
+
+// ----------------------------------------------------------------------
+// Serialization public APIs
+
+Result<std::shared_ptr<Buffer>> SerializeRecordBatch(const RecordBatch& batch,
+ std::shared_ptr<MemoryManager> mm) {
+ auto options = IpcWriteOptions::Defaults();
+ int64_t size = 0;
+ RETURN_NOT_OK(GetRecordBatchSize(batch, options, &size));
+ ARROW_ASSIGN_OR_RAISE(auto buffer, mm->AllocateBuffer(size));
+ ARROW_ASSIGN_OR_RAISE(auto writer, Buffer::GetWriter(buffer));
+
+ // XXX Should we have a helper function for getting a MemoryPool
+ // for any MemoryManager (not only CPU)?
+ if (mm->is_cpu()) {
+ options.memory_pool = checked_pointer_cast<CPUMemoryManager>(mm)->pool();
+ }
+ RETURN_NOT_OK(SerializeRecordBatch(batch, options, writer.get()));
+ RETURN_NOT_OK(writer->Close());
+ return buffer;
+}
+
+Result<std::shared_ptr<Buffer>> SerializeRecordBatch(const RecordBatch& batch,
+ const IpcWriteOptions& options) {
+ int64_t size = 0;
+ RETURN_NOT_OK(GetRecordBatchSize(batch, options, &size));
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> buffer,
+ AllocateBuffer(size, options.memory_pool));
+
+ io::FixedSizeBufferWriter stream(buffer);
+ RETURN_NOT_OK(SerializeRecordBatch(batch, options, &stream));
+ return buffer;
+}
+
+Status SerializeRecordBatch(const RecordBatch& batch, const IpcWriteOptions& options,
+ io::OutputStream* out) {
+ int32_t metadata_length = 0;
+ int64_t body_length = 0;
+ return WriteRecordBatch(batch, 0, out, &metadata_length, &body_length, options);
+}
+
+Result<std::shared_ptr<Buffer>> SerializeSchema(const Schema& schema, MemoryPool* pool) {
+ ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create(1024, pool));
+
+ auto options = IpcWriteOptions::Defaults();
+  const bool is_file_format = false;  // irrelevant here, as no dictionaries are written
+ internal::IpcFormatWriter writer(
+ ::arrow::internal::make_unique<internal::PayloadStreamWriter>(stream.get()), schema,
+ options, is_file_format);
+ RETURN_NOT_OK(writer.Start());
+ return stream->Finish();
+}
+
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h
new file mode 100644
index 00000000000..0ea83d7630a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h
@@ -0,0 +1,459 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implement Arrow streaming binary format
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/ipc/dictionary.h" // IWYU pragma: export
+#include "arrow/ipc/message.h"
+#include "arrow/ipc/options.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class Buffer;
+class MemoryManager;
+class MemoryPool;
+class RecordBatch;
+class Schema;
+class Status;
+class Table;
+class Tensor;
+class SparseTensor;
+
+namespace io {
+
+class OutputStream;
+
+} // namespace io
+
+namespace ipc {
+
+/// \brief Intermediate data structure with a metadata header and zero
+/// or more buffers for the message body.
+struct IpcPayload {
+ MessageType type = MessageType::NONE;
+ std::shared_ptr<Buffer> metadata;
+ std::vector<std::shared_ptr<Buffer>> body_buffers;
+ int64_t body_length = 0;
+};
+
+struct WriteStats {
+ /// Number of IPC messages written.
+ int64_t num_messages = 0;
+ /// Number of record batches written.
+ int64_t num_record_batches = 0;
+ /// Number of dictionary batches written.
+ ///
+ /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
+ int64_t num_dictionary_batches = 0;
+
+ /// Number of dictionary deltas written.
+ int64_t num_dictionary_deltas = 0;
+ /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
+ /// an existing dictionary with an unrelated new dictionary).
+ int64_t num_replaced_dictionaries = 0;
+};
+
+/// \class RecordBatchWriter
+/// \brief Abstract interface for writing a stream of record batches
+class ARROW_EXPORT RecordBatchWriter {
+ public:
+ virtual ~RecordBatchWriter();
+
+ /// \brief Write a record batch to the stream
+ ///
+ /// \param[in] batch the record batch to write to the stream
+ /// \return Status
+ virtual Status WriteRecordBatch(const RecordBatch& batch) = 0;
+
+ /// \brief Write possibly-chunked table by creating sequence of record batches
+ /// \param[in] table table to write
+ /// \return Status
+ Status WriteTable(const Table& table);
+
+ /// \brief Write Table with a particular chunksize
+ /// \param[in] table table to write
+ /// \param[in] max_chunksize maximum length of table chunks. To indicate
+ /// that no maximum should be enforced, pass -1.
+ /// \return Status
+ virtual Status WriteTable(const Table& table, int64_t max_chunksize);
+
+ /// \brief Perform any logic necessary to finish the stream
+ ///
+ /// \return Status
+ virtual Status Close() = 0;
+
+ /// \brief Return current write statistics
+ virtual WriteStats stats() const = 0;
+};
+
+/// \defgroup record-batch-writer-factories Functions for creating RecordBatchWriter
+/// instances
+///
+/// @{
+
+/// Create a new IPC stream writer from stream sink and schema. User is
+/// responsible for closing the actual OutputStream.
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> MakeStreamWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
+/// Create a new IPC stream writer from stream sink and schema. User is
+/// responsible for closing the actual OutputStream.
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> MakeStreamWriter(
+ std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults());
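+
+// Example (illustrative sketch; assumes `sink`, `schema`, and `batch` exist):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto writer, MakeStreamWriter(sink, schema));
+//   RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
+//   RETURN_NOT_OK(writer->Close());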
+
+/// Create a new IPC file writer from stream sink and schema
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization, optional
+/// \param[in] metadata custom metadata for File Footer, optional
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
+ const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
+
+/// Create a new IPC file writer from stream sink and schema
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization, optional
+/// \param[in] metadata custom metadata for File Footer, optional
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
+ std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
+ const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
+
+/// @}
+
+ARROW_DEPRECATED("Use MakeStreamWriter")
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> NewStreamWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
+ARROW_DEPRECATED("Use MakeFileWriter")
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> NewFileWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
+ const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
+
+/// \brief Low-level API for writing a record batch (without schema)
+/// to an OutputStream as encapsulated IPC message. See Arrow format
+/// documentation for more detail.
+///
+/// \param[in] batch the record batch to write
+/// \param[in] buffer_start_offset the start offset to use in the buffer metadata,
+/// generally should be 0
+/// \param[in] dst an OutputStream
+/// \param[out] metadata_length the size of the length-prefixed flatbuffer
+/// including padding to a 64-byte boundary
+/// \param[out] body_length the size of the contiguous buffer block plus padding
+/// \param[in] options options for serialization
+/// \return Status
+ARROW_EXPORT
+Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset,
+ io::OutputStream* dst, int32_t* metadata_length,
+ int64_t* body_length, const IpcWriteOptions& options);
+
+/// \brief Serialize record batch as encapsulated IPC message in a new buffer
+///
+/// \param[in] batch the record batch
+/// \param[in] options the IpcWriteOptions to use for serialization
+/// \return the serialized message
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SerializeRecordBatch(const RecordBatch& batch,
+ const IpcWriteOptions& options);
+
+/// \brief Serialize record batch as encapsulated IPC message in a new buffer
+///
+/// \param[in] batch the record batch
+/// \param[in] mm a MemoryManager to allocate memory from
+/// \return the serialized message
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SerializeRecordBatch(const RecordBatch& batch,
+ std::shared_ptr<MemoryManager> mm);
+
+/// \brief Write record batch to OutputStream
+///
+/// \param[in] batch the record batch to write
+/// \param[in] options the IpcWriteOptions to use for serialization
+/// \param[in] out the OutputStream to write the output to
+/// \return Status
+///
+/// If writing to pre-allocated memory, you can use
+/// arrow::ipc::GetRecordBatchSize to compute how much space is required
+ARROW_EXPORT
+Status SerializeRecordBatch(const RecordBatch& batch, const IpcWriteOptions& options,
+ io::OutputStream* out);
+
+/// \brief Serialize schema as encapsulated IPC message
+///
+/// \param[in] schema the schema to write
+/// \param[in] pool a MemoryPool to allocate memory from
+/// \return the serialized schema
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SerializeSchema(const Schema& schema,
+ MemoryPool* pool = default_memory_pool());
+
+/// \brief Write multiple record batches to OutputStream, including schema
+/// \param[in] batches a vector of batches. Must all have same schema
+/// \param[in] options options for serialization
+/// \param[in] dst an OutputStream
+/// \return Status
+ARROW_EXPORT
+Status WriteRecordBatchStream(const std::vector<std::shared_ptr<RecordBatch>>& batches,
+ const IpcWriteOptions& options, io::OutputStream* dst);
+
+/// \brief Compute the number of bytes needed to write an IPC payload
+/// including metadata
+///
+/// \param[in] payload the IPC payload to write
+/// \param[in] options write options
+/// \return the size of the complete encapsulated message
+ARROW_EXPORT
+int64_t GetPayloadSize(const IpcPayload& payload,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
+/// \brief Compute the number of bytes needed to write a record batch including metadata
+///
+/// \param[in] batch the record batch to write
+/// \param[out] size the size of the complete encapsulated message
+/// \return Status
+ARROW_EXPORT
+Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size);
+
+/// \brief Compute the number of bytes needed to write a record batch including metadata
+///
+/// \param[in] batch the record batch to write
+/// \param[in] options options for serialization
+/// \param[out] size the size of the complete encapsulated message
+/// \return Status
+ARROW_EXPORT
+Status GetRecordBatchSize(const RecordBatch& batch, const IpcWriteOptions& options,
+ int64_t* size);
+
+/// \brief Compute the number of bytes needed to write a tensor including metadata
+///
+/// \param[in] tensor the tensor to write
+/// \param[out] size the size of the complete encapsulated message
+/// \return Status
+ARROW_EXPORT
+Status GetTensorSize(const Tensor& tensor, int64_t* size);
+
+/// \brief EXPERIMENTAL: Convert arrow::Tensor to a Message with minimal memory
+/// allocation
+///
+/// \param[in] tensor the Tensor to write
+/// \param[in] pool MemoryPool to allocate space for metadata
+/// \return the resulting Message
+ARROW_EXPORT
+Result<std::unique_ptr<Message>> GetTensorMessage(const Tensor& tensor, MemoryPool* pool);
+
+/// \brief Write arrow::Tensor as a contiguous message.
+///
+/// The metadata and body are written assuming 64-byte alignment. It is the
+/// user's responsibility to ensure that the OutputStream has been aligned
+/// to a 64-byte multiple before writing the message.
+///
+/// The message is written out as follows:
+/// \code
+/// <metadata size> <metadata> <tensor data>
+/// \endcode
+///
+/// \param[in] tensor the Tensor to write
+/// \param[in] dst the OutputStream to write to
+/// \param[out] metadata_length the actual metadata length, including padding
+/// \param[out] body_length the actual message body length
+/// \return Status
+ARROW_EXPORT
+Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length,
+ int64_t* body_length);
+
+/// \brief EXPERIMENTAL: Convert arrow::SparseTensor to a Message with minimal memory
+/// allocation
+///
+/// The message is written out as follows:
+/// \code
+/// <metadata size> <metadata> <sparse index> <sparse tensor body>
+/// \endcode
+///
+/// \param[in] sparse_tensor the SparseTensor to write
+/// \param[in] pool MemoryPool to allocate space for metadata
+/// \return the resulting Message
+ARROW_EXPORT
+Result<std::unique_ptr<Message>> GetSparseTensorMessage(const SparseTensor& sparse_tensor,
+ MemoryPool* pool);
+
+/// \brief EXPERIMENTAL: Write arrow::SparseTensor as a contiguous message. The metadata,
+/// sparse index, and body are written assuming 64-byte alignment. It is the
+/// user's responsibility to ensure that the OutputStream has been aligned
+/// to a 64-byte multiple before writing the message.
+///
+/// \param[in] sparse_tensor the SparseTensor to write
+/// \param[in] dst the OutputStream to write to
+/// \param[out] metadata_length the actual metadata length, including padding
+/// \param[out] body_length the actual message body length
+/// \return Status
+ARROW_EXPORT
+Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst,
+ int32_t* metadata_length, int64_t* body_length);
+
+/// \brief Compute IpcPayload for the given schema
+/// \param[in] schema the Schema that is being serialized
+/// \param[in] options options for serialization
+/// \param[in] mapper object mapping dictionary fields to dictionary ids
+/// \param[out] out the returned IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetSchemaPayload(const Schema& schema, const IpcWriteOptions& options,
+ const DictionaryFieldMapper& mapper, IpcPayload* out);
+
+/// \brief Compute IpcPayload for a dictionary
+/// \param[in] id the dictionary id
+/// \param[in] dictionary the dictionary values
+/// \param[in] options options for serialization
+/// \param[out] payload the output IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetDictionaryPayload(int64_t id, const std::shared_ptr<Array>& dictionary,
+ const IpcWriteOptions& options, IpcPayload* payload);
+
+/// \brief Compute IpcPayload for a dictionary
+/// \param[in] id the dictionary id
+/// \param[in] is_delta whether the dictionary is a delta dictionary
+/// \param[in] dictionary the dictionary values
+/// \param[in] options options for serialization
+/// \param[out] payload the output IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetDictionaryPayload(int64_t id, bool is_delta,
+ const std::shared_ptr<Array>& dictionary,
+ const IpcWriteOptions& options, IpcPayload* payload);
+
+/// \brief Compute IpcPayload for the given record batch
+/// \param[in] batch the RecordBatch that is being serialized
+/// \param[in] options options for serialization
+/// \param[out] out the returned IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetRecordBatchPayload(const RecordBatch& batch, const IpcWriteOptions& options,
+ IpcPayload* out);
+
+/// \brief Write an IPC payload to the given stream.
+/// \param[in] payload the payload to write
+/// \param[in] options options for serialization
+/// \param[in] dst the stream to write the payload to
+/// \param[out] metadata_length the length of the serialized metadata
+/// \return Status
+ARROW_EXPORT
+Status WriteIpcPayload(const IpcPayload& payload, const IpcWriteOptions& options,
+ io::OutputStream* dst, int32_t* metadata_length);
+
+/// \brief Compute IpcPayload for the given sparse tensor
+/// \param[in] sparse_tensor the SparseTensor that is being serialized
+/// \param[in,out] pool for any required temporary memory allocations
+/// \param[out] out the returned IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetSparseTensorPayload(const SparseTensor& sparse_tensor, MemoryPool* pool,
+ IpcPayload* out);
+
+namespace internal {
+
+// These internal APIs may change without warning or deprecation
+
+class ARROW_EXPORT IpcPayloadWriter {
+ public:
+ virtual ~IpcPayloadWriter();
+
+ // Default implementation is a no-op
+ virtual Status Start();
+
+ virtual Status WritePayload(const IpcPayload& payload) = 0;
+
+ virtual Status Close() = 0;
+};
+
+/// Create a new IPC payload stream writer from stream sink. User is
+/// responsible for closing the actual OutputStream.
+///
+/// \param[in] sink output stream to write to
+/// \param[in] options options for serialization
+/// \return Result<std::unique_ptr<IpcPayloadWriter>>
+ARROW_EXPORT
+Result<std::unique_ptr<IpcPayloadWriter>> MakePayloadStreamWriter(
+ io::OutputStream* sink, const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
+/// Create a new IPC payload file writer from stream sink.
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization, optional
+/// \param[in] metadata custom metadata for File Footer, optional
+/// \return Result<std::unique_ptr<IpcPayloadWriter>>
+ARROW_EXPORT
+Result<std::unique_ptr<IpcPayloadWriter>> MakePayloadFileWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
+ const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
+
+/// Create a new RecordBatchWriter from IpcPayloadWriter and schema.
+///
+/// The format is implicitly the IPC stream format (allowing dictionary
+/// replacement and deltas).
+///
+/// \param[in] sink the IpcPayloadWriter to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization
+/// \return Result<std::unique_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::unique_ptr<RecordBatchWriter>> OpenRecordBatchWriter(
+ std::unique_ptr<IpcPayloadWriter> sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
+} // namespace internal
+} // namespace ipc
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc
new file mode 100644
index 00000000000..2d6f3176224
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc
@@ -0,0 +1,797 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/memory_pool.h"
+
+#include <algorithm> // IWYU pragma: keep
+#include <atomic>
+#include <cstdlib> // IWYU pragma: keep
+#include <cstring> // IWYU pragma: keep
+#include <iostream> // IWYU pragma: keep
+#include <limits>
+#include <memory>
+
+#if defined(sun) || defined(__sun)
+#include <stdlib.h>
+#endif
+
+#include "arrow/buffer.h"
+#include "arrow/io/util_internal.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h" // IWYU pragma: keep
+#include "arrow/util/optional.h"
+#include "arrow/util/string.h"
+#include "arrow/util/thread_pool.h"
+
+#ifdef __GLIBC__
+#include <malloc.h>
+#endif
+
+#ifdef ARROW_JEMALLOC
+// Needed to support jemalloc 3 and 4
+#define JEMALLOC_MANGLE
+// Explicitly link to our version of jemalloc
+#error #include "jemalloc_ep/dist/include/jemalloc/jemalloc.h"
+#endif
+
+#ifdef ARROW_MIMALLOC
+#error #include <mimalloc.h>
+#endif
+
+#ifdef ARROW_JEMALLOC
+
+// Compile-time configuration for jemalloc options.
+// Note the prefix ("je_arrow_") must match the symbol prefix given when
+// building jemalloc.
+// See discussion in https://github.com/jemalloc/jemalloc/issues/1621
+
+// ARROW-6910(wesm): we found that jemalloc's default behavior with respect to
+// dirty / muzzy pages (see definitions of these in the jemalloc documentation)
+// conflicted with user expectations, and would even cause memory use problems
+// in some cases. By enabling the background_thread option and reducing the
+// decay time from 10 seconds to 1 second, memory is released more
+// aggressively (and in the background) to the OS. This can be configured
+// further by using the arrow::jemalloc_set_decay_ms API
+
+#undef USE_JEMALLOC_BACKGROUND_THREAD
+#ifndef __APPLE__
+// ARROW-6977: jemalloc's background_thread isn't always enabled on macOS
+#define USE_JEMALLOC_BACKGROUND_THREAD
+#endif
+
+// In debug mode, add memory poisoning on alloc / free
+#ifdef NDEBUG
+#define JEMALLOC_DEBUG_OPTIONS ""
+#else
+#define JEMALLOC_DEBUG_OPTIONS ",junk:true"
+#endif
+
+const char* je_arrow_malloc_conf =
+ ("oversize_threshold:0"
+#ifdef USE_JEMALLOC_BACKGROUND_THREAD
+ ",dirty_decay_ms:1000"
+ ",muzzy_decay_ms:1000"
+ ",background_thread:true"
+#else
+ // ARROW-6994: return memory immediately to the OS if the
+ // background_thread option isn't available
+ ",dirty_decay_ms:0"
+ ",muzzy_decay_ms:0"
+#endif
+ JEMALLOC_DEBUG_OPTIONS); // NOLINT: whitespace/parens
+
+#endif // ARROW_JEMALLOC
+
+namespace arrow {
+
+namespace {
+
+constexpr size_t kAlignment = 64;
+
+constexpr char kDefaultBackendEnvVar[] = "ARROW_DEFAULT_MEMORY_POOL";
+
+enum class MemoryPoolBackend : uint8_t { System, Jemalloc, Mimalloc };
+
+struct SupportedBackend {
+ const char* name;
+ MemoryPoolBackend backend;
+};
+
+// See ARROW-12248 for why we use static in-function singletons rather than
+// global constants below (in SupportedBackends() and UserSelectedBackend()).
+// In some contexts (especially R bindings) `default_memory_pool()` may be
+// called before all globals are initialized, and then the ARROW_DEFAULT_MEMORY_POOL
+// environment variable would be ignored.
+
+const std::vector<SupportedBackend>& SupportedBackends() {
+ static std::vector<SupportedBackend> backends = {
+ // ARROW-12316: Apple => mimalloc first, then jemalloc
+ // non-Apple => jemalloc first, then mimalloc
+#if defined(ARROW_JEMALLOC) && !defined(__APPLE__)
+ {"jemalloc", MemoryPoolBackend::Jemalloc},
+#endif
+#ifdef ARROW_MIMALLOC
+ {"mimalloc", MemoryPoolBackend::Mimalloc},
+#endif
+#if defined(ARROW_JEMALLOC) && defined(__APPLE__)
+ {"jemalloc", MemoryPoolBackend::Jemalloc},
+#endif
+ {"system", MemoryPoolBackend::System}
+ };
+ return backends;
+}
+
+// Return the MemoryPoolBackend selected by the user through the
+// ARROW_DEFAULT_MEMORY_POOL environment variable, if any.
+util::optional<MemoryPoolBackend> UserSelectedBackend() {
+ static auto user_selected_backend = []() -> util::optional<MemoryPoolBackend> {
+ auto unsupported_backend = [](const std::string& name) {
+ std::vector<std::string> supported;
+      for (const auto& backend : SupportedBackends()) {
+ supported.push_back(std::string("'") + backend.name + "'");
+ }
+ ARROW_LOG(WARNING) << "Unsupported backend '" << name << "' specified in "
+ << kDefaultBackendEnvVar << " (supported backends are "
+ << internal::JoinStrings(supported, ", ") << ")";
+ };
+
+ auto maybe_name = internal::GetEnvVar(kDefaultBackendEnvVar);
+ if (!maybe_name.ok()) {
+ return {};
+ }
+ const auto name = *std::move(maybe_name);
+ if (name.empty()) {
+ // An empty environment variable is considered missing
+ return {};
+ }
+ const auto found = std::find_if(
+ SupportedBackends().begin(), SupportedBackends().end(),
+ [&](const SupportedBackend& backend) { return name == backend.name; });
+ if (found != SupportedBackends().end()) {
+ return found->backend;
+ }
+ unsupported_backend(name);
+ return {};
+ }();
+
+ return user_selected_backend;
+}
+
+MemoryPoolBackend DefaultBackend() {
+ auto backend = UserSelectedBackend();
+ if (backend.has_value()) {
+ return backend.value();
+ }
+ struct SupportedBackend default_backend = SupportedBackends().front();
+ return default_backend.backend;
+}
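+
+// Example (illustrative): setting ARROW_DEFAULT_MEMORY_POOL=mimalloc in the
+// environment before process start makes DefaultBackend() resolve to the
+// mimalloc pool when it was compiled in; an unknown or unavailable name logs
+// a warning and falls back to the compile-time preference order above.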
+
+// A static piece of memory for 0-size allocations, so as to return
+// an aligned non-null pointer.
+alignas(kAlignment) static uint8_t zero_size_area[1];
+
+// Helper class directing allocations to the standard system allocator.
+class SystemAllocator {
+ public:
+ // Allocate memory according to the alignment requirements for Arrow
+  // (as of May 2016, 64 bytes)
+ static Status AllocateAligned(int64_t size, uint8_t** out) {
+ if (size == 0) {
+ *out = zero_size_area;
+ return Status::OK();
+ }
+#ifdef _WIN32
+ // Special code path for Windows
+ *out = reinterpret_cast<uint8_t*>(
+ _aligned_malloc(static_cast<size_t>(size), kAlignment));
+ if (!*out) {
+ return Status::OutOfMemory("malloc of size ", size, " failed");
+ }
+#elif defined(sun) || defined(__sun)
+ *out = reinterpret_cast<uint8_t*>(memalign(kAlignment, static_cast<size_t>(size)));
+ if (!*out) {
+ return Status::OutOfMemory("malloc of size ", size, " failed");
+ }
+#else
+ const int result = posix_memalign(reinterpret_cast<void**>(out), kAlignment,
+ static_cast<size_t>(size));
+ if (result == ENOMEM) {
+ return Status::OutOfMemory("malloc of size ", size, " failed");
+ }
+
+ if (result == EINVAL) {
+ return Status::Invalid("invalid alignment parameter: ", kAlignment);
+ }
+#endif
+ return Status::OK();
+ }
+
+ static Status ReallocateAligned(int64_t old_size, int64_t new_size, uint8_t** ptr) {
+ uint8_t* previous_ptr = *ptr;
+ if (previous_ptr == zero_size_area) {
+ DCHECK_EQ(old_size, 0);
+ return AllocateAligned(new_size, ptr);
+ }
+ if (new_size == 0) {
+ DeallocateAligned(previous_ptr, old_size);
+ *ptr = zero_size_area;
+ return Status::OK();
+ }
+ // Note: We cannot use realloc() here as it doesn't guarantee alignment.
+
+ // Allocate new chunk
+ uint8_t* out = nullptr;
+ RETURN_NOT_OK(AllocateAligned(new_size, &out));
+ DCHECK(out);
+ // Copy contents and release old memory chunk
+ memcpy(out, *ptr, static_cast<size_t>(std::min(new_size, old_size)));
+#ifdef _WIN32
+ _aligned_free(*ptr);
+#else
+ free(*ptr);
+#endif // defined(_WIN32)
+ *ptr = out;
+ return Status::OK();
+ }
+
+ static void DeallocateAligned(uint8_t* ptr, int64_t size) {
+ if (ptr == zero_size_area) {
+ DCHECK_EQ(size, 0);
+ } else {
+#ifdef _WIN32
+ _aligned_free(ptr);
+#else
+ free(ptr);
+#endif
+ }
+ }
+
+ static void ReleaseUnused() {
+#ifdef __GLIBC__
+    // malloc_trim's return value is not an error code; it merely indicates
+    // whether memory was actually released, which we do not care about here
+ ARROW_UNUSED(malloc_trim(0));
+#endif
+ }
+};
+
+#ifdef ARROW_JEMALLOC
+
+// Helper class directing allocations to the jemalloc allocator.
+class JemallocAllocator {
+ public:
+ static Status AllocateAligned(int64_t size, uint8_t** out) {
+ if (size == 0) {
+ *out = zero_size_area;
+ return Status::OK();
+ }
+ *out = reinterpret_cast<uint8_t*>(
+ mallocx(static_cast<size_t>(size), MALLOCX_ALIGN(kAlignment)));
+ if (*out == NULL) {
+ return Status::OutOfMemory("malloc of size ", size, " failed");
+ }
+ return Status::OK();
+ }
+
+ static Status ReallocateAligned(int64_t old_size, int64_t new_size, uint8_t** ptr) {
+ uint8_t* previous_ptr = *ptr;
+ if (previous_ptr == zero_size_area) {
+ DCHECK_EQ(old_size, 0);
+ return AllocateAligned(new_size, ptr);
+ }
+ if (new_size == 0) {
+ DeallocateAligned(previous_ptr, old_size);
+ *ptr = zero_size_area;
+ return Status::OK();
+ }
+ *ptr = reinterpret_cast<uint8_t*>(
+ rallocx(*ptr, static_cast<size_t>(new_size), MALLOCX_ALIGN(kAlignment)));
+ if (*ptr == NULL) {
+ *ptr = previous_ptr;
+ return Status::OutOfMemory("realloc of size ", new_size, " failed");
+ }
+ return Status::OK();
+ }
+
+ static void DeallocateAligned(uint8_t* ptr, int64_t size) {
+ if (ptr == zero_size_area) {
+ DCHECK_EQ(size, 0);
+ } else {
+ dallocx(ptr, MALLOCX_ALIGN(kAlignment));
+ }
+ }
+
+ static void ReleaseUnused() {
+ mallctl("arena." ARROW_STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, 0);
+ }
+};
+
+#endif // defined(ARROW_JEMALLOC)
+
+#ifdef ARROW_MIMALLOC
+
+// Helper class directing allocations to the mimalloc allocator.
+class MimallocAllocator {
+ public:
+ static Status AllocateAligned(int64_t size, uint8_t** out) {
+ if (size == 0) {
+ *out = zero_size_area;
+ return Status::OK();
+ }
+ *out = reinterpret_cast<uint8_t*>(
+ mi_malloc_aligned(static_cast<size_t>(size), kAlignment));
+ if (*out == NULL) {
+ return Status::OutOfMemory("malloc of size ", size, " failed");
+ }
+ return Status::OK();
+ }
+
+ static void ReleaseUnused() { mi_collect(true); }
+
+ static Status ReallocateAligned(int64_t old_size, int64_t new_size, uint8_t** ptr) {
+ uint8_t* previous_ptr = *ptr;
+ if (previous_ptr == zero_size_area) {
+ DCHECK_EQ(old_size, 0);
+ return AllocateAligned(new_size, ptr);
+ }
+ if (new_size == 0) {
+ DeallocateAligned(previous_ptr, old_size);
+ *ptr = zero_size_area;
+ return Status::OK();
+ }
+ *ptr = reinterpret_cast<uint8_t*>(
+ mi_realloc_aligned(previous_ptr, static_cast<size_t>(new_size), kAlignment));
+ if (*ptr == NULL) {
+ *ptr = previous_ptr;
+ return Status::OutOfMemory("realloc of size ", new_size, " failed");
+ }
+ return Status::OK();
+ }
+
+ static void DeallocateAligned(uint8_t* ptr, int64_t size) {
+ if (ptr == zero_size_area) {
+ DCHECK_EQ(size, 0);
+ } else {
+ mi_free(ptr);
+ }
+ }
+};
+
+#endif // defined(ARROW_MIMALLOC)
+
+} // namespace
+
+int64_t MemoryPool::max_memory() const { return -1; }
+
+///////////////////////////////////////////////////////////////////////
+// MemoryPool implementation that delegates its core duty
+// to an Allocator class.
+
+#ifndef NDEBUG
+static constexpr uint8_t kAllocPoison = 0xBC;
+static constexpr uint8_t kReallocPoison = 0xBD;
+static constexpr uint8_t kDeallocPoison = 0xBE;
+#endif
+
+template <typename Allocator>
+class BaseMemoryPoolImpl : public MemoryPool {
+ public:
+ ~BaseMemoryPoolImpl() override {}
+
+ Status Allocate(int64_t size, uint8_t** out) override {
+ if (size < 0) {
+ return Status::Invalid("negative malloc size");
+ }
+ if (static_cast<uint64_t>(size) >= std::numeric_limits<size_t>::max()) {
+ return Status::CapacityError("malloc size overflows size_t");
+ }
+ RETURN_NOT_OK(Allocator::AllocateAligned(size, out));
+#ifndef NDEBUG
+ // Poison data
+ if (size > 0) {
+ DCHECK_NE(*out, nullptr);
+ (*out)[0] = kAllocPoison;
+ (*out)[size - 1] = kAllocPoison;
+ }
+#endif
+
+ stats_.UpdateAllocatedBytes(size);
+ return Status::OK();
+ }
+
+ Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override {
+ if (new_size < 0) {
+ return Status::Invalid("negative realloc size");
+ }
+ if (static_cast<uint64_t>(new_size) >= std::numeric_limits<size_t>::max()) {
+ return Status::CapacityError("realloc overflows size_t");
+ }
+ RETURN_NOT_OK(Allocator::ReallocateAligned(old_size, new_size, ptr));
+#ifndef NDEBUG
+ // Poison data
+ if (new_size > old_size) {
+ DCHECK_NE(*ptr, nullptr);
+ (*ptr)[old_size] = kReallocPoison;
+ (*ptr)[new_size - 1] = kReallocPoison;
+ }
+#endif
+
+ stats_.UpdateAllocatedBytes(new_size - old_size);
+ return Status::OK();
+ }
+
+ void Free(uint8_t* buffer, int64_t size) override {
+#ifndef NDEBUG
+ // Poison data
+ if (size > 0) {
+ DCHECK_NE(buffer, nullptr);
+ buffer[0] = kDeallocPoison;
+ buffer[size - 1] = kDeallocPoison;
+ }
+#endif
+ Allocator::DeallocateAligned(buffer, size);
+
+ stats_.UpdateAllocatedBytes(-size);
+ }
+
+ void ReleaseUnused() override { Allocator::ReleaseUnused(); }
+
+ int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }
+
+ int64_t max_memory() const override { return stats_.max_memory(); }
+
+ protected:
+ internal::MemoryPoolStats stats_;
+};
+
+class SystemMemoryPool : public BaseMemoryPoolImpl<SystemAllocator> {
+ public:
+ std::string backend_name() const override { return "system"; }
+};
+
+#ifdef ARROW_JEMALLOC
+class JemallocMemoryPool : public BaseMemoryPoolImpl<JemallocAllocator> {
+ public:
+ std::string backend_name() const override { return "jemalloc"; }
+};
+#endif
+
+#ifdef ARROW_MIMALLOC
+class MimallocMemoryPool : public BaseMemoryPoolImpl<MimallocAllocator> {
+ public:
+ std::string backend_name() const override { return "mimalloc"; }
+};
+#endif
+
+std::unique_ptr<MemoryPool> MemoryPool::CreateDefault() {
+ auto backend = DefaultBackend();
+ switch (backend) {
+ case MemoryPoolBackend::System:
+ return std::unique_ptr<MemoryPool>(new SystemMemoryPool);
+#ifdef ARROW_JEMALLOC
+ case MemoryPoolBackend::Jemalloc:
+ return std::unique_ptr<MemoryPool>(new JemallocMemoryPool);
+#endif
+#ifdef ARROW_MIMALLOC
+ case MemoryPoolBackend::Mimalloc:
+ return std::unique_ptr<MemoryPool>(new MimallocMemoryPool);
+#endif
+ default:
+ ARROW_LOG(FATAL) << "Internal error: cannot create default memory pool";
+ return nullptr;
+ }
+}
+
+static struct GlobalState {
+ ~GlobalState() { finalizing.store(true, std::memory_order_relaxed); }
+
+ bool is_finalizing() const { return finalizing.load(std::memory_order_relaxed); }
+
+ std::atomic<bool> finalizing{false}; // constructed first, destroyed last
+
+ SystemMemoryPool system_pool;
+#ifdef ARROW_JEMALLOC
+ JemallocMemoryPool jemalloc_pool;
+#endif
+#ifdef ARROW_MIMALLOC
+ MimallocMemoryPool mimalloc_pool;
+#endif
+} global_state;
+
+MemoryPool* system_memory_pool() { return &global_state.system_pool; }
+
+Status jemalloc_memory_pool(MemoryPool** out) {
+#ifdef ARROW_JEMALLOC
+ *out = &global_state.jemalloc_pool;
+ return Status::OK();
+#else
+ return Status::NotImplemented("This Arrow build does not enable jemalloc");
+#endif
+}
+
+Status mimalloc_memory_pool(MemoryPool** out) {
+#ifdef ARROW_MIMALLOC
+ *out = &global_state.mimalloc_pool;
+ return Status::OK();
+#else
+ return Status::NotImplemented("This Arrow build does not enable mimalloc");
+#endif
+}
+
+MemoryPool* default_memory_pool() {
+ auto backend = DefaultBackend();
+ switch (backend) {
+ case MemoryPoolBackend::System:
+ return &global_state.system_pool;
+#ifdef ARROW_JEMALLOC
+ case MemoryPoolBackend::Jemalloc:
+ return &global_state.jemalloc_pool;
+#endif
+#ifdef ARROW_MIMALLOC
+ case MemoryPoolBackend::Mimalloc:
+ return &global_state.mimalloc_pool;
+#endif
+ default:
+ ARROW_LOG(FATAL) << "Internal error: cannot create default memory pool";
+ return nullptr;
+ }
+}
+
+// Note: the condition must use the macro parameter, not a variable named
+// `err` captured from the caller's scope.
+#define RETURN_IF_JEMALLOC_ERROR(ERR)                  \
+  do {                                                 \
+    if ((ERR) != 0) {                                  \
+      return Status::UnknownError(std::strerror(ERR)); \
+    }                                                  \
+  } while (0)
+
+Status jemalloc_set_decay_ms(int ms) {
+#ifdef ARROW_JEMALLOC
+ ssize_t decay_time_ms = static_cast<ssize_t>(ms);
+
+ int err = mallctl("arenas.dirty_decay_ms", nullptr, nullptr, &decay_time_ms,
+ sizeof(decay_time_ms));
+ RETURN_IF_JEMALLOC_ERROR(err);
+ err = mallctl("arenas.muzzy_decay_ms", nullptr, nullptr, &decay_time_ms,
+ sizeof(decay_time_ms));
+ RETURN_IF_JEMALLOC_ERROR(err);
+
+ return Status::OK();
+#else
+ return Status::Invalid("jemalloc support is not built");
+#endif
+}
+
+///////////////////////////////////////////////////////////////////////
+// LoggingMemoryPool implementation
+
+LoggingMemoryPool::LoggingMemoryPool(MemoryPool* pool) : pool_(pool) {}
+
+Status LoggingMemoryPool::Allocate(int64_t size, uint8_t** out) {
+ Status s = pool_->Allocate(size, out);
+ std::cout << "Allocate: size = " << size << std::endl;
+ return s;
+}
+
+Status LoggingMemoryPool::Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) {
+ Status s = pool_->Reallocate(old_size, new_size, ptr);
+ std::cout << "Reallocate: old_size = " << old_size << " - new_size = " << new_size
+ << std::endl;
+ return s;
+}
+
+void LoggingMemoryPool::Free(uint8_t* buffer, int64_t size) {
+ pool_->Free(buffer, size);
+ std::cout << "Free: size = " << size << std::endl;
+}
+
+int64_t LoggingMemoryPool::bytes_allocated() const {
+ int64_t nb_bytes = pool_->bytes_allocated();
+ std::cout << "bytes_allocated: " << nb_bytes << std::endl;
+ return nb_bytes;
+}
+
+int64_t LoggingMemoryPool::max_memory() const {
+ int64_t mem = pool_->max_memory();
+ std::cout << "max_memory: " << mem << std::endl;
+ return mem;
+}
+
+std::string LoggingMemoryPool::backend_name() const { return pool_->backend_name(); }
+
+///////////////////////////////////////////////////////////////////////
+// ProxyMemoryPool implementation
+
+class ProxyMemoryPool::ProxyMemoryPoolImpl {
+ public:
+ explicit ProxyMemoryPoolImpl(MemoryPool* pool) : pool_(pool) {}
+
+ Status Allocate(int64_t size, uint8_t** out) {
+ RETURN_NOT_OK(pool_->Allocate(size, out));
+ stats_.UpdateAllocatedBytes(size);
+ return Status::OK();
+ }
+
+ Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) {
+ RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr));
+ stats_.UpdateAllocatedBytes(new_size - old_size);
+ return Status::OK();
+ }
+
+ void Free(uint8_t* buffer, int64_t size) {
+ pool_->Free(buffer, size);
+ stats_.UpdateAllocatedBytes(-size);
+ }
+
+ int64_t bytes_allocated() const { return stats_.bytes_allocated(); }
+
+ int64_t max_memory() const { return stats_.max_memory(); }
+
+ std::string backend_name() const { return pool_->backend_name(); }
+
+ private:
+ MemoryPool* pool_;
+ internal::MemoryPoolStats stats_;
+};
+
+ProxyMemoryPool::ProxyMemoryPool(MemoryPool* pool) {
+ impl_.reset(new ProxyMemoryPoolImpl(pool));
+}
+
+ProxyMemoryPool::~ProxyMemoryPool() {}
+
+Status ProxyMemoryPool::Allocate(int64_t size, uint8_t** out) {
+ return impl_->Allocate(size, out);
+}
+
+Status ProxyMemoryPool::Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) {
+ return impl_->Reallocate(old_size, new_size, ptr);
+}
+
+void ProxyMemoryPool::Free(uint8_t* buffer, int64_t size) {
+ return impl_->Free(buffer, size);
+}
+
+int64_t ProxyMemoryPool::bytes_allocated() const { return impl_->bytes_allocated(); }
+
+int64_t ProxyMemoryPool::max_memory() const { return impl_->max_memory(); }
+
+std::string ProxyMemoryPool::backend_name() const { return impl_->backend_name(); }
+
+std::vector<std::string> SupportedMemoryBackendNames() {
+ std::vector<std::string> supported;
+ for (const auto backend : SupportedBackends()) {
+ supported.push_back(backend.name);
+ }
+ return supported;
+}
+
+// -----------------------------------------------------------------------
+// Pool buffer and allocation
+
+/// A Buffer whose lifetime is tied to a particular MemoryPool
+class PoolBuffer final : public ResizableBuffer {
+ public:
+ explicit PoolBuffer(std::shared_ptr<MemoryManager> mm, MemoryPool* pool)
+ : ResizableBuffer(nullptr, 0, std::move(mm)), pool_(pool) {}
+
+ ~PoolBuffer() override {
+    // Avoid calling pool_->Free if the global pools have already been
+    // destroyed (XXX this will not work with user-defined pools).
+
+    // This can happen when a Future is destructed on one thread while, or
+    // after, the memory pools are destructed on the main thread (there is
+    // no guarantee of destruction order between threads and memory pools).
+ uint8_t* ptr = mutable_data();
+ if (ptr && !global_state.is_finalizing()) {
+ pool_->Free(ptr, capacity_);
+ }
+ }
+
+ Status Reserve(const int64_t capacity) override {
+ if (capacity < 0) {
+ return Status::Invalid("Negative buffer capacity: ", capacity);
+ }
+ uint8_t* ptr = mutable_data();
+ if (!ptr || capacity > capacity_) {
+ int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(capacity);
+ if (ptr) {
+ RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &ptr));
+ } else {
+ RETURN_NOT_OK(pool_->Allocate(new_capacity, &ptr));
+ }
+ data_ = ptr;
+ capacity_ = new_capacity;
+ }
+ return Status::OK();
+ }
+
+ Status Resize(const int64_t new_size, bool shrink_to_fit = true) override {
+ if (ARROW_PREDICT_FALSE(new_size < 0)) {
+ return Status::Invalid("Negative buffer resize: ", new_size);
+ }
+ uint8_t* ptr = mutable_data();
+ if (ptr && shrink_to_fit && new_size <= size_) {
+ // Buffer is non-null and is not growing, so shrink to the requested size without
+ // excess space.
+ int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(new_size);
+ if (capacity_ != new_capacity) {
+        // The buffer's capacity does not yet match the requested one; shrink it.
+ RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &ptr));
+ data_ = ptr;
+ capacity_ = new_capacity;
+ }
+ } else {
+ RETURN_NOT_OK(Reserve(new_size));
+ }
+ size_ = new_size;
+
+ return Status::OK();
+ }
+
+ static std::shared_ptr<PoolBuffer> MakeShared(MemoryPool* pool) {
+ std::shared_ptr<MemoryManager> mm;
+ if (pool == nullptr) {
+ pool = default_memory_pool();
+ mm = default_cpu_memory_manager();
+ } else {
+ mm = CPUDevice::memory_manager(pool);
+ }
+ return std::make_shared<PoolBuffer>(std::move(mm), pool);
+ }
+
+ static std::unique_ptr<PoolBuffer> MakeUnique(MemoryPool* pool) {
+ std::shared_ptr<MemoryManager> mm;
+ if (pool == nullptr) {
+ pool = default_memory_pool();
+ mm = default_cpu_memory_manager();
+ } else {
+ mm = CPUDevice::memory_manager(pool);
+ }
+ return std::unique_ptr<PoolBuffer>(new PoolBuffer(std::move(mm), pool));
+ }
+
+ private:
+ MemoryPool* pool_;
+};
+
+namespace {
+// A utility that does most of the work of the `AllocateBuffer` and
+// `AllocateResizableBuffer` methods. The argument `buffer` should be a smart pointer to
+// a PoolBuffer.
+template <typename BufferPtr, typename PoolBufferPtr>
+inline Result<BufferPtr> ResizePoolBuffer(PoolBufferPtr&& buffer, const int64_t size) {
+ RETURN_NOT_OK(buffer->Resize(size));
+ buffer->ZeroPadding();
+ return std::move(buffer);
+}
+
+} // namespace
+
+Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size, MemoryPool* pool) {
+ return ResizePoolBuffer<std::unique_ptr<Buffer>>(PoolBuffer::MakeUnique(pool), size);
+}
+
+Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(const int64_t size,
+ MemoryPool* pool) {
+ return ResizePoolBuffer<std::unique_ptr<ResizableBuffer>>(PoolBuffer::MakeUnique(pool),
+ size);
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h
new file mode 100644
index 00000000000..81b1b112dc7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h
@@ -0,0 +1,185 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace internal {
+
+///////////////////////////////////////////////////////////////////////
+// Helper tracking memory statistics
+
+class MemoryPoolStats {
+ public:
+ MemoryPoolStats() : bytes_allocated_(0), max_memory_(0) {}
+
+ int64_t max_memory() const { return max_memory_.load(); }
+
+ int64_t bytes_allocated() const { return bytes_allocated_.load(); }
+
+ inline void UpdateAllocatedBytes(int64_t diff) {
+ auto allocated = bytes_allocated_.fetch_add(diff) + diff;
+ // "maximum" allocated memory is ill-defined in multi-threaded code,
+ // so don't try to be too rigorous here
+ if (diff > 0 && allocated > max_memory_) {
+ max_memory_ = allocated;
+ }
+ }
+
+ protected:
+ std::atomic<int64_t> bytes_allocated_;
+ std::atomic<int64_t> max_memory_;
+};
+
+} // namespace internal
+
+/// Base class for memory allocation on the CPU.
+///
+/// Besides tracking the number of allocated bytes, the allocator should also
+/// take care of the required 64-byte alignment.
+class ARROW_EXPORT MemoryPool {
+ public:
+ virtual ~MemoryPool() = default;
+
+ /// \brief EXPERIMENTAL. Create a new instance of the default MemoryPool
+ static std::unique_ptr<MemoryPool> CreateDefault();
+
+ /// Allocate a new memory region of at least size bytes.
+ ///
+ /// The allocated region shall be 64-byte aligned.
+ virtual Status Allocate(int64_t size, uint8_t** out) = 0;
+
+ /// Resize an already allocated memory section.
+ ///
+  /// As most platform allocators do not support aligned reallocation, this
+  /// function can involve a copy of the underlying data.
+ virtual Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) = 0;
+
+ /// Free an allocated region.
+ ///
+  /// \param buffer Pointer to the start of the allocated memory region
+  /// \param size Allocated size of the region starting at buffer. An allocator
+  ///   implementation may use this for tracking the amount of allocated bytes
+  ///   as well as for faster deallocation if supported by its backend.
+ virtual void Free(uint8_t* buffer, int64_t size) = 0;
+
+ /// Return unused memory to the OS
+ ///
+  /// Only applies to allocators that hold onto unused memory. This is
+  /// best-effort: a memory pool may not implement this feature, or may be
+  /// unable to fulfill the request due to fragmentation.
+ virtual void ReleaseUnused() {}
+
+  /// The number of bytes that were allocated and not yet freed through
+  /// this allocator.
+ virtual int64_t bytes_allocated() const = 0;
+
+ /// Return peak memory allocation in this memory pool
+ ///
+ /// \return Maximum bytes allocated. If not known (or not implemented),
+ /// returns -1
+ virtual int64_t max_memory() const;
+
+ /// The name of the backend used by this MemoryPool (e.g. "system" or "jemalloc").
+ virtual std::string backend_name() const = 0;
+
+ protected:
+ MemoryPool() = default;
+};
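+
+// A minimal allocation round trip through this interface (a usage sketch,
+// not part of the header; ARROW_RETURN_NOT_OK comes from arrow/status.h):
+//
+//   arrow::MemoryPool* pool = arrow::default_memory_pool();
+//   uint8_t* data = nullptr;
+//   ARROW_RETURN_NOT_OK(pool->Allocate(1024, &data));
+//   // ... use the 64-byte aligned region ...
+//   pool->Free(data, 1024);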
+
+class ARROW_EXPORT LoggingMemoryPool : public MemoryPool {
+ public:
+ explicit LoggingMemoryPool(MemoryPool* pool);
+ ~LoggingMemoryPool() override = default;
+
+ Status Allocate(int64_t size, uint8_t** out) override;
+ Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override;
+
+ void Free(uint8_t* buffer, int64_t size) override;
+
+ int64_t bytes_allocated() const override;
+
+ int64_t max_memory() const override;
+
+ std::string backend_name() const override;
+
+ private:
+ MemoryPool* pool_;
+};
+
+/// A MemoryPool wrapper that tracks the allocations made through it.
+///
+/// Tracks the number of bytes and maximum memory allocated through its direct
+/// calls. Actual allocation is delegated to the wrapped MemoryPool.
+class ARROW_EXPORT ProxyMemoryPool : public MemoryPool {
+ public:
+ explicit ProxyMemoryPool(MemoryPool* pool);
+ ~ProxyMemoryPool() override;
+
+ Status Allocate(int64_t size, uint8_t** out) override;
+ Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override;
+
+ void Free(uint8_t* buffer, int64_t size) override;
+
+ int64_t bytes_allocated() const override;
+
+ int64_t max_memory() const override;
+
+ std::string backend_name() const override;
+
+ private:
+ class ProxyMemoryPoolImpl;
+ std::unique_ptr<ProxyMemoryPoolImpl> impl_;
+};
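+
+// Sketch of attributing allocations to a single component while still
+// drawing memory from the shared default pool (illustrative only):
+//
+//   arrow::ProxyMemoryPool scoped_pool(arrow::default_memory_pool());
+//   // ... pass &scoped_pool to the component under measurement ...
+//   int64_t current = scoped_pool.bytes_allocated();
+//   int64_t peak = scoped_pool.max_memory();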
+
+/// \brief Return a process-wide memory pool based on the system allocator.
+ARROW_EXPORT MemoryPool* system_memory_pool();
+
+/// \brief Return a process-wide memory pool based on jemalloc.
+///
+/// May return NotImplemented if jemalloc is not available.
+ARROW_EXPORT Status jemalloc_memory_pool(MemoryPool** out);
+
+/// \brief Set jemalloc memory page purging behavior for future-created arenas
+/// to the indicated number of milliseconds. See dirty_decay_ms and
+/// muzzy_decay_ms options in jemalloc for a description of what these do. The
+/// default is configured to 1000 (1 second), which releases memory more
+/// aggressively to the operating system than the jemalloc default of 10
+/// seconds. If you set the value to 0, dirty / muzzy pages will be released
+/// immediately rather than with a time decay, but this may reduce application
+/// performance.
+ARROW_EXPORT
+Status jemalloc_set_decay_ms(int ms);
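+
+// For example, releasing dirty/muzzy pages immediately in a jemalloc-enabled
+// build (a sketch; builds without jemalloc return an error Status instead):
+//
+//   ARROW_RETURN_NOT_OK(arrow::jemalloc_set_decay_ms(0));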
+
+/// \brief Return a process-wide memory pool based on mimalloc.
+///
+/// May return NotImplemented if mimalloc is not available.
+ARROW_EXPORT Status mimalloc_memory_pool(MemoryPool** out);
+
+ARROW_EXPORT std::vector<std::string> SupportedMemoryBackendNames();
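+
+// For instance, to inspect which allocator backends this build supports
+// (a sketch; assumes <iostream> at the call site):
+//
+//   for (const auto& name : arrow::SupportedMemoryBackendNames()) {
+//     std::cout << name << std::endl;
+//   }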
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc
new file mode 100644
index 00000000000..8d1c16e0ed6
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc
@@ -0,0 +1,711 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/pretty_print.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <sstream> // IWYU pragma: keep
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/chunked_array.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/string.h"
+#include "arrow/vendored/datetime.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+class PrettyPrinter {
+ public:
+ PrettyPrinter(const PrettyPrintOptions& options, std::ostream* sink)
+ : options_(options), indent_(options.indent), sink_(sink) {}
+
+ void Write(const char* data);
+ void Write(const std::string& data);
+ void WriteIndented(const char* data);
+ void WriteIndented(const std::string& data);
+ void Newline();
+ void Indent();
+ void OpenArray(const Array& array);
+ void CloseArray(const Array& array);
+
+ void Flush() { (*sink_) << std::flush; }
+
+ protected:
+ const PrettyPrintOptions& options_;
+ int indent_;
+ std::ostream* sink_;
+};
+
+void PrettyPrinter::OpenArray(const Array& array) {
+ if (!options_.skip_new_lines) {
+ Indent();
+ }
+ (*sink_) << "[";
+ if (array.length() > 0) {
+ Newline();
+ indent_ += options_.indent_size;
+ }
+}
+
+void PrettyPrinter::CloseArray(const Array& array) {
+ if (array.length() > 0) {
+ indent_ -= options_.indent_size;
+ Indent();
+ }
+ (*sink_) << "]";
+}
+
+void PrettyPrinter::Write(const char* data) { (*sink_) << data; }
+void PrettyPrinter::Write(const std::string& data) { (*sink_) << data; }
+
+void PrettyPrinter::WriteIndented(const char* data) {
+ Indent();
+ Write(data);
+}
+
+void PrettyPrinter::WriteIndented(const std::string& data) {
+ Indent();
+ Write(data);
+}
+
+void PrettyPrinter::Newline() {
+ if (options_.skip_new_lines) {
+ return;
+ }
+ (*sink_) << "\n";
+}
+
+void PrettyPrinter::Indent() {
+ for (int i = 0; i < indent_; ++i) {
+ (*sink_) << " ";
+ }
+}
+
+class ArrayPrinter : public PrettyPrinter {
+ public:
+ ArrayPrinter(const PrettyPrintOptions& options, std::ostream* sink)
+ : PrettyPrinter(options, sink) {}
+
+ template <typename FormatFunction>
+ void WriteValues(const Array& array, FormatFunction&& func) {
+ bool skip_comma = true;
+ for (int64_t i = 0; i < array.length(); ++i) {
+ if (skip_comma) {
+ skip_comma = false;
+ } else {
+ (*sink_) << ",";
+ Newline();
+ }
+ if (!options_.skip_new_lines) {
+ Indent();
+ }
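+      // Elide the middle of long arrays: after the leading `window` elements,
+      // print "..." and jump the index to the trailing `window` elements.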
+ if ((i >= options_.window) && (i < (array.length() - options_.window))) {
+ (*sink_) << "...";
+ Newline();
+ i = array.length() - options_.window - 1;
+ skip_comma = true;
+ } else if (array.IsNull(i)) {
+ (*sink_) << options_.null_rep;
+ } else {
+ func(i);
+ }
+ }
+ Newline();
+ }
+
+ Status WriteDataValues(const BooleanArray& array) {
+ WriteValues(array, [&](int64_t i) { Write(array.Value(i) ? "true" : "false"); });
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_integer<typename T::TypeClass, Status> WriteDataValues(const T& array) {
+ const auto data = array.raw_values();
+ // Need to upcast integers to avoid selecting operator<<(char)
+ WriteValues(array, [&](int64_t i) { (*sink_) << internal::UpcastInt(data[i]); });
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_floating_point<typename T::TypeClass, Status> WriteDataValues(
+ const T& array) {
+ const auto data = array.raw_values();
+ WriteValues(array, [&](int64_t i) { (*sink_) << data[i]; });
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_date<typename T::TypeClass, Status> WriteDataValues(const T& array) {
+ const auto data = array.raw_values();
+ using unit = typename std::conditional<std::is_same<T, Date32Array>::value,
+ arrow_vendored::date::days,
+ std::chrono::milliseconds>::type;
+ WriteValues(array, [&](int64_t i) { FormatDateTime<unit>("%F", data[i], true); });
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_time<typename T::TypeClass, Status> WriteDataValues(const T& array) {
+ const auto data = array.raw_values();
+ const auto type = static_cast<const TimeType*>(array.type().get());
+ WriteValues(array,
+ [&](int64_t i) { FormatDateTime(type->unit(), "%T", data[i], false); });
+ return Status::OK();
+ }
+
+ Status WriteDataValues(const TimestampArray& array) {
+ const int64_t* data = array.raw_values();
+ const auto type = static_cast<const TimestampType*>(array.type().get());
+ WriteValues(array,
+ [&](int64_t i) { FormatDateTime(type->unit(), "%F %T", data[i], true); });
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_duration<typename T::TypeClass, Status> WriteDataValues(const T& array) {
+ const auto data = array.raw_values();
+ WriteValues(array, [&](int64_t i) { (*sink_) << data[i]; });
+ return Status::OK();
+ }
+
+ Status WriteDataValues(const DayTimeIntervalArray& array) {
+ WriteValues(array, [&](int64_t i) {
+ auto day_millis = array.GetValue(i);
+ (*sink_) << day_millis.days << "d" << day_millis.milliseconds << "ms";
+ });
+ return Status::OK();
+ }
+
+ Status WriteDataValues(const MonthIntervalArray& array) {
+ const auto data = array.raw_values();
+ WriteValues(array, [&](int64_t i) { (*sink_) << data[i]; });
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_string_like<typename T::TypeClass, Status> WriteDataValues(const T& array) {
+ WriteValues(array, [&](int64_t i) { (*sink_) << "\"" << array.GetView(i) << "\""; });
+ return Status::OK();
+ }
+
+ // Binary
+ template <typename T>
+ enable_if_binary_like<typename T::TypeClass, Status> WriteDataValues(const T& array) {
+ WriteValues(array, [&](int64_t i) { (*sink_) << HexEncode(array.GetView(i)); });
+ return Status::OK();
+ }
+
+ Status WriteDataValues(const Decimal128Array& array) {
+ WriteValues(array, [&](int64_t i) { (*sink_) << array.FormatValue(i); });
+ return Status::OK();
+ }
+
+ Status WriteDataValues(const Decimal256Array& array) {
+ WriteValues(array, [&](int64_t i) { (*sink_) << array.FormatValue(i); });
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_list_like<typename T::TypeClass, Status> WriteDataValues(const T& array) {
+ bool skip_comma = true;
+ for (int64_t i = 0; i < array.length(); ++i) {
+ if (skip_comma) {
+ skip_comma = false;
+ } else {
+ (*sink_) << ",";
+ Newline();
+ }
+ if ((i >= options_.window) && (i < (array.length() - options_.window))) {
+ Indent();
+ (*sink_) << "...";
+ Newline();
+ i = array.length() - options_.window - 1;
+ skip_comma = true;
+ } else if (array.IsNull(i)) {
+ Indent();
+ (*sink_) << options_.null_rep;
+ } else {
+ std::shared_ptr<Array> slice =
+ array.values()->Slice(array.value_offset(i), array.value_length(i));
+ RETURN_NOT_OK(
+ PrettyPrint(*slice, PrettyPrintOptions{indent_, options_.window}, sink_));
+ }
+ }
+ Newline();
+ return Status::OK();
+ }
+
+ Status WriteDataValues(const MapArray& array) {
+ bool skip_comma = true;
+ for (int64_t i = 0; i < array.length(); ++i) {
+ if (skip_comma) {
+ skip_comma = false;
+ } else {
+ (*sink_) << ",";
+ Newline();
+ }
+
+ if (!options_.skip_new_lines) {
+ Indent();
+ }
+
+ if ((i >= options_.window) && (i < (array.length() - options_.window))) {
+ (*sink_) << "...";
+ Newline();
+ i = array.length() - options_.window - 1;
+ skip_comma = true;
+ } else if (array.IsNull(i)) {
+ (*sink_) << options_.null_rep;
+ } else {
+ (*sink_) << "keys:";
+ Newline();
+ auto keys_slice =
+ array.keys()->Slice(array.value_offset(i), array.value_length(i));
+ RETURN_NOT_OK(PrettyPrint(*keys_slice,
+ PrettyPrintOptions{indent_, options_.window}, sink_));
+ Newline();
+ Indent();
+ (*sink_) << "values:";
+ Newline();
+ auto values_slice =
+ array.items()->Slice(array.value_offset(i), array.value_length(i));
+ RETURN_NOT_OK(PrettyPrint(*values_slice,
+ PrettyPrintOptions{indent_, options_.window}, sink_));
+ }
+ }
+ (*sink_) << "\n";
+ return Status::OK();
+ }
+
+ Status Visit(const NullArray& array) {
+ (*sink_) << array.length() << " nulls";
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<std::is_base_of<PrimitiveArray, T>::value ||
+ std::is_base_of<FixedSizeBinaryArray, T>::value ||
+ std::is_base_of<BinaryArray, T>::value ||
+ std::is_base_of<LargeBinaryArray, T>::value ||
+ std::is_base_of<ListArray, T>::value ||
+ std::is_base_of<LargeListArray, T>::value ||
+ std::is_base_of<MapArray, T>::value ||
+ std::is_base_of<FixedSizeListArray, T>::value,
+ Status>
+ Visit(const T& array) {
+ OpenArray(array);
+ if (array.length() > 0) {
+ RETURN_NOT_OK(WriteDataValues(array));
+ }
+ CloseArray(array);
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionArray& array) { return Print(*array.storage()); }
+
+ Status WriteValidityBitmap(const Array& array);
+
+ Status PrintChildren(const std::vector<std::shared_ptr<Array>>& fields, int64_t offset,
+ int64_t length) {
+ for (size_t i = 0; i < fields.size(); ++i) {
+ Newline();
+ Indent();
+ std::stringstream ss;
+ ss << "-- child " << i << " type: " << fields[i]->type()->ToString() << "\n";
+ Write(ss.str());
+
+ std::shared_ptr<Array> field = fields[i];
+ if (offset != 0) {
+ field = field->Slice(offset, length);
+ }
+ RETURN_NOT_OK(PrettyPrint(*field, indent_ + options_.indent_size, sink_));
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const StructArray& array) {
+ RETURN_NOT_OK(WriteValidityBitmap(array));
+ std::vector<std::shared_ptr<Array>> children;
+ children.reserve(array.num_fields());
+ for (int i = 0; i < array.num_fields(); ++i) {
+ children.emplace_back(array.field(i));
+ }
+ return PrintChildren(children, 0, array.length());
+ }
+
+ Status Visit(const UnionArray& array) {
+ RETURN_NOT_OK(WriteValidityBitmap(array));
+
+ Newline();
+ Indent();
+ Write("-- type_ids: ");
+ UInt8Array type_codes(array.length(), array.type_codes(), nullptr, 0, array.offset());
+ RETURN_NOT_OK(PrettyPrint(type_codes, indent_ + options_.indent_size, sink_));
+
+ if (array.mode() == UnionMode::DENSE) {
+ Newline();
+ Indent();
+ Write("-- value_offsets: ");
+ Int32Array value_offsets(
+ array.length(), checked_cast<const DenseUnionArray&>(array).value_offsets(),
+ nullptr, 0, array.offset());
+ RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + options_.indent_size, sink_));
+ }
+
+ // Print the children without any offset, because the type ids are absolute
+ std::vector<std::shared_ptr<Array>> children;
+ children.reserve(array.num_fields());
+ for (int i = 0; i < array.num_fields(); ++i) {
+ children.emplace_back(array.field(i));
+ }
+ return PrintChildren(children, 0, array.length() + array.offset());
+ }
+
+ Status Visit(const DictionaryArray& array) {
+ Newline();
+ Indent();
+ Write("-- dictionary:\n");
+ RETURN_NOT_OK(
+ PrettyPrint(*array.dictionary(), indent_ + options_.indent_size, sink_));
+
+ Newline();
+ Indent();
+ Write("-- indices:\n");
+ return PrettyPrint(*array.indices(), indent_ + options_.indent_size, sink_);
+ }
+
+ Status Print(const Array& array) {
+ RETURN_NOT_OK(VisitArrayInline(array, this));
+ Flush();
+ return Status::OK();
+ }
+
+ private:
+ template <typename Unit>
+ void FormatDateTime(const char* fmt, int64_t value, bool add_epoch) {
+ if (add_epoch) {
+ (*sink_) << arrow_vendored::date::format(fmt, epoch_ + Unit{value});
+ } else {
+ (*sink_) << arrow_vendored::date::format(fmt, Unit{value});
+ }
+ }
+
+ void FormatDateTime(TimeUnit::type unit, const char* fmt, int64_t value,
+ bool add_epoch) {
+ switch (unit) {
+ case TimeUnit::NANO:
+ FormatDateTime<std::chrono::nanoseconds>(fmt, value, add_epoch);
+ break;
+ case TimeUnit::MICRO:
+ FormatDateTime<std::chrono::microseconds>(fmt, value, add_epoch);
+ break;
+ case TimeUnit::MILLI:
+ FormatDateTime<std::chrono::milliseconds>(fmt, value, add_epoch);
+ break;
+ case TimeUnit::SECOND:
+ FormatDateTime<std::chrono::seconds>(fmt, value, add_epoch);
+ break;
+ }
+ }
+
+ static arrow_vendored::date::sys_days epoch_;
+};
+
+arrow_vendored::date::sys_days ArrayPrinter::epoch_ =
+ arrow_vendored::date::sys_days{arrow_vendored::date::jan / 1 / 1970};
+
+Status ArrayPrinter::WriteValidityBitmap(const Array& array) {
+ Indent();
+ Write("-- is_valid:");
+
+ if (array.null_count() > 0) {
+ Newline();
+ Indent();
+ BooleanArray is_valid(array.length(), array.null_bitmap(), nullptr, 0,
+ array.offset());
+ return PrettyPrint(is_valid, indent_ + options_.indent_size, sink_);
+ } else {
+ Write(" all not null");
+ return Status::OK();
+ }
+}
+
+Status PrettyPrint(const Array& arr, int indent, std::ostream* sink) {
+ PrettyPrintOptions options;
+ options.indent = indent;
+ ArrayPrinter printer(options, sink);
+ return printer.Print(arr);
+}
+
+Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options,
+ std::ostream* sink) {
+ ArrayPrinter printer(options, sink);
+ return printer.Print(arr);
+}
+
+Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options,
+ std::string* result) {
+ std::ostringstream sink;
+ RETURN_NOT_OK(PrettyPrint(arr, options, &sink));
+ *result = sink.str();
+ return Status::OK();
+}
+
+Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options,
+ std::ostream* sink) {
+ int num_chunks = chunked_arr.num_chunks();
+ int indent = options.indent;
+ int window = options.window;
+
+ for (int i = 0; i < indent; ++i) {
+ (*sink) << " ";
+ }
+ (*sink) << "[";
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
+ bool skip_comma = true;
+ for (int i = 0; i < num_chunks; ++i) {
+ if (skip_comma) {
+ skip_comma = false;
+ } else {
+ (*sink) << ",";
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
+ }
+ if ((i >= window) && (i < (num_chunks - window))) {
+      for (int j = 0; j < indent; ++j) {
+        (*sink) << " ";
+      }
+ (*sink) << "...";
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
+ i = num_chunks - window - 1;
+ skip_comma = true;
+ } else {
+ PrettyPrintOptions chunk_options = options;
+ chunk_options.indent += options.indent_size;
+ ArrayPrinter printer(chunk_options, sink);
+ RETURN_NOT_OK(printer.Print(*chunked_arr.chunk(i)));
+ }
+ }
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
+
+ for (int i = 0; i < indent; ++i) {
+ (*sink) << " ";
+ }
+ (*sink) << "]";
+
+ return Status::OK();
+}
+
+Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options,
+ std::string* result) {
+ std::ostringstream sink;
+ RETURN_NOT_OK(PrettyPrint(chunked_arr, options, &sink));
+ *result = sink.str();
+ return Status::OK();
+}
+
+Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink) {
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const std::string& name = batch.column_name(i);
+ (*sink) << name << ": ";
+ RETURN_NOT_OK(PrettyPrint(*batch.column(i), indent + 2, sink));
+ (*sink) << "\n";
+ }
+ (*sink) << std::flush;
+ return Status::OK();
+}
+
+Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options,
+ std::ostream* sink) {
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const std::string& name = batch.column_name(i);
+ PrettyPrintOptions column_options = options;
+ column_options.indent += 2;
+
+ (*sink) << name << ": ";
+ RETURN_NOT_OK(PrettyPrint(*batch.column(i), column_options, sink));
+ (*sink) << "\n";
+ }
+ (*sink) << std::flush;
+ return Status::OK();
+}
+
+Status PrettyPrint(const Table& table, const PrettyPrintOptions& options,
+ std::ostream* sink) {
+ RETURN_NOT_OK(PrettyPrint(*table.schema(), options, sink));
+ (*sink) << "\n";
+ (*sink) << "----\n";
+
+ PrettyPrintOptions column_options = options;
+ column_options.indent += 2;
+ for (int i = 0; i < table.num_columns(); ++i) {
+ for (int j = 0; j < options.indent; ++j) {
+ (*sink) << " ";
+ }
+ (*sink) << table.schema()->field(i)->name() << ":\n";
+ RETURN_NOT_OK(PrettyPrint(*table.column(i), column_options, sink));
+ (*sink) << "\n";
+ }
+ (*sink) << std::flush;
+ return Status::OK();
+}
+
+Status DebugPrint(const Array& arr, int indent) {
+ return PrettyPrint(arr, indent, &std::cerr);
+}
+
+class SchemaPrinter : public PrettyPrinter {
+ public:
+ SchemaPrinter(const Schema& schema, const PrettyPrintOptions& options,
+ std::ostream* sink)
+ : PrettyPrinter(options, sink), schema_(schema) {}
+
+ Status PrintType(const DataType& type, bool nullable);
+ Status PrintField(const Field& field);
+
+ void PrintVerboseMetadata(const KeyValueMetadata& metadata) {
+ for (int64_t i = 0; i < metadata.size(); ++i) {
+ Newline();
+ Indent();
+ Write(metadata.key(i) + ": '" + metadata.value(i) + "'");
+ }
+ }
+
+ void PrintTruncatedMetadata(const KeyValueMetadata& metadata) {
+ for (int64_t i = 0; i < metadata.size(); ++i) {
+ Newline();
+ Indent();
+ size_t size = metadata.value(i).size();
+      // Use signed arithmetic so a long key or deep indent cannot wrap below zero
+      size_t truncated_size = static_cast<size_t>(std::max<int64_t>(
+          10, 70 - static_cast<int64_t>(metadata.key(i).size()) - indent_));
+ if (size <= truncated_size) {
+ Write(metadata.key(i) + ": '" + metadata.value(i) + "'");
+ continue;
+ }
+
+ Write(metadata.key(i) + ": '" + metadata.value(i).substr(0, truncated_size) +
+ "' + " + std::to_string(size - truncated_size));
+ }
+ }
+
+ void PrintMetadata(const std::string& metadata_type, const KeyValueMetadata& metadata) {
+ if (metadata.size() > 0) {
+ Newline();
+ Indent();
+ Write(metadata_type);
+ if (options_.truncate_metadata) {
+ PrintTruncatedMetadata(metadata);
+ } else {
+ PrintVerboseMetadata(metadata);
+ }
+ }
+ }
+
+ Status Print() {
+ for (int i = 0; i < schema_.num_fields(); ++i) {
+ if (i > 0) {
+ Newline();
+ Indent();
+ } else {
+ Indent();
+ }
+ RETURN_NOT_OK(PrintField(*schema_.field(i)));
+ }
+
+ if (options_.show_schema_metadata && schema_.metadata() != nullptr) {
+ PrintMetadata("-- schema metadata --", *schema_.metadata());
+ }
+ Flush();
+ return Status::OK();
+ }
+
+ private:
+ const Schema& schema_;
+};
+
+Status SchemaPrinter::PrintType(const DataType& type, bool nullable) {
+ Write(type.ToString());
+ if (!nullable) {
+ Write(" not null");
+ }
+ for (int i = 0; i < type.num_fields(); ++i) {
+ Newline();
+ Indent();
+
+ std::stringstream ss;
+ ss << "child " << i << ", ";
+
+ indent_ += options_.indent_size;
+ WriteIndented(ss.str());
+ RETURN_NOT_OK(PrintField(*type.field(i)));
+ indent_ -= options_.indent_size;
+ }
+ return Status::OK();
+}
+
+Status SchemaPrinter::PrintField(const Field& field) {
+ Write(field.name());
+ Write(": ");
+ RETURN_NOT_OK(PrintType(*field.type(), field.nullable()));
+
+ if (options_.show_field_metadata && field.metadata() != nullptr) {
+ indent_ += options_.indent_size;
+ PrintMetadata("-- field metadata --", *field.metadata());
+ indent_ -= options_.indent_size;
+ }
+ return Status::OK();
+}
+
+Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options,
+ std::ostream* sink) {
+ SchemaPrinter printer(schema, options, sink);
+ return printer.Print();
+}
+
+Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options,
+ std::string* result) {
+ std::ostringstream sink;
+ RETURN_NOT_OK(PrettyPrint(schema, options, &sink));
+ *result = sink.str();
+ return Status::OK();
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h
new file mode 100644
index 00000000000..1bc086a6889
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include <string>
+#include <utility>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class RecordBatch;
+class Schema;
+class Status;
+class Table;
+
+struct PrettyPrintOptions {
+ PrettyPrintOptions() = default;
+
+ PrettyPrintOptions(int indent_arg, // NOLINT runtime/explicit
+ int window_arg = 10, int indent_size_arg = 2,
+ std::string null_rep_arg = "null", bool skip_new_lines_arg = false,
+ bool truncate_metadata_arg = true)
+ : indent(indent_arg),
+ indent_size(indent_size_arg),
+ window(window_arg),
+ null_rep(std::move(null_rep_arg)),
+ skip_new_lines(skip_new_lines_arg),
+ truncate_metadata(truncate_metadata_arg) {}
+
+ static PrettyPrintOptions Defaults() { return PrettyPrintOptions(); }
+
+ /// Number of spaces to shift entire formatted object to the right
+ int indent = 0;
+
+ /// Size of internal indents
+ int indent_size = 2;
+
+ /// Maximum number of elements to show at the beginning and at the end.
+ int window = 10;
+
+ /// String to use for representing a null value, defaults to "null"
+ std::string null_rep = "null";
+
+ /// Skip new lines between elements, defaults to false
+ bool skip_new_lines = false;
+
+ /// Limit display of each KeyValueMetadata key/value pair to a single line at
+ /// 80 character width
+ bool truncate_metadata = true;
+
+ /// If true, display field metadata when pretty-printing a Schema
+ bool show_field_metadata = true;
+
+ /// If true, display schema metadata when pretty-printing a Schema
+ bool show_schema_metadata = true;
+};
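+
+// Example: render an array to a string with a narrow element window (a usage
+// sketch; `array` stands for any arrow::Array held by the caller):
+//
+//   arrow::PrettyPrintOptions options;
+//   options.window = 2;
+//   options.null_rep = "NA";
+//   std::string repr;
+//   arrow::Status st = arrow::PrettyPrint(array, options, &repr);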
+
+/// \brief Print human-readable representation of RecordBatch
+ARROW_EXPORT
+Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink);
+
+ARROW_EXPORT
+Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options,
+ std::ostream* sink);
+
+/// \brief Print human-readable representation of Table
+ARROW_EXPORT
+Status PrettyPrint(const Table& table, const PrettyPrintOptions& options,
+ std::ostream* sink);
+
+/// \brief Print human-readable representation of Array
+ARROW_EXPORT
+Status PrettyPrint(const Array& arr, int indent, std::ostream* sink);
+
+/// \brief Print human-readable representation of Array
+ARROW_EXPORT
+Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options,
+ std::ostream* sink);
+
+/// \brief Print human-readable representation of Array
+ARROW_EXPORT
+Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options,
+ std::string* result);
+
+/// \brief Print human-readable representation of ChunkedArray
+ARROW_EXPORT
+Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options,
+ std::ostream* sink);
+
+/// \brief Print human-readable representation of ChunkedArray
+ARROW_EXPORT
+Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options,
+ std::string* result);
+
+ARROW_EXPORT
+Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options,
+ std::ostream* sink);
+
+ARROW_EXPORT
+Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options,
+ std::string* result);
+
+ARROW_EXPORT
+Status DebugPrint(const Array& arr, int indent);
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc
new file mode 100644
index 00000000000..66f9e932b58
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc
@@ -0,0 +1,367 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/record_batch.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/array/validate.h"
+#include "arrow/pretty_print.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/util/atomic_shared_ptr.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/vector.h"
+
+namespace arrow {
+
+Result<std::shared_ptr<RecordBatch>> RecordBatch::AddColumn(
+ int i, std::string field_name, const std::shared_ptr<Array>& column) const {
+ auto field = ::arrow::field(std::move(field_name), column->type());
+ return AddColumn(i, field, column);
+}
+
+std::shared_ptr<Array> RecordBatch::GetColumnByName(const std::string& name) const {
+ auto i = schema_->GetFieldIndex(name);
+ return i == -1 ? NULLPTR : column(i);
+}
+
+int RecordBatch::num_columns() const { return schema_->num_fields(); }
+
+/// \class SimpleRecordBatch
+/// \brief A basic, non-lazy in-memory record batch
+class SimpleRecordBatch : public RecordBatch {
+ public:
+ SimpleRecordBatch(std::shared_ptr<Schema> schema, int64_t num_rows,
+ std::vector<std::shared_ptr<Array>> columns)
+ : RecordBatch(std::move(schema), num_rows), boxed_columns_(std::move(columns)) {
+ columns_.resize(boxed_columns_.size());
+ for (size_t i = 0; i < columns_.size(); ++i) {
+ columns_[i] = boxed_columns_[i]->data();
+ }
+ }
+
+  SimpleRecordBatch(std::shared_ptr<Schema> schema, int64_t num_rows,
+ std::vector<std::shared_ptr<ArrayData>> columns)
+ : RecordBatch(std::move(schema), num_rows), columns_(std::move(columns)) {
+ boxed_columns_.resize(schema_->num_fields());
+ }
+
+ const std::vector<std::shared_ptr<Array>>& columns() const override {
+ for (int i = 0; i < num_columns(); ++i) {
+ // Force all columns to be boxed
+ column(i);
+ }
+ return boxed_columns_;
+ }
+
+ std::shared_ptr<Array> column(int i) const override {
+ std::shared_ptr<Array> result = internal::atomic_load(&boxed_columns_[i]);
+ if (!result) {
+ result = MakeArray(columns_[i]);
+ internal::atomic_store(&boxed_columns_[i], result);
+ }
+ return result;
+ }
+
+ std::shared_ptr<ArrayData> column_data(int i) const override { return columns_[i]; }
+
+ const ArrayDataVector& column_data() const override { return columns_; }
+
+ Result<std::shared_ptr<RecordBatch>> AddColumn(
+ int i, const std::shared_ptr<Field>& field,
+ const std::shared_ptr<Array>& column) const override {
+ ARROW_CHECK(field != nullptr);
+ ARROW_CHECK(column != nullptr);
+
+ if (!field->type()->Equals(column->type())) {
+ return Status::TypeError("Column data type ", field->type()->name(),
+ " does not match field data type ",
+ column->type()->name());
+ }
+ if (column->length() != num_rows_) {
+ return Status::Invalid(
+ "Added column's length must match record batch's length. Expected length ",
+ num_rows_, " but got length ", column->length());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->AddField(i, field));
+ return RecordBatch::Make(std::move(new_schema), num_rows_,
+ internal::AddVectorElement(columns_, i, column->data()));
+ }
+
+ Result<std::shared_ptr<RecordBatch>> SetColumn(
+ int i, const std::shared_ptr<Field>& field,
+ const std::shared_ptr<Array>& column) const override {
+ ARROW_CHECK(field != nullptr);
+ ARROW_CHECK(column != nullptr);
+
+ if (!field->type()->Equals(column->type())) {
+ return Status::TypeError("Column data type ", field->type()->name(),
+ " does not match field data type ",
+ column->type()->name());
+ }
+ if (column->length() != num_rows_) {
+ return Status::Invalid(
+ "Added column's length must match record batch's length. Expected length ",
+ num_rows_, " but got length ", column->length());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->SetField(i, field));
+ return RecordBatch::Make(std::move(new_schema), num_rows_,
+ internal::ReplaceVectorElement(columns_, i, column->data()));
+ }
+
+ Result<std::shared_ptr<RecordBatch>> RemoveColumn(int i) const override {
+ ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->RemoveField(i));
+ return RecordBatch::Make(std::move(new_schema), num_rows_,
+ internal::DeleteVectorElement(columns_, i));
+ }
+
+ std::shared_ptr<RecordBatch> ReplaceSchemaMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& metadata) const override {
+ auto new_schema = schema_->WithMetadata(metadata);
+ return RecordBatch::Make(std::move(new_schema), num_rows_, columns_);
+ }
+
+ std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length) const override {
+ std::vector<std::shared_ptr<ArrayData>> arrays;
+ arrays.reserve(num_columns());
+ for (const auto& field : columns_) {
+ arrays.emplace_back(field->Slice(offset, length));
+ }
+ int64_t num_rows = std::min(num_rows_ - offset, length);
+ return std::make_shared<SimpleRecordBatch>(schema_, num_rows, std::move(arrays));
+ }
+
+ Status Validate() const override {
+ if (static_cast<int>(columns_.size()) != schema_->num_fields()) {
+ return Status::Invalid("Number of columns did not match schema");
+ }
+ return RecordBatch::Validate();
+ }
+
+ private:
+ std::vector<std::shared_ptr<ArrayData>> columns_;
+
+ // Caching boxed array data
+ mutable std::vector<std::shared_ptr<Array>> boxed_columns_;
+};
+
+RecordBatch::RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows)
+ : schema_(schema), num_rows_(num_rows) {}
+
+std::shared_ptr<RecordBatch> RecordBatch::Make(
+ std::shared_ptr<Schema> schema, int64_t num_rows,
+ std::vector<std::shared_ptr<Array>> columns) {
+ DCHECK_EQ(schema->num_fields(), static_cast<int>(columns.size()));
+  return std::make_shared<SimpleRecordBatch>(std::move(schema), num_rows,
+                                             std::move(columns));
+}
+
+std::shared_ptr<RecordBatch> RecordBatch::Make(
+ std::shared_ptr<Schema> schema, int64_t num_rows,
+ std::vector<std::shared_ptr<ArrayData>> columns) {
+ DCHECK_EQ(schema->num_fields(), static_cast<int>(columns.size()));
+ return std::make_shared<SimpleRecordBatch>(std::move(schema), num_rows,
+ std::move(columns));
+}
+
+Result<std::shared_ptr<RecordBatch>> RecordBatch::FromStructArray(
+ const std::shared_ptr<Array>& array) {
+ if (array->type_id() != Type::STRUCT) {
+ return Status::TypeError("Cannot construct record batch from array of type ",
+ *array->type());
+ }
+ if (array->null_count() != 0) {
+ return Status::Invalid(
+ "Unable to construct record batch from a StructArray with non-zero nulls.");
+ }
+ return Make(arrow::schema(array->type()->fields()), array->length(),
+ array->data()->child_data);
+}
+
+Result<std::shared_ptr<StructArray>> RecordBatch::ToStructArray() const {
+ if (num_columns() != 0) {
+ return StructArray::Make(columns(), schema()->fields());
+ }
+ return std::make_shared<StructArray>(arrow::struct_({}), num_rows_,
+ std::vector<std::shared_ptr<Array>>{},
+ /*null_bitmap=*/nullptr,
+ /*null_count=*/0,
+ /*offset=*/0);
+}
+
+const std::string& RecordBatch::column_name(int i) const {
+ return schema_->field(i)->name();
+}
+
+bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata) const {
+ if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) {
+ return false;
+ }
+
+ if (check_metadata) {
+ if (!schema_->Equals(*other.schema(), /*check_metadata=*/true)) {
+ return false;
+ }
+ }
+
+ for (int i = 0; i < num_columns(); ++i) {
+ if (!column(i)->Equals(other.column(i))) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool RecordBatch::ApproxEquals(const RecordBatch& other) const {
+ if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) {
+ return false;
+ }
+
+ for (int i = 0; i < num_columns(); ++i) {
+ if (!column(i)->ApproxEquals(other.column(i))) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+Result<std::shared_ptr<RecordBatch>> RecordBatch::SelectColumns(
+ const std::vector<int>& indices) const {
+ int n = static_cast<int>(indices.size());
+
+ FieldVector fields(n);
+ ArrayVector columns(n);
+
+ for (int i = 0; i < n; i++) {
+ int pos = indices[i];
+ if (pos < 0 || pos > num_columns() - 1) {
+ return Status::Invalid("Invalid column index ", pos, " to select columns.");
+ }
+ fields[i] = schema()->field(pos);
+ columns[i] = column(pos);
+ }
+
+ auto new_schema =
+ std::make_shared<arrow::Schema>(std::move(fields), schema()->metadata());
+ return RecordBatch::Make(std::move(new_schema), num_rows(), std::move(columns));
+}
+
+std::shared_ptr<RecordBatch> RecordBatch::Slice(int64_t offset) const {
+ return Slice(offset, this->num_rows() - offset);
+}
+
+std::string RecordBatch::ToString() const {
+ std::stringstream ss;
+ ARROW_CHECK_OK(PrettyPrint(*this, 0, &ss));
+ return ss.str();
+}
+
+Status RecordBatch::Validate() const {
+ for (int i = 0; i < num_columns(); ++i) {
+ const auto& array = *this->column(i);
+ if (array.length() != num_rows_) {
+ return Status::Invalid("Number of rows in column ", i,
+ " did not match batch: ", array.length(), " vs ", num_rows_);
+ }
+ const auto& schema_type = *schema_->field(i)->type();
+ if (!array.type()->Equals(schema_type)) {
+ return Status::Invalid("Column ", i,
+ " type not match schema: ", array.type()->ToString(), " vs ",
+ schema_type.ToString());
+ }
+ RETURN_NOT_OK(internal::ValidateArray(array));
+ }
+ return Status::OK();
+}
+
+Status RecordBatch::ValidateFull() const {
+ RETURN_NOT_OK(Validate());
+ for (int i = 0; i < num_columns(); ++i) {
+ const auto& array = *this->column(i);
+ RETURN_NOT_OK(internal::ValidateArrayFull(array));
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Base record batch reader
+
+Status RecordBatchReader::ReadAll(std::vector<std::shared_ptr<RecordBatch>>* batches) {
+ while (true) {
+ std::shared_ptr<RecordBatch> batch;
+ RETURN_NOT_OK(ReadNext(&batch));
+ if (!batch) {
+ break;
+ }
+ batches->emplace_back(std::move(batch));
+ }
+ return Status::OK();
+}
+
+Status RecordBatchReader::ReadAll(std::shared_ptr<Table>* table) {
+ std::vector<std::shared_ptr<RecordBatch>> batches;
+ RETURN_NOT_OK(ReadAll(&batches));
+ return Table::FromRecordBatches(schema(), std::move(batches)).Value(table);
+}
+
+class SimpleRecordBatchReader : public RecordBatchReader {
+ public:
+ SimpleRecordBatchReader(Iterator<std::shared_ptr<RecordBatch>> it,
+ std::shared_ptr<Schema> schema)
+ : schema_(std::move(schema)), it_(std::move(it)) {}
+
+ SimpleRecordBatchReader(std::vector<std::shared_ptr<RecordBatch>> batches,
+ std::shared_ptr<Schema> schema)
+ : schema_(std::move(schema)), it_(MakeVectorIterator(std::move(batches))) {}
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* batch) override {
+ return it_.Next().Value(batch);
+ }
+
+ std::shared_ptr<Schema> schema() const override { return schema_; }
+
+ protected:
+ std::shared_ptr<Schema> schema_;
+ Iterator<std::shared_ptr<RecordBatch>> it_;
+};
+
+Result<std::shared_ptr<RecordBatchReader>> RecordBatchReader::Make(
+ std::vector<std::shared_ptr<RecordBatch>> batches, std::shared_ptr<Schema> schema) {
+ if (schema == nullptr) {
+    if (batches.empty() || batches[0] == nullptr) {
+ return Status::Invalid("Cannot infer schema from empty vector or nullptr");
+ }
+
+ schema = batches[0]->schema();
+ }
+
+ return std::make_shared<SimpleRecordBatchReader>(std::move(batches), schema);
+}
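+
+// Usage sketch (illustrative only): wrap pre-materialized batches in a
+// reader, then drain it into a Table via ReadAll().
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto reader,
+//       arrow::RecordBatchReader::Make(std::move(batches), /*schema=*/nullptr));
+//   std::shared_ptr<arrow::Table> table;
+//   ARROW_RETURN_NOT_OK(reader->ReadAll(&table));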
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h
new file mode 100644
index 00000000000..3dc1f54a083
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h
@@ -0,0 +1,238 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \class RecordBatch
+/// \brief Collection of equal-length arrays matching a particular Schema
+///
+/// A record batch is a table-like data structure that is semantically a
+/// sequence of fields, each of which is a contiguous Arrow array
+class ARROW_EXPORT RecordBatch {
+ public:
+ virtual ~RecordBatch() = default;
+
+ /// \param[in] schema The record batch schema
+  /// \param[in] num_rows the number of rows in the record batch; each array
+  /// in columns should have this same length
+ /// \param[in] columns the record batch fields as vector of arrays
+ static std::shared_ptr<RecordBatch> Make(std::shared_ptr<Schema> schema,
+ int64_t num_rows,
+ std::vector<std::shared_ptr<Array>> columns);
+
+ /// \brief Construct record batch from vector of internal data structures
+ /// \since 0.5.0
+ ///
+  /// This overload is intended for internal use, or for advanced users.
+ ///
+ /// \param schema the record batch schema
+ /// \param num_rows the number of semantic rows in the record batch. This
+ /// should be equal to the length of each field
+ /// \param columns the data for the batch's columns
+ static std::shared_ptr<RecordBatch> Make(
+ std::shared_ptr<Schema> schema, int64_t num_rows,
+ std::vector<std::shared_ptr<ArrayData>> columns);
+
+ /// \brief Convert record batch to struct array
+ ///
+ /// Create a struct array whose child arrays are the record batch's columns.
+ /// Note that the record batch's top-level field metadata cannot be reflected
+ /// in the resulting struct array.
+ Result<std::shared_ptr<StructArray>> ToStructArray() const;
+
+ /// \brief Construct record batch from struct array
+ ///
+ /// This constructs a record batch using the child arrays of the given
+ /// array, which must be a struct array. Note that the struct array's own
+ /// null bitmap is not reflected in the resulting record batch.
+ static Result<std::shared_ptr<RecordBatch>> FromStructArray(
+ const std::shared_ptr<Array>& array);
+
+ /// \brief Determine if two record batches are exactly equal
+ ///
+ /// \param[in] other the RecordBatch to compare with
+ /// \param[in] check_metadata if true, check that Schema metadata is the same
+ /// \return true if batches are equal
+ bool Equals(const RecordBatch& other, bool check_metadata = false) const;
+
+ /// \brief Determine if two record batches are approximately equal
+ bool ApproxEquals(const RecordBatch& other) const;
+
+  /// \return the record batch's schema
+ const std::shared_ptr<Schema>& schema() const { return schema_; }
+
+ /// \brief Retrieve all columns at once
+ virtual const std::vector<std::shared_ptr<Array>>& columns() const = 0;
+
+ /// \brief Retrieve an array from the record batch
+  /// \param[in] i field index, not bounds-checked
+ /// \return an Array object
+ virtual std::shared_ptr<Array> column(int i) const = 0;
+
+ /// \brief Retrieve an array from the record batch
+ /// \param[in] name field name
+ /// \return an Array or null if no field was found
+ std::shared_ptr<Array> GetColumnByName(const std::string& name) const;
+
+ /// \brief Retrieve an array's internal data from the record batch
+  /// \param[in] i field index, not bounds-checked
+ /// \return an internal ArrayData object
+ virtual std::shared_ptr<ArrayData> column_data(int i) const = 0;
+
+ /// \brief Retrieve all arrays' internal data from the record batch.
+ virtual const ArrayDataVector& column_data() const = 0;
+
+ /// \brief Add column to the record batch, producing a new RecordBatch
+ ///
+  /// \param[in] i field index, which will be bounds-checked
+ /// \param[in] field field to be added
+ /// \param[in] column column to be added
+ virtual Result<std::shared_ptr<RecordBatch>> AddColumn(
+ int i, const std::shared_ptr<Field>& field,
+ const std::shared_ptr<Array>& column) const = 0;
+
+ /// \brief Add new nullable column to the record batch, producing a new
+ /// RecordBatch.
+ ///
+ /// For non-nullable columns, use the Field-based version of this method.
+ ///
+  /// \param[in] i field index, which will be bounds-checked
+ /// \param[in] field_name name of field to be added
+ /// \param[in] column column to be added
+ virtual Result<std::shared_ptr<RecordBatch>> AddColumn(
+ int i, std::string field_name, const std::shared_ptr<Array>& column) const;
+
+  /// \brief Replace a column in the record batch, producing a new RecordBatch
+ virtual Result<std::shared_ptr<RecordBatch>> SetColumn(
+ int i, const std::shared_ptr<Field>& field,
+ const std::shared_ptr<Array>& column) const = 0;
+
+ /// \brief Remove column from the record batch, producing a new RecordBatch
+ ///
+  /// \param[in] i field index, which will be bounds-checked
+ virtual Result<std::shared_ptr<RecordBatch>> RemoveColumn(int i) const = 0;
+
+ virtual std::shared_ptr<RecordBatch> ReplaceSchemaMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
+
+  /// \brief Name of the i-th column
+ const std::string& column_name(int i) const;
+
+  /// \return the number of columns in the record batch
+ int num_columns() const;
+
+ /// \return the number of rows (the corresponding length of each column)
+ int64_t num_rows() const { return num_rows_; }
+
+ /// \brief Slice each of the arrays in the record batch
+  /// \param[in] offset the starting offset; the slice extends to the end of
+  /// the batch
+ /// \return new record batch
+ virtual std::shared_ptr<RecordBatch> Slice(int64_t offset) const;
+
+ /// \brief Slice each of the arrays in the record batch
+ /// \param[in] offset the starting offset to slice
+ /// \param[in] length the number of elements to slice from offset
+ /// \return new record batch
+ virtual std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length) const = 0;
+
+ /// \return PrettyPrint representation suitable for debugging
+ std::string ToString() const;
+
+ /// \brief Return new record batch with specified columns
+ Result<std::shared_ptr<RecordBatch>> SelectColumns(
+ const std::vector<int>& indices) const;
+
+  /// \brief Perform cheap validation checks to detect obvious inconsistencies
+ /// within the record batch's schema and internal data.
+ ///
+  /// This is O(k) where k is the total number of fields and array descendants.
+ ///
+ /// \return Status
+ virtual Status Validate() const;
+
+  /// \brief Perform extensive validation checks to detect inconsistencies
+ /// within the record batch's schema and internal data.
+ ///
+ /// This is potentially O(k*n) where n is the number of rows.
+ ///
+ /// \return Status
+ virtual Status ValidateFull() const;
+
+ protected:
+ RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows);
+
+ std::shared_ptr<Schema> schema_;
+ int64_t num_rows_;
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatch);
+};
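+
+// A minimal construction sketch (illustrative only), assuming `a` and `b` are
+// equal-length arrays built elsewhere:
+//
+//   auto schema = arrow::schema({arrow::field("x", a->type()),
+//                                arrow::field("y", b->type())});
+//   auto batch = arrow::RecordBatch::Make(schema, a->length(), {a, b});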
+
+/// \brief Abstract interface for reading stream of record batches
+class ARROW_EXPORT RecordBatchReader {
+ public:
+ using ValueType = std::shared_ptr<RecordBatch>;
+
+ virtual ~RecordBatchReader() = default;
+
+ /// \return the shared schema of the record batches in the stream
+ virtual std::shared_ptr<Schema> schema() const = 0;
+
+  /// \brief Read the next record batch in the stream. Sets `batch` to null
+  /// when the end of the stream is reached
+ ///
+ /// \param[out] batch the next loaded batch, null at end of stream
+ /// \return Status
+ virtual Status ReadNext(std::shared_ptr<RecordBatch>* batch) = 0;
+
+ /// \brief Iterator interface
+ Result<std::shared_ptr<RecordBatch>> Next() {
+ std::shared_ptr<RecordBatch> batch;
+ ARROW_RETURN_NOT_OK(ReadNext(&batch));
+ return batch;
+ }
+
+ /// \brief Consume entire stream as a vector of record batches
+ Status ReadAll(RecordBatchVector* batches);
+
+ /// \brief Read all batches and concatenate as arrow::Table
+ Status ReadAll(std::shared_ptr<Table>* table);
+
+ /// \brief Create a RecordBatchReader from a vector of RecordBatch.
+ ///
+ /// \param[in] batches the vector of RecordBatch to read from
+ /// \param[in] schema schema to conform to. Will be inferred from the first
+ /// element if not provided.
+ static Result<std::shared_ptr<RecordBatchReader>> Make(
+ RecordBatchVector batches, std::shared_ptr<Schema> schema = NULLPTR);
+};
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/result.cc b/contrib/libs/apache/arrow/cpp/src/arrow/result.cc
new file mode 100644
index 00000000000..0bb65acb831
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/result.cc
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/result.h"
+
+#include <string>
+
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+namespace internal {
+
+void DieWithMessage(const std::string& msg) { ARROW_LOG(FATAL) << msg; }
+
+void InvalidValueOrDie(const Status& st) {
+ DieWithMessage(std::string("ValueOrDie called on an error: ") + st.ToString());
+}
+
+} // namespace internal
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/result.h b/contrib/libs/apache/arrow/cpp/src/arrow/result.h
new file mode 100644
index 00000000000..cb7437cd242
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/result.h
@@ -0,0 +1,519 @@
+//
+// Copyright 2017 Asylo authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+// Adapted from Asylo
+
+#pragma once
+
+#include <cstddef>
+#include <new>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/status.h"
+#include "arrow/util/compare.h"
+
+namespace arrow {
+
+template <typename>
+struct EnsureResult;
+
+namespace internal {
+
+#if __cplusplus >= 201703L
+using std::launder;
+#else
+template <class T>
+constexpr T* launder(T* p) noexcept {
+ return p;
+}
+#endif
+
+ARROW_EXPORT void DieWithMessage(const std::string& msg);
+
+ARROW_EXPORT void InvalidValueOrDie(const Status& st);
+
+} // namespace internal
+
+/// A class for representing either a usable value, or an error.
+///
+/// A Result object either contains a value of type `T` or a Status object
+/// explaining why such a value is not present. The type `T` must be
+/// copy-constructible and/or move-constructible.
+///
+/// The state of a Result object may be determined by calling ok() or
+/// status(). The ok() method returns true if the object contains a valid value.
+/// The status() method returns the internal Status object. A Result object
+/// that contains a valid value will return an OK Status for a call to status().
+///
+/// A value of type `T` may be extracted from a Result object through a call
+/// to ValueOrDie(). This function should only be called if a call to ok()
+/// returns true. Sample usage:
+///
+/// ```
+/// arrow::Result<Foo> result = CalculateFoo();
+/// if (result.ok()) {
+/// Foo foo = result.ValueOrDie();
+/// foo.DoSomethingCool();
+/// } else {
+/// ARROW_LOG(ERROR) << result.status();
+/// }
+/// ```
+///
+/// If `T` is a move-only type, like `std::unique_ptr<>`, then the value should
+/// only be extracted after invoking `std::move()` on the Result object.
+/// Sample usage:
+///
+/// ```
+/// arrow::Result<std::unique_ptr<Foo>> result = CalculateFoo();
+/// if (result.ok()) {
+/// std::unique_ptr<Foo> foo = std::move(result).ValueOrDie();
+/// foo->DoSomethingCool();
+/// } else {
+/// ARROW_LOG(ERROR) << result.status();
+/// }
+/// ```
+///
+/// Result is provided for the convenience of implementing functions that
+/// return some value but may fail during execution. For instance, consider a
+/// function with the following signature:
+///
+/// ```
+/// arrow::Status CalculateFoo(int *output);
+/// ```
+///
+/// This function may instead be written as:
+///
+/// ```
+/// arrow::Result<int> CalculateFoo();
+/// ```
+template <class T>
+class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
+ template <typename U>
+ friend class Result;
+
+ static_assert(!std::is_same<T, Status>::value,
+ "this assert indicates you have probably made a metaprogramming error");
+
+ public:
+ using ValueType = T;
+
+ /// Constructs a Result object that contains a non-OK status.
+ ///
+ /// This constructor is marked `explicit` to prevent attempts to `return {}`
+ /// from a function with a return type of, for example,
+ /// `Result<std::vector<int>>`. While `return {}` seems like it would return
+ /// an empty vector, it will actually invoke the default constructor of
+ /// Result.
+ explicit Result() // NOLINT(runtime/explicit)
+ : status_(Status::UnknownError("Uninitialized Result<T>")) {}
+
+ ~Result() noexcept { Destroy(); }
+
+ /// Constructs a Result object with the given non-OK Status object. All
+ /// calls to ValueOrDie() on this object will abort. The given `status` must
+ /// not be an OK status, otherwise this constructor will abort.
+ ///
+ /// This constructor is not declared explicit so that a function with a return
+ /// type of `Result<T>` can return a Status object, and the status will be
+ /// implicitly converted to the appropriate return type as a matter of
+ /// convenience.
+ ///
+ /// \param status The non-OK Status object to initialize to.
+ Result(const Status& status) // NOLINT(runtime/explicit)
+ : status_(status) {
+ if (ARROW_PREDICT_FALSE(status.ok())) {
+ internal::DieWithMessage(std::string("Constructed with a non-error status: ") +
+ status.ToString());
+ }
+ }
+
+ /// Constructs a Result object that contains `value`. The resulting object
+ /// is considered to have an OK status. The wrapped element can be accessed
+ /// with ValueOrDie().
+ ///
+ /// This constructor is made implicit so that a function with a return type of
+ /// `Result<T>` can return an object of type `U &&`, implicitly converting
+ /// it to a `Result<T>` object.
+ ///
+ /// Note that `T` must be implicitly constructible from `U`, and `U` must not
+ /// be a (cv-qualified) Status or Status-reference type. Due to C++
+ /// reference-collapsing rules and perfect-forwarding semantics, this
+ /// constructor matches invocations that pass `value` either as a const
+ /// reference or as an rvalue reference. Since Result needs to work for both
+ /// reference and rvalue-reference types, the constructor uses perfect
+ /// forwarding to avoid invalidating arguments that were passed by reference.
+ /// See http://thbecker.net/articles/rvalue_references/section_08.html for
+ /// additional details.
+ ///
+ /// \param value The value to initialize to.
+ template <typename U,
+ typename E = typename std::enable_if<
+ std::is_constructible<T, U>::value && std::is_convertible<U, T>::value &&
+ !std::is_same<typename std::remove_reference<
+ typename std::remove_cv<U>::type>::type,
+ Status>::value>::type>
+ Result(U&& value) noexcept { // NOLINT(runtime/explicit)
+ ConstructValue(std::forward<U>(value));
+ }
+
+ /// Constructs a Result object that contains `value`. The resulting object
+ /// is considered to have an OK status. The wrapped element can be accessed
+ /// with ValueOrDie().
+ ///
+ /// This constructor is made implicit so that a function with a return type of
+ /// `Result<T>` can return an object of type `T`, implicitly converting
+ /// it to a `Result<T>` object.
+ ///
+ /// \param value The value to initialize to.
+ // NOTE `Result(U&& value)` above should be sufficient, but some compilers
+ // fail matching it.
+ Result(T&& value) noexcept { // NOLINT(runtime/explicit)
+ ConstructValue(std::move(value));
+ }
+
+ /// Copy constructor.
+ ///
+ /// This constructor needs to be explicitly defined because the presence of
+ /// the move-assignment operator deletes the default copy constructor. In such
+ /// a scenario, since the deleted copy constructor has stricter binding rules
+ /// than the templated copy constructor, the templated constructor cannot act
+ /// as a copy constructor, and any attempt to copy-construct a `Result`
+ /// object results in a compilation error.
+ ///
+ /// \param other The value to copy from.
+ Result(const Result& other) : status_(other.status_) {
+ if (ARROW_PREDICT_TRUE(status_.ok())) {
+ ConstructValue(other.ValueUnsafe());
+ }
+ }
+
+ /// Templatized constructor that constructs a `Result<T>` from a const
+ /// reference to a `Result<U>`.
+ ///
+ /// `T` must be implicitly constructible from `const U &`.
+ ///
+ /// \param other The value to copy from.
+ template <typename U, typename E = typename std::enable_if<
+ std::is_constructible<T, const U&>::value &&
+ std::is_convertible<U, T>::value>::type>
+ Result(const Result<U>& other) : status_(other.status_) {
+ if (ARROW_PREDICT_TRUE(status_.ok())) {
+ ConstructValue(other.ValueUnsafe());
+ }
+ }
+
+ /// Copy-assignment operator.
+ ///
+ /// \param other The Result object to copy.
+ Result& operator=(const Result& other) {
+ // Check for self-assignment.
+ if (this == &other) {
+ return *this;
+ }
+ Destroy();
+ status_ = other.status_;
+ if (ARROW_PREDICT_TRUE(status_.ok())) {
+ ConstructValue(other.ValueUnsafe());
+ }
+ return *this;
+ }
+
+ /// Templatized constructor which constructs a `Result<T>` by moving the
+ /// contents of a `Result<U>`. `T` must be implicitly constructible from `U
+ /// &&`.
+ ///
+  /// Leaves `other` in a moved-from state.
+ ///
+ /// \param other The Result object to move from and set to a non-OK status.
+ template <typename U,
+ typename E = typename std::enable_if<std::is_constructible<T, U&&>::value &&
+ std::is_convertible<U, T>::value>::type>
+ Result(Result<U>&& other) noexcept {
+ if (ARROW_PREDICT_TRUE(other.status_.ok())) {
+ status_ = std::move(other.status_);
+ ConstructValue(other.MoveValueUnsafe());
+ } else {
+      // Copy the status instead of moving it: a moved-from status could read
+      // as OK even though `other`'s value was never constructed, and `other`'s
+      // destructor would then try to destroy a nonexistent value.
+ status_ = other.status_;
+ }
+ }
+
+ /// Move-assignment operator.
+ ///
+  /// Sets `other` to an invalid state.
+ ///
+ /// \param other The Result object to assign from and set to a non-OK
+ /// status.
+ Result& operator=(Result&& other) noexcept {
+ // Check for self-assignment.
+ if (this == &other) {
+ return *this;
+ }
+ Destroy();
+ if (ARROW_PREDICT_TRUE(other.status_.ok())) {
+ status_ = std::move(other.status_);
+ ConstructValue(other.MoveValueUnsafe());
+ } else {
+      // Copy the status instead of moving it: a moved-from status could read
+      // as OK even though `other`'s value was never constructed, and `other`'s
+      // destructor would then try to destroy a nonexistent value.
+ status_ = other.status_;
+ }
+ return *this;
+ }
+
+ /// Compare to another Result.
+ bool Equals(const Result& other) const {
+ if (ARROW_PREDICT_TRUE(status_.ok())) {
+ return other.status_.ok() && ValueUnsafe() == other.ValueUnsafe();
+ }
+ return status_ == other.status_;
+ }
+
+  /// Indicates whether the object contains a `T` value. Generally, instead of
+  /// accessing this directly, you will want to use ARROW_ASSIGN_OR_RAISE,
+  /// defined below.
+ ///
+ /// \return True if this Result object's status is OK (i.e. a call to ok()
+ /// returns true). If this function returns true, then it is safe to access
+ /// the wrapped element through a call to ValueOrDie().
+ bool ok() const { return status_.ok(); }
+
+ /// \brief Equivalent to ok().
+ // operator bool() const { return ok(); }
+
+ /// Gets the stored status object, or an OK status if a `T` value is stored.
+ ///
+ /// \return The stored non-OK status object, or an OK status if this object
+ /// has a value.
+ const Status& status() const { return status_; }
+
+ /// Gets the stored `T` value.
+ ///
+ /// This method should only be called if this Result object's status is OK
+ /// (i.e. a call to ok() returns true), otherwise this call will abort.
+ ///
+ /// \return The stored `T` value.
+ const T& ValueOrDie() const& {
+ if (ARROW_PREDICT_FALSE(!ok())) {
+ internal::InvalidValueOrDie(status_);
+ }
+ return ValueUnsafe();
+ }
+ const T& operator*() const& { return ValueOrDie(); }
+ const T* operator->() const { return &ValueOrDie(); }
+
+ /// Gets a mutable reference to the stored `T` value.
+ ///
+ /// This method should only be called if this Result object's status is OK
+ /// (i.e. a call to ok() returns true), otherwise this call will abort.
+ ///
+ /// \return The stored `T` value.
+ T& ValueOrDie() & {
+ if (ARROW_PREDICT_FALSE(!ok())) {
+ internal::InvalidValueOrDie(status_);
+ }
+ return ValueUnsafe();
+ }
+ T& operator*() & { return ValueOrDie(); }
+ T* operator->() { return &ValueOrDie(); }
+
+ /// Moves and returns the internally-stored `T` value.
+ ///
+ /// This method should only be called if this Result object's status is OK
+ /// (i.e. a call to ok() returns true), otherwise this call will abort. The
+ /// Result object is invalidated after this call and will be updated to
+ /// contain a non-OK status.
+ ///
+ /// \return The stored `T` value.
+ T ValueOrDie() && {
+ if (ARROW_PREDICT_FALSE(!ok())) {
+ internal::InvalidValueOrDie(status_);
+ }
+ return MoveValueUnsafe();
+ }
+ T operator*() && { return std::move(*this).ValueOrDie(); }
+
+ /// Helper method for implementing Status returning functions in terms of semantically
+ /// equivalent Result returning functions. For example:
+ ///
+ /// Status GetInt(int *out) { return GetInt().Value(out); }
+ template <typename U, typename E = typename std::enable_if<
+ std::is_constructible<U, T>::value>::type>
+ Status Value(U* out) && {
+ if (!ok()) {
+ return status();
+ }
+ *out = U(MoveValueUnsafe());
+ return Status::OK();
+ }
+
+ /// Move and return the internally stored value or alternative if an error is stored.
+ T ValueOr(T alternative) && {
+ if (!ok()) {
+ return alternative;
+ }
+ return MoveValueUnsafe();
+ }
+
+  /// Retrieve the value if ok(), falling back to an alternative generated by
+  /// the provided factory.
+ template <typename G>
+ T ValueOrElse(G&& generate_alternative) && {
+ if (ok()) {
+ return MoveValueUnsafe();
+ }
+ return generate_alternative();
+ }
+
+ /// Apply a function to the internally stored value to produce a new result or propagate
+ /// the stored error.
+ template <typename M>
+ typename EnsureResult<typename std::result_of<M && (T)>::type>::type Map(M&& m) && {
+ if (!ok()) {
+ return status();
+ }
+ return std::forward<M>(m)(MoveValueUnsafe());
+ }
+
+ /// Apply a function to the internally stored value to produce a new result or propagate
+ /// the stored error.
+ template <typename M>
+ typename EnsureResult<typename std::result_of<M && (const T&)>::type>::type Map(
+ M&& m) const& {
+ if (!ok()) {
+ return status();
+ }
+ return std::forward<M>(m)(ValueUnsafe());
+ }
+
+ /// Cast the internally stored value to produce a new result or propagate the stored
+ /// error.
+ template <typename U, typename E = typename std::enable_if<
+ std::is_constructible<U, T>::value>::type>
+ Result<U> As() && {
+ if (!ok()) {
+ return status();
+ }
+ return U(MoveValueUnsafe());
+ }
+
+ /// Cast the internally stored value to produce a new result or propagate the stored
+ /// error.
+ template <typename U, typename E = typename std::enable_if<
+ std::is_constructible<U, const T&>::value>::type>
+ Result<U> As() const& {
+ if (!ok()) {
+ return status();
+ }
+ return U(ValueUnsafe());
+ }
+
+ const T& ValueUnsafe() const& {
+ return *internal::launder(reinterpret_cast<const T*>(&data_));
+ }
+
+ T& ValueUnsafe() & { return *internal::launder(reinterpret_cast<T*>(&data_)); }
+
+ T ValueUnsafe() && { return MoveValueUnsafe(); }
+
+ T MoveValueUnsafe() {
+ return std::move(*internal::launder(reinterpret_cast<T*>(&data_)));
+ }
+
+ private:
+ Status status_; // pointer-sized
+ typename std::aligned_storage<sizeof(T), alignof(T)>::type data_;
+
+ template <typename U>
+ void ConstructValue(U&& u) {
+ new (&data_) T(std::forward<U>(u));
+ }
+
+ void Destroy() {
+ if (ARROW_PREDICT_TRUE(status_.ok())) {
+ static_assert(offsetof(Result<T>, status_) == 0,
+ "Status is guaranteed to be at the start of Result<>");
+ internal::launder(reinterpret_cast<const T*>(&data_))->~T();
+ }
+ }
+};
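+
+// A small illustrative sketch (not part of the library) of the combinators
+// above; `ParsePort` is a hypothetical function returning Result<int>.
+//
+//   arrow::Result<int> ParsePort(const std::string& s);
+//
+//   // Fall back to a default when an error is stored:
+//   int port = ParsePort(input).ValueOr(8080);
+//
+//   // Transform the stored value, propagating any stored error:
+//   arrow::Result<std::string> label =
+//       ParsePort(input).Map([](int p) { return "port:" + std::to_string(p); });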
+
+#define ARROW_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
+ auto&& result_name = (rexpr); \
+ ARROW_RETURN_IF_(!(result_name).ok(), (result_name).status(), ARROW_STRINGIFY(rexpr)); \
+ lhs = std::move(result_name).ValueUnsafe();
+
+#define ARROW_ASSIGN_OR_RAISE_NAME(x, y) ARROW_CONCAT(x, y)
+
+/// \brief Execute an expression that returns a Result, extracting its value
+/// into the variable defined by `lhs` (or returning a Status on error).
+///
+/// Example: Assigning to a new value:
+/// ARROW_ASSIGN_OR_RAISE(auto value, MaybeGetValue(arg));
+///
+/// Example: Assigning to an existing value:
+/// ValueType value;
+/// ARROW_ASSIGN_OR_RAISE(value, MaybeGetValue(arg));
+///
+/// WARNING: ARROW_ASSIGN_OR_RAISE expands into multiple statements;
+/// it cannot be used as a single statement (e.g. as the body of an if
+/// statement without {})!
+///
+/// WARNING: ARROW_ASSIGN_OR_RAISE `std::move`s its right operand. If you have
+/// an lvalue Result which you *don't* want to move out of, cast appropriately.
+///
+/// WARNING: ARROW_ASSIGN_OR_RAISE is not a single expression; it will not
+/// maintain lifetimes of all temporaries in `rexpr` (e.g.
+/// `ARROW_ASSIGN_OR_RAISE(auto x, MakeTemp().GetResultRef());`
+/// will most likely segfault)!
+#define ARROW_ASSIGN_OR_RAISE(lhs, rexpr) \
+ ARROW_ASSIGN_OR_RAISE_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+ lhs, rexpr);
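+
+// Illustrative example of the macro in context: RecordBatchReader::Next()
+// returns Result<std::shared_ptr<RecordBatch>>, so any error propagates as a
+// Status return.
+//
+//   arrow::Status CopyNextBatch(arrow::RecordBatchReader* reader,
+//                               std::shared_ptr<arrow::RecordBatch>* out) {
+//     ARROW_ASSIGN_OR_RAISE(*out, reader->Next());
+//     return arrow::Status::OK();
+//   }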
+
+namespace internal {
+
+template <typename T>
+inline const Status& GenericToStatus(const Result<T>& res) {
+ return res.status();
+}
+
+template <typename T>
+inline Status GenericToStatus(Result<T>&& res) {
+ return std::move(res).status();
+}
+
+} // namespace internal
+
+template <typename T, typename R = typename EnsureResult<T>::type>
+R ToResult(T t) {
+ return R(std::move(t));
+}
+
+template <typename T>
+struct EnsureResult {
+ using type = Result<T>;
+};
+
+template <typename T>
+struct EnsureResult<Result<T>> {
+ using type = Result<T>;
+};
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/result_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/result_internal.h
new file mode 100644
index 00000000000..7550f945d85
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/result_internal.h
@@ -0,0 +1,22 @@
+//
+// Copyright 2017 Asylo authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#pragma once
+
+#include "arrow/result.h"
+
+#ifndef ASSIGN_OR_RAISE
+#define ASSIGN_OR_RAISE(lhs, rhs) ARROW_ASSIGN_OR_RAISE(lhs, rhs)
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc
new file mode 100644
index 00000000000..cb7755ba3f1
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc
@@ -0,0 +1,659 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/scalar.h"
+
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/array/util.h"
+#include "arrow/buffer.h"
+#include "arrow/compare.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/formatting.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/time.h"
+#include "arrow/util/value_parsing.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+bool Scalar::Equals(const Scalar& other, const EqualOptions& options) const {
+ return ScalarEquals(*this, other, options);
+}
+
+bool Scalar::ApproxEquals(const Scalar& other, const EqualOptions& options) const {
+ return ScalarApproxEquals(*this, other, options);
+}
+
+struct ScalarHashImpl {
+ static std::hash<std::string> string_hash;
+
+ Status Visit(const NullScalar& s) { return Status::OK(); }
+
+ template <typename T>
+ Status Visit(const internal::PrimitiveScalar<T>& s) {
+ return ValueHash(s);
+ }
+
+ Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); }
+
+ template <typename T>
+ Status Visit(const TemporalScalar<T>& s) {
+ return ValueHash(s);
+ }
+
+  Status Visit(const DayTimeIntervalScalar& s) {
+    return StdHash(s.value.days) & StdHash(s.value.milliseconds);
+  }
+
+ Status Visit(const Decimal128Scalar& s) {
+ return StdHash(s.value.low_bits()) & StdHash(s.value.high_bits());
+ }
+
+ Status Visit(const Decimal256Scalar& s) {
+ Status status = Status::OK();
+ for (uint64_t elem : s.value.little_endian_array()) {
+ status &= StdHash(elem);
+ }
+ return status;
+ }
+
+ Status Visit(const BaseListScalar& s) { return ArrayHash(*s.value); }
+
+ Status Visit(const StructScalar& s) {
+ for (const auto& child : s.value) {
+ AccumulateHashFrom(*child);
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryScalar& s) {
+ AccumulateHashFrom(*s.value.index);
+ return Status::OK();
+ }
+
+ // TODO(bkietz) implement less wimpy hashing when these have ValueType
+ Status Visit(const UnionScalar& s) { return Status::OK(); }
+ Status Visit(const ExtensionScalar& s) { return Status::OK(); }
+
+ template <typename T>
+ Status StdHash(const T& t) {
+ static std::hash<T> hash;
+ hash_ ^= hash(t);
+ return Status::OK();
+ }
+
+ template <typename S>
+ Status ValueHash(const S& s) {
+ return StdHash(s.value);
+ }
+
+ Status BufferHash(const Buffer& b) {
+ hash_ ^= internal::ComputeStringHash<1>(b.data(), b.size());
+ return Status::OK();
+ }
+
+ Status ArrayHash(const Array& a) { return ArrayHash(*a.data()); }
+
+ Status ArrayHash(const ArrayData& a) {
+ RETURN_NOT_OK(StdHash(a.length) & StdHash(a.GetNullCount()));
+ if (a.buffers[0] != nullptr) {
+ // We can't visit values without unboxing the whole array, so only hash
+ // the null bitmap for now.
+ RETURN_NOT_OK(BufferHash(*a.buffers[0]));
+ }
+ for (const auto& child : a.child_data) {
+ RETURN_NOT_OK(ArrayHash(*child));
+ }
+ return Status::OK();
+ }
+
+ explicit ScalarHashImpl(const Scalar& scalar) : hash_(scalar.type->Hash()) {
+ if (scalar.is_valid) {
+ AccumulateHashFrom(scalar);
+ }
+ }
+
+ void AccumulateHashFrom(const Scalar& scalar) {
+ DCHECK_OK(StdHash(scalar.type->fingerprint()));
+ DCHECK_OK(VisitScalarInline(scalar, this));
+ }
+
+ size_t hash_;
+};
+
+size_t Scalar::hash() const { return ScalarHashImpl(*this).hash_; }
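+
+// Illustrative usage (not part of the library): Scalar::Hash hashes scalars
+// directly or through shared_ptr, e.g. for use with unordered containers.
+//
+//   auto s = std::make_shared<arrow::Int32Scalar>(42);
+//   size_t h = arrow::Scalar::Hash{}(s);  // same value as s->hash()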
+
+StringScalar::StringScalar(std::string s)
+ : StringScalar(Buffer::FromString(std::move(s))) {}
+
+LargeStringScalar::LargeStringScalar(std::string s)
+ : LargeStringScalar(Buffer::FromString(std::move(s))) {}
+
+FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr<Buffer> value,
+ std::shared_ptr<DataType> type)
+ : BinaryScalar(std::move(value), std::move(type)) {
+ ARROW_CHECK_EQ(checked_cast<const FixedSizeBinaryType&>(*this->type).byte_width(),
+ this->value->size());
+}
+
+BaseListScalar::BaseListScalar(std::shared_ptr<Array> value,
+ std::shared_ptr<DataType> type)
+ : Scalar{std::move(type), true}, value(std::move(value)) {
+ ARROW_CHECK(this->type->field(0)->type()->Equals(this->value->type()));
+}
+
+ListScalar::ListScalar(std::shared_ptr<Array> value)
+ : BaseListScalar(value, list(value->type())) {}
+
+LargeListScalar::LargeListScalar(std::shared_ptr<Array> value)
+ : BaseListScalar(value, large_list(value->type())) {}
+
+inline std::shared_ptr<DataType> MakeMapType(const std::shared_ptr<DataType>& pair_type) {
+ ARROW_CHECK_EQ(pair_type->id(), Type::STRUCT);
+ ARROW_CHECK_EQ(pair_type->num_fields(), 2);
+ return map(pair_type->field(0)->type(), pair_type->field(1)->type());
+}
+
+MapScalar::MapScalar(std::shared_ptr<Array> value)
+ : BaseListScalar(value, MakeMapType(value->type())) {}
+
+FixedSizeListScalar::FixedSizeListScalar(std::shared_ptr<Array> value,
+ std::shared_ptr<DataType> type)
+ : BaseListScalar(value, std::move(type)) {
+ ARROW_CHECK_EQ(this->value->length(),
+ checked_cast<const FixedSizeListType&>(*this->type).list_size());
+}
+
+FixedSizeListScalar::FixedSizeListScalar(std::shared_ptr<Array> value)
+ : BaseListScalar(
+ value, fixed_size_list(value->type(), static_cast<int32_t>(value->length()))) {}
+
+Result<std::shared_ptr<StructScalar>> StructScalar::Make(
+ ScalarVector values, std::vector<std::string> field_names) {
+ if (values.size() != field_names.size()) {
+ return Status::Invalid("Mismatching number of field names and child scalars");
+ }
+
+ FieldVector fields(field_names.size());
+ for (size_t i = 0; i < fields.size(); ++i) {
+ fields[i] = arrow::field(std::move(field_names[i]), values[i]->type);
+ }
+
+ return std::make_shared<StructScalar>(std::move(values), struct_(std::move(fields)));
+}
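+
+// Illustrative usage: building a struct scalar from child scalars and names.
+// The values and field names here are hypothetical.
+//
+//   arrow::ScalarVector children = {std::make_shared<arrow::Int32Scalar>(1),
+//                                   std::make_shared<arrow::StringScalar>("a")};
+//   ARROW_ASSIGN_OR_RAISE(auto s, arrow::StructScalar::Make(std::move(children),
+//                                                           {"id", "name"}));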
+
+Result<std::shared_ptr<Scalar>> StructScalar::field(FieldRef ref) const {
+ ARROW_ASSIGN_OR_RAISE(auto path, ref.FindOne(*type));
+ if (path.indices().size() != 1) {
+ return Status::NotImplemented("retrieval of nested fields from StructScalar");
+ }
+ auto index = path.indices()[0];
+ if (is_valid) {
+ return value[index];
+ } else {
+ const auto& struct_type = checked_cast<const StructType&>(*this->type);
+ const auto& field_type = struct_type.field(index)->type();
+ return MakeNullScalar(field_type);
+ }
+}
+
+DictionaryScalar::DictionaryScalar(std::shared_ptr<DataType> type)
+ : Scalar(std::move(type)),
+ value{MakeNullScalar(checked_cast<const DictionaryType&>(*this->type).index_type()),
+ MakeArrayOfNull(checked_cast<const DictionaryType&>(*this->type).value_type(),
+ 0)
+ .ValueOrDie()} {}
+
+Result<std::shared_ptr<Scalar>> DictionaryScalar::GetEncodedValue() const {
+ const auto& dict_type = checked_cast<DictionaryType&>(*type);
+
+ if (!is_valid) {
+ return MakeNullScalar(dict_type.value_type());
+ }
+
+ int64_t index_value = 0;
+ switch (dict_type.index_type()->id()) {
+ case Type::UINT8:
+ index_value =
+ static_cast<int64_t>(checked_cast<const UInt8Scalar&>(*value.index).value);
+ break;
+ case Type::INT8:
+ index_value =
+ static_cast<int64_t>(checked_cast<const Int8Scalar&>(*value.index).value);
+ break;
+ case Type::UINT16:
+ index_value =
+ static_cast<int64_t>(checked_cast<const UInt16Scalar&>(*value.index).value);
+ break;
+ case Type::INT16:
+ index_value =
+ static_cast<int64_t>(checked_cast<const Int16Scalar&>(*value.index).value);
+ break;
+ case Type::UINT32:
+ index_value =
+ static_cast<int64_t>(checked_cast<const UInt32Scalar&>(*value.index).value);
+ break;
+ case Type::INT32:
+ index_value =
+ static_cast<int64_t>(checked_cast<const Int32Scalar&>(*value.index).value);
+ break;
+ case Type::UINT64:
+ index_value =
+ static_cast<int64_t>(checked_cast<const UInt64Scalar&>(*value.index).value);
+ break;
+ case Type::INT64:
+ index_value =
+ static_cast<int64_t>(checked_cast<const Int64Scalar&>(*value.index).value);
+ break;
+ default:
+      return Status::TypeError("Unsupported dictionary index type ",
+                               *dict_type.index_type());
+ }
+ return value.dictionary->GetScalar(index_value);
+}
+
+std::shared_ptr<DictionaryScalar> DictionaryScalar::Make(std::shared_ptr<Scalar> index,
+ std::shared_ptr<Array> dict) {
+ auto type = dictionary(index->type, dict->type());
+ return std::make_shared<DictionaryScalar>(ValueType{std::move(index), std::move(dict)},
+ std::move(type));
+}
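+
+// Illustrative sketch, assuming `dict` is a utf8 Array of categories built
+// elsewhere: Make() pairs an index scalar with the dictionary, and
+// GetEncodedValue() resolves the index to the underlying value.
+//
+//   auto index = std::make_shared<arrow::Int32Scalar>(0);
+//   auto dict_scalar = arrow::DictionaryScalar::Make(std::move(index), dict);
+//   ARROW_ASSIGN_OR_RAISE(auto decoded, dict_scalar->GetEncodedValue());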
+
+template <typename T>
+using scalar_constructor_has_arrow_type =
+ std::is_constructible<typename TypeTraits<T>::ScalarType, std::shared_ptr<DataType>>;
+
+template <typename T, typename R = void>
+using enable_if_scalar_constructor_has_arrow_type =
+ typename std::enable_if<scalar_constructor_has_arrow_type<T>::value, R>::type;
+
+template <typename T, typename R = void>
+using enable_if_scalar_constructor_has_no_arrow_type =
+ typename std::enable_if<!scalar_constructor_has_arrow_type<T>::value, R>::type;
+
+struct MakeNullImpl {
+ template <typename T, typename ScalarType = typename TypeTraits<T>::ScalarType>
+ enable_if_scalar_constructor_has_arrow_type<T, Status> Visit(const T&) {
+ out_ = std::make_shared<ScalarType>(type_);
+ return Status::OK();
+ }
+
+ template <typename T, typename ScalarType = typename TypeTraits<T>::ScalarType>
+ enable_if_scalar_constructor_has_no_arrow_type<T, Status> Visit(const T&) {
+ out_ = std::make_shared<ScalarType>();
+ return Status::OK();
+ }
+
+ std::shared_ptr<Scalar> Finish() && {
+ // Should not fail.
+ DCHECK_OK(VisitTypeInline(*type_, this));
+ return std::move(out_);
+ }
+
+ std::shared_ptr<DataType> type_;
+ std::shared_ptr<Scalar> out_;
+};
+
+std::shared_ptr<Scalar> MakeNullScalar(std::shared_ptr<DataType> type) {
+ return MakeNullImpl{std::move(type), nullptr}.Finish();
+}
+
+std::string Scalar::ToString() const {
+ if (!this->is_valid) {
+ return "null";
+ }
+ if (type->id() == Type::DICTIONARY) {
+ auto dict_scalar = checked_cast<const DictionaryScalar*>(this);
+ return dict_scalar->value.dictionary->ToString() + "[" +
+ dict_scalar->value.index->ToString() + "]";
+ }
+ auto maybe_repr = CastTo(utf8());
+ if (maybe_repr.ok()) {
+ return checked_cast<const StringScalar&>(*maybe_repr.ValueOrDie()).value->ToString();
+ }
+ return "...";
+}
+
+struct ScalarParseImpl {
+ template <typename T, typename = internal::enable_if_parseable<T>>
+ Status Visit(const T& t) {
+ typename internal::StringConverter<T>::value_type value;
+ if (!internal::ParseValue(t, s_.data(), s_.size(), &value)) {
+ return Status::Invalid("error parsing '", s_, "' as scalar of type ", t);
+ }
+ return Finish(value);
+ }
+
+ Status Visit(const BinaryType&) { return FinishWithBuffer(); }
+
+ Status Visit(const LargeBinaryType&) { return FinishWithBuffer(); }
+
+ Status Visit(const FixedSizeBinaryType&) { return FinishWithBuffer(); }
+
+ Status Visit(const DictionaryType& t) {
+ ARROW_ASSIGN_OR_RAISE(auto value, Scalar::Parse(t.value_type(), s_));
+ return Finish(std::move(value));
+ }
+
+ Status Visit(const DataType& t) {
+ return Status::NotImplemented("parsing scalars of type ", t);
+ }
+
+ template <typename Arg>
+ Status Finish(Arg&& arg) {
+ return MakeScalar(std::move(type_), std::forward<Arg>(arg)).Value(&out_);
+ }
+
+ Status FinishWithBuffer() { return Finish(Buffer::FromString(std::string(s_))); }
+
+ Result<std::shared_ptr<Scalar>> Finish() && {
+ RETURN_NOT_OK(VisitTypeInline(*type_, this));
+ return std::move(out_);
+ }
+
+ ScalarParseImpl(std::shared_ptr<DataType> type, util::string_view s)
+ : type_(std::move(type)), s_(s) {}
+
+ std::shared_ptr<DataType> type_;
+ util::string_view s_;
+ std::shared_ptr<Scalar> out_;
+};
+
+Result<std::shared_ptr<Scalar>> Scalar::Parse(const std::shared_ptr<DataType>& type,
+ util::string_view s) {
+ return ScalarParseImpl{type, s}.Finish();
+}
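+
+// Illustrative usage: parsing a textual representation into a typed scalar.
+//
+//   ARROW_ASSIGN_OR_RAISE(auto parsed,
+//                         arrow::Scalar::Parse(arrow::int32(), "42"));
+//   // `parsed` is an Int32Scalar holding 42.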
+
+namespace internal {
+Status CheckBufferLength(const FixedSizeBinaryType* t, const std::shared_ptr<Buffer>* b) {
+ return t->byte_width() == (*b)->size()
+ ? Status::OK()
+ : Status::Invalid("buffer length ", (*b)->size(), " is not compatible with ",
+ *t);
+}
+} // namespace internal
+
+namespace {
+// CastImpl(...) assumes `to` points to a non-null scalar of the correct type
+// with an uninitialized value
+
+// helper for StringFormatter
+template <typename Formatter, typename ScalarType>
+std::shared_ptr<Buffer> FormatToBuffer(Formatter&& formatter, const ScalarType& from) {
+ if (!from.is_valid) {
+ return Buffer::FromString("null");
+ }
+ return formatter(from.value, [&](util::string_view v) {
+ return Buffer::FromString(std::string(v));
+ });
+}
+
+// error fallback
+Status CastImpl(const Scalar& from, Scalar* to) {
+ return Status::NotImplemented("casting scalars of type ", *from.type, " to type ",
+ *to->type);
+}
+
+// numeric to numeric
+template <typename From, typename To>
+Status CastImpl(const NumericScalar<From>& from, NumericScalar<To>* to) {
+ to->value = static_cast<typename To::c_type>(from.value);
+ return Status::OK();
+}
+
+// numeric to boolean
+template <typename T>
+Status CastImpl(const NumericScalar<T>& from, BooleanScalar* to) {
+ constexpr auto zero = static_cast<typename T::c_type>(0);
+ to->value = from.value != zero;
+ return Status::OK();
+}
+
+// boolean to numeric
+template <typename T>
+Status CastImpl(const BooleanScalar& from, NumericScalar<T>* to) {
+ to->value = static_cast<typename T::c_type>(from.value);
+ return Status::OK();
+}
+
+// numeric to temporal
+template <typename From, typename To>
+typename std::enable_if<std::is_base_of<TemporalType, To>::value &&
+ !std::is_same<DayTimeIntervalType, To>::value,
+ Status>::type
+CastImpl(const NumericScalar<From>& from, TemporalScalar<To>* to) {
+ to->value = static_cast<typename To::c_type>(from.value);
+ return Status::OK();
+}
+
+// temporal to numeric
+template <typename From, typename To>
+typename std::enable_if<std::is_base_of<TemporalType, From>::value &&
+ !std::is_same<DayTimeIntervalType, From>::value,
+ Status>::type
+CastImpl(const TemporalScalar<From>& from, NumericScalar<To>* to) {
+ to->value = static_cast<typename To::c_type>(from.value);
+ return Status::OK();
+}
+
+// timestamp to timestamp
+Status CastImpl(const TimestampScalar& from, TimestampScalar* to) {
+ return util::ConvertTimestampValue(from.type, to->type, from.value).Value(&to->value);
+}
+
+template <typename TypeWithTimeUnit>
+std::shared_ptr<DataType> AsTimestampType(const std::shared_ptr<DataType>& type) {
+ return timestamp(checked_cast<const TypeWithTimeUnit&>(*type).unit());
+}
+
+// duration to duration
+Status CastImpl(const DurationScalar& from, DurationScalar* to) {
+ return util::ConvertTimestampValue(AsTimestampType<DurationType>(from.type),
+ AsTimestampType<DurationType>(to->type), from.value)
+ .Value(&to->value);
+}
+
+// time to time
+template <typename F, typename ToScalar, typename T = typename ToScalar::TypeClass>
+enable_if_time<T, Status> CastImpl(const TimeScalar<F>& from, ToScalar* to) {
+ return util::ConvertTimestampValue(AsTimestampType<F>(from.type),
+ AsTimestampType<T>(to->type), from.value)
+ .Value(&to->value);
+}
+
+constexpr int64_t kMillisecondsInDay = 86400000;
+
+// date to date
+Status CastImpl(const Date32Scalar& from, Date64Scalar* to) {
+ to->value = from.value * kMillisecondsInDay;
+ return Status::OK();
+}
+Status CastImpl(const Date64Scalar& from, Date32Scalar* to) {
+ to->value = static_cast<int32_t>(from.value / kMillisecondsInDay);
+ return Status::OK();
+}
+
+// timestamp to date
+Status CastImpl(const TimestampScalar& from, Date64Scalar* to) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto millis,
+ util::ConvertTimestampValue(from.type, timestamp(TimeUnit::MILLI), from.value));
+ to->value = millis - millis % kMillisecondsInDay;
+ return Status::OK();
+}
+Status CastImpl(const TimestampScalar& from, Date32Scalar* to) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto millis,
+ util::ConvertTimestampValue(from.type, timestamp(TimeUnit::MILLI), from.value));
+ to->value = static_cast<int32_t>(millis / kMillisecondsInDay);
+ return Status::OK();
+}
+
+// date to timestamp
+template <typename D>
+Status CastImpl(const DateScalar<D>& from, TimestampScalar* to) {
+ int64_t millis = from.value;
+ if (std::is_same<D, Date32Type>::value) {
+ millis *= kMillisecondsInDay;
+ }
+ return util::ConvertTimestampValue(timestamp(TimeUnit::MILLI), to->type, millis)
+ .Value(&to->value);
+}
+
+// string to any
+template <typename ScalarType>
+Status CastImpl(const StringScalar& from, ScalarType* to) {
+ ARROW_ASSIGN_OR_RAISE(auto out,
+ Scalar::Parse(to->type, util::string_view(*from.value)));
+ to->value = std::move(checked_cast<ScalarType&>(*out).value);
+ return Status::OK();
+}
+
+// binary to string
+Status CastImpl(const BinaryScalar& from, StringScalar* to) {
+ to->value = from.value;
+ return Status::OK();
+}
+
+// formattable to string
+template <typename ScalarType, typename T = typename ScalarType::TypeClass,
+ typename Formatter = internal::StringFormatter<T>,
+ // note: Value unused but necessary to trigger SFINAE if Formatter is
+ // undefined
+ typename Value = typename Formatter::value_type>
+Status CastImpl(const ScalarType& from, StringScalar* to) {
+ to->value = FormatToBuffer(Formatter{from.type}, from);
+ return Status::OK();
+}
+
+Status CastImpl(const Decimal128Scalar& from, StringScalar* to) {
+ auto from_type = checked_cast<const Decimal128Type*>(from.type.get());
+ to->value = Buffer::FromString(from.value.ToString(from_type->scale()));
+ return Status::OK();
+}
+
+Status CastImpl(const Decimal256Scalar& from, StringScalar* to) {
+ auto from_type = checked_cast<const Decimal256Type*>(from.type.get());
+ to->value = Buffer::FromString(from.value.ToString(from_type->scale()));
+ return Status::OK();
+}
+
+Status CastImpl(const StructScalar& from, StringScalar* to) {
+ std::stringstream ss;
+ ss << '{';
+ for (int i = 0; static_cast<size_t>(i) < from.value.size(); i++) {
+ if (i > 0) ss << ", ";
+ ss << from.type->field(i)->name() << ':' << from.type->field(i)->type()->ToString()
+ << " = " << from.value[i]->ToString();
+ }
+ ss << '}';
+ to->value = Buffer::FromString(ss.str());
+ return Status::OK();
+}
+
+struct CastImplVisitor {
+ Status NotImplemented() {
+ return Status::NotImplemented("cast to ", *to_type_, " from ", *from_.type);
+ }
+
+ const Scalar& from_;
+ const std::shared_ptr<DataType>& to_type_;
+ Scalar* out_;
+};
+
+template <typename ToType>
+struct FromTypeVisitor : CastImplVisitor {
+ using ToScalar = typename TypeTraits<ToType>::ScalarType;
+
+ FromTypeVisitor(const Scalar& from, const std::shared_ptr<DataType>& to_type,
+ Scalar* out)
+ : CastImplVisitor{from, to_type, out} {}
+
+ template <typename FromType>
+ Status Visit(const FromType&) {
+ return CastImpl(checked_cast<const typename TypeTraits<FromType>::ScalarType&>(from_),
+ checked_cast<ToScalar*>(out_));
+ }
+
+ // identity cast only for parameter free types
+ template <typename T1 = ToType>
+ typename std::enable_if<TypeTraits<T1>::is_parameter_free, Status>::type Visit(
+ const ToType&) {
+ checked_cast<ToScalar*>(out_)->value = checked_cast<const ToScalar&>(from_).value;
+ return Status::OK();
+ }
+
+ Status Visit(const NullType&) { return NotImplemented(); }
+ Status Visit(const SparseUnionType&) { return NotImplemented(); }
+ Status Visit(const DenseUnionType&) { return NotImplemented(); }
+ Status Visit(const DictionaryType&) { return NotImplemented(); }
+ Status Visit(const ExtensionType&) { return NotImplemented(); }
+};
+
+struct ToTypeVisitor : CastImplVisitor {
+ ToTypeVisitor(const Scalar& from, const std::shared_ptr<DataType>& to_type, Scalar* out)
+ : CastImplVisitor{from, to_type, out} {}
+
+ template <typename ToType>
+ Status Visit(const ToType&) {
+ FromTypeVisitor<ToType> unpack_from_type{from_, to_type_, out_};
+ return VisitTypeInline(*from_.type, &unpack_from_type);
+ }
+
+ Status Visit(const NullType&) {
+ if (from_.is_valid) {
+ return Status::Invalid("attempting to cast non-null scalar to NullScalar");
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& dict_type) {
+ auto& out = checked_cast<DictionaryScalar*>(out_)->value;
+ ARROW_ASSIGN_OR_RAISE(auto cast_value, from_.CastTo(dict_type.value_type()));
+ ARROW_ASSIGN_OR_RAISE(out.dictionary, MakeArrayFromScalar(*cast_value, 1));
+ return Int32Scalar(0).CastTo(dict_type.index_type()).Value(&out.index);
+ }
+
+ Status Visit(const SparseUnionType&) { return NotImplemented(); }
+ Status Visit(const DenseUnionType&) { return NotImplemented(); }
+ Status Visit(const ExtensionType&) { return NotImplemented(); }
+};
+
+} // namespace
+
+Result<std::shared_ptr<Scalar>> Scalar::CastTo(std::shared_ptr<DataType> to) const {
+ std::shared_ptr<Scalar> out = MakeNullScalar(to);
+ if (is_valid) {
+ out->is_valid = true;
+ ToTypeVisitor unpack_to_type{*this, to, out.get()};
+ RETURN_NOT_OK(VisitTypeInline(*to, &unpack_to_type));
+ }
+ return out;
+}
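+
+// Illustrative usage: widening an int32 scalar to int64. Casting a null
+// scalar simply produces a null scalar of the target type.
+//
+//   arrow::Int32Scalar small(7);
+//   ARROW_ASSIGN_OR_RAISE(auto widened, small.CastTo(arrow::int64()));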
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h
new file mode 100644
index 00000000000..24744859686
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h
@@ -0,0 +1,537 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Object model for scalar (non-Array) values. Not intended for use with large
+// amounts of data
+//
+// NOTE: This API is experimental as of the 0.13 version and subject to change
+// without deprecation warnings
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+
+/// \brief Base class for scalar values
+///
+/// A Scalar represents a single value with a specific DataType.
+/// Scalars are useful for passing single value inputs to compute functions,
+/// or for representing individual array elements (with a non-trivial
+/// wrapping cost, though).
+struct ARROW_EXPORT Scalar : public util::EqualityComparable<Scalar> {
+ virtual ~Scalar() = default;
+
+ explicit Scalar(std::shared_ptr<DataType> type) : type(std::move(type)) {}
+
+ /// \brief The type of the scalar value
+ std::shared_ptr<DataType> type;
+
+ /// \brief Whether the value is valid (not null) or not
+ bool is_valid = false;
+
+ using util::EqualityComparable<Scalar>::operator==;
+ using util::EqualityComparable<Scalar>::Equals;
+ bool Equals(const Scalar& other,
+ const EqualOptions& options = EqualOptions::Defaults()) const;
+
+ bool ApproxEquals(const Scalar& other,
+ const EqualOptions& options = EqualOptions::Defaults()) const;
+
+ struct ARROW_EXPORT Hash {
+ size_t operator()(const Scalar& scalar) const { return scalar.hash(); }
+
+ size_t operator()(const std::shared_ptr<Scalar>& scalar) const {
+ return scalar->hash();
+ }
+ };
+
+ size_t hash() const;
+
+ std::string ToString() const;
+
+ static Result<std::shared_ptr<Scalar>> Parse(const std::shared_ptr<DataType>& type,
+ util::string_view repr);
+
+ // TODO(bkietz) add compute::CastOptions
+ Result<std::shared_ptr<Scalar>> CastTo(std::shared_ptr<DataType> to) const;
+
+ protected:
+ Scalar(std::shared_ptr<DataType> type, bool is_valid)
+ : type(std::move(type)), is_valid(is_valid) {}
+};
+
+/// \defgroup concrete-scalar-classes Concrete Scalar subclasses
+///
+/// @{
+
+/// \brief A scalar value for NullType. Never valid
+struct ARROW_EXPORT NullScalar : public Scalar {
+ public:
+ using TypeClass = NullType;
+
+ NullScalar() : Scalar{null(), false} {}
+};
+
+/// @}
+
+namespace internal {
+
+struct ARROW_EXPORT PrimitiveScalarBase : public Scalar {
+ using Scalar::Scalar;
+ virtual void* mutable_data() = 0;
+ virtual const void* data() const = 0;
+};
+
+template <typename T, typename CType = typename T::c_type>
+struct ARROW_EXPORT PrimitiveScalar : public PrimitiveScalarBase {
+ using PrimitiveScalarBase::PrimitiveScalarBase;
+ using TypeClass = T;
+ using ValueType = CType;
+
+ // Non-null constructor.
+ PrimitiveScalar(ValueType value, std::shared_ptr<DataType> type)
+ : PrimitiveScalarBase(std::move(type), true), value(value) {}
+
+ explicit PrimitiveScalar(std::shared_ptr<DataType> type)
+ : PrimitiveScalarBase(std::move(type), false) {}
+
+ ValueType value{};
+
+ void* mutable_data() override { return &value; }
+ const void* data() const override { return &value; }
+};
+
+} // namespace internal
+
+/// \addtogroup concrete-scalar-classes Concrete Scalar subclasses
+///
+/// @{
+
+struct ARROW_EXPORT BooleanScalar : public internal::PrimitiveScalar<BooleanType, bool> {
+ using Base = internal::PrimitiveScalar<BooleanType, bool>;
+ using Base::Base;
+
+ explicit BooleanScalar(bool value) : Base(value, boolean()) {}
+
+ BooleanScalar() : Base(boolean()) {}
+};
+
+template <typename T>
+struct NumericScalar : public internal::PrimitiveScalar<T> {
+ using Base = typename internal::PrimitiveScalar<T>;
+ using Base::Base;
+ using TypeClass = typename Base::TypeClass;
+ using ValueType = typename Base::ValueType;
+
+ explicit NumericScalar(ValueType value)
+ : Base(value, TypeTraits<T>::type_singleton()) {}
+
+ NumericScalar() : Base(TypeTraits<T>::type_singleton()) {}
+};
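+
+// Illustrative: the constructors above mean, e.g.,
+//
+//   arrow::Int64Scalar x(123);      // valid scalar of type int64()
+//   arrow::Int64Scalar null_int64;  // null scalar of type int64()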
+
+struct ARROW_EXPORT Int8Scalar : public NumericScalar<Int8Type> {
+ using NumericScalar<Int8Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT Int16Scalar : public NumericScalar<Int16Type> {
+ using NumericScalar<Int16Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT Int32Scalar : public NumericScalar<Int32Type> {
+ using NumericScalar<Int32Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT Int64Scalar : public NumericScalar<Int64Type> {
+ using NumericScalar<Int64Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT UInt8Scalar : public NumericScalar<UInt8Type> {
+ using NumericScalar<UInt8Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT UInt16Scalar : public NumericScalar<UInt16Type> {
+ using NumericScalar<UInt16Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT UInt32Scalar : public NumericScalar<UInt32Type> {
+ using NumericScalar<UInt32Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT UInt64Scalar : public NumericScalar<UInt64Type> {
+ using NumericScalar<UInt64Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT HalfFloatScalar : public NumericScalar<HalfFloatType> {
+ using NumericScalar<HalfFloatType>::NumericScalar;
+};
+
+struct ARROW_EXPORT FloatScalar : public NumericScalar<FloatType> {
+ using NumericScalar<FloatType>::NumericScalar;
+};
+
+struct ARROW_EXPORT DoubleScalar : public NumericScalar<DoubleType> {
+ using NumericScalar<DoubleType>::NumericScalar;
+};
+
+struct ARROW_EXPORT BaseBinaryScalar : public Scalar {
+ using Scalar::Scalar;
+ using ValueType = std::shared_ptr<Buffer>;
+
+ std::shared_ptr<Buffer> value;
+
+ protected:
+ BaseBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
+ : Scalar{std::move(type), true}, value(std::move(value)) {}
+};
+
+struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar {
+ using BaseBinaryScalar::BaseBinaryScalar;
+ using TypeClass = BinaryType;
+
+ BinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
+ : BaseBinaryScalar(std::move(value), std::move(type)) {}
+
+ explicit BinaryScalar(std::shared_ptr<Buffer> value)
+ : BinaryScalar(std::move(value), binary()) {}
+
+ BinaryScalar() : BinaryScalar(binary()) {}
+};
+
+struct ARROW_EXPORT StringScalar : public BinaryScalar {
+ using BinaryScalar::BinaryScalar;
+ using TypeClass = StringType;
+
+ explicit StringScalar(std::shared_ptr<Buffer> value)
+ : StringScalar(std::move(value), utf8()) {}
+
+ explicit StringScalar(std::string s);
+
+ StringScalar() : StringScalar(utf8()) {}
+};
+
+struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar {
+ using BaseBinaryScalar::BaseBinaryScalar;
+ using TypeClass = LargeBinaryType;
+
+ LargeBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
+ : BaseBinaryScalar(std::move(value), std::move(type)) {}
+
+ explicit LargeBinaryScalar(std::shared_ptr<Buffer> value)
+ : LargeBinaryScalar(std::move(value), large_binary()) {}
+
+ LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {}
+};
+
+struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar {
+ using LargeBinaryScalar::LargeBinaryScalar;
+ using TypeClass = LargeStringType;
+
+ explicit LargeStringScalar(std::shared_ptr<Buffer> value)
+ : LargeStringScalar(std::move(value), large_utf8()) {}
+
+ explicit LargeStringScalar(std::string s);
+
+ LargeStringScalar() : LargeStringScalar(large_utf8()) {}
+};
+
+struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar {
+ using TypeClass = FixedSizeBinaryType;
+
+ FixedSizeBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type);
+
+  explicit FixedSizeBinaryScalar(std::shared_ptr<DataType> type)
+      : BinaryScalar(std::move(type)) {}
+};
+
+template <typename T>
+struct ARROW_EXPORT TemporalScalar : internal::PrimitiveScalar<T> {
+ using internal::PrimitiveScalar<T>::PrimitiveScalar;
+ using ValueType = typename TemporalScalar<T>::ValueType;
+
+ explicit TemporalScalar(ValueType value, std::shared_ptr<DataType> type)
+      : internal::PrimitiveScalar<T>(std::move(value), std::move(type)) {}
+};
+
+template <typename T>
+struct ARROW_EXPORT DateScalar : public TemporalScalar<T> {
+ using TemporalScalar<T>::TemporalScalar;
+ using ValueType = typename TemporalScalar<T>::ValueType;
+
+ explicit DateScalar(ValueType value)
+ : TemporalScalar<T>(std::move(value), TypeTraits<T>::type_singleton()) {}
+ DateScalar() : TemporalScalar<T>(TypeTraits<T>::type_singleton()) {}
+};
+
+struct ARROW_EXPORT Date32Scalar : public DateScalar<Date32Type> {
+ using DateScalar<Date32Type>::DateScalar;
+};
+
+struct ARROW_EXPORT Date64Scalar : public DateScalar<Date64Type> {
+ using DateScalar<Date64Type>::DateScalar;
+};
+
+template <typename T>
+struct ARROW_EXPORT TimeScalar : public TemporalScalar<T> {
+ using TemporalScalar<T>::TemporalScalar;
+};
+
+struct ARROW_EXPORT Time32Scalar : public TimeScalar<Time32Type> {
+ using TimeScalar<Time32Type>::TimeScalar;
+};
+
+struct ARROW_EXPORT Time64Scalar : public TimeScalar<Time64Type> {
+ using TimeScalar<Time64Type>::TimeScalar;
+};
+
+struct ARROW_EXPORT TimestampScalar : public TemporalScalar<TimestampType> {
+ using TemporalScalar<TimestampType>::TemporalScalar;
+};
+
+template <typename T>
+struct ARROW_EXPORT IntervalScalar : public TemporalScalar<T> {
+ using TemporalScalar<T>::TemporalScalar;
+ using ValueType = typename TemporalScalar<T>::ValueType;
+
+ explicit IntervalScalar(ValueType value)
+ : TemporalScalar<T>(value, TypeTraits<T>::type_singleton()) {}
+ IntervalScalar() : TemporalScalar<T>(TypeTraits<T>::type_singleton()) {}
+};
+
+struct ARROW_EXPORT MonthIntervalScalar : public IntervalScalar<MonthIntervalType> {
+ using IntervalScalar<MonthIntervalType>::IntervalScalar;
+};
+
+struct ARROW_EXPORT DayTimeIntervalScalar : public IntervalScalar<DayTimeIntervalType> {
+ using IntervalScalar<DayTimeIntervalType>::IntervalScalar;
+};
+
+struct ARROW_EXPORT DurationScalar : public TemporalScalar<DurationType> {
+ using TemporalScalar<DurationType>::TemporalScalar;
+};
+
+struct ARROW_EXPORT Decimal128Scalar : public Scalar {
+ using Scalar::Scalar;
+ using TypeClass = Decimal128Type;
+ using ValueType = Decimal128;
+
+ Decimal128Scalar(Decimal128 value, std::shared_ptr<DataType> type)
+ : Scalar(std::move(type), true), value(value) {}
+
+ Decimal128 value;
+};
+
+struct ARROW_EXPORT Decimal256Scalar : public Scalar {
+ using Scalar::Scalar;
+ using TypeClass = Decimal256Type;
+ using ValueType = Decimal256;
+
+ Decimal256Scalar(Decimal256 value, std::shared_ptr<DataType> type)
+ : Scalar(std::move(type), true), value(value) {}
+
+ Decimal256 value;
+};
+
+struct ARROW_EXPORT BaseListScalar : public Scalar {
+ using Scalar::Scalar;
+ using ValueType = std::shared_ptr<Array>;
+
+ BaseListScalar(std::shared_ptr<Array> value, std::shared_ptr<DataType> type);
+
+ std::shared_ptr<Array> value;
+};
+
+struct ARROW_EXPORT ListScalar : public BaseListScalar {
+ using TypeClass = ListType;
+ using BaseListScalar::BaseListScalar;
+
+ explicit ListScalar(std::shared_ptr<Array> value);
+};
+
+struct ARROW_EXPORT LargeListScalar : public BaseListScalar {
+ using TypeClass = LargeListType;
+ using BaseListScalar::BaseListScalar;
+
+ explicit LargeListScalar(std::shared_ptr<Array> value);
+};
+
+struct ARROW_EXPORT MapScalar : public BaseListScalar {
+ using TypeClass = MapType;
+ using BaseListScalar::BaseListScalar;
+
+ explicit MapScalar(std::shared_ptr<Array> value);
+};
+
+struct ARROW_EXPORT FixedSizeListScalar : public BaseListScalar {
+ using TypeClass = FixedSizeListType;
+ using BaseListScalar::BaseListScalar;
+
+ FixedSizeListScalar(std::shared_ptr<Array> value, std::shared_ptr<DataType> type);
+
+ explicit FixedSizeListScalar(std::shared_ptr<Array> value);
+};
+
+struct ARROW_EXPORT StructScalar : public Scalar {
+ using TypeClass = StructType;
+ using ValueType = std::vector<std::shared_ptr<Scalar>>;
+
+ ScalarVector value;
+
+ Result<std::shared_ptr<Scalar>> field(FieldRef ref) const;
+
+ StructScalar(ValueType value, std::shared_ptr<DataType> type)
+ : Scalar(std::move(type), true), value(std::move(value)) {}
+
+ static Result<std::shared_ptr<StructScalar>> Make(ValueType value,
+ std::vector<std::string> field_names);
+
+ explicit StructScalar(std::shared_ptr<DataType> type) : Scalar(std::move(type)) {}
+};
+
+struct ARROW_EXPORT UnionScalar : public Scalar {
+ using Scalar::Scalar;
+ using ValueType = std::shared_ptr<Scalar>;
+ ValueType value;
+
+ UnionScalar(ValueType value, std::shared_ptr<DataType> type)
+ : Scalar(std::move(type), true), value(std::move(value)) {}
+};
+
+struct ARROW_EXPORT SparseUnionScalar : public UnionScalar {
+ using UnionScalar::UnionScalar;
+ using TypeClass = SparseUnionType;
+};
+
+struct ARROW_EXPORT DenseUnionScalar : public UnionScalar {
+ using UnionScalar::UnionScalar;
+ using TypeClass = DenseUnionType;
+};
+
+struct ARROW_EXPORT DictionaryScalar : public Scalar {
+ using TypeClass = DictionaryType;
+ struct ValueType {
+ std::shared_ptr<Scalar> index;
+ std::shared_ptr<Array> dictionary;
+ } value;
+
+ explicit DictionaryScalar(std::shared_ptr<DataType> type);
+
+ DictionaryScalar(ValueType value, std::shared_ptr<DataType> type, bool is_valid = true)
+ : Scalar(std::move(type), is_valid), value(std::move(value)) {}
+
+ static std::shared_ptr<DictionaryScalar> Make(std::shared_ptr<Scalar> index,
+ std::shared_ptr<Array> dict);
+
+ Result<std::shared_ptr<Scalar>> GetEncodedValue() const;
+};
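+
+// Editor's note: a hedged usage sketch for DictionaryScalar (not part of the
+// upstream sources; `dict` stands for any dictionary Array, e.g. a StringArray):
+//
+//   auto scalar = arrow::DictionaryScalar::Make(arrow::MakeScalar(int32_t{0}), dict);
+//   ARROW_ASSIGN_OR_RAISE(auto decoded, scalar->GetEncodedValue());
+//   // `decoded` is the dictionary entry that index 0 refers to.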
+
+struct ARROW_EXPORT ExtensionScalar : public Scalar {
+ using Scalar::Scalar;
+ using TypeClass = ExtensionType;
+};
+
+/// @}
+
+namespace internal {
+
+inline Status CheckBufferLength(...) { return Status::OK(); }
+
+ARROW_EXPORT Status CheckBufferLength(const FixedSizeBinaryType* t,
+ const std::shared_ptr<Buffer>* b);
+
+} // namespace internal
+
+template <typename ValueRef>
+struct MakeScalarImpl {
+ template <typename T, typename ScalarType = typename TypeTraits<T>::ScalarType,
+ typename ValueType = typename ScalarType::ValueType,
+ typename Enable = typename std::enable_if<
+ std::is_constructible<ScalarType, ValueType,
+ std::shared_ptr<DataType>>::value &&
+ std::is_convertible<ValueRef, ValueType>::value>::type>
+ Status Visit(const T& t) {
+ ARROW_RETURN_NOT_OK(internal::CheckBufferLength(&t, &value_));
+ out_ = std::make_shared<ScalarType>(
+ static_cast<ValueType>(static_cast<ValueRef>(value_)), std::move(type_));
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& t) {
+ return Status::NotImplemented("constructing scalars of type ", t,
+ " from unboxed values");
+ }
+
+ Result<std::shared_ptr<Scalar>> Finish() && {
+ ARROW_RETURN_NOT_OK(VisitTypeInline(*type_, this));
+ return std::move(out_);
+ }
+
+ std::shared_ptr<DataType> type_;
+ ValueRef value_;
+ std::shared_ptr<Scalar> out_;
+};
+
+/// \defgroup scalar-factories Scalar factory functions
+///
+/// @{
+
+/// \brief Scalar factory for null scalars
+ARROW_EXPORT
+std::shared_ptr<Scalar> MakeNullScalar(std::shared_ptr<DataType> type);
+
+/// \brief Scalar factory for non-null scalars
+template <typename Value>
+Result<std::shared_ptr<Scalar>> MakeScalar(std::shared_ptr<DataType> type,
+ Value&& value) {
+ return MakeScalarImpl<Value&&>{type, std::forward<Value>(value), NULLPTR}.Finish();
+}
+
+/// \brief Type-inferring scalar factory for non-null scalars
+///
+/// Construct a Scalar instance whose DataType is determined by the input C++ type
+/// (for example, Int8Scalar for an int8_t input).
+/// Only non-parametric primitive types and String are supported.
+template <typename Value, typename Traits = CTypeTraits<typename std::decay<Value>::type>,
+ typename ScalarType = typename Traits::ScalarType,
+ typename Enable = decltype(ScalarType(std::declval<Value>(),
+ Traits::type_singleton()))>
+std::shared_ptr<Scalar> MakeScalar(Value value) {
+ return std::make_shared<ScalarType>(std::move(value), Traits::type_singleton());
+}
+
+inline std::shared_ptr<Scalar> MakeScalar(std::string value) {
+ return std::make_shared<StringScalar>(std::move(value));
+}
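+
+// Editor's note: a short sketch of the factory functions above (illustrative
+// only; the last line is to be read inside a Result-returning function):
+//
+//   auto null_f64 = arrow::MakeNullScalar(arrow::float64());
+//   auto str = arrow::MakeScalar(std::string("hello"));   // StringScalar
+//   auto i32 = arrow::MakeScalar(int32_t{42});            // Int32Scalar, inferred
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto ts, arrow::MakeScalar(arrow::timestamp(arrow::TimeUnit::SECOND),
+//                                  int64_t{0}));          // parametric type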
+
+/// @}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/sparse_tensor.cc b/contrib/libs/apache/arrow/cpp/src/arrow/sparse_tensor.cc
new file mode 100644
index 00000000000..03d59c3d793
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/sparse_tensor.cc
@@ -0,0 +1,478 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/sparse_tensor.h"
+#include "arrow/tensor/converter.h"
+
+#include <algorithm>
+#include <climits>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <numeric>
+
+#include "arrow/compare.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+// ----------------------------------------------------------------------
+// SparseIndex
+
+Status SparseIndex::ValidateShape(const std::vector<int64_t>& shape) const {
+ if (!std::all_of(shape.begin(), shape.end(), [](int64_t x) { return x >= 0; })) {
+ return Status::Invalid("Shape elements must be positive");
+ }
+
+ return Status::OK();
+}
+
+namespace internal {
+namespace {
+
+template <typename IndexValueType>
+Status CheckSparseIndexMaximumValue(const std::vector<int64_t>& shape) {
+ using c_index_value_type = typename IndexValueType::c_type;
+ constexpr int64_t type_max =
+ static_cast<int64_t>(std::numeric_limits<c_index_value_type>::max());
+ auto greater_than_type_max = [&](int64_t x) { return x > type_max; };
+ if (std::any_of(shape.begin(), shape.end(), greater_than_type_max)) {
+ return Status::Invalid("The bit width of the index value type is too small");
+ }
+ return Status::OK();
+}
+
+template <>
+Status CheckSparseIndexMaximumValue<Int64Type>(const std::vector<int64_t>& shape) {
+ return Status::OK();
+}
+
+template <>
+Status CheckSparseIndexMaximumValue<UInt64Type>(const std::vector<int64_t>& shape) {
+ return Status::Invalid("UInt64Type cannot be used as IndexValueType of SparseIndex");
+}
+
+} // namespace
+
+#define CALL_CHECK_MAXIMUM_VALUE(TYPE_CLASS) \
+ case TYPE_CLASS##Type::type_id: \
+ return CheckSparseIndexMaximumValue<TYPE_CLASS##Type>(shape);
+
+Status CheckSparseIndexMaximumValue(const std::shared_ptr<DataType>& index_value_type,
+ const std::vector<int64_t>& shape) {
+ switch (index_value_type->id()) {
+ ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_CHECK_MAXIMUM_VALUE);
+ default:
+ return Status::TypeError("Unsupported SparseTensor index value type");
+ }
+}
+
+#undef CALL_CHECK_MAXIMUM_VALUE
+
+Status MakeSparseTensorFromTensor(const Tensor& tensor,
+ SparseTensorFormat::type sparse_format_id,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool,
+ std::shared_ptr<SparseIndex>* out_sparse_index,
+ std::shared_ptr<Buffer>* out_data) {
+ switch (sparse_format_id) {
+ case SparseTensorFormat::COO:
+ return MakeSparseCOOTensorFromTensor(tensor, index_value_type, pool,
+ out_sparse_index, out_data);
+ case SparseTensorFormat::CSR:
+ return MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis::ROW, tensor,
+ index_value_type, pool, out_sparse_index,
+ out_data);
+ case SparseTensorFormat::CSC:
+ return MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis::COLUMN, tensor,
+ index_value_type, pool, out_sparse_index,
+ out_data);
+ case SparseTensorFormat::CSF:
+ return MakeSparseCSFTensorFromTensor(tensor, index_value_type, pool,
+ out_sparse_index, out_data);
+
+ // LCOV_EXCL_START: ignore program failure
+ default:
+ return Status::Invalid("Invalid sparse tensor format");
+ // LCOV_EXCL_STOP
+ }
+}
+
+} // namespace internal
+
+// ----------------------------------------------------------------------
+// SparseCOOIndex
+
+namespace {
+
+inline Status CheckSparseCOOIndexValidity(const std::shared_ptr<DataType>& type,
+ const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides) {
+ if (!is_integer(type->id())) {
+ return Status::TypeError("Type of SparseCOOIndex indices must be integer");
+ }
+ if (shape.size() != 2) {
+ return Status::Invalid("SparseCOOIndex indices must be a matrix");
+ }
+
+ RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(type, shape));
+
+ if (!internal::IsTensorStridesContiguous(type, shape, strides)) {
+ return Status::Invalid("SparseCOOIndex indices must be contiguous");
+ }
+ return Status::OK();
+}
+
+void GetCOOIndexTensorRow(const std::shared_ptr<Tensor>& coords, const int64_t row,
+ std::vector<int64_t>* out_index) {
+ const auto& fw_index_value_type =
+ internal::checked_cast<const FixedWidthType&>(*coords->type());
+ const size_t indices_elsize = fw_index_value_type.bit_width() / CHAR_BIT;
+
+ const auto& shape = coords->shape();
+ const int64_t non_zero_length = shape[0];
+ DCHECK(0 <= row && row < non_zero_length);
+
+ const int64_t ndim = shape[1];
+ out_index->resize(ndim);
+
+ switch (indices_elsize) {
+ case 1: // Int8, UInt8
+ for (int64_t i = 0; i < ndim; ++i) {
+ (*out_index)[i] = static_cast<int64_t>(coords->Value<UInt8Type>({row, i}));
+ }
+ break;
+ case 2: // Int16, UInt16
+ for (int64_t i = 0; i < ndim; ++i) {
+ (*out_index)[i] = static_cast<int64_t>(coords->Value<UInt16Type>({row, i}));
+ }
+ break;
+ case 4: // Int32, UInt32
+ for (int64_t i = 0; i < ndim; ++i) {
+ (*out_index)[i] = static_cast<int64_t>(coords->Value<UInt32Type>({row, i}));
+ }
+ break;
+ case 8: // Int64
+ for (int64_t i = 0; i < ndim; ++i) {
+ (*out_index)[i] = coords->Value<Int64Type>({row, i});
+ }
+ break;
+ default:
+ DCHECK(false) << "Must not reach here";
+ break;
+ }
+}
+
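+// Editor's note: a COO index is canonical when its coordinate rows are sorted
+// lexicographically with no duplicates. For a 2-D tensor, e.g.:
+//
+//   canonical:     {0,0}, {0,2}, {1,1}
+//   non-canonical: {0,2}, {0,0}, {1,1}   (rows out of order)
+//   non-canonical: {0,0}, {0,0}, {1,1}   (duplicated entry)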
+bool DetectSparseCOOIndexCanonicality(const std::shared_ptr<Tensor>& coords) {
+ DCHECK_EQ(coords->ndim(), 2);
+
+ const auto& shape = coords->shape();
+ const int64_t non_zero_length = shape[0];
+ if (non_zero_length <= 1) return true;
+
+ const int64_t ndim = shape[1];
+ std::vector<int64_t> last_index, index;
+ GetCOOIndexTensorRow(coords, 0, &last_index);
+ for (int64_t i = 1; i < non_zero_length; ++i) {
+ GetCOOIndexTensorRow(coords, i, &index);
+ int64_t j = 0;
+ while (j < ndim) {
+ if (last_index[j] > index[j]) {
+ // last_index > index, so we can detect non-canonical here
+ return false;
+ }
+ if (last_index[j] < index[j]) {
+ // last_index < index, so we can skip the remaining dimensions
+ break;
+ }
+ ++j;
+ }
+ if (j == ndim) {
+ // last_index == index, so we can detect non-canonical here
+ return false;
+ }
+ swap(last_index, index);
+ }
+
+ return true;
+}
+
+} // namespace
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+ const std::shared_ptr<Tensor>& coords, bool is_canonical) {
+ RETURN_NOT_OK(
+ CheckSparseCOOIndexValidity(coords->type(), coords->shape(), coords->strides()));
+ return std::make_shared<SparseCOOIndex>(coords, is_canonical);
+}
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+ const std::shared_ptr<Tensor>& coords) {
+ RETURN_NOT_OK(
+ CheckSparseCOOIndexValidity(coords->type(), coords->shape(), coords->strides()));
+ auto is_canonical = DetectSparseCOOIndexCanonicality(coords);
+ return std::make_shared<SparseCOOIndex>(coords, is_canonical);
+}
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indices_shape,
+ const std::vector<int64_t>& indices_strides, std::shared_ptr<Buffer> indices_data,
+ bool is_canonical) {
+ RETURN_NOT_OK(
+ CheckSparseCOOIndexValidity(indices_type, indices_shape, indices_strides));
+ return std::make_shared<SparseCOOIndex>(
+ std::make_shared<Tensor>(indices_type, indices_data, indices_shape,
+ indices_strides),
+ is_canonical);
+}
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indices_shape,
+ const std::vector<int64_t>& indices_strides, std::shared_ptr<Buffer> indices_data) {
+ RETURN_NOT_OK(
+ CheckSparseCOOIndexValidity(indices_type, indices_shape, indices_strides));
+ auto coords = std::make_shared<Tensor>(indices_type, indices_data, indices_shape,
+ indices_strides);
+ auto is_canonical = DetectSparseCOOIndexCanonicality(coords);
+ return std::make_shared<SparseCOOIndex>(coords, is_canonical);
+}
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+ const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+ int64_t non_zero_length, std::shared_ptr<Buffer> indices_data, bool is_canonical) {
+ auto ndim = static_cast<int64_t>(shape.size());
+ if (!is_integer(indices_type->id())) {
+ return Status::TypeError("Type of SparseCOOIndex indices must be integer");
+ }
+ const int64_t elsize =
+ internal::checked_cast<const IntegerType&>(*indices_type).bit_width() / 8;
+ std::vector<int64_t> indices_shape({non_zero_length, ndim});
+ std::vector<int64_t> indices_strides({elsize * ndim, elsize});
+ return Make(indices_type, indices_shape, indices_strides, indices_data, is_canonical);
+}
+
+Result<std::shared_ptr<SparseCOOIndex>> SparseCOOIndex::Make(
+ const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+ int64_t non_zero_length, std::shared_ptr<Buffer> indices_data) {
+ auto ndim = static_cast<int64_t>(shape.size());
+ if (!is_integer(indices_type->id())) {
+ return Status::TypeError("Type of SparseCOOIndex indices must be integer");
+ }
+ const int64_t elsize = internal::GetByteWidth(*indices_type);
+ std::vector<int64_t> indices_shape({non_zero_length, ndim});
+ std::vector<int64_t> indices_strides({elsize * ndim, elsize});
+ return Make(indices_type, indices_shape, indices_strides, indices_data);
+}
+
+// Constructor with a contiguous NumericTensor
+SparseCOOIndex::SparseCOOIndex(const std::shared_ptr<Tensor>& coords, bool is_canonical)
+ : SparseIndexBase(), coords_(coords), is_canonical_(is_canonical) {
+ ARROW_CHECK_OK(
+ CheckSparseCOOIndexValidity(coords_->type(), coords_->shape(), coords_->strides()));
+}
+
+std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); }
+
+// ----------------------------------------------------------------------
+// SparseCSXIndex
+
+namespace internal {
+
+Status ValidateSparseCSXIndex(const std::shared_ptr<DataType>& indptr_type,
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indptr_shape,
+ const std::vector<int64_t>& indices_shape,
+ char const* type_name) {
+ if (!is_integer(indptr_type->id())) {
+ return Status::TypeError("Type of ", type_name, " indptr must be integer");
+ }
+ if (indptr_shape.size() != 1) {
+ return Status::Invalid(type_name, " indptr must be a vector");
+ }
+ if (!is_integer(indices_type->id())) {
+ return Status::Invalid("Type of ", type_name, " indices must be integer");
+ }
+ if (indices_shape.size() != 1) {
+ return Status::Invalid(type_name, " indices must be a vector");
+ }
+
+ RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indptr_type, indptr_shape));
+ RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indices_type, indices_shape));
+
+ return Status::OK();
+}
+
+void CheckSparseCSXIndexValidity(const std::shared_ptr<DataType>& indptr_type,
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indptr_shape,
+ const std::vector<int64_t>& indices_shape,
+ char const* type_name) {
+ ARROW_CHECK_OK(ValidateSparseCSXIndex(indptr_type, indices_type, indptr_shape,
+ indices_shape, type_name));
+}
+
+} // namespace internal
+
+// ----------------------------------------------------------------------
+// SparseCSFIndex
+
+namespace {
+
+inline Status CheckSparseCSFIndexValidity(const std::shared_ptr<DataType>& indptr_type,
+ const std::shared_ptr<DataType>& indices_type,
+ const int64_t num_indptrs,
+ const int64_t num_indices,
+ const int64_t axis_order_size) {
+ if (!is_integer(indptr_type->id())) {
+ return Status::TypeError("Type of SparseCSFIndex indptr must be integer");
+ }
+ if (!is_integer(indices_type->id())) {
+ return Status::TypeError("Type of SparseCSFIndex indices must be integer");
+ }
+ if (num_indptrs + 1 != num_indices) {
+ return Status::Invalid(
+ "Length of indices must be equal to length of indptrs + 1 for SparseCSFIndex.");
+ }
+ if (axis_order_size != num_indices) {
+ return Status::Invalid(
+ "Length of indices must be equal to number of dimensions for SparseCSFIndex.");
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+Result<std::shared_ptr<SparseCSFIndex>> SparseCSFIndex::Make(
+ const std::shared_ptr<DataType>& indptr_type,
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
+ const std::vector<std::shared_ptr<Buffer>>& indptr_data,
+ const std::vector<std::shared_ptr<Buffer>>& indices_data) {
+ int64_t ndim = axis_order.size();
+ std::vector<std::shared_ptr<Tensor>> indptr(ndim - 1);
+ std::vector<std::shared_ptr<Tensor>> indices(ndim);
+
+ for (int64_t i = 0; i < ndim - 1; ++i)
+ indptr[i] = std::make_shared<Tensor>(indptr_type, indptr_data[i],
+ std::vector<int64_t>({indices_shapes[i] + 1}));
+ for (int64_t i = 0; i < ndim; ++i)
+ indices[i] = std::make_shared<Tensor>(indices_type, indices_data[i],
+ std::vector<int64_t>({indices_shapes[i]}));
+
+ RETURN_NOT_OK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(),
+ indices.size(), axis_order.size()));
+
+  for (const auto& tensor : indptr) {
+    RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indptr_type, tensor->shape()));
+  }
+
+  for (const auto& tensor : indices) {
+    RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indices_type, tensor->shape()));
+  }
+
+ return std::make_shared<SparseCSFIndex>(indptr, indices, axis_order);
+}
+
+// Constructor with indptr and indices tensor vectors and an axis order
+SparseCSFIndex::SparseCSFIndex(const std::vector<std::shared_ptr<Tensor>>& indptr,
+ const std::vector<std::shared_ptr<Tensor>>& indices,
+ const std::vector<int64_t>& axis_order)
+ : SparseIndexBase(), indptr_(indptr), indices_(indices), axis_order_(axis_order) {
+ ARROW_CHECK_OK(CheckSparseCSFIndexValidity(indptr_.front()->type(),
+ indices_.front()->type(), indptr_.size(),
+ indices_.size(), axis_order_.size()));
+}
+
+std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); }
+
+bool SparseCSFIndex::Equals(const SparseCSFIndex& other) const {
+ for (int64_t i = 0; i < static_cast<int64_t>(indices().size()); ++i) {
+ if (!indices()[i]->Equals(*other.indices()[i])) return false;
+ }
+ for (int64_t i = 0; i < static_cast<int64_t>(indptr().size()); ++i) {
+ if (!indptr()[i]->Equals(*other.indptr()[i])) return false;
+ }
+ return axis_order() == other.axis_order();
+}
+
+// ----------------------------------------------------------------------
+// SparseTensor
+
+// Constructor with all attributes
+SparseTensor::SparseTensor(const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape,
+ const std::shared_ptr<SparseIndex>& sparse_index,
+ const std::vector<std::string>& dim_names)
+ : type_(type),
+ data_(data),
+ shape_(shape),
+ sparse_index_(sparse_index),
+ dim_names_(dim_names) {
+ ARROW_CHECK(is_tensor_supported(type->id()));
+}
+
+const std::string& SparseTensor::dim_name(int i) const {
+ static const std::string kEmpty = "";
+ if (dim_names_.size() == 0) {
+ return kEmpty;
+ } else {
+ ARROW_CHECK_LT(i, static_cast<int>(dim_names_.size()));
+ return dim_names_[i];
+ }
+}
+
+int64_t SparseTensor::size() const {
+ return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies<int64_t>());
+}
+
+bool SparseTensor::Equals(const SparseTensor& other, const EqualOptions& opts) const {
+ return SparseTensorEquals(*this, other, opts);
+}
+
+Result<std::shared_ptr<Tensor>> SparseTensor::ToTensor(MemoryPool* pool) const {
+  switch (format_id()) {
+    case SparseTensorFormat::COO:
+      return MakeTensorFromSparseCOOTensor(
+          pool, internal::checked_cast<const SparseCOOTensor*>(this));
+
+    case SparseTensorFormat::CSR:
+      return MakeTensorFromSparseCSRMatrix(
+          pool, internal::checked_cast<const SparseCSRMatrix*>(this));
+
+    case SparseTensorFormat::CSC:
+      return MakeTensorFromSparseCSCMatrix(
+          pool, internal::checked_cast<const SparseCSCMatrix*>(this));
+
+    case SparseTensorFormat::CSF:
+      return MakeTensorFromSparseCSFTensor(
+          pool, internal::checked_cast<const SparseCSFTensor*>(this));
+
+    default:
+      return Status::NotImplemented("Unsupported SparseIndex format type");
+  }
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/sparse_tensor.h b/contrib/libs/apache/arrow/cpp/src/arrow/sparse_tensor.h
new file mode 100644
index 00000000000..1f2f8c0d82e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/sparse_tensor.h
@@ -0,0 +1,624 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/tensor.h" // IWYU pragma: export
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace internal {
+
+ARROW_EXPORT
+Status CheckSparseIndexMaximumValue(const std::shared_ptr<DataType>& index_value_type,
+ const std::vector<int64_t>& shape);
+
+} // namespace internal
+
+// ----------------------------------------------------------------------
+// SparseIndex class
+
+struct SparseTensorFormat {
+ /// EXPERIMENTAL: The index format type of SparseTensor
+ enum type {
+ /// Coordinate list (COO) format.
+ COO,
+ /// Compressed sparse row (CSR) format.
+ CSR,
+ /// Compressed sparse column (CSC) format.
+ CSC,
+ /// Compressed sparse fiber (CSF) format.
+ CSF
+ };
+};
+
+/// \brief EXPERIMENTAL: The base class for the index of a sparse tensor
+///
+/// SparseIndex describes where the non-zero elements are within a SparseTensor.
+///
+/// There are several ways to represent this. The format_id is used to
+/// distinguish what kind of representation is used. Each possible value of
+/// format_id must have only one corresponding concrete subclass of SparseIndex.
+class ARROW_EXPORT SparseIndex {
+ public:
+ explicit SparseIndex(SparseTensorFormat::type format_id) : format_id_(format_id) {}
+
+ virtual ~SparseIndex() = default;
+
+ /// \brief Return the identifier of the format type
+ SparseTensorFormat::type format_id() const { return format_id_; }
+
+  /// \brief Return the number of non-zero values in the sparse tensor related
+  /// to this sparse index
+ virtual int64_t non_zero_length() const = 0;
+
+ /// \brief Return the string representation of the sparse index
+ virtual std::string ToString() const = 0;
+
+ virtual Status ValidateShape(const std::vector<int64_t>& shape) const;
+
+ protected:
+ const SparseTensorFormat::type format_id_;
+};
+
+namespace internal {
+template <typename SparseIndexType>
+class SparseIndexBase : public SparseIndex {
+ public:
+ SparseIndexBase() : SparseIndex(SparseIndexType::format_id) {}
+};
+} // namespace internal
+
+// ----------------------------------------------------------------------
+// SparseCOOIndex class
+
+/// \brief EXPERIMENTAL: The index data for a COO sparse tensor
+///
+/// A COO sparse index manages the location of its non-zero values by their
+/// coordinates.
+class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase<SparseCOOIndex> {
+ public:
+ static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::COO;
+
+ /// \brief Make SparseCOOIndex from a coords tensor and canonicality
+ static Result<std::shared_ptr<SparseCOOIndex>> Make(
+ const std::shared_ptr<Tensor>& coords, bool is_canonical);
+
+ /// \brief Make SparseCOOIndex from a coords tensor with canonicality auto-detection
+ static Result<std::shared_ptr<SparseCOOIndex>> Make(
+ const std::shared_ptr<Tensor>& coords);
+
+ /// \brief Make SparseCOOIndex from raw properties with canonicality auto-detection
+ static Result<std::shared_ptr<SparseCOOIndex>> Make(
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indices_shape,
+ const std::vector<int64_t>& indices_strides, std::shared_ptr<Buffer> indices_data);
+
+ /// \brief Make SparseCOOIndex from raw properties
+ static Result<std::shared_ptr<SparseCOOIndex>> Make(
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indices_shape,
+ const std::vector<int64_t>& indices_strides, std::shared_ptr<Buffer> indices_data,
+ bool is_canonical);
+
+ /// \brief Make SparseCOOIndex from sparse tensor's shape properties and data
+ /// with canonicality auto-detection
+ ///
+ /// The indices_data should be in row-major (C-like) order. If not,
+ /// use the raw properties constructor.
+ static Result<std::shared_ptr<SparseCOOIndex>> Make(
+ const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+ int64_t non_zero_length, std::shared_ptr<Buffer> indices_data);
+
+ /// \brief Make SparseCOOIndex from sparse tensor's shape properties and data
+ ///
+ /// The indices_data should be in row-major (C-like) order. If not,
+ /// use the raw properties constructor.
+ static Result<std::shared_ptr<SparseCOOIndex>> Make(
+ const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+ int64_t non_zero_length, std::shared_ptr<Buffer> indices_data, bool is_canonical);
+
+  /// \brief Construct SparseCOOIndex from a contiguous coords tensor
+ explicit SparseCOOIndex(const std::shared_ptr<Tensor>& coords, bool is_canonical);
+
+ /// \brief Return a tensor that has the coordinates of the non-zero values
+ ///
+ /// The returned tensor is a N x D tensor where N is the number of non-zero
+ /// values and D is the number of dimensions in the logical data.
+ /// The column at index `i` is a D-tuple of coordinates indicating that the
+ /// logical value at those coordinates should be found at physical index `i`.
+ const std::shared_ptr<Tensor>& indices() const { return coords_; }
+
+  /// \brief Return the number of non-zero values in the sparse tensor related
+  /// to this sparse index
+ int64_t non_zero_length() const override { return coords_->shape()[0]; }
+
+  /// \brief Return whether the sparse tensor index is canonical
+  ///
+  /// A canonical index is sorted in lexicographical order and the
+  /// corresponding sparse tensor has no duplicated entries.
+ bool is_canonical() const { return is_canonical_; }
+
+ /// \brief Return a string representation of the sparse index
+ std::string ToString() const override;
+
+ /// \brief Return whether the COO indices are equal
+ bool Equals(const SparseCOOIndex& other) const {
+ return indices()->Equals(*other.indices());
+ }
+
+ inline Status ValidateShape(const std::vector<int64_t>& shape) const override {
+ ARROW_RETURN_NOT_OK(SparseIndex::ValidateShape(shape));
+
+ if (static_cast<size_t>(coords_->shape()[1]) == shape.size()) {
+ return Status::OK();
+ }
+
+ return Status::Invalid(
+ "shape length is inconsistent with the coords matrix in COO index");
+ }
+
+ protected:
+ std::shared_ptr<Tensor> coords_;
+ bool is_canonical_;
+};
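+
+// Editor's note: a hedged construction sketch (illustrative only; to be read
+// inside a Result-returning function). The coords buffer holds an N x D
+// row-major matrix of coordinates; here N = 3 non-zero values in a 3 x 4
+// tensor (D = 2):
+//
+//   std::vector<int64_t> coords = {0, 0,  1, 2,  2, 3};
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto index,
+//       arrow::SparseCOOIndex::Make(arrow::int64(), /*shape=*/{3, 4},
+//                                   /*non_zero_length=*/3,
+//                                   arrow::Buffer::Wrap(coords)));
+//   // index->is_canonical() is auto-detected by this overload.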
+
+namespace internal {
+
+/// EXPERIMENTAL: The axis to be compressed
+enum class SparseMatrixCompressedAxis : char {
+ /// The value for CSR matrix
+ ROW,
+ /// The value for CSC matrix
+ COLUMN
+};
+
+ARROW_EXPORT
+Status ValidateSparseCSXIndex(const std::shared_ptr<DataType>& indptr_type,
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indptr_shape,
+ const std::vector<int64_t>& indices_shape,
+ char const* type_name);
+
+ARROW_EXPORT
+void CheckSparseCSXIndexValidity(const std::shared_ptr<DataType>& indptr_type,
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indptr_shape,
+ const std::vector<int64_t>& indices_shape,
+ char const* type_name);
+
+template <typename SparseIndexType, SparseMatrixCompressedAxis COMPRESSED_AXIS>
+class SparseCSXIndex : public SparseIndexBase<SparseIndexType> {
+ public:
+ static constexpr SparseMatrixCompressedAxis kCompressedAxis = COMPRESSED_AXIS;
+
+ /// \brief Make a subclass of SparseCSXIndex from raw properties
+ static Result<std::shared_ptr<SparseIndexType>> Make(
+ const std::shared_ptr<DataType>& indptr_type,
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indptr_shape, const std::vector<int64_t>& indices_shape,
+ std::shared_ptr<Buffer> indptr_data, std::shared_ptr<Buffer> indices_data) {
+ ARROW_RETURN_NOT_OK(ValidateSparseCSXIndex(indptr_type, indices_type, indptr_shape,
+ indices_shape,
+ SparseIndexType::kTypeName));
+ return std::make_shared<SparseIndexType>(
+ std::make_shared<Tensor>(indptr_type, indptr_data, indptr_shape),
+ std::make_shared<Tensor>(indices_type, indices_data, indices_shape));
+ }
+
+ /// \brief Make a subclass of SparseCSXIndex from raw properties
+ static Result<std::shared_ptr<SparseIndexType>> Make(
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indptr_shape, const std::vector<int64_t>& indices_shape,
+ std::shared_ptr<Buffer> indptr_data, std::shared_ptr<Buffer> indices_data) {
+ return Make(indices_type, indices_type, indptr_shape, indices_shape, indptr_data,
+ indices_data);
+ }
+
+ /// \brief Make a subclass of SparseCSXIndex from sparse tensor's shape properties and
+ /// data
+ static Result<std::shared_ptr<SparseIndexType>> Make(
+ const std::shared_ptr<DataType>& indptr_type,
+ const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+ int64_t non_zero_length, std::shared_ptr<Buffer> indptr_data,
+ std::shared_ptr<Buffer> indices_data) {
+ std::vector<int64_t> indptr_shape({shape[0] + 1});
+ std::vector<int64_t> indices_shape({non_zero_length});
+ return Make(indptr_type, indices_type, indptr_shape, indices_shape, indptr_data,
+ indices_data);
+ }
+
+ /// \brief Make a subclass of SparseCSXIndex from sparse tensor's shape properties and
+ /// data
+ static Result<std::shared_ptr<SparseIndexType>> Make(
+ const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+ int64_t non_zero_length, std::shared_ptr<Buffer> indptr_data,
+ std::shared_ptr<Buffer> indices_data) {
+ return Make(indices_type, indices_type, shape, non_zero_length, indptr_data,
+ indices_data);
+ }
+
+ /// \brief Construct SparseCSXIndex from two index vectors
+ explicit SparseCSXIndex(const std::shared_ptr<Tensor>& indptr,
+ const std::shared_ptr<Tensor>& indices)
+ : SparseIndexBase<SparseIndexType>(), indptr_(indptr), indices_(indices) {
+ CheckSparseCSXIndexValidity(indptr_->type(), indices_->type(), indptr_->shape(),
+ indices_->shape(), SparseIndexType::kTypeName);
+ }
+
+ /// \brief Return a 1D tensor of indptr vector
+ const std::shared_ptr<Tensor>& indptr() const { return indptr_; }
+
+ /// \brief Return a 1D tensor of indices vector
+ const std::shared_ptr<Tensor>& indices() const { return indices_; }
+
+  /// \brief Return the number of non-zero values in the sparse tensor related
+  /// to this sparse index
+ int64_t non_zero_length() const override { return indices_->shape()[0]; }
+
+ /// \brief Return a string representation of the sparse index
+ std::string ToString() const override {
+ return std::string(SparseIndexType::kTypeName);
+ }
+
+  /// \brief Return whether the indices are equal
+ bool Equals(const SparseIndexType& other) const {
+ return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices());
+ }
+
+ inline Status ValidateShape(const std::vector<int64_t>& shape) const override {
+ ARROW_RETURN_NOT_OK(SparseIndex::ValidateShape(shape));
+
+ if (shape.size() < 2) {
+ return Status::Invalid("shape length is too short");
+ }
+
+ if (shape.size() > 2) {
+ return Status::Invalid("shape length is too long");
+ }
+
+ if (indptr_->shape()[0] == shape[static_cast<int64_t>(kCompressedAxis)] + 1) {
+ return Status::OK();
+ }
+
+ return Status::Invalid("shape length is inconsistent with the ", ToString());
+ }
+
+ protected:
+ std::shared_ptr<Tensor> indptr_;
+ std::shared_ptr<Tensor> indices_;
+};
+
+} // namespace internal
+
+// ----------------------------------------------------------------------
+// SparseCSRIndex class
+
+/// \brief EXPERIMENTAL: The index data for a CSR sparse matrix
+///
+/// A CSR sparse index manages the location of its non-zero values by two
+/// vectors.
+///
+/// The first vector, called indptr, represents the range of the rows; the i-th
+/// row spans from indptr[i] to indptr[i+1] in the corresponding value vector.
+/// So the length of an indptr vector is the number of rows + 1.
+///
+/// The other vector, called indices, represents the column indices of the
+/// corresponding non-zero values. So the length of an indices vector is the
+/// same as the number of non-zero values.
+class ARROW_EXPORT SparseCSRIndex
+ : public internal::SparseCSXIndex<SparseCSRIndex,
+ internal::SparseMatrixCompressedAxis::ROW> {
+ public:
+ using BaseClass =
+ internal::SparseCSXIndex<SparseCSRIndex, internal::SparseMatrixCompressedAxis::ROW>;
+
+ static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR;
+ static constexpr char const* kTypeName = "SparseCSRIndex";
+
+ using SparseCSXIndex::kCompressedAxis;
+ using SparseCSXIndex::Make;
+ using SparseCSXIndex::SparseCSXIndex;
+};
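+
+// Editor's note: a worked example of the CSR encoding described above. The
+// 3 x 4 matrix
+//
+//   [[1, 0, 2, 0],
+//    [0, 0, 3, 0],
+//    [4, 5, 0, 0]]
+//
+// stores values [1, 2, 3, 4, 5] with indptr = [0, 2, 3, 5] (rows + 1 entries)
+// and indices = [0, 2, 2, 0, 1] (the column of each non-zero value).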
+
+// ----------------------------------------------------------------------
+// SparseCSCIndex class
+
+/// \brief EXPERIMENTAL: The index data for a CSC sparse matrix
+///
+/// A CSC sparse index manages the location of its non-zero values by two
+/// vectors.
+///
+/// The first vector, called indptr, represents the range of the columns; the i-th
+/// column spans from indptr[i] to indptr[i+1] in the corresponding value vector.
+/// So the length of an indptr vector is the number of columns + 1.
+///
+/// The other vector, called indices, represents the row indices of the
+/// corresponding non-zero values. So the length of an indices vector is the
+/// same as the number of non-zero values.
+class ARROW_EXPORT SparseCSCIndex
+ : public internal::SparseCSXIndex<SparseCSCIndex,
+ internal::SparseMatrixCompressedAxis::COLUMN> {
+ public:
+ using BaseClass =
+ internal::SparseCSXIndex<SparseCSCIndex,
+ internal::SparseMatrixCompressedAxis::COLUMN>;
+
+ static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSC;
+ static constexpr char const* kTypeName = "SparseCSCIndex";
+
+ using SparseCSXIndex::kCompressedAxis;
+ using SparseCSXIndex::Make;
+ using SparseCSXIndex::SparseCSXIndex;
+};
+
+// ----------------------------------------------------------------------
+// SparseCSFIndex class
+
+/// \brief EXPERIMENTAL: The index data for a CSF sparse tensor
+///
+/// A CSF sparse index manages the location of its non-zero values by a set of
+/// prefix trees. Each path from a root to a leaf forms one non-zero tensor index.
+/// CSF is implemented with three vectors.
+///
+/// The indptr and indices vectors contain N-1 and N buffers respectively, where N
+/// is the number of dimensions. The axis_order vector contains N integers. Together,
+/// indptr and indices describe the set of prefix trees; the trees traverse the
+/// dimensions in the order given by axis_order.
+class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase<SparseCSFIndex> {
+ public:
+ static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF;
+ static constexpr char const* kTypeName = "SparseCSFIndex";
+
+ /// \brief Make SparseCSFIndex from raw properties
+ static Result<std::shared_ptr<SparseCSFIndex>> Make(
+ const std::shared_ptr<DataType>& indptr_type,
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
+ const std::vector<std::shared_ptr<Buffer>>& indptr_data,
+ const std::vector<std::shared_ptr<Buffer>>& indices_data);
+
+ /// \brief Make SparseCSFIndex from raw properties
+ static Result<std::shared_ptr<SparseCSFIndex>> Make(
+ const std::shared_ptr<DataType>& indices_type,
+ const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
+ const std::vector<std::shared_ptr<Buffer>>& indptr_data,
+ const std::vector<std::shared_ptr<Buffer>>& indices_data) {
+ return Make(indices_type, indices_type, indices_shapes, axis_order, indptr_data,
+ indices_data);
+ }
+
+  /// \brief Construct SparseCSFIndex from indptr and indices tensor vectors and
+  /// an axis order
+ explicit SparseCSFIndex(const std::vector<std::shared_ptr<Tensor>>& indptr,
+ const std::vector<std::shared_ptr<Tensor>>& indices,
+ const std::vector<int64_t>& axis_order);
+
+ /// \brief Return a 1D vector of indptr tensors
+ const std::vector<std::shared_ptr<Tensor>>& indptr() const { return indptr_; }
+
+ /// \brief Return a 1D vector of indices tensors
+ const std::vector<std::shared_ptr<Tensor>>& indices() const { return indices_; }
+
+ /// \brief Return a 1D vector specifying the order of axes
+ const std::vector<int64_t>& axis_order() const { return axis_order_; }
+
+  /// \brief Return the number of non-zero values in the sparse tensor related
+  /// to this sparse index
+ int64_t non_zero_length() const override { return indices_.back()->shape()[0]; }
+
+ /// \brief Return a string representation of the sparse index
+ std::string ToString() const override;
+
+ /// \brief Return whether the CSF indices are equal
+ bool Equals(const SparseCSFIndex& other) const;
+
+ protected:
+ std::vector<std::shared_ptr<Tensor>> indptr_;
+ std::vector<std::shared_ptr<Tensor>> indices_;
+ std::vector<int64_t> axis_order_;
+};
+
+// ----------------------------------------------------------------------
+// SparseTensor class
+
+/// \brief EXPERIMENTAL: The base class of sparse tensor container
+class ARROW_EXPORT SparseTensor {
+ public:
+ virtual ~SparseTensor() = default;
+
+ SparseTensorFormat::type format_id() const { return sparse_index_->format_id(); }
+
+ /// \brief Return a value type of the sparse tensor
+ std::shared_ptr<DataType> type() const { return type_; }
+
+ /// \brief Return a buffer that contains the value vector of the sparse tensor
+ std::shared_ptr<Buffer> data() const { return data_; }
+
+ /// \brief Return an immutable raw data pointer
+ const uint8_t* raw_data() const { return data_->data(); }
+
+ /// \brief Return a mutable raw data pointer
+ uint8_t* raw_mutable_data() const { return data_->mutable_data(); }
+
+ /// \brief Return a shape vector of the sparse tensor
+ const std::vector<int64_t>& shape() const { return shape_; }
+
+ /// \brief Return a sparse index of the sparse tensor
+ const std::shared_ptr<SparseIndex>& sparse_index() const { return sparse_index_; }
+
+  /// \brief Return the number of dimensions of the sparse tensor
+ int ndim() const { return static_cast<int>(shape_.size()); }
+
+ /// \brief Return a vector of dimension names
+ const std::vector<std::string>& dim_names() const { return dim_names_; }
+
+ /// \brief Return the name of the i-th dimension
+ const std::string& dim_name(int i) const;
+
+ /// \brief Total number of value cells in the sparse tensor
+ int64_t size() const;
+
+ /// \brief Return true if the underlying data buffer is mutable
+ bool is_mutable() const { return data_->is_mutable(); }
+
+ /// \brief Total number of non-zero cells in the sparse tensor
+ int64_t non_zero_length() const {
+ return sparse_index_ ? sparse_index_->non_zero_length() : 0;
+ }
+
+ /// \brief Return whether sparse tensors are equal
+ bool Equals(const SparseTensor& other,
+ const EqualOptions& = EqualOptions::Defaults()) const;
+
+ /// \brief Return dense representation of sparse tensor as tensor
+ ///
+ /// The returned Tensor has row-major order (C-like).
+ Result<std::shared_ptr<Tensor>> ToTensor(MemoryPool* pool) const;
+ Result<std::shared_ptr<Tensor>> ToTensor() const {
+ return ToTensor(default_memory_pool());
+ }
+
+  /// \brief Status-returning versions of ToTensor().
+  ARROW_DEPRECATED("Use Result-returning version")
+  Status ToTensor(std::shared_ptr<Tensor>* out) const { return ToTensor().Value(out); }
+  ARROW_DEPRECATED("Use Result-returning version")
+  Status ToTensor(MemoryPool* pool, std::shared_ptr<Tensor>* out) const {
+    return ToTensor(pool).Value(out);
+  }
+
+ protected:
+ // Constructor with all attributes
+ SparseTensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape,
+ const std::shared_ptr<SparseIndex>& sparse_index,
+ const std::vector<std::string>& dim_names);
+
+ std::shared_ptr<DataType> type_;
+ std::shared_ptr<Buffer> data_;
+ std::vector<int64_t> shape_;
+ std::shared_ptr<SparseIndex> sparse_index_;
+
+ // These names are optional
+ std::vector<std::string> dim_names_;
+};
+
+// ----------------------------------------------------------------------
+// SparseTensorImpl class
+
+namespace internal {
+
+ARROW_EXPORT
+Status MakeSparseTensorFromTensor(const Tensor& tensor,
+ SparseTensorFormat::type sparse_format_id,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool,
+ std::shared_ptr<SparseIndex>* out_sparse_index,
+ std::shared_ptr<Buffer>* out_data);
+
+} // namespace internal
+
+/// \brief EXPERIMENTAL: Concrete sparse tensor implementation, parameterized by
+/// the sparse index type
+template <typename SparseIndexType>
+class SparseTensorImpl : public SparseTensor {
+ public:
+ virtual ~SparseTensorImpl() = default;
+
+ /// \brief Construct a sparse tensor from physical data buffer and logical index
+ SparseTensorImpl(const std::shared_ptr<SparseIndexType>& sparse_index,
+ const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names)
+ : SparseTensor(type, data, shape, sparse_index, dim_names) {}
+
+ /// \brief Construct an empty sparse tensor
+ SparseTensorImpl(const std::shared_ptr<DataType>& type,
+ const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names = {})
+ : SparseTensorImpl(NULLPTR, type, NULLPTR, shape, dim_names) {}
+
+ /// \brief Create a SparseTensor with full parameters
+ static inline Result<std::shared_ptr<SparseTensorImpl<SparseIndexType>>> Make(
+ const std::shared_ptr<SparseIndexType>& sparse_index,
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names) {
+ if (!is_tensor_supported(type->id())) {
+ return Status::Invalid(type->ToString(),
+ " is not valid data type for a sparse tensor");
+ }
+ ARROW_RETURN_NOT_OK(sparse_index->ValidateShape(shape));
+ if (dim_names.size() > 0 && dim_names.size() != shape.size()) {
+ return Status::Invalid("dim_names length is inconsistent with shape");
+ }
+ return std::make_shared<SparseTensorImpl<SparseIndexType>>(sparse_index, type, data,
+ shape, dim_names);
+ }
+
+ /// \brief Create a sparse tensor from a dense tensor
+ ///
+ /// The dense tensor is re-encoded as a sparse index and a physical
+ /// data buffer for the non-zero value.
+ static inline Result<std::shared_ptr<SparseTensorImpl<SparseIndexType>>> Make(
+ const Tensor& tensor, const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool = default_memory_pool()) {
+ std::shared_ptr<SparseIndex> sparse_index;
+ std::shared_ptr<Buffer> data;
+ ARROW_RETURN_NOT_OK(internal::MakeSparseTensorFromTensor(
+ tensor, SparseIndexType::format_id, index_value_type, pool, &sparse_index,
+ &data));
+ return std::make_shared<SparseTensorImpl<SparseIndexType>>(
+ internal::checked_pointer_cast<SparseIndexType>(sparse_index), tensor.type(),
+ data, tensor.shape(), tensor.dim_names_);
+ }
+
+ static inline Result<std::shared_ptr<SparseTensorImpl<SparseIndexType>>> Make(
+ const Tensor& tensor, MemoryPool* pool = default_memory_pool()) {
+ return Make(tensor, int64(), pool);
+ }
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensorImpl);
+};
+
+/// \brief EXPERIMENTAL: Type alias for COO sparse tensor
+using SparseCOOTensor = SparseTensorImpl<SparseCOOIndex>;
+
+/// \brief EXPERIMENTAL: Type alias for CSR sparse matrix
+using SparseCSRMatrix = SparseTensorImpl<SparseCSRIndex>;
+
+/// \brief EXPERIMENTAL: Type alias for CSC sparse matrix
+using SparseCSCMatrix = SparseTensorImpl<SparseCSCIndex>;
+
+/// \brief EXPERIMENTAL: Type alias for CSF sparse tensor
+using SparseCSFTensor = SparseTensorImpl<SparseCSFIndex>;
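+
+// Editor's note: a hedged end-to-end sketch (illustrative only; `dense` stands
+// for any std::shared_ptr<arrow::Tensor>, and the snippet is to be read inside
+// a Result-returning function). Sparsify a dense tensor into COO form, then
+// densify it again:
+//
+//   ARROW_ASSIGN_OR_RAISE(auto sparse,
+//                         arrow::SparseCOOTensor::Make(*dense, arrow::int64()));
+//   ARROW_ASSIGN_OR_RAISE(auto roundtrip, sparse->ToTensor());
+//   // roundtrip->Equals(*dense) should hold; only non-zero values were stored.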
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/status.cc b/contrib/libs/apache/arrow/cpp/src/arrow/status.cc
new file mode 100644
index 00000000000..0f02cb57a23
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/status.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation. It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#include "arrow/status.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+Status::Status(StatusCode code, const std::string& msg)
+ : Status::Status(code, msg, nullptr) {}
+
+Status::Status(StatusCode code, std::string msg, std::shared_ptr<StatusDetail> detail) {
+ ARROW_CHECK_NE(code, StatusCode::OK) << "Cannot construct ok status with message";
+ state_ = new State;
+ state_->code = code;
+ state_->msg = std::move(msg);
+ if (detail != nullptr) {
+ state_->detail = std::move(detail);
+ }
+}
+
+void Status::CopyFrom(const Status& s) {
+ delete state_;
+ if (s.state_ == nullptr) {
+ state_ = nullptr;
+ } else {
+ state_ = new State(*s.state_);
+ }
+}
+
+std::string Status::CodeAsString() const {
+ if (state_ == nullptr) {
+ return "OK";
+ }
+ return CodeAsString(code());
+}
+
+std::string Status::CodeAsString(StatusCode code) {
+ const char* type;
+ switch (code) {
+ case StatusCode::OK:
+ type = "OK";
+ break;
+ case StatusCode::OutOfMemory:
+ type = "Out of memory";
+ break;
+ case StatusCode::KeyError:
+ type = "Key error";
+ break;
+ case StatusCode::TypeError:
+ type = "Type error";
+ break;
+ case StatusCode::Invalid:
+ type = "Invalid";
+ break;
+ case StatusCode::Cancelled:
+ type = "Cancelled";
+ break;
+ case StatusCode::IOError:
+ type = "IOError";
+ break;
+ case StatusCode::CapacityError:
+ type = "Capacity error";
+ break;
+ case StatusCode::IndexError:
+ type = "Index error";
+ break;
+ case StatusCode::UnknownError:
+ type = "Unknown error";
+ break;
+ case StatusCode::NotImplemented:
+ type = "NotImplemented";
+ break;
+ case StatusCode::SerializationError:
+ type = "Serialization error";
+ break;
+ case StatusCode::CodeGenError:
+ type = "CodeGenError in Gandiva";
+ break;
+ case StatusCode::ExpressionValidationError:
+ type = "ExpressionValidationError";
+ break;
+ case StatusCode::ExecutionError:
+ type = "ExecutionError in Gandiva";
+ break;
+ default:
+ type = "Unknown";
+ break;
+ }
+ return std::string(type);
+}
+
+std::string Status::ToString() const {
+ std::string result(CodeAsString());
+ if (state_ == nullptr) {
+ return result;
+ }
+ result += ": ";
+ result += state_->msg;
+ if (state_->detail != nullptr) {
+ result += ". Detail: ";
+ result += state_->detail->ToString();
+ }
+
+ return result;
+}
+
+void Status::Abort() const { Abort(std::string()); }
+
+void Status::Abort(const std::string& message) const {
+ std::cerr << "-- Arrow Fatal Error --\n";
+ if (!message.empty()) {
+ std::cerr << message << "\n";
+ }
+ std::cerr << ToString() << std::endl;
+ std::abort();
+}
+
+#ifdef ARROW_EXTRA_ERROR_CONTEXT
+void Status::AddContextLine(const char* filename, int line, const char* expr) {
+ ARROW_CHECK(!ok()) << "Cannot add context line to ok status";
+ std::stringstream ss;
+ ss << "\n" << filename << ":" << line << " " << expr;
+ state_->msg += ss.str();
+}
+#endif
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/status.h b/contrib/libs/apache/arrow/cpp/src/arrow/status.h
new file mode 100644
index 00000000000..056d60d6f32
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/status.h
@@ -0,0 +1,451 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation. It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+// Adapted from Apache Kudu, TensorFlow
+
+#pragma once
+
+#include <cstring>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/util/compare.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_builder.h"
+#include "arrow/util/visibility.h"
+
+#ifdef ARROW_EXTRA_ERROR_CONTEXT
+
+/// \brief Return with the given status if the condition is met.
+#define ARROW_RETURN_IF_(condition, status, expr) \
+ do { \
+ if (ARROW_PREDICT_FALSE(condition)) { \
+ ::arrow::Status _st = (status); \
+ _st.AddContextLine(__FILE__, __LINE__, expr); \
+ return _st; \
+ } \
+ } while (0)
+
+#else
+
+#define ARROW_RETURN_IF_(condition, status, _) \
+ do { \
+ if (ARROW_PREDICT_FALSE(condition)) { \
+ return (status); \
+ } \
+ } while (0)
+
+#endif // ARROW_EXTRA_ERROR_CONTEXT
+
+#define ARROW_RETURN_IF(condition, status) \
+ ARROW_RETURN_IF_(condition, status, ARROW_STRINGIFY(status))
+
+/// \brief Propagate any non-successful Status to the caller
+#define ARROW_RETURN_NOT_OK(status) \
+ do { \
+ ::arrow::Status __s = ::arrow::internal::GenericToStatus(status); \
+ ARROW_RETURN_IF_(!__s.ok(), __s, ARROW_STRINGIFY(status)); \
+ } while (false)
+
+#define RETURN_NOT_OK_ELSE(s, else_) \
+ do { \
+ ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
+ if (!_s.ok()) { \
+ else_; \
+ return _s; \
+ } \
+ } while (false)
+
+// This is an internal-use macro and should not be used in public headers.
+#ifndef RETURN_NOT_OK
+#define RETURN_NOT_OK(s) ARROW_RETURN_NOT_OK(s)
+#endif
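+
+// Illustrative usage sketch (Prepare() and Execute() are hypothetical
+// functions returning arrow::Status):
+//
+//   arrow::Status DoWork() {
+//     ARROW_RETURN_NOT_OK(Prepare());  // early-returns any non-OK Status
+//     ARROW_RETURN_NOT_OK(Execute());
+//     return arrow::Status::OK();
+//   }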
+
+namespace arrow {
+
+enum class StatusCode : char {
+ OK = 0,
+ OutOfMemory = 1,
+ KeyError = 2,
+ TypeError = 3,
+ Invalid = 4,
+ IOError = 5,
+ CapacityError = 6,
+ IndexError = 7,
+ Cancelled = 8,
+ UnknownError = 9,
+ NotImplemented = 10,
+ SerializationError = 11,
+ RError = 13,
+ // Gandiva range of errors
+ CodeGenError = 40,
+ ExpressionValidationError = 41,
+ ExecutionError = 42,
+ // Continue generic codes.
+ AlreadyExists = 45
+};
+
+/// \brief An opaque class that allows subsystems to retain
+/// additional information inside the Status.
+class ARROW_EXPORT StatusDetail {
+ public:
+ virtual ~StatusDetail() = default;
+ /// \brief Return a unique id for the type of the StatusDetail
+ /// (effectively a poor man's substitute for RTTI).
+ virtual const char* type_id() const = 0;
+ /// \brief Produce a human-readable description of this status.
+ virtual std::string ToString() const = 0;
+
+ bool operator==(const StatusDetail& other) const noexcept {
+ return std::string(type_id()) == other.type_id() && ToString() == other.ToString();
+ }
+};
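+
+// Illustrative sketch of a subsystem attaching structured error data by
+// subclassing StatusDetail (all names here are hypothetical):
+//
+//   class FooDetail : public arrow::StatusDetail {
+//    public:
+//     explicit FooDetail(int error_code) : error_code_(error_code) {}
+//     const char* type_id() const override { return "foo::FooDetail"; }
+//     std::string ToString() const override {
+//       return "foo error code " + std::to_string(error_code_);
+//     }
+//    private:
+//     int error_code_;
+//   };
+//
+//   arrow::Status Fail() {
+//     return arrow::Status::FromDetailAndArgs(
+//         arrow::StatusCode::IOError, std::make_shared<FooDetail>(42), "foo failed");
+//   }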
+
+/// \brief Status outcome object (success or error)
+///
+/// The Status object is an object holding the outcome of an operation.
+/// The outcome is represented as a StatusCode, either success
+/// (StatusCode::OK) or an error (any other of the StatusCode enumeration values).
+///
+/// Additionally, if an error occurred, a specific error message is generally
+/// attached.
+class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable<Status>,
+ public util::ToStringOstreamable<Status> {
+ public:
+ // Create a success status.
+ Status() noexcept : state_(NULLPTR) {}
+ ~Status() noexcept {
+ // ARROW-2400: On certain compilers, splitting off the slow path improves
+ // performance significantly.
+ if (ARROW_PREDICT_FALSE(state_ != NULLPTR)) {
+ DeleteState();
+ }
+ }
+
+ Status(StatusCode code, const std::string& msg);
+ /// \brief Pluggable constructor for use by sub-systems. detail cannot be null.
+ Status(StatusCode code, std::string msg, std::shared_ptr<StatusDetail> detail);
+
+ // Copy the specified status.
+ inline Status(const Status& s);
+ inline Status& operator=(const Status& s);
+
+ // Move the specified status.
+ inline Status(Status&& s) noexcept;
+ inline Status& operator=(Status&& s) noexcept;
+
+ inline bool Equals(const Status& s) const;
+
+ // AND the statuses.
+ inline Status operator&(const Status& s) const noexcept;
+ inline Status operator&(Status&& s) const noexcept;
+ inline Status& operator&=(const Status& s) noexcept;
+ inline Status& operator&=(Status&& s) noexcept;
+
+ /// Return a success status
+ static Status OK() { return Status(); }
+
+ template <typename... Args>
+ static Status FromArgs(StatusCode code, Args&&... args) {
+ return Status(code, util::StringBuilder(std::forward<Args>(args)...));
+ }
+
+ template <typename... Args>
+ static Status FromDetailAndArgs(StatusCode code, std::shared_ptr<StatusDetail> detail,
+ Args&&... args) {
+ return Status(code, util::StringBuilder(std::forward<Args>(args)...),
+ std::move(detail));
+ }
+
+ /// Return an error status for out-of-memory conditions
+ template <typename... Args>
+ static Status OutOfMemory(Args&&... args) {
+ return Status::FromArgs(StatusCode::OutOfMemory, std::forward<Args>(args)...);
+ }
+
+ /// Return an error status for failed key lookups (e.g. column name in a table)
+ template <typename... Args>
+ static Status KeyError(Args&&... args) {
+ return Status::FromArgs(StatusCode::KeyError, std::forward<Args>(args)...);
+ }
+
+ /// Return an error status for type errors (such as mismatching data types)
+ template <typename... Args>
+ static Status TypeError(Args&&... args) {
+ return Status::FromArgs(StatusCode::TypeError, std::forward<Args>(args)...);
+ }
+
+ /// Return an error status for unknown errors
+ template <typename... Args>
+ static Status UnknownError(Args&&... args) {
+ return Status::FromArgs(StatusCode::UnknownError, std::forward<Args>(args)...);
+ }
+
+ /// Return an error status when an operation or a combination of operation and
+ /// data types is unimplemented
+ template <typename... Args>
+ static Status NotImplemented(Args&&... args) {
+ return Status::FromArgs(StatusCode::NotImplemented, std::forward<Args>(args)...);
+ }
+
+ /// Return an error status for invalid data (for example a string that fails parsing)
+ template <typename... Args>
+ static Status Invalid(Args&&... args) {
+ return Status::FromArgs(StatusCode::Invalid, std::forward<Args>(args)...);
+ }
+
+ /// Return an error status for cancelled operation
+ template <typename... Args>
+ static Status Cancelled(Args&&... args) {
+ return Status::FromArgs(StatusCode::Cancelled, std::forward<Args>(args)...);
+ }
+
+ /// Return an error status when an index is out of bounds
+ template <typename... Args>
+ static Status IndexError(Args&&... args) {
+ return Status::FromArgs(StatusCode::IndexError, std::forward<Args>(args)...);
+ }
+
+ /// Return an error status when a container's capacity would exceed its limits
+ template <typename... Args>
+ static Status CapacityError(Args&&... args) {
+ return Status::FromArgs(StatusCode::CapacityError, std::forward<Args>(args)...);
+ }
+
+ /// Return an error status when some IO-related operation failed
+ template <typename... Args>
+ static Status IOError(Args&&... args) {
+ return Status::FromArgs(StatusCode::IOError, std::forward<Args>(args)...);
+ }
+
+ /// Return an error status when some (de)serialization operation failed
+ template <typename... Args>
+ static Status SerializationError(Args&&... args) {
+ return Status::FromArgs(StatusCode::SerializationError, std::forward<Args>(args)...);
+ }
+
+ template <typename... Args>
+ static Status RError(Args&&... args) {
+ return Status::FromArgs(StatusCode::RError, std::forward<Args>(args)...);
+ }
+
+ template <typename... Args>
+ static Status CodeGenError(Args&&... args) {
+ return Status::FromArgs(StatusCode::CodeGenError, std::forward<Args>(args)...);
+ }
+
+ template <typename... Args>
+ static Status ExpressionValidationError(Args&&... args) {
+ return Status::FromArgs(StatusCode::ExpressionValidationError,
+ std::forward<Args>(args)...);
+ }
+
+ template <typename... Args>
+ static Status ExecutionError(Args&&... args) {
+ return Status::FromArgs(StatusCode::ExecutionError, std::forward<Args>(args)...);
+ }
+
+ template <typename... Args>
+ static Status AlreadyExists(Args&&... args) {
+ return Status::FromArgs(StatusCode::AlreadyExists, std::forward<Args>(args)...);
+ }
+
+ /// Return true iff the status indicates success.
+ bool ok() const { return (state_ == NULLPTR); }
+
+ /// Return true iff the status indicates an out-of-memory error.
+ bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; }
+ /// Return true iff the status indicates a key lookup error.
+ bool IsKeyError() const { return code() == StatusCode::KeyError; }
+ /// Return true iff the status indicates invalid data.
+ bool IsInvalid() const { return code() == StatusCode::Invalid; }
+ /// Return true iff the status indicates a cancelled operation.
+ bool IsCancelled() const { return code() == StatusCode::Cancelled; }
+ /// Return true iff the status indicates an IO-related failure.
+ bool IsIOError() const { return code() == StatusCode::IOError; }
+ /// Return true iff the status indicates a container reaching capacity limits.
+ bool IsCapacityError() const { return code() == StatusCode::CapacityError; }
+ /// Return true iff the status indicates an out of bounds index.
+ bool IsIndexError() const { return code() == StatusCode::IndexError; }
+ /// Return true iff the status indicates a type error.
+ bool IsTypeError() const { return code() == StatusCode::TypeError; }
+ /// Return true iff the status indicates an unknown error.
+ bool IsUnknownError() const { return code() == StatusCode::UnknownError; }
+ /// Return true iff the status indicates an unimplemented operation.
+ bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; }
+ /// Return true iff the status indicates a (de)serialization failure
+ bool IsSerializationError() const { return code() == StatusCode::SerializationError; }
+ /// Return true iff the status indicates an R-originated error.
+ bool IsRError() const { return code() == StatusCode::RError; }
+
+ bool IsCodeGenError() const { return code() == StatusCode::CodeGenError; }
+
+ bool IsExpressionValidationError() const {
+ return code() == StatusCode::ExpressionValidationError;
+ }
+
+ bool IsExecutionError() const { return code() == StatusCode::ExecutionError; }
+ bool IsAlreadyExists() const { return code() == StatusCode::AlreadyExists; }
+
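+ // Illustrative sketch: the factory helpers above build the message with
+ // util::StringBuilder, so mixed argument types concatenate naturally
+ // (values here are hypothetical):
+ //
+ //   arrow::Status st = arrow::Status::Invalid("expected ", 4, " columns, got ", 5);
+ //   if (st.IsInvalid()) {
+ //     // st.message() == "expected 4 columns, got 5"
+ //   }
+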
+ /// \brief Return a string representation of this status suitable for printing.
+ ///
+ /// The string "OK" is returned for success.
+ std::string ToString() const;
+
+ /// \brief Return a string representation of the status code, without the message
+ /// text or POSIX code information.
+ std::string CodeAsString() const;
+ static std::string CodeAsString(StatusCode);
+
+ /// \brief Return the StatusCode value attached to this status.
+ StatusCode code() const { return ok() ? StatusCode::OK : state_->code; }
+
+ /// \brief Return the specific error message attached to this status.
+ const std::string& message() const {
+ static const std::string no_message = "";
+ return ok() ? no_message : state_->msg;
+ }
+
+ /// \brief Return the status detail attached to this message.
+ const std::shared_ptr<StatusDetail>& detail() const {
+ static std::shared_ptr<StatusDetail> no_detail = NULLPTR;
+ return state_ ? state_->detail : no_detail;
+ }
+
+ /// \brief Return a new Status copying the existing status, but
+ /// updating it with the given detail.
+ Status WithDetail(std::shared_ptr<StatusDetail> new_detail) const {
+ return Status(code(), message(), std::move(new_detail));
+ }
+
+ /// \brief Return a new Status with changed message, copying the
+ /// existing status code and detail.
+ template <typename... Args>
+ Status WithMessage(Args&&... args) const {
+ return FromArgs(code(), std::forward<Args>(args)...).WithDetail(detail());
+ }
+
+ [[noreturn]] void Abort() const;
+ [[noreturn]] void Abort(const std::string& message) const;
+
+#ifdef ARROW_EXTRA_ERROR_CONTEXT
+ void AddContextLine(const char* filename, int line, const char* expr);
+#endif
+
+ private:
+ struct State {
+ StatusCode code;
+ std::string msg;
+ std::shared_ptr<StatusDetail> detail;
+ };
+ // OK status has a `NULL` state_. Otherwise, `state_` points to
+ // a `State` structure containing the error code and message(s)
+ State* state_;
+
+ void DeleteState() {
+ delete state_;
+ state_ = NULLPTR;
+ }
+ void CopyFrom(const Status& s);
+ inline void MoveFrom(Status& s);
+};
+
+void Status::MoveFrom(Status& s) {
+ delete state_;
+ state_ = s.state_;
+ s.state_ = NULLPTR;
+}
+
+Status::Status(const Status& s)
+ : state_((s.state_ == NULLPTR) ? NULLPTR : new State(*s.state_)) {}
+
+Status& Status::operator=(const Status& s) {
+ // The following condition catches both aliasing (when this == &s),
+ // and the common case where both s and *this are ok.
+ if (state_ != s.state_) {
+ CopyFrom(s);
+ }
+ return *this;
+}
+
+Status::Status(Status&& s) noexcept : state_(s.state_) { s.state_ = NULLPTR; }
+
+Status& Status::operator=(Status&& s) noexcept {
+ MoveFrom(s);
+ return *this;
+}
+
+bool Status::Equals(const Status& s) const {
+ if (state_ == s.state_) {
+ return true;
+ }
+
+ if (ok() || s.ok()) {
+ return false;
+ }
+
+ if (detail() != s.detail()) {
+ if ((detail() && !s.detail()) || (!detail() && s.detail())) {
+ return false;
+ }
+ // Details must be equal by value; code and message are also compared below.
+ if (!(*detail() == *s.detail())) {
+ return false;
+ }
+ }
+
+ return code() == s.code() && message() == s.message();
+}
+
+/// \cond FALSE
+// (note: emits warnings on Doxygen < 1.8.15,
+// see https://github.com/doxygen/doxygen/issues/6295)
+Status Status::operator&(const Status& s) const noexcept {
+ if (ok()) {
+ return s;
+ } else {
+ return *this;
+ }
+}
+
+Status Status::operator&(Status&& s) const noexcept {
+ if (ok()) {
+ return std::move(s);
+ } else {
+ return *this;
+ }
+}
+
+Status& Status::operator&=(const Status& s) noexcept {
+ if (ok() && !s.ok()) {
+ CopyFrom(s);
+ }
+ return *this;
+}
+
+Status& Status::operator&=(Status&& s) noexcept {
+ if (ok() && !s.ok()) {
+ MoveFrom(s);
+ }
+ return *this;
+}
+/// \endcond
+
+namespace internal {
+
+// Extract Status from Status or Result<T>
+// Useful for the status check macros such as RETURN_NOT_OK.
+inline const Status& GenericToStatus(const Status& st) { return st; }
+inline Status GenericToStatus(Status&& st) { return std::move(st); }
+
+} // namespace internal
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/stl_allocator.h b/contrib/libs/apache/arrow/cpp/src/arrow/stl_allocator.h
new file mode 100644
index 00000000000..b5ad2b53460
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/stl_allocator.h
@@ -0,0 +1,153 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/memory_pool.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace stl {
+
+/// \brief An STL allocator delegating allocations to an Arrow MemoryPool
+template <class T>
+class allocator {
+ public:
+ using value_type = T;
+ using pointer = T*;
+ using const_pointer = const T*;
+ using reference = T&;
+ using const_reference = const T&;
+ using size_type = std::size_t;
+ using difference_type = std::ptrdiff_t;
+
+ template <class U>
+ struct rebind {
+ using other = allocator<U>;
+ };
+
+ /// \brief Construct an allocator from the default MemoryPool
+ allocator() noexcept : pool_(default_memory_pool()) {}
+ /// \brief Construct an allocator from the given MemoryPool
+ explicit allocator(MemoryPool* pool) noexcept : pool_(pool) {}
+
+ template <class U>
+ allocator(const allocator<U>& rhs) noexcept : pool_(rhs.pool()) {}
+
+ ~allocator() { pool_ = NULLPTR; }
+
+ pointer address(reference r) const noexcept { return std::addressof(r); }
+
+ const_pointer address(const_reference r) const noexcept { return std::addressof(r); }
+
+ pointer allocate(size_type n, const void* /*hint*/ = NULLPTR) {
+ uint8_t* data;
+ Status s = pool_->Allocate(n * sizeof(T), &data);
+ if (!s.ok()) throw std::bad_alloc();
+ return reinterpret_cast<pointer>(data);
+ }
+
+ void deallocate(pointer p, size_type n) {
+ pool_->Free(reinterpret_cast<uint8_t*>(p), n * sizeof(T));
+ }
+
+ size_type max_size() const noexcept { return size_type(-1) / sizeof(T); }
+
+ template <class U, class... Args>
+ void construct(U* p, Args&&... args) {
+ new (reinterpret_cast<void*>(p)) U(std::forward<Args>(args)...);
+ }
+
+ template <class U>
+ void destroy(U* p) {
+ p->~U();
+ }
+
+ MemoryPool* pool() const noexcept { return pool_; }
+
+ private:
+ MemoryPool* pool_;
+};
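+
+// Illustrative sketch: using this allocator with a standard container so the
+// container's memory is drawn from (and accounted by) an Arrow MemoryPool:
+//
+//   arrow::MemoryPool* pool = arrow::default_memory_pool();
+//   arrow::stl::allocator<int64_t> alloc(pool);
+//   std::vector<int64_t, arrow::stl::allocator<int64_t>> values(alloc);
+//   values.resize(1024);  // backing buffer obtained via pool->Allocate()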
+
+/// \brief A MemoryPool implementation delegating allocations to an STL allocator
+///
+/// Note that STL allocators don't provide a resizing operation, and therefore
+/// any buffer resizes will do a full reallocation and copy.
+template <typename Allocator = std::allocator<uint8_t>>
+class STLMemoryPool : public MemoryPool {
+ public:
+ /// \brief Construct a memory pool from the given allocator
+ explicit STLMemoryPool(const Allocator& alloc) : alloc_(alloc) {}
+
+ Status Allocate(int64_t size, uint8_t** out) override {
+ try {
+ *out = alloc_.allocate(size);
+ } catch (std::bad_alloc& e) {
+ return Status::OutOfMemory(e.what());
+ }
+ stats_.UpdateAllocatedBytes(size);
+ return Status::OK();
+ }
+
+ Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override {
+ uint8_t* old_ptr = *ptr;
+ try {
+ *ptr = alloc_.allocate(new_size);
+ } catch (std::bad_alloc& e) {
+ return Status::OutOfMemory(e.what());
+ }
+ memcpy(*ptr, old_ptr, std::min(old_size, new_size));
+ alloc_.deallocate(old_ptr, old_size);
+ stats_.UpdateAllocatedBytes(new_size - old_size);
+ return Status::OK();
+ }
+
+ void Free(uint8_t* buffer, int64_t size) override {
+ alloc_.deallocate(buffer, size);
+ stats_.UpdateAllocatedBytes(-size);
+ }
+
+ int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }
+
+ int64_t max_memory() const override { return stats_.max_memory(); }
+
+ std::string backend_name() const override { return "stl"; }
+
+ private:
+ Allocator alloc_;
+ arrow::internal::MemoryPoolStats stats_;
+};
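+
+// Illustrative sketch: wrapping the default STL allocator as a MemoryPool,
+// e.g. to exercise allocation accounting in tests:
+//
+//   arrow::stl::STLMemoryPool<std::allocator<uint8_t>> pool{std::allocator<uint8_t>()};
+//   uint8_t* data = nullptr;
+//   arrow::Status st = pool.Allocate(64, &data);
+//   if (st.ok()) pool.Free(data, 64);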
+
+template <class T1, class T2>
+bool operator==(const allocator<T1>& lhs, const allocator<T2>& rhs) noexcept {
+ return lhs.pool() == rhs.pool();
+}
+
+template <class T1, class T2>
+bool operator!=(const allocator<T1>& lhs, const allocator<T2>& rhs) noexcept {
+ return !(lhs == rhs);
+}
+
+} // namespace stl
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h b/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h
new file mode 100644
index 00000000000..6225a89aae4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h
@@ -0,0 +1,146 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <iterator>
+#include <utility>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/optional.h"
+
+namespace arrow {
+namespace stl {
+
+namespace detail {
+
+template <typename ArrayType>
+struct DefaultValueAccessor {
+ using ValueType = decltype(std::declval<ArrayType>().GetView(0));
+
+ ValueType operator()(const ArrayType& array, int64_t index) {
+ return array.GetView(index);
+ }
+};
+
+} // namespace detail
+
+template <typename ArrayType,
+ typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ArrayIterator {
+ public:
+ using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+ using difference_type = int64_t;
+ using pointer = value_type*;
+ using reference = value_type&;
+ using iterator_category = std::random_access_iterator_tag;
+
+ // Some algorithms need to default-construct an iterator
+ ArrayIterator() : array_(NULLPTR), index_(0) {}
+
+ explicit ArrayIterator(const ArrayType& array, int64_t index = 0)
+ : array_(&array), index_(index) {}
+
+ // Value access
+ value_type operator*() const {
+ return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
+ }
+
+ value_type operator[](difference_type n) const {
+ return array_->IsNull(index_ + n) ? value_type{} : array_->GetView(index_ + n);
+ }
+
+ int64_t index() const { return index_; }
+
+ // Forward / backward
+ ArrayIterator& operator++() {
+ ++index_;
+ return *this;
+ }
+ ArrayIterator& operator--() {
+ --index_;
+ return *this;
+ }
+ ArrayIterator operator++(int) {
+ ArrayIterator tmp(*this);
+ ++index_;
+ return tmp;
+ }
+ ArrayIterator operator--(int) {
+ ArrayIterator tmp(*this);
+ --index_;
+ return tmp;
+ }
+
+ // Arithmetic
+ difference_type operator-(const ArrayIterator& other) const {
+ return index_ - other.index_;
+ }
+ ArrayIterator operator+(difference_type n) const {
+ return ArrayIterator(*array_, index_ + n);
+ }
+ ArrayIterator operator-(difference_type n) const {
+ return ArrayIterator(*array_, index_ - n);
+ }
+ friend inline ArrayIterator operator+(difference_type diff,
+ const ArrayIterator& other) {
+ return ArrayIterator(*other.array_, diff + other.index_);
+ }
+ friend inline ArrayIterator operator-(difference_type diff,
+ const ArrayIterator& other) {
+ return ArrayIterator(*other.array_, diff - other.index_);
+ }
+ ArrayIterator& operator+=(difference_type n) {
+ index_ += n;
+ return *this;
+ }
+ ArrayIterator& operator-=(difference_type n) {
+ index_ -= n;
+ return *this;
+ }
+
+ // Comparisons
+ bool operator==(const ArrayIterator& other) const { return index_ == other.index_; }
+ bool operator!=(const ArrayIterator& other) const { return index_ != other.index_; }
+ bool operator<(const ArrayIterator& other) const { return index_ < other.index_; }
+ bool operator>(const ArrayIterator& other) const { return index_ > other.index_; }
+ bool operator<=(const ArrayIterator& other) const { return index_ <= other.index_; }
+ bool operator>=(const ArrayIterator& other) const { return index_ >= other.index_; }
+
+ private:
+ const ArrayType* array_;
+ int64_t index_;
+};
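+
+// Illustrative sketch: ArrayIterator models a random-access iterator whose
+// value type is an optional that is empty for null slots, so STL algorithms
+// apply directly (`arr` is a hypothetical arrow::Int64Array):
+//
+//   arrow::stl::ArrayIterator<arrow::Int64Array> begin(arr);
+//   arrow::stl::ArrayIterator<arrow::Int64Array> end(arr, arr.length());
+//   auto null_count = std::count_if(
+//       begin, end, [](const auto& v) { return !v.has_value(); });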
+
+} // namespace stl
+} // namespace arrow
+
+namespace std {
+
+template <typename ArrayType>
+struct iterator_traits<::arrow::stl::ArrayIterator<ArrayType>> {
+ using IteratorType = ::arrow::stl::ArrayIterator<ArrayType>;
+ using difference_type = typename IteratorType::difference_type;
+ using value_type = typename IteratorType::value_type;
+ using pointer = typename IteratorType::pointer;
+ using reference = typename IteratorType::reference;
+ using iterator_category = typename IteratorType::iterator_category;
+};
+
+} // namespace std
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/symbols.map b/contrib/libs/apache/arrow/cpp/src/arrow/symbols.map
new file mode 100644
index 00000000000..7262cc6a898
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/symbols.map
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+{
+ global:
+ extern "C++" {
+ # The leading asterisk is required for symbols such as
+ # "typeinfo for arrow::SomeClass".
+ # Unfortunately this will also catch template specializations
+ # (from e.g. STL or Flatbuffers) involving Arrow types.
+ *arrow::*;
+ *arrow_vendored::*;
+ };
+ # Also export C-level helpers
+ arrow_*;
+ pyarrow_*;
+
+ # Symbols marked as 'local' are not exported by the DSO and thus may not
+ # be used by client applications. Everything except the above falls here.
+ # This ensures we hide symbols of static dependencies.
+ local:
+ *;
+
+};
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/table.cc b/contrib/libs/apache/arrow/cpp/src/arrow/table.cc
new file mode 100644
index 00000000000..d4c7802c834
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/table.cc
@@ -0,0 +1,640 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/table.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <utility>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_binary.h"
+#include "arrow/array/array_nested.h"
+#include "arrow/array/concatenate.h"
+#include "arrow/array/util.h"
+#include "arrow/chunked_array.h"
+#include "arrow/pretty_print.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/vector.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+class KeyValueMetadata;
+class MemoryPool;
+struct ArrayData;
+
+// ----------------------------------------------------------------------
+// Table methods
+
+/// \class SimpleTable
+/// \brief A basic, non-lazy in-memory table, like SimpleRecordBatch
+class SimpleTable : public Table {
+ public:
+ SimpleTable(std::shared_ptr<Schema> schema,
+ std::vector<std::shared_ptr<ChunkedArray>> columns, int64_t num_rows = -1)
+ : columns_(std::move(columns)) {
+ schema_ = std::move(schema);
+ if (num_rows < 0) {
+ if (columns_.size() == 0) {
+ num_rows_ = 0;
+ } else {
+ num_rows_ = columns_[0]->length();
+ }
+ } else {
+ num_rows_ = num_rows;
+ }
+ }
+
+ SimpleTable(std::shared_ptr<Schema> schema,
+ const std::vector<std::shared_ptr<Array>>& columns, int64_t num_rows = -1) {
+ schema_ = std::move(schema);
+ if (num_rows < 0) {
+ if (columns.size() == 0) {
+ num_rows_ = 0;
+ } else {
+ num_rows_ = columns[0]->length();
+ }
+ } else {
+ num_rows_ = num_rows;
+ }
+
+ columns_.resize(columns.size());
+ for (size_t i = 0; i < columns.size(); ++i) {
+ columns_[i] = std::make_shared<ChunkedArray>(columns[i]);
+ }
+ }
+
+ std::shared_ptr<ChunkedArray> column(int i) const override { return columns_[i]; }
+
+ const std::vector<std::shared_ptr<ChunkedArray>>& columns() const override {
+ return columns_;
+ }
+
+ std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const override {
+ auto sliced = columns_;
+ int64_t num_rows = length;
+ for (auto& column : sliced) {
+ column = column->Slice(offset, length);
+ num_rows = column->length();
+ }
+ return Table::Make(schema_, std::move(sliced), num_rows);
+ }
+
+ Result<std::shared_ptr<Table>> RemoveColumn(int i) const override {
+ ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->RemoveField(i));
+
+ return Table::Make(std::move(new_schema), internal::DeleteVectorElement(columns_, i),
+ this->num_rows());
+ }
+
+ Result<std::shared_ptr<Table>> AddColumn(
+ int i, std::shared_ptr<Field> field_arg,
+ std::shared_ptr<ChunkedArray> col) const override {
+ DCHECK(col != nullptr);
+
+ if (col->length() != num_rows_) {
+ return Status::Invalid(
+ "Added column's length must match table's length. Expected length ", num_rows_,
+ " but got length ", col->length());
+ }
+
+ if (!field_arg->type()->Equals(col->type())) {
+ return Status::Invalid("Field type did not match data type");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->AddField(i, field_arg));
+ return Table::Make(std::move(new_schema),
+ internal::AddVectorElement(columns_, i, std::move(col)));
+ }
+
+ Result<std::shared_ptr<Table>> SetColumn(
+ int i, std::shared_ptr<Field> field_arg,
+ std::shared_ptr<ChunkedArray> col) const override {
+ DCHECK(col != nullptr);
+
+ if (col->length() != num_rows_) {
+ return Status::Invalid(
+ "Added column's length must match table's length. Expected length ", num_rows_,
+ " but got length ", col->length());
+ }
+
+ if (!field_arg->type()->Equals(col->type())) {
+ return Status::Invalid("Field type did not match data type");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->SetField(i, field_arg));
+ return Table::Make(std::move(new_schema),
+ internal::ReplaceVectorElement(columns_, i, std::move(col)));
+ }
+
+ std::shared_ptr<Table> ReplaceSchemaMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& metadata) const override {
+ auto new_schema = schema_->WithMetadata(metadata);
+ return Table::Make(std::move(new_schema), columns_);
+ }
+
+ Result<std::shared_ptr<Table>> Flatten(MemoryPool* pool) const override {
+ std::vector<std::shared_ptr<Field>> flattened_fields;
+ std::vector<std::shared_ptr<ChunkedArray>> flattened_columns;
+ for (int i = 0; i < num_columns(); ++i) {
+ std::vector<std::shared_ptr<Field>> new_fields = field(i)->Flatten();
+ ARROW_ASSIGN_OR_RAISE(auto new_columns, column(i)->Flatten(pool));
+ DCHECK_EQ(new_columns.size(), new_fields.size());
+ for (size_t j = 0; j < new_columns.size(); ++j) {
+ flattened_fields.push_back(new_fields[j]);
+ flattened_columns.push_back(new_columns[j]);
+ }
+ }
+ auto flattened_schema =
+ std::make_shared<Schema>(std::move(flattened_fields), schema_->metadata());
+ return Table::Make(std::move(flattened_schema), std::move(flattened_columns));
+ }
+
+ Status Validate() const override {
+ RETURN_NOT_OK(ValidateMeta());
+ for (int i = 0; i < num_columns(); ++i) {
+ const ChunkedArray* col = columns_[i].get();
+ Status st = col->Validate();
+ if (!st.ok()) {
+ std::stringstream ss;
+ ss << "Column " << i << ": " << st.message();
+ return st.WithMessage(ss.str());
+ }
+ }
+ return Status::OK();
+ }
+
+ Status ValidateFull() const override {
+ RETURN_NOT_OK(ValidateMeta());
+ for (int i = 0; i < num_columns(); ++i) {
+ const ChunkedArray* col = columns_[i].get();
+ Status st = col->ValidateFull();
+ if (!st.ok()) {
+ std::stringstream ss;
+ ss << "Column " << i << ": " << st.message();
+ return st.WithMessage(ss.str());
+ }
+ }
+ return Status::OK();
+ }
+
+ protected:
+ Status ValidateMeta() const {
+ // Make sure columns and schema are consistent
+ if (static_cast<int>(columns_.size()) != schema_->num_fields()) {
+ return Status::Invalid("Number of columns did not match schema");
+ }
+ for (int i = 0; i < num_columns(); ++i) {
+ const ChunkedArray* col = columns_[i].get();
+ if (col == nullptr) {
+ return Status::Invalid("Column ", i, " was null");
+ }
+ if (!col->type()->Equals(*schema_->field(i)->type())) {
+ return Status::Invalid("Column data for field ", i, " with type ",
+ col->type()->ToString(), " is inconsistent with schema ",
+ schema_->field(i)->type()->ToString());
+ }
+ }
+
+ // Make sure columns are all the same length, and validate them
+ for (int i = 0; i < num_columns(); ++i) {
+ const ChunkedArray* col = columns_[i].get();
+ if (col->length() != num_rows_) {
+ return Status::Invalid("Column ", i, " named ", field(i)->name(),
+ " expected length ", num_rows_, " but got length ",
+ col->length());
+ }
+ Status st = col->Validate();
+ if (!st.ok()) {
+ std::stringstream ss;
+ ss << "Column " << i << ": " << st.message();
+ return st.WithMessage(ss.str());
+ }
+ }
+ return Status::OK();
+ }
+
+ private:
+ std::vector<std::shared_ptr<ChunkedArray>> columns_;
+};
+
+Table::Table() : num_rows_(0) {}
+
+std::vector<std::shared_ptr<Field>> Table::fields() const {
+ std::vector<std::shared_ptr<Field>> result;
+ for (int i = 0; i < this->num_columns(); ++i) {
+ result.emplace_back(this->field(i));
+ }
+ return result;
+}
+
+std::shared_ptr<Table> Table::Make(std::shared_ptr<Schema> schema,
+ std::vector<std::shared_ptr<ChunkedArray>> columns,
+ int64_t num_rows) {
+ return std::make_shared<SimpleTable>(std::move(schema), std::move(columns), num_rows);
+}
+
+std::shared_ptr<Table> Table::Make(std::shared_ptr<Schema> schema,
+ const std::vector<std::shared_ptr<Array>>& arrays,
+ int64_t num_rows) {
+ return std::make_shared<SimpleTable>(std::move(schema), arrays, num_rows);
+}
+
+Result<std::shared_ptr<Table>> Table::FromRecordBatchReader(RecordBatchReader* reader) {
+ std::shared_ptr<Table> table = nullptr;
+ RETURN_NOT_OK(reader->ReadAll(&table));
+ return table;
+}
+
+Result<std::shared_ptr<Table>> Table::FromRecordBatches(
+ std::shared_ptr<Schema> schema,
+ const std::vector<std::shared_ptr<RecordBatch>>& batches) {
+ const int nbatches = static_cast<int>(batches.size());
+ const int ncolumns = static_cast<int>(schema->num_fields());
+
+ int64_t num_rows = 0;
+ for (int i = 0; i < nbatches; ++i) {
+ if (!batches[i]->schema()->Equals(*schema, false)) {
+ return Status::Invalid("Schema at index ", static_cast<int>(i),
+ " was different: \n", schema->ToString(), "\nvs\n",
+ batches[i]->schema()->ToString());
+ }
+ num_rows += batches[i]->num_rows();
+ }
+
+ std::vector<std::shared_ptr<ChunkedArray>> columns(ncolumns);
+ std::vector<std::shared_ptr<Array>> column_arrays(nbatches);
+
+ for (int i = 0; i < ncolumns; ++i) {
+ for (int j = 0; j < nbatches; ++j) {
+ column_arrays[j] = batches[j]->column(i);
+ }
+ columns[i] = std::make_shared<ChunkedArray>(column_arrays, schema->field(i)->type());
+ }
+
+ return Table::Make(std::move(schema), std::move(columns), num_rows);
+}
+
+Result<std::shared_ptr<Table>> Table::FromRecordBatches(
+ const std::vector<std::shared_ptr<RecordBatch>>& batches) {
+ if (batches.size() == 0) {
+ return Status::Invalid("Must pass at least one record batch or an explicit Schema");
+ }
+
+ return FromRecordBatches(batches[0]->schema(), batches);
+}
+
+Result<std::shared_ptr<Table>> Table::FromChunkedStructArray(
+ const std::shared_ptr<ChunkedArray>& array) {
+ auto type = array->type();
+ if (type->id() != Type::STRUCT) {
+ return Status::Invalid("Expected a chunked struct array, got ", *type);
+ }
+ int num_columns = type->num_fields();
+ int num_chunks = array->num_chunks();
+
+ const auto& struct_chunks = array->chunks();
+ std::vector<std::shared_ptr<ChunkedArray>> columns(num_columns);
+ for (int i = 0; i < num_columns; ++i) {
+ ArrayVector chunks(num_chunks);
+ std::transform(struct_chunks.begin(), struct_chunks.end(), chunks.begin(),
+ [i](const std::shared_ptr<Array>& struct_chunk) {
+ return static_cast<const StructArray&>(*struct_chunk).field(i);
+ });
+ columns[i] = std::make_shared<ChunkedArray>(std::move(chunks));
+ }
+
+ return Table::Make(::arrow::schema(type->fields()), std::move(columns),
+ array->length());
+}
+
+std::vector<std::string> Table::ColumnNames() const {
+ std::vector<std::string> names(num_columns());
+ for (int i = 0; i < num_columns(); ++i) {
+ names[i] = field(i)->name();
+ }
+ return names;
+}
+
+Result<std::shared_ptr<Table>> Table::RenameColumns(
+ const std::vector<std::string>& names) const {
+ if (names.size() != static_cast<size_t>(num_columns())) {
+ return Status::Invalid("tried to rename a table of ", num_columns(),
+ " columns but only ", names.size(), " names were provided");
+ }
+ std::vector<std::shared_ptr<ChunkedArray>> columns(num_columns());
+ std::vector<std::shared_ptr<Field>> fields(num_columns());
+ for (int i = 0; i < num_columns(); ++i) {
+ columns[i] = column(i);
+ fields[i] = field(i)->WithName(names[i]);
+ }
+ return Table::Make(::arrow::schema(std::move(fields)), std::move(columns), num_rows());
+}
+
+Result<std::shared_ptr<Table>> Table::SelectColumns(
+ const std::vector<int>& indices) const {
+ int n = static_cast<int>(indices.size());
+
+ std::vector<std::shared_ptr<ChunkedArray>> columns(n);
+ std::vector<std::shared_ptr<Field>> fields(n);
+ for (int i = 0; i < n; i++) {
+ int pos = indices[i];
+ if (pos < 0 || pos > num_columns() - 1) {
+ return Status::Invalid("Invalid column index ", pos, " to select columns.");
+ }
+ columns[i] = column(pos);
+ fields[i] = field(pos);
+ }
+
+ auto new_schema =
+ std::make_shared<arrow::Schema>(std::move(fields), schema()->metadata());
+ return Table::Make(std::move(new_schema), std::move(columns), num_rows());
+}
+
+std::string Table::ToString() const {
+ std::stringstream ss;
+ ARROW_CHECK_OK(PrettyPrint(*this, 0, &ss));
+ return ss.str();
+}
+
+Result<std::shared_ptr<Table>> ConcatenateTables(
+ const std::vector<std::shared_ptr<Table>>& tables,
+ const ConcatenateTablesOptions options, MemoryPool* memory_pool) {
+ if (tables.size() == 0) {
+ return Status::Invalid("Must pass at least one table");
+ }
+
+ std::vector<std::shared_ptr<Table>> promoted_tables;
+ const std::vector<std::shared_ptr<Table>>* tables_to_concat = &tables;
+ if (options.unify_schemas) {
+ std::vector<std::shared_ptr<Schema>> schemas;
+ schemas.reserve(tables.size());
+ for (const auto& t : tables) {
+ schemas.push_back(t->schema());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Schema> unified_schema,
+ UnifySchemas(schemas, options.field_merge_options));
+
+ promoted_tables.reserve(tables.size());
+ for (const auto& t : tables) {
+ promoted_tables.emplace_back();
+ ARROW_ASSIGN_OR_RAISE(promoted_tables.back(),
+ PromoteTableToSchema(t, unified_schema, memory_pool));
+ }
+ tables_to_concat = &promoted_tables;
+ } else {
+ auto first_schema = tables[0]->schema();
+ for (size_t i = 1; i < tables.size(); ++i) {
+ if (!tables[i]->schema()->Equals(*first_schema, false)) {
+ return Status::Invalid("Schema at index ", i, " was different: \n",
+ first_schema->ToString(), "\nvs\n",
+ tables[i]->schema()->ToString());
+ }
+ }
+ }
+
+ std::shared_ptr<Schema> schema = tables_to_concat->front()->schema();
+
+ const int ncolumns = schema->num_fields();
+
+ std::vector<std::shared_ptr<ChunkedArray>> columns(ncolumns);
+ for (int i = 0; i < ncolumns; ++i) {
+ std::vector<std::shared_ptr<Array>> column_arrays;
+ for (const auto& table : *tables_to_concat) {
+ const std::vector<std::shared_ptr<Array>>& chunks = table->column(i)->chunks();
+ for (const auto& chunk : chunks) {
+ column_arrays.push_back(chunk);
+ }
+ }
+ columns[i] = std::make_shared<ChunkedArray>(column_arrays, schema->field(i)->type());
+ }
+ return Table::Make(std::move(schema), std::move(columns));
+}
+
+Result<std::shared_ptr<Table>> PromoteTableToSchema(const std::shared_ptr<Table>& table,
+ const std::shared_ptr<Schema>& schema,
+ MemoryPool* pool) {
+ const std::shared_ptr<Schema> current_schema = table->schema();
+ if (current_schema->Equals(*schema, /*check_metadata=*/false)) {
+ return table->ReplaceSchemaMetadata(schema->metadata());
+ }
+
+ // fields_seen[i] == true iff that field is also in `schema`.
+ std::vector<bool> fields_seen(current_schema->num_fields(), false);
+
+ std::vector<std::shared_ptr<ChunkedArray>> columns;
+ columns.reserve(schema->num_fields());
+ const int64_t num_rows = table->num_rows();
+ auto AppendColumnOfNulls = [pool, &columns,
+ num_rows](const std::shared_ptr<DataType>& type) {
+ // TODO(bkietz): share the zero-filled buffers as much as possible across
+ // the null-filled arrays created here.
+ ARROW_ASSIGN_OR_RAISE(auto array_of_nulls, MakeArrayOfNull(type, num_rows, pool));
+ columns.push_back(std::make_shared<ChunkedArray>(array_of_nulls));
+ return Status::OK();
+ };
+
+ for (const auto& field : schema->fields()) {
+ const std::vector<int> field_indices =
+ current_schema->GetAllFieldIndices(field->name());
+ if (field_indices.empty()) {
+ RETURN_NOT_OK(AppendColumnOfNulls(field->type()));
+ continue;
+ }
+
+ if (field_indices.size() > 1) {
+ return Status::Invalid(
+ "PromoteTableToSchema cannot handle schemas with duplicate fields: ",
+ field->name());
+ }
+
+ const int field_index = field_indices[0];
+ const auto& current_field = current_schema->field(field_index);
+ if (!field->nullable() && current_field->nullable()) {
+ return Status::Invalid("Unable to promote field ", current_field->name(),
+ ": it was nullable but the target schema was not.");
+ }
+
+ fields_seen[field_index] = true;
+ if (current_field->type()->Equals(field->type())) {
+ columns.push_back(table->column(field_index));
+ continue;
+ }
+
+ if (current_field->type()->id() == Type::NA) {
+ RETURN_NOT_OK(AppendColumnOfNulls(field->type()));
+ continue;
+ }
+
+ return Status::Invalid("Unable to promote field ", field->name(),
+ ": incompatible types: ", field->type()->ToString(), " vs ",
+ current_field->type()->ToString());
+ }
+
+ auto unseen_field_iter = std::find(fields_seen.begin(), fields_seen.end(), false);
+ if (unseen_field_iter != fields_seen.end()) {
+ const size_t unseen_field_index = unseen_field_iter - fields_seen.begin();
+ return Status::Invalid(
+ "Incompatible schemas: field ",
+ current_schema->field(static_cast<int>(unseen_field_index))->name(),
+ " did not exist in the new schema.");
+ }
+
+ return Table::Make(schema, std::move(columns));
+}
+
+bool Table::Equals(const Table& other, bool check_metadata) const {
+ if (this == &other) {
+ return true;
+ }
+ if (!schema_->Equals(*other.schema(), check_metadata)) {
+ return false;
+ }
+ if (this->num_columns() != other.num_columns()) {
+ return false;
+ }
+
+ for (int i = 0; i < this->num_columns(); i++) {
+ if (!this->column(i)->Equals(other.column(i))) {
+ return false;
+ }
+ }
+ return true;
+}
+
+Result<std::shared_ptr<Table>> Table::CombineChunks(MemoryPool* pool) const {
+ const int ncolumns = num_columns();
+ std::vector<std::shared_ptr<ChunkedArray>> compacted_columns(ncolumns);
+ for (int i = 0; i < ncolumns; ++i) {
+ const auto& col = column(i);
+ if (col->num_chunks() <= 1) {
+ compacted_columns[i] = col;
+ continue;
+ }
+
+ if (is_binary_like(col->type()->id())) {
+ // ARROW-5744 Allow binary columns to be combined into multiple chunks to avoid
+ // buffer overflow
+ ArrayVector chunks;
+ int chunk_i = 0;
+ while (chunk_i < col->num_chunks()) {
+ ArrayVector safe_chunks;
+ int64_t data_length = 0;
+ for (; chunk_i < col->num_chunks(); ++chunk_i) {
+ const auto& chunk = col->chunk(chunk_i);
+ data_length += checked_cast<const BinaryArray&>(*chunk).total_values_length();
+ if (data_length >= kBinaryMemoryLimit) {
+ break;
+ }
+ safe_chunks.push_back(chunk);
+ }
+ chunks.emplace_back();
+ ARROW_ASSIGN_OR_RAISE(chunks.back(), Concatenate(safe_chunks, pool));
+ }
+ compacted_columns[i] = std::make_shared<ChunkedArray>(std::move(chunks));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto compacted, Concatenate(col->chunks(), pool));
+ compacted_columns[i] = std::make_shared<ChunkedArray>(compacted);
+ }
+ }
+ return Table::Make(schema(), std::move(compacted_columns), num_rows_);
+}
+
+// ----------------------------------------------------------------------
+// Convert a table to a sequence of record batches
+
+TableBatchReader::TableBatchReader(const Table& table)
+ : table_(table),
+ column_data_(table.num_columns()),
+ chunk_numbers_(table.num_columns(), 0),
+ chunk_offsets_(table.num_columns(), 0),
+ absolute_row_position_(0),
+ max_chunksize_(std::numeric_limits<int64_t>::max()) {
+ for (int i = 0; i < table.num_columns(); ++i) {
+ column_data_[i] = table.column(i).get();
+ }
+}
+
+std::shared_ptr<Schema> TableBatchReader::schema() const { return table_.schema(); }
+
+void TableBatchReader::set_chunksize(int64_t chunksize) { max_chunksize_ = chunksize; }
+
+Status TableBatchReader::ReadNext(std::shared_ptr<RecordBatch>* out) {
+ if (absolute_row_position_ == table_.num_rows()) {
+ *out = nullptr;
+ return Status::OK();
+ }
+
+ // Determine the minimum contiguous slice across all columns
+ int64_t chunksize = std::min(table_.num_rows(), max_chunksize_);
+ std::vector<const Array*> chunks(table_.num_columns());
+ for (int i = 0; i < table_.num_columns(); ++i) {
+ auto chunk = column_data_[i]->chunk(chunk_numbers_[i]).get();
+ int64_t chunk_remaining = chunk->length() - chunk_offsets_[i];
+
+ if (chunk_remaining < chunksize) {
+ chunksize = chunk_remaining;
+ }
+
+ chunks[i] = chunk;
+ }
+
+ // Slice chunks and advance chunk index as appropriate
+ std::vector<std::shared_ptr<ArrayData>> batch_data(table_.num_columns());
+
+ for (int i = 0; i < table_.num_columns(); ++i) {
+ // Exhausted chunk
+ const Array* chunk = chunks[i];
+ const int64_t offset = chunk_offsets_[i];
+ std::shared_ptr<ArrayData> slice_data;
+ if ((chunk->length() - offset) == chunksize) {
+ ++chunk_numbers_[i];
+ chunk_offsets_[i] = 0;
+ if (offset > 0) {
+ // Need to slice
+ slice_data = chunk->Slice(offset, chunksize)->data();
+ } else {
+ // No slice
+ slice_data = chunk->data();
+ }
+ } else {
+ chunk_offsets_[i] += chunksize;
+ slice_data = chunk->Slice(offset, chunksize)->data();
+ }
+ batch_data[i] = std::move(slice_data);
+ }
+
+ absolute_row_position_ += chunksize;
+ *out = RecordBatch::Make(table_.schema(), chunksize, std::move(batch_data));
+
+ return Status::OK();
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/table.h b/contrib/libs/apache/arrow/cpp/src/arrow/table.h
new file mode 100644
index 00000000000..f1e5f23eed8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/table.h
@@ -0,0 +1,295 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/chunked_array.h" // IWYU pragma: keep
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class KeyValueMetadata;
+class MemoryPool;
+
+/// \class Table
+/// \brief Logical table as sequence of chunked arrays
+class ARROW_EXPORT Table {
+ public:
+ virtual ~Table() = default;
+
+ /// \brief Construct a Table from schema and columns
+ ///
+ /// If columns is zero-length, the table's number of rows is zero
+ ///
+ /// \param[in] schema The table schema (column types)
+ /// \param[in] columns The table's columns as chunked arrays
+ /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
+ static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
+ std::vector<std::shared_ptr<ChunkedArray>> columns,
+ int64_t num_rows = -1);
+
+ /// \brief Construct a Table from schema and arrays
+ ///
+ /// \param[in] schema The table schema (column types)
+ /// \param[in] arrays The table's columns as arrays
+ /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
+ static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
+ const std::vector<std::shared_ptr<Array>>& arrays,
+ int64_t num_rows = -1);
+
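+ // Illustrative sketch: building a one-column table, where `array` stands
+ // for a previously created std::shared_ptr<arrow::Array> of int64 type:
+ //
+ //   auto schema = arrow::schema({arrow::field("x", arrow::int64())});
+ //   std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {array});
+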
+ /// \brief Construct a Table from a RecordBatchReader.
+ ///
+ /// \param[in] reader the RecordBatchReader from which to read the batches
+ static Result<std::shared_ptr<Table>> FromRecordBatchReader(RecordBatchReader* reader);
+
+ /// \brief Construct a Table from RecordBatches, using schema supplied by the first
+ /// RecordBatch.
+ ///
+ /// \param[in] batches a std::vector of record batches
+ static Result<std::shared_ptr<Table>> FromRecordBatches(
+ const std::vector<std::shared_ptr<RecordBatch>>& batches);
+
+ /// \brief Construct a Table from RecordBatches, using the supplied schema.
+ /// The vector of batches may be empty.
+ ///
+ /// \param[in] schema the arrow::Schema for each batch
+ /// \param[in] batches a std::vector of record batches
+ static Result<std::shared_ptr<Table>> FromRecordBatches(
+ std::shared_ptr<Schema> schema,
+ const std::vector<std::shared_ptr<RecordBatch>>& batches);
+
+ /// \brief Construct a Table from a chunked StructArray. One column will be produced
+ /// for each field of the StructArray.
+ ///
+ /// \param[in] array a chunked StructArray
+ static Result<std::shared_ptr<Table>> FromChunkedStructArray(
+ const std::shared_ptr<ChunkedArray>& array);
+
+ /// \brief Return the table schema
+ std::shared_ptr<Schema> schema() const { return schema_; }
+
+ /// \brief Return a column by index
+ virtual std::shared_ptr<ChunkedArray> column(int i) const = 0;
+
+ /// \brief Return vector of all columns for table
+ virtual const std::vector<std::shared_ptr<ChunkedArray>>& columns() const = 0;
+
+ /// Return a column's field by index
+ std::shared_ptr<Field> field(int i) const { return schema_->field(i); }
+
+ /// \brief Return vector of all fields for table
+ std::vector<std::shared_ptr<Field>> fields() const;
+
+ /// \brief Construct a zero-copy slice of the table with the
+ /// indicated offset and length
+ ///
+ /// \param[in] offset the index of the first row in the constructed
+ /// slice
+ /// \param[in] length the number of rows of the slice. If there are not enough
+ /// rows in the table, the length will be adjusted accordingly
+ ///
+ /// \return a new object wrapped in std::shared_ptr<Table>
+ virtual std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const = 0;
+
+ /// \brief Slice from first row at offset until end of the table
+ std::shared_ptr<Table> Slice(int64_t offset) const { return Slice(offset, num_rows_); }
+
+ /// \brief Return a column by name
+ /// \param[in] name field name
+ /// \return a ChunkedArray or null if no field was found
+ std::shared_ptr<ChunkedArray> GetColumnByName(const std::string& name) const {
+ auto i = schema_->GetFieldIndex(name);
+ return i == -1 ? NULLPTR : column(i);
+ }
+
+ /// \brief Remove column from the table, producing a new Table
+ virtual Result<std::shared_ptr<Table>> RemoveColumn(int i) const = 0;
+
+ /// \brief Add column to the table, producing a new Table
+ virtual Result<std::shared_ptr<Table>> AddColumn(
+ int i, std::shared_ptr<Field> field_arg,
+ std::shared_ptr<ChunkedArray> column) const = 0;
+
+ /// \brief Replace a column in the table, producing a new Table
+ virtual Result<std::shared_ptr<Table>> SetColumn(
+ int i, std::shared_ptr<Field> field_arg,
+ std::shared_ptr<ChunkedArray> column) const = 0;
+
+ /// \brief Return names of all columns
+ std::vector<std::string> ColumnNames() const;
+
+ /// \brief Rename columns with provided names
+ Result<std::shared_ptr<Table>> RenameColumns(
+ const std::vector<std::string>& names) const;
+
+ /// \brief Return new table with specified columns
+ Result<std::shared_ptr<Table>> SelectColumns(const std::vector<int>& indices) const;
+
+ /// \brief Replace schema key-value metadata with new metadata
+ /// \since 0.5.0
+ ///
+ /// \param[in] metadata new KeyValueMetadata
+ /// \return new Table
+ virtual std::shared_ptr<Table> ReplaceSchemaMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
+
+ /// \brief Flatten the table, producing a new Table. Any column with a
+ /// struct type will be flattened into multiple columns
+ ///
+ /// \param[in] pool The pool for buffer allocations, if any
+ virtual Result<std::shared_ptr<Table>> Flatten(
+ MemoryPool* pool = default_memory_pool()) const = 0;
+
+ /// \return PrettyPrint representation suitable for debugging
+ std::string ToString() const;
+
+ /// \brief Perform cheap validation checks to determine obvious inconsistencies
+ /// within the table's schema and internal data.
+ ///
+ /// This is O(k*m) where k is the total number of field descendants,
+ /// and m is the number of chunks.
+ ///
+ /// \return Status
+ virtual Status Validate() const = 0;
+
+ /// \brief Perform extensive validation checks to determine inconsistencies
+ /// within the table's schema and internal data.
+ ///
+ /// This is O(k*n) where k is the total number of field descendants,
+ /// and n is the number of rows.
+ ///
+ /// \return Status
+ virtual Status ValidateFull() const = 0;
+
+ /// \brief Return the number of columns in the table
+ int num_columns() const { return schema_->num_fields(); }
+
+ /// \brief Return the number of rows (equal to each column's logical length)
+ int64_t num_rows() const { return num_rows_; }
+
+ /// \brief Determine if tables are equal
+ ///
+ /// Two tables can be equal only if they have equal schemas.
+ /// However, they may be equal even if they have different chunkings.
+ bool Equals(const Table& other, bool check_metadata = false) const;
+
+ /// \brief Make a new table by combining the chunks this table has.
+ ///
+ /// All the underlying chunks in the ChunkedArray of each column are
+ /// concatenated into zero or one chunk.
+ ///
+ /// \param[in] pool The pool for buffer allocations
+ Result<std::shared_ptr<Table>> CombineChunks(
+ MemoryPool* pool = default_memory_pool()) const;
+
+ protected:
+ Table();
+
+ std::shared_ptr<Schema> schema_;
+ int64_t num_rows_;
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Table);
+};
+
+/// \brief Compute a stream of record batches from a (possibly chunked) Table
+///
+/// The conversion is zero-copy: each record batch is a view over a slice
+/// of the table's columns.
+class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
+ public:
+ /// \brief Construct a TableBatchReader for the given table
+ explicit TableBatchReader(const Table& table);
+
+ std::shared_ptr<Schema> schema() const override;
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* out) override;
+
+ /// \brief Set the desired maximum chunk size of record batches
+ ///
+ /// The actual chunk size of each record batch may be smaller, depending
+ /// on actual chunking characteristics of each table column.
+ void set_chunksize(int64_t chunksize);
+
+ private:
+ const Table& table_;
+ std::vector<ChunkedArray*> column_data_;
+ std::vector<int> chunk_numbers_;
+ std::vector<int64_t> chunk_offsets_;
+ int64_t absolute_row_position_;
+ int64_t max_chunksize_;
+};
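
A minimal sketch (an editorial aside, not part of the diff) of driving the reader declared above; it assumes a `table` constructed elsewhere:

arrow::Status StreamTable(const arrow::Table& table) {
  arrow::TableBatchReader reader(table);
  reader.set_chunksize(64 * 1024);  // upper bound; actual batches may be smaller
  std::shared_ptr<arrow::RecordBatch> batch;
  while (true) {
    ARROW_RETURN_NOT_OK(reader.ReadNext(&batch));
    if (batch == nullptr) break;  // end of stream
    // ... process the batch; it is a zero-copy view over the table ...
  }
  return arrow::Status::OK();
}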
+
+/// \defgroup concat-tables ConcatenateTables function.
+///
+/// ConcatenateTables function.
+/// @{
+
+/// \brief Controls the behavior of ConcatenateTables().
+struct ARROW_EXPORT ConcatenateTablesOptions {
+ /// If true, the schemas of the tables will be first unified with fields of
+ /// the same name being merged, according to `field_merge_options`, then each
+ /// table will be promoted to the unified schema before being concatenated.
+ /// Otherwise, all tables should have the same schema. Each column in the output table
+ /// is the result of concatenating the corresponding columns in all input tables.
+ bool unify_schemas = false;
+
+ Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults();
+
+ static ConcatenateTablesOptions Defaults() { return {}; }
+};
+
+/// \brief Construct table from multiple input tables.
+ARROW_EXPORT
+Result<std::shared_ptr<Table>> ConcatenateTables(
+ const std::vector<std::shared_ptr<Table>>& tables,
+ ConcatenateTablesOptions options = ConcatenateTablesOptions::Defaults(),
+ MemoryPool* memory_pool = default_memory_pool());
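
A minimal sketch (editorial, not part of the diff) of the options above; it assumes two tables `a` and `b` whose schemas share field names but may differ in field order or nullability:

arrow::Result<std::shared_ptr<arrow::Table>> ConcatUnified(
    const std::shared_ptr<arrow::Table>& a,
    const std::shared_ptr<arrow::Table>& b) {
  arrow::ConcatenateTablesOptions options;
  options.unify_schemas = true;  // merge fields by name before concatenating
  return arrow::ConcatenateTables({a, b}, options);
}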
+
+/// \brief Promotes a table to conform to the given schema.
+///
+/// If a field in the schema does not have a corresponding column in the
+/// table, a column of nulls will be added to the resulting table.
+/// If the corresponding column is of type Null, it will be promoted to
+/// the type specified by schema, with null values filled.
+/// Returns an error:
+/// - if the corresponding column's type is not compatible with the
+/// schema.
+/// - if there is a column in the table that does not exist in the schema.
+///
+/// \param[in] table the input Table
+/// \param[in] schema the target schema to promote to
+/// \param[in] pool The memory pool to be used if null-filled arrays need to
+/// be created.
+ARROW_EXPORT
+Result<std::shared_ptr<Table>> PromoteTableToSchema(
+ const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema,
+ MemoryPool* pool = default_memory_pool());
+
+} // namespace arrow
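
An editorial sketch of the promotion entry point above, assuming a `table` and a wider `target_schema` obtained elsewhere:

arrow::Result<std::shared_ptr<arrow::Table>> WidenTable(
    const std::shared_ptr<arrow::Table>& table,
    const std::shared_ptr<arrow::Schema>& target_schema) {
  // Missing columns become null-filled; Null-typed columns are promoted.
  ARROW_ASSIGN_OR_RAISE(auto promoted,
                        arrow::PromoteTableToSchema(table, target_schema));
  // Optionally normalize the chunk layout afterwards.
  return promoted->CombineChunks();
}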
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc b/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc
new file mode 100644
index 00000000000..c026c355758
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/table_builder.h"
+
+#include <memory>
+#include <utility>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// RecordBatchBuilder
+
+RecordBatchBuilder::RecordBatchBuilder(const std::shared_ptr<Schema>& schema,
+ MemoryPool* pool, int64_t initial_capacity)
+ : schema_(schema), initial_capacity_(initial_capacity), pool_(pool) {}
+
+Status RecordBatchBuilder::Make(const std::shared_ptr<Schema>& schema, MemoryPool* pool,
+ std::unique_ptr<RecordBatchBuilder>* builder) {
+ return Make(schema, pool, kMinBuilderCapacity, builder);
+}
+
+Status RecordBatchBuilder::Make(const std::shared_ptr<Schema>& schema, MemoryPool* pool,
+ int64_t initial_capacity,
+ std::unique_ptr<RecordBatchBuilder>* builder) {
+ builder->reset(new RecordBatchBuilder(schema, pool, initial_capacity));
+ RETURN_NOT_OK((*builder)->CreateBuilders());
+ return (*builder)->InitBuilders();
+}
+
+Status RecordBatchBuilder::Flush(bool reset_builders,
+ std::shared_ptr<RecordBatch>* batch) {
+ std::vector<std::shared_ptr<Array>> fields;
+ fields.resize(this->num_fields());
+
+ int64_t length = 0;
+ for (int i = 0; i < this->num_fields(); ++i) {
+ RETURN_NOT_OK(raw_field_builders_[i]->Finish(&fields[i]));
+ if (i > 0 && fields[i]->length() != length) {
+ return Status::Invalid("All fields must be same length when calling Flush");
+ }
+ length = fields[i]->length();
+ }
+
+ // For certain types like dictionaries, the exact type may not be fully
+ // determined until the builders have been flushed. Make sure the
+ // RecordBatch gets the correct types in its schema.
+ // See: ARROW-9969
+ std::vector<std::shared_ptr<Field>> schema_fields(schema_->fields());
+ for (int i = 0; i < this->num_fields(); ++i) {
+ if (!schema_fields[i]->type()->Equals(fields[i]->type())) {
+ schema_fields[i] = schema_fields[i]->WithType(fields[i]->type());
+ }
+ }
+ std::shared_ptr<Schema> schema =
+ std::make_shared<Schema>(std::move(schema_fields), schema_->metadata());
+
+ *batch = RecordBatch::Make(std::move(schema), length, std::move(fields));
+ if (reset_builders) {
+ return InitBuilders();
+ } else {
+ return Status::OK();
+ }
+}
+
+Status RecordBatchBuilder::Flush(std::shared_ptr<RecordBatch>* batch) {
+ return Flush(true, batch);
+}
+
+void RecordBatchBuilder::SetInitialCapacity(int64_t capacity) {
+ ARROW_CHECK_GT(capacity, 0) << "Initial capacity must be positive";
+ initial_capacity_ = capacity;
+}
+
+Status RecordBatchBuilder::CreateBuilders() {
+ field_builders_.resize(this->num_fields());
+ raw_field_builders_.resize(this->num_fields());
+ for (int i = 0; i < this->num_fields(); ++i) {
+ RETURN_NOT_OK(MakeBuilder(pool_, schema_->field(i)->type(), &field_builders_[i]));
+ raw_field_builders_[i] = field_builders_[i].get();
+ }
+ return Status::OK();
+}
+
+Status RecordBatchBuilder::InitBuilders() {
+ for (int i = 0; i < this->num_fields(); ++i) {
+ RETURN_NOT_OK(raw_field_builders_[i]->Reserve(initial_capacity_));
+ }
+ return Status::OK();
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.h b/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.h
new file mode 100644
index 00000000000..db130d38950
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.h
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+class RecordBatch;
+
+/// \class RecordBatchBuilder
+/// \brief Helper class for creating record batches iteratively given a known
+/// schema
+class ARROW_EXPORT RecordBatchBuilder {
+ public:
+ /// \brief Create and initialize a RecordBatchBuilder
+ /// \param[in] schema The schema for the record batch
+ /// \param[in] pool A MemoryPool to use for allocations
+ /// \param[out] builder the created builder instance
+ static Status Make(const std::shared_ptr<Schema>& schema, MemoryPool* pool,
+ std::unique_ptr<RecordBatchBuilder>* builder);
+
+ /// \brief Create and initialize a RecordBatchBuilder with an initial capacity
+ /// \param[in] schema The schema for the record batch
+ /// \param[in] pool A MemoryPool to use for allocations
+ /// \param[in] initial_capacity The initial capacity for the builders
+ /// \param[out] builder the created builder instance
+ static Status Make(const std::shared_ptr<Schema>& schema, MemoryPool* pool,
+ int64_t initial_capacity,
+ std::unique_ptr<RecordBatchBuilder>* builder);
+
+ /// \brief Get base pointer to field builder
+ /// \param i the field index
+ /// \return pointer to ArrayBuilder
+ ArrayBuilder* GetField(int i) { return raw_field_builders_[i]; }
+
+ /// \brief Return field builder casted to indicated specific builder type
+ /// \param i the field index
+ /// \return pointer to template type
+ template <typename T>
+ T* GetFieldAs(int i) {
+ return internal::checked_cast<T*>(raw_field_builders_[i]);
+ }
+
+ /// \brief Finish current batch and optionally reset
+ /// \param[in] reset_builders whether to reset the builders to accept the next batch
+ /// \param[out] batch the resulting RecordBatch
+ /// \return Status
+ Status Flush(bool reset_builders, std::shared_ptr<RecordBatch>* batch);
+
+ /// \brief Finish current batch and reset
+ /// \param[out] batch the resulting RecordBatch
+ /// \return Status
+ Status Flush(std::shared_ptr<RecordBatch>* batch);
+
+ /// \brief Set the initial capacity for new builders
+ void SetInitialCapacity(int64_t capacity);
+
+ /// \brief The initial capacity for builders
+ int64_t initial_capacity() const { return initial_capacity_; }
+
+ /// \brief The number of fields in the schema
+ int num_fields() const { return schema_->num_fields(); }
+
+ /// \brief The schema of the record batches to be built
+ std::shared_ptr<Schema> schema() const { return schema_; }
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatchBuilder);
+
+ RecordBatchBuilder(const std::shared_ptr<Schema>& schema, MemoryPool* pool,
+ int64_t initial_capacity);
+
+ Status CreateBuilders();
+ Status InitBuilders();
+
+ std::shared_ptr<Schema> schema_;
+ int64_t initial_capacity_;
+ MemoryPool* pool_;
+
+ std::vector<std::unique_ptr<ArrayBuilder>> field_builders_;
+ std::vector<ArrayBuilder*> raw_field_builders_;
+};
+
+} // namespace arrow
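
An editorial sketch of the intended call pattern, assuming a two-column schema of int64 and utf8 (builder classes such as `arrow::Int64Builder` and `arrow::StringBuilder` come from the builder headers this library already ships):

arrow::Status BuildOneBatch(std::shared_ptr<arrow::RecordBatch>* out) {
  auto schema = arrow::schema({arrow::field("id", arrow::int64()),
                               arrow::field("name", arrow::utf8())});
  std::unique_ptr<arrow::RecordBatchBuilder> builder;
  ARROW_RETURN_NOT_OK(arrow::RecordBatchBuilder::Make(
      schema, arrow::default_memory_pool(), &builder));

  auto* ids = builder->GetFieldAs<arrow::Int64Builder>(0);
  auto* names = builder->GetFieldAs<arrow::StringBuilder>(1);
  ARROW_RETURN_NOT_OK(ids->Append(1));
  ARROW_RETURN_NOT_OK(names->Append("a"));

  return builder->Flush(out);  // finishes the arrays and resets the builders
}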
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc
new file mode 100644
index 00000000000..d591bacff02
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc
@@ -0,0 +1,342 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/tensor.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace internal {
+
+Status ComputeRowMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides) {
+ const int byte_width = GetByteWidth(type);
+ const size_t ndim = shape.size();
+
+ int64_t remaining = 0;
+ if (!shape.empty() && shape.front() > 0) {
+ remaining = byte_width;
+ for (size_t i = 1; i < ndim; ++i) {
+ if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) {
+ return Status::Invalid(
+ "Row-major strides computed from shape would not fit in 64-bit integer");
+ }
+ }
+ }
+
+ if (remaining == 0) {
+ strides->assign(shape.size(), byte_width);
+ return Status::OK();
+ }
+
+ strides->push_back(remaining);
+ for (size_t i = 1; i < ndim; ++i) {
+ remaining /= shape[i];
+ strides->push_back(remaining);
+ }
+
+ return Status::OK();
+}
+
+Status ComputeColumnMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides) {
+ const int byte_width = internal::GetByteWidth(type);
+ const size_t ndim = shape.size();
+
+ int64_t total = 0;
+ if (!shape.empty() && shape.back() > 0) {
+ total = byte_width;
+ for (size_t i = 0; i < ndim - 1; ++i) {
+ if (internal::MultiplyWithOverflow(total, shape[i], &total)) {
+ return Status::Invalid(
+ "Column-major strides computed from shape would not fit in 64-bit "
+ "integer");
+ }
+ }
+ }
+
+ if (total == 0) {
+ strides->assign(shape.size(), byte_width);
+ return Status::OK();
+ }
+
+ total = byte_width;
+ for (size_t i = 0; i < ndim - 1; ++i) {
+ strides->push_back(total);
+ total *= shape[i];
+ }
+ strides->push_back(total);
+
+ return Status::OK();
+}
+
+} // namespace internal
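
A worked example (editorial) of the two computations above: for an int32 tensor of shape {2, 3, 4}, the row-major strides are {48, 16, 4} bytes and the column-major strides are {4, 8, 24} bytes. A sketch of the call:

std::vector<int64_t> strides;
ARROW_CHECK_OK(arrow::internal::ComputeRowMajorStrides(
    arrow::internal::checked_cast<const arrow::FixedWidthType&>(*arrow::int32()),
    {2, 3, 4}, &strides));
// strides == {3 * 4 * 4, 4 * 4, 4} == {48, 16, 4}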
+
+namespace {
+
+inline bool IsTensorStridesRowMajor(const std::shared_ptr<DataType>& type,
+ const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides) {
+ std::vector<int64_t> c_strides;
+ const auto& fw_type = checked_cast<const FixedWidthType&>(*type);
+ if (internal::ComputeRowMajorStrides(fw_type, shape, &c_strides).ok()) {
+ return strides == c_strides;
+ } else {
+ return false;
+ }
+}
+
+inline bool IsTensorStridesColumnMajor(const std::shared_ptr<DataType>& type,
+ const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides) {
+ std::vector<int64_t> f_strides;
+ const auto& fw_type = checked_cast<const FixedWidthType&>(*type);
+ if (internal::ComputeColumnMajorStrides(fw_type, shape, &f_strides).ok()) {
+ return strides == f_strides;
+ } else {
+ return false;
+ }
+}
+
+inline Status CheckTensorValidity(const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape) {
+ if (!type) {
+ return Status::Invalid("Null type is supplied");
+ }
+ if (!is_tensor_supported(type->id())) {
+ return Status::Invalid(type->ToString(), " is not a valid data type for a tensor");
+ }
+ if (!data) {
+ return Status::Invalid("Null data is supplied");
+ }
+ if (!std::all_of(shape.begin(), shape.end(), [](int64_t x) { return x >= 0; })) {
+ return Status::Invalid("Shape elements must be positive");
+ }
+ return Status::OK();
+}
+
+Status CheckTensorStridesValidity(const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides,
+ const std::shared_ptr<DataType>& type) {
+ if (strides.size() != shape.size()) {
+ return Status::Invalid("strides must have the same length as shape");
+ }
+ if (data->size() == 0 && std::find(shape.begin(), shape.end(), 0) != shape.end()) {
+ return Status::OK();
+ }
+
+ // Check the largest offset can be computed without overflow
+ const size_t ndim = shape.size();
+ int64_t largest_offset = 0;
+ for (size_t i = 0; i < ndim; ++i) {
+ if (shape[i] == 0) continue;
+ if (strides[i] < 0) {
+ // TODO(mrkn): Support negative strides for sharing views
+ return Status::Invalid("negative strides not supported");
+ }
+
+ int64_t dim_offset;
+ if (!internal::MultiplyWithOverflow(shape[i] - 1, strides[i], &dim_offset)) {
+ if (!internal::AddWithOverflow(largest_offset, dim_offset, &largest_offset)) {
+ continue;
+ }
+ }
+
+ return Status::Invalid(
+ "offsets computed from shape and strides would not fit in 64-bit integer");
+ }
+
+ const int byte_width = internal::GetByteWidth(*type);
+ if (largest_offset > data->size() - byte_width) {
+ return Status::Invalid("strides must not involve buffer over run");
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+namespace internal {
+
+bool IsTensorStridesContiguous(const std::shared_ptr<DataType>& type,
+ const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides) {
+ return IsTensorStridesRowMajor(type, shape, strides) ||
+ IsTensorStridesColumnMajor(type, shape, strides);
+}
+
+Status ValidateTensorParameters(const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides,
+ const std::vector<std::string>& dim_names) {
+ RETURN_NOT_OK(CheckTensorValidity(type, data, shape));
+ if (!strides.empty()) {
+ RETURN_NOT_OK(CheckTensorStridesValidity(data, shape, strides, type));
+ } else {
+ std::vector<int64_t> tmp_strides;
+ RETURN_NOT_OK(ComputeRowMajorStrides(checked_cast<const FixedWidthType&>(*type),
+ shape, &tmp_strides));
+ }
+ if (dim_names.size() > shape.size()) {
+ return Status::Invalid("too many dim_names are supplied");
+ }
+ return Status::OK();
+}
+
+} // namespace internal
+
+/// Constructor with strides and dimension names
+Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape, const std::vector<int64_t>& strides,
+ const std::vector<std::string>& dim_names)
+ : type_(type), data_(data), shape_(shape), strides_(strides), dim_names_(dim_names) {
+ ARROW_CHECK(is_tensor_supported(type->id()));
+ if (shape.size() > 0 && strides.size() == 0) {
+ ARROW_CHECK_OK(internal::ComputeRowMajorStrides(
+ checked_cast<const FixedWidthType&>(*type_), shape, &strides_));
+ }
+}
+
+Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape, const std::vector<int64_t>& strides)
+ : Tensor(type, data, shape, strides, {}) {}
+
+Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape)
+ : Tensor(type, data, shape, {}, {}) {}
+
+const std::string& Tensor::dim_name(int i) const {
+ static const std::string kEmpty = "";
+ if (dim_names_.size() == 0) {
+ return kEmpty;
+ } else {
+ ARROW_CHECK_LT(i, static_cast<int>(dim_names_.size()));
+ return dim_names_[i];
+ }
+}
+
+int64_t Tensor::size() const {
+ return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies<int64_t>());
+}
+
+bool Tensor::is_contiguous() const {
+ return internal::IsTensorStridesContiguous(type_, shape_, strides_);
+}
+
+bool Tensor::is_row_major() const {
+ return IsTensorStridesRowMajor(type_, shape_, strides_);
+}
+
+bool Tensor::is_column_major() const {
+ return IsTensorStridesColumnMajor(type_, shape_, strides_);
+}
+
+Type::type Tensor::type_id() const { return type_->id(); }
+
+bool Tensor::Equals(const Tensor& other, const EqualOptions& opts) const {
+ return TensorEquals(*this, other, opts);
+}
+
+namespace {
+
+template <typename TYPE>
+int64_t StridedTensorCountNonZero(int dim_index, int64_t offset, const Tensor& tensor) {
+ using c_type = typename TYPE::c_type;
+ c_type const zero = c_type(0);
+ int64_t nnz = 0;
+ if (dim_index == tensor.ndim() - 1) {
+ for (int64_t i = 0; i < tensor.shape()[dim_index]; ++i) {
+ auto const* ptr = tensor.raw_data() + offset + i * tensor.strides()[dim_index];
+ auto& elem = *reinterpret_cast<c_type const*>(ptr);
+ if (elem != zero) ++nnz;
+ }
+ return nnz;
+ }
+ for (int64_t i = 0; i < tensor.shape()[dim_index]; ++i) {
+ nnz += StridedTensorCountNonZero<TYPE>(dim_index + 1, offset, tensor);
+ offset += tensor.strides()[dim_index];
+ }
+ return nnz;
+}
+
+template <typename TYPE>
+int64_t ContiguousTensorCountNonZero(const Tensor& tensor) {
+ using c_type = typename TYPE::c_type;
+ auto* data = reinterpret_cast<c_type const*>(tensor.raw_data());
+ return std::count_if(data, data + tensor.size(),
+ [](c_type const& x) { return x != 0; });
+}
+
+template <typename TYPE>
+inline int64_t TensorCountNonZero(const Tensor& tensor) {
+ if (tensor.is_contiguous()) {
+ return ContiguousTensorCountNonZero<TYPE>(tensor);
+ } else {
+ return StridedTensorCountNonZero<TYPE>(0, 0, tensor);
+ }
+}
+
+struct NonZeroCounter {
+ explicit NonZeroCounter(const Tensor& tensor) : tensor_(tensor) {}
+
+ template <typename TYPE>
+ enable_if_number<TYPE, Status> Visit(const TYPE& type) {
+ result = TensorCountNonZero<TYPE>(tensor_);
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& type) {
+ ARROW_CHECK(!is_tensor_supported(type.id()));
+ return Status::NotImplemented("Tensor of ", type.ToString(), " is not implemented");
+ }
+
+ const Tensor& tensor_;
+ int64_t result;
+};
+
+} // namespace
+
+Result<int64_t> Tensor::CountNonZero() const {
+ NonZeroCounter counter(*this);
+ RETURN_NOT_OK(VisitTypeInline(*type(), &counter));
+ return counter.result;
+}
+
+} // namespace arrow
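
An editorial sketch of the non-zero counting entry point; it assumes `buffer` holds six contiguous doubles:

arrow::Result<int64_t> CountNonZeros(const std::shared_ptr<arrow::Buffer>& buffer) {
  ARROW_ASSIGN_OR_RAISE(auto tensor,
                        arrow::Tensor::Make(arrow::float64(), buffer, {2, 3}));
  // Contiguous tensors take the std::count_if fast path above; strided
  // tensors fall back to the recursive walk.
  return tensor->CountNonZero();
}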
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h
new file mode 100644
index 00000000000..91e9ad26066
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+static inline bool is_tensor_supported(Type::type type_id) {
+ switch (type_id) {
+ case Type::UINT8:
+ case Type::INT8:
+ case Type::UINT16:
+ case Type::INT16:
+ case Type::UINT32:
+ case Type::INT32:
+ case Type::UINT64:
+ case Type::INT64:
+ case Type::HALF_FLOAT:
+ case Type::FLOAT:
+ case Type::DOUBLE:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+namespace internal {
+
+ARROW_EXPORT
+Status ComputeRowMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides);
+
+ARROW_EXPORT
+Status ComputeColumnMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides);
+
+ARROW_EXPORT
+bool IsTensorStridesContiguous(const std::shared_ptr<DataType>& type,
+ const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides);
+
+ARROW_EXPORT
+Status ValidateTensorParameters(const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides,
+ const std::vector<std::string>& dim_names);
+
+} // namespace internal
+
+class ARROW_EXPORT Tensor {
+ public:
+ /// \brief Create a Tensor with full parameters
+ ///
+ /// This factory function will return Status::Invalid when the parameters are
+ /// inconsistent
+ ///
+ /// \param[in] type The data type of the tensor values
+ /// \param[in] data The buffer of the tensor content
+ /// \param[in] shape The shape of the tensor
+ /// \param[in] strides The strides of the tensor
+ /// (if this is empty, the data is assumed to be row-major)
+ /// \param[in] dim_names The names of the tensor dimensions
+ static inline Result<std::shared_ptr<Tensor>> Make(
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape, const std::vector<int64_t>& strides = {},
+ const std::vector<std::string>& dim_names = {}) {
+ ARROW_RETURN_NOT_OK(
+ internal::ValidateTensorParameters(type, data, shape, strides, dim_names));
+ return std::make_shared<Tensor>(type, data, shape, strides, dim_names);
+ }
+
+ virtual ~Tensor() = default;
+
+ /// Constructor with no dimension names or strides, data assumed to be row-major
+ Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape);
+
+ /// Constructor with non-negative strides
+ Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape, const std::vector<int64_t>& strides);
+
+ /// Constructor with non-negative strides and dimension names
+ Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+ const std::vector<int64_t>& shape, const std::vector<int64_t>& strides,
+ const std::vector<std::string>& dim_names);
+
+ std::shared_ptr<DataType> type() const { return type_; }
+ std::shared_ptr<Buffer> data() const { return data_; }
+
+ const uint8_t* raw_data() const { return data_->data(); }
+ uint8_t* raw_mutable_data() { return data_->mutable_data(); }
+
+ const std::vector<int64_t>& shape() const { return shape_; }
+ const std::vector<int64_t>& strides() const { return strides_; }
+
+ int ndim() const { return static_cast<int>(shape_.size()); }
+
+ const std::vector<std::string>& dim_names() const { return dim_names_; }
+ const std::string& dim_name(int i) const;
+
+ /// Total number of value cells in the tensor
+ int64_t size() const;
+
+ /// Return true if the underlying data buffer is mutable
+ bool is_mutable() const { return data_->is_mutable(); }
+
+ /// Either row major or column major
+ bool is_contiguous() const;
+
+ /// AKA "C order"
+ bool is_row_major() const;
+
+ /// AKA "Fortran order"
+ bool is_column_major() const;
+
+ Type::type type_id() const;
+
+ bool Equals(const Tensor& other, const EqualOptions& = EqualOptions::Defaults()) const;
+
+ /// Compute the number of non-zero values in the tensor
+ Result<int64_t> CountNonZero() const;
+
+ /// Compute the number of non-zero values in the tensor
+ ARROW_DEPRECATED("Use Result-returning version")
+ Status CountNonZero(int64_t* result) const { return CountNonZero().Value(result); }
+
+ /// Return the byte offset of the given index computed with the given strides
+ static int64_t CalculateValueOffset(const std::vector<int64_t>& strides,
+ const std::vector<int64_t>& index) {
+ const int64_t n = static_cast<int64_t>(index.size());
+ int64_t offset = 0;
+ for (int64_t i = 0; i < n; ++i) {
+ offset += index[i] * strides[i];
+ }
+ return offset;
+ }
+
+ int64_t CalculateValueOffset(const std::vector<int64_t>& index) const {
+ return Tensor::CalculateValueOffset(strides_, index);
+ }
+
+ /// Returns the value at the given index without data-type and bounds checks
+ template <typename ValueType>
+ const typename ValueType::c_type& Value(const std::vector<int64_t>& index) const {
+ using c_type = typename ValueType::c_type;
+ const int64_t offset = CalculateValueOffset(index);
+ const c_type* ptr = reinterpret_cast<const c_type*>(raw_data() + offset);
+ return *ptr;
+ }
+
+ Status Validate() const {
+ return internal::ValidateTensorParameters(type_, data_, shape_, strides_, dim_names_);
+ }
+
+ protected:
+ Tensor() {}
+
+ std::shared_ptr<DataType> type_;
+ std::shared_ptr<Buffer> data_;
+ std::vector<int64_t> shape_;
+ std::vector<int64_t> strides_;
+
+ /// These names are optional
+ std::vector<std::string> dim_names_;
+
+ template <typename SparseIndexType>
+ friend class SparseTensorImpl;
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Tensor);
+};
+
+template <typename TYPE>
+class NumericTensor : public Tensor {
+ public:
+ using TypeClass = TYPE;
+ using value_type = typename TypeClass::c_type;
+
+ /// \brief Create a NumericTensor with full parameters
+ ///
+ /// This factory function will return Status::Invalid when the parameters are
+ /// inconsistent
+ ///
+ /// \param[in] data The buffer of the tensor content
+ /// \param[in] shape The shape of the tensor
+ /// \param[in] strides The strides of the tensor
+ /// (if this is empty, the data is assumed to be row-major)
+ /// \param[in] dim_names The names of the tensor dimensions
+ static Result<std::shared_ptr<NumericTensor<TYPE>>> Make(
+ const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides = {},
+ const std::vector<std::string>& dim_names = {}) {
+ ARROW_RETURN_NOT_OK(internal::ValidateTensorParameters(
+ TypeTraits<TYPE>::type_singleton(), data, shape, strides, dim_names));
+ return std::make_shared<NumericTensor<TYPE>>(data, shape, strides, dim_names);
+ }
+
+ /// Constructor with non-negative strides and dimension names
+ NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides,
+ const std::vector<std::string>& dim_names)
+ : Tensor(TypeTraits<TYPE>::type_singleton(), data, shape, strides, dim_names) {}
+
+ /// Constructor with no dimension names or strides, data assumed to be row-major
+ NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape)
+ : NumericTensor(data, shape, {}, {}) {}
+
+ /// Constructor with non-negative strides
+ NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& strides)
+ : NumericTensor(data, shape, strides, {}) {}
+
+ const value_type& Value(const std::vector<int64_t>& index) const {
+ return Tensor::Value<TypeClass>(index);
+ }
+};
+
+} // namespace arrow
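
An editorial sketch of the typed accessor path, assuming `data` holds four int64 values:

arrow::Status ReadCell(const std::shared_ptr<arrow::Buffer>& data) {
  ARROW_ASSIGN_OR_RAISE(
      auto t, arrow::NumericTensor<arrow::Int64Type>::Make(data, {2, 2}));
  // Row-major by default, so {1, 0} is the first cell of the second row.
  int64_t v = t->Value({1, 0});
  (void)v;
  return arrow::Status::OK();
}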
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/converter.h b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/converter.h
new file mode 100644
index 00000000000..408ab22305f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/converter.h
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/sparse_tensor.h" // IWYU pragma: export
+
+#include <memory>
+
+namespace arrow {
+namespace internal {
+
+struct SparseTensorConverterMixin {
+ static bool IsNonZero(const uint8_t val) { return val != 0; }
+
+ static void AssignIndex(uint8_t* indices, int64_t val, const int elsize);
+
+ static int64_t GetIndexValue(const uint8_t* value_ptr, const int elsize);
+};
+
+Status MakeSparseCOOTensorFromTensor(const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool,
+ std::shared_ptr<SparseIndex>* out_sparse_index,
+ std::shared_ptr<Buffer>* out_data);
+
+Status MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis axis,
+ const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool,
+ std::shared_ptr<SparseIndex>* out_sparse_index,
+ std::shared_ptr<Buffer>* out_data);
+
+Status MakeSparseCSFTensorFromTensor(const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool,
+ std::shared_ptr<SparseIndex>* out_sparse_index,
+ std::shared_ptr<Buffer>* out_data);
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCOOTensor(
+ MemoryPool* pool, const SparseCOOTensor* sparse_tensor);
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSRMatrix(
+ MemoryPool* pool, const SparseCSRMatrix* sparse_tensor);
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSCMatrix(
+ MemoryPool* pool, const SparseCSCMatrix* sparse_tensor);
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSFTensor(
+ MemoryPool* pool, const SparseCSFTensor* sparse_tensor);
+
+} // namespace internal
+} // namespace arrow
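
An editorial sketch of the COO entry point declared above; `dense` is an assumed `arrow::Tensor` from elsewhere, and callers normally reach this code through the public sparse-tensor factories rather than directly:

arrow::Status ToCoo(const arrow::Tensor& dense) {
  std::shared_ptr<arrow::SparseIndex> sparse_index;
  std::shared_ptr<arrow::Buffer> sparse_data;
  ARROW_RETURN_NOT_OK(arrow::internal::MakeSparseCOOTensorFromTensor(
      dense, arrow::int32(), arrow::default_memory_pool(),
      &sparse_index, &sparse_data));
  // sparse_index describes the nonzero coordinates; sparse_data their values.
  return arrow::Status::OK();
}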
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/converter_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/converter_internal.h
new file mode 100644
index 00000000000..3a87feaf4b3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/converter_internal.h
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/tensor/converter.h"
+
+#define DISPATCH(ACTION, index_elsize, value_elsize, ...) \
+ switch (index_elsize) { \
+ case 1: \
+ switch (value_elsize) { \
+ case 1: \
+ ACTION(uint8_t, uint8_t, __VA_ARGS__); \
+ break; \
+ case 2: \
+ ACTION(uint8_t, uint16_t, __VA_ARGS__); \
+ break; \
+ case 4: \
+ ACTION(uint8_t, uint32_t, __VA_ARGS__); \
+ break; \
+ case 8: \
+ ACTION(uint8_t, uint64_t, __VA_ARGS__); \
+ break; \
+ } \
+ break; \
+ case 2: \
+ switch (value_elsize) { \
+ case 1: \
+ ACTION(uint16_t, uint8_t, __VA_ARGS__); \
+ break; \
+ case 2: \
+ ACTION(uint16_t, uint16_t, __VA_ARGS__); \
+ break; \
+ case 4: \
+ ACTION(uint16_t, uint32_t, __VA_ARGS__); \
+ break; \
+ case 8: \
+ ACTION(uint16_t, uint64_t, __VA_ARGS__); \
+ break; \
+ } \
+ break; \
+ case 4: \
+ switch (value_elsize) { \
+ case 1: \
+ ACTION(uint32_t, uint8_t, __VA_ARGS__); \
+ break; \
+ case 2: \
+ ACTION(uint32_t, uint16_t, __VA_ARGS__); \
+ break; \
+ case 4: \
+ ACTION(uint32_t, uint32_t, __VA_ARGS__); \
+ break; \
+ case 8: \
+ ACTION(uint32_t, uint64_t, __VA_ARGS__); \
+ break; \
+ } \
+ break; \
+ case 8: \
+ switch (value_elsize) { \
+ case 1: \
+ ACTION(int64_t, uint8_t, __VA_ARGS__); \
+ break; \
+ case 2: \
+ ACTION(int64_t, uint16_t, __VA_ARGS__); \
+ break; \
+ case 4: \
+ ACTION(int64_t, uint32_t, __VA_ARGS__); \
+ break; \
+ case 8: \
+ ACTION(int64_t, uint64_t, __VA_ARGS__); \
+ break; \
+ } \
+ break; \
+ }
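
Editorial note: DISPATCH maps the runtime pair (index_elsize, value_elsize) onto a template instantiation. A hypothetical ACTION macro to illustrate the contract:

// ACTION receives the two element types followed by the forwarded arguments.
#define COUNT_ACTION(index_type, value_type, out_count) \
  *(out_count) = CountEntries<index_type, value_type>()  // hypothetical helper
// DISPATCH(COUNT_ACTION, index_elsize, value_elsize, &count);
// For index_elsize == 4 and value_elsize == 8 this expands to
//   COUNT_ACTION(uint32_t, uint64_t, &count);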
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc
new file mode 100644
index 00000000000..2124d0a4e4b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc
@@ -0,0 +1,333 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/tensor/converter_internal.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace internal {
+namespace {
+
+template <typename c_index_type>
+inline void IncrementRowMajorIndex(std::vector<c_index_type>& coord,
+ const std::vector<int64_t>& shape) {
+ const int64_t ndim = shape.size();
+ ++coord[ndim - 1];
+ if (coord[ndim - 1] == shape[ndim - 1]) {
+ int64_t d = ndim - 1;
+ while (d > 0 && coord[d] == shape[d]) {
+ coord[d] = 0;
+ ++coord[d - 1];
+ --d;
+ }
+ }
+}
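
// Editorial annotation: with shape {2, 3} the function above steps through
// coordinates like an odometer:
//   {0,0} -> {0,1} -> {0,2} -> {1,0} -> {1,1} -> {1,2} -> {2,0}
// The leading digit is allowed to run past the shape; callers bound their
// loop by tensor.size(), so the final out-of-range state is never used.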
+
+template <typename c_index_type, typename c_value_type>
+void ConvertRowMajorTensor(const Tensor& tensor, c_index_type* indices,
+ c_value_type* values, const int64_t size) {
+ const auto ndim = tensor.ndim();
+ const auto& shape = tensor.shape();
+ const c_value_type* tensor_data =
+ reinterpret_cast<const c_value_type*>(tensor.raw_data());
+
+ constexpr c_value_type zero = 0;
+ std::vector<c_index_type> coord(ndim, 0);
+ for (int64_t n = tensor.size(); n > 0; --n) {
+ const c_value_type x = *tensor_data;
+ if (ARROW_PREDICT_FALSE(x != zero)) {
+ std::copy(coord.begin(), coord.end(), indices);
+ *values++ = x;
+ indices += ndim;
+ }
+
+ IncrementRowMajorIndex(coord, shape);
+ ++tensor_data;
+ }
+}
+
+template <typename c_index_type, typename c_value_type>
+void ConvertColumnMajorTensor(const Tensor& tensor, c_index_type* out_indices,
+ c_value_type* out_values, const int64_t size) {
+ const auto ndim = tensor.ndim();
+ std::vector<c_index_type> indices(ndim * size);
+ std::vector<c_value_type> values(size);
+ ConvertRowMajorTensor(tensor, indices.data(), values.data(), size);
+
+ // transpose indices
+ for (int64_t i = 0; i < size; ++i) {
+ for (int j = 0; j < ndim / 2; ++j) {
+ std::swap(indices[i * ndim + j], indices[i * ndim + ndim - j - 1]);
+ }
+ }
+
+ // sort indices
+ std::vector<int64_t> order(size);
+ std::iota(order.begin(), order.end(), 0);
+ std::sort(order.begin(), order.end(), [&](const int64_t xi, const int64_t yi) {
+ const int64_t x_offset = xi * ndim;
+ const int64_t y_offset = yi * ndim;
+ for (int j = 0; j < ndim; ++j) {
+ const auto x = indices[x_offset + j];
+ const auto y = indices[y_offset + j];
+ if (x < y) return true;
+ if (x > y) return false;
+ }
+ return false;
+ });
+
+ // transfer result
+ const auto* indices_data = indices.data();
+ for (int64_t i = 0; i < size; ++i) {
+ out_values[i] = values[i];
+
+ std::copy_n(indices_data, ndim, out_indices);
+ indices_data += ndim;
+ out_indices += ndim;
+ }
+}
+
+template <typename c_index_type, typename c_value_type>
+void ConvertStridedTensor(const Tensor& tensor, c_index_type* indices,
+ c_value_type* values, const int64_t size) {
+ using ValueType = typename CTypeTraits<c_value_type>::ArrowType;
+ const auto& shape = tensor.shape();
+ const auto ndim = tensor.ndim();
+ std::vector<int64_t> coord(ndim, 0);
+
+ constexpr c_value_type zero = 0;
+ c_value_type x;
+ int64_t i;
+ for (int64_t n = tensor.size(); n > 0; --n) {
+ x = tensor.Value<ValueType>(coord);
+ if (ARROW_PREDICT_FALSE(x != zero)) {
+ *values++ = x;
+ for (i = 0; i < ndim; ++i) {
+ *indices++ = static_cast<c_index_type>(coord[i]);
+ }
+ }
+
+ IncrementRowMajorIndex(coord, shape);
+ }
+}
+
+#define CONVERT_TENSOR(func, index_type, value_type, indices, values, size) \
+ func<index_type, value_type>(tensor_, reinterpret_cast<index_type*>(indices), \
+ reinterpret_cast<value_type*>(values), size)
+
+// Using ARROW_EXPAND is necessary to expand __VA_ARGS__ correctly on VC++.
+#define CONVERT_ROW_MAJOR_TENSOR(index_type, value_type, ...) \
+ ARROW_EXPAND(CONVERT_TENSOR(ConvertRowMajorTensor, index_type, value_type, __VA_ARGS__))
+
+#define CONVERT_COLUMN_MAJOR_TENSOR(index_type, value_type, ...) \
+ ARROW_EXPAND( \
+ CONVERT_TENSOR(ConvertColumnMajorTensor, index_type, value_type, __VA_ARGS__))
+
+#define CONVERT_STRIDED_TENSOR(index_type, value_type, ...) \
+ ARROW_EXPAND(CONVERT_TENSOR(ConvertStridedTensor, index_type, value_type, __VA_ARGS__))
+
+// ----------------------------------------------------------------------
+// SparseTensorConverter for SparseCOOIndex
+
+class SparseCOOTensorConverter : private SparseTensorConverterMixin {
+ using SparseTensorConverterMixin::AssignIndex;
+ using SparseTensorConverterMixin::IsNonZero;
+
+ public:
+ SparseCOOTensorConverter(const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool)
+ : tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
+
+ Status Convert() {
+ RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
+ tensor_.shape()));
+
+ const int index_elsize = GetByteWidth(*index_value_type_);
+ const int value_elsize = GetByteWidth(*tensor_.type());
+
+ const int64_t ndim = tensor_.ndim();
+ ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
+
+ ARROW_ASSIGN_OR_RAISE(auto indices_buffer,
+ AllocateBuffer(index_elsize * ndim * nonzero_count, pool_));
+ uint8_t* indices = indices_buffer->mutable_data();
+
+ ARROW_ASSIGN_OR_RAISE(auto values_buffer,
+ AllocateBuffer(value_elsize * nonzero_count, pool_));
+ uint8_t* values = values_buffer->mutable_data();
+
+ const uint8_t* tensor_data = tensor_.raw_data();
+ if (ndim <= 1) {
+ const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0];
+ for (int64_t i = 0; i < count; ++i) {
+ if (std::any_of(tensor_data, tensor_data + value_elsize, IsNonZero)) {
+ AssignIndex(indices, i, index_elsize);
+ std::copy_n(tensor_data, value_elsize, values);
+
+ indices += index_elsize;
+ values += value_elsize;
+ }
+ tensor_data += value_elsize;
+ }
+ } else if (tensor_.is_row_major()) {
+ DISPATCH(CONVERT_ROW_MAJOR_TENSOR, index_elsize, value_elsize, indices, values,
+ nonzero_count);
+ } else if (tensor_.is_column_major()) {
+ DISPATCH(CONVERT_COLUMN_MAJOR_TENSOR, index_elsize, value_elsize, indices, values,
+ nonzero_count);
+ } else {
+ DISPATCH(CONVERT_STRIDED_TENSOR, index_elsize, value_elsize, indices, values,
+ nonzero_count);
+ }
+
+ // make results
+ const std::vector<int64_t> indices_shape = {nonzero_count, ndim};
+ std::vector<int64_t> indices_strides;
+ RETURN_NOT_OK(internal::ComputeRowMajorStrides(
+ checked_cast<const FixedWidthType&>(*index_value_type_), indices_shape,
+ &indices_strides));
+ auto coords = std::make_shared<Tensor>(index_value_type_, std::move(indices_buffer),
+ indices_shape, indices_strides);
+ ARROW_ASSIGN_OR_RAISE(sparse_index, SparseCOOIndex::Make(coords, true));
+ data = std::move(values_buffer);
+
+ return Status::OK();
+ }
+
+ std::shared_ptr<SparseCOOIndex> sparse_index;
+ std::shared_ptr<Buffer> data;
+
+ private:
+ const Tensor& tensor_;
+ const std::shared_ptr<DataType>& index_value_type_;
+ MemoryPool* pool_;
+};
+
+} // namespace
+
+void SparseTensorConverterMixin::AssignIndex(uint8_t* indices, int64_t val,
+ const int elsize) {
+ switch (elsize) {
+ case 1:
+ *indices = static_cast<uint8_t>(val);
+ break;
+ case 2:
+ *reinterpret_cast<uint16_t*>(indices) = static_cast<uint16_t>(val);
+ break;
+ case 4:
+ *reinterpret_cast<uint32_t*>(indices) = static_cast<uint32_t>(val);
+ break;
+ case 8:
+ *reinterpret_cast<int64_t*>(indices) = val;
+ break;
+ default:
+ break;
+ }
+}
+
+int64_t SparseTensorConverterMixin::GetIndexValue(const uint8_t* value_ptr,
+ const int elsize) {
+ switch (elsize) {
+ case 1:
+ return *value_ptr;
+
+ case 2:
+ return *reinterpret_cast<const uint16_t*>(value_ptr);
+
+ case 4:
+ return *reinterpret_cast<const uint32_t*>(value_ptr);
+
+ case 8:
+ return *reinterpret_cast<const int64_t*>(value_ptr);
+
+ default:
+ return 0;
+ }
+}
+
+Status MakeSparseCOOTensorFromTensor(const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool,
+ std::shared_ptr<SparseIndex>* out_sparse_index,
+ std::shared_ptr<Buffer>* out_data) {
+ SparseCOOTensorConverter converter(tensor, index_value_type, pool);
+ RETURN_NOT_OK(converter.Convert());
+
+ *out_sparse_index = checked_pointer_cast<SparseIndex>(converter.sparse_index);
+ *out_data = converter.data;
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCOOTensor(
+ MemoryPool* pool, const SparseCOOTensor* sparse_tensor) {
+ const auto& sparse_index =
+ checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
+ const auto& coords = sparse_index.indices();
+ const auto* coords_data = coords->raw_data();
+
+ const int index_elsize = GetByteWidth(*coords->type());
+
+ const auto& value_type = checked_cast<const FixedWidthType&>(*sparse_tensor->type());
+ const int value_elsize = GetByteWidth(value_type);
+ ARROW_ASSIGN_OR_RAISE(auto values_buffer,
+ AllocateBuffer(value_elsize * sparse_tensor->size(), pool));
+ auto values = values_buffer->mutable_data();
+ std::fill_n(values, value_elsize * sparse_tensor->size(), 0);
+
+ std::vector<int64_t> strides;
+ RETURN_NOT_OK(ComputeRowMajorStrides(value_type, sparse_tensor->shape(), &strides));
+
+ const auto* raw_data = sparse_tensor->raw_data();
+ const int ndim = sparse_tensor->ndim();
+
+ for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) {
+ int64_t offset = 0;
+
+ for (int j = 0; j < ndim; ++j) {
+ auto index = static_cast<int64_t>(
+ SparseTensorConverterMixin::GetIndexValue(coords_data, index_elsize));
+ offset += index * strides[j];
+ coords_data += index_elsize;
+ }
+
+ std::copy_n(raw_data, value_elsize, values + offset);
+ raw_data += value_elsize;
+ }
+
+ return std::make_shared<Tensor>(sparse_tensor->type(), std::move(values_buffer),
+ sparse_tensor->shape(), strides,
+ sparse_tensor->dim_names());
+}
+
+} // namespace internal
+} // namespace arrow
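
// Editorial annotation: converting the dense row-major 2x3 tensor
//   [[0, 5, 0],
//    [7, 0, 9]]
// yields nonzero_count = 3, a 3x2 row-major coordinate tensor
//   {{0, 1}, {1, 0}, {1, 2}}
// and values {5, 7, 9}. MakeTensorFromSparseCOOTensor above is the exact
// inverse: each value is scattered back to offset sum(index[j] * strides[j]).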
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc
new file mode 100644
index 00000000000..77a71d8a12e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc
@@ -0,0 +1,289 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/tensor/converter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/sort.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace internal {
+namespace {
+
+inline void IncrementIndex(std::vector<int64_t>& coord, const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& axis_order) {
+ const int64_t ndim = shape.size();
+ const int64_t last_axis = axis_order[ndim - 1];
+ ++coord[last_axis];
+ if (coord[last_axis] == shape[last_axis]) {
+ int64_t d = ndim - 1;
+ while (d > 0 && coord[axis_order[d]] == shape[axis_order[d]]) {
+ coord[axis_order[d]] = 0;
+ ++coord[axis_order[d - 1]];
+ --d;
+ }
+ }
+}
+
+// ----------------------------------------------------------------------
+// SparseTensorConverter for SparseCSFIndex
+
+class SparseCSFTensorConverter : private SparseTensorConverterMixin {
+ using SparseTensorConverterMixin::AssignIndex;
+ using SparseTensorConverterMixin::IsNonZero;
+
+ public:
+ SparseCSFTensorConverter(const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool)
+ : tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
+
+ Status Convert() {
+ RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
+ tensor_.shape()));
+
+ const int index_elsize = GetByteWidth(*index_value_type_);
+ const int value_elsize = GetByteWidth(*tensor_.type());
+
+ const int64_t ndim = tensor_.ndim();
+ // Sorting the axes in ascending order of dimension size is a good
+ // heuristic, but not necessarily optimal.
+ std::vector<int64_t> axis_order = internal::ArgSort(tensor_.shape());
+ ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
+
+ ARROW_ASSIGN_OR_RAISE(auto values_buffer,
+ AllocateBuffer(value_elsize * nonzero_count, pool_));
+ auto* values = values_buffer->mutable_data();
+
+ std::vector<int64_t> counts(ndim, 0);
+ std::vector<int64_t> coord(ndim, 0);
+ std::vector<int64_t> previous_coord(ndim, -1);
+ std::vector<BufferBuilder> indptr_buffer_builders(ndim - 1);
+ std::vector<BufferBuilder> indices_buffer_builders(ndim);
+
+ const auto* tensor_data = tensor_.raw_data();
+ uint8_t index_buffer[sizeof(int64_t)];
+
+ if (ndim <= 1) {
+ return Status::NotImplemented("TODO for ndim <= 1");
+ } else {
+ const auto& shape = tensor_.shape();
+ for (int64_t n = tensor_.size(); n > 0; n--) {
+ const auto offset = tensor_.CalculateValueOffset(coord);
+ const auto xp = tensor_data + offset;
+
+ if (std::any_of(xp, xp + value_elsize, IsNonZero)) {
+ bool tree_split = false;
+
+ std::copy_n(xp, value_elsize, values);
+ values += value_elsize;
+
+ for (int64_t i = 0; i < ndim; ++i) {
+ int64_t dimension = axis_order[i];
+
+ tree_split = tree_split || (coord[dimension] != previous_coord[dimension]);
+ if (tree_split) {
+ if (i < ndim - 1) {
+ AssignIndex(index_buffer, counts[i + 1], index_elsize);
+ RETURN_NOT_OK(
+ indptr_buffer_builders[i].Append(index_buffer, index_elsize));
+ }
+
+ AssignIndex(index_buffer, coord[dimension], index_elsize);
+ RETURN_NOT_OK(
+ indices_buffer_builders[i].Append(index_buffer, index_elsize));
+
+ ++counts[i];
+ }
+ }
+
+ previous_coord = coord;
+ }
+
+ IncrementIndex(coord, shape, axis_order);
+ }
+ }
+
+ for (int64_t column = 0; column < ndim - 1; ++column) {
+ AssignIndex(index_buffer, counts[column + 1], index_elsize);
+ RETURN_NOT_OK(indptr_buffer_builders[column].Append(index_buffer, index_elsize));
+ }
+
+ // make results
+ data = std::move(values_buffer);
+
+ std::vector<std::shared_ptr<Buffer>> indptr_buffers(ndim - 1);
+ std::vector<std::shared_ptr<Buffer>> indices_buffers(ndim);
+ std::vector<int64_t> indptr_shapes(counts.begin(), counts.end() - 1);
+ std::vector<int64_t> indices_shapes = counts;
+
+ for (int64_t column = 0; column < ndim; ++column) {
+ RETURN_NOT_OK(
+ indices_buffer_builders[column].Finish(&indices_buffers[column], true));
+ }
+ for (int64_t column = 0; column < ndim - 1; ++column) {
+ RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true));
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order,
+ indptr_buffers, indices_buffers));
+ return Status::OK();
+ }
+
+ std::shared_ptr<SparseCSFIndex> sparse_index;
+ std::shared_ptr<Buffer> data;
+
+ private:
+ const Tensor& tensor_;
+ const std::shared_ptr<DataType>& index_value_type_;
+ MemoryPool* pool_;
+};
+
+class TensorBuilderFromSparseCSFTensor : private SparseTensorConverterMixin {
+ using SparseTensorConverterMixin::GetIndexValue;
+
+ MemoryPool* pool_;
+ const SparseCSFTensor* sparse_tensor_;
+ const SparseCSFIndex* sparse_index_;
+ const std::vector<std::shared_ptr<Tensor>>& indptr_;
+ const std::vector<std::shared_ptr<Tensor>>& indices_;
+ const std::vector<int64_t>& axis_order_;
+ const std::vector<int64_t>& shape_;
+ const int64_t non_zero_length_;
+ const int ndim_;
+ const int64_t tensor_size_;
+ const FixedWidthType& value_type_;
+ const int value_elsize_;
+ const uint8_t* raw_data_;
+ std::vector<int64_t> strides_;
+ std::shared_ptr<Buffer> values_buffer_;
+ uint8_t* values_;
+
+ public:
+ TensorBuilderFromSparseCSFTensor(const SparseCSFTensor* sparse_tensor, MemoryPool* pool)
+ : pool_(pool),
+ sparse_tensor_(sparse_tensor),
+ sparse_index_(
+ checked_cast<const SparseCSFIndex*>(sparse_tensor->sparse_index().get())),
+ indptr_(sparse_index_->indptr()),
+ indices_(sparse_index_->indices()),
+ axis_order_(sparse_index_->axis_order()),
+ shape_(sparse_tensor->shape()),
+ non_zero_length_(sparse_tensor->non_zero_length()),
+ ndim_(sparse_tensor->ndim()),
+ tensor_size_(sparse_tensor->size()),
+ value_type_(checked_cast<const FixedWidthType&>(*sparse_tensor->type())),
+ value_elsize_(GetByteWidth(value_type_)),
+ raw_data_(sparse_tensor->raw_data()) {}
+
+ int ElementSize(const std::shared_ptr<Tensor>& tensor) const {
+ return GetByteWidth(*tensor->type());
+ }
+
+ Result<std::shared_ptr<Tensor>> Build() {
+ RETURN_NOT_OK(internal::ComputeRowMajorStrides(value_type_, shape_, &strides_));
+
+ ARROW_ASSIGN_OR_RAISE(values_buffer_,
+ AllocateBuffer(value_elsize_ * tensor_size_, pool_));
+ values_ = values_buffer_->mutable_data();
+ std::fill_n(values_, value_elsize_ * tensor_size_, 0);
+
+ const int64_t start = 0;
+ const int64_t stop = indptr_[0]->size() - 1;
+ ExpandValues(0, 0, start, stop);
+
+ return std::make_shared<Tensor>(sparse_tensor_->type(), std::move(values_buffer_),
+ shape_, strides_, sparse_tensor_->dim_names());
+ }
+
+ void ExpandValues(const int64_t dim, const int64_t dim_offset, const int64_t start,
+ const int64_t stop) {
+ const auto& cur_indices = indices_[dim];
+ const int indices_elsize = ElementSize(cur_indices);
+ const auto* indices_data = cur_indices->raw_data() + start * indices_elsize;
+
+ if (dim == ndim_ - 1) {
+ for (auto i = start; i < stop; ++i) {
+ const int64_t index =
+ SparseTensorConverterMixin::GetIndexValue(indices_data, indices_elsize);
+ const int64_t offset = dim_offset + index * strides_[axis_order_[dim]];
+
+ std::copy_n(raw_data_ + i * value_elsize_, value_elsize_, values_ + offset);
+
+ indices_data += indices_elsize;
+ }
+ } else {
+ const auto& cur_indptr = indptr_[dim];
+ const int indptr_elsize = ElementSize(cur_indptr);
+ const auto* indptr_data = cur_indptr->raw_data() + start * indptr_elsize;
+
+ for (int64_t i = start; i < stop; ++i) {
+ const int64_t index =
+ SparseTensorConverterMixin::GetIndexValue(indices_data, indices_elsize);
+ const int64_t offset = dim_offset + index * strides_[axis_order_[dim]];
+ const int64_t next_start = GetIndexValue(indptr_data, indptr_elsize);
+ const int64_t next_stop =
+ GetIndexValue(indptr_data + indptr_elsize, indptr_elsize);
+
+ ExpandValues(dim + 1, offset, next_start, next_stop);
+
+ indices_data += indices_elsize;
+ indptr_data += indptr_elsize;
+ }
+ }
+ }
+};
+
+} // namespace
+
+Status MakeSparseCSFTensorFromTensor(const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool,
+ std::shared_ptr<SparseIndex>* out_sparse_index,
+ std::shared_ptr<Buffer>* out_data) {
+ SparseCSFTensorConverter converter(tensor, index_value_type, pool);
+ RETURN_NOT_OK(converter.Convert());
+
+ *out_sparse_index = checked_pointer_cast<SparseIndex>(converter.sparse_index);
+ *out_data = converter.data;
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSFTensor(
+ MemoryPool* pool, const SparseCSFTensor* sparse_tensor) {
+ TensorBuilderFromSparseCSFTensor builder(sparse_tensor, pool);
+ return builder.Build();
+}
+
+} // namespace internal
+} // namespace arrow
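
// Editorial annotation: for the 2x2 matrix [[1, 0], [0, 2]] with
// axis_order {0, 1}, the converter above produces
//   values  = {1, 2}
//   indices = {{0, 1},    // coordinates along axis 0
//              {0, 1}}    // coordinates along axis 1
//   indptr  = {{0, 1, 2}} // row 0 owns range [0, 1), row 1 owns [1, 2)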
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc
new file mode 100644
index 00000000000..137b5d3202f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/tensor/converter.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace internal {
+namespace {
+
+// ----------------------------------------------------------------------
+// SparseTensorConverter for SparseCSRIndex and SparseCSCIndex
+
+class SparseCSXMatrixConverter : private SparseTensorConverterMixin {
+ using SparseTensorConverterMixin::AssignIndex;
+ using SparseTensorConverterMixin::IsNonZero;
+
+ public:
+ SparseCSXMatrixConverter(SparseMatrixCompressedAxis axis, const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool)
+ : axis_(axis), tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
+
+ Status Convert() {
+ RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
+ tensor_.shape()));
+
+ const int index_elsize = GetByteWidth(*index_value_type_);
+ const int value_elsize = GetByteWidth(*tensor_.type());
+
+ const int64_t ndim = tensor_.ndim();
+ if (ndim > 2) {
+ return Status::Invalid("Invalid tensor dimension");
+ }
+
+ const int major_axis = static_cast<int>(axis_);
+ const int64_t n_major = tensor_.shape()[major_axis];
+ const int64_t n_minor = tensor_.shape()[1 - major_axis];
+ ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
+
+ std::shared_ptr<Buffer> indptr_buffer;
+ std::shared_ptr<Buffer> indices_buffer;
+
+ ARROW_ASSIGN_OR_RAISE(auto values_buffer,
+ AllocateBuffer(value_elsize * nonzero_count, pool_));
+ auto* values = values_buffer->mutable_data();
+
+ const auto* tensor_data = tensor_.raw_data();
+
+ if (ndim <= 1) {
+ return Status::NotImplemented("TODO for ndim <= 1");
+ } else {
+ ARROW_ASSIGN_OR_RAISE(indptr_buffer,
+ AllocateBuffer(index_elsize * (n_major + 1), pool_));
+ auto* indptr = indptr_buffer->mutable_data();
+
+ ARROW_ASSIGN_OR_RAISE(indices_buffer,
+ AllocateBuffer(index_elsize * nonzero_count, pool_));
+ auto* indices = indices_buffer->mutable_data();
+
+ std::vector<int64_t> coords(2);
+ int64_t k = 0;
+ std::fill_n(indptr, index_elsize, 0);
+ indptr += index_elsize;
+ for (int64_t i = 0; i < n_major; ++i) {
+ for (int64_t j = 0; j < n_minor; ++j) {
+ if (axis_ == SparseMatrixCompressedAxis::ROW) {
+ coords = {i, j};
+ } else {
+ coords = {j, i};
+ }
+ const int64_t offset = tensor_.CalculateValueOffset(coords);
+ if (std::any_of(tensor_data + offset, tensor_data + offset + value_elsize,
+ IsNonZero)) {
+ std::copy_n(tensor_data + offset, value_elsize, values);
+ values += value_elsize;
+
+ AssignIndex(indices, j, index_elsize);
+ indices += index_elsize;
+
+ k++;
+ }
+ }
+ AssignIndex(indptr, k, index_elsize);
+ indptr += index_elsize;
+ }
+ }
+
+ std::vector<int64_t> indptr_shape({n_major + 1});
+ std::shared_ptr<Tensor> indptr_tensor =
+ std::make_shared<Tensor>(index_value_type_, indptr_buffer, indptr_shape);
+
+ std::vector<int64_t> indices_shape({nonzero_count});
+ std::shared_ptr<Tensor> indices_tensor =
+ std::make_shared<Tensor>(index_value_type_, indices_buffer, indices_shape);
+
+ if (axis_ == SparseMatrixCompressedAxis::ROW) {
+ sparse_index = std::make_shared<SparseCSRIndex>(indptr_tensor, indices_tensor);
+ } else {
+ sparse_index = std::make_shared<SparseCSCIndex>(indptr_tensor, indices_tensor);
+ }
+ data = std::move(values_buffer);
+
+ return Status::OK();
+ }
+
+ std::shared_ptr<SparseIndex> sparse_index;
+ std::shared_ptr<Buffer> data;
+
+ private:
+ SparseMatrixCompressedAxis axis_;
+ const Tensor& tensor_;
+ const std::shared_ptr<DataType>& index_value_type_;
+ MemoryPool* pool_;
+};
+
+} // namespace
+
+Status MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis axis,
+ const Tensor& tensor,
+ const std::shared_ptr<DataType>& index_value_type,
+ MemoryPool* pool,
+ std::shared_ptr<SparseIndex>* out_sparse_index,
+ std::shared_ptr<Buffer>* out_data) {
+ SparseCSXMatrixConverter converter(axis, tensor, index_value_type, pool);
+ RETURN_NOT_OK(converter.Convert());
+
+ *out_sparse_index = converter.sparse_index;
+ *out_data = converter.data;
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSXMatrix(
+ SparseMatrixCompressedAxis axis, MemoryPool* pool,
+ const std::shared_ptr<Tensor>& indptr, const std::shared_ptr<Tensor>& indices,
+ const int64_t non_zero_length, const std::shared_ptr<DataType>& value_type,
+ const std::vector<int64_t>& shape, const int64_t tensor_size, const uint8_t* raw_data,
+ const std::vector<std::string>& dim_names) {
+ const auto* indptr_data = indptr->raw_data();
+ const auto* indices_data = indices->raw_data();
+
+ const int indptr_elsize = GetByteWidth(*indptr->type());
+ const int indices_elsize = GetByteWidth(*indices->type());
+
+ const auto& fw_value_type = checked_cast<const FixedWidthType&>(*value_type);
+ const int value_elsize = GetByteWidth(fw_value_type);
+ ARROW_ASSIGN_OR_RAISE(auto values_buffer,
+ AllocateBuffer(value_elsize * tensor_size, pool));
+ auto values = values_buffer->mutable_data();
+ std::fill_n(values, value_elsize * tensor_size, 0);
+
+ std::vector<int64_t> strides;
+ RETURN_NOT_OK(ComputeRowMajorStrides(fw_value_type, shape, &strides));
+
+ const auto nc = shape[1];
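+  // Row-major destination offset: for CSR, i is the row and `index` the
+  // column, so element (i, index) lands at (i * nc + index); for CSC the
+  // roles of i and `index` are swapped.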
+
+ int64_t offset = 0;
+ for (int64_t i = 0; i < indptr->size() - 1; ++i) {
+ const auto start =
+ SparseTensorConverterMixin::GetIndexValue(indptr_data, indptr_elsize);
+ const auto stop = SparseTensorConverterMixin::GetIndexValue(
+ indptr_data + indptr_elsize, indptr_elsize);
+
+ for (int64_t j = start; j < stop; ++j) {
+ const auto index = SparseTensorConverterMixin::GetIndexValue(
+ indices_data + j * indices_elsize, indices_elsize);
+ switch (axis) {
+ case SparseMatrixCompressedAxis::ROW:
+ offset = (index + i * nc) * value_elsize;
+ break;
+ case SparseMatrixCompressedAxis::COLUMN:
+ offset = (i + index * nc) * value_elsize;
+ break;
+ }
+
+ std::copy_n(raw_data, value_elsize, values + offset);
+ raw_data += value_elsize;
+ }
+
+ indptr_data += indptr_elsize;
+ }
+
+ return std::make_shared<Tensor>(value_type, std::move(values_buffer), shape, strides,
+ dim_names);
+}
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSRMatrix(
+ MemoryPool* pool, const SparseCSRMatrix* sparse_tensor) {
+ const auto& sparse_index =
+ internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
+ const auto& indptr = sparse_index.indptr();
+ const auto& indices = sparse_index.indices();
+ const auto non_zero_length = sparse_tensor->non_zero_length();
+ return MakeTensorFromSparseCSXMatrix(
+ SparseMatrixCompressedAxis::ROW, pool, indptr, indices, non_zero_length,
+ sparse_tensor->type(), sparse_tensor->shape(), sparse_tensor->size(),
+ sparse_tensor->raw_data(), sparse_tensor->dim_names());
+}
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSCMatrix(
+ MemoryPool* pool, const SparseCSCMatrix* sparse_tensor) {
+ const auto& sparse_index =
+ internal::checked_cast<const SparseCSCIndex&>(*sparse_tensor->sparse_index());
+ const auto& indptr = sparse_index.indptr();
+ const auto& indices = sparse_index.indices();
+ const auto non_zero_length = sparse_tensor->non_zero_length();
+ return MakeTensorFromSparseCSXMatrix(
+ SparseMatrixCompressedAxis::COLUMN, pool, indptr, indices, non_zero_length,
+ sparse_tensor->type(), sparse_tensor->shape(), sparse_tensor->size(),
+ sparse_tensor->raw_data(), sparse_tensor->dim_names());
+}
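+
+// Usage sketch (illustrative, assuming the SparseCSRMatrix API declared in
+// arrow/sparse_tensor.h):
+//
+//   std::shared_ptr<Tensor> dense = ...;  // shape {nrows, ncols}
+//   ARROW_ASSIGN_OR_RAISE(auto csr, SparseCSRMatrix::Make(*dense, int64()));
+//   ARROW_ASSIGN_OR_RAISE(auto back, csr->ToTensor());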
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type.cc b/contrib/libs/apache/arrow/cpp/src/arrow/type.cc
new file mode 100644
index 00000000000..41914f43663
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type.cc
@@ -0,0 +1,2282 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/type.h"
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+#include <cstddef>
+#include <limits>
+#include <ostream>
+#include <sstream> // IWYU pragma: keep
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/compare.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hash_util.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/range.h"
+#include "arrow/util/vector.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+constexpr Type::type NullType::type_id;
+constexpr Type::type ListType::type_id;
+constexpr Type::type LargeListType::type_id;
+constexpr Type::type MapType::type_id;
+constexpr Type::type FixedSizeListType::type_id;
+constexpr Type::type BinaryType::type_id;
+constexpr Type::type LargeBinaryType::type_id;
+constexpr Type::type StringType::type_id;
+constexpr Type::type LargeStringType::type_id;
+constexpr Type::type FixedSizeBinaryType::type_id;
+constexpr Type::type StructType::type_id;
+constexpr Type::type Decimal128Type::type_id;
+constexpr Type::type Decimal256Type::type_id;
+constexpr Type::type SparseUnionType::type_id;
+constexpr Type::type DenseUnionType::type_id;
+constexpr Type::type Date32Type::type_id;
+constexpr Type::type Date64Type::type_id;
+constexpr Type::type Time32Type::type_id;
+constexpr Type::type Time64Type::type_id;
+constexpr Type::type TimestampType::type_id;
+constexpr Type::type MonthIntervalType::type_id;
+constexpr Type::type DayTimeIntervalType::type_id;
+constexpr Type::type DurationType::type_id;
+constexpr Type::type DictionaryType::type_id;
+
+namespace internal {
+
+struct TypeIdToTypeNameVisitor {
+ std::string out;
+
+ template <typename ArrowType>
+ Status Visit(const ArrowType*) {
+ out = ArrowType::type_name();
+ return Status::OK();
+ }
+};
+
+std::string ToTypeName(Type::type id) {
+ TypeIdToTypeNameVisitor visitor;
+
+ ARROW_CHECK_OK(VisitTypeIdInline(id, &visitor));
+ return std::move(visitor.out);
+}
+
+std::string ToString(Type::type id) {
+ switch (id) {
+#define TO_STRING_CASE(_id) \
+ case Type::_id: \
+ return ARROW_STRINGIFY(_id);
+
+ TO_STRING_CASE(NA)
+ TO_STRING_CASE(BOOL)
+ TO_STRING_CASE(INT8)
+ TO_STRING_CASE(INT16)
+ TO_STRING_CASE(INT32)
+ TO_STRING_CASE(INT64)
+ TO_STRING_CASE(UINT8)
+ TO_STRING_CASE(UINT16)
+ TO_STRING_CASE(UINT32)
+ TO_STRING_CASE(UINT64)
+ TO_STRING_CASE(HALF_FLOAT)
+ TO_STRING_CASE(FLOAT)
+ TO_STRING_CASE(DOUBLE)
+ TO_STRING_CASE(DECIMAL128)
+ TO_STRING_CASE(DECIMAL256)
+ TO_STRING_CASE(DATE32)
+ TO_STRING_CASE(DATE64)
+ TO_STRING_CASE(TIME32)
+ TO_STRING_CASE(TIME64)
+ TO_STRING_CASE(TIMESTAMP)
+ TO_STRING_CASE(INTERVAL_DAY_TIME)
+ TO_STRING_CASE(INTERVAL_MONTHS)
+ TO_STRING_CASE(DURATION)
+ TO_STRING_CASE(STRING)
+ TO_STRING_CASE(BINARY)
+ TO_STRING_CASE(LARGE_STRING)
+ TO_STRING_CASE(LARGE_BINARY)
+ TO_STRING_CASE(FIXED_SIZE_BINARY)
+ TO_STRING_CASE(STRUCT)
+ TO_STRING_CASE(LIST)
+ TO_STRING_CASE(LARGE_LIST)
+ TO_STRING_CASE(FIXED_SIZE_LIST)
+ TO_STRING_CASE(MAP)
+ TO_STRING_CASE(DENSE_UNION)
+ TO_STRING_CASE(SPARSE_UNION)
+ TO_STRING_CASE(DICTIONARY)
+ TO_STRING_CASE(EXTENSION)
+
+#undef TO_STRING_CASE
+
+ default:
+ ARROW_LOG(FATAL) << "Unhandled type id: " << id;
+ return "";
+ }
+}
+
+std::string ToString(TimeUnit::type unit) {
+ switch (unit) {
+ case TimeUnit::SECOND:
+ return "s";
+ case TimeUnit::MILLI:
+ return "ms";
+ case TimeUnit::MICRO:
+ return "us";
+ case TimeUnit::NANO:
+ return "ns";
+ default:
+ DCHECK(false);
+ return "";
+ }
+}
+
+int GetByteWidth(const DataType& type) {
+ const auto& fw_type = checked_cast<const FixedWidthType&>(type);
+ return fw_type.bit_width() / CHAR_BIT;
+}
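+
+// For example, GetByteWidth(*int32()) returns 4 and GetByteWidth(*float64())
+// returns 8. Note that BooleanType reports bit_width() == 1, so the integer
+// division above yields 0 for it; this helper is only meaningful for types
+// whose bit width is a multiple of CHAR_BIT.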
+
+} // namespace internal
+
+namespace {
+
+struct PhysicalTypeVisitor {
+ const std::shared_ptr<DataType>& real_type;
+ std::shared_ptr<DataType> result;
+
+ Status Visit(const DataType&) {
+ result = real_type;
+ return Status::OK();
+ }
+
+ template <typename Type, typename PhysicalType = typename Type::PhysicalType>
+ Status Visit(const Type&) {
+ result = TypeTraits<PhysicalType>::type_singleton();
+ return Status::OK();
+ }
+};
+
+} // namespace
+
+std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& real_type) {
+ PhysicalTypeVisitor visitor{real_type, {}};
+ ARROW_CHECK_OK(VisitTypeInline(*real_type, &visitor));
+ return std::move(visitor.result);
+}
+
+namespace {
+
+using internal::checked_cast;
+
+// Merges `existing` and `other` if one of them is of NullType, otherwise
+// returns nullptr.
+// - if `other` is of NullType or is nullable, the unified field will be nullable.
+// - if `existing` is of NullType but `other` is not, the unified field will
+//   have `other`'s type and will be nullable.
+std::shared_ptr<Field> MaybePromoteNullTypes(const Field& existing, const Field& other) {
+ if (existing.type()->id() != Type::NA && other.type()->id() != Type::NA) {
+ return nullptr;
+ }
+ if (existing.type()->id() == Type::NA) {
+ return other.WithNullable(true)->WithMetadata(existing.metadata());
+ }
+ // `other` must be null.
+ return existing.WithNullable(true);
+}
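+
+// For example (illustrative): merging field("a", null()) with
+// field("a", int32(), /*nullable=*/false) yields a nullable "a: int32"
+// carrying the metadata of the existing field.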
+} // namespace
+
+Field::~Field() {}
+
+bool Field::HasMetadata() const {
+ return (metadata_ != nullptr) && (metadata_->size() > 0);
+}
+
+std::shared_ptr<Field> Field::WithMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& metadata) const {
+ return std::make_shared<Field>(name_, type_, nullable_, metadata);
+}
+
+std::shared_ptr<Field> Field::WithMergedMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& metadata) const {
+ std::shared_ptr<const KeyValueMetadata> merged_metadata;
+ if (metadata_) {
+ merged_metadata = metadata_->Merge(*metadata);
+ } else {
+ merged_metadata = metadata;
+ }
+ return std::make_shared<Field>(name_, type_, nullable_, merged_metadata);
+}
+
+std::shared_ptr<Field> Field::RemoveMetadata() const {
+ return std::make_shared<Field>(name_, type_, nullable_);
+}
+
+std::shared_ptr<Field> Field::WithType(const std::shared_ptr<DataType>& type) const {
+ return std::make_shared<Field>(name_, type, nullable_, metadata_);
+}
+
+std::shared_ptr<Field> Field::WithName(const std::string& name) const {
+ return std::make_shared<Field>(name, type_, nullable_, metadata_);
+}
+
+std::shared_ptr<Field> Field::WithNullable(const bool nullable) const {
+ return std::make_shared<Field>(name_, type_, nullable, metadata_);
+}
+
+Result<std::shared_ptr<Field>> Field::MergeWith(const Field& other,
+ MergeOptions options) const {
+ if (name() != other.name()) {
+ return Status::Invalid("Field ", name(), " doesn't have the same name as ",
+ other.name());
+ }
+
+ if (Equals(other, /*check_metadata=*/false)) {
+ return Copy();
+ }
+
+ if (options.promote_nullability) {
+ if (type()->Equals(other.type())) {
+ return Copy()->WithNullable(nullable() || other.nullable());
+ }
+ std::shared_ptr<Field> promoted = MaybePromoteNullTypes(*this, other);
+ if (promoted) return promoted;
+ }
+
+ return Status::Invalid("Unable to merge: Field ", name(),
+ " has incompatible types: ", type()->ToString(), " vs ",
+ other.type()->ToString());
+}
+
+Result<std::shared_ptr<Field>> Field::MergeWith(const std::shared_ptr<Field>& other,
+ MergeOptions options) const {
+ DCHECK_NE(other, nullptr);
+ return MergeWith(*other, options);
+}
+
+std::vector<std::shared_ptr<Field>> Field::Flatten() const {
+ std::vector<std::shared_ptr<Field>> flattened;
+ if (type_->id() == Type::STRUCT) {
+ for (const auto& child : type_->fields()) {
+ auto flattened_child = child->Copy();
+ flattened.push_back(flattened_child);
+ flattened_child->name_.insert(0, name() + ".");
+ flattened_child->nullable_ |= nullable_;
+ }
+ } else {
+ flattened.push_back(this->Copy());
+ }
+ return flattened;
+}
+
+std::shared_ptr<Field> Field::Copy() const {
+ return ::arrow::field(name_, type_, nullable_, metadata_);
+}
+
+bool Field::Equals(const Field& other, bool check_metadata) const {
+ if (this == &other) {
+ return true;
+ }
+ if (this->name_ == other.name_ && this->nullable_ == other.nullable_ &&
+ this->type_->Equals(*other.type_.get(), check_metadata)) {
+ if (!check_metadata) {
+ return true;
+ } else if (this->HasMetadata() && other.HasMetadata()) {
+ return metadata_->Equals(*other.metadata_);
+ } else if (!this->HasMetadata() && !other.HasMetadata()) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+ return false;
+}
+
+bool Field::Equals(const std::shared_ptr<Field>& other, bool check_metadata) const {
+ return Equals(*other.get(), check_metadata);
+}
+
+bool Field::IsCompatibleWith(const Field& other) const { return MergeWith(other).ok(); }
+
+bool Field::IsCompatibleWith(const std::shared_ptr<Field>& other) const {
+ DCHECK_NE(other, nullptr);
+ return IsCompatibleWith(*other);
+}
+
+std::string Field::ToString(bool show_metadata) const {
+ std::stringstream ss;
+ ss << name_ << ": " << type_->ToString();
+ if (!nullable_) {
+ ss << " not null";
+ }
+ if (show_metadata && metadata_) {
+ ss << metadata_->ToString();
+ }
+ return ss.str();
+}
+
+DataType::~DataType() {}
+
+bool DataType::Equals(const DataType& other, bool check_metadata) const {
+ return TypeEquals(*this, other, check_metadata);
+}
+
+bool DataType::Equals(const std::shared_ptr<DataType>& other) const {
+ if (!other) {
+ return false;
+ }
+ return Equals(*other.get());
+}
+
+size_t DataType::Hash() const {
+ static constexpr size_t kHashSeed = 0;
+ size_t result = kHashSeed;
+ internal::hash_combine(result, this->ComputeFingerprint());
+ return result;
+}
+
+std::ostream& operator<<(std::ostream& os, const DataType& type) {
+ os << type.ToString();
+ return os;
+}
+
+FloatingPointType::Precision HalfFloatType::precision() const {
+ return FloatingPointType::HALF;
+}
+
+FloatingPointType::Precision FloatType::precision() const {
+ return FloatingPointType::SINGLE;
+}
+
+FloatingPointType::Precision DoubleType::precision() const {
+ return FloatingPointType::DOUBLE;
+}
+
+std::string ListType::ToString() const {
+ std::stringstream s;
+ s << "list<" << value_field()->ToString() << ">";
+ return s.str();
+}
+
+std::string LargeListType::ToString() const {
+ std::stringstream s;
+ s << "large_list<" << value_field()->ToString() << ">";
+ return s.str();
+}
+
+MapType::MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<DataType> item_type,
+ bool keys_sorted)
+ : MapType(::arrow::field("key", std::move(key_type), false),
+ ::arrow::field("value", std::move(item_type)), keys_sorted) {}
+
+MapType::MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<Field> item_field,
+ bool keys_sorted)
+ : MapType(::arrow::field("key", std::move(key_type), false), std::move(item_field),
+ keys_sorted) {}
+
+MapType::MapType(std::shared_ptr<Field> key_field, std::shared_ptr<Field> item_field,
+ bool keys_sorted)
+ : MapType(
+ ::arrow::field("entries",
+ struct_({std::move(key_field), std::move(item_field)}), false),
+ keys_sorted) {}
+
+MapType::MapType(std::shared_ptr<Field> value_field, bool keys_sorted)
+ : ListType(std::move(value_field)), keys_sorted_(keys_sorted) {
+ id_ = type_id;
+}
+
+Result<std::shared_ptr<DataType>> MapType::Make(std::shared_ptr<Field> value_field,
+ bool keys_sorted) {
+ const auto& value_type = *value_field->type();
+ if (value_field->nullable() || value_type.id() != Type::STRUCT) {
+ return Status::TypeError("Map entry field should be non-nullable struct");
+ }
+ const auto& struct_type = checked_cast<const StructType&>(value_type);
+ if (struct_type.num_fields() != 2) {
+ return Status::TypeError("Map entry field should have two children (got ",
+ struct_type.num_fields(), ")");
+ }
+ if (struct_type.field(0)->nullable()) {
+ return Status::TypeError("Map key field should be non-nullable");
+ }
+ return std::make_shared<MapType>(std::move(value_field), keys_sorted);
+}
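+
+// For example (illustrative): map(utf8(), int32()) prints as
+// "map<string, int32>". Make() returns a TypeError if the entry field is
+// nullable, is not a struct, does not have exactly two children, or has a
+// nullable key field.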
+
+std::string MapType::ToString() const {
+ std::stringstream s;
+
+ const auto print_field_name = [](std::ostream& os, const Field& field,
+ const char* std_name) {
+ if (field.name() != std_name) {
+ os << " ('" << field.name() << "')";
+ }
+ };
+ const auto print_field = [&](std::ostream& os, const Field& field,
+ const char* std_name) {
+ os << field.type()->ToString();
+ print_field_name(os, field, std_name);
+ };
+
+ s << "map<";
+ print_field(s, *key_field(), "key");
+ s << ", ";
+ print_field(s, *item_field(), "value");
+ if (keys_sorted_) {
+ s << ", keys_sorted";
+ }
+ print_field_name(s, *value_field(), "entries");
+ s << ">";
+ return s.str();
+}
+
+std::string FixedSizeListType::ToString() const {
+ std::stringstream s;
+ s << "fixed_size_list<" << value_field()->ToString() << ">[" << list_size_ << "]";
+ return s.str();
+}
+
+std::string BinaryType::ToString() const { return "binary"; }
+
+std::string LargeBinaryType::ToString() const { return "large_binary"; }
+
+std::string StringType::ToString() const { return "string"; }
+
+std::string LargeStringType::ToString() const { return "large_string"; }
+
+int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); }
+
+Result<std::shared_ptr<DataType>> FixedSizeBinaryType::Make(int32_t byte_width) {
+ if (byte_width < 0) {
+ return Status::Invalid("Negative FixedSizeBinaryType byte width");
+ }
+ if (byte_width > std::numeric_limits<int>::max() / CHAR_BIT) {
+ // bit_width() would overflow
+ return Status::Invalid("byte width of FixedSizeBinaryType too large");
+ }
+ return std::make_shared<FixedSizeBinaryType>(byte_width);
+}
+
+std::string FixedSizeBinaryType::ToString() const {
+ std::stringstream ss;
+ ss << "fixed_size_binary[" << byte_width_ << "]";
+ return ss.str();
+}
+
+// ----------------------------------------------------------------------
+// Date types
+
+DateType::DateType(Type::type type_id) : TemporalType(type_id) {}
+
+Date32Type::Date32Type() : DateType(Type::DATE32) {}
+
+Date64Type::Date64Type() : DateType(Type::DATE64) {}
+
+std::string Date64Type::ToString() const { return std::string("date64[ms]"); }
+
+std::string Date32Type::ToString() const { return std::string("date32[day]"); }
+
+// ----------------------------------------------------------------------
+// Time types
+
+TimeType::TimeType(Type::type type_id, TimeUnit::type unit)
+ : TemporalType(type_id), unit_(unit) {}
+
+Time32Type::Time32Type(TimeUnit::type unit) : TimeType(Type::TIME32, unit) {
+ ARROW_CHECK(unit == TimeUnit::SECOND || unit == TimeUnit::MILLI)
+ << "Must be seconds or milliseconds";
+}
+
+std::string Time32Type::ToString() const {
+ std::stringstream ss;
+ ss << "time32[" << this->unit_ << "]";
+ return ss.str();
+}
+
+Time64Type::Time64Type(TimeUnit::type unit) : TimeType(Type::TIME64, unit) {
+ ARROW_CHECK(unit == TimeUnit::MICRO || unit == TimeUnit::NANO)
+ << "Must be microseconds or nanoseconds";
+}
+
+std::string Time64Type::ToString() const {
+ std::stringstream ss;
+ ss << "time64[" << this->unit_ << "]";
+ return ss.str();
+}
+
+std::ostream& operator<<(std::ostream& os, TimeUnit::type unit) {
+ switch (unit) {
+ case TimeUnit::SECOND:
+ os << "s";
+ break;
+ case TimeUnit::MILLI:
+ os << "ms";
+ break;
+ case TimeUnit::MICRO:
+ os << "us";
+ break;
+ case TimeUnit::NANO:
+ os << "ns";
+ break;
+ }
+ return os;
+}
+
+// ----------------------------------------------------------------------
+// Timestamp types
+
+std::string TimestampType::ToString() const {
+ std::stringstream ss;
+ ss << "timestamp[" << this->unit_;
+ if (this->timezone_.size() > 0) {
+ ss << ", tz=" << this->timezone_;
+ }
+ ss << "]";
+ return ss.str();
+}
+
+// Duration types
+std::string DurationType::ToString() const {
+ std::stringstream ss;
+ ss << "duration[" << this->unit_ << "]";
+ return ss.str();
+}
+
+// ----------------------------------------------------------------------
+// Union type
+
+constexpr int8_t UnionType::kMaxTypeCode;
+constexpr int UnionType::kInvalidChildId;
+
+UnionMode::type UnionType::mode() const {
+ return id_ == Type::SPARSE_UNION ? UnionMode::SPARSE : UnionMode::DENSE;
+}
+
+UnionType::UnionType(std::vector<std::shared_ptr<Field>> fields,
+ std::vector<int8_t> type_codes, Type::type id)
+ : NestedType(id),
+ type_codes_(std::move(type_codes)),
+ child_ids_(kMaxTypeCode + 1, kInvalidChildId) {
+ children_ = std::move(fields);
+ DCHECK_OK(ValidateParameters(children_, type_codes_, mode()));
+ for (int child_id = 0; child_id < static_cast<int>(type_codes_.size()); ++child_id) {
+ const auto type_code = type_codes_[child_id];
+ child_ids_[type_code] = child_id;
+ }
+}
+
+Status UnionType::ValidateParameters(const std::vector<std::shared_ptr<Field>>& fields,
+ const std::vector<int8_t>& type_codes,
+ UnionMode::type mode) {
+ if (fields.size() != type_codes.size()) {
+ return Status::Invalid("Union should get the same number of fields as type codes");
+ }
+ for (const auto type_code : type_codes) {
+ if (type_code < 0 || type_code > kMaxTypeCode) {
+ return Status::Invalid("Union type code out of bounds");
+ }
+ }
+ return Status::OK();
+}
+
+DataTypeLayout UnionType::layout() const {
+ if (mode() == UnionMode::SPARSE) {
+ return DataTypeLayout(
+ {DataTypeLayout::AlwaysNull(), DataTypeLayout::FixedWidth(sizeof(uint8_t))});
+ } else {
+ return DataTypeLayout({DataTypeLayout::AlwaysNull(),
+ DataTypeLayout::FixedWidth(sizeof(uint8_t)),
+ DataTypeLayout::FixedWidth(sizeof(int32_t))});
+ }
+}
+
+uint8_t UnionType::max_type_code() const {
+ return type_codes_.size() == 0
+ ? 0
+ : *std::max_element(type_codes_.begin(), type_codes_.end());
+}
+
+std::string UnionType::ToString() const {
+ std::stringstream s;
+
+ s << name() << "<";
+
+ for (size_t i = 0; i < children_.size(); ++i) {
+ if (i) {
+ s << ", ";
+ }
+ s << children_[i]->ToString() << "=" << static_cast<int>(type_codes_[i]);
+ }
+ s << ">";
+ return s.str();
+}
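+
+// For example (illustrative): a dense union with children "a: int32" (type
+// code 0) and "b: string" (type code 1) prints as
+// "dense_union<a: int32=0, b: string=1>".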
+
+SparseUnionType::SparseUnionType(std::vector<std::shared_ptr<Field>> fields,
+ std::vector<int8_t> type_codes)
+ : UnionType(fields, type_codes, Type::SPARSE_UNION) {}
+
+Result<std::shared_ptr<DataType>> SparseUnionType::Make(
+ std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes) {
+ RETURN_NOT_OK(ValidateParameters(fields, type_codes, UnionMode::SPARSE));
+ return std::make_shared<SparseUnionType>(fields, type_codes);
+}
+
+DenseUnionType::DenseUnionType(std::vector<std::shared_ptr<Field>> fields,
+ std::vector<int8_t> type_codes)
+ : UnionType(fields, type_codes, Type::DENSE_UNION) {}
+
+Result<std::shared_ptr<DataType>> DenseUnionType::Make(
+ std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes) {
+ RETURN_NOT_OK(ValidateParameters(fields, type_codes, UnionMode::DENSE));
+ return std::make_shared<DenseUnionType>(fields, type_codes);
+}
+
+// ----------------------------------------------------------------------
+// Struct type
+
+namespace {
+
+std::unordered_multimap<std::string, int> CreateNameToIndexMap(
+ const std::vector<std::shared_ptr<Field>>& fields) {
+ std::unordered_multimap<std::string, int> name_to_index;
+ for (size_t i = 0; i < fields.size(); ++i) {
+ name_to_index.emplace(fields[i]->name(), static_cast<int>(i));
+ }
+ return name_to_index;
+}
+
+template <int NotFoundValue = -1, int DuplicateFoundValue = -1>
+int LookupNameIndex(const std::unordered_multimap<std::string, int>& name_to_index,
+ const std::string& name) {
+ auto p = name_to_index.equal_range(name);
+ auto it = p.first;
+ if (it == p.second) {
+ // Not found
+ return NotFoundValue;
+ }
+ auto index = it->second;
+ if (++it != p.second) {
+ // Duplicate field name
+ return DuplicateFoundValue;
+ }
+ return index;
+}
+
+} // namespace
+
+class StructType::Impl {
+ public:
+ explicit Impl(const std::vector<std::shared_ptr<Field>>& fields)
+ : name_to_index_(CreateNameToIndexMap(fields)) {}
+
+ const std::unordered_multimap<std::string, int> name_to_index_;
+};
+
+StructType::StructType(const std::vector<std::shared_ptr<Field>>& fields)
+ : NestedType(Type::STRUCT), impl_(new Impl(fields)) {
+ children_ = fields;
+}
+
+StructType::~StructType() {}
+
+std::string StructType::ToString() const {
+ std::stringstream s;
+ s << "struct<";
+ for (int i = 0; i < this->num_fields(); ++i) {
+ if (i > 0) {
+ s << ", ";
+ }
+ std::shared_ptr<Field> field = this->field(i);
+ s << field->ToString();
+ }
+ s << ">";
+ return s.str();
+}
+
+std::shared_ptr<Field> StructType::GetFieldByName(const std::string& name) const {
+ int i = GetFieldIndex(name);
+ return i == -1 ? nullptr : children_[i];
+}
+
+int StructType::GetFieldIndex(const std::string& name) const {
+ return LookupNameIndex(impl_->name_to_index_, name);
+}
+
+std::vector<int> StructType::GetAllFieldIndices(const std::string& name) const {
+ std::vector<int> result;
+ auto p = impl_->name_to_index_.equal_range(name);
+ for (auto it = p.first; it != p.second; ++it) {
+ result.push_back(it->second);
+ }
+ if (result.size() > 1) {
+ std::sort(result.begin(), result.end());
+ }
+ return result;
+}
+
+std::vector<std::shared_ptr<Field>> StructType::GetAllFieldsByName(
+ const std::string& name) const {
+ std::vector<std::shared_ptr<Field>> result;
+ auto p = impl_->name_to_index_.equal_range(name);
+ for (auto it = p.first; it != p.second; ++it) {
+ result.push_back(children_[it->second]);
+ }
+ return result;
+}
+
+Result<std::shared_ptr<DataType>> DecimalType::Make(Type::type type_id, int32_t precision,
+ int32_t scale) {
+ if (type_id == Type::DECIMAL128) {
+ return Decimal128Type::Make(precision, scale);
+ } else if (type_id == Type::DECIMAL256) {
+ return Decimal256Type::Make(precision, scale);
+ } else {
+ return Status::Invalid("Not a decimal type_id: ", type_id);
+ }
+}
+
+// Taken from the Apache Impala codebase. Returns the number of bytes needed
+// to represent a decimal of the given precision (i.e. to hold values up to
+// 10^precision - 1) in two's complement.
+int32_t DecimalType::DecimalSize(int32_t precision) {
+ DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got "
+ << precision;
+
+ // Generated in python with:
+ // >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8))
+ // >>> [-1] + [decimal_size(i) for i in range(1, 77)]
+ constexpr int32_t kBytes[] = {
+ -1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9,
+ 9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17,
+ 17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25,
+ 26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32};
+
+ if (precision <= 76) {
+ return kBytes[precision];
+ }
+ return static_cast<int32_t>(std::ceil((precision / 8.0) * std::log2(10) + 1));
+}
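+
+// Worked example: DecimalSize(18) returns kBytes[18] == 8, matching
+// ceil((18 * log2(10) + 1) / 8) = ceil(60.8 / 8) = 8; 10^18 - 1 indeed fits
+// in a signed 64-bit two's complement integer. The fallback formula used for
+// precision > 76 slightly overestimates, which is safe.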
+
+// ----------------------------------------------------------------------
+// Decimal128 type
+
+Decimal128Type::Decimal128Type(int32_t precision, int32_t scale)
+ : DecimalType(type_id, 16, precision, scale) {
+ ARROW_CHECK_GE(precision, kMinPrecision);
+ ARROW_CHECK_LE(precision, kMaxPrecision);
+}
+
+Result<std::shared_ptr<DataType>> Decimal128Type::Make(int32_t precision, int32_t scale) {
+ if (precision < kMinPrecision || precision > kMaxPrecision) {
+ return Status::Invalid("Decimal precision out of range: ", precision);
+ }
+ return std::make_shared<Decimal128Type>(precision, scale);
+}
+
+// ----------------------------------------------------------------------
+// Decimal256 type
+
+Decimal256Type::Decimal256Type(int32_t precision, int32_t scale)
+ : DecimalType(type_id, 32, precision, scale) {
+ ARROW_CHECK_GE(precision, kMinPrecision);
+ ARROW_CHECK_LE(precision, kMaxPrecision);
+}
+
+Result<std::shared_ptr<DataType>> Decimal256Type::Make(int32_t precision, int32_t scale) {
+ if (precision < kMinPrecision || precision > kMaxPrecision) {
+ return Status::Invalid("Decimal precision out of range: ", precision);
+ }
+ return std::make_shared<Decimal256Type>(precision, scale);
+}
+
+// ----------------------------------------------------------------------
+// Dictionary-encoded type
+
+Status DictionaryType::ValidateParameters(const DataType& index_type,
+ const DataType& value_type) {
+ if (!is_integer(index_type.id())) {
+ return Status::TypeError("Dictionary index type should be integer, got ",
+ index_type.ToString());
+ }
+ return Status::OK();
+}
+
+int DictionaryType::bit_width() const {
+ return checked_cast<const FixedWidthType&>(*index_type_).bit_width();
+}
+
+Result<std::shared_ptr<DataType>> DictionaryType::Make(
+ const std::shared_ptr<DataType>& index_type,
+ const std::shared_ptr<DataType>& value_type, bool ordered) {
+ RETURN_NOT_OK(ValidateParameters(*index_type, *value_type));
+ return std::make_shared<DictionaryType>(index_type, value_type, ordered);
+}
+
+DictionaryType::DictionaryType(const std::shared_ptr<DataType>& index_type,
+ const std::shared_ptr<DataType>& value_type, bool ordered)
+ : FixedWidthType(Type::DICTIONARY),
+ index_type_(index_type),
+ value_type_(value_type),
+ ordered_(ordered) {
+ ARROW_CHECK_OK(ValidateParameters(*index_type_, *value_type_));
+}
+
+DataTypeLayout DictionaryType::layout() const {
+ auto layout = index_type_->layout();
+ layout.has_dictionary = true;
+ return layout;
+}
+
+std::string DictionaryType::ToString() const {
+ std::stringstream ss;
+ ss << this->name() << "<values=" << value_type_->ToString()
+ << ", indices=" << index_type_->ToString() << ", ordered=" << ordered_ << ">";
+ return ss.str();
+}
+
+// ----------------------------------------------------------------------
+// Null type
+
+std::string NullType::ToString() const { return name(); }
+
+// ----------------------------------------------------------------------
+// FieldRef
+
+size_t FieldPath::hash() const {
+ return internal::ComputeStringHash<0>(indices().data(), indices().size() * sizeof(int));
+}
+
+std::string FieldPath::ToString() const {
+ if (this->indices().empty()) {
+ return "FieldPath(empty)";
+ }
+
+ std::string repr = "FieldPath(";
+ for (auto index : this->indices()) {
+ repr += std::to_string(index) + " ";
+ }
+ repr.back() = ')';
+ return repr;
+}
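+
+// For example (illustrative): FieldPath({0, 1}).ToString() returns
+// "FieldPath(0 1)"; the closing parenthesis overwrites the final separator
+// space.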
+
+struct FieldPathGetImpl {
+ static const DataType& GetType(const ArrayData& data) { return *data.type; }
+
+ static void Summarize(const FieldVector& fields, std::stringstream* ss) {
+ *ss << "{ ";
+ for (const auto& field : fields) {
+ *ss << field->ToString() << ", ";
+ }
+ *ss << "}";
+ }
+
+ template <typename T>
+ static void Summarize(const std::vector<T>& columns, std::stringstream* ss) {
+ *ss << "{ ";
+ for (const auto& column : columns) {
+ *ss << GetType(*column) << ", ";
+ }
+ *ss << "}";
+ }
+
+ template <typename T>
+ static Status IndexError(const FieldPath* path, int out_of_range_depth,
+ const std::vector<T>& children) {
+ std::stringstream ss;
+ ss << "index out of range. ";
+
+ ss << "indices=[ ";
+    int depth = 0;
+    for (int i : path->indices()) {
+      if (depth != out_of_range_depth) {
+        ss << i << " ";
+      } else {
+        ss << ">" << i << "< ";
+      }
+      ++depth;
+    }
+ ss << "] ";
+
+ if (std::is_same<T, std::shared_ptr<Field>>::value) {
+ ss << "fields were: ";
+ } else {
+ ss << "columns had types: ";
+ }
+ Summarize(children, &ss);
+
+ return Status::IndexError(ss.str());
+ }
+
+ template <typename T, typename GetChildren>
+ static Result<T> Get(const FieldPath* path, const std::vector<T>* children,
+ GetChildren&& get_children, int* out_of_range_depth) {
+ if (path->indices().empty()) {
+ return Status::Invalid("empty indices cannot be traversed");
+ }
+
+ int depth = 0;
+ const T* out;
+ for (int index : path->indices()) {
+ if (children == nullptr) {
+ return Status::NotImplemented("Get child data of non-struct array");
+ }
+
+ if (index < 0 || static_cast<size_t>(index) >= children->size()) {
+ *out_of_range_depth = depth;
+ return nullptr;
+ }
+
+ out = &children->at(index);
+ children = get_children(*out);
+ ++depth;
+ }
+
+ return *out;
+ }
+
+ template <typename T, typename GetChildren>
+ static Result<T> Get(const FieldPath* path, const std::vector<T>* children,
+ GetChildren&& get_children) {
+ int out_of_range_depth = -1;
+ ARROW_ASSIGN_OR_RAISE(auto child,
+ Get(path, children, std::forward<GetChildren>(get_children),
+ &out_of_range_depth));
+ if (child != nullptr) {
+ return std::move(child);
+ }
+ return IndexError(path, out_of_range_depth, *children);
+ }
+
+ static Result<std::shared_ptr<Field>> Get(const FieldPath* path,
+ const FieldVector& fields) {
+ return FieldPathGetImpl::Get(path, &fields, [](const std::shared_ptr<Field>& field) {
+ return &field->type()->fields();
+ });
+ }
+
+ static Result<std::shared_ptr<ArrayData>> Get(const FieldPath* path,
+ const ArrayDataVector& child_data) {
+ return FieldPathGetImpl::Get(
+ path, &child_data,
+ [](const std::shared_ptr<ArrayData>& data) -> const ArrayDataVector* {
+ if (data->type->id() != Type::STRUCT) {
+ return nullptr;
+ }
+ return &data->child_data;
+ });
+ }
+};
+
+Result<std::shared_ptr<Field>> FieldPath::Get(const Schema& schema) const {
+ return FieldPathGetImpl::Get(this, schema.fields());
+}
+
+Result<std::shared_ptr<Field>> FieldPath::Get(const Field& field) const {
+ return FieldPathGetImpl::Get(this, field.type()->fields());
+}
+
+Result<std::shared_ptr<Field>> FieldPath::Get(const DataType& type) const {
+ return FieldPathGetImpl::Get(this, type.fields());
+}
+
+Result<std::shared_ptr<Field>> FieldPath::Get(const FieldVector& fields) const {
+ return FieldPathGetImpl::Get(this, fields);
+}
+
+Result<std::shared_ptr<Array>> FieldPath::Get(const RecordBatch& batch) const {
+ ARROW_ASSIGN_OR_RAISE(auto data, FieldPathGetImpl::Get(this, batch.column_data()));
+ return MakeArray(std::move(data));
+}
+
+Result<std::shared_ptr<Array>> FieldPath::Get(const Array& array) const {
+ ARROW_ASSIGN_OR_RAISE(auto data, Get(*array.data()));
+ return MakeArray(std::move(data));
+}
+
+Result<std::shared_ptr<ArrayData>> FieldPath::Get(const ArrayData& data) const {
+ if (data.type->id() != Type::STRUCT) {
+ return Status::NotImplemented("Get child data of non-struct array");
+ }
+ return FieldPathGetImpl::Get(this, data.child_data);
+}
+
+FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) {
+ DCHECK_GT(util::get<FieldPath>(impl_).indices().size(), 0);
+}
+
+void FieldRef::Flatten(std::vector<FieldRef> children) {
+ // flatten children
+ struct Visitor {
+ void operator()(std::string* name) { *out++ = FieldRef(std::move(*name)); }
+
+ void operator()(FieldPath* indices) { *out++ = FieldRef(std::move(*indices)); }
+
+ void operator()(std::vector<FieldRef>* children) {
+ for (auto& child : *children) {
+ util::visit(*this, &child.impl_);
+ }
+ }
+
+ std::back_insert_iterator<std::vector<FieldRef>> out;
+ };
+
+ std::vector<FieldRef> out;
+ Visitor visitor{std::back_inserter(out)};
+ visitor(&children);
+
+ DCHECK(!out.empty());
+ DCHECK(std::none_of(out.begin(), out.end(),
+ [](const FieldRef& ref) { return ref.IsNested(); }));
+
+ if (out.size() == 1) {
+ impl_ = std::move(out[0].impl_);
+ } else {
+ impl_ = std::move(out);
+ }
+}
+
+Result<FieldRef> FieldRef::FromDotPath(const std::string& dot_path_arg) {
+ if (dot_path_arg.empty()) {
+ return Status::Invalid("Dot path was empty");
+ }
+
+ std::vector<FieldRef> children;
+
+ util::string_view dot_path = dot_path_arg;
+
+ auto parse_name = [&] {
+ std::string name;
+ for (;;) {
+ auto segment_end = dot_path.find_first_of("\\[.");
+ if (segment_end == util::string_view::npos) {
+ // dot_path doesn't contain any other special characters; consume all
+ name.append(dot_path.begin(), dot_path.end());
+ dot_path = "";
+ break;
+ }
+
+ if (dot_path[segment_end] != '\\') {
+ // segment_end points to a subscript for a new FieldRef
+ name.append(dot_path.begin(), segment_end);
+ dot_path = dot_path.substr(segment_end);
+ break;
+ }
+
+ if (dot_path.size() == segment_end + 1) {
+ // dot_path ends with backslash; consume it all
+ name.append(dot_path.begin(), dot_path.end());
+ dot_path = "";
+ break;
+ }
+
+ // append all characters before backslash, then the character which follows it
+ name.append(dot_path.begin(), segment_end);
+ name.push_back(dot_path[segment_end + 1]);
+ dot_path = dot_path.substr(segment_end + 2);
+ }
+ return name;
+ };
+
+ while (!dot_path.empty()) {
+ auto subscript = dot_path[0];
+ dot_path = dot_path.substr(1);
+ switch (subscript) {
+ case '.': {
+ // next element is a name
+ children.emplace_back(parse_name());
+ continue;
+ }
+ case '[': {
+ auto subscript_end = dot_path.find_first_not_of("0123456789");
+ if (subscript_end == util::string_view::npos || dot_path[subscript_end] != ']') {
+ return Status::Invalid("Dot path '", dot_path_arg,
+ "' contained an unterminated index");
+ }
+ children.emplace_back(std::atoi(dot_path.data()));
+ dot_path = dot_path.substr(subscript_end + 1);
+ continue;
+ }
+ default:
+ return Status::Invalid("Dot path must begin with '[' or '.', got '", dot_path_arg,
+ "'");
+ }
+ }
+
+ FieldRef out;
+ out.Flatten(std::move(children));
+ return out;
+}
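+
+// Examples (illustrative):
+//
+//   FieldRef::FromDotPath(".alpha")         -> FieldRef.Name(alpha)
+//   FieldRef::FromDotPath("[2]")            -> FieldRef.FieldPath(2)
+//   FieldRef::FromDotPath(".alpha[2].beta") -> a nested ref composed of
+//                                              Name(alpha), FieldPath(2),
+//                                              Name(beta)
+//
+// Within a name, a backslash escapes the following character, so fields whose
+// names contain '.', '[' or '\' remain addressable.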
+
+size_t FieldRef::hash() const {
+ struct Visitor : std::hash<std::string> {
+ using std::hash<std::string>::operator();
+
+ size_t operator()(const FieldPath& path) { return path.hash(); }
+
+ size_t operator()(const std::vector<FieldRef>& children) {
+ size_t hash = 0;
+
+ for (const FieldRef& child : children) {
+ hash ^= child.hash();
+ }
+
+ return hash;
+ }
+ };
+
+ return util::visit(Visitor{}, impl_);
+}
+
+std::string FieldRef::ToString() const {
+ struct Visitor {
+ std::string operator()(const FieldPath& path) { return path.ToString(); }
+
+ std::string operator()(const std::string& name) { return "Name(" + name + ")"; }
+
+ std::string operator()(const std::vector<FieldRef>& children) {
+ std::string repr = "Nested(";
+ for (const auto& child : children) {
+ repr += child.ToString() + " ";
+ }
+ repr.resize(repr.size() - 1);
+ repr += ")";
+ return repr;
+ }
+ };
+
+ return "FieldRef." + util::visit(Visitor{}, impl_);
+}
+
+std::vector<FieldPath> FieldRef::FindAll(const Schema& schema) const {
+ if (auto name = this->name()) {
+ return internal::MapVector([](int i) { return FieldPath{i}; },
+ schema.GetAllFieldIndices(*name));
+ }
+ return FindAll(schema.fields());
+}
+
+std::vector<FieldPath> FieldRef::FindAll(const Field& field) const {
+ return FindAll(field.type()->fields());
+}
+
+std::vector<FieldPath> FieldRef::FindAll(const DataType& type) const {
+ return FindAll(type.fields());
+}
+
+std::vector<FieldPath> FieldRef::FindAll(const FieldVector& fields) const {
+ struct Visitor {
+ std::vector<FieldPath> operator()(const FieldPath& path) {
+      // Use the out-parameter overload of Get to avoid constructing a
+      // verbose IndexError message when the path is out of range.
+ int out_of_range_depth;
+ auto maybe_field = FieldPathGetImpl::Get(
+ &path, &fields_,
+ [](const std::shared_ptr<Field>& field) { return &field->type()->fields(); },
+ &out_of_range_depth);
+
+ DCHECK_OK(maybe_field.status());
+
+ if (maybe_field.ValueOrDie() != nullptr) {
+ return {path};
+ }
+ return {};
+ }
+
+ std::vector<FieldPath> operator()(const std::string& name) {
+ std::vector<FieldPath> out;
+
+ for (int i = 0; i < static_cast<int>(fields_.size()); ++i) {
+ if (fields_[i]->name() == name) {
+ out.push_back({i});
+ }
+ }
+
+ return out;
+ }
+
+ struct Matches {
+ // referents[i] is referenced by prefixes[i]
+ std::vector<FieldPath> prefixes;
+ FieldVector referents;
+
+ Matches(std::vector<FieldPath> matches, const FieldVector& fields) {
+ for (auto& match : matches) {
+ Add({}, std::move(match), fields);
+ }
+ }
+
+ Matches() = default;
+
+ size_t size() const { return referents.size(); }
+
+ void Add(const FieldPath& prefix, const FieldPath& suffix,
+ const FieldVector& fields) {
+ auto maybe_field = suffix.Get(fields);
+ DCHECK_OK(maybe_field.status());
+ referents.push_back(std::move(maybe_field).ValueOrDie());
+
+ std::vector<int> concatenated_indices(prefix.indices().size() +
+ suffix.indices().size());
+ auto it = concatenated_indices.begin();
+ for (auto path : {&prefix, &suffix}) {
+ it = std::copy(path->indices().begin(), path->indices().end(), it);
+ }
+ prefixes.emplace_back(std::move(concatenated_indices));
+ }
+ };
+
+ std::vector<FieldPath> operator()(const std::vector<FieldRef>& refs) {
+ DCHECK_GE(refs.size(), 1);
+ Matches matches(refs.front().FindAll(fields_), fields_);
+
+ for (auto ref_it = refs.begin() + 1; ref_it != refs.end(); ++ref_it) {
+ Matches next_matches;
+ for (size_t i = 0; i < matches.size(); ++i) {
+ const auto& referent = *matches.referents[i];
+
+ for (const FieldPath& match : ref_it->FindAll(referent)) {
+ next_matches.Add(matches.prefixes[i], match, referent.type()->fields());
+ }
+ }
+ matches = std::move(next_matches);
+ }
+
+ return matches.prefixes;
+ }
+
+ const FieldVector& fields_;
+ };
+
+ return util::visit(Visitor{fields}, impl_);
+}
+
+std::vector<FieldPath> FieldRef::FindAll(const ArrayData& array) const {
+ return FindAll(*array.type);
+}
+
+std::vector<FieldPath> FieldRef::FindAll(const Array& array) const {
+ return FindAll(*array.type());
+}
+
+std::vector<FieldPath> FieldRef::FindAll(const RecordBatch& batch) const {
+ return FindAll(*batch.schema());
+}
+
+void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); }
+
+// ----------------------------------------------------------------------
+// Schema implementation
+
+std::string EndiannessToString(Endianness endianness) {
+ switch (endianness) {
+ case Endianness::Little:
+ return "little";
+ case Endianness::Big:
+ return "big";
+ default:
+ DCHECK(false) << "invalid endianness";
+ return "???";
+ }
+}
+
+class Schema::Impl {
+ public:
+ Impl(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata)
+ : fields_(std::move(fields)),
+ endianness_(endianness),
+ name_to_index_(CreateNameToIndexMap(fields_)),
+ metadata_(std::move(metadata)) {}
+
+ std::vector<std::shared_ptr<Field>> fields_;
+ Endianness endianness_;
+ std::unordered_multimap<std::string, int> name_to_index_;
+ std::shared_ptr<const KeyValueMetadata> metadata_;
+};
+
+Schema::Schema(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata)
+ : detail::Fingerprintable(),
+ impl_(new Impl(std::move(fields), endianness, std::move(metadata))) {}
+
+Schema::Schema(std::vector<std::shared_ptr<Field>> fields,
+ std::shared_ptr<const KeyValueMetadata> metadata)
+ : detail::Fingerprintable(),
+ impl_(new Impl(std::move(fields), Endianness::Native, std::move(metadata))) {}
+
+Schema::Schema(const Schema& schema)
+ : detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {}
+
+Schema::~Schema() = default;
+
+std::shared_ptr<Schema> Schema::WithEndianness(Endianness endianness) const {
+ return std::make_shared<Schema>(impl_->fields_, endianness, impl_->metadata_);
+}
+
+Endianness Schema::endianness() const { return impl_->endianness_; }
+
+bool Schema::is_native_endian() const { return impl_->endianness_ == Endianness::Native; }
+
+int Schema::num_fields() const { return static_cast<int>(impl_->fields_.size()); }
+
+const std::shared_ptr<Field>& Schema::field(int i) const {
+ DCHECK_GE(i, 0);
+ DCHECK_LT(i, num_fields());
+ return impl_->fields_[i];
+}
+
+const std::vector<std::shared_ptr<Field>>& Schema::fields() const {
+ return impl_->fields_;
+}
+
+bool Schema::Equals(const Schema& other, bool check_metadata) const {
+ if (this == &other) {
+ return true;
+ }
+
+ // checks endianness equality
+ if (endianness() != other.endianness()) {
+ return false;
+ }
+
+ // checks field equality
+ if (num_fields() != other.num_fields()) {
+ return false;
+ }
+
+ if (check_metadata) {
+ const auto& metadata_fp = metadata_fingerprint();
+ const auto& other_metadata_fp = other.metadata_fingerprint();
+ if (metadata_fp != other_metadata_fp) {
+ return false;
+ }
+ }
+
+ // Fast path using fingerprints, if possible
+ const auto& fp = fingerprint();
+ const auto& other_fp = other.fingerprint();
+ if (!fp.empty() && !other_fp.empty()) {
+ return fp == other_fp;
+ }
+
+ // Fall back on field-by-field comparison
+ for (int i = 0; i < num_fields(); ++i) {
+ if (!field(i)->Equals(*other.field(i).get(), check_metadata)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool Schema::Equals(const std::shared_ptr<Schema>& other, bool check_metadata) const {
+ if (other == nullptr) {
+ return false;
+ }
+
+ return Equals(*other, check_metadata);
+}
+
+std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) const {
+ int i = GetFieldIndex(name);
+ return i == -1 ? nullptr : impl_->fields_[i];
+}
+
+int Schema::GetFieldIndex(const std::string& name) const {
+ return LookupNameIndex(impl_->name_to_index_, name);
+}
+
+std::vector<int> Schema::GetAllFieldIndices(const std::string& name) const {
+ std::vector<int> result;
+ auto p = impl_->name_to_index_.equal_range(name);
+ for (auto it = p.first; it != p.second; ++it) {
+ result.push_back(it->second);
+ }
+ if (result.size() > 1) {
+ std::sort(result.begin(), result.end());
+ }
+ return result;
+}
+
+Status Schema::CanReferenceFieldsByNames(const std::vector<std::string>& names) const {
+ for (const auto& name : names) {
+ if (GetFieldByName(name) == nullptr) {
+ return Status::Invalid("Field named '", name,
+ "' not found or not unique in the schema.");
+ }
+ }
+
+ return Status::OK();
+}
+
+std::vector<std::shared_ptr<Field>> Schema::GetAllFieldsByName(
+ const std::string& name) const {
+ std::vector<std::shared_ptr<Field>> result;
+ auto p = impl_->name_to_index_.equal_range(name);
+ for (auto it = p.first; it != p.second; ++it) {
+ result.push_back(impl_->fields_[it->second]);
+ }
+ return result;
+}
+
+Result<std::shared_ptr<Schema>> Schema::AddField(
+ int i, const std::shared_ptr<Field>& field) const {
+ if (i < 0 || i > this->num_fields()) {
+ return Status::Invalid("Invalid column index to add field.");
+ }
+
+ return std::make_shared<Schema>(internal::AddVectorElement(impl_->fields_, i, field),
+ impl_->metadata_);
+}
+
+Result<std::shared_ptr<Schema>> Schema::SetField(
+ int i, const std::shared_ptr<Field>& field) const {
+  if (i < 0 || i >= this->num_fields()) {
+    return Status::Invalid("Invalid column index to set field.");
+ }
+
+ return std::make_shared<Schema>(
+ internal::ReplaceVectorElement(impl_->fields_, i, field), impl_->metadata_);
+}
+
+Result<std::shared_ptr<Schema>> Schema::RemoveField(int i) const {
+ if (i < 0 || i >= this->num_fields()) {
+ return Status::Invalid("Invalid column index to remove field.");
+ }
+
+ return std::make_shared<Schema>(internal::DeleteVectorElement(impl_->fields_, i),
+ impl_->metadata_);
+}
+
+bool Schema::HasMetadata() const {
+ return (impl_->metadata_ != nullptr) && (impl_->metadata_->size() > 0);
+}
+
+bool Schema::HasDistinctFieldNames() const {
+  auto names = field_names();
+  std::unordered_set<std::string> distinct_names{names.cbegin(), names.cend()};
+  return distinct_names.size() == names.size();
+}
+
+std::shared_ptr<Schema> Schema::WithMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& metadata) const {
+ return std::make_shared<Schema>(impl_->fields_, metadata);
+}
+
+const std::shared_ptr<const KeyValueMetadata>& Schema::metadata() const {
+ return impl_->metadata_;
+}
+
+std::shared_ptr<Schema> Schema::RemoveMetadata() const {
+ return std::make_shared<Schema>(impl_->fields_);
+}
+
+std::string Schema::ToString(bool show_metadata) const {
+ std::stringstream buffer;
+
+ int i = 0;
+ for (const auto& field : impl_->fields_) {
+ if (i > 0) {
+ buffer << std::endl;
+ }
+ buffer << field->ToString(show_metadata);
+ ++i;
+ }
+
+ if (impl_->endianness_ != Endianness::Native) {
+ buffer << "\n-- endianness: " << EndiannessToString(impl_->endianness_) << " --";
+ }
+
+ if (show_metadata && HasMetadata()) {
+ buffer << impl_->metadata_->ToString();
+ }
+
+ return buffer.str();
+}
+
+std::vector<std::string> Schema::field_names() const {
+ std::vector<std::string> names;
+ for (const auto& field : impl_->fields_) {
+ names.push_back(field->name());
+ }
+ return names;
+}
+
+class SchemaBuilder::Impl {
+ public:
+ friend class SchemaBuilder;
+ Impl(ConflictPolicy policy, Field::MergeOptions field_merge_options)
+ : policy_(policy), field_merge_options_(field_merge_options) {}
+
+ Impl(std::vector<std::shared_ptr<Field>> fields,
+ std::shared_ptr<const KeyValueMetadata> metadata, ConflictPolicy conflict_policy,
+ Field::MergeOptions field_merge_options)
+ : fields_(std::move(fields)),
+ name_to_index_(CreateNameToIndexMap(fields_)),
+ metadata_(std::move(metadata)),
+ policy_(conflict_policy),
+ field_merge_options_(field_merge_options) {}
+
+ Status AddField(const std::shared_ptr<Field>& field) {
+ DCHECK_NE(field, nullptr);
+
+ // Short-circuit, no lookup needed.
+ if (policy_ == CONFLICT_APPEND) {
+ return AppendField(field);
+ }
+
+ auto name = field->name();
+ constexpr int kNotFound = -1;
+ constexpr int kDuplicateFound = -2;
+ auto i = LookupNameIndex<kNotFound, kDuplicateFound>(name_to_index_, name);
+
+ if (i == kNotFound) {
+ return AppendField(field);
+ }
+
+    // From this point on, at least one field with the same name already
+    // exists in the builder.
+
+    if (policy_ == CONFLICT_IGNORE) {
+      // Unlike merge/replace, the ignore policy tolerates duplicates already
+      // present in the builder.
+      return Status::OK();
+    } else if (policy_ == CONFLICT_ERROR) {
+      return Status::Invalid("Duplicate found, policy dictates to treat it as an error");
+    }
+
+ if (i == kDuplicateFound) {
+      // Cannot merge/replace when more than one field with this name exists
+      // in the builder, because we can't decide which one to target.
+      return Status::Invalid("Cannot merge field ", name,
+                             ": more than one field with the same name exists");
+ }
+
+ DCHECK_GE(i, 0);
+
+ if (policy_ == CONFLICT_REPLACE) {
+ fields_[i] = field;
+ } else if (policy_ == CONFLICT_MERGE) {
+ ARROW_ASSIGN_OR_RAISE(fields_[i], fields_[i]->MergeWith(field));
+ }
+
+ return Status::OK();
+ }
+
+ Status AppendField(const std::shared_ptr<Field>& field) {
+ name_to_index_.emplace(field->name(), static_cast<int>(fields_.size()));
+ fields_.push_back(field);
+ return Status::OK();
+ }
+
+ void Reset() {
+ fields_.clear();
+ name_to_index_.clear();
+ metadata_.reset();
+ }
+
+ private:
+ std::vector<std::shared_ptr<Field>> fields_;
+ std::unordered_multimap<std::string, int> name_to_index_;
+ std::shared_ptr<const KeyValueMetadata> metadata_;
+ ConflictPolicy policy_;
+ Field::MergeOptions field_merge_options_;
+};
+
+SchemaBuilder::SchemaBuilder(ConflictPolicy policy,
+ Field::MergeOptions field_merge_options) {
+ impl_ = internal::make_unique<Impl>(policy, field_merge_options);
+}
+
+SchemaBuilder::SchemaBuilder(std::vector<std::shared_ptr<Field>> fields,
+ ConflictPolicy policy,
+ Field::MergeOptions field_merge_options) {
+ impl_ = internal::make_unique<Impl>(std::move(fields), nullptr, policy,
+ field_merge_options);
+}
+
+SchemaBuilder::SchemaBuilder(const std::shared_ptr<Schema>& schema, ConflictPolicy policy,
+ Field::MergeOptions field_merge_options) {
+ std::shared_ptr<const KeyValueMetadata> metadata;
+ if (schema->HasMetadata()) {
+ metadata = schema->metadata()->Copy();
+ }
+
+ impl_ = internal::make_unique<Impl>(schema->fields(), std::move(metadata), policy,
+ field_merge_options);
+}
+
+SchemaBuilder::~SchemaBuilder() {}
+
+SchemaBuilder::ConflictPolicy SchemaBuilder::policy() const { return impl_->policy_; }
+
+void SchemaBuilder::SetPolicy(SchemaBuilder::ConflictPolicy resolution) {
+ impl_->policy_ = resolution;
+}
+
+Status SchemaBuilder::AddField(const std::shared_ptr<Field>& field) {
+ return impl_->AddField(field);
+}
+
+Status SchemaBuilder::AddFields(const std::vector<std::shared_ptr<Field>>& fields) {
+ for (const auto& field : fields) {
+ RETURN_NOT_OK(AddField(field));
+ }
+
+ return Status::OK();
+}
+
+Status SchemaBuilder::AddSchema(const std::shared_ptr<Schema>& schema) {
+ DCHECK_NE(schema, nullptr);
+ return AddFields(schema->fields());
+}
+
+Status SchemaBuilder::AddSchemas(const std::vector<std::shared_ptr<Schema>>& schemas) {
+ for (const auto& schema : schemas) {
+ RETURN_NOT_OK(AddSchema(schema));
+ }
+
+ return Status::OK();
+}
+
+Status SchemaBuilder::AddMetadata(const KeyValueMetadata& metadata) {
+ impl_->metadata_ = metadata.Copy();
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Schema>> SchemaBuilder::Finish() const {
+ return schema(impl_->fields_, impl_->metadata_);
+}
+
+void SchemaBuilder::Reset() { impl_->Reset(); }
+
+Result<std::shared_ptr<Schema>> SchemaBuilder::Merge(
+ const std::vector<std::shared_ptr<Schema>>& schemas, ConflictPolicy policy) {
+ SchemaBuilder builder{policy};
+ RETURN_NOT_OK(builder.AddSchemas(schemas));
+ return builder.Finish();
+}
+
+Status SchemaBuilder::AreCompatible(const std::vector<std::shared_ptr<Schema>>& schemas,
+ ConflictPolicy policy) {
+ return Merge(schemas, policy).status();
+}
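+
+// Usage sketch (editor's illustration; the schemas and field names below are
+// hypothetical): Merge() unifies a list of schemas under the given policy,
+// and AreCompatible() runs the same merge but only reports its status.
+//
+//   auto s1 = schema({field("a", int32()), field("b", utf8())});
+//   auto s2 = schema({field("b", utf8()), field("c", float64())});
+//   // With CONFLICT_MERGE, "b" merges with itself; result has a, b, c.
+//   Result<std::shared_ptr<Schema>> merged =
+//       SchemaBuilder::Merge({s1, s2}, SchemaBuilder::CONFLICT_MERGE);
+//   Status st = SchemaBuilder::AreCompatible({s1, s2});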
+
+std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields,
+ std::shared_ptr<const KeyValueMetadata> metadata) {
+ return std::make_shared<Schema>(std::move(fields), std::move(metadata));
+}
+
+std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields,
+ Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata) {
+ return std::make_shared<Schema>(std::move(fields), endianness, std::move(metadata));
+}
+
+Result<std::shared_ptr<Schema>> UnifySchemas(
+ const std::vector<std::shared_ptr<Schema>>& schemas,
+ const Field::MergeOptions field_merge_options) {
+ if (schemas.empty()) {
+ return Status::Invalid("Must provide at least one schema to unify.");
+ }
+
+ if (!schemas[0]->HasDistinctFieldNames()) {
+ return Status::Invalid("Can't unify schema with duplicate field names.");
+ }
+
+ SchemaBuilder builder{schemas[0], SchemaBuilder::CONFLICT_MERGE, field_merge_options};
+
+ for (size_t i = 1; i < schemas.size(); i++) {
+ const auto& schema = schemas[i];
+ if (!schema->HasDistinctFieldNames()) {
+ return Status::Invalid("Can't unify schema with duplicate field names.");
+ }
+ RETURN_NOT_OK(builder.AddSchema(schema));
+ }
+
+ return builder.Finish();
+}
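+
+// Illustration (editor's sketch, hypothetical schemas): with the default
+// Field::MergeOptions, promote_nullability lets a NullType field unify with a
+// concrete type, the unified field becoming nullable.
+//
+//   auto s1 = schema({field("a", null())});
+//   auto s2 = schema({field("a", int32(), /*nullable=*/false)});
+//   // Yields a schema with field "a" of type int32, nullable.
+//   Result<std::shared_ptr<Schema>> unified = UnifySchemas({s1, s2});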
+
+// ----------------------------------------------------------------------
+// Fingerprint computations
+
+namespace detail {
+
+Fingerprintable::~Fingerprintable() {
+ delete fingerprint_.load();
+ delete metadata_fingerprint_.load();
+}
+
+template <typename ComputeFingerprint>
+static const std::string& LoadFingerprint(std::atomic<std::string*>* fingerprint,
+ ComputeFingerprint&& compute_fingerprint) {
+ auto new_p = new std::string(std::forward<ComputeFingerprint>(compute_fingerprint)());
+ // Since fingerprint() and metadata_fingerprint() return a *reference* to the
+ // allocated string, the first allocation ever should never be replaced by another
+ // one. Hence the compare_exchange_strong() against nullptr.
+ std::string* expected = nullptr;
+ if (fingerprint->compare_exchange_strong(expected, new_p)) {
+ return *new_p;
+ } else {
+ delete new_p;
+ DCHECK_NE(expected, nullptr);
+ return *expected;
+ }
+}
+
+const std::string& Fingerprintable::LoadFingerprintSlow() const {
+ return LoadFingerprint(&fingerprint_, [this]() { return ComputeFingerprint(); });
+}
+
+const std::string& Fingerprintable::LoadMetadataFingerprintSlow() const {
+ return LoadFingerprint(&metadata_fingerprint_,
+ [this]() { return ComputeMetadataFingerprint(); });
+}
+
+} // namespace detail
+
+static inline std::string TypeIdFingerprint(const DataType& type) {
+ auto c = static_cast<int>(type.id()) + 'A';
+ DCHECK_GE(c, 0);
+  DCHECK_LT(c, 128); // Unlikely to happen anytime soon
+ // Prefix with an unusual character in order to disambiguate
+ std::string s{'@', static_cast<char>(c)};
+ return s;
+}
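+
+// Worked example: Type::NA is id 0, so NullType fingerprints as "@A"; an id
+// of 1 maps to "@B", and so on through the Type::type enum.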
+
+static char TimeUnitFingerprint(TimeUnit::type unit) {
+ switch (unit) {
+ case TimeUnit::SECOND:
+ return 's';
+ case TimeUnit::MILLI:
+ return 'm';
+ case TimeUnit::MICRO:
+ return 'u';
+ case TimeUnit::NANO:
+ return 'n';
+ default:
+ DCHECK(false) << "Unexpected TimeUnit";
+ return '\0';
+ }
+}
+
+static char IntervalTypeFingerprint(IntervalType::type unit) {
+ switch (unit) {
+ case IntervalType::DAY_TIME:
+ return 'd';
+ case IntervalType::MONTHS:
+ return 'M';
+ default:
+ DCHECK(false) << "Unexpected IntervalType::type";
+ return '\0';
+ }
+}
+
+static void AppendMetadataFingerprint(const KeyValueMetadata& metadata,
+ std::stringstream* ss) {
+ // Compute metadata fingerprint. KeyValueMetadata is not immutable,
+ // so we don't cache the result on the metadata instance.
+ const auto pairs = metadata.sorted_pairs();
+ if (!pairs.empty()) {
+ *ss << "!{";
+ for (const auto& p : pairs) {
+ const auto& k = p.first;
+ const auto& v = p.second;
+ // Since metadata strings can contain arbitrary characters, prefix with
+ // string length to disambiguate.
+ *ss << k.length() << ':' << k << ':';
+ *ss << v.length() << ':' << v << ';';
+ }
+ *ss << '}';
+ }
+}
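+
+// Worked example: metadata {"key": "value"} serializes as "!{3:key:5:value;}".
+// The length prefixes keep e.g. ("ab", "c") distinguishable from ("a", "bc").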
+
+std::string Field::ComputeFingerprint() const {
+ const auto& type_fingerprint = type_->fingerprint();
+ if (type_fingerprint.empty()) {
+ // Underlying DataType doesn't support fingerprinting.
+ return "";
+ }
+ std::stringstream ss;
+ ss << 'F';
+ if (nullable_) {
+ ss << 'n';
+ } else {
+ ss << 'N';
+ }
+ ss << name_;
+ ss << '{' << type_fingerprint << '}';
+ return ss.str();
+}
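+
+// Worked example: a nullable int32 field named "x" fingerprints as
+// "Fnx{<int32 fingerprint>}", with 'n'/'N' encoding nullability.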
+
+std::string Field::ComputeMetadataFingerprint() const {
+ std::stringstream ss;
+ if (metadata_) {
+ AppendMetadataFingerprint(*metadata_, &ss);
+ }
+ const auto& type_fingerprint = type_->metadata_fingerprint();
+ if (!type_fingerprint.empty()) {
+ ss << "+{" << type_->metadata_fingerprint() << "}";
+ }
+ return ss.str();
+}
+
+std::string Schema::ComputeFingerprint() const {
+ std::stringstream ss;
+ ss << "S{";
+ for (const auto& field : fields()) {
+ const auto& field_fingerprint = field->fingerprint();
+ if (field_fingerprint.empty()) {
+ return "";
+ }
+ ss << field_fingerprint << ";";
+ }
+ ss << (endianness() == Endianness::Little ? "L" : "B");
+ ss << "}";
+ return ss.str();
+}
+
+std::string Schema::ComputeMetadataFingerprint() const {
+ std::stringstream ss;
+ if (HasMetadata()) {
+ AppendMetadataFingerprint(*metadata(), &ss);
+ }
+ ss << "S{";
+ for (const auto& field : fields()) {
+ const auto& field_fingerprint = field->metadata_fingerprint();
+ ss << field_fingerprint << ";";
+ }
+ ss << "}";
+ return ss.str();
+}
+
+void PrintTo(const Schema& s, std::ostream* os) { *os << s; }
+
+std::string DataType::ComputeFingerprint() const {
+ // Default implementation returns empty string, signalling non-implemented
+ // functionality.
+ return "";
+}
+
+std::string DataType::ComputeMetadataFingerprint() const {
+ // Whatever the data type, metadata can only be found on child fields
+ std::string s;
+ for (const auto& child : children_) {
+ s += child->metadata_fingerprint() + ";";
+ }
+ return s;
+}
+
+#define PARAMETER_LESS_FINGERPRINT(TYPE_CLASS) \
+ std::string TYPE_CLASS##Type::ComputeFingerprint() const { \
+ return TypeIdFingerprint(*this); \
+ }
+
+PARAMETER_LESS_FINGERPRINT(Null)
+PARAMETER_LESS_FINGERPRINT(Boolean)
+PARAMETER_LESS_FINGERPRINT(Int8)
+PARAMETER_LESS_FINGERPRINT(Int16)
+PARAMETER_LESS_FINGERPRINT(Int32)
+PARAMETER_LESS_FINGERPRINT(Int64)
+PARAMETER_LESS_FINGERPRINT(UInt8)
+PARAMETER_LESS_FINGERPRINT(UInt16)
+PARAMETER_LESS_FINGERPRINT(UInt32)
+PARAMETER_LESS_FINGERPRINT(UInt64)
+PARAMETER_LESS_FINGERPRINT(HalfFloat)
+PARAMETER_LESS_FINGERPRINT(Float)
+PARAMETER_LESS_FINGERPRINT(Double)
+PARAMETER_LESS_FINGERPRINT(Binary)
+PARAMETER_LESS_FINGERPRINT(LargeBinary)
+PARAMETER_LESS_FINGERPRINT(String)
+PARAMETER_LESS_FINGERPRINT(LargeString)
+PARAMETER_LESS_FINGERPRINT(Date32)
+PARAMETER_LESS_FINGERPRINT(Date64)
+
+#undef PARAMETER_LESS_FINGERPRINT
+
+std::string DictionaryType::ComputeFingerprint() const {
+ const auto& index_fingerprint = index_type_->fingerprint();
+ const auto& value_fingerprint = value_type_->fingerprint();
+ std::string ordered_fingerprint = ordered_ ? "1" : "0";
+
+ DCHECK(!index_fingerprint.empty()); // it's an integer type
+ if (!value_fingerprint.empty()) {
+ return TypeIdFingerprint(*this) + index_fingerprint + value_fingerprint +
+ ordered_fingerprint;
+ }
+ return ordered_fingerprint;
+}
+
+std::string ListType::ComputeFingerprint() const {
+ const auto& child_fingerprint = children_[0]->fingerprint();
+ if (!child_fingerprint.empty()) {
+ return TypeIdFingerprint(*this) + "{" + child_fingerprint + "}";
+ }
+ return "";
+}
+
+std::string LargeListType::ComputeFingerprint() const {
+ const auto& child_fingerprint = children_[0]->fingerprint();
+ if (!child_fingerprint.empty()) {
+ return TypeIdFingerprint(*this) + "{" + child_fingerprint + "}";
+ }
+ return "";
+}
+
+std::string MapType::ComputeFingerprint() const {
+ const auto& key_fingerprint = key_type()->fingerprint();
+ const auto& item_fingerprint = item_type()->fingerprint();
+ if (!key_fingerprint.empty() && !item_fingerprint.empty()) {
+ if (keys_sorted_) {
+ return TypeIdFingerprint(*this) + "s{" + key_fingerprint + item_fingerprint + "}";
+ } else {
+ return TypeIdFingerprint(*this) + "{" + key_fingerprint + item_fingerprint + "}";
+ }
+ }
+ return "";
+}
+
+std::string FixedSizeListType::ComputeFingerprint() const {
+ const auto& child_fingerprint = children_[0]->fingerprint();
+ if (!child_fingerprint.empty()) {
+ std::stringstream ss;
+ ss << TypeIdFingerprint(*this) << "[" << list_size_ << "]"
+ << "{" << child_fingerprint << "}";
+ return ss.str();
+ }
+ return "";
+}
+
+std::string FixedSizeBinaryType::ComputeFingerprint() const {
+ std::stringstream ss;
+ ss << TypeIdFingerprint(*this) << "[" << byte_width_ << "]";
+ return ss.str();
+}
+
+std::string DecimalType::ComputeFingerprint() const {
+ std::stringstream ss;
+ ss << TypeIdFingerprint(*this) << "[" << byte_width_ << "," << precision_ << ","
+ << scale_ << "]";
+ return ss.str();
+}
+
+std::string StructType::ComputeFingerprint() const {
+ std::stringstream ss;
+ ss << TypeIdFingerprint(*this) << "{";
+ for (const auto& child : children_) {
+ const auto& child_fingerprint = child->fingerprint();
+ if (child_fingerprint.empty()) {
+ return "";
+ }
+ ss << child_fingerprint << ";";
+ }
+ ss << "}";
+ return ss.str();
+}
+
+std::string UnionType::ComputeFingerprint() const {
+ std::stringstream ss;
+ ss << TypeIdFingerprint(*this);
+ switch (mode()) {
+ case UnionMode::SPARSE:
+ ss << "[s";
+ break;
+ case UnionMode::DENSE:
+ ss << "[d";
+ break;
+ default:
+ DCHECK(false) << "Unexpected UnionMode";
+ }
+ for (const auto code : type_codes_) {
+ // Represent code as integer, not raw character
+ ss << ':' << static_cast<int32_t>(code);
+ }
+ ss << "]{";
+ for (const auto& child : children_) {
+ const auto& child_fingerprint = child->fingerprint();
+ if (child_fingerprint.empty()) {
+ return "";
+ }
+ ss << child_fingerprint << ";";
+ }
+ ss << "}";
+ return ss.str();
+}
+
+std::string TimeType::ComputeFingerprint() const {
+ std::stringstream ss;
+ ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_);
+ return ss.str();
+}
+
+std::string TimestampType::ComputeFingerprint() const {
+ std::stringstream ss;
+ ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_) << timezone_.length()
+ << ':' << timezone_;
+ return ss.str();
+}
+
+std::string IntervalType::ComputeFingerprint() const {
+ std::stringstream ss;
+ ss << TypeIdFingerprint(*this) << IntervalTypeFingerprint(interval_type());
+ return ss.str();
+}
+
+std::string DurationType::ComputeFingerprint() const {
+ std::stringstream ss;
+ ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_);
+ return ss.str();
+}
+
+// ----------------------------------------------------------------------
+// Visitors and factory functions
+
+Status DataType::Accept(TypeVisitor* visitor) const {
+ return VisitTypeInline(*this, visitor);
+}
+
+#define TYPE_FACTORY(NAME, KLASS) \
+ std::shared_ptr<DataType> NAME() { \
+ static std::shared_ptr<DataType> result = std::make_shared<KLASS>(); \
+ return result; \
+ }
+
+TYPE_FACTORY(null, NullType)
+TYPE_FACTORY(boolean, BooleanType)
+TYPE_FACTORY(int8, Int8Type)
+TYPE_FACTORY(uint8, UInt8Type)
+TYPE_FACTORY(int16, Int16Type)
+TYPE_FACTORY(uint16, UInt16Type)
+TYPE_FACTORY(int32, Int32Type)
+TYPE_FACTORY(uint32, UInt32Type)
+TYPE_FACTORY(int64, Int64Type)
+TYPE_FACTORY(uint64, UInt64Type)
+TYPE_FACTORY(float16, HalfFloatType)
+TYPE_FACTORY(float32, FloatType)
+TYPE_FACTORY(float64, DoubleType)
+TYPE_FACTORY(utf8, StringType)
+TYPE_FACTORY(large_utf8, LargeStringType)
+TYPE_FACTORY(binary, BinaryType)
+TYPE_FACTORY(large_binary, LargeBinaryType)
+TYPE_FACTORY(date64, Date64Type)
+TYPE_FACTORY(date32, Date32Type)
+
+std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width) {
+ return std::make_shared<FixedSizeBinaryType>(byte_width);
+}
+
+std::shared_ptr<DataType> duration(TimeUnit::type unit) {
+ return std::make_shared<DurationType>(unit);
+}
+
+std::shared_ptr<DataType> day_time_interval() {
+ return std::make_shared<DayTimeIntervalType>();
+}
+
+std::shared_ptr<DataType> month_interval() {
+ return std::make_shared<MonthIntervalType>();
+}
+
+std::shared_ptr<DataType> timestamp(TimeUnit::type unit) {
+ return std::make_shared<TimestampType>(unit);
+}
+
+std::shared_ptr<DataType> timestamp(TimeUnit::type unit, const std::string& timezone) {
+ return std::make_shared<TimestampType>(unit, timezone);
+}
+
+std::shared_ptr<DataType> time32(TimeUnit::type unit) {
+ return std::make_shared<Time32Type>(unit);
+}
+
+std::shared_ptr<DataType> time64(TimeUnit::type unit) {
+ return std::make_shared<Time64Type>(unit);
+}
+
+std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
+ return std::make_shared<ListType>(value_type);
+}
+
+std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_field) {
+ return std::make_shared<ListType>(value_field);
+}
+
+std::shared_ptr<DataType> large_list(const std::shared_ptr<DataType>& value_type) {
+ return std::make_shared<LargeListType>(value_type);
+}
+
+std::shared_ptr<DataType> large_list(const std::shared_ptr<Field>& value_field) {
+ return std::make_shared<LargeListType>(value_field);
+}
+
+std::shared_ptr<DataType> map(std::shared_ptr<DataType> key_type,
+ std::shared_ptr<DataType> item_type, bool keys_sorted) {
+ return std::make_shared<MapType>(std::move(key_type), std::move(item_type),
+ keys_sorted);
+}
+
+std::shared_ptr<DataType> map(std::shared_ptr<DataType> key_type,
+ std::shared_ptr<Field> item_field, bool keys_sorted) {
+ return std::make_shared<MapType>(std::move(key_type), std::move(item_field),
+ keys_sorted);
+}
+
+std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<DataType>& value_type,
+ int32_t list_size) {
+ return std::make_shared<FixedSizeListType>(value_type, list_size);
+}
+
+std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<Field>& value_field,
+ int32_t list_size) {
+ return std::make_shared<FixedSizeListType>(value_field, list_size);
+}
+
+std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fields) {
+ return std::make_shared<StructType>(fields);
+}
+
+std::shared_ptr<DataType> sparse_union(FieldVector child_fields,
+ std::vector<int8_t> type_codes) {
+ if (type_codes.empty()) {
+ type_codes = internal::Iota(static_cast<int8_t>(child_fields.size()));
+ }
+ return std::make_shared<SparseUnionType>(std::move(child_fields),
+ std::move(type_codes));
+}
+std::shared_ptr<DataType> dense_union(FieldVector child_fields,
+ std::vector<int8_t> type_codes) {
+ if (type_codes.empty()) {
+ type_codes = internal::Iota(static_cast<int8_t>(child_fields.size()));
+ }
+ return std::make_shared<DenseUnionType>(std::move(child_fields), std::move(type_codes));
+}
+
+FieldVector FieldsFromArraysAndNames(std::vector<std::string> names,
+ const ArrayVector& arrays) {
+ FieldVector fields(arrays.size());
+ int i = 0;
+ if (names.empty()) {
+ for (const auto& array : arrays) {
+ fields[i] = field(std::to_string(i), array->type());
+ ++i;
+ }
+ } else {
+ DCHECK_EQ(names.size(), arrays.size());
+ for (const auto& array : arrays) {
+ fields[i] = field(std::move(names[i]), array->type());
+ ++i;
+ }
+ }
+ return fields;
+}
+
+std::shared_ptr<DataType> sparse_union(const ArrayVector& children,
+ std::vector<std::string> field_names,
+ std::vector<int8_t> type_codes) {
+ if (type_codes.empty()) {
+ type_codes = internal::Iota(static_cast<int8_t>(children.size()));
+ }
+ auto fields = FieldsFromArraysAndNames(std::move(field_names), children);
+ return sparse_union(std::move(fields), std::move(type_codes));
+}
+
+std::shared_ptr<DataType> dense_union(const ArrayVector& children,
+ std::vector<std::string> field_names,
+ std::vector<int8_t> type_codes) {
+ if (type_codes.empty()) {
+ type_codes = internal::Iota(static_cast<int8_t>(children.size()));
+ }
+ auto fields = FieldsFromArraysAndNames(std::move(field_names), children);
+ return dense_union(std::move(fields), std::move(type_codes));
+}
+
+std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
+ const std::shared_ptr<DataType>& dict_type,
+ bool ordered) {
+ return std::make_shared<DictionaryType>(index_type, dict_type, ordered);
+}
+
+std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
+ bool nullable,
+ std::shared_ptr<const KeyValueMetadata> metadata) {
+ return std::make_shared<Field>(std::move(name), std::move(type), nullable,
+ std::move(metadata));
+}
+
+std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
+ std::shared_ptr<const KeyValueMetadata> metadata) {
+ return std::make_shared<Field>(std::move(name), std::move(type), /*nullable=*/true,
+ std::move(metadata));
+}
+
+std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale) {
+ return precision <= Decimal128Type::kMaxPrecision ? decimal128(precision, scale)
+ : decimal256(precision, scale);
+}
+
+std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale) {
+ return std::make_shared<Decimal128Type>(precision, scale);
+}
+
+std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale) {
+ return std::make_shared<Decimal256Type>(precision, scale);
+}
+
+std::string Decimal128Type::ToString() const {
+ std::stringstream s;
+ s << "decimal128(" << precision_ << ", " << scale_ << ")";
+ return s.str();
+}
+
+std::string Decimal256Type::ToString() const {
+ std::stringstream s;
+ s << "decimal256(" << precision_ << ", " << scale_ << ")";
+ return s.str();
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type.h b/contrib/libs/apache/arrow/cpp/src/arrow/type.h
new file mode 100644
index 00000000000..b933da66089
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type.h
@@ -0,0 +1,1930 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <climits>
+#include <cstdint>
+#include <iosfwd>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/type_fwd.h" // IWYU pragma: export
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/variant.h"
+#include "arrow/util/visibility.h"
+#include "arrow/visitor.h" // IWYU pragma: keep
+
+namespace arrow {
+namespace detail {
+
+class ARROW_EXPORT Fingerprintable {
+ public:
+ virtual ~Fingerprintable();
+
+ const std::string& fingerprint() const {
+ auto p = fingerprint_.load();
+ if (ARROW_PREDICT_TRUE(p != NULLPTR)) {
+ return *p;
+ }
+ return LoadFingerprintSlow();
+ }
+
+ const std::string& metadata_fingerprint() const {
+ auto p = metadata_fingerprint_.load();
+ if (ARROW_PREDICT_TRUE(p != NULLPTR)) {
+ return *p;
+ }
+ return LoadMetadataFingerprintSlow();
+ }
+
+ protected:
+ const std::string& LoadFingerprintSlow() const;
+ const std::string& LoadMetadataFingerprintSlow() const;
+
+ virtual std::string ComputeFingerprint() const = 0;
+ virtual std::string ComputeMetadataFingerprint() const = 0;
+
+ mutable std::atomic<std::string*> fingerprint_;
+ mutable std::atomic<std::string*> metadata_fingerprint_;
+};
+
+} // namespace detail
+
+/// EXPERIMENTAL: Layout specification for a data type
+struct ARROW_EXPORT DataTypeLayout {
+ enum BufferKind { FIXED_WIDTH, VARIABLE_WIDTH, BITMAP, ALWAYS_NULL };
+
+ /// Layout specification for a single data type buffer
+ struct BufferSpec {
+ BufferKind kind;
+ int64_t byte_width; // For FIXED_WIDTH
+
+ bool operator==(const BufferSpec& other) const {
+ return kind == other.kind &&
+ (kind != FIXED_WIDTH || byte_width == other.byte_width);
+ }
+ bool operator!=(const BufferSpec& other) const { return !(*this == other); }
+ };
+
+ static BufferSpec FixedWidth(int64_t w) { return BufferSpec{FIXED_WIDTH, w}; }
+ static BufferSpec VariableWidth() { return BufferSpec{VARIABLE_WIDTH, -1}; }
+ static BufferSpec Bitmap() { return BufferSpec{BITMAP, -1}; }
+ static BufferSpec AlwaysNull() { return BufferSpec{ALWAYS_NULL, -1}; }
+
+ /// A vector of buffer layout specifications, one for each expected buffer
+ std::vector<BufferSpec> buffers;
+ /// Whether this type expects an associated dictionary array.
+ bool has_dictionary = false;
+
+ explicit DataTypeLayout(std::vector<BufferSpec> v) : buffers(std::move(v)) {}
+};
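+
+// For illustration, a hypothetical 32-bit fixed-width type would describe
+// itself with a validity bitmap plus a 4-byte data buffer:
+//
+//   DataTypeLayout layout({DataTypeLayout::Bitmap(),
+//                          DataTypeLayout::FixedWidth(4)});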
+
+/// \brief Base class for all data types
+///
+/// Data types in this library are all *logical*. They can be expressed as
+/// either a primitive physical type (bytes or bits of some fixed size), a
+/// nested type consisting of other data types, or another data type (e.g. a
+/// timestamp encoded as an int64).
+///
+/// Simple datatypes may be entirely described by their Type::type id, but
+/// complex datatypes are usually parametric.
+class ARROW_EXPORT DataType : public detail::Fingerprintable {
+ public:
+ explicit DataType(Type::type id) : detail::Fingerprintable(), id_(id) {}
+ ~DataType() override;
+
+ /// \brief Return whether the types are equal
+ ///
+ /// Types that are logically convertible from one to another (e.g. List<UInt8>
+ /// and Binary) are NOT equal.
+ bool Equals(const DataType& other, bool check_metadata = false) const;
+
+ /// \brief Return whether the types are equal
+ bool Equals(const std::shared_ptr<DataType>& other) const;
+
+ ARROW_DEPRECATED("Use field(i)")
+ const std::shared_ptr<Field>& child(int i) const { return field(i); }
+
+  /// Returns the child field at index i.
+ const std::shared_ptr<Field>& field(int i) const { return children_[i]; }
+
+ ARROW_DEPRECATED("Use fields()")
+ const std::vector<std::shared_ptr<Field>>& children() const { return fields(); }
+
+  /// \brief Returns the child fields associated with this type.
+ const std::vector<std::shared_ptr<Field>>& fields() const { return children_; }
+
+ ARROW_DEPRECATED("Use num_fields()")
+ int num_children() const { return num_fields(); }
+
+  /// \brief Returns the number of child fields associated with this type.
+ int num_fields() const { return static_cast<int>(children_.size()); }
+
+ Status Accept(TypeVisitor* visitor) const;
+
+ /// \brief A string representation of the type, including any children
+ virtual std::string ToString() const = 0;
+
+ /// \brief Return hash value (excluding metadata in child fields)
+ size_t Hash() const;
+
+ /// \brief A string name of the type, omitting any child fields
+ ///
+ /// \note Experimental API
+ /// \since 0.7.0
+ virtual std::string name() const = 0;
+
+ /// \brief Return the data type layout. Children are not included.
+ ///
+ /// \note Experimental API
+ virtual DataTypeLayout layout() const = 0;
+
+ /// \brief Return the type category
+ Type::type id() const { return id_; }
+
+ protected:
+  // Dummy version that returns an empty string (indicating not implemented).
+ // Subclasses should override for fast equality checks.
+ std::string ComputeFingerprint() const override;
+
+  // Generic version that works for all regular types, nested or not.
+ std::string ComputeMetadataFingerprint() const override;
+
+ Type::type id_;
+ std::vector<std::shared_ptr<Field>> children_;
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(DataType);
+};
+
+ARROW_EXPORT
+std::ostream& operator<<(std::ostream& os, const DataType& type);
+
+/// \brief Return the compatible physical data type
+///
+/// Some types may have distinct logical meanings but the exact same physical
+/// representation. For example, TimestampType has Int64Type as a physical
+/// type (defined as TimestampType::PhysicalType).
+///
+/// The return value is as follows:
+/// - if a `PhysicalType` alias exists in the concrete type class, return
+/// an instance of `PhysicalType`.
+/// - otherwise, return the input type itself.
+std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& type);
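+
+// For example, GetPhysicalType(timestamp(TimeUnit::MICRO)) yields an
+// Int64Type instance, while GetPhysicalType(int32()) returns the input type,
+// Int32Type already being its own physical representation.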
+
+/// \brief Base class for all fixed-width data types
+class ARROW_EXPORT FixedWidthType : public DataType {
+ public:
+ using DataType::DataType;
+
+ virtual int bit_width() const = 0;
+};
+
+/// \brief Base class for all data types representing primitive values
+class ARROW_EXPORT PrimitiveCType : public FixedWidthType {
+ public:
+ using FixedWidthType::FixedWidthType;
+};
+
+/// \brief Base class for all numeric data types
+class ARROW_EXPORT NumberType : public PrimitiveCType {
+ public:
+ using PrimitiveCType::PrimitiveCType;
+};
+
+/// \brief Base class for all integral data types
+class ARROW_EXPORT IntegerType : public NumberType {
+ public:
+ using NumberType::NumberType;
+ virtual bool is_signed() const = 0;
+};
+
+/// \brief Base class for all floating-point data types
+class ARROW_EXPORT FloatingPointType : public NumberType {
+ public:
+ using NumberType::NumberType;
+ enum Precision { HALF, SINGLE, DOUBLE };
+ virtual Precision precision() const = 0;
+};
+
+/// \brief Base class for all parametric data types
+class ParametricType {};
+
+class ARROW_EXPORT NestedType : public DataType, public ParametricType {
+ public:
+ using DataType::DataType;
+};
+
+/// \brief The combination of a field name and data type, with optional metadata
+///
+/// Fields are used to describe the individual constituents of a
+/// nested DataType or a Schema.
+///
+/// A field's metadata is represented by a KeyValueMetadata instance,
+/// which holds arbitrary key-value pairs.
+class ARROW_EXPORT Field : public detail::Fingerprintable {
+ public:
+ Field(std::string name, std::shared_ptr<DataType> type, bool nullable = true,
+ std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR)
+ : detail::Fingerprintable(),
+ name_(std::move(name)),
+ type_(std::move(type)),
+ nullable_(nullable),
+ metadata_(std::move(metadata)) {}
+
+ ~Field() override;
+
+ /// \brief Return the field's attached metadata
+ std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; }
+
+ /// \brief Return whether the field has non-empty metadata
+ bool HasMetadata() const;
+
+ /// \brief Return a copy of this field with the given metadata attached to it
+ std::shared_ptr<Field> WithMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& metadata) const;
+
+ /// \brief EXPERIMENTAL: Return a copy of this field with the given metadata
+ /// merged with existing metadata (any colliding keys will be overridden by
+ /// the passed metadata)
+ std::shared_ptr<Field> WithMergedMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& metadata) const;
+
+ /// \brief Return a copy of this field without any metadata attached to it
+ std::shared_ptr<Field> RemoveMetadata() const;
+
+ /// \brief Return a copy of this field with the replaced type.
+ std::shared_ptr<Field> WithType(const std::shared_ptr<DataType>& type) const;
+
+ /// \brief Return a copy of this field with the replaced name.
+ std::shared_ptr<Field> WithName(const std::string& name) const;
+
+ /// \brief Return a copy of this field with the replaced nullability.
+ std::shared_ptr<Field> WithNullable(bool nullable) const;
+
+ /// \brief Options that control the behavior of `MergeWith`.
+  /// More options may be added later to allow type conversions, including
+  /// integer widening, promotion from integer to float, or conversion to or
+  /// from boolean.
+ struct MergeOptions {
+    /// If true, a Field of NullType can be unified with a Field of another type.
+    /// The unified field will be of the other type and become nullable.
+    /// Nullability will be promoted to the looser option (nullable if either
+    /// field is nullable).
+ bool promote_nullability = true;
+
+ static MergeOptions Defaults() { return MergeOptions(); }
+ };
+
+ /// \brief Merge the current field with a field of the same name.
+ ///
+  /// The two fields must be compatible, i.e.:
+  /// - have the same name
+  /// - have the same type, or compatible types according to `options`.
+ ///
+ /// The metadata of the current field is preserved; the metadata of the other
+ /// field is discarded.
+ Result<std::shared_ptr<Field>> MergeWith(
+ const Field& other, MergeOptions options = MergeOptions::Defaults()) const;
+ Result<std::shared_ptr<Field>> MergeWith(
+ const std::shared_ptr<Field>& other,
+ MergeOptions options = MergeOptions::Defaults()) const;
+
+ std::vector<std::shared_ptr<Field>> Flatten() const;
+
+  /// \brief Indicate whether fields are equal.
+ ///
+ /// \param[in] other field to check equality with.
+ /// \param[in] check_metadata controls if it should check for metadata
+ /// equality.
+ ///
+ /// \return true if fields are equal, false otherwise.
+ bool Equals(const Field& other, bool check_metadata = false) const;
+ bool Equals(const std::shared_ptr<Field>& other, bool check_metadata = false) const;
+
+  /// \brief Indicate whether fields are compatible.
+ ///
+ /// See the criteria of MergeWith.
+ ///
+ /// \return true if fields are compatible, false otherwise.
+ bool IsCompatibleWith(const Field& other) const;
+ bool IsCompatibleWith(const std::shared_ptr<Field>& other) const;
+
+  /// \brief Return a string representation of the field
+ /// \param[in] show_metadata when true, if KeyValueMetadata is non-empty,
+ /// print keys and values in the output
+ std::string ToString(bool show_metadata = false) const;
+
+ /// \brief Return the field name
+ const std::string& name() const { return name_; }
+ /// \brief Return the field data type
+ const std::shared_ptr<DataType>& type() const { return type_; }
+ /// \brief Return whether the field is nullable
+ bool nullable() const { return nullable_; }
+
+ std::shared_ptr<Field> Copy() const;
+
+ private:
+ std::string ComputeFingerprint() const override;
+ std::string ComputeMetadataFingerprint() const override;
+
+ // Field name
+ std::string name_;
+
+ // The field's data type
+ std::shared_ptr<DataType> type_;
+
+ // Fields can be nullable
+ bool nullable_;
+
+ // The field's metadata, if any
+ std::shared_ptr<const KeyValueMetadata> metadata_;
+
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Field);
+};
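+
+// Usage sketch (editor's illustration; field names are hypothetical):
+//
+//   auto f = field("ints", int32(), /*nullable=*/false);
+//   auto g = f->WithNullable(true);  // copy with relaxed nullability
+//   // Same name and type, so MergeWith succeeds; with the default
+//   // promote_nullability, the merged field is the nullable variant.
+//   Result<std::shared_ptr<Field>> merged = f->MergeWith(*g);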
+
+namespace detail {
+
+template <typename DERIVED, typename BASE, Type::type TYPE_ID, typename C_TYPE>
+class ARROW_EXPORT CTypeImpl : public BASE {
+ public:
+ static constexpr Type::type type_id = TYPE_ID;
+ using c_type = C_TYPE;
+ using PhysicalType = DERIVED;
+
+ CTypeImpl() : BASE(TYPE_ID) {}
+
+ int bit_width() const override { return static_cast<int>(sizeof(C_TYPE) * CHAR_BIT); }
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout(
+ {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(C_TYPE))});
+ }
+
+ std::string name() const override { return DERIVED::type_name(); }
+
+ std::string ToString() const override { return this->name(); }
+};
+
+template <typename DERIVED, typename BASE, Type::type TYPE_ID, typename C_TYPE>
+constexpr Type::type CTypeImpl<DERIVED, BASE, TYPE_ID, C_TYPE>::type_id;
+
+template <typename DERIVED, Type::type TYPE_ID, typename C_TYPE>
+class IntegerTypeImpl : public detail::CTypeImpl<DERIVED, IntegerType, TYPE_ID, C_TYPE> {
+ bool is_signed() const override { return std::is_signed<C_TYPE>::value; }
+};
+
+} // namespace detail
+
+/// Concrete type class for always-null data
+class ARROW_EXPORT NullType : public DataType {
+ public:
+ static constexpr Type::type type_id = Type::NA;
+
+ static constexpr const char* type_name() { return "null"; }
+
+ NullType() : DataType(Type::NA) {}
+
+ std::string ToString() const override;
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout({DataTypeLayout::AlwaysNull()});
+ }
+
+ std::string name() const override { return "null"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for boolean data
+class ARROW_EXPORT BooleanType
+ : public detail::CTypeImpl<BooleanType, PrimitiveCType, Type::BOOL, bool> {
+ public:
+ static constexpr const char* type_name() { return "bool"; }
+
+  // BooleanType within Arrow uses a single bit instead of the C 8-bit layout.
+ int bit_width() const final { return 1; }
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout({DataTypeLayout::Bitmap(), DataTypeLayout::Bitmap()});
+ }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for unsigned 8-bit integer data
+class ARROW_EXPORT UInt8Type
+ : public detail::IntegerTypeImpl<UInt8Type, Type::UINT8, uint8_t> {
+ public:
+ static constexpr const char* type_name() { return "uint8"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for signed 8-bit integer data
+class ARROW_EXPORT Int8Type
+ : public detail::IntegerTypeImpl<Int8Type, Type::INT8, int8_t> {
+ public:
+ static constexpr const char* type_name() { return "int8"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for unsigned 16-bit integer data
+class ARROW_EXPORT UInt16Type
+ : public detail::IntegerTypeImpl<UInt16Type, Type::UINT16, uint16_t> {
+ public:
+ static constexpr const char* type_name() { return "uint16"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for signed 16-bit integer data
+class ARROW_EXPORT Int16Type
+ : public detail::IntegerTypeImpl<Int16Type, Type::INT16, int16_t> {
+ public:
+ static constexpr const char* type_name() { return "int16"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for unsigned 32-bit integer data
+class ARROW_EXPORT UInt32Type
+ : public detail::IntegerTypeImpl<UInt32Type, Type::UINT32, uint32_t> {
+ public:
+ static constexpr const char* type_name() { return "uint32"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for signed 32-bit integer data
+class ARROW_EXPORT Int32Type
+ : public detail::IntegerTypeImpl<Int32Type, Type::INT32, int32_t> {
+ public:
+ static constexpr const char* type_name() { return "int32"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for unsigned 64-bit integer data
+class ARROW_EXPORT UInt64Type
+ : public detail::IntegerTypeImpl<UInt64Type, Type::UINT64, uint64_t> {
+ public:
+ static constexpr const char* type_name() { return "uint64"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for signed 64-bit integer data
+class ARROW_EXPORT Int64Type
+ : public detail::IntegerTypeImpl<Int64Type, Type::INT64, int64_t> {
+ public:
+ static constexpr const char* type_name() { return "int64"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for 16-bit floating-point data
+class ARROW_EXPORT HalfFloatType
+ : public detail::CTypeImpl<HalfFloatType, FloatingPointType, Type::HALF_FLOAT,
+ uint16_t> {
+ public:
+ Precision precision() const override;
+ static constexpr const char* type_name() { return "halffloat"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for 32-bit floating-point data (C "float")
+class ARROW_EXPORT FloatType
+ : public detail::CTypeImpl<FloatType, FloatingPointType, Type::FLOAT, float> {
+ public:
+ Precision precision() const override;
+ static constexpr const char* type_name() { return "float"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for 64-bit floating-point data (C "double")
+class ARROW_EXPORT DoubleType
+ : public detail::CTypeImpl<DoubleType, FloatingPointType, Type::DOUBLE, double> {
+ public:
+ Precision precision() const override;
+ static constexpr const char* type_name() { return "double"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// \brief Base class for all variable-size list data types
+class ARROW_EXPORT BaseListType : public NestedType {
+ public:
+ using NestedType::NestedType;
+ std::shared_ptr<Field> value_field() const { return children_[0]; }
+
+ std::shared_ptr<DataType> value_type() const { return children_[0]->type(); }
+};
+
+/// \brief Concrete type class for list data
+///
+/// List data is nested data where each value is a variable number of
+/// child items. Lists can be recursively nested, for example
+/// list(list(int32)).
+class ARROW_EXPORT ListType : public BaseListType {
+ public:
+ static constexpr Type::type type_id = Type::LIST;
+ using offset_type = int32_t;
+
+ static constexpr const char* type_name() { return "list"; }
+
+ // List can contain any other logical value type
+ explicit ListType(const std::shared_ptr<DataType>& value_type)
+ : ListType(std::make_shared<Field>("item", value_type)) {}
+
+ explicit ListType(const std::shared_ptr<Field>& value_field) : BaseListType(type_id) {
+ children_ = {value_field};
+ }
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout(
+ {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(offset_type))});
+ }
+
+ std::string ToString() const override;
+
+ std::string name() const override { return "list"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// \brief Concrete type class for large list data
+///
+/// LargeListType is like ListType but with 64-bit rather than 32-bit offsets.
+class ARROW_EXPORT LargeListType : public BaseListType {
+ public:
+ static constexpr Type::type type_id = Type::LARGE_LIST;
+ using offset_type = int64_t;
+
+ static constexpr const char* type_name() { return "large_list"; }
+
+ // List can contain any other logical value type
+ explicit LargeListType(const std::shared_ptr<DataType>& value_type)
+ : LargeListType(std::make_shared<Field>("item", value_type)) {}
+
+ explicit LargeListType(const std::shared_ptr<Field>& value_field)
+ : BaseListType(type_id) {
+ children_ = {value_field};
+ }
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout(
+ {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(offset_type))});
+ }
+
+ std::string ToString() const override;
+
+ std::string name() const override { return "large_list"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// \brief Concrete type class for map data
+///
+/// Map data is nested data where each value is a variable number of
+/// key-item pairs. Its physical representation is the same as
+/// a list of `{key, item}` structs.
+///
+/// Maps can be recursively nested, for example map(utf8, map(utf8, int32)).
+class ARROW_EXPORT MapType : public ListType {
+ public:
+ static constexpr Type::type type_id = Type::MAP;
+
+ static constexpr const char* type_name() { return "map"; }
+
+ MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<DataType> item_type,
+ bool keys_sorted = false);
+
+ MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<Field> item_field,
+ bool keys_sorted = false);
+
+ MapType(std::shared_ptr<Field> key_field, std::shared_ptr<Field> item_field,
+ bool keys_sorted = false);
+
+ explicit MapType(std::shared_ptr<Field> value_field, bool keys_sorted = false);
+
+ // Validating constructor
+ static Result<std::shared_ptr<DataType>> Make(std::shared_ptr<Field> value_field,
+ bool keys_sorted = false);
+
+ std::shared_ptr<Field> key_field() const { return value_type()->field(0); }
+ std::shared_ptr<DataType> key_type() const { return key_field()->type(); }
+
+ std::shared_ptr<Field> item_field() const { return value_type()->field(1); }
+ std::shared_ptr<DataType> item_type() const { return item_field()->type(); }
+
+ std::string ToString() const override;
+
+ std::string name() const override { return "map"; }
+
+ bool keys_sorted() const { return keys_sorted_; }
+
+ private:
+ std::string ComputeFingerprint() const override;
+
+ bool keys_sorted_;
+};
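+
+// For example, map(utf8(), int32()) is physically a list of
+// struct<key: utf8 not null, item: int32> entries; key_type() returns utf8
+// and item_type() returns int32.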
+
+/// \brief Concrete type class for fixed size list data
+class ARROW_EXPORT FixedSizeListType : public BaseListType {
+ public:
+ static constexpr Type::type type_id = Type::FIXED_SIZE_LIST;
+ using offset_type = int32_t;
+
+ static constexpr const char* type_name() { return "fixed_size_list"; }
+
+ // List can contain any other logical value type
+ FixedSizeListType(const std::shared_ptr<DataType>& value_type, int32_t list_size)
+ : FixedSizeListType(std::make_shared<Field>("item", value_type), list_size) {}
+
+ FixedSizeListType(const std::shared_ptr<Field>& value_field, int32_t list_size)
+ : BaseListType(type_id), list_size_(list_size) {
+ children_ = {value_field};
+ }
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout({DataTypeLayout::Bitmap()});
+ }
+
+ std::string ToString() const override;
+
+ std::string name() const override { return "fixed_size_list"; }
+
+ int32_t list_size() const { return list_size_; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+
+ int32_t list_size_;
+};
+
+/// \brief Base class for all variable-size binary data types
+class ARROW_EXPORT BaseBinaryType : public DataType {
+ public:
+ using DataType::DataType;
+};
+
+constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
+
+/// \brief Concrete type class for variable-size binary data
+class ARROW_EXPORT BinaryType : public BaseBinaryType {
+ public:
+ static constexpr Type::type type_id = Type::BINARY;
+ static constexpr bool is_utf8 = false;
+ using offset_type = int32_t;
+ using PhysicalType = BinaryType;
+
+ static constexpr const char* type_name() { return "binary"; }
+
+ BinaryType() : BinaryType(Type::BINARY) {}
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout({DataTypeLayout::Bitmap(),
+ DataTypeLayout::FixedWidth(sizeof(offset_type)),
+ DataTypeLayout::VariableWidth()});
+ }
+
+ std::string ToString() const override;
+ std::string name() const override { return "binary"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+
+ // Allow subclasses like StringType to change the logical type.
+ explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
+};
+
+/// \brief Concrete type class for large variable-size binary data
+class ARROW_EXPORT LargeBinaryType : public BaseBinaryType {
+ public:
+ static constexpr Type::type type_id = Type::LARGE_BINARY;
+ static constexpr bool is_utf8 = false;
+ using offset_type = int64_t;
+ using PhysicalType = LargeBinaryType;
+
+ static constexpr const char* type_name() { return "large_binary"; }
+
+ LargeBinaryType() : LargeBinaryType(Type::LARGE_BINARY) {}
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout({DataTypeLayout::Bitmap(),
+ DataTypeLayout::FixedWidth(sizeof(offset_type)),
+ DataTypeLayout::VariableWidth()});
+ }
+
+ std::string ToString() const override;
+ std::string name() const override { return "large_binary"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+
+ // Allow subclasses like LargeStringType to change the logical type.
+ explicit LargeBinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
+};
+
+/// \brief Concrete type class for variable-size string data, utf8-encoded
+class ARROW_EXPORT StringType : public BinaryType {
+ public:
+ static constexpr Type::type type_id = Type::STRING;
+ static constexpr bool is_utf8 = true;
+ using PhysicalType = BinaryType;
+
+ static constexpr const char* type_name() { return "utf8"; }
+
+ StringType() : BinaryType(Type::STRING) {}
+
+ std::string ToString() const override;
+ std::string name() const override { return "utf8"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// \brief Concrete type class for large variable-size string data, utf8-encoded
+class ARROW_EXPORT LargeStringType : public LargeBinaryType {
+ public:
+ static constexpr Type::type type_id = Type::LARGE_STRING;
+ static constexpr bool is_utf8 = true;
+ using PhysicalType = LargeBinaryType;
+
+ static constexpr const char* type_name() { return "large_utf8"; }
+
+ LargeStringType() : LargeBinaryType(Type::LARGE_STRING) {}
+
+ std::string ToString() const override;
+ std::string name() const override { return "large_utf8"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// \brief Concrete type class for fixed-size binary data
+class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public ParametricType {
+ public:
+ static constexpr Type::type type_id = Type::FIXED_SIZE_BINARY;
+ static constexpr bool is_utf8 = false;
+
+ static constexpr const char* type_name() { return "fixed_size_binary"; }
+
+ explicit FixedSizeBinaryType(int32_t byte_width)
+ : FixedWidthType(Type::FIXED_SIZE_BINARY), byte_width_(byte_width) {}
+ explicit FixedSizeBinaryType(int32_t byte_width, Type::type override_type_id)
+ : FixedWidthType(override_type_id), byte_width_(byte_width) {}
+
+ std::string ToString() const override;
+ std::string name() const override { return "fixed_size_binary"; }
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout(
+ {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(byte_width())});
+ }
+
+ int32_t byte_width() const { return byte_width_; }
+ int bit_width() const override;
+
+ // Validating constructor
+ static Result<std::shared_ptr<DataType>> Make(int32_t byte_width);
+
+ protected:
+ std::string ComputeFingerprint() const override;
+
+ int32_t byte_width_;
+};
+
+/// \brief Concrete type class for struct data
+class ARROW_EXPORT StructType : public NestedType {
+ public:
+ static constexpr Type::type type_id = Type::STRUCT;
+
+ static constexpr const char* type_name() { return "struct"; }
+
+ explicit StructType(const std::vector<std::shared_ptr<Field>>& fields);
+
+ ~StructType() override;
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout({DataTypeLayout::Bitmap()});
+ }
+
+ std::string ToString() const override;
+ std::string name() const override { return "struct"; }
+
+ /// Returns null if name not found
+ std::shared_ptr<Field> GetFieldByName(const std::string& name) const;
+
+ /// Return all fields having this name
+ std::vector<std::shared_ptr<Field>> GetAllFieldsByName(const std::string& name) const;
+
+ /// Returns -1 if name not found or if there are multiple fields having the
+ /// same name
+ int GetFieldIndex(const std::string& name) const;
+
+ /// \brief Return the indices of all fields having this name in sorted order
+ std::vector<int> GetAllFieldIndices(const std::string& name) const;
+
+ private:
+ std::string ComputeFingerprint() const override;
+
+ class Impl;
+ std::unique_ptr<Impl> impl_;
+};
+
+/// \brief Base type class for (fixed-size) decimal data
+class ARROW_EXPORT DecimalType : public FixedSizeBinaryType {
+ public:
+ explicit DecimalType(Type::type type_id, int32_t byte_width, int32_t precision,
+ int32_t scale)
+ : FixedSizeBinaryType(byte_width, type_id), precision_(precision), scale_(scale) {}
+
+ /// Constructs concrete decimal types
+ static Result<std::shared_ptr<DataType>> Make(Type::type type_id, int32_t precision,
+ int32_t scale);
+
+ int32_t precision() const { return precision_; }
+ int32_t scale() const { return scale_; }
+
+ /// \brief Returns the number of bytes needed for precision.
+ ///
+ /// precision must be >= 1
+ static int32_t DecimalSize(int32_t precision);
+
+ protected:
+ std::string ComputeFingerprint() const override;
+
+ int32_t precision_;
+ int32_t scale_;
+};
+
+/// \brief Concrete type class for 128-bit decimal data
+///
+/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
+/// integer. The precision is the number of significant digits that the
+/// decimal type can represent; the scale is the number of digits after
+/// the decimal point (note the scale can be negative).
+///
+/// As an example, `Decimal128Type(7, 3)` can exactly represent the numbers
+/// 1234.567 and -1234.567 (encoded internally as the 128-bit integers
+/// 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567.
+///
+/// Decimal128Type has a maximum precision of 38 significant digits
+/// (also available as Decimal128Type::kMaxPrecision).
+/// If higher precision is needed, consider using Decimal256Type.
+class ARROW_EXPORT Decimal128Type : public DecimalType {
+ public:
+ static constexpr Type::type type_id = Type::DECIMAL128;
+
+ static constexpr const char* type_name() { return "decimal128"; }
+
+ /// Decimal128Type constructor that aborts on invalid input.
+ explicit Decimal128Type(int32_t precision, int32_t scale);
+
+ /// Decimal128Type constructor that returns an error on invalid input.
+ static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
+
+ std::string ToString() const override;
+ std::string name() const override { return "decimal128"; }
+
+ static constexpr int32_t kMinPrecision = 1;
+ static constexpr int32_t kMaxPrecision = 38;
+ static constexpr int32_t kByteWidth = 16;
+};
+
+/// \brief Concrete type class for 256-bit decimal data
+///
+/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
+/// integer. The precision is the number of significant digits that the
+/// decimal type can represent; the scale is the number of digits after
+/// the decimal point (note the scale can be negative).
+///
+/// Decimal256Type has a maximum precision of 76 significant digits
+/// (also available as Decimal256Type::kMaxPrecision).
+///
+/// For most use cases, the maximum precision offered by Decimal128Type
+/// is sufficient, and it will result in a more compact and more efficient
+/// encoding.
+class ARROW_EXPORT Decimal256Type : public DecimalType {
+ public:
+ static constexpr Type::type type_id = Type::DECIMAL256;
+
+ static constexpr const char* type_name() { return "decimal256"; }
+
+ /// Decimal256Type constructor that aborts on invalid input.
+ explicit Decimal256Type(int32_t precision, int32_t scale);
+
+ /// Decimal256Type constructor that returns an error on invalid input.
+ static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
+
+ std::string ToString() const override;
+ std::string name() const override { return "decimal256"; }
+
+ static constexpr int32_t kMinPrecision = 1;
+ static constexpr int32_t kMaxPrecision = 76;
+ static constexpr int32_t kByteWidth = 32;
+};
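+
+// The decimal() factory dispatches on precision: decimal(38, 10) fits within
+// Decimal128Type::kMaxPrecision and produces a Decimal128Type, while
+// decimal(39, 10) requires a Decimal256Type.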
+
+/// \brief Base type class for union data
+class ARROW_EXPORT UnionType : public NestedType {
+ public:
+ static constexpr int8_t kMaxTypeCode = 127;
+ static constexpr int kInvalidChildId = -1;
+
+ static Result<std::shared_ptr<DataType>> Make(
+ const std::vector<std::shared_ptr<Field>>& fields,
+ const std::vector<int8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE) {
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(fields, type_codes);
+ } else {
+ return dense_union(fields, type_codes);
+ }
+ }
+
+ DataTypeLayout layout() const override;
+
+ std::string ToString() const override;
+
+ /// The array of logical type ids.
+ ///
+ /// For example, the first type in the union might be denoted by the id 5
+ /// (instead of 0).
+ const std::vector<int8_t>& type_codes() const { return type_codes_; }
+
+ /// An array mapping logical type ids to physical child ids.
+ const std::vector<int>& child_ids() const { return child_ids_; }
+
+ uint8_t max_type_code() const;
+
+ UnionMode::type mode() const;
+
+ protected:
+ UnionType(std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes,
+ Type::type id);
+
+ static Status ValidateParameters(const std::vector<std::shared_ptr<Field>>& fields,
+ const std::vector<int8_t>& type_codes,
+ UnionMode::type mode);
+
+ private:
+ std::string ComputeFingerprint() const override;
+
+ std::vector<int8_t> type_codes_;
+ std::vector<int> child_ids_;
+};
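+
+// Worked example: a union declared with type_codes {5, 9} addresses its first
+// child by logical id 5 and its second by 9; child_ids()[5] == 0 and
+// child_ids()[9] == 1, with all other entries set to kInvalidChildId.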
+
+/// \brief Concrete type class for sparse union data
+///
+/// A sparse union is a nested type where each logical value is taken from
+/// a single child. A buffer of 8-bit type ids indicates which child
+/// a given logical value is to be taken from.
+///
+/// In a sparse union, each child array should have the same length as the
+/// union array, regardless of the actual number of union values that
+/// refer to it.
+///
+/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
+class ARROW_EXPORT SparseUnionType : public UnionType {
+ public:
+ static constexpr Type::type type_id = Type::SPARSE_UNION;
+
+ static constexpr const char* type_name() { return "sparse_union"; }
+
+ SparseUnionType(std::vector<std::shared_ptr<Field>> fields,
+ std::vector<int8_t> type_codes);
+
+ // A constructor variant that validates input parameters
+ static Result<std::shared_ptr<DataType>> Make(
+ std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes);
+
+ std::string name() const override { return "sparse_union"; }
+};
+
+/// \brief Concrete type class for dense union data
+///
+/// A dense union is a nested type where each logical value is taken from
+/// a single child, at a specific offset. A buffer of 8-bit type ids
+/// indicates which child a given logical value is to be taken from,
+/// and a buffer of 32-bit offsets indicates at which physical position
+/// in the given child array the logical value is to be taken from.
+///
+/// Unlike a sparse union, a dense union allows encoding only the child array
+/// values which are actually referred to by the union array. This is
+/// counterbalanced by the additional footprint of the offsets buffer, and
+/// the additional indirection cost when looking up values.
+///
+/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
+class ARROW_EXPORT DenseUnionType : public UnionType {
+ public:
+ static constexpr Type::type type_id = Type::DENSE_UNION;
+
+ static constexpr const char* type_name() { return "dense_union"; }
+
+ DenseUnionType(std::vector<std::shared_ptr<Field>> fields,
+ std::vector<int8_t> type_codes);
+
+ // A constructor variant that validates input parameters
+ static Result<std::shared_ptr<DataType>> Make(
+ std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes);
+
+ std::string name() const override { return "dense_union"; }
+};
+
+// ----------------------------------------------------------------------
+// Date and time types
+
+/// \brief Base type for all date and time types
+class ARROW_EXPORT TemporalType : public FixedWidthType {
+ public:
+ using FixedWidthType::FixedWidthType;
+
+ DataTypeLayout layout() const override {
+ return DataTypeLayout(
+ {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(bit_width() / 8)});
+ }
+};
+
+/// \brief Base type class for date data
+class ARROW_EXPORT DateType : public TemporalType {
+ public:
+ virtual DateUnit unit() const = 0;
+
+ protected:
+ explicit DateType(Type::type type_id);
+};
+
+/// Concrete type class for 32-bit date data (as number of days since UNIX epoch)
+class ARROW_EXPORT Date32Type : public DateType {
+ public:
+ static constexpr Type::type type_id = Type::DATE32;
+ static constexpr DateUnit UNIT = DateUnit::DAY;
+ using c_type = int32_t;
+ using PhysicalType = Int32Type;
+
+ static constexpr const char* type_name() { return "date32"; }
+
+ Date32Type();
+
+ int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+ std::string ToString() const override;
+
+ std::string name() const override { return "date32"; }
+ DateUnit unit() const override { return UNIT; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for 64-bit date data (as number of milliseconds since UNIX epoch)
+class ARROW_EXPORT Date64Type : public DateType {
+ public:
+ static constexpr Type::type type_id = Type::DATE64;
+ static constexpr DateUnit UNIT = DateUnit::MILLI;
+ using c_type = int64_t;
+ using PhysicalType = Int64Type;
+
+ static constexpr const char* type_name() { return "date64"; }
+
+ Date64Type();
+
+ int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+ std::string ToString() const override;
+
+ std::string name() const override { return "date64"; }
+ DateUnit unit() const override { return UNIT; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
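+
+// Example (illustrative sketch): both date types encode a calendar day relative
+// to the UNIX epoch, so a date32 value of d days corresponds to a date64 value
+// of d * 86400000 milliseconds:
+//
+//   int32_t as_date32 = 18993;                          // 2022-01-01
+//   int64_t as_date64 = int64_t{as_date32} * 86400000;  // the same day, in ms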
+
+ARROW_EXPORT
+std::ostream& operator<<(std::ostream& os, TimeUnit::type unit);
+
+/// Base type class for time data
+class ARROW_EXPORT TimeType : public TemporalType, public ParametricType {
+ public:
+ TimeUnit::type unit() const { return unit_; }
+
+ protected:
+ TimeType(Type::type type_id, TimeUnit::type unit);
+ std::string ComputeFingerprint() const override;
+
+ TimeUnit::type unit_;
+};
+
+/// Concrete type class for 32-bit time data (as number of seconds or milliseconds
+/// since midnight)
+class ARROW_EXPORT Time32Type : public TimeType {
+ public:
+ static constexpr Type::type type_id = Type::TIME32;
+ using c_type = int32_t;
+ using PhysicalType = Int32Type;
+
+ static constexpr const char* type_name() { return "time32"; }
+
+ int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+ explicit Time32Type(TimeUnit::type unit = TimeUnit::MILLI);
+
+ std::string ToString() const override;
+
+ std::string name() const override { return "time32"; }
+};
+
+/// Concrete type class for 64-bit time data (as number of microseconds or nanoseconds
+/// since midnight)
+class ARROW_EXPORT Time64Type : public TimeType {
+ public:
+ static constexpr Type::type type_id = Type::TIME64;
+ using c_type = int64_t;
+ using PhysicalType = Int64Type;
+
+ static constexpr const char* type_name() { return "time64"; }
+
+ int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+ explicit Time64Type(TimeUnit::type unit = TimeUnit::NANO);
+
+ std::string ToString() const override;
+
+ std::string name() const override { return "time64"; }
+};
+
+/// \brief Concrete type class for datetime data (as number of seconds, milliseconds,
+/// microseconds or nanoseconds since UNIX epoch)
+///
+/// If supplied, the timezone string should take either the form (i) "Area/Location",
+/// with values drawn from the names in the IANA Time Zone Database (such as
+/// "Europe/Zurich"); or (ii) "(+|-)HH:MM" indicating an absolute offset from GMT
+/// (such as "-08:00"). To indicate a native UTC timestamp, one of the strings "UTC",
+/// "Etc/UTC" or "+00:00" should be used.
+///
+/// If any non-empty string is supplied as the timezone for a TimestampType, then the
+/// Arrow field containing that timestamp type (and by extension the column associated
+/// with such a field) is considered "timezone-aware". The integer arrays that comprise
+/// a timezone-aware column must contain UTC normalized datetime values, regardless of
+/// the contents of their timezone string. More precisely, (i) the producer of a
+/// timezone-aware column must populate its constituent arrays with valid UTC values
+/// (performing offset conversions from non-UTC values if necessary); and (ii) the
+/// consumer of a timezone-aware column may assume that the column's values are directly
+/// comparable (that is, with no offset adjustment required) to the values of any other
+/// timezone-aware column or to any other valid UTC datetime value (provided all values
+/// are expressed in the same units).
+///
+/// If a TimestampType is constructed without a timezone (or, equivalently, if the
+/// timezone supplied is an empty string) then the resulting Arrow field (column) is
+/// considered "timezone-naive". The producer of a timezone-naive column may populate
+/// its constituent integer arrays with datetime values from any timezone; the consumer
+/// of a timezone-naive column should make no assumptions about the interoperability or
+/// comparability of the values of such a column with those of any other timestamp
+/// column or datetime value.
+///
+/// If a timezone-aware field contains a recognized timezone, its values may be
+/// localized to that timezone upon display; the values of timezone-naive fields
+/// must always be displayed "as is", with no localization performed on them.
+class ARROW_EXPORT TimestampType : public TemporalType, public ParametricType {
+ public:
+ using Unit = TimeUnit;
+
+ static constexpr Type::type type_id = Type::TIMESTAMP;
+ using c_type = int64_t;
+ using PhysicalType = Int64Type;
+
+ static constexpr const char* type_name() { return "timestamp"; }
+
+ int bit_width() const override { return static_cast<int>(sizeof(int64_t) * CHAR_BIT); }
+
+ explicit TimestampType(TimeUnit::type unit = TimeUnit::MILLI)
+ : TemporalType(Type::TIMESTAMP), unit_(unit) {}
+
+ explicit TimestampType(TimeUnit::type unit, const std::string& timezone)
+ : TemporalType(Type::TIMESTAMP), unit_(unit), timezone_(timezone) {}
+
+ std::string ToString() const override;
+ std::string name() const override { return "timestamp"; }
+
+ TimeUnit::type unit() const { return unit_; }
+ const std::string& timezone() const { return timezone_; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+
+ private:
+ TimeUnit::type unit_;
+ std::string timezone_;
+};
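+
+// Example (illustrative sketch): the `timestamp` factories declared in type_fwd.h
+// produce timezone-aware and timezone-naive types. Per the contract above, an
+// aware column stores UTC-normalized values, e.g. 2022-01-01T00:00:00+01:00
+// (Zurich, winter) is stored as the int64 value 1640991600000 ms:
+//
+//   auto aware = timestamp(TimeUnit::MILLI, "Europe/Zurich");  // timezone-aware
+//   auto naive = timestamp(TimeUnit::MILLI);                   // timezone-naive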
+
+/// \brief Base class for the different kinds of calendar intervals.
+class ARROW_EXPORT IntervalType : public TemporalType, public ParametricType {
+ public:
+ enum type { MONTHS, DAY_TIME };
+
+ virtual type interval_type() const = 0;
+
+ protected:
+ explicit IntervalType(Type::type subtype) : TemporalType(subtype) {}
+ std::string ComputeFingerprint() const override;
+};
+
+/// \brief Represents a number of months.
+///
+/// Corresponds to the YearMonth interval type in Schema.fbs (a year is
+/// defined as 12 months).
+class ARROW_EXPORT MonthIntervalType : public IntervalType {
+ public:
+ static constexpr Type::type type_id = Type::INTERVAL_MONTHS;
+ using c_type = int32_t;
+ using PhysicalType = Int32Type;
+
+ static constexpr const char* type_name() { return "month_interval"; }
+
+ IntervalType::type interval_type() const override { return IntervalType::MONTHS; }
+
+ int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+ MonthIntervalType() : IntervalType(type_id) {}
+
+ std::string ToString() const override { return name(); }
+ std::string name() const override { return "month_interval"; }
+};
+
+/// \brief Represents a number of days and milliseconds (fraction of day).
+class ARROW_EXPORT DayTimeIntervalType : public IntervalType {
+ public:
+ struct DayMilliseconds {
+ int32_t days;
+ int32_t milliseconds;
+ bool operator==(DayMilliseconds other) const {
+ return this->days == other.days && this->milliseconds == other.milliseconds;
+ }
+ bool operator!=(DayMilliseconds other) const { return !(*this == other); }
+ bool operator<(DayMilliseconds other) const {
+ // Lexicographic: compare days first, then milliseconds on ties.
+ return this->days < other.days ||
+ (this->days == other.days && this->milliseconds < other.milliseconds);
+ }
+ };
+ using c_type = DayMilliseconds;
+ using PhysicalType = DayTimeIntervalType;
+
+ static_assert(sizeof(DayMilliseconds) == 8,
+ "DayMilliseconds struct assumed to be of size 8 bytes");
+ static constexpr Type::type type_id = Type::INTERVAL_DAY_TIME;
+
+ static constexpr const char* type_name() { return "day_time_interval"; }
+
+ IntervalType::type interval_type() const override { return IntervalType::DAY_TIME; }
+
+ DayTimeIntervalType() : IntervalType(type_id) {}
+
+ int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+ std::string ToString() const override { return name(); }
+ std::string name() const override { return "day_time_interval"; }
+};
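+
+// Example (illustrative sketch): an interval of one day and twelve hours as a
+// DayMilliseconds value (12 h = 43200000 ms):
+//
+//   DayTimeIntervalType::DayMilliseconds v{/*days=*/1, /*milliseconds=*/43200000};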
+
+/// \brief Represents an elapsed time without any relation to a calendar artifact.
+class ARROW_EXPORT DurationType : public TemporalType, public ParametricType {
+ public:
+ using Unit = TimeUnit;
+
+ static constexpr Type::type type_id = Type::DURATION;
+ using c_type = int64_t;
+ using PhysicalType = Int64Type;
+
+ static constexpr const char* type_name() { return "duration"; }
+
+ int bit_width() const override { return static_cast<int>(sizeof(int64_t) * CHAR_BIT); }
+
+ explicit DurationType(TimeUnit::type unit = TimeUnit::MILLI)
+ : TemporalType(Type::DURATION), unit_(unit) {}
+
+ std::string ToString() const override;
+ std::string name() const override { return "duration"; }
+
+ TimeUnit::type unit() const { return unit_; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+
+ private:
+ TimeUnit::type unit_;
+};
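+
+// Example (illustrative sketch): the unit is a property of the type, not of the
+// stored values; an elapsed time of 1.5 s in a MILLI duration column is simply
+// the int64 value 1500:
+//
+//   auto elapsed_ms = duration(TimeUnit::MILLI);  // factory from type_fwd.h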
+
+// ----------------------------------------------------------------------
+// Dictionary type (for representing categorical or dictionary-encoded
+// data in memory)
+
+/// \brief Dictionary-encoded value type with a data-dependent
+/// dictionary. Indices may be any integer type.
+class ARROW_EXPORT DictionaryType : public FixedWidthType {
+ public:
+ static constexpr Type::type type_id = Type::DICTIONARY;
+
+ static constexpr const char* type_name() { return "dictionary"; }
+
+ DictionaryType(const std::shared_ptr<DataType>& index_type,
+ const std::shared_ptr<DataType>& value_type, bool ordered = false);
+
+ // A constructor variant that validates its input parameters
+ static Result<std::shared_ptr<DataType>> Make(
+ const std::shared_ptr<DataType>& index_type,
+ const std::shared_ptr<DataType>& value_type, bool ordered = false);
+
+ std::string ToString() const override;
+ std::string name() const override { return "dictionary"; }
+
+ int bit_width() const override;
+
+ DataTypeLayout layout() const override;
+
+ const std::shared_ptr<DataType>& index_type() const { return index_type_; }
+ const std::shared_ptr<DataType>& value_type() const { return value_type_; }
+
+ bool ordered() const { return ordered_; }
+
+ protected:
+ static Status ValidateParameters(const DataType& index_type,
+ const DataType& value_type);
+
+ std::string ComputeFingerprint() const override;
+
+ // Must be an integer type (not currently checked)
+ std::shared_ptr<DataType> index_type_;
+ std::shared_ptr<DataType> value_type_;
+ bool ordered_;
+};
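+
+// Example (illustrative sketch): a dictionary type mapping int8 indices to
+// string values, built through the validating factory declared above:
+//
+//   auto maybe_dict = DictionaryType::Make(int8(), utf8(), /*ordered=*/false);
+//   // maybe_dict is a Result<std::shared_ptr<DataType>>; it is expected to
+//   // fail if the index type is not an integer type.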
+
+// ----------------------------------------------------------------------
+// FieldRef
+
+/// \class FieldPath
+///
+/// Represents a path to a nested field using indices of child fields.
+/// For example, given indices {5, 9, 3} the field would be retrieved with
+/// schema->field(5)->type()->field(9)->type()->field(3)
+///
+/// Attempting to retrieve a child field using a FieldPath which is not valid for
+/// a given schema will raise an error. Invalid FieldPaths include:
+/// - an index is out of range
+/// - the path is empty (note: a default constructed FieldPath will be empty)
+///
+/// FieldPaths provide a number of accessors for drilling down to potentially nested
+/// children. They are overloaded for convenience to support Schema (returns a field),
+/// DataType (returns a child field), Field (returns a child field of this field's
+/// type), Array (returns a child array), and RecordBatch (returns a column).
+class ARROW_EXPORT FieldPath {
+ public:
+ FieldPath() = default;
+
+ FieldPath(std::vector<int> indices) // NOLINT runtime/explicit
+ : indices_(std::move(indices)) {}
+
+ FieldPath(std::initializer_list<int> indices) // NOLINT runtime/explicit
+ : indices_(std::move(indices)) {}
+
+ std::string ToString() const;
+
+ size_t hash() const;
+ struct Hash {
+ size_t operator()(const FieldPath& path) const { return path.hash(); }
+ };
+
+ bool empty() const { return indices_.empty(); }
+ bool operator==(const FieldPath& other) const { return indices() == other.indices(); }
+ bool operator!=(const FieldPath& other) const { return indices() != other.indices(); }
+
+ const std::vector<int>& indices() const { return indices_; }
+ int operator[](size_t i) const { return indices_[i]; }
+ std::vector<int>::const_iterator begin() const { return indices_.begin(); }
+ std::vector<int>::const_iterator end() const { return indices_.end(); }
+
+ /// \brief Retrieve the referenced child Field from a Schema, Field, or DataType
+ Result<std::shared_ptr<Field>> Get(const Schema& schema) const;
+ Result<std::shared_ptr<Field>> Get(const Field& field) const;
+ Result<std::shared_ptr<Field>> Get(const DataType& type) const;
+ Result<std::shared_ptr<Field>> Get(const FieldVector& fields) const;
+
+ /// \brief Retrieve the referenced column from a RecordBatch or Table
+ Result<std::shared_ptr<Array>> Get(const RecordBatch& batch) const;
+
+ /// \brief Retrieve the referenced child from an Array or ArrayData
+ Result<std::shared_ptr<Array>> Get(const Array& array) const;
+ Result<std::shared_ptr<ArrayData>> Get(const ArrayData& data) const;
+
+ private:
+ std::vector<int> indices_;
+};
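+
+// Example (illustrative sketch): drilling into a nested schema with a FieldPath.
+// The schema and names are hypothetical; `schema`, `struct_`, `field` and `int32`
+// are the factories declared in type_fwd.h:
+//
+//   auto s = schema({field("a", struct_({field("n", int32())}))});
+//   FieldPath path({0, 0});              // schema->field(0)->type()->field(0)
+//   auto n = path.Get(*s).ValueOrDie();  // the nested field "n"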
+
+/// \class FieldRef
+/// \brief Descriptor of a (potentially nested) field within a schema.
+///
+/// Unlike FieldPath (which exclusively uses indices of child fields), FieldRef may
+/// reference a field by name. It is intended to replace parameters like `int field_index`
+/// and `const std::string& field_name`; it can be implicitly constructed from either a
+/// field index or a name.
+///
+/// Nested fields can be referenced as well. Given
+/// schema({field("a", struct_({field("n", null())})), field("b", int32())})
+///
+/// the following all indicate the nested field named "n":
+/// FieldRef ref1(0, 0);
+/// FieldRef ref2("a", 0);
+/// FieldRef ref3("a", "n");
+/// FieldRef ref4(0, "n");
+/// ARROW_ASSIGN_OR_RAISE(FieldRef ref5,
+/// FieldRef::FromDotPath(".a[0]"));
+///
+/// FieldPaths matching a FieldRef are retrieved using the member function FindAll.
+/// Multiple matches are possible because field names may be duplicated within a schema.
+/// For example:
+/// Schema a_is_ambiguous({field("a", int32()), field("a", float32())});
+/// auto matches = FieldRef("a").FindAll(a_is_ambiguous);
+/// assert(matches.size() == 2);
+/// assert(matches[0].Get(a_is_ambiguous)->Equals(a_is_ambiguous.field(0)));
+/// assert(matches[1].Get(a_is_ambiguous)->Equals(a_is_ambiguous.field(1)));
+///
+/// Convenience accessors are available which raise a helpful error if the field is not
+/// found or ambiguous, and for immediately calling FieldPath::Get to retrieve any
+/// matching children:
+/// auto maybe_match = FieldRef("struct", "field_i32").FindOneOrNone(schema);
+/// auto maybe_column = FieldRef("struct", "field_i32").GetOne(some_table);
+class ARROW_EXPORT FieldRef {
+ public:
+ FieldRef() = default;
+
+ /// Construct a FieldRef from a FieldPath (a sequence of child indices). The
+ /// referenced field will be retrieved as:
+ /// schema.fields[self.indices[0]].type.fields[self.indices[1]] ...
+ ///
+ /// Empty indices are not valid.
+ FieldRef(FieldPath indices); // NOLINT runtime/explicit
+
+ /// Construct a by-name FieldRef. Multiple fields may match a by-name FieldRef:
+ /// [f for f in schema.fields where f.name == self.name]
+ FieldRef(std::string name) : impl_(std::move(name)) {} // NOLINT runtime/explicit
+ FieldRef(const char* name) : impl_(std::string(name)) {} // NOLINT runtime/explicit
+
+ /// Equivalent to a FieldPath containing a single index.
+ FieldRef(int index) : impl_(FieldPath({index})) {} // NOLINT runtime/explicit
+
+ /// Convenience constructor for nested FieldRefs: each argument will be used to
+ /// construct a FieldRef
+ template <typename A0, typename A1, typename... A>
+ FieldRef(A0&& a0, A1&& a1, A&&... a) {
+ Flatten({// cpplint thinks the following are constructor decls
+ FieldRef(std::forward<A0>(a0)), // NOLINT runtime/explicit
+ FieldRef(std::forward<A1>(a1)), // NOLINT runtime/explicit
+ FieldRef(std::forward<A>(a))...}); // NOLINT runtime/explicit
+ }
+
+ /// Parse a dot path into a FieldRef.
+ ///
+ /// dot_path = '.' name
+ /// | '[' digit+ ']'
+ /// | dot_path+
+ ///
+ /// Examples:
+ /// ".alpha" => FieldRef("alpha")
+ /// "[2]" => FieldRef(2)
+ /// ".beta[3]" => FieldRef("beta", 3)
+ /// "[5].gamma.delta[7]" => FieldRef(5, "gamma", "delta", 7)
+ /// ".hello world" => FieldRef("hello world")
+ /// R"(.\[y\]\\tho\.\)" => FieldRef(R"([y]\tho.\)")
+ ///
+ /// Note: When parsing a name, a '\' preceding any other character will be dropped from
+ /// the resulting name. Therefore if a name must contain the characters '.', '\', or '['
+ /// those must be escaped with a preceding '\'.
+ static Result<FieldRef> FromDotPath(const std::string& dot_path);
+
+ bool Equals(const FieldRef& other) const { return impl_ == other.impl_; }
+ bool operator==(const FieldRef& other) const { return Equals(other); }
+
+ std::string ToString() const;
+
+ size_t hash() const;
+ struct Hash {
+ size_t operator()(const FieldRef& ref) const { return ref.hash(); }
+ };
+
+ explicit operator bool() const { return Equals(FieldPath{}); }
+ bool operator!() const { return !Equals(FieldPath{}); }
+
+ bool IsFieldPath() const { return util::holds_alternative<FieldPath>(impl_); }
+ bool IsName() const { return util::holds_alternative<std::string>(impl_); }
+ bool IsNested() const {
+ if (IsName()) return false;
+ if (IsFieldPath()) return util::get<FieldPath>(impl_).indices().size() > 1;
+ return true;
+ }
+
+ const FieldPath* field_path() const {
+ return IsFieldPath() ? &util::get<FieldPath>(impl_) : NULLPTR;
+ }
+ const std::string* name() const {
+ return IsName() ? &util::get<std::string>(impl_) : NULLPTR;
+ }
+
+ /// \brief Retrieve FieldPath of every child field which matches this FieldRef.
+ std::vector<FieldPath> FindAll(const Schema& schema) const;
+ std::vector<FieldPath> FindAll(const Field& field) const;
+ std::vector<FieldPath> FindAll(const DataType& type) const;
+ std::vector<FieldPath> FindAll(const FieldVector& fields) const;
+
+ /// \brief Convenience function which applies FindAll to arg's type or schema.
+ std::vector<FieldPath> FindAll(const ArrayData& array) const;
+ std::vector<FieldPath> FindAll(const Array& array) const;
+ std::vector<FieldPath> FindAll(const RecordBatch& batch) const;
+
+ /// \brief Convenience function: raise an error if matches is empty.
+ template <typename T>
+ Status CheckNonEmpty(const std::vector<FieldPath>& matches, const T& root) const {
+ if (matches.empty()) {
+ return Status::Invalid("No match for ", ToString(), " in ", root.ToString());
+ }
+ return Status::OK();
+ }
+
+ /// \brief Convenience function: raise an error if matches contains multiple FieldPaths.
+ template <typename T>
+ Status CheckNonMultiple(const std::vector<FieldPath>& matches, const T& root) const {
+ if (matches.size() > 1) {
+ return Status::Invalid("Multiple matches for ", ToString(), " in ",
+ root.ToString());
+ }
+ return Status::OK();
+ }
+
+ /// \brief Retrieve FieldPath of a single child field which matches this
+ /// FieldRef. Emit an error if none or multiple match.
+ template <typename T>
+ Result<FieldPath> FindOne(const T& root) const {
+ auto matches = FindAll(root);
+ ARROW_RETURN_NOT_OK(CheckNonEmpty(matches, root));
+ ARROW_RETURN_NOT_OK(CheckNonMultiple(matches, root));
+ return std::move(matches[0]);
+ }
+
+ /// \brief Retrieve FieldPath of a single child field which matches this
+ /// FieldRef. Emit an error if multiple match. An empty (invalid) FieldPath
+ /// will be returned if none match.
+ template <typename T>
+ Result<FieldPath> FindOneOrNone(const T& root) const {
+ auto matches = FindAll(root);
+ ARROW_RETURN_NOT_OK(CheckNonMultiple(matches, root));
+ if (matches.empty()) {
+ return FieldPath();
+ }
+ return std::move(matches[0]);
+ }
+
+ template <typename T>
+ using GetType = decltype(std::declval<FieldPath>().Get(std::declval<T>()).ValueOrDie());
+
+ /// \brief Get all children matching this FieldRef.
+ template <typename T>
+ std::vector<GetType<T>> GetAll(const T& root) const {
+ std::vector<GetType<T>> out;
+ for (const auto& match : FindAll(root)) {
+ out.push_back(match.Get(root).ValueOrDie());
+ }
+ return out;
+ }
+
+ /// \brief Get the single child matching this FieldRef.
+ /// Emit an error if none or multiple match.
+ template <typename T>
+ Result<GetType<T>> GetOne(const T& root) const {
+ ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root));
+ return match.Get(root).ValueOrDie();
+ }
+
+ /// \brief Get the single child matching this FieldRef.
+ /// Return nullptr if none match, emit an error if multiple match.
+ template <typename T>
+ Result<GetType<T>> GetOneOrNone(const T& root) const {
+ ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root));
+ if (match.empty()) {
+ return static_cast<GetType<T>>(NULLPTR);
+ }
+ return match.Get(root).ValueOrDie();
+ }
+
+ private:
+ void Flatten(std::vector<FieldRef> children);
+
+ util::Variant<FieldPath, std::string, std::vector<FieldRef>> impl_;
+
+ ARROW_EXPORT friend void PrintTo(const FieldRef& ref, std::ostream* os);
+};
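+
+// Example (illustrative sketch): resolving a possibly-missing, possibly-nested
+// column by name; `some_batch` is a hypothetical RecordBatch. GetOneOrNone()
+// yields a null pointer when nothing matches and errors only on ambiguity:
+//
+//   ARROW_ASSIGN_OR_RAISE(auto column,
+//                         FieldRef("struct", "field_i32").GetOneOrNone(*some_batch));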
+
+// ----------------------------------------------------------------------
+// Schema
+
+enum class Endianness {
+ Little = 0,
+ Big = 1,
+#if ARROW_LITTLE_ENDIAN
+ Native = Little
+#else
+ Native = Big
+#endif
+};
+
+/// \class Schema
+/// \brief Sequence of arrow::Field objects describing the columns of a record
+/// batch or table data structure
+class ARROW_EXPORT Schema : public detail::Fingerprintable,
+ public util::EqualityComparable<Schema>,
+ public util::ToStringOstreamable<Schema> {
+ public:
+ explicit Schema(FieldVector fields, Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+ explicit Schema(FieldVector fields,
+ std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+ Schema(const Schema&);
+
+ ~Schema() override;
+
+ /// Returns true if all of the schema fields are equal
+ bool Equals(const Schema& other, bool check_metadata = false) const;
+ bool Equals(const std::shared_ptr<Schema>& other, bool check_metadata = false) const;
+
+ /// \brief Set endianness in the schema
+ ///
+ /// \return new Schema
+ std::shared_ptr<Schema> WithEndianness(Endianness endianness) const;
+
+ /// \brief Return endianness in the schema
+ Endianness endianness() const;
+
+ /// \brief Indicate if endianness is equal to platform-native endianness
+ bool is_native_endian() const;
+
+ /// \brief Return the number of fields (columns) in the schema
+ int num_fields() const;
+
+ /// Return the i-th schema element. Does not bounds-check.
+ const std::shared_ptr<Field>& field(int i) const;
+
+ const FieldVector& fields() const;
+
+ std::vector<std::string> field_names() const;
+
+ /// Returns null if name not found
+ std::shared_ptr<Field> GetFieldByName(const std::string& name) const;
+
+ /// \brief Return all fields having this name
+ FieldVector GetAllFieldsByName(const std::string& name) const;
+
+ /// Returns -1 if name not found
+ int GetFieldIndex(const std::string& name) const;
+
+ /// Return the indices of all fields having this name
+ std::vector<int> GetAllFieldIndices(const std::string& name) const;
+
+ /// Indicate if fields named `names` can be found unambiguously in the schema.
+ Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const;
+
+ /// \brief The custom key-value metadata, if any
+ ///
+ /// \return metadata may be null
+ const std::shared_ptr<const KeyValueMetadata>& metadata() const;
+
+ /// \brief Render a string representation of the schema suitable for debugging
+ /// \param[in] show_metadata when true, if KeyValueMetadata is non-empty,
+ /// print keys and values in the output
+ std::string ToString(bool show_metadata = false) const;
+
+ Result<std::shared_ptr<Schema>> AddField(int i,
+ const std::shared_ptr<Field>& field) const;
+ Result<std::shared_ptr<Schema>> RemoveField(int i) const;
+ Result<std::shared_ptr<Schema>> SetField(int i,
+ const std::shared_ptr<Field>& field) const;
+
+ /// \brief Replace key-value metadata with new metadata
+ ///
+ /// \param[in] metadata new KeyValueMetadata
+ /// \return new Schema
+ std::shared_ptr<Schema> WithMetadata(
+ const std::shared_ptr<const KeyValueMetadata>& metadata) const;
+
+ /// \brief Return copy of Schema without the KeyValueMetadata
+ std::shared_ptr<Schema> RemoveMetadata() const;
+
+ /// \brief Indicate whether the Schema has non-empty KeyValueMetadata
+ bool HasMetadata() const;
+
+ /// \brief Indicate whether the Schema has distinct field names.
+ bool HasDistinctFieldNames() const;
+
+ protected:
+ std::string ComputeFingerprint() const override;
+ std::string ComputeMetadataFingerprint() const override;
+
+ private:
+ ARROW_EXPORT friend void PrintTo(const Schema& s, std::ostream* os);
+
+ class Impl;
+ std::unique_ptr<Impl> impl_;
+};
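+
+// Example (illustrative sketch): by-name lookup on a Schema; the field names
+// are hypothetical:
+//
+//   auto s = schema({field("x", int32()), field("y", utf8())});
+//   int i = s->GetFieldIndex("y");           // 1
+//   auto missing = s->GetFieldByName("z");   // null: no such field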
+
+ARROW_EXPORT
+std::string EndiannessToString(Endianness endianness);
+
+// ----------------------------------------------------------------------
+
+/// \brief Convenience class to incrementally construct/merge schemas.
+///
+/// This class amortizes the cost of validating field name conflicts by
+/// maintaining the mapping. The caller also controls the conflict resolution
+/// scheme.
+class ARROW_EXPORT SchemaBuilder {
+ public:
+ // Indicate how field conflicts should be resolved when building a schema. A
+ // conflict arises when a field is added to the builder and one or more fields
+ // with the same name already exist.
+ enum ConflictPolicy {
+ // Ignore the conflict and append the field. This is the default behavior of the
+ // Schema constructor and the `arrow::schema` factory function.
+ CONFLICT_APPEND = 0,
+ // Keep the existing field and ignore the newer one.
+ CONFLICT_IGNORE,
+ // Replace the existing field with the newer one.
+ CONFLICT_REPLACE,
+ // Merge the fields. The merging behavior can be controlled by `Field::MergeOptions`
+ // specified at construction time. Also see documentation of `Field::MergeWith`.
+ CONFLICT_MERGE,
+ // Refuse the new field and error out.
+ CONFLICT_ERROR
+ };
+
+ /// \brief Construct an empty SchemaBuilder
+ /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+ SchemaBuilder(
+ ConflictPolicy conflict_policy = CONFLICT_APPEND,
+ Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
+ /// \brief Construct a SchemaBuilder from a list of fields
+ /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+ SchemaBuilder(
+ std::vector<std::shared_ptr<Field>> fields,
+ ConflictPolicy conflict_policy = CONFLICT_APPEND,
+ Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
+ /// \brief Construct a SchemaBuilder from a schema, preserving the metadata
+ /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+ SchemaBuilder(
+ const std::shared_ptr<Schema>& schema,
+ ConflictPolicy conflict_policy = CONFLICT_APPEND,
+ Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
+
+ /// \brief Return the conflict resolution method.
+ ConflictPolicy policy() const;
+
+ /// \brief Set the conflict resolution method.
+ void SetPolicy(ConflictPolicy resolution);
+
+ /// \brief Add a field to the constructed schema.
+ ///
+ /// \param[in] field to add to the constructed Schema.
+ /// \return A failure if encountered.
+ Status AddField(const std::shared_ptr<Field>& field);
+
+ /// \brief Add multiple fields to the constructed schema.
+ ///
+ /// \param[in] fields to add to the constructed Schema.
+ /// \return The first failure encountered, if any.
+ Status AddFields(const std::vector<std::shared_ptr<Field>>& fields);
+
+ /// \brief Add fields of a Schema to the constructed Schema.
+ ///
+ /// \param[in] schema to take fields to add to the constructed Schema.
+ /// \return The first failure encountered, if any.
+ Status AddSchema(const std::shared_ptr<Schema>& schema);
+
+ /// \brief Add fields of multiple Schemas to the constructed Schema.
+ ///
+ /// \param[in] schemas to take fields to add to the constructed Schema.
+ /// \return The first failure encountered, if any.
+ Status AddSchemas(const std::vector<std::shared_ptr<Schema>>& schemas);
+
+ Status AddMetadata(const KeyValueMetadata& metadata);
+
+ /// \brief Return the constructed Schema.
+ ///
+ /// The builder's internal state is not affected by invoking this method, i.e.
+ /// a single builder can yield multiple incrementally constructed schemas.
+ ///
+ /// \return the constructed schema.
+ Result<std::shared_ptr<Schema>> Finish() const;
+
+ /// \brief Merge schemas into a unified schema according to policy.
+ static Result<std::shared_ptr<Schema>> Merge(
+ const std::vector<std::shared_ptr<Schema>>& schemas,
+ ConflictPolicy policy = CONFLICT_MERGE);
+
+ /// \brief Indicate whether the schemas can be merged according to policy.
+ static Status AreCompatible(const std::vector<std::shared_ptr<Schema>>& schemas,
+ ConflictPolicy policy = CONFLICT_MERGE);
+
+ /// \brief Reset internal state with an empty schema (and metadata).
+ void Reset();
+
+ ~SchemaBuilder();
+
+ private:
+ class Impl;
+ std::unique_ptr<Impl> impl_;
+
+ Status AppendField(const std::shared_ptr<Field>& field);
+};
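+
+// Example (illustrative sketch): incrementally building a schema under
+// CONFLICT_MERGE; assuming the default MergeOptions, the second "x" below is
+// merged into the first rather than appended. The surrounding function is
+// assumed to return a Status/Result so that the macros apply:
+//
+//   SchemaBuilder builder(SchemaBuilder::CONFLICT_MERGE);
+//   ARROW_RETURN_NOT_OK(builder.AddField(field("x", int32(), /*nullable=*/false)));
+//   ARROW_RETURN_NOT_OK(builder.AddField(field("x", int32())));  // merged, not appended
+//   ARROW_ASSIGN_OR_RAISE(auto merged, builder.Finish());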
+
+/// \brief Unifies schemas by merging fields by name.
+///
+/// The resulting schema will contain the union of fields from all schemas.
+/// Fields with the same name are merged; the merging behavior can be
+/// controlled via `Field::MergeOptions`:
+/// - They are expected to be mergeable under the provided `field_merge_options`.
+/// - The unified field will inherit the metadata from the schema where
+/// that field is first defined.
+/// - The first N fields in the schema will be ordered the same as the
+/// N fields in the first schema.
+/// The resulting schema will inherit its metadata from the first input schema.
+/// Returns an error if:
+/// - Any input schema contains fields with duplicate names.
+/// - Fields of the same name are not mergeable.
+ARROW_EXPORT
+Result<std::shared_ptr<Schema>> UnifySchemas(
+ const std::vector<std::shared_ptr<Schema>>& schemas,
+ Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
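+
+// Example (illustrative sketch): unifying two partially overlapping schemas;
+// the names are hypothetical:
+//
+//   auto s1 = schema({field("a", int32())});
+//   auto s2 = schema({field("a", int32()), field("b", utf8())});
+//   ARROW_ASSIGN_OR_RAISE(auto unified, UnifySchemas({s1, s2}));
+//   // unified contains fields "a" (int32) and "b" (utf8), in that order.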
+
+namespace internal {
+
+static inline bool HasValidityBitmap(Type::type id) {
+ switch (id) {
+ case Type::NA:
+ case Type::DENSE_UNION:
+ case Type::SPARSE_UNION:
+ return false;
+ default:
+ return true;
+ }
+}
+
+ARROW_EXPORT
+std::string ToString(Type::type id);
+
+ARROW_EXPORT
+std::string ToTypeName(Type::type id);
+
+ARROW_EXPORT
+std::string ToString(TimeUnit::type unit);
+
+ARROW_EXPORT
+int GetByteWidth(const DataType& type);
+
+} // namespace internal
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h
new file mode 100644
index 00000000000..7e564106bbe
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h
@@ -0,0 +1,678 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+template <typename T>
+class Iterator;
+template <typename T>
+struct IterationTraits;
+
+template <typename T>
+class Result;
+
+class Status;
+
+namespace internal {
+struct Empty;
+} // namespace internal
+template <typename T = internal::Empty>
+class Future;
+
+namespace util {
+class Codec;
+} // namespace util
+
+class Buffer;
+class Device;
+class MemoryManager;
+class MemoryPool;
+class MutableBuffer;
+class ResizableBuffer;
+
+using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+class DataType;
+class Field;
+class FieldRef;
+class KeyValueMetadata;
+enum class Endianness;
+class Schema;
+
+using DataTypeVector = std::vector<std::shared_ptr<DataType>>;
+using FieldVector = std::vector<std::shared_ptr<Field>>;
+
+class Array;
+struct ArrayData;
+class ArrayBuilder;
+struct Scalar;
+
+using ArrayDataVector = std::vector<std::shared_ptr<ArrayData>>;
+using ArrayVector = std::vector<std::shared_ptr<Array>>;
+using ScalarVector = std::vector<std::shared_ptr<Scalar>>;
+
+class ChunkedArray;
+class RecordBatch;
+class RecordBatchReader;
+class Table;
+
+struct Datum;
+struct ValueDescr;
+
+using ChunkedArrayVector = std::vector<std::shared_ptr<ChunkedArray>>;
+using RecordBatchVector = std::vector<std::shared_ptr<RecordBatch>>;
+using RecordBatchIterator = Iterator<std::shared_ptr<RecordBatch>>;
+
+class DictionaryType;
+class DictionaryArray;
+struct DictionaryScalar;
+
+class NullType;
+class NullArray;
+class NullBuilder;
+struct NullScalar;
+
+class FixedWidthType;
+
+class BooleanType;
+class BooleanArray;
+class BooleanBuilder;
+struct BooleanScalar;
+
+class BinaryType;
+class BinaryArray;
+class BinaryBuilder;
+struct BinaryScalar;
+
+class LargeBinaryType;
+class LargeBinaryArray;
+class LargeBinaryBuilder;
+struct LargeBinaryScalar;
+
+class FixedSizeBinaryType;
+class FixedSizeBinaryArray;
+class FixedSizeBinaryBuilder;
+struct FixedSizeBinaryScalar;
+
+class StringType;
+class StringArray;
+class StringBuilder;
+struct StringScalar;
+
+class LargeStringType;
+class LargeStringArray;
+class LargeStringBuilder;
+struct LargeStringScalar;
+
+class ListType;
+class ListArray;
+class ListBuilder;
+struct ListScalar;
+
+class LargeListType;
+class LargeListArray;
+class LargeListBuilder;
+struct LargeListScalar;
+
+class MapType;
+class MapArray;
+class MapBuilder;
+struct MapScalar;
+
+class FixedSizeListType;
+class FixedSizeListArray;
+class FixedSizeListBuilder;
+struct FixedSizeListScalar;
+
+class StructType;
+class StructArray;
+class StructBuilder;
+struct StructScalar;
+
+class Decimal128;
+class Decimal256;
+class DecimalType;
+class Decimal128Type;
+class Decimal256Type;
+class Decimal128Array;
+class Decimal256Array;
+class Decimal128Builder;
+class Decimal256Builder;
+struct Decimal128Scalar;
+struct Decimal256Scalar;
+
+struct UnionMode {
+ enum type { SPARSE, DENSE };
+};
+
+class SparseUnionType;
+class SparseUnionArray;
+class SparseUnionBuilder;
+struct SparseUnionScalar;
+
+class DenseUnionType;
+class DenseUnionArray;
+class DenseUnionBuilder;
+struct DenseUnionScalar;
+
+template <typename TypeClass>
+class NumericArray;
+
+template <typename TypeClass>
+class NumericBuilder;
+
+template <typename TypeClass>
+class NumericTensor;
+
+#define _NUMERIC_TYPE_DECL(KLASS) \
+ class KLASS##Type; \
+ using KLASS##Array = NumericArray<KLASS##Type>; \
+ using KLASS##Builder = NumericBuilder<KLASS##Type>; \
+ struct KLASS##Scalar; \
+ using KLASS##Tensor = NumericTensor<KLASS##Type>;
+
+_NUMERIC_TYPE_DECL(Int8)
+_NUMERIC_TYPE_DECL(Int16)
+_NUMERIC_TYPE_DECL(Int32)
+_NUMERIC_TYPE_DECL(Int64)
+_NUMERIC_TYPE_DECL(UInt8)
+_NUMERIC_TYPE_DECL(UInt16)
+_NUMERIC_TYPE_DECL(UInt32)
+_NUMERIC_TYPE_DECL(UInt64)
+_NUMERIC_TYPE_DECL(HalfFloat)
+_NUMERIC_TYPE_DECL(Float)
+_NUMERIC_TYPE_DECL(Double)
+
+#undef _NUMERIC_TYPE_DECL
+
+enum class DateUnit : char { DAY = 0, MILLI = 1 };
+
+class DateType;
+class Date32Type;
+using Date32Array = NumericArray<Date32Type>;
+using Date32Builder = NumericBuilder<Date32Type>;
+struct Date32Scalar;
+
+class Date64Type;
+using Date64Array = NumericArray<Date64Type>;
+using Date64Builder = NumericBuilder<Date64Type>;
+struct Date64Scalar;
+
+struct TimeUnit {
+ /// The unit for a time or timestamp DataType
+ enum type { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 };
+};
+
+class TimeType;
+class Time32Type;
+using Time32Array = NumericArray<Time32Type>;
+using Time32Builder = NumericBuilder<Time32Type>;
+struct Time32Scalar;
+
+class Time64Type;
+using Time64Array = NumericArray<Time64Type>;
+using Time64Builder = NumericBuilder<Time64Type>;
+struct Time64Scalar;
+
+class TimestampType;
+using TimestampArray = NumericArray<TimestampType>;
+using TimestampBuilder = NumericBuilder<TimestampType>;
+struct TimestampScalar;
+
+class MonthIntervalType;
+using MonthIntervalArray = NumericArray<MonthIntervalType>;
+using MonthIntervalBuilder = NumericBuilder<MonthIntervalType>;
+struct MonthIntervalScalar;
+
+class DayTimeIntervalType;
+class DayTimeIntervalArray;
+class DayTimeIntervalBuilder;
+struct DayTimeIntervalScalar;
+
+class DurationType;
+using DurationArray = NumericArray<DurationType>;
+using DurationBuilder = NumericBuilder<DurationType>;
+struct DurationScalar;
+
+class ExtensionType;
+class ExtensionArray;
+struct ExtensionScalar;
+
+class Tensor;
+class SparseTensor;
+
+// ----------------------------------------------------------------------
+
+struct Type {
+ /// \brief Main data type enumeration
+ ///
+ /// This enumeration provides a quick way to interrogate the category
+ /// of a DataType instance.
+ enum type {
+ /// A NULL type having no physical storage
+ NA = 0,
+
+ /// Boolean as 1 bit, LSB bit-packed ordering
+ BOOL,
+
+ /// Unsigned 8-bit little-endian integer
+ UINT8,
+
+ /// Signed 8-bit little-endian integer
+ INT8,
+
+ /// Unsigned 16-bit little-endian integer
+ UINT16,
+
+ /// Signed 16-bit little-endian integer
+ INT16,
+
+ /// Unsigned 32-bit little-endian integer
+ UINT32,
+
+ /// Signed 32-bit little-endian integer
+ INT32,
+
+ /// Unsigned 64-bit little-endian integer
+ UINT64,
+
+ /// Signed 64-bit little-endian integer
+ INT64,
+
+ /// 2-byte floating point value
+ HALF_FLOAT,
+
+ /// 4-byte floating point value
+ FLOAT,
+
+ /// 8-byte floating point value
+ DOUBLE,
+
+ /// UTF8 variable-length string as List<Char>
+ STRING,
+
+ /// Variable-length bytes (no guarantee of UTF8-ness)
+ BINARY,
+
+ /// Fixed-size binary. Each value occupies the same number of bytes
+ FIXED_SIZE_BINARY,
+
+ /// int32_t days since the UNIX epoch
+ DATE32,
+
+ /// int64_t milliseconds since the UNIX epoch
+ DATE64,
+
+ /// Exact timestamp encoded with int64 since UNIX epoch
+ /// Default unit millisecond
+ TIMESTAMP,
+
+ /// Time as signed 32-bit integer, representing either seconds or
+ /// milliseconds since midnight
+ TIME32,
+
+ /// Time as signed 64-bit integer, representing either microseconds or
+ /// nanoseconds since midnight
+ TIME64,
+
+ /// YEAR_MONTH interval in SQL style
+ INTERVAL_MONTHS,
+
+ /// DAY_TIME interval in SQL style
+ INTERVAL_DAY_TIME,
+
+ /// Precision- and scale-based decimal type with 128 bits.
+ DECIMAL128,
+
+ /// Defined for backward-compatibility.
+ DECIMAL = DECIMAL128,
+
+ /// Precision- and scale-based decimal type with 256 bits.
+ DECIMAL256,
+
+ /// A list of some logical data type
+ LIST,
+
+ /// Struct of logical types
+ STRUCT,
+
+ /// Sparse unions of logical types
+ SPARSE_UNION,
+
+ /// Dense unions of logical types
+ DENSE_UNION,
+
+ /// Dictionary-encoded type, also called "categorical" or "factor"
+ /// in other programming languages. Holds the dictionary value
+ /// type but not the dictionary itself, which is part of the
+ /// ArrayData struct
+ DICTIONARY,
+
+ /// Map, a repeated struct logical type
+ MAP,
+
+ /// Custom data type, implemented by user
+ EXTENSION,
+
+ /// Fixed size list of some logical type
+ FIXED_SIZE_LIST,
+
+ /// Measure of elapsed time in either seconds, milliseconds, microseconds
+ /// or nanoseconds.
+ DURATION,
+
+ /// Like STRING, but with 64-bit offsets
+ LARGE_STRING,
+
+ /// Like BINARY, but with 64-bit offsets
+ LARGE_BINARY,
+
+ /// Like LIST, but with 64-bit offsets
+ LARGE_LIST,
+
+ // Leave this at the end
+ MAX_ID
+ };
+};
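+
+// Example (illustrative sketch): the enumeration supports cheap dispatch on a
+// DataType's category without dynamic casts; `IsSignedInteger` is a hypothetical
+// helper:
+//
+//   bool IsSignedInteger(const DataType& t) {
+//     switch (t.id()) {
+//       case Type::INT8:
+//       case Type::INT16:
+//       case Type::INT32:
+//       case Type::INT64:
+//         return true;
+//       default:
+//         return false;
+//     }
+//   }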
+
+/// \defgroup type-factories Factory functions for creating data types
+///
+/// Factory functions for creating data types
+/// @{
+
+/// \brief Return a NullType instance
+std::shared_ptr<DataType> ARROW_EXPORT null();
+/// \brief Return a BooleanType instance
+std::shared_ptr<DataType> ARROW_EXPORT boolean();
+/// \brief Return a Int8Type instance
+std::shared_ptr<DataType> ARROW_EXPORT int8();
+/// \brief Return a Int16Type instance
+std::shared_ptr<DataType> ARROW_EXPORT int16();
+/// \brief Return a Int32Type instance
+std::shared_ptr<DataType> ARROW_EXPORT int32();
+/// \brief Return a Int64Type instance
+std::shared_ptr<DataType> ARROW_EXPORT int64();
+/// \brief Return a UInt8Type instance
+std::shared_ptr<DataType> ARROW_EXPORT uint8();
+/// \brief Return a UInt16Type instance
+std::shared_ptr<DataType> ARROW_EXPORT uint16();
+/// \brief Return a UInt32Type instance
+std::shared_ptr<DataType> ARROW_EXPORT uint32();
+/// \brief Return a UInt64Type instance
+std::shared_ptr<DataType> ARROW_EXPORT uint64();
+/// \brief Return a HalfFloatType instance
+std::shared_ptr<DataType> ARROW_EXPORT float16();
+/// \brief Return a FloatType instance
+std::shared_ptr<DataType> ARROW_EXPORT float32();
+/// \brief Return a DoubleType instance
+std::shared_ptr<DataType> ARROW_EXPORT float64();
+/// \brief Return a StringType instance
+std::shared_ptr<DataType> ARROW_EXPORT utf8();
+/// \brief Return a LargeStringType instance
+std::shared_ptr<DataType> ARROW_EXPORT large_utf8();
+/// \brief Return a BinaryType instance
+std::shared_ptr<DataType> ARROW_EXPORT binary();
+/// \brief Return a LargeBinaryType instance
+std::shared_ptr<DataType> ARROW_EXPORT large_binary();
+/// \brief Return a Date32Type instance
+std::shared_ptr<DataType> ARROW_EXPORT date32();
+/// \brief Return a Date64Type instance
+std::shared_ptr<DataType> ARROW_EXPORT date64();
+
+/// \brief Create a FixedSizeBinaryType instance.
+ARROW_EXPORT
+std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width);
+
+/// \brief Create a DecimalType instance depending on the precision
+///
+/// If the precision is greater than 38, a Decimal256Type is returned,
+/// otherwise a Decimal128Type.
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale);
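+
+// Example (illustrative sketch): the precision selects the physical width:
+//
+//   auto narrow = decimal(10, 2);  // Decimal128Type
+//   auto wide = decimal(40, 4);    // Decimal256Type, since precision > 38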
+
+/// \brief Create a Decimal128Type instance
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale);
+
+/// \brief Create a Decimal256Type instance
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale);
+
+/// \brief Create a ListType instance from its child Field type
+ARROW_EXPORT
+std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_type);
+
+/// \brief Create a ListType instance from its child DataType
+ARROW_EXPORT
+std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type);
+
+/// \brief Create a LargeListType instance from its child Field type
+ARROW_EXPORT
+std::shared_ptr<DataType> large_list(const std::shared_ptr<Field>& value_type);
+
+/// \brief Create a LargeListType instance from its child DataType
+ARROW_EXPORT
+std::shared_ptr<DataType> large_list(const std::shared_ptr<DataType>& value_type);
+
+/// \brief Create a MapType instance from its key and value DataTypes
+ARROW_EXPORT
+std::shared_ptr<DataType> map(std::shared_ptr<DataType> key_type,
+ std::shared_ptr<DataType> item_type,
+ bool keys_sorted = false);
+
+/// \brief Create a MapType instance from its key DataType and value field.
+///
+/// The field override is provided to communicate nullability of the value.
+ARROW_EXPORT
+std::shared_ptr<DataType> map(std::shared_ptr<DataType> key_type,
+ std::shared_ptr<Field> item_field,
+ bool keys_sorted = false);
+
+/// \brief Create a FixedSizeListType instance from its child Field type
+ARROW_EXPORT
+std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<Field>& value_type,
+ int32_t list_size);
+
+/// \brief Create a FixedSizeListType instance from its child DataType
+ARROW_EXPORT
+std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<DataType>& value_type,
+ int32_t list_size);
+
+/// \brief Return a DurationType instance
+std::shared_ptr<DataType> ARROW_EXPORT duration(TimeUnit::type unit);
+
+/// \brief Return a DayTimeIntervalType instance
+std::shared_ptr<DataType> ARROW_EXPORT day_time_interval();
+
+/// \brief Return a MonthIntervalType instance
+std::shared_ptr<DataType> ARROW_EXPORT month_interval();
+
+/// \brief Create a TimestampType instance from its unit
+ARROW_EXPORT
+std::shared_ptr<DataType> timestamp(TimeUnit::type unit);
+
+/// \brief Create a TimestampType instance from its unit and timezone
+ARROW_EXPORT
+std::shared_ptr<DataType> timestamp(TimeUnit::type unit, const std::string& timezone);
+
+/// \brief Create a 32-bit time type instance
+///
+/// Unit can be either SECOND or MILLI
+std::shared_ptr<DataType> ARROW_EXPORT time32(TimeUnit::type unit);
+
+/// \brief Create a 64-bit time type instance
+///
+/// Unit can be either MICRO or NANO
+std::shared_ptr<DataType> ARROW_EXPORT time64(TimeUnit::type unit);
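+
+// Example (illustrative sketch): the unit constraint follows the physical width
+// (time32: SECOND or MILLI; time64: MICRO or NANO). Values count time since
+// midnight in the chosen unit:
+//
+//   auto t_ms = time32(TimeUnit::MILLI);  // int32 milliseconds since midnight
+//   auto t_ns = time64(TimeUnit::NANO);   // int64 nanoseconds since midnight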
+
+/// \brief Create a StructType instance
+std::shared_ptr<DataType> ARROW_EXPORT
+struct_(const std::vector<std::shared_ptr<Field>>& fields);
+
+/// \brief Create a SparseUnionType instance
+std::shared_ptr<DataType> ARROW_EXPORT sparse_union(FieldVector child_fields,
+ std::vector<int8_t> type_codes = {});
+/// \brief Create a DenseUnionType instance
+std::shared_ptr<DataType> ARROW_EXPORT dense_union(FieldVector child_fields,
+ std::vector<int8_t> type_codes = {});
+
+/// \brief Create a SparseUnionType instance
+std::shared_ptr<DataType> ARROW_EXPORT
+sparse_union(const ArrayVector& children, std::vector<std::string> field_names = {},
+ std::vector<int8_t> type_codes = {});
+/// \brief Create a DenseUnionType instance
+std::shared_ptr<DataType> ARROW_EXPORT
+dense_union(const ArrayVector& children, std::vector<std::string> field_names = {},
+ std::vector<int8_t> type_codes = {});
+
+/// \brief Create a UnionType instance
+ARROW_DEPRECATED("Deprecated in 1.0.0")
+inline std::shared_ptr<DataType> ARROW_EXPORT
+union_(const std::vector<std::shared_ptr<Field>>& child_fields,
+ const std::vector<int8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE) {
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(child_fields, type_codes);
+ } else {
+ return dense_union(child_fields, type_codes);
+ }
+}
+
+/// \brief Create a UnionType instance
+ARROW_DEPRECATED("Deprecated in 1.0.0")
+inline std::shared_ptr<DataType> ARROW_EXPORT
+union_(const std::vector<std::shared_ptr<Field>>& child_fields,
+ UnionMode::type mode = UnionMode::SPARSE) {
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(child_fields);
+ } else {
+ return dense_union(child_fields);
+ }
+}
+
+/// \brief Create a UnionType instance
+ARROW_DEPRECATED("Deprecated in 1.0.0")
+inline std::shared_ptr<DataType> ARROW_EXPORT
+union_(const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names, const std::vector<int8_t>& type_codes,
+ UnionMode::type mode = UnionMode::SPARSE) {
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(children, field_names, type_codes);
+ } else {
+ return dense_union(children, field_names, type_codes);
+ }
+}
+
+/// \brief Create a UnionType instance
+ARROW_DEPRECATED("Deprecated in 1.0.0")
+inline std::shared_ptr<DataType> ARROW_EXPORT
+union_(const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names,
+ UnionMode::type mode = UnionMode::SPARSE) {
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(children, field_names);
+ } else {
+ return dense_union(children, field_names);
+ }
+}
+
+/// \brief Create a UnionType instance
+ARROW_DEPRECATED("Deprecated in 1.0.0")
+inline std::shared_ptr<DataType> ARROW_EXPORT
+union_(const std::vector<std::shared_ptr<Array>>& children,
+ UnionMode::type mode = UnionMode::SPARSE) {
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(children);
+ } else {
+ return dense_union(children);
+ }
+}
+
+/// \brief Create a DictionaryType instance
+/// \param[in] index_type the type of the dictionary indices (must be
+/// a signed integer)
+/// \param[in] dict_type the type of the values in the variable dictionary
+/// \param[in] ordered true if the order of the dictionary values has
+/// semantic meaning and should be preserved where possible
+ARROW_EXPORT
+std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
+ const std::shared_ptr<DataType>& dict_type,
+ bool ordered = false);
+
+/// @}
+
+/// \defgroup schema-factories Factory functions for fields and schemas
+///
+/// Factory functions for fields and schemas
+/// @{
+
+/// \brief Create a Field instance
+///
+/// \param name the field name
+/// \param type the field value type
+/// \param nullable whether the values are nullable, default true
+/// \param metadata any custom key-value metadata, default null
+std::shared_ptr<Field> ARROW_EXPORT
+field(std::string name, std::shared_ptr<DataType> type, bool nullable = true,
+ std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+/// \brief Create a Field instance with metadata
+///
+/// The field will be assumed to be nullable.
+///
+/// \param name the field name
+/// \param type the field value type
+/// \param metadata any custom key-value metadata
+std::shared_ptr<Field> ARROW_EXPORT
+field(std::string name, std::shared_ptr<DataType> type,
+ std::shared_ptr<const KeyValueMetadata> metadata);
+
+/// \brief Create a Schema instance
+///
+/// \param fields the schema's fields
+/// \param metadata any custom key-value metadata, default null
+/// \return schema shared_ptr to Schema
+ARROW_EXPORT
+std::shared_ptr<Schema> schema(
+ std::vector<std::shared_ptr<Field>> fields,
+ std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+/// \brief Create a Schema instance
+///
+/// \param fields the schema's fields
+/// \param endianness the endianness of the data
+/// \param metadata any custom key-value metadata, default null
+/// \return schema shared_ptr to Schema
+ARROW_EXPORT
+std::shared_ptr<Schema> schema(
+ std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+/// @}
+
+/// Return the process-wide default memory pool.
+ARROW_EXPORT MemoryPool* default_memory_pool();
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h b/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h
new file mode 100644
index 00000000000..e4d809967f9
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h
@@ -0,0 +1,1024 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/type.h"
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+
+//
+// Per-type id type lookup
+//
+
+template <Type::type id>
+struct TypeIdTraits {};
+
+#define TYPE_ID_TRAIT(_id, _typeclass) \
+ template <> \
+ struct TypeIdTraits<Type::_id> { \
+ using Type = _typeclass; \
+ };
+
+TYPE_ID_TRAIT(NA, NullType)
+TYPE_ID_TRAIT(BOOL, BooleanType)
+TYPE_ID_TRAIT(INT8, Int8Type)
+TYPE_ID_TRAIT(INT16, Int16Type)
+TYPE_ID_TRAIT(INT32, Int32Type)
+TYPE_ID_TRAIT(INT64, Int64Type)
+TYPE_ID_TRAIT(UINT8, UInt8Type)
+TYPE_ID_TRAIT(UINT16, UInt16Type)
+TYPE_ID_TRAIT(UINT32, UInt32Type)
+TYPE_ID_TRAIT(UINT64, UInt64Type)
+TYPE_ID_TRAIT(HALF_FLOAT, HalfFloatType)
+TYPE_ID_TRAIT(FLOAT, FloatType)
+TYPE_ID_TRAIT(DOUBLE, DoubleType)
+TYPE_ID_TRAIT(STRING, StringType)
+TYPE_ID_TRAIT(BINARY, BinaryType)
+TYPE_ID_TRAIT(LARGE_STRING, LargeStringType)
+TYPE_ID_TRAIT(LARGE_BINARY, LargeBinaryType)
+TYPE_ID_TRAIT(FIXED_SIZE_BINARY, FixedSizeBinaryType)
+TYPE_ID_TRAIT(DATE32, Date32Type)
+TYPE_ID_TRAIT(DATE64, Date64Type)
+TYPE_ID_TRAIT(TIME32, Time32Type)
+TYPE_ID_TRAIT(TIME64, Time64Type)
+TYPE_ID_TRAIT(TIMESTAMP, TimestampType)
+TYPE_ID_TRAIT(INTERVAL_DAY_TIME, DayTimeIntervalType)
+TYPE_ID_TRAIT(INTERVAL_MONTHS, MonthIntervalType)
+TYPE_ID_TRAIT(DURATION, DurationType)
+TYPE_ID_TRAIT(DECIMAL128, Decimal128Type)
+TYPE_ID_TRAIT(DECIMAL256, Decimal256Type)
+TYPE_ID_TRAIT(STRUCT, StructType)
+TYPE_ID_TRAIT(LIST, ListType)
+TYPE_ID_TRAIT(LARGE_LIST, LargeListType)
+TYPE_ID_TRAIT(FIXED_SIZE_LIST, FixedSizeListType)
+TYPE_ID_TRAIT(MAP, MapType)
+TYPE_ID_TRAIT(DENSE_UNION, DenseUnionType)
+TYPE_ID_TRAIT(SPARSE_UNION, SparseUnionType)
+TYPE_ID_TRAIT(DICTIONARY, DictionaryType)
+TYPE_ID_TRAIT(EXTENSION, ExtensionType)
+
+#undef TYPE_ID_TRAIT
+
+//
+// Per-type type traits
+//
+
+template <typename T>
+struct TypeTraits {};
+
+template <typename T>
+struct CTypeTraits {};
+
+template <>
+struct TypeTraits<NullType> {
+ using ArrayType = NullArray;
+ using BuilderType = NullBuilder;
+ using ScalarType = NullScalar;
+
+ static constexpr int64_t bytes_required(int64_t) { return 0; }
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return null(); }
+};
+
+template <>
+struct TypeTraits<BooleanType> {
+ using ArrayType = BooleanArray;
+ using BuilderType = BooleanBuilder;
+ using ScalarType = BooleanScalar;
+ using CType = bool;
+
+ static constexpr int64_t bytes_required(int64_t elements) {
+ return BitUtil::BytesForBits(elements);
+ }
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return boolean(); }
+};
+
+template <>
+struct CTypeTraits<bool> : public TypeTraits<BooleanType> {
+ using ArrowType = BooleanType;
+};
+
+#define PRIMITIVE_TYPE_TRAITS_DEF_(CType_, ArrowType_, ArrowArrayType, ArrowBuilderType, \
+ ArrowScalarType, ArrowTensorType, SingletonFn) \
+ template <> \
+ struct TypeTraits<ArrowType_> { \
+ using ArrayType = ArrowArrayType; \
+ using BuilderType = ArrowBuilderType; \
+ using ScalarType = ArrowScalarType; \
+ using TensorType = ArrowTensorType; \
+ using CType = ArrowType_::c_type; \
+ static constexpr int64_t bytes_required(int64_t elements) { \
+ return elements * static_cast<int64_t>(sizeof(CType)); \
+ } \
+ constexpr static bool is_parameter_free = true; \
+ static inline std::shared_ptr<DataType> type_singleton() { return SingletonFn(); } \
+ }; \
+ \
+ template <> \
+ struct CTypeTraits<CType_> : public TypeTraits<ArrowType_> { \
+ using ArrowType = ArrowType_; \
+ };
+
+#define PRIMITIVE_TYPE_TRAITS_DEF(CType, ArrowShort, SingletonFn) \
+ PRIMITIVE_TYPE_TRAITS_DEF_( \
+ CType, ARROW_CONCAT(ArrowShort, Type), ARROW_CONCAT(ArrowShort, Array), \
+ ARROW_CONCAT(ArrowShort, Builder), ARROW_CONCAT(ArrowShort, Scalar), \
+ ARROW_CONCAT(ArrowShort, Tensor), SingletonFn)
+
+PRIMITIVE_TYPE_TRAITS_DEF(uint8_t, UInt8, uint8)
+PRIMITIVE_TYPE_TRAITS_DEF(int8_t, Int8, int8)
+PRIMITIVE_TYPE_TRAITS_DEF(uint16_t, UInt16, uint16)
+PRIMITIVE_TYPE_TRAITS_DEF(int16_t, Int16, int16)
+PRIMITIVE_TYPE_TRAITS_DEF(uint32_t, UInt32, uint32)
+PRIMITIVE_TYPE_TRAITS_DEF(int32_t, Int32, int32)
+PRIMITIVE_TYPE_TRAITS_DEF(uint64_t, UInt64, uint64)
+PRIMITIVE_TYPE_TRAITS_DEF(int64_t, Int64, int64)
+PRIMITIVE_TYPE_TRAITS_DEF(float, Float, float32)
+PRIMITIVE_TYPE_TRAITS_DEF(double, Double, float64)
+
+#undef PRIMITIVE_TYPE_TRAITS_DEF
+#undef PRIMITIVE_TYPE_TRAITS_DEF_
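+
+// Example (illustrative sketch): the traits map between Arrow types and C types
+// at compile time, e.g. to select the matching builder or to go from a C type
+// back to its Arrow type:
+//
+//   using Builder = TypeTraits<Int32Type>::BuilderType;  // Int32Builder
+//   static_assert(std::is_same<CTypeTraits<double>::ArrowType, DoubleType>::value,
+//                 "CTypeTraits maps C types to their Arrow equivalents");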
+
+template <>
+struct TypeTraits<Date64Type> {
+ using ArrayType = Date64Array;
+ using BuilderType = Date64Builder;
+ using ScalarType = Date64Scalar;
+ using CType = Date64Type::c_type;
+
+ static constexpr int64_t bytes_required(int64_t elements) {
+ return elements * static_cast<int64_t>(sizeof(int64_t));
+ }
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return date64(); }
+};
+
+template <>
+struct TypeTraits<Date32Type> {
+ using ArrayType = Date32Array;
+ using BuilderType = Date32Builder;
+ using ScalarType = Date32Scalar;
+ using CType = Date32Type::c_type;
+
+ static constexpr int64_t bytes_required(int64_t elements) {
+ return elements * static_cast<int64_t>(sizeof(int32_t));
+ }
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return date32(); }
+};
+
+template <>
+struct TypeTraits<TimestampType> {
+ using ArrayType = TimestampArray;
+ using BuilderType = TimestampBuilder;
+ using ScalarType = TimestampScalar;
+ using CType = TimestampType::c_type;
+
+ static constexpr int64_t bytes_required(int64_t elements) {
+ return elements * static_cast<int64_t>(sizeof(int64_t));
+ }
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<DurationType> {
+ using ArrayType = DurationArray;
+ using BuilderType = DurationBuilder;
+ using ScalarType = DurationScalar;
+ using CType = DurationType::c_type;
+
+ static constexpr int64_t bytes_required(int64_t elements) {
+ return elements * static_cast<int64_t>(sizeof(int64_t));
+ }
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<DayTimeIntervalType> {
+ using ArrayType = DayTimeIntervalArray;
+ using BuilderType = DayTimeIntervalBuilder;
+ using ScalarType = DayTimeIntervalScalar;
+
+ static constexpr int64_t bytes_required(int64_t elements) {
+ return elements * static_cast<int64_t>(sizeof(DayTimeIntervalType::DayMilliseconds));
+ }
+ constexpr static bool is_parameter_free = true;
+ static std::shared_ptr<DataType> type_singleton() { return day_time_interval(); }
+};
+
+template <>
+struct TypeTraits<MonthIntervalType> {
+ using ArrayType = MonthIntervalArray;
+ using BuilderType = MonthIntervalBuilder;
+ using ScalarType = MonthIntervalScalar;
+ using CType = MonthIntervalType::c_type;
+
+ static constexpr int64_t bytes_required(int64_t elements) {
+ return elements * static_cast<int64_t>(sizeof(int32_t));
+ }
+ constexpr static bool is_parameter_free = true;
+ static std::shared_ptr<DataType> type_singleton() { return month_interval(); }
+};
+
+template <>
+struct TypeTraits<Time32Type> {
+ using ArrayType = Time32Array;
+ using BuilderType = Time32Builder;
+ using ScalarType = Time32Scalar;
+ using CType = Time32Type::c_type;
+
+ static constexpr int64_t bytes_required(int64_t elements) {
+ return elements * static_cast<int64_t>(sizeof(int32_t));
+ }
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<Time64Type> {
+ using ArrayType = Time64Array;
+ using BuilderType = Time64Builder;
+ using ScalarType = Time64Scalar;
+ using CType = Time64Type::c_type;
+
+ static constexpr int64_t bytes_required(int64_t elements) {
+ return elements * static_cast<int64_t>(sizeof(int64_t));
+ }
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<HalfFloatType> {
+ using ArrayType = HalfFloatArray;
+ using BuilderType = HalfFloatBuilder;
+ using ScalarType = HalfFloatScalar;
+ using TensorType = HalfFloatTensor;
+
+ static constexpr int64_t bytes_required(int64_t elements) {
+ return elements * static_cast<int64_t>(sizeof(uint16_t));
+ }
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return float16(); }
+};
+
+template <>
+struct TypeTraits<Decimal128Type> {
+ using ArrayType = Decimal128Array;
+ using BuilderType = Decimal128Builder;
+ using ScalarType = Decimal128Scalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<Decimal256Type> {
+ using ArrayType = Decimal256Array;
+ using BuilderType = Decimal256Builder;
+ using ScalarType = Decimal256Scalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<BinaryType> {
+ using ArrayType = BinaryArray;
+ using BuilderType = BinaryBuilder;
+ using ScalarType = BinaryScalar;
+ using OffsetType = Int32Type;
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return binary(); }
+};
+
+template <>
+struct TypeTraits<LargeBinaryType> {
+ using ArrayType = LargeBinaryArray;
+ using BuilderType = LargeBinaryBuilder;
+ using ScalarType = LargeBinaryScalar;
+ using OffsetType = Int64Type;
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return large_binary(); }
+};
+
+template <>
+struct TypeTraits<FixedSizeBinaryType> {
+ using ArrayType = FixedSizeBinaryArray;
+ using BuilderType = FixedSizeBinaryBuilder;
+ using ScalarType = FixedSizeBinaryScalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<StringType> {
+ using ArrayType = StringArray;
+ using BuilderType = StringBuilder;
+ using ScalarType = StringScalar;
+ using OffsetType = Int32Type;
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return utf8(); }
+};
+
+template <>
+struct TypeTraits<LargeStringType> {
+ using ArrayType = LargeStringArray;
+ using BuilderType = LargeStringBuilder;
+ using ScalarType = LargeStringScalar;
+ using OffsetType = Int64Type;
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return large_utf8(); }
+};
+
+template <>
+struct CTypeTraits<std::string> : public TypeTraits<StringType> {
+ using ArrowType = StringType;
+};
+
+template <>
+struct CTypeTraits<const char*> : public CTypeTraits<std::string> {};
+
+template <size_t N>
+struct CTypeTraits<const char (&)[N]> : public CTypeTraits<std::string> {};
+
+template <>
+struct CTypeTraits<DayTimeIntervalType::DayMilliseconds>
+ : public TypeTraits<DayTimeIntervalType> {
+ using ArrowType = DayTimeIntervalType;
+};
+
+template <>
+struct TypeTraits<ListType> {
+ using ArrayType = ListArray;
+ using BuilderType = ListBuilder;
+ using ScalarType = ListScalar;
+ using OffsetType = Int32Type;
+ using OffsetArrayType = Int32Array;
+ using OffsetBuilderType = Int32Builder;
+ using OffsetScalarType = Int32Scalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<LargeListType> {
+ using ArrayType = LargeListArray;
+ using BuilderType = LargeListBuilder;
+ using ScalarType = LargeListScalar;
+ using OffsetType = Int64Type;
+ using OffsetArrayType = Int64Array;
+ using OffsetBuilderType = Int64Builder;
+ using OffsetScalarType = Int64Scalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<MapType> {
+ using ArrayType = MapArray;
+ using BuilderType = MapBuilder;
+ using ScalarType = MapScalar;
+ using OffsetType = Int32Type;
+ using OffsetArrayType = Int32Array;
+ using OffsetBuilderType = Int32Builder;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<FixedSizeListType> {
+ using ArrayType = FixedSizeListArray;
+ using BuilderType = FixedSizeListBuilder;
+ using ScalarType = FixedSizeListScalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <typename CType>
+struct CTypeTraits<std::vector<CType>> : public TypeTraits<ListType> {
+ using ArrowType = ListType;
+
+ static inline std::shared_ptr<DataType> type_singleton() {
+ return list(CTypeTraits<CType>::type_singleton());
+ }
+};
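+
+// For example (illustrative):
+//   auto ty = CTypeTraits<std::vector<int32_t>>::type_singleton();  // list(int32())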
+
+template <>
+struct TypeTraits<StructType> {
+ using ArrayType = StructArray;
+ using BuilderType = StructBuilder;
+ using ScalarType = StructScalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<SparseUnionType> {
+ using ArrayType = SparseUnionArray;
+ using BuilderType = SparseUnionBuilder;
+ using ScalarType = SparseUnionScalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<DenseUnionType> {
+ using ArrayType = DenseUnionArray;
+ using BuilderType = DenseUnionBuilder;
+ using ScalarType = DenseUnionScalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<DictionaryType> {
+ using ArrayType = DictionaryArray;
+ using ScalarType = DictionaryScalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<ExtensionType> {
+ using ArrayType = ExtensionArray;
+ using ScalarType = ExtensionScalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+namespace internal {
+
+template <typename... Ts>
+struct make_void {
+ using type = void;
+};
+
+template <typename... Ts>
+using void_t = typename make_void<Ts...>::type;
+
+} // namespace internal
+
+//
+// Useful type predicates
+//
+
+// std::enable_if_t is only available from C++14 onwards, so define our own
+template <bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+
+template <typename T>
+using is_null_type = std::is_same<NullType, T>;
+
+template <typename T, typename R = void>
+using enable_if_null = enable_if_t<is_null_type<T>::value, R>;
+
+template <typename T>
+using is_boolean_type = std::is_same<BooleanType, T>;
+
+template <typename T, typename R = void>
+using enable_if_boolean = enable_if_t<is_boolean_type<T>::value, R>;
+
+template <typename T>
+using is_number_type = std::is_base_of<NumberType, T>;
+
+template <typename T, typename R = void>
+using enable_if_number = enable_if_t<is_number_type<T>::value, R>;
+
+template <typename T>
+using is_integer_type = std::is_base_of<IntegerType, T>;
+
+template <typename T, typename R = void>
+using enable_if_integer = enable_if_t<is_integer_type<T>::value, R>;
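+
+// Usage sketch (illustrative; MaxValueOf is a hypothetical function): these
+// aliases constrain template overloads via SFINAE on the return type, e.g.
+//
+//   template <typename T>
+//   enable_if_integer<T, typename T::c_type> MaxValueOf() {
+//     return std::numeric_limits<typename T::c_type>::max();
+//   }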
+
+template <typename T>
+using is_signed_integer_type =
+ std::integral_constant<bool, is_integer_type<T>::value &&
+ std::is_signed<typename T::c_type>::value>;
+
+template <typename T, typename R = void>
+using enable_if_signed_integer = enable_if_t<is_signed_integer_type<T>::value, R>;
+
+template <typename T>
+using is_unsigned_integer_type =
+ std::integral_constant<bool, is_integer_type<T>::value &&
+ std::is_unsigned<typename T::c_type>::value>;
+
+template <typename T, typename R = void>
+using enable_if_unsigned_integer = enable_if_t<is_unsigned_integer_type<T>::value, R>;
+
+// Note this will also include HalfFloatType, which is represented by a
+// non-floating-point primitive (uint16_t).
+template <typename T>
+using is_floating_type = std::is_base_of<FloatingPointType, T>;
+
+template <typename T, typename R = void>
+using enable_if_floating_point = enable_if_t<is_floating_type<T>::value, R>;
+
+// Half floats are special in that they behave physically like an unsigned
+// integer.
+template <typename T>
+using is_half_float_type = std::is_same<HalfFloatType, T>;
+
+template <typename T, typename R = void>
+using enable_if_half_float = enable_if_t<is_half_float_type<T>::value, R>;
+
+// Binary Types
+
+// Base binary refers to Binary/LargeBinary/String/LargeString
+template <typename T>
+using is_base_binary_type = std::is_base_of<BaseBinaryType, T>;
+
+template <typename T, typename R = void>
+using enable_if_base_binary = enable_if_t<is_base_binary_type<T>::value, R>;
+
+// Binary proper: the base binary types excluding the string types (Binary/LargeBinary)
+template <typename T>
+using is_binary_type =
+ std::integral_constant<bool, std::is_same<BinaryType, T>::value ||
+ std::is_same<LargeBinaryType, T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_binary = enable_if_t<is_binary_type<T>::value, R>;
+
+template <typename T>
+using is_string_type =
+ std::integral_constant<bool, std::is_same<StringType, T>::value ||
+ std::is_same<LargeStringType, T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_string = enable_if_t<is_string_type<T>::value, R>;
+
+template <typename T>
+using is_string_like_type =
+ std::integral_constant<bool, is_base_binary_type<T>::value && T::is_utf8>;
+
+template <typename T, typename R = void>
+using enable_if_string_like = enable_if_t<is_string_like_type<T>::value, R>;
+
+template <typename T, typename U, typename R = void>
+using enable_if_same = enable_if_t<std::is_same<T, U>::value, R>;
+
+// Note that this also includes DecimalType
+template <typename T>
+using is_fixed_size_binary_type = std::is_base_of<FixedSizeBinaryType, T>;
+
+template <typename T, typename R = void>
+using enable_if_fixed_size_binary = enable_if_t<is_fixed_size_binary_type<T>::value, R>;
+
+template <typename T>
+using is_binary_like_type =
+ std::integral_constant<bool, (is_base_binary_type<T>::value &&
+ !is_string_like_type<T>::value) ||
+ is_fixed_size_binary_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_binary_like = enable_if_t<is_binary_like_type<T>::value, R>;
+
+template <typename T>
+using is_decimal_type = std::is_base_of<DecimalType, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal = enable_if_t<is_decimal_type<T>::value, R>;
+
+template <typename T>
+using is_decimal128_type = std::is_base_of<Decimal128Type, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal128 = enable_if_t<is_decimal128_type<T>::value, R>;
+
+template <typename T>
+using is_decimal256_type = std::is_base_of<Decimal256Type, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal256 = enable_if_t<is_decimal256_type<T>::value, R>;
+
+// Nested Types
+
+template <typename T>
+using is_nested_type = std::is_base_of<NestedType, T>;
+
+template <typename T, typename R = void>
+using enable_if_nested = enable_if_t<is_nested_type<T>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_not_nested = enable_if_t<!is_nested_type<T>::value, R>;
+
+template <typename T>
+using is_var_length_list_type =
+ std::integral_constant<bool, std::is_base_of<LargeListType, T>::value ||
+ std::is_base_of<ListType, T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_var_size_list = enable_if_t<is_var_length_list_type<T>::value, R>;
+
+// DEPRECATED: use is_var_length_list_type.
+template <typename T>
+using is_base_list_type = is_var_length_list_type<T>;
+
+// DEPRECATED: use enable_if_var_size_list.
+template <typename T, typename R = void>
+using enable_if_base_list = enable_if_var_size_list<T, R>;
+
+template <typename T>
+using is_fixed_size_list_type = std::is_same<FixedSizeListType, T>;
+
+template <typename T, typename R = void>
+using enable_if_fixed_size_list = enable_if_t<is_fixed_size_list_type<T>::value, R>;
+
+template <typename T>
+using is_list_type =
+ std::integral_constant<bool, std::is_same<T, ListType>::value ||
+ std::is_same<T, LargeListType>::value ||
+ std::is_same<T, FixedSizeListType>::value>;
+
+template <typename T, typename R = void>
+using enable_if_list_type = enable_if_t<is_list_type<T>::value, R>;
+
+template <typename T>
+using is_list_like_type =
+ std::integral_constant<bool, is_base_list_type<T>::value ||
+ is_fixed_size_list_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_list_like = enable_if_t<is_list_like_type<T>::value, R>;
+
+template <typename T>
+using is_struct_type = std::is_base_of<StructType, T>;
+
+template <typename T, typename R = void>
+using enable_if_struct = enable_if_t<is_struct_type<T>::value, R>;
+
+template <typename T>
+using is_union_type = std::is_base_of<UnionType, T>;
+
+template <typename T, typename R = void>
+using enable_if_union = enable_if_t<is_union_type<T>::value, R>;
+
+// Temporal Types
+
+template <typename T>
+using is_temporal_type = std::is_base_of<TemporalType, T>;
+
+template <typename T, typename R = void>
+using enable_if_temporal = enable_if_t<is_temporal_type<T>::value, R>;
+
+template <typename T>
+using is_date_type = std::is_base_of<DateType, T>;
+
+template <typename T, typename R = void>
+using enable_if_date = enable_if_t<is_date_type<T>::value, R>;
+
+template <typename T>
+using is_time_type = std::is_base_of<TimeType, T>;
+
+template <typename T, typename R = void>
+using enable_if_time = enable_if_t<is_time_type<T>::value, R>;
+
+template <typename T>
+using is_timestamp_type = std::is_base_of<TimestampType, T>;
+
+template <typename T, typename R = void>
+using enable_if_timestamp = enable_if_t<is_timestamp_type<T>::value, R>;
+
+template <typename T>
+using is_duration_type = std::is_base_of<DurationType, T>;
+
+template <typename T, typename R = void>
+using enable_if_duration = enable_if_t<is_duration_type<T>::value, R>;
+
+template <typename T>
+using is_interval_type = std::is_base_of<IntervalType, T>;
+
+template <typename T, typename R = void>
+using enable_if_interval = enable_if_t<is_interval_type<T>::value, R>;
+
+template <typename T>
+using is_dictionary_type = std::is_base_of<DictionaryType, T>;
+
+template <typename T, typename R = void>
+using enable_if_dictionary = enable_if_t<is_dictionary_type<T>::value, R>;
+
+template <typename T>
+using is_extension_type = std::is_base_of<ExtensionType, T>;
+
+template <typename T, typename R = void>
+using enable_if_extension = enable_if_t<is_extension_type<T>::value, R>;
+
+// Attribute differentiation
+
+template <typename T>
+using is_primitive_ctype = std::is_base_of<PrimitiveCType, T>;
+
+template <typename T, typename R = void>
+using enable_if_primitive_ctype = enable_if_t<is_primitive_ctype<T>::value, R>;
+
+template <typename T>
+using has_c_type = std::integral_constant<bool, is_primitive_ctype<T>::value ||
+ is_temporal_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_has_c_type = enable_if_t<has_c_type<T>::value, R>;
+
+template <typename T>
+using has_string_view =
+ std::integral_constant<bool, std::is_same<BinaryType, T>::value ||
+ std::is_same<LargeBinaryType, T>::value ||
+ std::is_same<StringType, T>::value ||
+ std::is_same<LargeStringType, T>::value ||
+ std::is_same<FixedSizeBinaryType, T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_has_string_view = enable_if_t<has_string_view<T>::value, R>;
+
+template <typename T>
+using is_8bit_int = std::integral_constant<bool, std::is_same<UInt8Type, T>::value ||
+ std::is_same<Int8Type, T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_8bit_int = enable_if_t<is_8bit_int<T>::value, R>;
+
+template <typename T>
+using is_parameter_free_type =
+ std::integral_constant<bool, TypeTraits<T>::is_parameter_free>;
+
+template <typename T, typename R = void>
+using enable_if_parameter_free = enable_if_t<is_parameter_free_type<T>::value, R>;
+
+// Physical representation quirks
+
+template <typename T>
+using is_physical_signed_integer_type =
+ std::integral_constant<bool,
+ is_signed_integer_type<T>::value ||
+ (is_temporal_type<T>::value && has_c_type<T>::value)>;
+
+template <typename T, typename R = void>
+using enable_if_physical_signed_integer =
+ enable_if_t<is_physical_signed_integer_type<T>::value, R>;
+
+template <typename T>
+using is_physical_unsigned_integer_type =
+ std::integral_constant<bool, is_unsigned_integer_type<T>::value ||
+ is_half_float_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_physical_unsigned_integer =
+ enable_if_t<is_physical_unsigned_integer_type<T>::value, R>;
+
+template <typename T>
+using is_physical_integer_type =
+ std::integral_constant<bool, is_physical_unsigned_integer_type<T>::value ||
+ is_physical_signed_integer_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_physical_integer = enable_if_t<is_physical_integer_type<T>::value, R>;
+
+// Like is_floating_type but excluding half-floats which don't have a
+// float-like c type.
+template <typename T>
+using is_physical_floating_type =
+ std::integral_constant<bool,
+ is_floating_type<T>::value && !is_half_float_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_physical_floating_point =
+ enable_if_t<is_physical_floating_type<T>::value, R>;
+
+static inline bool is_integer(Type::type type_id) {
+ switch (type_id) {
+ case Type::UINT8:
+ case Type::INT8:
+ case Type::UINT16:
+ case Type::INT16:
+ case Type::UINT32:
+ case Type::INT32:
+ case Type::UINT64:
+ case Type::INT64:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline bool is_signed_integer(Type::type type_id) {
+ switch (type_id) {
+ case Type::INT8:
+ case Type::INT16:
+ case Type::INT32:
+ case Type::INT64:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline bool is_unsigned_integer(Type::type type_id) {
+ switch (type_id) {
+ case Type::UINT8:
+ case Type::UINT16:
+ case Type::UINT32:
+ case Type::UINT64:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline bool is_floating(Type::type type_id) {
+ switch (type_id) {
+ case Type::HALF_FLOAT:
+ case Type::FLOAT:
+ case Type::DOUBLE:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline bool is_decimal(Type::type type_id) {
+ switch (type_id) {
+ case Type::DECIMAL128:
+ case Type::DECIMAL256:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline bool is_primitive(Type::type type_id) {
+ switch (type_id) {
+ case Type::BOOL:
+ case Type::UINT8:
+ case Type::INT8:
+ case Type::UINT16:
+ case Type::INT16:
+ case Type::UINT32:
+ case Type::INT32:
+ case Type::UINT64:
+ case Type::INT64:
+ case Type::HALF_FLOAT:
+ case Type::FLOAT:
+ case Type::DOUBLE:
+ case Type::DATE32:
+ case Type::DATE64:
+ case Type::TIME32:
+ case Type::TIME64:
+ case Type::TIMESTAMP:
+ case Type::DURATION:
+ case Type::INTERVAL_MONTHS:
+ case Type::INTERVAL_DAY_TIME:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline bool is_base_binary_like(Type::type type_id) {
+ switch (type_id) {
+ case Type::BINARY:
+ case Type::LARGE_BINARY:
+ case Type::STRING:
+ case Type::LARGE_STRING:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline bool is_binary_like(Type::type type_id) {
+ switch (type_id) {
+ case Type::BINARY:
+ case Type::STRING:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline bool is_large_binary_like(Type::type type_id) {
+ switch (type_id) {
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline bool is_dictionary(Type::type type_id) {
+ return type_id == Type::DICTIONARY;
+}
+
+static inline bool is_fixed_size_binary(Type::type type_id) {
+ switch (type_id) {
+ case Type::DECIMAL128:
+ case Type::DECIMAL256:
+ case Type::FIXED_SIZE_BINARY:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline bool is_fixed_width(Type::type type_id) {
+ return is_primitive(type_id) || is_dictionary(type_id) || is_fixed_size_binary(type_id);
+}
+
+static inline int bit_width(Type::type type_id) {
+ switch (type_id) {
+ case Type::BOOL:
+ return 1;
+ case Type::UINT8:
+ case Type::INT8:
+ return 8;
+ case Type::UINT16:
+ case Type::INT16:
+ return 16;
+ case Type::UINT32:
+ case Type::INT32:
+ case Type::DATE32:
+ case Type::TIME32:
+ return 32;
+ case Type::UINT64:
+ case Type::INT64:
+ case Type::DATE64:
+ case Type::TIME64:
+ case Type::TIMESTAMP:
+ case Type::DURATION:
+ return 64;
+
+ case Type::HALF_FLOAT:
+ return 16;
+ case Type::FLOAT:
+ return 32;
+ case Type::DOUBLE:
+ return 64;
+
+ case Type::INTERVAL_MONTHS:
+ return 32;
+ case Type::INTERVAL_DAY_TIME:
+ return 64;
+
+ case Type::DECIMAL128:
+ return 128;
+ case Type::DECIMAL256:
+ return 256;
+
+ default:
+ break;
+ }
+ return 0;
+}
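+
+// For example (illustrative): sizing a fixed-width buffer from a type id,
+// using BitUtil::BytesForBits from arrow/util/bit_util.h:
+//   int64_t nbytes = BitUtil::BytesForBits(bit_width(Type::INT32) * num_values);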
+
+static inline bool is_nested(Type::type type_id) {
+ switch (type_id) {
+ case Type::LIST:
+ case Type::LARGE_LIST:
+ case Type::FIXED_SIZE_LIST:
+ case Type::MAP:
+ case Type::STRUCT:
+ case Type::SPARSE_UNION:
+ case Type::DENSE_UNION:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static inline int offset_bit_width(Type::type type_id) {
+ switch (type_id) {
+ case Type::STRING:
+ case Type::BINARY:
+ case Type::LIST:
+ case Type::MAP:
+ case Type::DENSE_UNION:
+ return 32;
+ case Type::LARGE_STRING:
+ case Type::LARGE_BINARY:
+ case Type::LARGE_LIST:
+ return 64;
+ default:
+ break;
+ }
+ return 0;
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h
new file mode 100644
index 00000000000..2a0e6ba709d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/result.h"
+
+namespace arrow {
+
+template <typename InputIterator, typename OutputIterator, typename UnaryOperation>
+Status MaybeTransform(InputIterator first, InputIterator last, OutputIterator out,
+ UnaryOperation unary_op) {
+ for (; first != last; ++first, (void)++out) {
+ ARROW_ASSIGN_OR_RAISE(*out, unary_op(*first));
+ }
+ return Status::OK();
+}
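+
+// Example usage (illustrative; ParseInt is a hypothetical function returning
+// Result<int>):
+//
+//   std::vector<std::string> in{"1", "2", "3"};
+//   std::vector<int> out(in.size());
+//   ARROW_RETURN_NOT_OK(MaybeTransform(in.begin(), in.end(), out.begin(), ParseInt));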
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/align_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/align_util.h
new file mode 100644
index 00000000000..4c25a1a17b8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/align_util.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+namespace internal {
+
+struct BitmapWordAlignParams {
+ int64_t leading_bits;
+ int64_t trailing_bits;
+ int64_t trailing_bit_offset;
+ const uint8_t* aligned_start;
+ int64_t aligned_bits;
+ int64_t aligned_words;
+};
+
+// Compute parameters for accessing a bitmap using aligned word instructions.
+// The returned parameters describe:
+// - a leading area of size `leading_bits` before the aligned words
+// - a word-aligned area of size `aligned_bits`
+// - a trailing area of size `trailing_bits` after the aligned words
+template <uint64_t ALIGN_IN_BYTES>
+inline BitmapWordAlignParams BitmapWordAlign(const uint8_t* data, int64_t bit_offset,
+ int64_t length) {
+ static_assert(BitUtil::IsPowerOf2(ALIGN_IN_BYTES),
+ "ALIGN_IN_BYTES should be a positive power of two");
+ constexpr uint64_t ALIGN_IN_BITS = ALIGN_IN_BYTES * 8;
+
+ BitmapWordAlignParams p;
+
+ // Compute a "bit address" that we can align up to ALIGN_IN_BITS.
+ // We don't care about losing the upper bits since we are only interested in the
+ // difference between both addresses.
+ const uint64_t bit_addr =
+ reinterpret_cast<size_t>(data) * 8 + static_cast<uint64_t>(bit_offset);
+ const uint64_t aligned_bit_addr = BitUtil::RoundUpToPowerOf2(bit_addr, ALIGN_IN_BITS);
+
+ p.leading_bits = std::min<int64_t>(length, aligned_bit_addr - bit_addr);
+ p.aligned_words = (length - p.leading_bits) / ALIGN_IN_BITS;
+ p.aligned_bits = p.aligned_words * ALIGN_IN_BITS;
+ p.trailing_bits = length - p.leading_bits - p.aligned_bits;
+ p.trailing_bit_offset = bit_offset + p.leading_bits + p.aligned_bits;
+
+ p.aligned_start = data + (bit_offset + p.leading_bits) / 8;
+ return p;
+}
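+
+// Consumption sketch (illustrative, assuming ALIGN_IN_BYTES == 8): count set
+// bits by handling the three areas separately, with BitUtil::GetBit and
+// BitUtil::PopCount assumed available from arrow/util/bit_util.h:
+//
+//   auto p = BitmapWordAlign<8>(data, bit_offset, length);
+//   int64_t n = 0;
+//   for (int64_t i = 0; i < p.leading_bits; ++i) {
+//     n += BitUtil::GetBit(data, bit_offset + i);
+//   }
+//   const uint64_t* words = reinterpret_cast<const uint64_t*>(p.aligned_start);
+//   for (int64_t i = 0; i < p.aligned_words; ++i) {
+//     n += BitUtil::PopCount(words[i]);
+//   }
+//   for (int64_t i = 0; i < p.trailing_bits; ++i) {
+//     n += BitUtil::GetBit(data, p.trailing_bit_offset + i);
+//   }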
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h
new file mode 100644
index 00000000000..9d1021edff5
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h
@@ -0,0 +1,1614 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <queue>
+
+#include "arrow/util/functional.h"
+#include "arrow/util/future.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/mutex.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/queue.h"
+#include "arrow/util/thread_pool.h"
+
+namespace arrow {
+
+// The methods in this file create, modify, and utilize AsyncGenerator, which is an
+// iterator of futures. This allows an asynchronous source (like file input) to be run
+// through a pipeline in the same way that iterators can be used to create pipelined
+// workflows.
+//
+// In order to support pipeline parallelism we introduce the concept of asynchronous
+// reentrancy. This is different from synchronous reentrancy. With synchronous code, a
+// function is reentrant if it can be called again while a previous call to that
+// function is still running. Unless otherwise specified, none of these generators are
+// synchronously reentrant. Care should be taken to avoid calling them in such a way
+// (and the utilities Visit/Collect/Await take care to do this).
+//
+// Asynchronous reentrancy, on the other hand, means the function is called again before
+// the future returned by the function is marked finished (but after the call to get the
+// future returns). Some of these generators are async-reentrant while others (e.g.
+// those that depend on ordered processing like decompression) are not. Read the MakeXYZ
+// function comments to determine which generators support async reentrancy.
+//
+// Note: Generators that are not asynchronously reentrant can still support readahead
+// (\see MakeSerialReadaheadGenerator).
+//
+// Readahead operators, and some other operators, may introduce queueing. Any operators
+// that introduce buffering should detail the amount of buffering they introduce in their
+// MakeXYZ function comments.
+template <typename T>
+using AsyncGenerator = std::function<Future<T>()>;
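+
+// To illustrate the distinction (sketch):
+//
+//   auto fut1 = gen();
+//   auto fut2 = gen();  // valid only if gen is async-reentrant
+//
+// A non-async-reentrant generator must instead be polled again only after the
+// previously returned future has finished (e.g. from its callback).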
+
+template <typename T>
+struct IterationTraits<AsyncGenerator<T>> {
+ /// \brief By default, when iterating through a sequence of AsyncGenerator<T>,
+ /// an empty function indicates the end of iteration.
+ static AsyncGenerator<T> End() { return AsyncGenerator<T>(); }
+
+ static bool IsEnd(const AsyncGenerator<T>& val) { return !val; }
+};
+
+template <typename T>
+Future<T> AsyncGeneratorEnd() {
+ return Future<T>::MakeFinished(IterationTraits<T>::End());
+}
+
+/// \brief Visits each item of an async generator with a visitor function,
+/// returning a future that completes when all have been visited
+template <typename T, typename Visitor>
+Future<> VisitAsyncGenerator(AsyncGenerator<T> generator, Visitor visitor) {
+ struct LoopBody {
+ struct Callback {
+ Result<ControlFlow<>> operator()(const T& next) {
+ if (IsIterationEnd(next)) {
+ return Break();
+ } else {
+ auto visited = visitor(next);
+ if (visited.ok()) {
+ return Continue();
+ } else {
+ return visited;
+ }
+ }
+ }
+
+ Visitor visitor;
+ };
+
+ Future<ControlFlow<>> operator()() {
+ Callback callback{visitor};
+ auto next = generator();
+ return next.Then(std::move(callback));
+ }
+
+ AsyncGenerator<T> generator;
+ Visitor visitor;
+ };
+
+ return Loop(LoopBody{std::move(generator), std::move(visitor)});
+}
+
+/// \brief Waits for an async generator to complete, discarding results.
+template <typename T>
+Future<> DiscardAllFromAsyncGenerator(AsyncGenerator<T> generator) {
+ std::function<Status(T)> visitor = [](const T&) { return Status::OK(); };
+ return VisitAsyncGenerator(generator, visitor);
+}
+
+/// \brief Collects the results of an async generator into a vector
+template <typename T>
+Future<std::vector<T>> CollectAsyncGenerator(AsyncGenerator<T> generator) {
+ auto vec = std::make_shared<std::vector<T>>();
+ struct LoopBody {
+ Future<ControlFlow<std::vector<T>>> operator()() {
+ auto next = generator_();
+ auto vec = vec_;
+ return next.Then([vec](const T& result) -> Result<ControlFlow<std::vector<T>>> {
+ if (IsIterationEnd(result)) {
+ return Break(*vec);
+ } else {
+ vec->push_back(result);
+ return Continue();
+ }
+ });
+ }
+ AsyncGenerator<T> generator_;
+ std::shared_ptr<std::vector<T>> vec_;
+ };
+ return Loop(LoopBody{std::move(generator), std::move(vec)});
+}
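+
+// Example usage (illustrative, with MakeVectorGenerator defined later in this
+// file):
+//
+//   auto gen = MakeVectorGenerator(std::vector<int>{1, 2, 3});
+//   Future<std::vector<int>> all = CollectAsyncGenerator(std::move(gen));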
+
+/// \see MakeMappedGenerator
+template <typename T, typename V>
+class MappingGenerator {
+ public:
+ MappingGenerator(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
+ : state_(std::make_shared<State>(std::move(source), std::move(map))) {}
+
+ Future<V> operator()() {
+ auto future = Future<V>::Make();
+ bool should_trigger;
+ {
+ auto guard = state_->mutex.Lock();
+ if (state_->finished) {
+ return AsyncGeneratorEnd<V>();
+ }
+ should_trigger = state_->waiting_jobs.empty();
+ state_->waiting_jobs.push_back(future);
+ }
+ if (should_trigger) {
+ state_->source().AddCallback(Callback{state_});
+ }
+ return future;
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
+ : source(std::move(source)),
+ map(std::move(map)),
+ waiting_jobs(),
+ mutex(),
+ finished(false) {}
+
+ void Purge() {
+ // This might be called by an original callback (if the source iterator fails or
+ // ends) or by a mapped callback (if the map function fails or ends prematurely).
+ // Either way it should only be called once and after finished is set so there is no
+ // need to guard access to `waiting_jobs`.
+ while (!waiting_jobs.empty()) {
+ waiting_jobs.front().MarkFinished(IterationTraits<V>::End());
+ waiting_jobs.pop_front();
+ }
+ }
+
+ AsyncGenerator<T> source;
+ std::function<Future<V>(const T&)> map;
+ std::deque<Future<V>> waiting_jobs;
+ util::Mutex mutex;
+ bool finished;
+ };
+
+ struct Callback;
+
+ struct MappedCallback {
+ void operator()(const Result<V>& maybe_next) {
+ bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
+ bool should_purge = false;
+ if (end) {
+ {
+ auto guard = state->mutex.Lock();
+ should_purge = !state->finished;
+ state->finished = true;
+ }
+ }
+ sink.MarkFinished(maybe_next);
+ if (should_purge) {
+ state->Purge();
+ }
+ }
+ std::shared_ptr<State> state;
+ Future<V> sink;
+ };
+
+ struct Callback {
+ void operator()(const Result<T>& maybe_next) {
+ Future<V> sink;
+ bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
+ bool should_purge = false;
+ bool should_trigger;
+ {
+ auto guard = state->mutex.Lock();
+ if (end) {
+ should_purge = !state->finished;
+ state->finished = true;
+ }
+ sink = state->waiting_jobs.front();
+ state->waiting_jobs.pop_front();
+ should_trigger = !end && !state->waiting_jobs.empty();
+ }
+ if (should_purge) {
+ state->Purge();
+ }
+ if (should_trigger) {
+ state->source().AddCallback(Callback{state});
+ }
+ if (maybe_next.ok()) {
+ const T& val = maybe_next.ValueUnsafe();
+ if (IsIterationEnd(val)) {
+ sink.MarkFinished(IterationTraits<V>::End());
+ } else {
+ Future<V> mapped_fut = state->map(val);
+ mapped_fut.AddCallback(MappedCallback{std::move(state), std::move(sink)});
+ }
+ } else {
+ sink.MarkFinished(maybe_next.status());
+ }
+ }
+
+ std::shared_ptr<State> state;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief Creates a generator that will apply the map function to each element of
+/// source. The map function is not called on the end token.
+///
+/// Note: This function makes a copy of `map` for each item
+/// Note: Errors returned from the `map` function will be propagated
+///
+/// If the source generator is async-reentrant, then this generator will be as well
+template <typename T, typename MapFn,
+ typename Mapped = detail::result_of_t<MapFn(const T&)>,
+ typename V = typename EnsureFuture<Mapped>::type::ValueType>
+AsyncGenerator<V> MakeMappedGenerator(AsyncGenerator<T> source_generator, MapFn map) {
+ struct MapCallback {
+ MapFn map_;
+
+ Future<V> operator()(const T& val) { return ToFuture(map_(val)); }
+ };
+
+ return MappingGenerator<T, V>(std::move(source_generator), MapCallback{std::move(map)});
+}
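+
+// Example usage (illustrative; Decode is a hypothetical function with
+// signature Result<std::string>(const int&)):
+//
+//   AsyncGenerator<std::string> decoded =
+//       MakeMappedGenerator(std::move(int_gen), Decode);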
+
+/// \see MakeSequencingGenerator
+template <typename T, typename ComesAfter, typename IsNext>
+class SequencingGenerator {
+ public:
+ SequencingGenerator(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next,
+ T initial_value)
+ : state_(std::make_shared<State>(std::move(source), std::move(compare),
+ std::move(is_next), std::move(initial_value))) {}
+
+ Future<T> operator()() {
+ {
+ auto guard = state_->mutex.Lock();
+ // We can send a result immediately if the top of the queue is either an
+ // error or the next item
+ if (!state_->queue.empty() &&
+ (!state_->queue.top().ok() ||
+ state_->is_next(state_->previous_value, *state_->queue.top()))) {
+ auto result = std::move(state_->queue.top());
+ if (result.ok()) {
+ state_->previous_value = *result;
+ }
+ state_->queue.pop();
+ return Future<T>::MakeFinished(result);
+ }
+ if (state_->finished) {
+ return AsyncGeneratorEnd<T>();
+ }
+ // The next item is not in the queue so we will need to wait
+ auto new_waiting_fut = Future<T>::Make();
+ state_->waiting_future = new_waiting_fut;
+ guard.Unlock();
+ state_->source().AddCallback(Callback{state_});
+ return new_waiting_fut;
+ }
+ }
+
+ private:
+ struct WrappedComesAfter {
+ bool operator()(const Result<T>& left, const Result<T>& right) {
+ if (!left.ok() || !right.ok()) {
+ // Should never happen
+ return false;
+ }
+ return compare(*left, *right);
+ }
+ ComesAfter compare;
+ };
+
+ struct State {
+ State(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next, T initial_value)
+ : source(std::move(source)),
+ is_next(std::move(is_next)),
+ previous_value(std::move(initial_value)),
+ waiting_future(),
+ queue(WrappedComesAfter{compare}),
+ finished(false),
+ mutex() {}
+
+ AsyncGenerator<T> source;
+ IsNext is_next;
+ T previous_value;
+ Future<T> waiting_future;
+ std::priority_queue<Result<T>, std::vector<Result<T>>, WrappedComesAfter> queue;
+ bool finished;
+ util::Mutex mutex;
+ };
+
+ class Callback {
+ public:
+ explicit Callback(std::shared_ptr<State> state) : state_(std::move(state)) {}
+
+ void operator()(const Result<T> result) {
+ Future<T> to_deliver;
+ bool finished;
+ {
+ auto guard = state_->mutex.Lock();
+ bool ready_to_deliver = false;
+ if (!result.ok()) {
+ // Clear any cached results
+ while (!state_->queue.empty()) {
+ state_->queue.pop();
+ }
+ ready_to_deliver = true;
+ state_->finished = true;
+ } else if (IsIterationEnd<T>(result.ValueUnsafe())) {
+ ready_to_deliver = state_->queue.empty();
+ state_->finished = true;
+ } else {
+ ready_to_deliver = state_->is_next(state_->previous_value, *result);
+ }
+
+ if (ready_to_deliver && state_->waiting_future.is_valid()) {
+ to_deliver = state_->waiting_future;
+ if (result.ok()) {
+ state_->previous_value = *result;
+ }
+ } else {
+ state_->queue.push(result);
+ }
+ // Capture state_->finished so we can access it outside the mutex
+ finished = state_->finished;
+ }
+ // Must deliver result outside of the mutex
+ if (to_deliver.is_valid()) {
+ to_deliver.MarkFinished(result);
+ } else {
+ // Otherwise, if we didn't get the next item (or a terminal item), we
+ // need to keep looking
+ if (!finished) {
+ state_->source().AddCallback(Callback{state_});
+ }
+ }
+ }
+
+ private:
+ const std::shared_ptr<State> state_;
+ };
+
+ const std::shared_ptr<State> state_;
+};
+
+/// \brief Buffers an AsyncGenerator to return values in sequence order.
+/// ComesAfter and IsNext determine the sequence order.
+///
+/// ComesAfter should be a BinaryPredicate that returns true only if `a` comes after `b`
+///
+/// IsNext should be a BinaryPredicate that returns true, given `a` and `b`, only if
+/// `b` follows immediately after `a`. It should return true given `initial_value` and
+/// `b` if `b` is the first item in the sequence.
+///
+/// This operator will queue unboundedly while waiting for the next item. It is intended
+/// for jittery sources that might scatter an ordered sequence. It is NOT intended to
+/// sort. Using it to try to sort could result in excessive RAM usage. This generator
+/// will queue up to N blocks, where N is the maximum "out-of-order-ness" of the source.
+///
+/// For example, if the source is 1,6,2,5,4,3 it will queue 3 blocks because 3 is 3
+/// blocks beyond where it belongs.
+///
+/// This generator is not async-reentrant but it consists only of a simple log(n)
+/// insertion into a priority queue.
+template <typename T, typename ComesAfter, typename IsNext>
+AsyncGenerator<T> MakeSequencingGenerator(AsyncGenerator<T> source_generator,
+ ComesAfter compare, IsNext is_next,
+ T initial_value) {
+ return SequencingGenerator<T, ComesAfter, IsNext>(
+ std::move(source_generator), std::move(compare), std::move(is_next),
+ std::move(initial_value));
+}
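+
+// Example usage (illustrative): restoring the order of a jittery stream of
+// consecutive integers starting at 1:
+//
+//   auto ordered = MakeSequencingGenerator(
+//       std::move(jittery), /*compare=*/[](int a, int b) { return a > b; },
+//       /*is_next=*/[](int prev, int next) { return next == prev + 1; },
+//       /*initial_value=*/0);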
+
+/// \see MakeTransformedGenerator
+template <typename T, typename V>
+class TransformingGenerator {
+ // The transforming generator state will be referenced as an async generator but will
+ // also be referenced via callback to various futures. If the async generator owner
+ // moves it around we need the state to be consistent for future callbacks.
+ struct TransformingGeneratorState
+ : std::enable_shared_from_this<TransformingGeneratorState> {
+ TransformingGeneratorState(AsyncGenerator<T> generator, Transformer<T, V> transformer)
+ : generator_(std::move(generator)),
+ transformer_(std::move(transformer)),
+ last_value_(),
+ finished_() {}
+
+ Future<V> operator()() {
+ while (true) {
+ auto maybe_next_result = Pump();
+ if (!maybe_next_result.ok()) {
+ return Future<V>::MakeFinished(maybe_next_result.status());
+ }
+ auto maybe_next = std::move(maybe_next_result).ValueUnsafe();
+ if (maybe_next.has_value()) {
+ return Future<V>::MakeFinished(*std::move(maybe_next));
+ }
+
+ auto next_fut = generator_();
+ // If finished already, process results immediately inside the loop to avoid
+ // stack overflow
+ if (next_fut.is_finished()) {
+ auto next_result = next_fut.result();
+ if (next_result.ok()) {
+ last_value_ = *next_result;
+ } else {
+ return Future<V>::MakeFinished(next_result.status());
+ }
+ // Otherwise, if not finished immediately, add callback to process results
+ } else {
+ auto self = this->shared_from_this();
+ return next_fut.Then([self](const T& next_result) {
+ self->last_value_ = next_result;
+ return (*self)();
+ });
+ }
+ }
+ }
+
+ // See comment on TransformingIterator::Pump
+ Result<util::optional<V>> Pump() {
+ if (!finished_ && last_value_.has_value()) {
+ ARROW_ASSIGN_OR_RAISE(TransformFlow<V> next, transformer_(*last_value_));
+ if (next.ReadyForNext()) {
+ if (IsIterationEnd(*last_value_)) {
+ finished_ = true;
+ }
+ last_value_.reset();
+ }
+ if (next.Finished()) {
+ finished_ = true;
+ }
+ if (next.HasValue()) {
+ return next.Value();
+ }
+ }
+ if (finished_) {
+ return IterationTraits<V>::End();
+ }
+ return util::nullopt;
+ }
+
+ AsyncGenerator<T> generator_;
+ Transformer<T, V> transformer_;
+ util::optional<T> last_value_;
+ bool finished_;
+ };
+
+ public:
+ explicit TransformingGenerator(AsyncGenerator<T> generator,
+ Transformer<T, V> transformer)
+ : state_(std::make_shared<TransformingGeneratorState>(std::move(generator),
+ std::move(transformer))) {}
+
+ Future<V> operator()() { return (*state_)(); }
+
+ protected:
+ std::shared_ptr<TransformingGeneratorState> state_;
+};
+
+/// \brief Transforms an async generator using a transformer function, returning a new
+/// AsyncGenerator
+///
+/// The transform function here behaves exactly the same as the transform function in
+/// MakeTransformedIterator and you can safely use the same transform function to
+/// transform both synchronous and asynchronous streams.
+///
+/// This generator is not async-reentrant
+///
+/// This generator may queue up to 1 instance of T but will not delay
+template <typename T, typename V>
+AsyncGenerator<V> MakeTransformedGenerator(AsyncGenerator<T> generator,
+ Transformer<T, V> transformer) {
+ return TransformingGenerator<T, V>(generator, transformer);
+}
+
+/// \see MakeSerialReadaheadGenerator
+template <typename T>
+class SerialReadaheadGenerator {
+ public:
+ SerialReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
+ : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
+
+ Future<T> operator()() {
+ if (state_->first_) {
+ // Lazy generator, need to wait for the first ask to prime the pump
+ state_->first_ = false;
+ auto next = state_->source_();
+ return next.Then(Callback{state_}, ErrCallback{state_});
+ }
+
+ // This generator is not async-reentrant. We won't be called until the last
+ // future has finished, so we know there is something in the queue
+ auto finished = state_->finished_.load();
+ if (finished && state_->readahead_queue_.IsEmpty()) {
+ return AsyncGeneratorEnd<T>();
+ }
+
+ std::shared_ptr<Future<T>> next;
+ if (!state_->readahead_queue_.Read(next)) {
+ return Status::UnknownError("Could not read from readahead_queue");
+ }
+
+ auto last_available = state_->spaces_available_.fetch_add(1);
+ if (last_available == 0 && !finished) {
+ // Reader idled out, we need to restart it
+ ARROW_RETURN_NOT_OK(state_->Pump(state_));
+ }
+ return *next;
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source, int max_readahead)
+ : first_(true),
+ source_(std::move(source)),
+ finished_(false),
+ // There is one extra "space" for the in-flight request
+ spaces_available_(max_readahead + 1),
+ // The SPSC queue has size-1 "usable" slots so we need to overallocate 1
+ readahead_queue_(max_readahead + 1) {}
+
+ Status Pump(const std::shared_ptr<State>& self) {
+ // Can't do readahead_queue_.Write(source_().Then(...)) because then the
+ // callback might run immediately and add itself to the queue before this gets
+ // added to the queue, messing up the order.
+ auto next_slot = std::make_shared<Future<T>>();
+ auto written = readahead_queue_.Write(next_slot);
+ if (!written) {
+ return Status::UnknownError("Could not write to readahead_queue");
+ }
+ // If this Pump is being called from a callback it is possible for the source to
+ // poll and read from the queue between the Write and this spot where we fill the
+ // value in. However, it is not possible for the future to read this value we are
+ // writing. That is because this callback (the callback for future X) must be
+ // finished before future X is marked complete and this source is not pulled
+ // reentrantly so it will not poll for future X+1 until this callback has completed.
+ *next_slot = source_().Then(Callback{self}, ErrCallback{self});
+ return Status::OK();
+ }
+
+ // Only accessed by the consumer end
+ bool first_;
+ // Accessed by both threads
+ AsyncGenerator<T> source_;
+ std::atomic<bool> finished_;
+ // The queue has a size but it is not atomic. We keep track of how many spaces are
+ // left in the queue here so we know if we've just written the last value and we need
+ // to stop reading ahead or if we've just read from a full queue and we need to
+ // restart reading ahead
+ std::atomic<uint32_t> spaces_available_;
+ // Needs to be a queue of shared_ptr and not Future because we set the value of the
+ // future after we add it to the queue
+ util::SpscQueue<std::shared_ptr<Future<T>>> readahead_queue_;
+ };
+
+ struct Callback {
+ Result<T> operator()(const T& next) {
+ if (IsIterationEnd(next)) {
+ state_->finished_.store(true);
+ return next;
+ }
+ auto last_available = state_->spaces_available_.fetch_sub(1);
+ if (last_available > 1) {
+ ARROW_RETURN_NOT_OK(state_->Pump(state_));
+ }
+ return next;
+ }
+
+ std::shared_ptr<State> state_;
+ };
+
+ struct ErrCallback {
+ Result<T> operator()(const Status& st) {
+ state_->finished_.store(true);
+ return st;
+ }
+
+ std::shared_ptr<State> state_;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \see MakeFromFuture
+template <typename T>
+class FutureFirstGenerator {
+ public:
+ explicit FutureFirstGenerator(Future<AsyncGenerator<T>> future)
+ : state_(std::make_shared<State>(std::move(future))) {}
+
+ Future<T> operator()() {
+ if (state_->source_) {
+ return state_->source_();
+ } else {
+ auto state = state_;
+ return state_->future_.Then([state](const AsyncGenerator<T>& source) {
+ state->source_ = source;
+ return state->source_();
+ });
+ }
+ }
+
+ private:
+ struct State {
+ explicit State(Future<AsyncGenerator<T>> future) : future_(future), source_() {}
+
+ Future<AsyncGenerator<T>> future_;
+ AsyncGenerator<T> source_;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief Transforms a Future<AsyncGenerator<T>> into an AsyncGenerator<T>
+/// that waits for the future to complete as part of the first item.
+///
+/// This generator is not async-reentrant (even if the generator yielded by future is)
+///
+/// This generator does not queue
+template <typename T>
+AsyncGenerator<T> MakeFromFuture(Future<AsyncGenerator<T>> future) {
+ return FutureFirstGenerator<T>(std::move(future));
+}
+
+/// \brief Creates a generator that will pull from the source into a queue. Unlike
+/// MakeReadaheadGenerator this will not pull reentrantly from the source.
+///
+/// The source generator does not need to be async-reentrant
+///
+/// This generator is not async-reentrant (even if the source is)
+///
+/// This generator may queue up to max_readahead additional instances of T
+template <typename T>
+AsyncGenerator<T> MakeSerialReadaheadGenerator(AsyncGenerator<T> source_generator,
+ int max_readahead) {
+ return SerialReadaheadGenerator<T>(std::move(source_generator), max_readahead);
+}
+
+/// \see MakeReadaheadGenerator
+template <typename T>
+class ReadaheadGenerator {
+ public:
+ ReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
+ : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
+
+ Future<T> AddMarkFinishedContinuation(Future<T> fut) {
+ auto state = state_;
+ return fut.Then(
+ [state](const T& result) -> Result<T> {
+ state->MarkFinishedIfDone(result);
+ return result;
+ },
+ [state](const Status& err) -> Result<T> {
+ state->finished.store(true);
+ return err;
+ });
+ }
+
+ Future<T> operator()() {
+ if (state_->readahead_queue.empty()) {
+ // This is the first request, let's pump the underlying queue
+ for (int i = 0; i < state_->max_readahead; i++) {
+ auto next = state_->source_generator();
+ auto next_after_check = AddMarkFinishedContinuation(std::move(next));
+ state_->readahead_queue.push(std::move(next_after_check));
+ }
+ }
+ // Pop one and add one
+ auto result = state_->readahead_queue.front();
+ state_->readahead_queue.pop();
+ if (state_->finished.load()) {
+ state_->readahead_queue.push(AsyncGeneratorEnd<T>());
+ } else {
+ auto back_of_queue = state_->source_generator();
+ auto back_of_queue_after_check =
+ AddMarkFinishedContinuation(std::move(back_of_queue));
+ state_->readahead_queue.push(std::move(back_of_queue_after_check));
+ }
+ return result;
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source_generator, int max_readahead)
+ : source_generator(std::move(source_generator)), max_readahead(max_readahead) {
+ finished.store(false);
+ }
+
+ void MarkFinishedIfDone(const T& next_result) {
+ if (IsIterationEnd(next_result)) {
+ finished.store(true);
+ }
+ }
+
+ AsyncGenerator<T> source_generator;
+ int max_readahead;
+ std::atomic<bool> finished;
+ std::queue<Future<T>> readahead_queue;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief A generator where the producer pushes items on a queue.
+///
+/// No back-pressure is applied, so this generator is mostly useful when
+/// producing the values is neither CPU- nor memory-expensive (e.g. fetching
+/// filesystem metadata).
+///
+/// This generator is not async-reentrant.
+template <typename T>
+class PushGenerator {
+ struct State {
+ util::Mutex mutex;
+ std::deque<Result<T>> result_q;
+ util::optional<Future<T>> consumer_fut;
+ bool finished = false;
+ };
+
+ public:
+ /// Producer API for PushGenerator
+ class Producer {
+ public:
+ explicit Producer(const std::shared_ptr<State>& state) : weak_state_(state) {}
+
+ /// \brief Push a value on the queue
+ ///
+ /// True is returned if the value was pushed, false if the generator is
+ /// already closed or destroyed. If the latter, it is recommended to stop
+ /// producing any further values.
+ bool Push(Result<T> result) {
+ auto state = weak_state_.lock();
+ if (!state) {
+ // Generator was destroyed
+ return false;
+ }
+ auto lock = state->mutex.Lock();
+ if (state->finished) {
+ // Closed early
+ return false;
+ }
+ if (state->consumer_fut.has_value()) {
+ auto fut = std::move(state->consumer_fut.value());
+ state->consumer_fut.reset();
+ lock.Unlock(); // unlock before potentially invoking a callback
+ fut.MarkFinished(std::move(result));
+ } else {
+ state->result_q.push_back(std::move(result));
+ }
+ return true;
+ }
+
+ /// \brief Tell the consumer we have finished producing
+ ///
+ /// It is allowed to call this and later call Push() again ("early close").
+ /// In this case, calls to Push() after the queue is closed are silently
+ /// ignored. This can help implementing non-trivial cancellation cases.
+ ///
+ /// True is returned on success, false if the generator is already closed
+ /// or destroyed.
+ bool Close() {
+ auto state = weak_state_.lock();
+ if (!state) {
+ // Generator was destroyed
+ return false;
+ }
+ auto lock = state->mutex.Lock();
+ if (state->finished) {
+ // Already closed
+ return false;
+ }
+ state->finished = true;
+ if (state->consumer_fut.has_value()) {
+ auto fut = std::move(state->consumer_fut.value());
+ state->consumer_fut.reset();
+ lock.Unlock(); // unlock before potentially invoking a callback
+ fut.MarkFinished(IterationTraits<T>::End());
+ }
+ return true;
+ }
+
+ /// Return whether the generator was closed or destroyed.
+ bool is_closed() const {
+ auto state = weak_state_.lock();
+ if (!state) {
+ // Generator was destroyed
+ return true;
+ }
+ auto lock = state->mutex.Lock();
+ return state->finished;
+ }
+
+ private:
+ const std::weak_ptr<State> weak_state_;
+ };
+
+ PushGenerator() : state_(std::make_shared<State>()) {}
+
+ /// Read an item from the queue
+ Future<T> operator()() {
+ auto lock = state_->mutex.Lock();
+ assert(!state_->consumer_fut.has_value()); // Non-reentrant
+ if (!state_->result_q.empty()) {
+ auto fut = Future<T>::MakeFinished(std::move(state_->result_q.front()));
+ state_->result_q.pop_front();
+ return fut;
+ }
+ if (state_->finished) {
+ return AsyncGeneratorEnd<T>();
+ }
+ auto fut = Future<T>::Make();
+ state_->consumer_fut = fut;
+ return fut;
+ }
+
+ /// \brief Return producer-side interface
+ ///
+ /// The returned object must be used by the producer to push values on the queue.
+ /// Only a single Producer object should be instantiated.
+ Producer producer() { return Producer{state_}; }
+
+ private:
+ const std::shared_ptr<State> state_;
+};
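+
+// Example usage (illustrative):
+//
+//   PushGenerator<int> gen;
+//   auto producer = gen.producer();
+//   producer.Push(1);            // queued, or delivered to a waiting pull
+//   Future<int> next = gen();    // finished future holding 1
+//   producer.Close();            // subsequent pulls yield the end token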
+
+/// \brief Creates a generator that pulls reentrantly from a source.
+/// This generator will pull reentrantly from a source, ensuring that max_readahead
+/// requests are active at any given time.
+///
+/// The source generator must be async-reentrant
+///
+/// This generator itself is async-reentrant.
+///
+/// This generator may queue up to max_readahead instances of T
+template <typename T>
+AsyncGenerator<T> MakeReadaheadGenerator(AsyncGenerator<T> source_generator,
+ int max_readahead) {
+ return ReadaheadGenerator<T>(std::move(source_generator), max_readahead);
+}
+
+/// \brief Creates a generator that will yield finished futures from a vector
+///
+/// This generator is async-reentrant
+template <typename T>
+AsyncGenerator<T> MakeVectorGenerator(std::vector<T> vec) {
+ struct State {
+ explicit State(std::vector<T> vec_) : vec(std::move(vec_)), vec_idx(0) {}
+
+ std::vector<T> vec;
+ std::atomic<std::size_t> vec_idx;
+ };
+
+ auto state = std::make_shared<State>(std::move(vec));
+ return [state]() {
+ auto idx = state->vec_idx.fetch_add(1);
+ if (idx >= state->vec.size()) {
+ // Eagerly return memory
+ state->vec.clear();
+ return AsyncGeneratorEnd<T>();
+ }
+ return Future<T>::MakeFinished(state->vec[idx]);
+ };
+}
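+
+// For example (a sketch assuming this header is included):
+//
+// AsyncGenerator<int> gen = MakeVectorGenerator<int>({1, 2, 3});
+// Future<int> first = gen(); // finished future holding 1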
+
+/// \see MakeMergedGenerator
+template <typename T>
+class MergedGenerator {
+ public:
+ explicit MergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
+ int max_subscriptions)
+ : state_(std::make_shared<State>(std::move(source), max_subscriptions)) {}
+
+ Future<T> operator()() {
+ Future<T> waiting_future;
+ std::shared_ptr<DeliveredJob> delivered_job;
+ {
+ auto guard = state_->mutex.Lock();
+ if (!state_->delivered_jobs.empty()) {
+ delivered_job = std::move(state_->delivered_jobs.front());
+ state_->delivered_jobs.pop_front();
+ } else if (state_->finished) {
+ return IterationTraits<T>::End();
+ } else {
+ waiting_future = Future<T>::Make();
+ state_->waiting_jobs.push_back(std::make_shared<Future<T>>(waiting_future));
+ }
+ }
+ if (delivered_job) {
+ // deliverer will be invalid if outer callback encounters an error and delivers a
+ // failed result
+ if (delivered_job->deliverer) {
+ delivered_job->deliverer().AddCallback(
+ InnerCallback{state_, delivered_job->index});
+ }
+ return std::move(delivered_job->value);
+ }
+ if (state_->first) {
+ state_->first = false;
+ for (std::size_t i = 0; i < state_->active_subscriptions.size(); i++) {
+ state_->PullSource().AddCallback(OuterCallback{state_, i});
+ }
+ }
+ return waiting_future;
+ }
+
+ private:
+ struct DeliveredJob {
+ explicit DeliveredJob(AsyncGenerator<T> deliverer_, Result<T> value_,
+ std::size_t index_)
+ : deliverer(deliverer_), value(std::move(value_)), index(index_) {}
+
+ AsyncGenerator<T> deliverer;
+ Result<T> value;
+ std::size_t index;
+ };
+
+ struct State {
+ State(AsyncGenerator<AsyncGenerator<T>> source, int max_subscriptions)
+ : source(std::move(source)),
+ active_subscriptions(max_subscriptions),
+ delivered_jobs(),
+ waiting_jobs(),
+ mutex(),
+ first(true),
+ source_exhausted(false),
+ finished(false),
+ num_active_subscriptions(max_subscriptions) {}
+
+ Future<AsyncGenerator<T>> PullSource() {
+ // Need to guard access to source() so we don't pull sync-reentrantly, which
+ // is never valid.
+ auto lock = mutex.Lock();
+ return source();
+ }
+
+ AsyncGenerator<AsyncGenerator<T>> source;
+ // active_subscriptions and delivered_jobs will be bounded by max_subscriptions
+ std::vector<AsyncGenerator<T>> active_subscriptions;
+ std::deque<std::shared_ptr<DeliveredJob>> delivered_jobs;
+ // waiting_jobs is unbounded, reentrant pulls (e.g. AddReadahead) will provide the
+ // backpressure
+ std::deque<std::shared_ptr<Future<T>>> waiting_jobs;
+ util::Mutex mutex;
+ bool first;
+ bool source_exhausted;
+ bool finished;
+ int num_active_subscriptions;
+ };
+
+ struct InnerCallback {
+ void operator()(const Result<T>& maybe_next) {
+ Future<T> sink;
+ bool sub_finished = maybe_next.ok() && IsIterationEnd(*maybe_next);
+ {
+ auto guard = state->mutex.Lock();
+ if (state->finished) {
+ // We've errored out so just ignore this result and don't keep pumping
+ return;
+ }
+ if (!sub_finished) {
+ if (state->waiting_jobs.empty()) {
+ state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
+ state->active_subscriptions[index], maybe_next, index));
+ } else {
+ sink = std::move(*state->waiting_jobs.front());
+ state->waiting_jobs.pop_front();
+ }
+ }
+ }
+ if (sub_finished) {
+ state->PullSource().AddCallback(OuterCallback{state, index});
+ } else if (sink.is_valid()) {
+ sink.MarkFinished(maybe_next);
+ if (maybe_next.ok()) {
+ state->active_subscriptions[index]().AddCallback(*this);
+ }
+ }
+ }
+ std::shared_ptr<State> state;
+ std::size_t index;
+ };
+
+ struct OuterCallback {
+ void operator()(const Result<AsyncGenerator<T>>& maybe_next) {
+ bool should_purge = false;
+ bool should_continue = false;
+ Future<T> error_sink;
+ {
+ auto guard = state->mutex.Lock();
+ if (!maybe_next.ok() || IsIterationEnd(*maybe_next)) {
+ state->source_exhausted = true;
+ if (!maybe_next.ok() || --state->num_active_subscriptions == 0) {
+ state->finished = true;
+ should_purge = true;
+ }
+ if (!maybe_next.ok()) {
+ if (state->waiting_jobs.empty()) {
+ state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
+ AsyncGenerator<T>(), maybe_next.status(), index));
+ } else {
+ error_sink = std::move(*state->waiting_jobs.front());
+ state->waiting_jobs.pop_front();
+ }
+ }
+ } else {
+ state->active_subscriptions[index] = *maybe_next;
+ should_continue = true;
+ }
+ }
+ if (error_sink.is_valid()) {
+ error_sink.MarkFinished(maybe_next.status());
+ }
+ if (should_continue) {
+ (*maybe_next)().AddCallback(InnerCallback{state, index});
+ } else if (should_purge) {
+ // At this point state->finished has been marked true so no one else
+ // will be interacting with waiting_jobs and we can iterate outside the lock
+ while (!state->waiting_jobs.empty()) {
+ state->waiting_jobs.front()->MarkFinished(IterationTraits<T>::End());
+ state->waiting_jobs.pop_front();
+ }
+ }
+ }
+ std::shared_ptr<State> state;
+ std::size_t index;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief Creates a generator that takes in a stream of generators and pulls from up to
+/// max_subscriptions at a time
+///
+/// Note: This may deliver items out of sequence. For example, items from the third
+/// AsyncGenerator generated by the source may be emitted before some items from the first
+/// AsyncGenerator generated by the source.
+///
+/// This generator will pull from the source async-reentrantly unless max_subscriptions
+/// is 1. This generator will not pull from the individual subscriptions reentrantly.
+/// Add readahead to the individual subscriptions if that is desired.
+///
+/// This generator is async-reentrant
+///
+/// This generator may queue up to max_subscriptions instances of T
+template <typename T>
+AsyncGenerator<T> MakeMergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
+ int max_subscriptions) {
+ return MergedGenerator<T>(std::move(source), max_subscriptions);
+}
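+
+// A minimal sketch (assuming this header is included): merge two vector-backed
+// generators, pulling from both subscriptions at once.
+//
+// auto inner1 = MakeVectorGenerator<int>({1, 2});
+// auto inner2 = MakeVectorGenerator<int>({3, 4});
+// auto outer = MakeVectorGenerator<AsyncGenerator<int>>({inner1, inner2});
+// AsyncGenerator<int> merged =
+//     MakeMergedGenerator(std::move(outer), /*max_subscriptions=*/2);
+// // Values from inner1 and inner2 may be interleaved in any order.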
+
+/// \brief Creates a generator that takes in a stream of generators and pulls from each
+/// one in sequence.
+///
+/// This generator is async-reentrant but will never pull from source reentrantly and
+/// will never pull from any subscription reentrantly.
+///
+/// This generator may queue 1 instance of T
+///
+/// TODO: Could potentially make a bespoke implementation instead of MergedGenerator that
+/// forwards async-reentrant requests instead of buffering them (which is what
+/// MergedGenerator does)
+template <typename T>
+AsyncGenerator<T> MakeConcatenatedGenerator(AsyncGenerator<AsyncGenerator<T>> source) {
+ return MergedGenerator<T>(std::move(source), 1);
+}
+
+template <typename T>
+struct Enumerated {
+ T value;
+ int index;
+ bool last;
+};
+
+template <typename T>
+struct IterationTraits<Enumerated<T>> {
+ static Enumerated<T> End() { return Enumerated<T>{IterationEnd<T>(), -1, false}; }
+ static bool IsEnd(const Enumerated<T>& val) { return val.index < 0; }
+};
+
+/// \see MakeEnumeratedGenerator
+template <typename T>
+class EnumeratingGenerator {
+ public:
+ EnumeratingGenerator(AsyncGenerator<T> source, T initial_value)
+ : state_(std::make_shared<State>(std::move(source), std::move(initial_value))) {}
+
+ Future<Enumerated<T>> operator()() {
+ if (state_->finished) {
+ return AsyncGeneratorEnd<Enumerated<T>>();
+ } else {
+ auto state = state_;
+ return state->source().Then([state](const T& next) {
+ auto finished = IsIterationEnd<T>(next);
+ auto prev = Enumerated<T>{state->prev_value, state->prev_index, finished};
+ state->prev_value = next;
+ state->prev_index++;
+ state->finished = finished;
+ return prev;
+ });
+ }
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source, T initial_value)
+ : source(std::move(source)), prev_value(std::move(initial_value)), prev_index(0) {
+ finished = IsIterationEnd<T>(prev_value);
+ }
+
+ AsyncGenerator<T> source;
+ T prev_value;
+ int prev_index;
+ bool finished;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// Wraps items from a source generator with positional information
+///
+/// When used with MakeMergedGenerator and MakeSequencingGenerator this allows items to be
+/// processed in a "first-available" fashion and later resequenced which can reduce the
+/// impact of sources with erratic performance (e.g. a filesystem where some items may
+/// take longer to read than others).
+///
+/// TODO(ARROW-12371) Would require this generator be async-reentrant
+///
+/// \see MakeSequencingGenerator for an example of putting items back in order
+///
+/// This generator is not async-reentrant
+///
+/// This generator buffers one item (so it knows which item is the last item)
+template <typename T>
+AsyncGenerator<Enumerated<T>> MakeEnumeratedGenerator(AsyncGenerator<T> source) {
+ return FutureFirstGenerator<Enumerated<T>>(
+ source().Then([source](const T& initial_value) -> AsyncGenerator<Enumerated<T>> {
+ return EnumeratingGenerator<T>(std::move(source), initial_value);
+ }));
+}
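+
+// A minimal sketch (assuming this header is included):
+//
+// auto source = MakeVectorGenerator<std::string>({"a", "b"});
+// auto enumerated = MakeEnumeratedGenerator(std::move(source));
+// // Yields Enumerated<std::string>{"a", 0, false}, then {"b", 1, true}.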
+
+/// \see MakeTransferredGenerator
+template <typename T>
+class TransferringGenerator {
+ public:
+ explicit TransferringGenerator(AsyncGenerator<T> source, internal::Executor* executor)
+ : source_(std::move(source)), executor_(executor) {}
+
+ Future<T> operator()() { return executor_->Transfer(source_()); }
+
+ private:
+ AsyncGenerator<T> source_;
+ internal::Executor* executor_;
+};
+
+/// \brief Transfers the futures produced by a generator onto an executor.
+///
+/// Continuations run on the futures returned by the generator will be run on
+/// the given executor if they cannot be run synchronously.
+///
+/// This is often needed to move computation off I/O threads or other external
+/// completion sources and back onto the CPU executor so the I/O thread can
+/// stay busy and focused on I/O.
+///
+/// Keep in mind that continuations called on an already completed future will
+/// always be run synchronously and so no transfer will happen in that case.
+///
+/// This generator is async-reentrant if the source is
+///
+/// This generator will not queue
+template <typename T>
+AsyncGenerator<T> MakeTransferredGenerator(AsyncGenerator<T> source,
+ internal::Executor* executor) {
+ return TransferringGenerator<T>(std::move(source), executor);
+}
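+
+// A minimal sketch (assuming this header is included; GetCpuThreadPool() names
+// Arrow's global CPU thread pool and is an assumption of this example):
+//
+// AsyncGenerator<int> source = MakeVectorGenerator<int>({1, 2, 3});
+// AsyncGenerator<int> transferred =
+//     MakeTransferredGenerator(std::move(source), internal::GetCpuThreadPool());
+// // Continuations on futures returned by transferred now run on the CPU pool.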
+
+/// \see MakeBackgroundGenerator
+template <typename T>
+class BackgroundGenerator {
+ public:
+ explicit BackgroundGenerator(Iterator<T> it, internal::Executor* io_executor, int max_q,
+ int q_restart)
+ : state_(std::make_shared<State>(io_executor, std::move(it), max_q, q_restart)),
+ cleanup_(std::make_shared<Cleanup>(state_.get())) {}
+
+ Future<T> operator()() {
+ auto guard = state_->mutex.Lock();
+ Future<T> waiting_future;
+ if (state_->queue.empty()) {
+ if (state_->finished) {
+ return AsyncGeneratorEnd<T>();
+ } else {
+ waiting_future = Future<T>::Make();
+ state_->waiting_future = waiting_future;
+ }
+ } else {
+ auto next = Future<T>::MakeFinished(std::move(state_->queue.front()));
+ state_->queue.pop();
+ if (state_->NeedsRestart()) {
+ return state_->RestartTask(state_, std::move(guard), std::move(next));
+ }
+ return next;
+ }
+ // This should only trigger the very first time this method is called
+ if (state_->NeedsRestart()) {
+ return state_->RestartTask(state_, std::move(guard), std::move(waiting_future));
+ }
+ return waiting_future;
+ }
+
+ protected:
+ static constexpr uint64_t kUnlikelyThreadId{std::numeric_limits<uint64_t>::max()};
+
+ struct State {
+ State(internal::Executor* io_executor, Iterator<T> it, int max_q, int q_restart)
+ : io_executor(io_executor),
+ max_q(max_q),
+ q_restart(q_restart),
+ it(std::move(it)),
+ reading(false),
+ finished(false),
+ should_shutdown(false) {}
+
+ void ClearQueue() {
+ while (!queue.empty()) {
+ queue.pop();
+ }
+ }
+
+ bool TaskIsRunning() const { return task_finished.is_valid(); }
+
+ bool NeedsRestart() const {
+ return !finished && !reading && static_cast<int>(queue.size()) <= q_restart;
+ }
+
+ void DoRestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard) {
+ // If we get here we are actually going to start a new task so let's create a
+ // task_finished future for it
+ state->task_finished = Future<>::Make();
+ state->reading = true;
+ auto spawn_status = io_executor->Spawn(
+ [state]() { BackgroundGenerator::WorkerTask(std::move(state)); });
+ if (!spawn_status.ok()) {
+ // If we can't spawn a new task then send an error to the consumer (either via a
+ // waiting future or the queue) and mark ourselves finished
+ state->finished = true;
+ state->task_finished = Future<>();
+ if (waiting_future.has_value()) {
+ auto to_deliver = std::move(waiting_future.value());
+ waiting_future.reset();
+ guard.Unlock();
+ to_deliver.MarkFinished(spawn_status);
+ } else {
+ ClearQueue();
+ queue.push(spawn_status);
+ }
+ }
+ }
+
+ Future<T> RestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard,
+ Future<T> next) {
+ if (TaskIsRunning()) {
+ // If the task is still cleaning up we need to wait for it to finish before
+ // restarting. We also want to block the consumer until we've restarted the
+ // reader to avoid multiple restarts
+ return task_finished.Then([state, next]() {
+ // This may appear dangerous (recursive mutex) but we should be guaranteed the
+ // outer guard has been released by this point. We know...
+ // * task_finished is not already finished (it would be invalid in that case)
+ // * task_finished will not be marked complete until we've given up the mutex
+ auto guard_ = state->mutex.Lock();
+ state->DoRestartTask(state, std::move(guard_));
+ return next;
+ });
+ }
+ // Otherwise we can restart immediately
+ DoRestartTask(std::move(state), std::move(guard));
+ return next;
+ }
+
+ internal::Executor* io_executor;
+ const int max_q;
+ const int q_restart;
+ Iterator<T> it;
+ std::atomic<uint64_t> worker_thread_id{kUnlikelyThreadId};
+
+ // If true, the task is actively pumping items into the queue and does not need a
+ // restart
+ bool reading;
+ // Set to true when a terminal item arrives
+ bool finished;
+ // Signal to the background task to end early because consumers have given up on it
+ bool should_shutdown;
+ // If the queue is empty, the consumer will create a waiting future and wait for it
+ std::queue<Result<T>> queue;
+ util::optional<Future<T>> waiting_future;
+ // Every background task is given a future to complete when it is entirely finished
+ // processing and ready for the next task to start or for State to be destroyed
+ Future<> task_finished;
+ util::Mutex mutex;
+ };
+
+ // Cleanup task that will be run when all consumer references to the generator are lost
+ struct Cleanup {
+ explicit Cleanup(State* state) : state(state) {}
+ ~Cleanup() {
+ /// TODO: Once ARROW-13109 is available we can force consumers to spawn and
+ /// there will be no need to perform this check.
+ ///
+ /// It's a deadlock if we enter cleanup from the worker thread, but that can
+ /// happen if the consumer doesn't transfer away
+ assert(state->worker_thread_id.load() != ::arrow::internal::GetThreadId());
+ Future<> finish_fut;
+ {
+ auto lock = state->mutex.Lock();
+ if (!state->TaskIsRunning()) {
+ return;
+ }
+ // Signal the current task to stop and wait for it to finish
+ state->should_shutdown = true;
+ finish_fut = state->task_finished;
+ }
+ // Using future as a condition variable here
+ Status st = finish_fut.status();
+ ARROW_UNUSED(st);
+ }
+ State* state;
+ };
+
+ static void WorkerTask(std::shared_ptr<State> state) {
+ state->worker_thread_id.store(::arrow::internal::GetThreadId());
+ // Capture the reading flag in a local so it can be read outside the mutex
+ bool reading = true;
+ while (reading) {
+ auto next = state->it.Next();
+ // Need to capture state->waiting_future inside the mutex to mark finished outside
+ Future<T> waiting_future;
+ {
+ auto guard = state->mutex.Lock();
+
+ if (state->should_shutdown) {
+ state->finished = true;
+ break;
+ }
+
+ if (!next.ok() || IsIterationEnd<T>(*next)) {
+ // Terminal item. Set finished to true, send this last item, and quit
+ state->finished = true;
+ if (!next.ok()) {
+ state->ClearQueue();
+ }
+ }
+ // At this point we are going to send an item. Either we will add it to the
+ // queue or deliver it to a waiting future.
+ if (state->waiting_future.has_value()) {
+ waiting_future = std::move(state->waiting_future.value());
+ state->waiting_future.reset();
+ } else {
+ state->queue.push(std::move(next));
+ // If we just filled up the queue then it is time to quit. We may need to
+ // notify a cleanup task, so stop reading and let the task wind down
+ if (static_cast<int>(state->queue.size()) >= state->max_q) {
+ state->reading = false;
+ }
+ }
+ reading = state->reading && !state->finished;
+ }
+ // This should happen outside the mutex. Presumably there is a
+ // transferring generator on the other end that will quickly transfer any
+ // callbacks off of this thread so we can continue looping. Still, best not to
+ // rely on that
+ if (waiting_future.is_valid()) {
+ waiting_future.MarkFinished(next);
+ }
+ }
+ // Once we've sent our last item we can notify any waiters that we are done and so
+ // either state can be cleaned up or a new background task can be started
+ Future<> task_finished;
+ {
+ auto guard = state->mutex.Lock();
+ // After we give up the mutex state can be safely deleted. We will no longer
+ // reference it. We can safely transition to idle now.
+ task_finished = state->task_finished;
+ state->task_finished = Future<>();
+ state->worker_thread_id.store(kUnlikelyThreadId);
+ }
+ task_finished.MarkFinished();
+ }
+
+ std::shared_ptr<State> state_;
+ // state_ is held by both the generator and the background thread so it won't be cleaned
+ // up when all consumer references are relinquished. cleanup_ is only held by the
+ // generator so it will be destructed when the last consumer reference is gone. We use
+ // this to cleanup / stop the background generator in case the consuming end stops
+ // listening (e.g. due to a downstream error)
+ std::shared_ptr<Cleanup> cleanup_;
+};
+
+constexpr int kDefaultBackgroundMaxQ = 32;
+constexpr int kDefaultBackgroundQRestart = 16;
+
+/// \brief Creates an AsyncGenerator<T> by iterating over an Iterator<T> on a background
+/// thread
+///
+/// The parameters max_q and q_restart control queue size and background thread task
+/// management. If the background task is fast you typically don't want it creating a
+/// thread task for every item. Instead the background thread will run until it fills
+/// up a readahead queue.
+///
+/// Once the queue has filled up the background thread task will terminate (allowing other
+/// I/O tasks to use the thread). Once the queue has been drained enough (specified by
+/// q_restart) then the background thread task will be restarted. If q_restart is too low
+/// then you may exhaust the queue waiting for the background thread task to start running
+/// again. If it is too high then the background thread task will be constantly
+/// stopping and restarting.
+///
+/// The "background thread" is a logical thread and will run as tasks on the io_executor.
+/// This thread may stop and start when the queue fills up but there will only be one
+/// active background thread task at any given time. You MUST transfer away from this
+/// background generator. Otherwise there could be a race condition if a callback on the
+/// background thread deletes the last consumer reference to the background generator. You
+/// can transfer onto the same executor as the background thread; it is only necessary to
+/// create a new thread task, not to switch executors.
+///
+/// This generator is not async-reentrant
+///
+/// This generator will queue up to max_q blocks
+template <typename T>
+static Result<AsyncGenerator<T>> MakeBackgroundGenerator(
+ Iterator<T> iterator, internal::Executor* io_executor,
+ int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart) {
+ if (max_q < q_restart) {
+ return Status::Invalid("max_q must be >= q_restart");
+ }
+ return BackgroundGenerator<T>(std::move(iterator), io_executor, max_q, q_restart);
+}
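+
+// A minimal sketch (assuming this header is included; io_executor and the
+// source iterator are assumed to exist elsewhere):
+//
+// ARROW_ASSIGN_OR_RAISE(auto gen,
+//     MakeBackgroundGenerator(std::move(iterator), io_executor));
+// gen = MakeTransferredGenerator(std::move(gen), internal::GetCpuThreadPool());
+// // The transfer step satisfies the "you MUST transfer away" requirement above.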
+
+/// \see MakeGeneratorIterator
+template <typename T>
+class GeneratorIterator {
+ public:
+ explicit GeneratorIterator(AsyncGenerator<T> source) : source_(std::move(source)) {}
+
+ Result<T> Next() { return source_().result(); }
+
+ private:
+ AsyncGenerator<T> source_;
+};
+
+/// \brief Converts an AsyncGenerator<T> to an Iterator<T> by blocking until each future
+/// is finished
+template <typename T>
+Iterator<T> MakeGeneratorIterator(AsyncGenerator<T> source) {
+ return Iterator<T>(GeneratorIterator<T>(std::move(source)));
+}
+
+/// \brief Adds readahead to an iterator using a background thread.
+///
+/// Under the hood this is converting the iterator to a generator using
+/// MakeBackgroundGenerator, adding readahead to the converted generator with
+/// MakeReadaheadGenerator, and then converting back to an iterator using
+/// MakeGeneratorIterator.
+template <typename T>
+Result<Iterator<T>> MakeReadaheadIterator(Iterator<T> it, int readahead_queue_size) {
+ ARROW_ASSIGN_OR_RAISE(auto io_executor, internal::ThreadPool::Make(1));
+ auto max_q = readahead_queue_size;
+ auto q_restart = std::max(1, max_q / 2);
+ ARROW_ASSIGN_OR_RAISE(
+ auto background_generator,
+ MakeBackgroundGenerator(std::move(it), io_executor.get(), max_q, q_restart));
+ // Capture io_executor to keep it alive as long as owned_bg_generator is still
+ // referenced
+ AsyncGenerator<T> owned_bg_generator = [io_executor, background_generator]() {
+ return background_generator();
+ };
+ return MakeGeneratorIterator(std::move(owned_bg_generator));
+}
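+
+// A minimal sketch (assuming this header is included; `slow` is an assumed
+// blocking Iterator<int>):
+//
+// ARROW_ASSIGN_OR_RAISE(Iterator<int> fast,
+//     MakeReadaheadIterator(std::move(slow), /*readahead_queue_size=*/8));
+// ARROW_ASSIGN_OR_RAISE(int next, fast.Next()); // readahead happens in background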
+
+/// \brief Make a generator that returns a single pre-generated future
+///
+/// This generator is async-reentrant.
+template <typename T>
+std::function<Future<T>()> MakeSingleFutureGenerator(Future<T> future) {
+ assert(future.is_valid());
+ auto state = std::make_shared<Future<T>>(std::move(future));
+ return [state]() -> Future<T> {
+ auto fut = std::move(*state);
+ if (fut.is_valid()) {
+ return fut;
+ } else {
+ return AsyncGeneratorEnd<T>();
+ }
+ };
+}
+
+/// \brief Make a generator that immediately ends.
+///
+/// This generator is async-reentrant.
+template <typename T>
+std::function<Future<T>()> MakeEmptyGenerator() {
+ return []() -> Future<T> { return AsyncGeneratorEnd<T>(); };
+}
+
+/// \brief Make a generator that always fails with a given error
+///
+/// This generator is async-reentrant.
+template <typename T>
+AsyncGenerator<T> MakeFailingGenerator(Status st) {
+ assert(!st.ok());
+ auto state = std::make_shared<Status>(std::move(st));
+ return [state]() -> Future<T> {
+ auto st = std::move(*state);
+ if (!st.ok()) {
+ return std::move(st);
+ } else {
+ return AsyncGeneratorEnd<T>();
+ }
+ };
+}
+
+/// \brief Make a generator that always fails with a given error
+///
+/// This overload allows inferring the return type from the argument.
+template <typename T>
+AsyncGenerator<T> MakeFailingGenerator(const Result<T>& result) {
+ return MakeFailingGenerator<T>(result.status());
+}
+
+/// \brief Prepends initial_values onto a generator
+///
+/// This generator is async-reentrant but will buffer requests and will not
+/// pull from following_values async-reentrantly.
+template <typename T>
+AsyncGenerator<T> MakeGeneratorStartsWith(std::vector<T> initial_values,
+ AsyncGenerator<T> following_values) {
+ auto initial_values_vec_gen = MakeVectorGenerator(std::move(initial_values));
+ auto gen_gen = MakeVectorGenerator<AsyncGenerator<T>>(
+ {std::move(initial_values_vec_gen), std::move(following_values)});
+ return MakeConcatenatedGenerator(std::move(gen_gen));
+}
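+
+// A minimal sketch (assuming this header is included):
+//
+// auto rest = MakeVectorGenerator<int>({3, 4});
+// AsyncGenerator<int> gen = MakeGeneratorStartsWith<int>({1, 2}, std::move(rest));
+// // Yields 1, 2, 3, 4 in order.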
+
+template <typename T>
+struct CancellableGenerator {
+ Future<T> operator()() {
+ if (stop_token.IsStopRequested()) {
+ return stop_token.Poll();
+ }
+ return source();
+ }
+
+ AsyncGenerator<T> source;
+ StopToken stop_token;
+};
+
+/// \brief Allows an async generator to be cancelled
+///
+/// This generator is async-reentrant
+template <typename T>
+AsyncGenerator<T> MakeCancellable(AsyncGenerator<T> source, StopToken stop_token) {
+ return CancellableGenerator<T>{std::move(source), std::move(stop_token)};
+}
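+
+// A minimal sketch (StopSource comes from arrow/util/cancel.h; its use here is
+// an assumption of this example):
+//
+// StopSource stop_source;
+// auto gen = MakeCancellable(MakeVectorGenerator<int>({1, 2, 3}),
+//                            stop_source.token());
+// stop_source.RequestStop(); // subsequent gen() calls finish with Cancelled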
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/atomic_shared_ptr.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/atomic_shared_ptr.h
new file mode 100644
index 00000000000..d93ad921db6
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/atomic_shared_ptr.h
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <utility>
+
+#include "arrow/type_traits.h"
+
+namespace arrow {
+namespace internal {
+
+// Atomic shared_ptr operations only appeared in libstdc++ starting with GCC 5;
+// emulate them with unsafe ops if unavailable.
+// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57250
+
+template <typename T, typename = void>
+struct is_atomic_load_shared_ptr_available : std::false_type {};
+
+template <typename T>
+struct is_atomic_load_shared_ptr_available<
+ T, void_t<decltype(std::atomic_load(std::declval<const std::shared_ptr<T>*>()))>>
+ : std::true_type {};
+
+template <typename T>
+using enable_if_atomic_load_shared_ptr_available =
+ enable_if_t<is_atomic_load_shared_ptr_available<T>::value, T>;
+
+template <typename T>
+using enable_if_atomic_load_shared_ptr_unavailable =
+ enable_if_t<!is_atomic_load_shared_ptr_available<T>::value, T>;
+
+template <class T>
+enable_if_atomic_load_shared_ptr_available<std::shared_ptr<T>> atomic_load(
+ const std::shared_ptr<T>* p) {
+ return std::atomic_load(p);
+}
+
+template <class T>
+enable_if_atomic_load_shared_ptr_unavailable<std::shared_ptr<T>> atomic_load(
+ const std::shared_ptr<T>* p) {
+ return *p;
+}
+
+template <typename T, typename = void>
+struct is_atomic_store_shared_ptr_available : std::false_type {};
+
+template <typename T>
+struct is_atomic_store_shared_ptr_available<
+ T, void_t<decltype(std::atomic_store(std::declval<std::shared_ptr<T>*>(),
+ std::declval<std::shared_ptr<T>>()))>>
+ : std::true_type {};
+
+template <typename T>
+using enable_if_atomic_store_shared_ptr_available =
+ enable_if_t<is_atomic_store_shared_ptr_available<T>::value, T>;
+
+template <typename T>
+using enable_if_atomic_store_shared_ptr_unavailable =
+ enable_if_t<!is_atomic_store_shared_ptr_available<T>::value, T>;
+
+template <class T>
+void atomic_store(enable_if_atomic_store_shared_ptr_available<std::shared_ptr<T>*> p,
+ std::shared_ptr<T> r) {
+ std::atomic_store(p, std::move(r));
+}
+
+template <class T>
+void atomic_store(enable_if_atomic_store_shared_ptr_unavailable<std::shared_ptr<T>*> p,
+ std::shared_ptr<T> r) {
+ *p = r;
+}
+
+template <class T>
+bool atomic_compare_exchange_strong(
+ enable_if_atomic_store_shared_ptr_available<std::shared_ptr<T>*> p,
+ std::shared_ptr<T>* expected, std::shared_ptr<T> desired) {
+ return std::atomic_compare_exchange_strong(p, expected, std::move(desired));
+}
+
+template <class T>
+bool atomic_compare_exchange_strong(
+ enable_if_atomic_store_shared_ptr_unavailable<std::shared_ptr<T>*> p,
+ std::shared_ptr<T>* expected, std::shared_ptr<T> desired) {
+ if (*p == *expected) {
+ *p = std::move(desired);
+ return true;
+ } else {
+ *expected = *p;
+ return false;
+ }
+}
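+
+// A minimal usage sketch (assumes this header is included):
+//
+// std::shared_ptr<int> target = std::make_shared<int>(1);
+// std::shared_ptr<int> snapshot = atomic_load(&target); // concurrent-safe read
+// atomic_store(&target, std::make_shared<int>(2)); // concurrent-safe write
+// // Both fall back to plain (unsafe) operations when std::atomic_load/store
+// // for shared_ptr are unavailable.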
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/base64.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/base64.h
new file mode 100644
index 00000000000..9ab41412ac3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/base64.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+ARROW_EXPORT
+std::string base64_encode(unsigned char const*, unsigned int len);
+
+ARROW_EXPORT
+std::string base64_decode(std::string const& s);
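+
+// A minimal round-trip sketch (assumes the implementations are linked in):
+//
+// std::string encoded = base64_encode(
+//     reinterpret_cast<const unsigned char*>("spam"), 4); // "c3BhbQ=="
+// std::string decoded = base64_decode(encoded); // "spam"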
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc
new file mode 100644
index 00000000000..56809f28165
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc
@@ -0,0 +1,1344 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/basic_decimal.h"
+
+#include <algorithm>
+#include <array>
+#include <climits>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <limits>
+#include <string>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/int128_internal.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+using internal::SafeLeftShift;
+using internal::SafeSignedAdd;
+
+static const BasicDecimal128 ScaleMultipliers[] = {
+ BasicDecimal128(1LL),
+ BasicDecimal128(10LL),
+ BasicDecimal128(100LL),
+ BasicDecimal128(1000LL),
+ BasicDecimal128(10000LL),
+ BasicDecimal128(100000LL),
+ BasicDecimal128(1000000LL),
+ BasicDecimal128(10000000LL),
+ BasicDecimal128(100000000LL),
+ BasicDecimal128(1000000000LL),
+ BasicDecimal128(10000000000LL),
+ BasicDecimal128(100000000000LL),
+ BasicDecimal128(1000000000000LL),
+ BasicDecimal128(10000000000000LL),
+ BasicDecimal128(100000000000000LL),
+ BasicDecimal128(1000000000000000LL),
+ BasicDecimal128(10000000000000000LL),
+ BasicDecimal128(100000000000000000LL),
+ BasicDecimal128(1000000000000000000LL),
+ BasicDecimal128(0LL, 10000000000000000000ULL),
+ BasicDecimal128(5LL, 7766279631452241920ULL),
+ BasicDecimal128(54LL, 3875820019684212736ULL),
+ BasicDecimal128(542LL, 1864712049423024128ULL),
+ BasicDecimal128(5421LL, 200376420520689664ULL),
+ BasicDecimal128(54210LL, 2003764205206896640ULL),
+ BasicDecimal128(542101LL, 1590897978359414784ULL),
+ BasicDecimal128(5421010LL, 15908979783594147840ULL),
+ BasicDecimal128(54210108LL, 11515845246265065472ULL),
+ BasicDecimal128(542101086LL, 4477988020393345024ULL),
+ BasicDecimal128(5421010862LL, 7886392056514347008ULL),
+ BasicDecimal128(54210108624LL, 5076944270305263616ULL),
+ BasicDecimal128(542101086242LL, 13875954555633532928ULL),
+ BasicDecimal128(5421010862427LL, 9632337040368467968ULL),
+ BasicDecimal128(54210108624275LL, 4089650035136921600ULL),
+ BasicDecimal128(542101086242752LL, 4003012203950112768ULL),
+ BasicDecimal128(5421010862427522LL, 3136633892082024448ULL),
+ BasicDecimal128(54210108624275221LL, 12919594847110692864ULL),
+ BasicDecimal128(542101086242752217LL, 68739955140067328ULL),
+ BasicDecimal128(5421010862427522170LL, 687399551400673280ULL)};
+
+static const BasicDecimal128 ScaleMultipliersHalf[] = {
+ BasicDecimal128(0ULL),
+ BasicDecimal128(5ULL),
+ BasicDecimal128(50ULL),
+ BasicDecimal128(500ULL),
+ BasicDecimal128(5000ULL),
+ BasicDecimal128(50000ULL),
+ BasicDecimal128(500000ULL),
+ BasicDecimal128(5000000ULL),
+ BasicDecimal128(50000000ULL),
+ BasicDecimal128(500000000ULL),
+ BasicDecimal128(5000000000ULL),
+ BasicDecimal128(50000000000ULL),
+ BasicDecimal128(500000000000ULL),
+ BasicDecimal128(5000000000000ULL),
+ BasicDecimal128(50000000000000ULL),
+ BasicDecimal128(500000000000000ULL),
+ BasicDecimal128(5000000000000000ULL),
+ BasicDecimal128(50000000000000000ULL),
+ BasicDecimal128(500000000000000000ULL),
+ BasicDecimal128(5000000000000000000ULL),
+ BasicDecimal128(2LL, 13106511852580896768ULL),
+ BasicDecimal128(27LL, 1937910009842106368ULL),
+ BasicDecimal128(271LL, 932356024711512064ULL),
+ BasicDecimal128(2710LL, 9323560247115120640ULL),
+ BasicDecimal128(27105LL, 1001882102603448320ULL),
+ BasicDecimal128(271050LL, 10018821026034483200ULL),
+ BasicDecimal128(2710505LL, 7954489891797073920ULL),
+ BasicDecimal128(27105054LL, 5757922623132532736ULL),
+ BasicDecimal128(271050543LL, 2238994010196672512ULL),
+ BasicDecimal128(2710505431LL, 3943196028257173504ULL),
+ BasicDecimal128(27105054312LL, 2538472135152631808ULL),
+ BasicDecimal128(271050543121LL, 6937977277816766464ULL),
+ BasicDecimal128(2710505431213LL, 14039540557039009792ULL),
+ BasicDecimal128(27105054312137LL, 11268197054423236608ULL),
+ BasicDecimal128(271050543121376LL, 2001506101975056384ULL),
+ BasicDecimal128(2710505431213761LL, 1568316946041012224ULL),
+ BasicDecimal128(27105054312137610LL, 15683169460410122240ULL),
+ BasicDecimal128(271050543121376108LL, 9257742014424809472ULL),
+ BasicDecimal128(2710505431213761085LL, 343699775700336640ULL)};
+
+static const BasicDecimal256 ScaleMultipliersDecimal256[] = {
+ BasicDecimal256({1ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({7766279631452241920ULL, 5ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3875820019684212736ULL, 54ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1864712049423024128ULL, 542ULL, 0ULL, 0ULL}),
+ BasicDecimal256({200376420520689664ULL, 5421ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2003764205206896640ULL, 54210ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1590897978359414784ULL, 542101ULL, 0ULL, 0ULL}),
+ BasicDecimal256({15908979783594147840ULL, 5421010ULL, 0ULL, 0ULL}),
+ BasicDecimal256({11515845246265065472ULL, 54210108ULL, 0ULL, 0ULL}),
+ BasicDecimal256({4477988020393345024ULL, 542101086ULL, 0ULL, 0ULL}),
+ BasicDecimal256({7886392056514347008ULL, 5421010862ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5076944270305263616ULL, 54210108624ULL, 0ULL, 0ULL}),
+ BasicDecimal256({13875954555633532928ULL, 542101086242ULL, 0ULL, 0ULL}),
+ BasicDecimal256({9632337040368467968ULL, 5421010862427ULL, 0ULL, 0ULL}),
+ BasicDecimal256({4089650035136921600ULL, 54210108624275ULL, 0ULL, 0ULL}),
+ BasicDecimal256({4003012203950112768ULL, 542101086242752ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3136633892082024448ULL, 5421010862427522ULL, 0ULL, 0ULL}),
+ BasicDecimal256({12919594847110692864ULL, 54210108624275221ULL, 0ULL, 0ULL}),
+ BasicDecimal256({68739955140067328ULL, 542101086242752217ULL, 0ULL, 0ULL}),
+ BasicDecimal256({687399551400673280ULL, 5421010862427522170ULL, 0ULL, 0ULL}),
+ BasicDecimal256({6873995514006732800ULL, 17316620476856118468ULL, 2ULL, 0ULL}),
+ BasicDecimal256({13399722918938673152ULL, 7145508105175220139ULL, 29ULL, 0ULL}),
+ BasicDecimal256({4870020673419870208ULL, 16114848830623546549ULL, 293ULL, 0ULL}),
+ BasicDecimal256({11806718586779598848ULL, 13574535716559052564ULL, 2938ULL, 0ULL}),
+ BasicDecimal256({7386721425538678784ULL, 6618148649623664334ULL, 29387ULL, 0ULL}),
+ BasicDecimal256({80237960548581376ULL, 10841254275107988496ULL, 293873ULL, 0ULL}),
+ BasicDecimal256({802379605485813760ULL, 16178822382532126880ULL, 2938735ULL, 0ULL}),
+ BasicDecimal256({8023796054858137600ULL, 14214271235644855872ULL, 29387358ULL, 0ULL}),
+ BasicDecimal256(
+ {6450984253743169536ULL, 13015503840481697412ULL, 293873587ULL, 0ULL}),
+ BasicDecimal256(
+ {9169610316303040512ULL, 1027829888850112811ULL, 2938735877ULL, 0ULL}),
+ BasicDecimal256(
+ {17909126868192198656ULL, 10278298888501128114ULL, 29387358770ULL, 0ULL}),
+ BasicDecimal256(
+ {13070572018536022016ULL, 10549268516463523069ULL, 293873587705ULL, 0ULL}),
+ BasicDecimal256(
+ {1578511669393358848ULL, 13258964796087472617ULL, 2938735877055ULL, 0ULL}),
+ BasicDecimal256(
+ {15785116693933588480ULL, 3462439444907864858ULL, 29387358770557ULL, 0ULL}),
+ BasicDecimal256(
+ {10277214349659471872ULL, 16177650375369096972ULL, 293873587705571ULL, 0ULL}),
+ BasicDecimal256(
+ {10538423128046960640ULL, 14202551164014556797ULL, 2938735877055718ULL, 0ULL}),
+ BasicDecimal256(
+ {13150510911921848320ULL, 12898303124178706663ULL, 29387358770557187ULL, 0ULL}),
+ BasicDecimal256(
+ {2377900603251621888ULL, 18302566799529756941ULL, 293873587705571876ULL, 0ULL}),
+ BasicDecimal256(
+ {5332261958806667264ULL, 17004971331911604867ULL, 2938735877055718769ULL, 0ULL}),
+ BasicDecimal256(
+ {16429131440647569408ULL, 4029016655730084128ULL, 10940614696847636083ULL, 1ULL}),
+ BasicDecimal256({16717361816799281152ULL, 3396678409881738056ULL,
+ 17172426599928602752ULL, 15ULL}),
+ BasicDecimal256({1152921504606846976ULL, 15520040025107828953ULL,
+ 5703569335900062977ULL, 159ULL}),
+ BasicDecimal256({11529215046068469760ULL, 7626447661401876602ULL,
+ 1695461137871974930ULL, 1593ULL}),
+ BasicDecimal256({4611686018427387904ULL, 2477500319180559562ULL,
+ 16954611378719749304ULL, 15930ULL}),
+ BasicDecimal256({9223372036854775808ULL, 6328259118096044006ULL,
+ 3525417123811528497ULL, 159309ULL}),
+ BasicDecimal256({0ULL, 7942358959831785217ULL, 16807427164405733357ULL, 1593091ULL}),
+ BasicDecimal256({0ULL, 5636613303479645706ULL, 2053574980671369030ULL, 15930919ULL}),
+ BasicDecimal256({0ULL, 1025900813667802212ULL, 2089005733004138687ULL, 159309191ULL}),
+ BasicDecimal256(
+ {0ULL, 10259008136678022120ULL, 2443313256331835254ULL, 1593091911ULL}),
+ BasicDecimal256(
+ {0ULL, 10356360998232463120ULL, 5986388489608800929ULL, 15930919111ULL}),
+ BasicDecimal256(
+ {0ULL, 11329889613776873120ULL, 4523652674959354447ULL, 159309191113ULL}),
+ BasicDecimal256(
+ {0ULL, 2618431695511421504ULL, 8343038602174441244ULL, 1593091911132ULL}),
+ BasicDecimal256(
+ {0ULL, 7737572881404663424ULL, 9643409726906205977ULL, 15930919111324ULL}),
+ BasicDecimal256(
+ {0ULL, 3588752519208427776ULL, 4200376900514301694ULL, 159309191113245ULL}),
+ BasicDecimal256(
+ {0ULL, 17440781118374726144ULL, 5110280857723913709ULL, 1593091911132452ULL}),
+ BasicDecimal256(
+ {0ULL, 8387114520361296896ULL, 14209320429820033867ULL, 15930919111324522ULL}),
+ BasicDecimal256(
+ {0ULL, 10084168908774762496ULL, 12965995782233477362ULL, 159309191113245227ULL}),
+ BasicDecimal256(
+ {0ULL, 8607968719199866880ULL, 532749306367912313ULL, 1593091911132452277ULL})};
+
+static const BasicDecimal256 ScaleMultipliersHalfDecimal256[] = {
+ BasicDecimal256({0ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({13106511852580896768ULL, 2ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1937910009842106368ULL, 27ULL, 0ULL, 0ULL}),
+ BasicDecimal256({932356024711512064ULL, 271ULL, 0ULL, 0ULL}),
+ BasicDecimal256({9323560247115120640ULL, 2710ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1001882102603448320ULL, 27105ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10018821026034483200ULL, 271050ULL, 0ULL, 0ULL}),
+ BasicDecimal256({7954489891797073920ULL, 2710505ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5757922623132532736ULL, 27105054ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2238994010196672512ULL, 271050543ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3943196028257173504ULL, 2710505431ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2538472135152631808ULL, 27105054312ULL, 0ULL, 0ULL}),
+ BasicDecimal256({6937977277816766464ULL, 271050543121ULL, 0ULL, 0ULL}),
+ BasicDecimal256({14039540557039009792ULL, 2710505431213ULL, 0ULL, 0ULL}),
+ BasicDecimal256({11268197054423236608ULL, 27105054312137ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2001506101975056384ULL, 271050543121376ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1568316946041012224ULL, 2710505431213761ULL, 0ULL, 0ULL}),
+ BasicDecimal256({15683169460410122240ULL, 27105054312137610ULL, 0ULL, 0ULL}),
+ BasicDecimal256({9257742014424809472ULL, 271050543121376108ULL, 0ULL, 0ULL}),
+ BasicDecimal256({343699775700336640ULL, 2710505431213761085ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3436997757003366400ULL, 8658310238428059234ULL, 1ULL, 0ULL}),
+ BasicDecimal256({15923233496324112384ULL, 12796126089442385877ULL, 14ULL, 0ULL}),
+ BasicDecimal256({11658382373564710912ULL, 17280796452166549082ULL, 146ULL, 0ULL}),
+ BasicDecimal256({5903359293389799424ULL, 6787267858279526282ULL, 1469ULL, 0ULL}),
+ BasicDecimal256({3693360712769339392ULL, 12532446361666607975ULL, 14693ULL, 0ULL}),
+ BasicDecimal256({40118980274290688ULL, 14643999174408770056ULL, 146936ULL, 0ULL}),
+ BasicDecimal256({401189802742906880ULL, 17312783228120839248ULL, 1469367ULL, 0ULL}),
+ BasicDecimal256({4011898027429068800ULL, 7107135617822427936ULL, 14693679ULL, 0ULL}),
+ BasicDecimal256(
+ {3225492126871584768ULL, 15731123957095624514ULL, 146936793ULL, 0ULL}),
+ BasicDecimal256(
+ {13808177195006296064ULL, 9737286981279832213ULL, 1469367938ULL, 0ULL}),
+ BasicDecimal256(
+ {8954563434096099328ULL, 5139149444250564057ULL, 14693679385ULL, 0ULL}),
+ BasicDecimal256(
+ {15758658046122786816ULL, 14498006295086537342ULL, 146936793852ULL, 0ULL}),
+ BasicDecimal256(
+ {10012627871551455232ULL, 15852854434898512116ULL, 1469367938527ULL, 0ULL}),
+ BasicDecimal256(
+ {7892558346966794240ULL, 10954591759308708237ULL, 14693679385278ULL, 0ULL}),
+ BasicDecimal256(
+ {5138607174829735936ULL, 17312197224539324294ULL, 146936793852785ULL, 0ULL}),
+ BasicDecimal256(
+ {14492583600878256128ULL, 7101275582007278398ULL, 1469367938527859ULL, 0ULL}),
+ BasicDecimal256(
+ {15798627492815699968ULL, 15672523598944129139ULL, 14693679385278593ULL, 0ULL}),
+ BasicDecimal256(
+ {10412322338480586752ULL, 9151283399764878470ULL, 146936793852785938ULL, 0ULL}),
+ BasicDecimal256(
+ {11889503016258109440ULL, 17725857702810578241ULL, 1469367938527859384ULL, 0ULL}),
+ BasicDecimal256(
+ {8214565720323784704ULL, 11237880364719817872ULL, 14693679385278593849ULL, 0ULL}),
+ BasicDecimal256(
+ {8358680908399640576ULL, 1698339204940869028ULL, 17809585336819077184ULL, 7ULL}),
+ BasicDecimal256({9799832789158199296ULL, 16983392049408690284ULL,
+ 12075156704804807296ULL, 79ULL}),
+ BasicDecimal256({5764607523034234880ULL, 3813223830700938301ULL,
+ 10071102605790763273ULL, 796ULL}),
+ BasicDecimal256({2305843009213693952ULL, 1238750159590279781ULL,
+ 8477305689359874652ULL, 7965ULL}),
+ BasicDecimal256({4611686018427387904ULL, 12387501595902797811ULL,
+ 10986080598760540056ULL, 79654ULL}),
+ BasicDecimal256({9223372036854775808ULL, 13194551516770668416ULL,
+ 17627085619057642486ULL, 796545ULL}),
+ BasicDecimal256({0ULL, 2818306651739822853ULL, 10250159527190460323ULL, 7965459ULL}),
+ BasicDecimal256({0ULL, 9736322443688676914ULL, 10267874903356845151ULL, 79654595ULL}),
+ BasicDecimal256(
+ {0ULL, 5129504068339011060ULL, 10445028665020693435ULL, 796545955ULL}),
+ BasicDecimal256(
+ {0ULL, 14401552535971007368ULL, 12216566281659176272ULL, 7965459555ULL}),
+ BasicDecimal256(
+ {0ULL, 14888316843743212368ULL, 11485198374334453031ULL, 79654595556ULL}),
+ BasicDecimal256(
+ {0ULL, 1309215847755710752ULL, 4171519301087220622ULL, 796545955566ULL}),
+ BasicDecimal256(
+ {0ULL, 13092158477557107520ULL, 4821704863453102988ULL, 7965459555662ULL}),
+ BasicDecimal256(
+ {0ULL, 1794376259604213888ULL, 11323560487111926655ULL, 79654595556622ULL}),
+ BasicDecimal256(
+ {0ULL, 17943762596042138880ULL, 2555140428861956854ULL, 796545955566226ULL}),
+ BasicDecimal256(
+ {0ULL, 13416929297035424256ULL, 7104660214910016933ULL, 7965459555662261ULL}),
+ BasicDecimal256(
+ {0ULL, 5042084454387381248ULL, 15706369927971514489ULL, 79654595556622613ULL}),
+ BasicDecimal256(
+ {0ULL, 13527356396454709248ULL, 9489746690038731964ULL, 796545955566226138ULL})};
+
+#ifdef ARROW_USE_NATIVE_INT128
+static constexpr uint64_t kInt64Mask = 0xFFFFFFFFFFFFFFFF;
+#else
+static constexpr uint64_t kInt32Mask = 0xFFFFFFFF;
+#endif
+
+// same as ScaleMultipliers[38] - 1
+static constexpr BasicDecimal128 kMaxValue =
+ BasicDecimal128(5421010862427522170LL, 687399551400673280ULL - 1);
+
+#if ARROW_LITTLE_ENDIAN
+BasicDecimal128::BasicDecimal128(const uint8_t* bytes)
+ : BasicDecimal128(reinterpret_cast<const int64_t*>(bytes)[1],
+ reinterpret_cast<const uint64_t*>(bytes)[0]) {}
+#else
+BasicDecimal128::BasicDecimal128(const uint8_t* bytes)
+ : BasicDecimal128(reinterpret_cast<const int64_t*>(bytes)[0],
+ reinterpret_cast<const uint64_t*>(bytes)[1]) {}
+#endif
+
+std::array<uint8_t, 16> BasicDecimal128::ToBytes() const {
+ std::array<uint8_t, 16> out{{0}};
+ ToBytes(out.data());
+ return out;
+}
+
+void BasicDecimal128::ToBytes(uint8_t* out) const {
+ DCHECK_NE(out, nullptr);
+#if ARROW_LITTLE_ENDIAN
+ reinterpret_cast<uint64_t*>(out)[0] = low_bits_;
+ reinterpret_cast<int64_t*>(out)[1] = high_bits_;
+#else
+ reinterpret_cast<int64_t*>(out)[0] = high_bits_;
+ reinterpret_cast<uint64_t*>(out)[1] = low_bits_;
+#endif
+}
+
+BasicDecimal128& BasicDecimal128::Negate() {
+ low_bits_ = ~low_bits_ + 1;
+ high_bits_ = ~high_bits_;
+ if (low_bits_ == 0) {
+ high_bits_ = SafeSignedAdd<int64_t>(high_bits_, 1);
+ }
+ return *this;
+}
+
+BasicDecimal128& BasicDecimal128::Abs() { return *this < 0 ? Negate() : *this; }
+
+BasicDecimal128 BasicDecimal128::Abs(const BasicDecimal128& in) {
+ BasicDecimal128 result(in);
+ return result.Abs();
+}
+
+bool BasicDecimal128::FitsInPrecision(int32_t precision) const {
+ DCHECK_GT(precision, 0);
+ DCHECK_LE(precision, 38);
+ return BasicDecimal128::Abs(*this) < ScaleMultipliers[precision];
+}
+
+BasicDecimal128& BasicDecimal128::operator+=(const BasicDecimal128& right) {
+ const uint64_t sum = low_bits_ + right.low_bits_;
+ high_bits_ = SafeSignedAdd<int64_t>(high_bits_, right.high_bits_);
+ if (sum < low_bits_) {
+ high_bits_ = SafeSignedAdd<int64_t>(high_bits_, 1);
+ }
+ low_bits_ = sum;
+ return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator-=(const BasicDecimal128& right) {
+ const uint64_t diff = low_bits_ - right.low_bits_;
+ high_bits_ -= right.high_bits_;
+ if (diff > low_bits_) {
+ --high_bits_;
+ }
+ low_bits_ = diff;
+ return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator/=(const BasicDecimal128& right) {
+ BasicDecimal128 remainder;
+ auto s = Divide(right, this, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator|=(const BasicDecimal128& right) {
+ low_bits_ |= right.low_bits_;
+ high_bits_ |= right.high_bits_;
+ return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator&=(const BasicDecimal128& right) {
+ low_bits_ &= right.low_bits_;
+ high_bits_ &= right.high_bits_;
+ return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator<<=(uint32_t bits) {
+ if (bits != 0) {
+ if (bits < 64) {
+ high_bits_ = SafeLeftShift(high_bits_, bits);
+ high_bits_ |= (low_bits_ >> (64 - bits));
+ low_bits_ <<= bits;
+ } else if (bits < 128) {
+ high_bits_ = static_cast<int64_t>(low_bits_) << (bits - 64);
+ low_bits_ = 0;
+ } else {
+ high_bits_ = 0;
+ low_bits_ = 0;
+ }
+ }
+ return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator>>=(uint32_t bits) {
+ if (bits != 0) {
+ if (bits < 64) {
+ low_bits_ >>= bits;
+ low_bits_ |= static_cast<uint64_t>(high_bits_ << (64 - bits));
+ high_bits_ = static_cast<int64_t>(static_cast<uint64_t>(high_bits_) >> bits);
+ } else if (bits < 128) {
+ low_bits_ = static_cast<uint64_t>(high_bits_ >> (bits - 64));
+ high_bits_ = static_cast<int64_t>(high_bits_ >= 0L ? 0L : -1L);
+ } else {
+ high_bits_ = static_cast<int64_t>(high_bits_ >= 0L ? 0L : -1L);
+ low_bits_ = static_cast<uint64_t>(high_bits_);
+ }
+ }
+ return *this;
+}
+
+namespace {
+
+// Convenience wrapper type over 128 bit unsigned integers. We opt not to
+// replace the uint128_t type in int128_internal.h because it would require
+// significantly more implementation work to be done. This class merely
+// provides the minimum necessary set of functions to perform 128+ bit
+// multiplication operations when there may or may not be native support.
+#ifdef ARROW_USE_NATIVE_INT128
+struct uint128_t {
+ uint128_t() {}
+ uint128_t(uint64_t hi, uint64_t lo) : val_((static_cast<__uint128_t>(hi) << 64) | lo) {}
+ explicit uint128_t(const BasicDecimal128& decimal) {
+ val_ = (static_cast<__uint128_t>(decimal.high_bits()) << 64) | decimal.low_bits();
+ }
+
+ explicit uint128_t(uint64_t value) : val_(value) {}
+
+ uint64_t hi() { return val_ >> 64; }
+ uint64_t lo() { return val_ & kInt64Mask; }
+
+ uint128_t& operator+=(const uint128_t& other) {
+ val_ += other.val_;
+ return *this;
+ }
+
+ uint128_t& operator*=(const uint128_t& other) {
+ val_ *= other.val_;
+ return *this;
+ }
+
+ __uint128_t val_;
+};
+
+#else
+// Multiply two 64 bit word components into a 128 bit result, with high bits
+// stored in hi and low bits in lo.
+inline void ExtendAndMultiply(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* lo) {
+ // Perform multiplication on two 64 bit words x and y into a 128 bit result
+ // by splitting up x and y into 32 bit high/low bit components,
+ // allowing us to represent the multiplication as
+ // x * y = x_lo * y_lo + x_hi * y_lo * 2^32 + y_hi * x_lo * 2^32
+ // + x_hi * y_hi * 2^64
+ //
+ // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi
+ // Therefore,
+ // lo_lo is (x_lo * y_lo)_lo,
+ // lo_hi is ((x_lo * y_lo)_hi + (x_hi * y_lo)_lo + (x_lo * y_hi)_lo)_lo,
+ // hi_lo is ((x_hi * y_hi)_lo + (x_hi * y_lo)_hi + (x_lo * y_hi)_hi)_hi,
+ // hi_hi is (x_hi * y_hi)_hi
+ const uint64_t x_lo = x & kInt32Mask;
+ const uint64_t y_lo = y & kInt32Mask;
+ const uint64_t x_hi = x >> 32;
+ const uint64_t y_hi = y >> 32;
+
+ const uint64_t t = x_lo * y_lo;
+ const uint64_t t_lo = t & kInt32Mask;
+ const uint64_t t_hi = t >> 32;
+
+ const uint64_t u = x_hi * y_lo + t_hi;
+ const uint64_t u_lo = u & kInt32Mask;
+ const uint64_t u_hi = u >> 32;
+
+ const uint64_t v = x_lo * y_hi + u_lo;
+ const uint64_t v_hi = v >> 32;
+
+ *hi = x_hi * y_hi + u_hi + v_hi;
+ *lo = (v << 32) + t_lo;
+}
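+
+// For example, ExtendAndMultiply(1ULL << 63, 4ULL, &hi, &lo) stores hi == 2 and
+// lo == 0, i.e. the 128 bit product 2^65.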
+
+struct uint128_t {
+ uint128_t() {}
+ uint128_t(uint64_t hi, uint64_t lo) : hi_(hi), lo_(lo) {}
+ explicit uint128_t(const BasicDecimal128& decimal) {
+ hi_ = decimal.high_bits();
+ lo_ = decimal.low_bits();
+ }
+
+ uint64_t hi() const { return hi_; }
+ uint64_t lo() const { return lo_; }
+
+ uint128_t& operator+=(const uint128_t& other) {
+ // To deduce the carry bit, we perform "65 bit" addition on the low bits and
+ // seeing if the resulting high bit is 1. This is accomplished by shifting the
+ // low bits to the right by 1 (chopping off the lowest bit), then adding 1 if the
+ // result of adding the two chopped bits would have produced a carry.
+ uint64_t carry = (((lo_ & other.lo_) & 1) + (lo_ >> 1) + (other.lo_ >> 1)) >> 63;
+ hi_ += other.hi_ + carry;
+ lo_ += other.lo_;
+ return *this;
+ }
+
+ uint128_t& operator*=(const uint128_t& other) {
+ uint128_t r;
+ ExtendAndMultiply(lo_, other.lo_, &r.hi_, &r.lo_);
+ r.hi_ += (hi_ * other.lo_) + (lo_ * other.hi_);
+ *this = r;
+ return *this;
+ }
+
+ uint64_t hi_;
+ uint64_t lo_;
+};
+#endif
+
+// Multiplies two N * 64 bit unsigned integer types, represented by uint64_t
+// arrays, into a same-sized output. Elements in the arrays should be in
+// little endian order, and the output will be the same. Overflow in multiplication
+// will result in the lower N * 64 bits of the result being set.
+template <int N>
+inline void MultiplyUnsignedArray(const std::array<uint64_t, N>& lh,
+ const std::array<uint64_t, N>& rh,
+ std::array<uint64_t, N>* result) {
+ for (int j = 0; j < N; ++j) {
+ uint64_t carry = 0;
+ for (int i = 0; i < N - j; ++i) {
+ uint128_t tmp(lh[i]);
+ tmp *= uint128_t(rh[j]);
+ tmp += uint128_t((*result)[i + j]);
+ tmp += uint128_t(carry);
+ (*result)[i + j] = tmp.lo();
+ carry = tmp.hi();
+ }
+ }
+}
+
+} // namespace
+
+BasicDecimal128& BasicDecimal128::operator*=(const BasicDecimal128& right) {
+ // Since the max value of BasicDecimal128 is supposed to be 1e38 - 1 and the
+ // min value is its negation, taking the absolute values here should always be safe.
+ const bool negate = Sign() != right.Sign();
+ BasicDecimal128 x = BasicDecimal128::Abs(*this);
+ BasicDecimal128 y = BasicDecimal128::Abs(right);
+ uint128_t r(x);
+ r *= uint128_t{y};
+ high_bits_ = r.hi();
+ low_bits_ = r.lo();
+ if (negate) {
+ Negate();
+ }
+ return *this;
+}
+
+/// Expands the given little endian array of uint64_t into a big endian array of
+/// uint32_t. The value of the input array is expected to be non-negative. The
+/// result_array will have the input's leading zeros removed.
+/// \param value_array a little endian array representing the value
+/// \param result_array a big endian array of length N*2 to set with the value
+/// \return the output length of the array
+template <size_t N>
+static int64_t FillInArray(const std::array<uint64_t, N>& value_array,
+ uint32_t* result_array) {
+ int64_t next_index = 0;
+ // 1st loop to find the first non-zero value in the input
+ int64_t i = N - 1;
+ for (; i >= 0; i--) {
+ if (value_array[i] != 0) {
+ if (value_array[i] <= std::numeric_limits<uint32_t>::max()) {
+ result_array[next_index++] = static_cast<uint32_t>(value_array[i]);
+ i--;
+ }
+ break;
+ }
+ }
+ // 2nd loop to fill in the rest of the array.
+ for (int64_t j = i; j >= 0; j--) {
+ result_array[next_index++] = static_cast<uint32_t>(value_array[j] >> 32);
+ result_array[next_index++] = static_cast<uint32_t>(value_array[j]);
+ }
+ return next_index;
+}
+
+ /// Expands the given value into a big endian array of ints so that we can work on
+ /// it. The array will be converted to an absolute value, and the was_negative
+ /// flag will be set appropriately. Leading zeros are omitted from the array.
+ /// \param array a big endian array of length 4 to fill with the value
+ /// \param was_negative a flag for whether the value was originally negative
+ /// \return the output length of the array
+static int64_t FillInArray(const BasicDecimal128& value, uint32_t* array,
+ bool& was_negative) {
+ BasicDecimal128 abs_value = BasicDecimal128::Abs(value);
+ was_negative = value.high_bits() < 0;
+ uint64_t high = static_cast<uint64_t>(abs_value.high_bits());
+ uint64_t low = abs_value.low_bits();
+
+ // FillInArray(std::array<uint64_t, N>& value_array, uint32_t* result_array) is
+ // not called here because the specialized code below performs better, avoiding
+ // a performance regression in BasicDecimal128 division.
+ if (high != 0) {
+ if (high > std::numeric_limits<uint32_t>::max()) {
+ array[0] = static_cast<uint32_t>(high >> 32);
+ array[1] = static_cast<uint32_t>(high);
+ array[2] = static_cast<uint32_t>(low >> 32);
+ array[3] = static_cast<uint32_t>(low);
+ return 4;
+ }
+
+ array[0] = static_cast<uint32_t>(high);
+ array[1] = static_cast<uint32_t>(low >> 32);
+ array[2] = static_cast<uint32_t>(low);
+ return 3;
+ }
+
+ if (low > std::numeric_limits<uint32_t>::max()) {
+ array[0] = static_cast<uint32_t>(low >> 32);
+ array[1] = static_cast<uint32_t>(low);
+ return 2;
+ }
+
+ if (low == 0) {
+ return 0;
+ }
+
+ array[0] = static_cast<uint32_t>(low);
+ return 1;
+}
+
+ /// Expands the given value into a big endian array of ints so that we can work on
+ /// it. The array will be converted to an absolute value, and the was_negative
+ /// flag will be set appropriately. Leading zeros are omitted from the array.
+ /// \param array a big endian array of length 8 to fill with the value
+ /// \param was_negative a flag for whether the value was originally negative
+ /// \return the output length of the array
+static int64_t FillInArray(const BasicDecimal256& value, uint32_t* array,
+ bool& was_negative) {
+ BasicDecimal256 positive_value = value;
+ was_negative = false;
+ if (positive_value.IsNegative()) {
+ positive_value.Negate();
+ was_negative = true;
+ }
+ return FillInArray<4>(positive_value.little_endian_array(), array);
+}
+
+/// Shift the number in the array left by bits positions.
+/// \param array the number to shift, must have length elements
+/// \param length the number of entries in the array
+/// \param bits the number of bits to shift (0 <= bits < 32)
+static void ShiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) {
+ if (length > 0 && bits != 0) {
+ for (int64_t i = 0; i < length - 1; ++i) {
+ array[i] = (array[i] << bits) | (array[i + 1] >> (32 - bits));
+ }
+ array[length - 1] <<= bits;
+ }
+}
+
+/// Shift the number in the array right by bits positions.
+/// \param array the number to shift, must have length elements
+/// \param length the number of entries in the array
+/// \param bits the number of bits to shift (0 <= bits < 32)
+static inline void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
+ if (length > 0 && bits != 0) {
+ for (int64_t i = length - 1; i > 0; --i) {
+ array[i] = (array[i] >> bits) | (array[i - 1] << (32 - bits));
+ }
+ array[0] >>= bits;
+ }
+}
+
+/// \brief Fix the signs of the result and remainder at the end of the division based on
+/// the signs of the dividend and divisor.
+template <class DecimalClass>
+static inline void FixDivisionSigns(DecimalClass* result, DecimalClass* remainder,
+ bool dividend_was_negative,
+ bool divisor_was_negative) {
+ if (dividend_was_negative != divisor_was_negative) {
+ result->Negate();
+ }
+
+ if (dividend_was_negative) {
+ remainder->Negate();
+ }
+}
+
+/// \brief Build a little endian array of uint64_t from a big endian array of uint32_t.
+template <size_t N>
+static DecimalStatus BuildFromArray(std::array<uint64_t, N>* result_array,
+ const uint32_t* array, int64_t length) {
+ for (int64_t i = length - 2 * N - 1; i >= 0; i--) {
+ if (array[i] != 0) {
+ return DecimalStatus::kOverflow;
+ }
+ }
+ int64_t next_index = length - 1;
+ size_t i = 0;
+ for (; i < N && next_index >= 0; i++) {
+ uint64_t lower_bits = array[next_index--];
+ (*result_array)[i] =
+ (next_index < 0)
+ ? lower_bits
+ : ((static_cast<uint64_t>(array[next_index--]) << 32) + lower_bits);
+ }
+ for (; i < N; i++) {
+ (*result_array)[i] = 0;
+ }
+ return DecimalStatus::kSuccess;
+}
+
+/// \brief Build a BasicDecimal128 from a big endian array of uint32_t.
+static DecimalStatus BuildFromArray(BasicDecimal128* value, const uint32_t* array,
+ int64_t length) {
+ std::array<uint64_t, 2> result_array;
+ auto status = BuildFromArray(&result_array, array, length);
+ if (status != DecimalStatus::kSuccess) {
+ return status;
+ }
+ *value = {static_cast<int64_t>(result_array[1]), result_array[0]};
+ return DecimalStatus::kSuccess;
+}
+
+/// \brief Build a BasicDecimal256 from a big endian array of uint32_t.
+static DecimalStatus BuildFromArray(BasicDecimal256* value, const uint32_t* array,
+ int64_t length) {
+ std::array<uint64_t, 4> result_array;
+ auto status = BuildFromArray(&result_array, array, length);
+ if (status != DecimalStatus::kSuccess) {
+ return status;
+ }
+ *value = result_array;
+ return DecimalStatus::kSuccess;
+}
+
+/// \brief Do a division where the divisor fits into a single 32 bit value.
+template <class DecimalClass>
+static inline DecimalStatus SingleDivide(const uint32_t* dividend,
+ int64_t dividend_length, uint32_t divisor,
+ DecimalClass* remainder,
+ bool dividend_was_negative,
+ bool divisor_was_negative,
+ DecimalClass* result) {
+ uint64_t r = 0;
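+ // Note: dividing bit_width by sizeof(uint32_t) (bytes, not bits) makes this
+ // array larger than strictly required; dividend_length never exceeds
+ // bit_width / 32 + 1 entries.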
+ constexpr int64_t kDecimalArrayLength = DecimalClass::bit_width / sizeof(uint32_t) + 1;
+ uint32_t result_array[kDecimalArrayLength];
+ for (int64_t j = 0; j < dividend_length; j++) {
+ r <<= 32;
+ r += dividend[j];
+ result_array[j] = static_cast<uint32_t>(r / divisor);
+ r %= divisor;
+ }
+ auto status = BuildFromArray(result, result_array, dividend_length);
+ if (status != DecimalStatus::kSuccess) {
+ return status;
+ }
+
+ *remainder = static_cast<int64_t>(r);
+ FixDivisionSigns(result, remainder, dividend_was_negative, divisor_was_negative);
+ return DecimalStatus::kSuccess;
+}
+
+/// \brief Do a decimal division with remainder.
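+ ///
+ /// This is the classic multi-word schoolbook long division (in the style of
+ /// Knuth's Algorithm D): normalize so that the divisor's leading 32-bit digit
+ /// has its top bit set, guess each result digit from the leading words of the
+ /// dividend, then correct the guess when it is too large.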
+template <class DecimalClass>
+static inline DecimalStatus DecimalDivide(const DecimalClass& dividend,
+ const DecimalClass& divisor,
+ DecimalClass* result, DecimalClass* remainder) {
+ constexpr int64_t kDecimalArrayLength = DecimalClass::bit_width / sizeof(uint32_t);
+ // Split the dividend and divisor into integer pieces so that we can
+ // work on them.
+ uint32_t dividend_array[kDecimalArrayLength + 1];
+ uint32_t divisor_array[kDecimalArrayLength];
+ bool dividend_was_negative;
+ bool divisor_was_negative;
+ // leave an extra zero before the dividend
+ dividend_array[0] = 0;
+ int64_t dividend_length =
+ FillInArray(dividend, dividend_array + 1, dividend_was_negative) + 1;
+ int64_t divisor_length = FillInArray(divisor, divisor_array, divisor_was_negative);
+
+ // Handle some of the easy cases.
+ if (dividend_length <= divisor_length) {
+ *remainder = dividend;
+ *result = 0;
+ return DecimalStatus::kSuccess;
+ }
+
+ if (divisor_length == 0) {
+ return DecimalStatus::kDivideByZero;
+ }
+
+ if (divisor_length == 1) {
+ return SingleDivide(dividend_array, dividend_length, divisor_array[0], remainder,
+ dividend_was_negative, divisor_was_negative, result);
+ }
+
+ int64_t result_length = dividend_length - divisor_length;
+ uint32_t result_array[kDecimalArrayLength];
+ DCHECK_LE(result_length, kDecimalArrayLength);
+
+ // Normalize by shifting both left by the same number of bits so that
+ // the digit guessing is better. The requirement is that the top bit of
+ // divisor_array[0] is set, i.e. divisor_array[0] >= 2**31.
+ int64_t normalize_bits = BitUtil::CountLeadingZeros(divisor_array[0]);
+ ShiftArrayLeft(divisor_array, divisor_length, normalize_bits);
+ ShiftArrayLeft(dividend_array, dividend_length, normalize_bits);
+
+ // compute each digit in the result
+ for (int64_t j = 0; j < result_length; ++j) {
+ // Guess the next digit. At worst it is two too large
+ uint32_t guess = std::numeric_limits<uint32_t>::max();
+ const auto high_dividend =
+ static_cast<uint64_t>(dividend_array[j]) << 32 | dividend_array[j + 1];
+ if (dividend_array[j] != divisor_array[0]) {
+ guess = static_cast<uint32_t>(high_dividend / divisor_array[0]);
+ }
+
+ // catch all of the cases where guess is two too large and most of the
+ // cases where it is one too large
+ auto rhat = static_cast<uint32_t>(high_dividend -
+ guess * static_cast<uint64_t>(divisor_array[0]));
+ while (static_cast<uint64_t>(divisor_array[1]) * guess >
+ (static_cast<uint64_t>(rhat) << 32) + dividend_array[j + 2]) {
+ --guess;
+ rhat += divisor_array[0];
+ if (static_cast<uint64_t>(rhat) < divisor_array[0]) {
+ break;
+ }
+ }
+
+ // subtract off the guess * divisor from the dividend
+ uint64_t mult = 0;
+ for (int64_t i = divisor_length - 1; i >= 0; --i) {
+ mult += static_cast<uint64_t>(guess) * divisor_array[i];
+ uint32_t prev = dividend_array[j + i + 1];
+ dividend_array[j + i + 1] -= static_cast<uint32_t>(mult);
+ mult >>= 32;
+ if (dividend_array[j + i + 1] > prev) {
+ ++mult;
+ }
+ }
+ uint32_t prev = dividend_array[j];
+ dividend_array[j] -= static_cast<uint32_t>(mult);
+
+ // if guess was too big, we add back divisor
+ if (dividend_array[j] > prev) {
+ --guess;
+ uint32_t carry = 0;
+ for (int64_t i = divisor_length - 1; i >= 0; --i) {
+ const auto sum =
+ static_cast<uint64_t>(divisor_array[i]) + dividend_array[j + i + 1] + carry;
+ dividend_array[j + i + 1] = static_cast<uint32_t>(sum);
+ carry = static_cast<uint32_t>(sum >> 32);
+ }
+ dividend_array[j] += carry;
+ }
+
+ result_array[j] = guess;
+ }
+
+ // denormalize the remainder
+ ShiftArrayRight(dividend_array, dividend_length, normalize_bits);
+
+ // return result and remainder
+ auto status = BuildFromArray(result, result_array, result_length);
+ if (status != DecimalStatus::kSuccess) {
+ return status;
+ }
+ status = BuildFromArray(remainder, dividend_array, dividend_length);
+ if (status != DecimalStatus::kSuccess) {
+ return status;
+ }
+
+ FixDivisionSigns(result, remainder, dividend_was_negative, divisor_was_negative);
+ return DecimalStatus::kSuccess;
+}
+
+DecimalStatus BasicDecimal128::Divide(const BasicDecimal128& divisor,
+ BasicDecimal128* result,
+ BasicDecimal128* remainder) const {
+ return DecimalDivide(*this, divisor, result, remainder);
+}
+
+bool operator==(const BasicDecimal128& left, const BasicDecimal128& right) {
+ return left.high_bits() == right.high_bits() && left.low_bits() == right.low_bits();
+}
+
+bool operator!=(const BasicDecimal128& left, const BasicDecimal128& right) {
+ return !operator==(left, right);
+}
+
+bool operator<(const BasicDecimal128& left, const BasicDecimal128& right) {
+ return left.high_bits() < right.high_bits() ||
+ (left.high_bits() == right.high_bits() && left.low_bits() < right.low_bits());
+}
+
+bool operator<=(const BasicDecimal128& left, const BasicDecimal128& right) {
+ return !operator>(left, right);
+}
+
+bool operator>(const BasicDecimal128& left, const BasicDecimal128& right) {
+ return operator<(right, left);
+}
+
+bool operator>=(const BasicDecimal128& left, const BasicDecimal128& right) {
+ return !operator<(left, right);
+}
+
+BasicDecimal128 operator-(const BasicDecimal128& operand) {
+ BasicDecimal128 result(operand.high_bits(), operand.low_bits());
+ return result.Negate();
+}
+
+BasicDecimal128 operator~(const BasicDecimal128& operand) {
+ BasicDecimal128 result(~operand.high_bits(), ~operand.low_bits());
+ return result;
+}
+
+BasicDecimal128 operator+(const BasicDecimal128& left, const BasicDecimal128& right) {
+ BasicDecimal128 result(left.high_bits(), left.low_bits());
+ result += right;
+ return result;
+}
+
+BasicDecimal128 operator-(const BasicDecimal128& left, const BasicDecimal128& right) {
+ BasicDecimal128 result(left.high_bits(), left.low_bits());
+ result -= right;
+ return result;
+}
+
+BasicDecimal128 operator*(const BasicDecimal128& left, const BasicDecimal128& right) {
+ BasicDecimal128 result(left.high_bits(), left.low_bits());
+ result *= right;
+ return result;
+}
+
+BasicDecimal128 operator/(const BasicDecimal128& left, const BasicDecimal128& right) {
+ BasicDecimal128 remainder;
+ BasicDecimal128 result;
+ auto s = left.Divide(right, &result, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ return result;
+}
+
+BasicDecimal128 operator%(const BasicDecimal128& left, const BasicDecimal128& right) {
+ BasicDecimal128 remainder;
+ BasicDecimal128 result;
+ auto s = left.Divide(right, &result, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ return remainder;
+}
+
+template <class DecimalClass>
+static bool RescaleWouldCauseDataLoss(const DecimalClass& value, int32_t delta_scale,
+ const DecimalClass& multiplier,
+ DecimalClass* result) {
+ if (delta_scale < 0) {
+ DCHECK_NE(multiplier, 0);
+ DecimalClass remainder;
+ auto status = value.Divide(multiplier, result, &remainder);
+ DCHECK_EQ(status, DecimalStatus::kSuccess);
+ return remainder != 0;
+ }
+
+ *result = value * multiplier;
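+ // Heuristic overflow check: without overflow, scaling up by a positive
+ // multiplier moves the value away from zero, so a truncated product that
+ // moved toward zero (or crossed it) indicates overflow.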
+ return (value < 0) ? *result > value : *result < value;
+}
+
+template <class DecimalClass>
+DecimalStatus DecimalRescale(const DecimalClass& value, int32_t original_scale,
+ int32_t new_scale, DecimalClass* out) {
+ DCHECK_NE(out, nullptr);
+
+ if (original_scale == new_scale) {
+ *out = value;
+ return DecimalStatus::kSuccess;
+ }
+
+ const int32_t delta_scale = new_scale - original_scale;
+ const int32_t abs_delta_scale = std::abs(delta_scale);
+
+ DecimalClass multiplier = DecimalClass::GetScaleMultiplier(abs_delta_scale);
+
+ const bool rescale_would_cause_data_loss =
+ RescaleWouldCauseDataLoss(value, delta_scale, multiplier, out);
+
+ // Fail if we overflow or truncate
+ if (ARROW_PREDICT_FALSE(rescale_would_cause_data_loss)) {
+ return DecimalStatus::kRescaleDataLoss;
+ }
+
+ return DecimalStatus::kSuccess;
+}
+
+DecimalStatus BasicDecimal128::Rescale(int32_t original_scale, int32_t new_scale,
+ BasicDecimal128* out) const {
+ return DecimalRescale(*this, original_scale, new_scale, out);
+}
+
+ void BasicDecimal128::GetWholeAndFraction(int32_t scale, BasicDecimal128* whole,
+ BasicDecimal128* fraction) const {
+ DCHECK_GE(scale, 0);
+ DCHECK_LE(scale, 38);
+
+ BasicDecimal128 multiplier(ScaleMultipliers[scale]);
+ auto s = Divide(multiplier, whole, fraction);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+}
+
+const BasicDecimal128& BasicDecimal128::GetScaleMultiplier(int32_t scale) {
+ DCHECK_GE(scale, 0);
+ DCHECK_LE(scale, 38);
+
+ return ScaleMultipliers[scale];
+}
+
+const BasicDecimal128& BasicDecimal128::GetMaxValue() { return kMaxValue; }
+
+BasicDecimal128 BasicDecimal128::IncreaseScaleBy(int32_t increase_by) const {
+ DCHECK_GE(increase_by, 0);
+ DCHECK_LE(increase_by, 38);
+
+ return (*this) * ScaleMultipliers[increase_by];
+}
+
+BasicDecimal128 BasicDecimal128::ReduceScaleBy(int32_t reduce_by, bool round) const {
+ DCHECK_GE(reduce_by, 0);
+ DCHECK_LE(reduce_by, 38);
+
+ if (reduce_by == 0) {
+ return *this;
+ }
+
+ BasicDecimal128 divisor(ScaleMultipliers[reduce_by]);
+ BasicDecimal128 result;
+ BasicDecimal128 remainder;
+ auto s = Divide(divisor, &result, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ if (round) {
+ auto divisor_half = ScaleMultipliersHalf[reduce_by];
+ if (remainder.Abs() >= divisor_half) {
+ if (result > 0) {
+ result += 1;
+ } else {
+ result -= 1;
+ }
+ }
+ }
+ return result;
+}
+
+int32_t BasicDecimal128::CountLeadingBinaryZeros() const {
+ DCHECK_GE(*this, BasicDecimal128(0));
+
+ if (high_bits_ == 0) {
+ return BitUtil::CountLeadingZeros(low_bits_) + 64;
+ } else {
+ return BitUtil::CountLeadingZeros(static_cast<uint64_t>(high_bits_));
+ }
+}
+
+#if ARROW_LITTLE_ENDIAN
+BasicDecimal256::BasicDecimal256(const uint8_t* bytes)
+ : little_endian_array_(
+ std::array<uint64_t, 4>({reinterpret_cast<const uint64_t*>(bytes)[0],
+ reinterpret_cast<const uint64_t*>(bytes)[1],
+ reinterpret_cast<const uint64_t*>(bytes)[2],
+ reinterpret_cast<const uint64_t*>(bytes)[3]})) {}
+#else
+BasicDecimal256::BasicDecimal256(const uint8_t* bytes)
+ : little_endian_array_(
+ std::array<uint64_t, 4>({reinterpret_cast<const uint64_t*>(bytes)[3],
+ reinterpret_cast<const uint64_t*>(bytes)[2],
+ reinterpret_cast<const uint64_t*>(bytes)[1],
+ reinterpret_cast<const uint64_t*>(bytes)[0]})) {}
+#endif
+
+BasicDecimal256& BasicDecimal256::Negate() {
+ uint64_t carry = 1;
+ for (uint64_t& elem : little_endian_array_) {
+ elem = ~elem + carry;
+ carry &= (elem == 0);
+ }
+ return *this;
+}
+
+BasicDecimal256& BasicDecimal256::Abs() { return *this < 0 ? Negate() : *this; }
+
+BasicDecimal256 BasicDecimal256::Abs(const BasicDecimal256& in) {
+ BasicDecimal256 result(in);
+ return result.Abs();
+}
+
+BasicDecimal256& BasicDecimal256::operator+=(const BasicDecimal256& right) {
+ uint64_t carry = 0;
+ for (size_t i = 0; i < little_endian_array_.size(); i++) {
+ const uint64_t right_value = right.little_endian_array_[i];
+ uint64_t sum = right_value + carry;
+ carry = 0;
+ if (sum < right_value) {
+ carry += 1;
+ }
+ sum += little_endian_array_[i];
+ if (sum < little_endian_array_[i]) {
+ carry += 1;
+ }
+ little_endian_array_[i] = sum;
+ }
+ return *this;
+}
+
+BasicDecimal256& BasicDecimal256::operator-=(const BasicDecimal256& right) {
+ *this += -right;
+ return *this;
+}
+
+BasicDecimal256& BasicDecimal256::operator<<=(uint32_t bits) {
+ if (bits == 0) {
+ return *this;
+ }
+ int cross_word_shift = bits / 64;
+ if (static_cast<size_t>(cross_word_shift) >= little_endian_array_.size()) {
+ little_endian_array_ = {0, 0, 0, 0};
+ return *this;
+ }
+ uint32_t in_word_shift = bits % 64;
+ for (int i = static_cast<int>(little_endian_array_.size() - 1); i >= cross_word_shift;
+ i--) {
+ // Account for shifts larger than 64 bits
+ little_endian_array_[i] = little_endian_array_[i - cross_word_shift];
+ little_endian_array_[i] <<= in_word_shift;
+ if (in_word_shift != 0 && i >= cross_word_shift + 1) {
+ little_endian_array_[i] |=
+ little_endian_array_[i - (cross_word_shift + 1)] >> (64 - in_word_shift);
+ }
+ }
+ for (int i = cross_word_shift - 1; i >= 0; i--) {
+ little_endian_array_[i] = 0;
+ }
+ return *this;
+}
+
+std::array<uint8_t, 32> BasicDecimal256::ToBytes() const {
+ std::array<uint8_t, 32> out{{0}};
+ ToBytes(out.data());
+ return out;
+}
+
+void BasicDecimal256::ToBytes(uint8_t* out) const {
+ DCHECK_NE(out, nullptr);
+#if ARROW_LITTLE_ENDIAN
+ reinterpret_cast<int64_t*>(out)[0] = little_endian_array_[0];
+ reinterpret_cast<int64_t*>(out)[1] = little_endian_array_[1];
+ reinterpret_cast<int64_t*>(out)[2] = little_endian_array_[2];
+ reinterpret_cast<int64_t*>(out)[3] = little_endian_array_[3];
+#else
+ reinterpret_cast<int64_t*>(out)[0] = little_endian_array_[3];
+ reinterpret_cast<int64_t*>(out)[1] = little_endian_array_[2];
+ reinterpret_cast<int64_t*>(out)[2] = little_endian_array_[1];
+ reinterpret_cast<int64_t*>(out)[3] = little_endian_array_[0];
+#endif
+}
+
+BasicDecimal256& BasicDecimal256::operator*=(const BasicDecimal256& right) {
+ // Since the max value of BasicDecimal256 is supposed to be 1e76 - 1 and the
+ // min value the negation of that, taking the absolute values here should
+ // always be safe.
+ const bool negate = Sign() != right.Sign();
+ BasicDecimal256 x = BasicDecimal256::Abs(*this);
+ BasicDecimal256 y = BasicDecimal256::Abs(right);
+
+ std::array<uint64_t, 4> res{0, 0, 0, 0};
+ MultiplyUnsignedArray<4>(x.little_endian_array_, y.little_endian_array_, &res);
+ little_endian_array_ = res;
+ if (negate) {
+ Negate();
+ }
+ return *this;
+}
+
+DecimalStatus BasicDecimal256::Divide(const BasicDecimal256& divisor,
+ BasicDecimal256* result,
+ BasicDecimal256* remainder) const {
+ return DecimalDivide(*this, divisor, result, remainder);
+}
+
+DecimalStatus BasicDecimal256::Rescale(int32_t original_scale, int32_t new_scale,
+ BasicDecimal256* out) const {
+ return DecimalRescale(*this, original_scale, new_scale, out);
+}
+
+BasicDecimal256 BasicDecimal256::IncreaseScaleBy(int32_t increase_by) const {
+ DCHECK_GE(increase_by, 0);
+ DCHECK_LE(increase_by, 76);
+
+ return (*this) * ScaleMultipliersDecimal256[increase_by];
+}
+
+BasicDecimal256 BasicDecimal256::ReduceScaleBy(int32_t reduce_by, bool round) const {
+ DCHECK_GE(reduce_by, 0);
+ DCHECK_LE(reduce_by, 76);
+
+ if (reduce_by == 0) {
+ return *this;
+ }
+
+ BasicDecimal256 divisor(ScaleMultipliersDecimal256[reduce_by]);
+ BasicDecimal256 result;
+ BasicDecimal256 remainder;
+ auto s = Divide(divisor, &result, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ if (round) {
+ auto divisor_half = ScaleMultipliersHalfDecimal256[reduce_by];
+ if (remainder.Abs() >= divisor_half) {
+ if (result > 0) {
+ result += 1;
+ } else {
+ result -= 1;
+ }
+ }
+ }
+ return result;
+}
+
+bool BasicDecimal256::FitsInPrecision(int32_t precision) const {
+ DCHECK_GT(precision, 0);
+ DCHECK_LE(precision, 76);
+ return BasicDecimal256::Abs(*this) < ScaleMultipliersDecimal256[precision];
+}
+
+const BasicDecimal256& BasicDecimal256::GetScaleMultiplier(int32_t scale) {
+ DCHECK_GE(scale, 0);
+ DCHECK_LE(scale, 76);
+
+ return ScaleMultipliersDecimal256[scale];
+}
+
+BasicDecimal256 operator*(const BasicDecimal256& left, const BasicDecimal256& right) {
+ BasicDecimal256 result = left;
+ result *= right;
+ return result;
+}
+
+bool operator<(const BasicDecimal256& left, const BasicDecimal256& right) {
+ const std::array<uint64_t, 4>& lhs = left.little_endian_array();
+ const std::array<uint64_t, 4>& rhs = right.little_endian_array();
+ return lhs[3] != rhs[3]
+ ? static_cast<int64_t>(lhs[3]) < static_cast<int64_t>(rhs[3])
+ : lhs[2] != rhs[2] ? lhs[2] < rhs[2]
+ : lhs[1] != rhs[1] ? lhs[1] < rhs[1] : lhs[0] < rhs[0];
+}
+
+BasicDecimal256 operator-(const BasicDecimal256& operand) {
+ BasicDecimal256 result(operand);
+ return result.Negate();
+}
+
+BasicDecimal256 operator~(const BasicDecimal256& operand) {
+ const std::array<uint64_t, 4>& arr = operand.little_endian_array();
+ BasicDecimal256 result({~arr[0], ~arr[1], ~arr[2], ~arr[3]});
+ return result;
+}
+
+BasicDecimal256& BasicDecimal256::operator/=(const BasicDecimal256& right) {
+ BasicDecimal256 remainder;
+ auto s = Divide(right, this, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ return *this;
+}
+
+BasicDecimal256 operator+(const BasicDecimal256& left, const BasicDecimal256& right) {
+ BasicDecimal256 sum = left;
+ sum += right;
+ return sum;
+}
+
+BasicDecimal256 operator/(const BasicDecimal256& left, const BasicDecimal256& right) {
+ BasicDecimal256 remainder;
+ BasicDecimal256 result;
+ auto s = left.Divide(right, &result, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ return result;
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h
new file mode 100644
index 00000000000..acc8ea4930f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h
@@ -0,0 +1,342 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <type_traits>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/type_traits.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+enum class DecimalStatus {
+ kSuccess,
+ kDivideByZero,
+ kOverflow,
+ kRescaleDataLoss,
+};
+
+/// Represents a signed 128-bit integer in two's complement.
+///
+/// This class is also compiled into LLVM IR - so, it should not have cpp references like
+/// streams and boost.
+class ARROW_EXPORT BasicDecimal128 {
+ public:
+ static constexpr int bit_width = 128;
+
+ /// \brief Create a BasicDecimal128 from the two's complement representation.
+ constexpr BasicDecimal128(int64_t high, uint64_t low) noexcept
+ : low_bits_(low), high_bits_(high) {}
+
+ /// \brief Empty constructor creates a BasicDecimal128 with a value of 0.
+ constexpr BasicDecimal128() noexcept : BasicDecimal128(0, 0) {}
+
+ /// \brief Convert any integer value into a BasicDecimal128.
+ template <typename T,
+ typename = typename std::enable_if<
+ std::is_integral<T>::value && (sizeof(T) <= sizeof(uint64_t)), T>::type>
+ constexpr BasicDecimal128(T value) noexcept
+ : BasicDecimal128(value >= T{0} ? 0 : -1, static_cast<uint64_t>(value)) { // NOLINT
+ }
+
+ /// \brief Create a BasicDecimal128 from an array of bytes. Bytes are assumed to be in
+ /// native-endian byte order.
+ explicit BasicDecimal128(const uint8_t* bytes);
+
+ /// \brief Negate the current value (in-place)
+ BasicDecimal128& Negate();
+
+ /// \brief Absolute value (in-place)
+ BasicDecimal128& Abs();
+
+ /// \brief Absolute value
+ static BasicDecimal128 Abs(const BasicDecimal128& left);
+
+ /// \brief Add a number to this one. The result is truncated to 128 bits.
+ BasicDecimal128& operator+=(const BasicDecimal128& right);
+
+ /// \brief Subtract a number from this one. The result is truncated to 128 bits.
+ BasicDecimal128& operator-=(const BasicDecimal128& right);
+
+ /// \brief Multiply this number by another number. The result is truncated to 128 bits.
+ BasicDecimal128& operator*=(const BasicDecimal128& right);
+
+ /// Divide this number by right and return the result.
+ ///
+ /// This operation is not destructive.
+ /// The answer rounds toward zero. Signs work like:
+ /// 21 / 5 -> 4, 1
+ /// -21 / 5 -> -4, -1
+ /// 21 / -5 -> -4, 1
+ /// -21 / -5 -> 4, -1
+ /// \param[in] divisor the number to divide by
+ /// \param[out] result the quotient
+ /// \param[out] remainder the remainder after the division
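+ ///
+ /// A minimal usage sketch (illustrative):
+ ///
+ ///   BasicDecimal128 quotient, remainder;
+ ///   DecimalStatus s = BasicDecimal128(21).Divide(5, &quotient, &remainder);
+ ///   // s == DecimalStatus::kSuccess, quotient == 4, remainder == 1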
+ DecimalStatus Divide(const BasicDecimal128& divisor, BasicDecimal128* result,
+ BasicDecimal128* remainder) const;
+
+ /// \brief In-place division.
+ BasicDecimal128& operator/=(const BasicDecimal128& right);
+
+ /// \brief Bitwise "or" between two BasicDecimal128.
+ BasicDecimal128& operator|=(const BasicDecimal128& right);
+
+ /// \brief Bitwise "and" between two BasicDecimal128.
+ BasicDecimal128& operator&=(const BasicDecimal128& right);
+
+ /// \brief Shift left by the given number of bits.
+ BasicDecimal128& operator<<=(uint32_t bits);
+
+ /// \brief Shift right by the given number of bits. Negative values will
+ /// sign-extend.
+ BasicDecimal128& operator>>=(uint32_t bits);
+
+ /// \brief Get the high bits of the two's complement representation of the number.
+ inline constexpr int64_t high_bits() const { return high_bits_; }
+
+ /// \brief Get the low bits of the two's complement representation of the number.
+ inline constexpr uint64_t low_bits() const { return low_bits_; }
+
+ /// \brief Return the raw bytes of the value in native-endian byte order.
+ std::array<uint8_t, 16> ToBytes() const;
+ void ToBytes(uint8_t* out) const;
+
+ /// \brief Separate the integer and fractional parts for the given scale.
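+ /// For example, with scale 2, 12345 splits into whole 123 and fraction 45.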
+ void GetWholeAndFraction(int32_t scale, BasicDecimal128* whole,
+ BasicDecimal128* fraction) const;
+
+ /// \brief Scale multiplier for given scale value.
+ static const BasicDecimal128& GetScaleMultiplier(int32_t scale);
+
+ /// \brief Convert BasicDecimal128 from one scale to another
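+ ///
+ /// For example (illustrative), rescaling 123 from scale 0 to scale 2 yields
+ /// 12300, while rescaling 12345 from scale 2 to scale 1 returns
+ /// kRescaleDataLoss because a non-zero digit would be dropped.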
+ DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
+ BasicDecimal128* out) const;
+
+ /// \brief Scale up.
+ BasicDecimal128 IncreaseScaleBy(int32_t increase_by) const;
+
+ /// \brief Scale down.
+ /// - If 'round' is true, the right-most digits are dropped and the result value is
+ /// rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits
+ /// (>= 10^reduce_by / 2).
+ /// - If 'round' is false, the right-most digits are simply dropped.
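+ ///
+ /// For example (illustrative), BasicDecimal128(12345).ReduceScaleBy(2)
+ /// yields 123 (the dropped digits 45 are below 50), while
+ /// BasicDecimal128(12355).ReduceScaleBy(2) rounds away from zero to 124.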
+ BasicDecimal128 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
+
+ /// \brief Whether this number fits in the given precision
+ ///
+ /// Return true if the number of significant digits is less than or equal to
+ /// `precision`.
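+ ///
+ /// For example, BasicDecimal128(99).FitsInPrecision(2) is true, while
+ /// BasicDecimal128(99).FitsInPrecision(1) is false.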
+ bool FitsInPrecision(int32_t precision) const;
+
+ // returns 1 for positive and zero decimal values, -1 for negative decimal values.
+ inline int64_t Sign() const { return 1 | (high_bits_ >> 63); }
+
+ /// \brief count the number of leading binary zeroes.
+ int32_t CountLeadingBinaryZeros() const;
+
+ /// \brief Get the maximum valid unscaled decimal value.
+ static const BasicDecimal128& GetMaxValue();
+
+ private:
+ uint64_t low_bits_;
+ int64_t high_bits_;
+};
+
+ARROW_EXPORT bool operator==(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator!=(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator<(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator<=(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator>(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator>=(const BasicDecimal128& left, const BasicDecimal128& right);
+
+ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& operand);
+ARROW_EXPORT BasicDecimal128 operator~(const BasicDecimal128& operand);
+ARROW_EXPORT BasicDecimal128 operator+(const BasicDecimal128& left,
+ const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& left,
+ const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator*(const BasicDecimal128& left,
+ const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator/(const BasicDecimal128& left,
+ const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator%(const BasicDecimal128& left,
+ const BasicDecimal128& right);
+
+class ARROW_EXPORT BasicDecimal256 {
+ private:
+ // Due to a bug in clang, we have to declare the extend method prior to its
+ // usage.
+ template <typename T>
+ inline static constexpr uint64_t extend(T low_bits) noexcept {
+ return low_bits >= T() ? uint64_t{0} : ~uint64_t{0};
+ }
+
+ public:
+ static constexpr int bit_width = 256;
+
+ /// \brief Create a BasicDecimal256 from the two's complement representation.
+ constexpr BasicDecimal256(const std::array<uint64_t, 4>& little_endian_array) noexcept
+ : little_endian_array_(little_endian_array) {}
+
+ /// \brief Empty constructor creates a BasicDecimal256 with a value of 0.
+ constexpr BasicDecimal256() noexcept : little_endian_array_({0, 0, 0, 0}) {}
+
+ /// \brief Convert any integer value into a BasicDecimal256.
+ template <typename T,
+ typename = typename std::enable_if<
+ std::is_integral<T>::value && (sizeof(T) <= sizeof(uint64_t)), T>::type>
+ constexpr BasicDecimal256(T value) noexcept
+ : little_endian_array_({static_cast<uint64_t>(value), extend(value), extend(value),
+ extend(value)}) {}
+
+ constexpr BasicDecimal256(const BasicDecimal128& value) noexcept
+ : little_endian_array_({value.low_bits(), static_cast<uint64_t>(value.high_bits()),
+ extend(value.high_bits()), extend(value.high_bits())}) {}
+
+ /// \brief Create a BasicDecimal256 from an array of bytes. Bytes are assumed to be in
+ /// native-endian byte order.
+ explicit BasicDecimal256(const uint8_t* bytes);
+
+ /// \brief Negate the current value (in-place)
+ BasicDecimal256& Negate();
+
+ /// \brief Absolute value (in-place)
+ BasicDecimal256& Abs();
+
+ /// \brief Absolute value
+ static BasicDecimal256 Abs(const BasicDecimal256& left);
+
+ /// \brief Add a number to this one. The result is truncated to 256 bits.
+ BasicDecimal256& operator+=(const BasicDecimal256& right);
+
+ /// \brief Subtract a number from this one. The result is truncated to 256 bits.
+ BasicDecimal256& operator-=(const BasicDecimal256& right);
+
+ /// \brief Get the bits of the two's complement representation of the number. The 4
+ /// elements are in little endian order. The bits within each uint64_t element are in
+ /// native endian order. For example,
+ /// BasicDecimal256(123).little_endian_array() = {123, 0, 0, 0};
+ /// BasicDecimal256(-2).little_endian_array() = {0xFF...FE, 0xFF...FF, 0xFF...FF,
+ /// 0xFF...FF}.
+ inline const std::array<uint64_t, 4>& little_endian_array() const {
+ return little_endian_array_;
+ }
+
+ /// \brief Get the lowest bits of the two's complement representation of the number.
+ inline constexpr uint64_t low_bits() const { return little_endian_array_[0]; }
+
+ /// \brief Return the raw bytes of the value in native-endian byte order.
+ std::array<uint8_t, 32> ToBytes() const;
+ void ToBytes(uint8_t* out) const;
+
+ /// \brief Scale multiplier for given scale value.
+ static const BasicDecimal256& GetScaleMultiplier(int32_t scale);
+
+ /// \brief Convert BasicDecimal256 from one scale to another
+ DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
+ BasicDecimal256* out) const;
+
+ /// \brief Scale up.
+ BasicDecimal256 IncreaseScaleBy(int32_t increase_by) const;
+
+ /// \brief Scale down.
+ /// - If 'round' is true, the right-most digits are dropped and the result value is
+ /// rounded up (+1 for positive, -1 for negative) based on the value of the
+ /// dropped digits (>= 10^reduce_by / 2).
+ /// - If 'round' is false, the right-most digits are simply dropped.
+ BasicDecimal256 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
+
+ /// \brief Whether this number fits in the given precision
+ ///
+ /// Return true if the number of significant digits is less than or equal to
+ /// `precision`.
+ bool FitsInPrecision(int32_t precision) const;
+
+ inline int64_t Sign() const {
+ return 1 | (static_cast<int64_t>(little_endian_array_[3]) >> 63);
+ }
+
+ inline bool IsNegative() const {
+ return static_cast<int64_t>(little_endian_array_[3]) < 0;
+ }
+
+ /// \brief Multiply this number by another number. The result is truncated to 256 bits.
+ BasicDecimal256& operator*=(const BasicDecimal256& right);
+
+ /// Divide this number by right and return the result.
+ ///
+ /// This operation is not destructive.
+ /// The answer rounds toward zero. Signs work like:
+ /// 21 / 5 -> 4, 1
+ /// -21 / 5 -> -4, -1
+ /// 21 / -5 -> -4, 1
+ /// -21 / -5 -> 4, -1
+ /// \param[in] divisor the number to divide by
+ /// \param[out] result the quotient
+ /// \param[out] remainder the remainder after the division
+ DecimalStatus Divide(const BasicDecimal256& divisor, BasicDecimal256* result,
+ BasicDecimal256* remainder) const;
+
+ /// \brief Shift left by the given number of bits.
+ BasicDecimal256& operator<<=(uint32_t bits);
+
+ /// \brief In-place division.
+ BasicDecimal256& operator/=(const BasicDecimal256& right);
+
+ private:
+ std::array<uint64_t, 4> little_endian_array_;
+};
+
+ARROW_EXPORT inline bool operator==(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return left.little_endian_array() == right.little_endian_array();
+}
+
+ARROW_EXPORT inline bool operator!=(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return left.little_endian_array() != right.little_endian_array();
+}
+
+ARROW_EXPORT bool operator<(const BasicDecimal256& left, const BasicDecimal256& right);
+
+ARROW_EXPORT inline bool operator<=(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return !operator<(right, left);
+}
+
+ARROW_EXPORT inline bool operator>(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return operator<(right, left);
+}
+
+ARROW_EXPORT inline bool operator>=(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return !operator<(left, right);
+}
+
+ARROW_EXPORT BasicDecimal256 operator-(const BasicDecimal256& operand);
+ARROW_EXPORT BasicDecimal256 operator~(const BasicDecimal256& operand);
+ARROW_EXPORT BasicDecimal256 operator+(const BasicDecimal256& left,
+ const BasicDecimal256& right);
+ARROW_EXPORT BasicDecimal256 operator*(const BasicDecimal256& left,
+ const BasicDecimal256& right);
+ARROW_EXPORT BasicDecimal256 operator/(const BasicDecimal256& left,
+ const BasicDecimal256& right);
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc
new file mode 100644
index 00000000000..c67cedc4a06
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bit_block_counter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <type_traits>
+
+#include "arrow/buffer.h"
+#include "arrow/util/bitmap_ops.h"
+
+namespace arrow {
+namespace internal {
+
+BitBlockCount BitBlockCounter::GetBlockSlow(int64_t block_size) noexcept {
+ const int16_t run_length = static_cast<int16_t>(std::min(bits_remaining_, block_size));
+ int16_t popcount = static_cast<int16_t>(CountSetBits(bitmap_, offset_, run_length));
+ bits_remaining_ -= run_length;
+ // This code path should trigger _at most_ 2 times. In the "two times"
+ // case, the first time the run length will be a multiple of 8 by construction.
+ bitmap_ += run_length / 8;
+ return {run_length, popcount};
+}
+
+// Prevent pointer arithmetic on nullptr, which is undefined behavior even if the pointer
+// is never dereferenced.
+inline const uint8_t* EnsureNotNull(const uint8_t* ptr) {
+ static const uint8_t byte{};
+ return ptr == nullptr ? &byte : ptr;
+}
+
+OptionalBitBlockCounter::OptionalBitBlockCounter(const uint8_t* validity_bitmap,
+ int64_t offset, int64_t length)
+ : has_bitmap_(validity_bitmap != nullptr),
+ position_(0),
+ length_(length),
+ counter_(EnsureNotNull(validity_bitmap), offset, length) {}
+
+OptionalBitBlockCounter::OptionalBitBlockCounter(
+ const std::shared_ptr<Buffer>& validity_bitmap, int64_t offset, int64_t length)
+ : OptionalBitBlockCounter(validity_bitmap ? validity_bitmap->data() : nullptr, offset,
+ length) {}
+
+OptionalBinaryBitBlockCounter::OptionalBinaryBitBlockCounter(const uint8_t* left_bitmap,
+ int64_t left_offset,
+ const uint8_t* right_bitmap,
+ int64_t right_offset,
+ int64_t length)
+ : has_bitmap_(HasBitmapFromBitmaps(left_bitmap != nullptr, right_bitmap != nullptr)),
+ position_(0),
+ length_(length),
+ unary_counter_(EnsureNotNull(left_bitmap != nullptr ? left_bitmap : right_bitmap),
+ left_bitmap != nullptr ? left_offset : right_offset, length),
+ binary_counter_(EnsureNotNull(left_bitmap), left_offset,
+ EnsureNotNull(right_bitmap), right_offset, length) {}
+
+OptionalBinaryBitBlockCounter::OptionalBinaryBitBlockCounter(
+ const std::shared_ptr<Buffer>& left_bitmap, int64_t left_offset,
+ const std::shared_ptr<Buffer>& right_bitmap, int64_t right_offset, int64_t length)
+ : OptionalBinaryBitBlockCounter(
+ left_bitmap ? left_bitmap->data() : nullptr, left_offset,
+ right_bitmap ? right_bitmap->data() : nullptr, right_offset, length) {}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h
new file mode 100644
index 00000000000..63036af52a4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h
@@ -0,0 +1,542 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <memory>
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+namespace detail {
+
+inline uint64_t LoadWord(const uint8_t* bytes) {
+ return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
+}
+
+inline uint64_t ShiftWord(uint64_t current, uint64_t next, int64_t shift) {
+ if (shift == 0) {
+ return current;
+ }
+ return (current >> shift) | (next << (64 - shift));
+}
+
+// These templates are here to help with unit tests
+
+template <typename T>
+struct BitBlockAnd {
+ static T Call(T left, T right) { return left & right; }
+};
+
+template <>
+struct BitBlockAnd<bool> {
+ static bool Call(bool left, bool right) { return left && right; }
+};
+
+template <typename T>
+struct BitBlockAndNot {
+ static T Call(T left, T right) { return left & ~right; }
+};
+
+template <>
+struct BitBlockAndNot<bool> {
+ static bool Call(bool left, bool right) { return left && !right; }
+};
+
+template <typename T>
+struct BitBlockOr {
+ static T Call(T left, T right) { return left | right; }
+};
+
+template <>
+struct BitBlockOr<bool> {
+ static bool Call(bool left, bool right) { return left || right; }
+};
+
+template <typename T>
+struct BitBlockOrNot {
+ static T Call(T left, T right) { return left | ~right; }
+};
+
+template <>
+struct BitBlockOrNot<bool> {
+ static bool Call(bool left, bool right) { return left || !right; }
+};
+
+} // namespace detail
+
+/// \brief Return value from bit block counters: the total number of bits and
+/// the number of set bits.
+struct BitBlockCount {
+ int16_t length;
+ int16_t popcount;
+
+ bool NoneSet() const { return this->popcount == 0; }
+ bool AllSet() const { return this->length == this->popcount; }
+};
+
+/// \brief A class that scans through a true/false bitmap to compute popcounts
+/// 64 or 256 bits at a time. This is used to accelerate processing of
+/// mostly-not-null array data.
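+ ///
+ /// A minimal usage sketch (illustrative), counting the set bits of a bitmap:
+ ///
+ ///   BitBlockCounter counter(bitmap, /*start_offset=*/0, /*length=*/n);
+ ///   int64_t set_bits = 0;
+ ///   for (BitBlockCount block = counter.NextWord(); block.length > 0;
+ ///        block = counter.NextWord()) {
+ ///     set_bits += block.popcount;
+ ///   }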
+class ARROW_EXPORT BitBlockCounter {
+ public:
+ BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap + start_offset / 8),
+ bits_remaining_(length),
+ offset_(start_offset % 8) {}
+
+ /// \brief The bit size of each word run
+ static constexpr int64_t kWordBits = 64;
+
+ /// \brief The bit size of a four-word run
+ static constexpr int64_t kFourWordsBits = kWordBits * 4;
+
+ /// \brief Return the next run of available bits, usually 256. The returned
+ /// pair contains the size of the run and the number of true values. The last
+ /// block will have a length less than 256 if the bitmap length is not a
+ /// multiple of 256, and will return 0-length blocks in subsequent
+ /// invocations.
+ BitBlockCount NextFourWords() {
+ using detail::LoadWord;
+ using detail::ShiftWord;
+
+ if (!bits_remaining_) {
+ return {0, 0};
+ }
+ int64_t total_popcount = 0;
+ if (offset_ == 0) {
+ if (bits_remaining_ < kFourWordsBits) {
+ return GetBlockSlow(kFourWordsBits);
+ }
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_));
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 8));
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 16));
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 24));
+ } else {
+ // When the offset is > 0, we need there to be a word beyond the last
+ // aligned word in the bitmap for the bit shifting logic; the four-word
+ // path below loads five 64-bit words in total.
+ if (bits_remaining_ < kFourWordsBits + kWordBits - offset_) {
+ return GetBlockSlow(kFourWordsBits);
+ }
+ auto current = LoadWord(bitmap_);
+ auto next = LoadWord(bitmap_ + 8);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ current = next;
+ next = LoadWord(bitmap_ + 16);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ current = next;
+ next = LoadWord(bitmap_ + 24);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ current = next;
+ next = LoadWord(bitmap_ + 32);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ }
+ bitmap_ += BitUtil::BytesForBits(kFourWordsBits);
+ bits_remaining_ -= kFourWordsBits;
+ return {256, static_cast<int16_t>(total_popcount)};
+ }
+
+ /// \brief Return the next run of available bits, usually 64. The returned
+ /// pair contains the size of the run and the number of true values. The last
+ /// block will have a length less than 64 if the bitmap length is not a
+ /// multiple of 64, and will return 0-length blocks in subsequent
+ /// invocations.
+ BitBlockCount NextWord() {
+ using detail::LoadWord;
+ using detail::ShiftWord;
+
+ if (!bits_remaining_) {
+ return {0, 0};
+ }
+ int64_t popcount = 0;
+ if (offset_ == 0) {
+ if (bits_remaining_ < kWordBits) {
+ return GetBlockSlow(kWordBits);
+ }
+ popcount = BitUtil::PopCount(LoadWord(bitmap_));
+ } else {
+ // When the offset is > 0, we need there to be a word beyond the last
+ // aligned word in the bitmap for the bit shifting logic.
+ if (bits_remaining_ < 2 * kWordBits - offset_) {
+ return GetBlockSlow(kWordBits);
+ }
+ popcount =
+ BitUtil::PopCount(ShiftWord(LoadWord(bitmap_), LoadWord(bitmap_ + 8), offset_));
+ }
+ bitmap_ += kWordBits / 8;
+ bits_remaining_ -= kWordBits;
+ return {64, static_cast<int16_t>(popcount)};
+ }
+
+ private:
+ /// \brief Return a block with the requested size when doing word-wise
+ /// computation is not possible due to inadequate bits remaining.
+ BitBlockCount GetBlockSlow(int64_t block_size) noexcept;
+
+ const uint8_t* bitmap_;
+ int64_t bits_remaining_;
+ int64_t offset_;
+};
+
+/// \brief A tool to iterate through a possibly non-existent validity bitmap,
+/// to allow us to write one code path for both the with-nulls and no-nulls
+/// cases without giving up a lot of performance.
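+ ///
+ /// (See the VisitBitBlocks helpers at the bottom of this header for a
+ /// typical usage pattern built on this class.)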
+class ARROW_EXPORT OptionalBitBlockCounter {
+ public:
+ // validity_bitmap may be NULLPTR
+ OptionalBitBlockCounter(const uint8_t* validity_bitmap, int64_t offset, int64_t length);
+
+ // validity_bitmap may be null
+ OptionalBitBlockCounter(const std::shared_ptr<Buffer>& validity_bitmap, int64_t offset,
+ int64_t length);
+
+ /// Return the block count for the next word when the bitmap is available.
+ /// Otherwise, when there is no validity bitmap, return a block with length up
+ /// to INT16_MAX (all the referenced values are non-null).
+ BitBlockCount NextBlock() {
+ static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
+ if (has_bitmap_) {
+ BitBlockCount block = counter_.NextWord();
+ position_ += block.length;
+ return block;
+ } else {
+ int16_t block_size =
+ static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
+ position_ += block_size;
+ // All values are non-null
+ return {block_size, block_size};
+ }
+ }
+
+ // Like NextBlock, but returns a word-sized block even when there is no
+ // validity bitmap
+ BitBlockCount NextWord() {
+ static constexpr int64_t kWordSize = 64;
+ if (has_bitmap_) {
+ BitBlockCount block = counter_.NextWord();
+ position_ += block.length;
+ return block;
+ } else {
+ int16_t block_size = static_cast<int16_t>(std::min(kWordSize, length_ - position_));
+ position_ += block_size;
+ // All values are non-null
+ return {block_size, block_size};
+ }
+ }
+
+ private:
+ const bool has_bitmap_;
+ int64_t position_;
+ int64_t length_;
+ BitBlockCounter counter_;
+};
+
+/// \brief A class that computes popcounts on the result of bitwise operations
+/// between two bitmaps, 64 bits at a time. A 64-bit word is loaded from each
+/// bitmap, then the popcount is computed on e.g. the bitwise-and of the two
+/// words.
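+ ///
+ /// A minimal usage sketch (illustrative), counting positions where both
+ /// bitmaps have their bit set:
+ ///
+ ///   BinaryBitBlockCounter counter(left, 0, right, 0, length);
+ ///   int64_t both_set = 0;
+ ///   for (BitBlockCount block = counter.NextAndWord(); block.length > 0;
+ ///        block = counter.NextAndWord()) {
+ ///     both_set += block.popcount;
+ ///   }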
+class ARROW_EXPORT BinaryBitBlockCounter {
+ public:
+ BinaryBitBlockCounter(const uint8_t* left_bitmap, int64_t left_offset,
+ const uint8_t* right_bitmap, int64_t right_offset, int64_t length)
+ : left_bitmap_(left_bitmap + left_offset / 8),
+ left_offset_(left_offset % 8),
+ right_bitmap_(right_bitmap + right_offset / 8),
+ right_offset_(right_offset % 8),
+ bits_remaining_(length) {}
+
+ /// \brief Return the popcount of the bitwise-and of the next run of
+ /// available bits, up to 64. The returned pair contains the size of the run and
+ /// the number of true values. The last block will have a length less than 64
+ /// if the bitmap length is not a multiple of 64, and will return 0-length
+ /// blocks in subsequent invocations.
+ BitBlockCount NextAndWord() { return NextWord<detail::BitBlockAnd>(); }
+
+ /// \brief Computes "x & ~y" block for each available run of bits.
+ BitBlockCount NextAndNotWord() { return NextWord<detail::BitBlockAndNot>(); }
+
+ /// \brief Computes "x | y" block for each available run of bits.
+ BitBlockCount NextOrWord() { return NextWord<detail::BitBlockOr>(); }
+
+ /// \brief Computes "x | ~y" block for each available run of bits.
+ BitBlockCount NextOrNotWord() { return NextWord<detail::BitBlockOrNot>(); }
+
+ private:
+ template <template <typename T> class Op>
+ BitBlockCount NextWord() {
+ using detail::LoadWord;
+ using detail::ShiftWord;
+
+ if (!bits_remaining_) {
+ return {0, 0};
+ }
+ // When the offset is > 0, we need there to be a word beyond the last aligned
+ // word in the bitmap for the bit shifting logic.
+ constexpr int64_t kWordBits = BitBlockCounter::kWordBits;
+ const int64_t bits_required_to_use_words =
+ std::max(left_offset_ == 0 ? 64 : 64 + (64 - left_offset_),
+ right_offset_ == 0 ? 64 : 64 + (64 - right_offset_));
+ if (bits_remaining_ < bits_required_to_use_words) {
+ const int16_t run_length =
+ static_cast<int16_t>(std::min(bits_remaining_, kWordBits));
+ int16_t popcount = 0;
+ for (int64_t i = 0; i < run_length; ++i) {
+ if (Op<bool>::Call(BitUtil::GetBit(left_bitmap_, left_offset_ + i),
+ BitUtil::GetBit(right_bitmap_, right_offset_ + i))) {
+ ++popcount;
+ }
+ }
+ // This code path should trigger _at most_ 2 times. In the "two times"
+ // case, the first time the run length will be a multiple of 8.
+ left_bitmap_ += run_length / 8;
+ right_bitmap_ += run_length / 8;
+ bits_remaining_ -= run_length;
+ return {run_length, popcount};
+ }
+
+ int64_t popcount = 0;
+ if (left_offset_ == 0 && right_offset_ == 0) {
+ popcount = BitUtil::PopCount(
+ Op<uint64_t>::Call(LoadWord(left_bitmap_), LoadWord(right_bitmap_)));
+ } else {
+ auto left_word =
+ ShiftWord(LoadWord(left_bitmap_), LoadWord(left_bitmap_ + 8), left_offset_);
+ auto right_word =
+ ShiftWord(LoadWord(right_bitmap_), LoadWord(right_bitmap_ + 8), right_offset_);
+ popcount = BitUtil::PopCount(Op<uint64_t>::Call(left_word, right_word));
+ }
+ left_bitmap_ += kWordBits / 8;
+ right_bitmap_ += kWordBits / 8;
+ bits_remaining_ -= kWordBits;
+ return {64, static_cast<int16_t>(popcount)};
+ }
+
+ const uint8_t* left_bitmap_;
+ int64_t left_offset_;
+ const uint8_t* right_bitmap_;
+ int64_t right_offset_;
+ int64_t bits_remaining_;
+};
+
+class ARROW_EXPORT OptionalBinaryBitBlockCounter {
+ public:
+ // Any bitmap may be NULLPTR
+ OptionalBinaryBitBlockCounter(const uint8_t* left_bitmap, int64_t left_offset,
+ const uint8_t* right_bitmap, int64_t right_offset,
+ int64_t length);
+
+ // Any bitmap may be null
+ OptionalBinaryBitBlockCounter(const std::shared_ptr<Buffer>& left_bitmap,
+ int64_t left_offset,
+ const std::shared_ptr<Buffer>& right_bitmap,
+ int64_t right_offset, int64_t length);
+
+ BitBlockCount NextAndBlock() {
+ static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
+ switch (has_bitmap_) {
+ case HasBitmap::BOTH: {
+ BitBlockCount block = binary_counter_.NextAndWord();
+ position_ += block.length;
+ return block;
+ }
+ case HasBitmap::ONE: {
+ BitBlockCount block = unary_counter_.NextWord();
+ position_ += block.length;
+ return block;
+ }
+ case HasBitmap::NONE:
+ default: {
+ const int16_t block_size =
+ static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
+ position_ += block_size;
+ // All values are non-null
+ return {block_size, block_size};
+ }
+ }
+ }
+
+ BitBlockCount NextOrNotBlock() {
+ static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
+ switch (has_bitmap_) {
+ case HasBitmap::BOTH: {
+ BitBlockCount block = binary_counter_.NextOrNotWord();
+ position_ += block.length;
+ return block;
+ }
+ case HasBitmap::ONE: {
+ BitBlockCount block = unary_counter_.NextWord();
+ position_ += block.length;
+ return block;
+ }
+ case HasBitmap::NONE:
+ default: {
+ const int16_t block_size =
+ static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
+ position_ += block_size;
+ // All values are non-null
+ return {block_size, block_size};
+ }
+ }
+ }
+
+ private:
+ enum class HasBitmap : int { BOTH, ONE, NONE };
+
+ const HasBitmap has_bitmap_;
+ int64_t position_;
+ int64_t length_;
+ BitBlockCounter unary_counter_;
+ BinaryBitBlockCounter binary_counter_;
+
+ static HasBitmap HasBitmapFromBitmaps(bool has_left, bool has_right) {
+ switch (static_cast<int>(has_left) + static_cast<int>(has_right)) {
+ case 0:
+ return HasBitmap::NONE;
+ case 1:
+ return HasBitmap::ONE;
+ default: // 2
+ return HasBitmap::BOTH;
+ }
+ }
+};
+
+// Functional-style bit block visitors.
+
+template <typename VisitNotNull, typename VisitNull>
+static Status VisitBitBlocks(const std::shared_ptr<Buffer>& bitmap_buf, int64_t offset,
+ int64_t length, VisitNotNull&& visit_not_null,
+ VisitNull&& visit_null) {
+ const uint8_t* bitmap = NULLPTR;
+ if (bitmap_buf != NULLPTR) {
+ bitmap = bitmap_buf->data();
+ }
+ internal::OptionalBitBlockCounter bit_counter(bitmap, offset, length);
+ int64_t position = 0;
+ while (position < length) {
+ internal::BitBlockCount block = bit_counter.NextBlock();
+ if (block.AllSet()) {
+ for (int64_t i = 0; i < block.length; ++i, ++position) {
+ ARROW_RETURN_NOT_OK(visit_not_null(position));
+ }
+ } else if (block.NoneSet()) {
+ for (int64_t i = 0; i < block.length; ++i, ++position) {
+ ARROW_RETURN_NOT_OK(visit_null());
+ }
+ } else {
+ for (int64_t i = 0; i < block.length; ++i, ++position) {
+ if (BitUtil::GetBit(bitmap, offset + position)) {
+ ARROW_RETURN_NOT_OK(visit_not_null(position));
+ } else {
+ ARROW_RETURN_NOT_OK(visit_null());
+ }
+ }
+ }
+ }
+ return Status::OK();
+}
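+
+// A minimal usage sketch (illustrative): counting the valid (non-null) slots
+// of a validity bitmap from inside a function that returns Status.
+//
+//   int64_t valid_count = 0;
+//   ARROW_RETURN_NOT_OK(VisitBitBlocks(
+//       validity_buffer, /*offset=*/0, /*length=*/n,
+//       [&](int64_t) { ++valid_count; return Status::OK(); },
+//       [&]() { return Status::OK(); }));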
+
+template <typename VisitNotNull, typename VisitNull>
+static void VisitBitBlocksVoid(const std::shared_ptr<Buffer>& bitmap_buf, int64_t offset,
+ int64_t length, VisitNotNull&& visit_not_null,
+ VisitNull&& visit_null) {
+ const uint8_t* bitmap = NULLPTR;
+ if (bitmap_buf != NULLPTR) {
+ bitmap = bitmap_buf->data();
+ }
+ internal::OptionalBitBlockCounter bit_counter(bitmap, offset, length);
+ int64_t position = 0;
+ while (position < length) {
+ internal::BitBlockCount block = bit_counter.NextBlock();
+ if (block.AllSet()) {
+ for (int64_t i = 0; i < block.length; ++i, ++position) {
+ visit_not_null(position);
+ }
+ } else if (block.NoneSet()) {
+ for (int64_t i = 0; i < block.length; ++i, ++position) {
+ visit_null();
+ }
+ } else {
+ for (int64_t i = 0; i < block.length; ++i, ++position) {
+ if (BitUtil::GetBit(bitmap, offset + position)) {
+ visit_not_null(position);
+ } else {
+ visit_null();
+ }
+ }
+ }
+ }
+}
+
+template <typename VisitNotNull, typename VisitNull>
+static void VisitTwoBitBlocksVoid(const std::shared_ptr<Buffer>& left_bitmap_buf,
+ int64_t left_offset,
+ const std::shared_ptr<Buffer>& right_bitmap_buf,
+ int64_t right_offset, int64_t length,
+ VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
+ if (left_bitmap_buf == NULLPTR || right_bitmap_buf == NULLPTR) {
+ // At most one bitmap is present
+ if (left_bitmap_buf == NULLPTR) {
+ return VisitBitBlocksVoid(right_bitmap_buf, right_offset, length,
+ std::forward<VisitNotNull>(visit_not_null),
+ std::forward<VisitNull>(visit_null));
+ } else {
+ return VisitBitBlocksVoid(left_bitmap_buf, left_offset, length,
+ std::forward<VisitNotNull>(visit_not_null),
+ std::forward<VisitNull>(visit_null));
+ }
+ }
+ // Both bitmaps are present
+ const uint8_t* left_bitmap = left_bitmap_buf->data();
+ const uint8_t* right_bitmap = right_bitmap_buf->data();
+ BinaryBitBlockCounter bit_counter(left_bitmap, left_offset, right_bitmap, right_offset,
+ length);
+ int64_t position = 0;
+ while (position < length) {
+ BitBlockCount block = bit_counter.NextAndWord();
+ if (block.AllSet()) {
+ for (int64_t i = 0; i < block.length; ++i, ++position) {
+ visit_not_null(position);
+ }
+ } else if (block.NoneSet()) {
+ for (int64_t i = 0; i < block.length; ++i, ++position) {
+ visit_null();
+ }
+ } else {
+ for (int64_t i = 0; i < block.length; ++i, ++position) {
+ if (BitUtil::GetBit(left_bitmap, left_offset + position) &&
+ BitUtil::GetBit(right_bitmap, right_offset + position)) {
+ visit_not_null(position);
+ } else {
+ visit_null();
+ }
+ }
+ }
+ }
+}
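+
+// A minimal usage sketch (hypothetical names): count valid and null values
+// with VisitBitBlocksVoid, given an array's validity buffer, offset and length.
+//
+//   int64_t valid_count = 0, null_count = 0;
+//   VisitBitBlocksVoid(
+//       array_data->buffers[0], array_data->offset, array_data->length,
+//       [&](int64_t position) { ++valid_count; },
+//       [&]() { ++null_count; });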
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc
new file mode 100644
index 00000000000..eda6088eb32
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bit_run_reader.h"
+
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+namespace internal {
+
+#if ARROW_LITTLE_ENDIAN
+
+BitRunReader::BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap + (start_offset / 8)),
+ position_(start_offset % 8),
+ length_(position_ + length) {
+ if (ARROW_PREDICT_FALSE(length == 0)) {
+ word_ = 0;
+ return;
+ }
+
+  // On the initial load, if there is an offset, we need to account for it
+  // when loading bytes. Every subsequent call to LoadWord() should occur only
+  // when position_ is a multiple of 64.
+ current_run_bit_set_ = !BitUtil::GetBit(bitmap, start_offset);
+ int64_t bits_remaining = length + position_;
+
+ LoadWord(bits_remaining);
+
+ // Prepare for inversion in NextRun.
+ // Clear out any preceding bits.
+ word_ = word_ & ~BitUtil::LeastSignificantBitMask(position_);
+}
+
+#endif
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h
new file mode 100644
index 00000000000..3e196628477
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h
@@ -0,0 +1,515 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+struct BitRun {
+ int64_t length;
+ // Whether bits are set at this point.
+ bool set;
+
+ std::string ToString() const {
+ return std::string("{Length: ") + std::to_string(length) +
+ ", set=" + std::to_string(set) + "}";
+ }
+};
+
+inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+ return lhs.length == rhs.length && lhs.set == rhs.set;
+}
+
+inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {
+ return lhs.length != rhs.length || lhs.set != rhs.set;
+}
+
+class BitRunReaderLinear {
+ public:
+ BitRunReaderLinear(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : reader_(bitmap, start_offset, length) {}
+
+ BitRun NextRun() {
+ BitRun rl = {/*length=*/0, reader_.IsSet()};
+    // Advance while the values are equal and not at the end of the list.
+ while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) {
+ rl.length++;
+ reader_.Next();
+ }
+ return rl;
+ }
+
+ private:
+ BitmapReader reader_;
+};
+
+#if ARROW_LITTLE_ENDIAN
+/// A convenience class for iterating over the runs of contiguous set/unset
+/// bits in a bitmap.
+class ARROW_EXPORT BitRunReader {
+ public:
+  /// \brief Constructs a new BitRunReader.
+ ///
+ /// \param[in] bitmap source data
+ /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to read
+ BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
+
+ /// Returns a new BitRun containing the number of contiguous
+ /// bits with the same value. length == 0 indicates the
+ /// end of the bitmap.
+ BitRun NextRun() {
+ if (ARROW_PREDICT_FALSE(position_ >= length_)) {
+ return {/*length=*/0, false};
+ }
+    // This implementation relies on an efficient implementation of
+    // CountTrailingZeros and assumes that long runs are the common
+    // case. The logic is to incrementally find the next bit change
+    // from the current position. This is done by zeroing all
+    // bits in word_ up to position_ and using CountTrailingZeros
+    // to find the index of the next set bit.
+
+ // The runs alternate on each call, so flip the bit.
+ current_run_bit_set_ = !current_run_bit_set_;
+
+ int64_t start_position = position_;
+ int64_t start_bit_offset = start_position & 63;
+    // Invert the word for proper use of CountTrailingZeros and
+    // clear bits so CountTrailingZeros can do its magic.
+ word_ = ~word_ & ~BitUtil::LeastSignificantBitMask(start_bit_offset);
+
+ // Go forward until the next change from unset to set.
+ int64_t new_bits = BitUtil::CountTrailingZeros(word_) - start_bit_offset;
+ position_ += new_bits;
+
+ if (ARROW_PREDICT_FALSE(BitUtil::IsMultipleOf64(position_)) &&
+ ARROW_PREDICT_TRUE(position_ < length_)) {
+ // Continue extending position while we can advance an entire word.
+ // (updates position_ accordingly).
+ AdvanceUntilChange();
+ }
+
+ return {/*length=*/position_ - start_position, current_run_bit_set_};
+ }
+
+ private:
+ void AdvanceUntilChange() {
+ int64_t new_bits = 0;
+ do {
+ // Advance the position of the bitmap for loading.
+ bitmap_ += sizeof(uint64_t);
+ LoadNextWord();
+ new_bits = BitUtil::CountTrailingZeros(word_);
+ // Continue calculating run length.
+ position_ += new_bits;
+ } while (ARROW_PREDICT_FALSE(BitUtil::IsMultipleOf64(position_)) &&
+ ARROW_PREDICT_TRUE(position_ < length_) && new_bits > 0);
+ }
+
+  void LoadNextWord() { LoadWord(length_ - position_); }
+
+  // Helper method for loading the next word.
+ void LoadWord(int64_t bits_remaining) {
+ word_ = 0;
+    // When fewer than 64 bits remain, we load partial bytes and add a sentinel bit.
+ if (ARROW_PREDICT_TRUE(bits_remaining >= 64)) {
+ std::memcpy(&word_, bitmap_, 8);
+ } else {
+ int64_t bytes_to_load = BitUtil::BytesForBits(bits_remaining);
+ auto word_ptr = reinterpret_cast<uint8_t*>(&word_);
+ std::memcpy(word_ptr, bitmap_, bytes_to_load);
+      // Ensure the run stops at the last bit in the bitmap by setting the
+      // next higher order bit to the inverse of the last valid bit.
+ BitUtil::SetBitTo(word_ptr, bits_remaining,
+ !BitUtil::GetBit(word_ptr, bits_remaining - 1));
+ }
+
+ // Two cases:
+ // 1. For unset, CountTrailingZeros works naturally so we don't
+ // invert the word.
+ // 2. Otherwise invert so we can use CountTrailingZeros.
+ if (current_run_bit_set_) {
+ word_ = ~word_;
+ }
+ }
+ const uint8_t* bitmap_;
+ int64_t position_;
+ int64_t length_;
+ uint64_t word_;
+ bool current_run_bit_set_;
+};
+#else
+using BitRunReader = BitRunReaderLinear;
+#endif
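+
+// A minimal usage sketch: iterate alternating runs until an empty run is
+// returned (values shown assume a hypothetical one-byte bitmap 0x0F).
+//
+//   const uint8_t bitmap[] = {0x0F};
+//   BitRunReader reader(bitmap, /*start_offset=*/0, /*length=*/8);
+//   for (BitRun run = reader.NextRun(); run.length != 0; run = reader.NextRun()) {
+//     // First run: {Length: 4, set=1}; second run: {Length: 4, set=0}.
+//   }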
+
+struct SetBitRun {
+ int64_t position;
+ int64_t length;
+
+ bool AtEnd() const { return length == 0; }
+
+ std::string ToString() const {
+ return std::string("{pos=") + std::to_string(position) +
+ ", len=" + std::to_string(length) + "}";
+ }
+
+ bool operator==(const SetBitRun& other) const {
+ return position == other.position && length == other.length;
+ }
+ bool operator!=(const SetBitRun& other) const {
+ return position != other.position || length != other.length;
+ }
+};
+
+template <bool Reverse>
+class BaseSetBitRunReader {
+ public:
+  /// \brief Constructs a new BaseSetBitRunReader.
+ ///
+ /// \param[in] bitmap source data
+ /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to read
+ ARROW_NOINLINE
+ BaseSetBitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap),
+ length_(length),
+ remaining_(length_),
+ current_word_(0),
+ current_num_bits_(0) {
+ if (Reverse) {
+ bitmap_ += (start_offset + length) / 8;
+ const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 8);
+ if (length > 0 && end_bit_offset) {
+ // Get LSBs from last byte
+ ++bitmap_;
+ current_num_bits_ =
+ std::min(static_cast<int32_t>(length), static_cast<int32_t>(end_bit_offset));
+ current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_);
+ }
+ } else {
+ bitmap_ += start_offset / 8;
+ const int8_t bit_offset = static_cast<int8_t>(start_offset % 8);
+ if (length > 0 && bit_offset) {
+ // Get MSBs from first byte
+ current_num_bits_ =
+ std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - bit_offset));
+ current_word_ = LoadPartialWord(bit_offset, current_num_bits_);
+ }
+ }
+ }
+
+ ARROW_NOINLINE
+ SetBitRun NextRun() {
+ int64_t pos = 0;
+ int64_t len = 0;
+ if (current_num_bits_) {
+ const auto run = FindCurrentRun();
+ assert(remaining_ >= 0);
+ if (run.length && current_num_bits_) {
+ // The run ends in current_word_
+ return AdjustRun(run);
+ }
+ pos = run.position;
+ len = run.length;
+ }
+ if (!len) {
+ // We didn't get any ones in current_word_, so we can skip any zeros
+ // in the following words
+ SkipNextZeros();
+ if (remaining_ == 0) {
+ return {0, 0};
+ }
+ assert(current_num_bits_);
+ pos = position();
+ } else if (!current_num_bits_) {
+ if (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+ current_word_ = LoadFullWord();
+ current_num_bits_ = 64;
+ } else if (remaining_ > 0) {
+ current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+ current_num_bits_ = static_cast<int32_t>(remaining_);
+ } else {
+        // No bits remaining: return the run found so far (possibly empty)
+ return AdjustRun({pos, len});
+ }
+ // If current word starts with a zero, we got a full run
+ if (!(current_word_ & kFirstBit)) {
+ return AdjustRun({pos, len});
+ }
+ }
+ // Current word should now start with a set bit
+ len += CountNextOnes();
+ return AdjustRun({pos, len});
+ }
+
+ protected:
+ int64_t position() const {
+ if (Reverse) {
+ return remaining_;
+ } else {
+ return length_ - remaining_;
+ }
+ }
+
+ SetBitRun AdjustRun(SetBitRun run) {
+ if (Reverse) {
+ assert(run.position >= run.length);
+ run.position -= run.length;
+ }
+ return run;
+ }
+
+ uint64_t LoadFullWord() {
+ uint64_t word;
+ if (Reverse) {
+ bitmap_ -= 8;
+ }
+ memcpy(&word, bitmap_, 8);
+ if (!Reverse) {
+ bitmap_ += 8;
+ }
+ return BitUtil::ToLittleEndian(word);
+ }
+
+ uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
+ assert(num_bits > 0);
+ uint64_t word = 0;
+ const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
+ if (Reverse) {
+ // Read in the most significant bytes of the word
+ bitmap_ -= num_bytes;
+ memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
+ // XXX MostSignificantBitmask
+ return (BitUtil::ToLittleEndian(word) << bit_offset) &
+ ~BitUtil::LeastSignificantBitMask(64 - num_bits);
+ } else {
+ memcpy(&word, bitmap_, num_bytes);
+ bitmap_ += num_bytes;
+ return (BitUtil::ToLittleEndian(word) >> bit_offset) &
+ BitUtil::LeastSignificantBitMask(num_bits);
+ }
+ }
+
+ void SkipNextZeros() {
+ assert(current_num_bits_ == 0);
+ while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+ current_word_ = LoadFullWord();
+ const auto num_zeros = CountFirstZeros(current_word_);
+ if (num_zeros < 64) {
+ // Run of zeros ends here
+ current_word_ = ConsumeBits(current_word_, num_zeros);
+ current_num_bits_ = 64 - num_zeros;
+ remaining_ -= num_zeros;
+ assert(remaining_ >= 0);
+ assert(current_num_bits_ >= 0);
+ return;
+ }
+ remaining_ -= 64;
+ }
+ // Run of zeros continues in last bitmap word
+ if (remaining_ > 0) {
+ current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+ current_num_bits_ = static_cast<int32_t>(remaining_);
+ const auto num_zeros =
+ std::min<int32_t>(current_num_bits_, CountFirstZeros(current_word_));
+ current_word_ = ConsumeBits(current_word_, num_zeros);
+ current_num_bits_ -= num_zeros;
+ remaining_ -= num_zeros;
+ assert(remaining_ >= 0);
+ assert(current_num_bits_ >= 0);
+ }
+ }
+
+ int64_t CountNextOnes() {
+ assert(current_word_ & kFirstBit);
+
+ int64_t len;
+ if (~current_word_) {
+ const auto num_ones = CountFirstZeros(~current_word_);
+ assert(num_ones <= current_num_bits_);
+ assert(num_ones <= remaining_);
+ remaining_ -= num_ones;
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ -= num_ones;
+ if (current_num_bits_) {
+ // Run of ones ends here
+ return num_ones;
+ }
+ len = num_ones;
+ } else {
+ // current_word_ is all ones
+ remaining_ -= 64;
+ current_num_bits_ = 0;
+ len = 64;
+ }
+
+ while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+ current_word_ = LoadFullWord();
+ const auto num_ones = CountFirstZeros(~current_word_);
+ len += num_ones;
+ remaining_ -= num_ones;
+ if (num_ones < 64) {
+ // Run of ones ends here
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ = 64 - num_ones;
+ return len;
+ }
+ }
+ // Run of ones continues in last bitmap word
+ if (remaining_ > 0) {
+ current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+ current_num_bits_ = static_cast<int32_t>(remaining_);
+ const auto num_ones = CountFirstZeros(~current_word_);
+ assert(num_ones <= current_num_bits_);
+ assert(num_ones <= remaining_);
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ -= num_ones;
+ remaining_ -= num_ones;
+ len += num_ones;
+ }
+ return len;
+ }
+
+ SetBitRun FindCurrentRun() {
+ // Skip any pending zeros
+ const auto num_zeros = CountFirstZeros(current_word_);
+ if (num_zeros >= current_num_bits_) {
+ remaining_ -= current_num_bits_;
+ current_word_ = 0;
+ current_num_bits_ = 0;
+ return {0, 0};
+ }
+ assert(num_zeros <= remaining_);
+ current_word_ = ConsumeBits(current_word_, num_zeros);
+ current_num_bits_ -= num_zeros;
+ remaining_ -= num_zeros;
+ const int64_t pos = position();
+ // Count any ones
+ const auto num_ones = CountFirstZeros(~current_word_);
+ assert(num_ones <= current_num_bits_);
+ assert(num_ones <= remaining_);
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ -= num_ones;
+ remaining_ -= num_ones;
+ return {pos, num_ones};
+ }
+
+ inline int CountFirstZeros(uint64_t word);
+ inline uint64_t ConsumeBits(uint64_t word, int32_t num_bits);
+
+ const uint8_t* bitmap_;
+ const int64_t length_;
+ int64_t remaining_;
+ uint64_t current_word_;
+ int32_t current_num_bits_;
+
+ static constexpr uint64_t kFirstBit = Reverse ? 0x8000000000000000ULL : 1;
+};
+
+template <>
+inline int BaseSetBitRunReader<false>::CountFirstZeros(uint64_t word) {
+ return BitUtil::CountTrailingZeros(word);
+}
+
+template <>
+inline int BaseSetBitRunReader<true>::CountFirstZeros(uint64_t word) {
+ return BitUtil::CountLeadingZeros(word);
+}
+
+template <>
+inline uint64_t BaseSetBitRunReader<false>::ConsumeBits(uint64_t word, int32_t num_bits) {
+ return word >> num_bits;
+}
+
+template <>
+inline uint64_t BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t num_bits) {
+ return word << num_bits;
+}
+
+using SetBitRunReader = BaseSetBitRunReader</*Reverse=*/false>;
+using ReverseSetBitRunReader = BaseSetBitRunReader</*Reverse=*/true>;
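+
+// A minimal usage sketch: enumerate runs of set bits. Positions are relative
+// to the reader's start offset; a zero-length run signals the end.
+//
+//   SetBitRunReader reader(bitmap, /*start_offset=*/0, /*length=*/length);
+//   for (SetBitRun run = reader.NextRun(); !run.AtEnd(); run = reader.NextRun()) {
+//     // Process bits [run.position, run.position + run.length).
+//   }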
+
+// Functional-style bit run visitors.
+
+// XXX: Try to make this function small so the compiler can inline and optimize
+// the `visit` function, which is normally a hot loop with vectorizable code.
+// - don't inline SetBitRunReader constructor, it doesn't hurt performance
+// - un-inline NextRun hurts 'many null' cases a bit, but improves normal cases
+template <typename Visit>
+inline Status VisitSetBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
+ Visit&& visit) {
+ if (bitmap == NULLPTR) {
+    // A null bitmap pointer means all bits are set (as for an absent validity bitmap)
+ return visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
+ }
+ SetBitRunReader reader(bitmap, offset, length);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ ARROW_RETURN_NOT_OK(visit(run.position, run.length));
+ }
+ return Status::OK();
+}
+
+template <typename Visit>
+inline void VisitSetBitRunsVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
+ Visit&& visit) {
+ if (bitmap == NULLPTR) {
+    // A null bitmap pointer means all bits are set (as for an absent validity bitmap)
+ visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
+ return;
+ }
+ SetBitRunReader reader(bitmap, offset, length);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ visit(run.position, run.length);
+ }
+}
+
+template <typename Visit>
+inline Status VisitSetBitRuns(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
+ int64_t length, Visit&& visit) {
+ return VisitSetBitRuns(bitmap ? bitmap->data() : NULLPTR, offset, length,
+ std::forward<Visit>(visit));
+}
+
+template <typename Visit>
+inline void VisitSetBitRunsVoid(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
+ int64_t length, Visit&& visit) {
+ VisitSetBitRunsVoid(bitmap ? bitmap->data() : NULLPTR, offset, length,
+ std::forward<Visit>(visit));
+}
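+
+// A minimal usage sketch (hypothetical `validity` buffer): sum the lengths of
+// all set-bit runs, which equals the number of set bits.
+//
+//   int64_t set_bits = 0;
+//   VisitSetBitRunsVoid(validity, /*offset=*/0, length,
+//                       [&](int64_t pos, int64_t len) { set_bits += len; });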
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h
new file mode 100644
index 00000000000..b9e695dfcb0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h
@@ -0,0 +1,433 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala (incubating) as of 2016-01-29
+
+#pragma once
+
+#include <string.h>
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bpacking.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace BitUtil {
+
+/// Utility class to write bit/byte streams. Data can be written either bit
+/// packed or byte aligned (and a single stream can mix both).
+/// This class does not allocate memory.
+class BitWriter {
+ public:
+ /// buffer: buffer to write bits to. Buffer should be preallocated with
+ /// 'buffer_len' bytes.
+ BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) {
+ Clear();
+ }
+
+ void Clear() {
+ buffered_values_ = 0;
+ byte_offset_ = 0;
+ bit_offset_ = 0;
+ }
+
+  /// The number of bytes written so far, including the current byte (which
+  /// may be only partially filled). Includes buffered values.
+ int bytes_written() const {
+ return byte_offset_ + static_cast<int>(BitUtil::BytesForBits(bit_offset_));
+ }
+ uint8_t* buffer() const { return buffer_; }
+ int buffer_len() const { return max_bytes_; }
+
+ /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit
+ /// packed. Returns false if there was not enough space. num_bits must be <= 32.
+ bool PutValue(uint64_t v, int num_bits);
+
+ /// Writes v to the next aligned byte using num_bytes. If T is larger than
+ /// num_bytes, the extra high-order bytes will be ignored. Returns false if
+ /// there was not enough space.
+  /// v is written to buffer_ in little-endian format.
+ template <typename T>
+ bool PutAligned(T v, int num_bytes);
+
+ /// Write a Vlq encoded int to the buffer. Returns false if there was not enough
+ /// room. The value is written byte aligned.
+ /// For more details on vlq:
+ /// en.wikipedia.org/wiki/Variable-length_quantity
+ bool PutVlqInt(uint32_t v);
+
+  /// Writes an int using zigzag encoding.
+ bool PutZigZagVlqInt(int32_t v);
+
+ /// Get a pointer to the next aligned byte and advance the underlying buffer
+ /// by num_bytes.
+ /// Returns NULL if there was not enough space.
+ uint8_t* GetNextBytePtr(int num_bytes = 1);
+
+ /// Flushes all buffered values to the buffer. Call this when done writing to
+ /// the buffer. If 'align' is true, buffered_values_ is reset and any future
+ /// writes will be written to the next byte boundary.
+ void Flush(bool align = false);
+
+ private:
+ uint8_t* buffer_;
+ int max_bytes_;
+
+ /// Bit-packed values are initially written to this variable before being memcpy'd to
+ /// buffer_. This is faster than writing values byte by byte directly to buffer_.
+ uint64_t buffered_values_;
+
+ int byte_offset_; // Offset in buffer_
+ int bit_offset_; // Offset in buffered_values_
+};
+
+/// Utility class to read bit/byte streams. This class can read bits or bytes
+/// that are either byte aligned or not. It also has utilities to read multiple
+/// bytes in one read (e.g. encoded int).
+class BitReader {
+ public:
+ /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
+ BitReader(const uint8_t* buffer, int buffer_len)
+ : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) {
+ int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+ memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+ buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
+ }
+
+ BitReader()
+ : buffer_(NULL),
+ max_bytes_(0),
+ buffered_values_(0),
+ byte_offset_(0),
+ bit_offset_(0) {}
+
+ void Reset(const uint8_t* buffer, int buffer_len) {
+ buffer_ = buffer;
+ max_bytes_ = buffer_len;
+ byte_offset_ = 0;
+ bit_offset_ = 0;
+ int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+ memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+ buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
+ }
+
+ /// Gets the next value from the buffer. Returns true if 'v' could be read or false if
+ /// there are not enough bytes left. num_bits must be <= 32.
+ template <typename T>
+ bool GetValue(int num_bits, T* v);
+
+ /// Get a number of values from the buffer. Return the number of values actually read.
+ template <typename T>
+ int GetBatch(int num_bits, T* v, int batch_size);
+
+ /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
+ /// needs to be a little-endian native type and big enough to store
+ /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
+ /// be advanced to the start of the next byte before 'v' is read. Returns
+ /// false if there are not enough bytes left.
+  /// v is assumed to be stored in buffer_ in little-endian format.
+ template <typename T>
+ bool GetAligned(int num_bytes, T* v);
+
+ /// Reads a vlq encoded int from the stream. The encoded int must start at
+ /// the beginning of a byte. Return false if there were not enough bytes in
+ /// the buffer.
+ bool GetVlqInt(uint32_t* v);
+
+  /// Reads a zigzag encoded int into v.
+ bool GetZigZagVlqInt(int32_t* v);
+
+ /// Returns the number of bytes left in the stream, not including the current
+ /// byte (i.e., there may be an additional fraction of a byte).
+ int bytes_left() {
+ return max_bytes_ -
+ (byte_offset_ + static_cast<int>(BitUtil::BytesForBits(bit_offset_)));
+ }
+
+ /// Maximum byte length of a vlq encoded int
+ static constexpr int kMaxVlqByteLength = 5;
+
+ private:
+ const uint8_t* buffer_;
+ int max_bytes_;
+
+ /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
+ /// faster than reading values byte by byte directly from buffer_.
+ uint64_t buffered_values_;
+
+ int byte_offset_; // Offset in buffer_
+ int bit_offset_; // Offset in buffered_values_
+};
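+
+// A minimal round-trip sketch: bit-pack three 3-bit values, then read them
+// back (the buffer sizing here is illustrative).
+//
+//   uint8_t buffer[16];
+//   BitWriter writer(buffer, sizeof(buffer));
+//   for (uint64_t v : {5, 1, 7}) writer.PutValue(v, /*num_bits=*/3);
+//   writer.Flush();
+//
+//   BitReader reader(buffer, sizeof(buffer));
+//   uint32_t out;
+//   reader.GetValue(/*num_bits=*/3, &out);  // out == 5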
+
+inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
+ // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases)
+ DCHECK_LE(num_bits, 32);
+ DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
+
+ if (ARROW_PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8))
+ return false;
+
+ buffered_values_ |= v << bit_offset_;
+ bit_offset_ += num_bits;
+
+ if (ARROW_PREDICT_FALSE(bit_offset_ >= 64)) {
+ // Flush buffered_values_ and write out bits of v that did not fit
+ buffered_values_ = arrow::BitUtil::ToLittleEndian(buffered_values_);
+ memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
+ buffered_values_ = 0;
+ byte_offset_ += 8;
+ bit_offset_ -= 64;
+ buffered_values_ = v >> (num_bits - bit_offset_);
+ }
+ DCHECK_LT(bit_offset_, 64);
+ return true;
+}
+
+inline void BitWriter::Flush(bool align) {
+ int num_bytes = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
+ DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
+ auto buffered_values = arrow::BitUtil::ToLittleEndian(buffered_values_);
+ memcpy(buffer_ + byte_offset_, &buffered_values, num_bytes);
+
+ if (align) {
+ buffered_values_ = 0;
+ byte_offset_ += num_bytes;
+ bit_offset_ = 0;
+ }
+}
+
+inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) {
+ Flush(/* align */ true);
+ DCHECK_LE(byte_offset_, max_bytes_);
+ if (byte_offset_ + num_bytes > max_bytes_) return NULL;
+ uint8_t* ptr = buffer_ + byte_offset_;
+ byte_offset_ += num_bytes;
+ return ptr;
+}
+
+template <typename T>
+inline bool BitWriter::PutAligned(T val, int num_bytes) {
+ uint8_t* ptr = GetNextBytePtr(num_bytes);
+ if (ptr == NULL) return false;
+ val = arrow::BitUtil::ToLittleEndian(val);
+ memcpy(ptr, &val, num_bytes);
+ return true;
+}
+
+namespace detail {
+
+template <typename T>
+inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
+ int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+ *v = static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
+ *bit_offset);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+ *bit_offset += num_bits;
+ if (*bit_offset >= 64) {
+ *byte_offset += 8;
+ *bit_offset -= 64;
+
+ int bytes_remaining = max_bytes - *byte_offset;
+ if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
+ memcpy(buffered_values, buffer + *byte_offset, 8);
+ } else {
+ memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
+ }
+ *buffered_values = arrow::BitUtil::FromLittleEndian(*buffered_values);
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800 4805)
+#endif
+ // Read bits of v that crossed into new buffered_values_
+ *v = *v | static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset)
+ << (num_bits - *bit_offset));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+ DCHECK_LE(*bit_offset, 64);
+ }
+}
+
+} // namespace detail
+
+template <typename T>
+inline bool BitReader::GetValue(int num_bits, T* v) {
+ return GetBatch(num_bits, v, 1) == 1;
+}
+
+template <typename T>
+inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
+ DCHECK(buffer_ != NULL);
+ // TODO: revisit this limit if necessary
+ DCHECK_LE(num_bits, 32);
+ DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
+
+ int bit_offset = bit_offset_;
+ int byte_offset = byte_offset_;
+ uint64_t buffered_values = buffered_values_;
+ int max_bytes = max_bytes_;
+ const uint8_t* buffer = buffer_;
+
+  // Compute in 64 bits to avoid overflow of the multiplication for large batches.
+  uint64_t needed_bits = static_cast<uint64_t>(num_bits) * batch_size;
+ constexpr uint64_t kBitsPerByte = 8;
+ uint64_t remaining_bits = (max_bytes - byte_offset) * kBitsPerByte - bit_offset;
+ if (remaining_bits < needed_bits) {
+ batch_size = static_cast<int>(remaining_bits) / num_bits;
+ }
+
+ int i = 0;
+ if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
+ for (; i < batch_size && bit_offset != 0; ++i) {
+ detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
+ &buffered_values);
+ }
+ }
+
+ if (sizeof(T) == 4) {
+ int num_unpacked =
+ internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+ reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
+ i += num_unpacked;
+ byte_offset += num_unpacked * num_bits / 8;
+ } else {
+ const int buffer_size = 1024;
+ uint32_t unpack_buffer[buffer_size];
+ while (i < batch_size) {
+ int unpack_size = std::min(buffer_size, batch_size - i);
+ int num_unpacked =
+ internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+ unpack_buffer, unpack_size, num_bits);
+ if (num_unpacked == 0) {
+ break;
+ }
+ for (int k = 0; k < num_unpacked; ++k) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+ v[i + k] = static_cast<T>(unpack_buffer[k]);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+ }
+ i += num_unpacked;
+ byte_offset += num_unpacked * num_bits / 8;
+ }
+ }
+
+ int bytes_remaining = max_bytes - byte_offset;
+ if (bytes_remaining >= 8) {
+ memcpy(&buffered_values, buffer + byte_offset, 8);
+ } else {
+ memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
+ }
+ buffered_values = arrow::BitUtil::FromLittleEndian(buffered_values);
+
+ for (; i < batch_size; ++i) {
+ detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
+ &buffered_values);
+ }
+
+ bit_offset_ = bit_offset;
+ byte_offset_ = byte_offset;
+ buffered_values_ = buffered_values;
+
+ return batch_size;
+}
+
+template <typename T>
+inline bool BitReader::GetAligned(int num_bytes, T* v) {
+ if (ARROW_PREDICT_FALSE(num_bytes > static_cast<int>(sizeof(T)))) {
+ return false;
+ }
+
+ int bytes_read = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
+ if (ARROW_PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) {
+ return false;
+ }
+
+ // Advance byte_offset to next unread byte and read num_bytes
+ byte_offset_ += bytes_read;
+ memcpy(v, buffer_ + byte_offset_, num_bytes);
+ *v = arrow::BitUtil::FromLittleEndian(*v);
+ byte_offset_ += num_bytes;
+
+ // Reset buffered_values_
+ bit_offset_ = 0;
+ int bytes_remaining = max_bytes_ - byte_offset_;
+ if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
+ memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
+ } else {
+ memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
+ }
+ buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
+ return true;
+}
+
+inline bool BitWriter::PutVlqInt(uint32_t v) {
+ bool result = true;
+ while ((v & 0xFFFFFF80UL) != 0UL) {
+ result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
+ v >>= 7;
+ }
+ result &= PutAligned<uint8_t>(static_cast<uint8_t>(v & 0x7F), 1);
+ return result;
+}
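+
+// Worked example: PutVlqInt(300) writes two bytes. 300 = 0b100101100, so the
+// low 7 bits (0b0101100) are written first with the continuation bit set
+// (0xAC), then the remaining bits (0b10) are written with it clear (0x02).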
+
+inline bool BitReader::GetVlqInt(uint32_t* v) {
+ uint32_t tmp = 0;
+
+ for (int i = 0; i < kMaxVlqByteLength; i++) {
+ uint8_t byte = 0;
+ if (ARROW_PREDICT_FALSE(!GetAligned<uint8_t>(1, &byte))) {
+ return false;
+ }
+ tmp |= static_cast<uint32_t>(byte & 0x7F) << (7 * i);
+
+ if ((byte & 0x80) == 0) {
+ *v = tmp;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
+  auto u_v = ::arrow::util::SafeCopy<uint32_t>(v);
+  // Zigzag encode: the arithmetic shift replicates the sign bit so that
+  // small magnitudes of either sign map to small unsigned values.
+  u_v = (u_v << 1) ^ static_cast<uint32_t>(v >> 31);
+  return PutVlqInt(u_v);
+}
+
+inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
+  uint32_t u;
+  if (!GetVlqInt(&u)) return false;
+  // Zigzag decode: the sign lives in the low bit.
+  u = (u >> 1) ^ (~(u & 1) + 1);
+  *v = ::arrow::util::SafeCopy<int32_t>(u);
+  return true;
+}
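+
+// Worked example of the zigzag mapping: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
+// Small magnitudes of either sign thus need few VLQ bytes; e.g.
+// PutZigZagVlqInt(-1) writes the single byte 0x01.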
+
+} // namespace BitUtil
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc
new file mode 100644
index 00000000000..ee4bcde7713
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bit_util.h"
+
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace BitUtil {
+
+void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set) {
+ if (length == 0) {
+ return;
+ }
+
+ const int64_t i_begin = start_offset;
+ const int64_t i_end = start_offset + length;
+ const uint8_t fill_byte = static_cast<uint8_t>(-static_cast<uint8_t>(bits_are_set));
+
+ const int64_t bytes_begin = i_begin / 8;
+ const int64_t bytes_end = i_end / 8 + 1;
+
+ const uint8_t first_byte_mask = kPrecedingBitmask[i_begin % 8];
+ const uint8_t last_byte_mask = kTrailingBitmask[i_end % 8];
+
+ if (bytes_end == bytes_begin + 1) {
+ // set bits within a single byte
+ const uint8_t only_byte_mask =
+ i_end % 8 == 0 ? first_byte_mask
+ : static_cast<uint8_t>(first_byte_mask | last_byte_mask);
+ bits[bytes_begin] &= only_byte_mask;
+ bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~only_byte_mask);
+ return;
+ }
+
+ // set/clear trailing bits of first byte
+ bits[bytes_begin] &= first_byte_mask;
+ bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~first_byte_mask);
+
+ if (bytes_end - bytes_begin > 2) {
+ // set/clear whole bytes
+ std::memset(bits + bytes_begin + 1, fill_byte,
+ static_cast<size_t>(bytes_end - bytes_begin - 2));
+ }
+
+ if (i_end % 8 == 0) {
+ return;
+ }
+
+ // set/clear leading bits of last byte
+ bits[bytes_end - 1] &= last_byte_mask;
+ bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
+}
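+
+// Worked example: on two zeroed bytes, SetBitsTo(bits, /*start_offset=*/3,
+// /*length=*/7, true) sets bits 3 through 9, leaving bits[0] == 0xF8 and
+// bits[1] == 0x03.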
+
+template <bool value>
+void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
+ // offset length
+ // data |<------------->|
+ // |--------|...|--------|...|--------|
+ // |<--->| |<--->|
+ // pro epi
+ if (ARROW_PREDICT_FALSE(length == 0)) {
+ return;
+ }
+
+ constexpr uint8_t set_byte = value ? UINT8_MAX : 0;
+
+ auto prologue = static_cast<int32_t>(BitUtil::RoundUp(offset, 8) - offset);
+ DCHECK_LT(prologue, 8);
+
+ if (length < prologue) { // special case where a mask is required
+ // offset length
+ // data |<->|
+ // |--------|...|--------|...
+ // mask --> |111|
+ // |<---->|
+ // pro
+ uint8_t mask = BitUtil::kPrecedingBitmask[8 - prologue] ^
+ BitUtil::kPrecedingBitmask[8 - prologue + length];
+ data[offset / 8] = value ? data[offset / 8] | mask : data[offset / 8] & ~mask;
+ return;
+ }
+
+ // align to a byte boundary
+ data[offset / 8] = BitUtil::SpliceWord(8 - prologue, data[offset / 8], set_byte);
+ offset += prologue;
+ length -= prologue;
+
+ // set values per byte
+ DCHECK_EQ(offset % 8, 0);
+ std::memset(data + offset / 8, set_byte, length / 8);
+ offset += BitUtil::RoundDown(length, 8);
+ length -= BitUtil::RoundDown(length, 8);
+
+ // clean up
+ DCHECK_LT(length, 8);
+ data[offset / 8] =
+ BitUtil::SpliceWord(static_cast<int32_t>(length), set_byte, data[offset / 8]);
+}
+
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length) {
+ SetBitmapImpl<true>(data, offset, length);
+}
+
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length) {
+ SetBitmapImpl<false>(data, offset, length);
+}
+
+} // namespace BitUtil
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h
new file mode 100644
index 00000000000..c306ce7821b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h
@@ -0,0 +1,354 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_MSC_VER)
+#include <intrin.h> // IWYU pragma: keep
+#include <nmmintrin.h>
+#pragma intrinsic(_BitScanReverse)
+#pragma intrinsic(_BitScanForward)
+#define ARROW_POPCOUNT64 __popcnt64
+#define ARROW_POPCOUNT32 __popcnt
+#else
+#define ARROW_POPCOUNT64 __builtin_popcountll
+#define ARROW_POPCOUNT32 __builtin_popcount
+#endif
+
+#include <cstdint>
+#include <type_traits>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace detail {
+
+template <typename Integer>
+typename std::make_unsigned<Integer>::type as_unsigned(Integer x) {
+ return static_cast<typename std::make_unsigned<Integer>::type>(x);
+}
+
+} // namespace detail
+
+namespace BitUtil {
+
+// The number of set bits in a given unsigned byte value, pre-computed
+//
+// Generated with the following Python code
+// output = 'static constexpr uint8_t kBytePopcount[] = {{{0}}};'
+// popcounts = [str(bin(i).count('1')) for i in range(0, 256)]
+// print(output.format(', '.join(popcounts)))
+static constexpr uint8_t kBytePopcount[] = {
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3,
+ 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4,
+ 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4,
+ 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5,
+ 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2,
+ 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5,
+ 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4,
+ 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+
+static inline uint64_t PopCount(uint64_t bitmap) { return ARROW_POPCOUNT64(bitmap); }
+static inline uint32_t PopCount(uint32_t bitmap) { return ARROW_POPCOUNT32(bitmap); }
+
+//
+// Bit-related computations on integer values
+//
+
+// Returns the ceil of value/divisor
+constexpr int64_t CeilDiv(int64_t value, int64_t divisor) {
+ return (value == 0) ? 0 : 1 + (value - 1) / divisor;
+}
+
+// Return the number of bytes needed to fit the given number of bits
+constexpr int64_t BytesForBits(int64_t bits) {
+ // This formula avoids integer overflow on very large `bits`
+ return (bits >> 3) + ((bits & 7) != 0);
+}
+
+constexpr bool IsPowerOf2(int64_t value) {
+ return value > 0 && (value & (value - 1)) == 0;
+}
+
+constexpr bool IsPowerOf2(uint64_t value) {
+ return value > 0 && (value & (value - 1)) == 0;
+}
+
+// Returns the smallest power of two that contains n. If n is already a
+// power of two, it is returned as is.
+static inline int64_t NextPower2(int64_t n) {
+ // Taken from
+ // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+ n--;
+ n |= n >> 1;
+ n |= n >> 2;
+ n |= n >> 4;
+ n |= n >> 8;
+ n |= n >> 16;
+ n |= n >> 32;
+ n++;
+ return n;
+}
+
+constexpr bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }
+
+constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
+
+// Returns a mask for the bit_index lower order bits.
+// Only valid for bit_index in the range [0, 64).
+constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
+ return (static_cast<uint64_t>(1) << bit_index) - 1;
+}
+
+// Returns 'value' rounded up to the nearest multiple of 'factor'
+constexpr int64_t RoundUp(int64_t value, int64_t factor) {
+ return CeilDiv(value, factor) * factor;
+}
+
+// Returns 'value' rounded down to the nearest multiple of 'factor'
+constexpr int64_t RoundDown(int64_t value, int64_t factor) {
+ return (value / factor) * factor;
+}
+
+// Returns 'value' rounded up to the nearest multiple of 'factor' when factor
+// is a power of two.
+// The result is undefined on overflow, i.e. if `value > 2**64 - factor`,
+// since we cannot return the correct result which would be 2**64.
+constexpr int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
+ // DCHECK(value >= 0);
+ // DCHECK(IsPowerOf2(factor));
+ return (value + (factor - 1)) & ~(factor - 1);
+}
+
+constexpr uint64_t RoundUpToPowerOf2(uint64_t value, uint64_t factor) {
+ // DCHECK(IsPowerOf2(factor));
+ return (value + (factor - 1)) & ~(factor - 1);
+}
+
+constexpr int64_t RoundUpToMultipleOf8(int64_t num) { return RoundUpToPowerOf2(num, 8); }
+
+constexpr int64_t RoundUpToMultipleOf64(int64_t num) {
+ return RoundUpToPowerOf2(num, 64);
+}
+
+// Returns the number of bytes covering a sliced bitmap, i.e. the length
+// rounded so that full bytes are covered on both extremities.
+//
+// The following example represents a slice (offset=10, length=9)
+//
+// 0 8 16 24
+// |-------|-------|-------|
+// [ ] (slice)
+// [ ] (same slice aligned to bytes bounds, length=16)
+//
+// The number of covering bytes is the length (in bytes) of this new aligned slice.
+constexpr int64_t CoveringBytes(int64_t offset, int64_t length) {
+ return (BitUtil::RoundUp(length + offset, 8) - BitUtil::RoundDown(offset, 8)) / 8;
+}
+
+// Returns the 'num_bits' least-significant bits of 'v'.
+static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
+ if (ARROW_PREDICT_FALSE(num_bits == 0)) return 0;
+ if (ARROW_PREDICT_FALSE(num_bits >= 64)) return v;
+ int n = 64 - num_bits;
+ return (v << n) >> n;
+}
+
+/// \brief Count the number of leading zeros in an unsigned integer.
+static inline int CountLeadingZeros(uint32_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+ if (value == 0) return 32;
+ return static_cast<int>(__builtin_clz(value));
+#elif defined(_MSC_VER)
+ unsigned long index; // NOLINT
+ if (_BitScanReverse(&index, static_cast<unsigned long>(value))) { // NOLINT
+ return 31 - static_cast<int>(index);
+ } else {
+ return 32;
+ }
+#else
+ int bitpos = 0;
+ while (value != 0) {
+ value >>= 1;
+ ++bitpos;
+ }
+ return 32 - bitpos;
+#endif
+}
+
+static inline int CountLeadingZeros(uint64_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+ if (value == 0) return 64;
+ return static_cast<int>(__builtin_clzll(value));
+#elif defined(_MSC_VER)
+ unsigned long index; // NOLINT
+ if (_BitScanReverse64(&index, value)) { // NOLINT
+ return 63 - static_cast<int>(index);
+ } else {
+ return 64;
+ }
+#else
+ int bitpos = 0;
+ while (value != 0) {
+ value >>= 1;
+ ++bitpos;
+ }
+ return 64 - bitpos;
+#endif
+}
+
+static inline int CountTrailingZeros(uint32_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+ if (value == 0) return 32;
+ return static_cast<int>(__builtin_ctzl(value));
+#elif defined(_MSC_VER)
+ unsigned long index; // NOLINT
+ if (_BitScanForward(&index, value)) {
+ return static_cast<int>(index);
+ } else {
+ return 32;
+ }
+#else
+ int bitpos = 0;
+ if (value) {
+    while ((value & 1) == 0) {
+ value >>= 1;
+ ++bitpos;
+ }
+ } else {
+ bitpos = 32;
+ }
+ return bitpos;
+#endif
+}
+
+static inline int CountTrailingZeros(uint64_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+ if (value == 0) return 64;
+ return static_cast<int>(__builtin_ctzll(value));
+#elif defined(_MSC_VER)
+ unsigned long index; // NOLINT
+ if (_BitScanForward64(&index, value)) {
+ return static_cast<int>(index);
+ } else {
+ return 64;
+ }
+#else
+ int bitpos = 0;
+ if (value) {
+    while ((value & 1) == 0) {
+ value >>= 1;
+ ++bitpos;
+ }
+ } else {
+ bitpos = 64;
+ }
+ return bitpos;
+#endif
+}
+
+// Returns the minimum number of bits needed to represent an unsigned value
+static inline int NumRequiredBits(uint64_t x) { return 64 - CountLeadingZeros(x); }
+
+// Returns ceil(log2(x)).
+static inline int Log2(uint64_t x) {
+ // DCHECK_GT(x, 0);
+ return NumRequiredBits(x - 1);
+}
+
+//
+// Utilities for reading and writing individual bits by their index
+// in a memory area.
+//
+
+// Bitmask selecting the k-th bit in a byte
+static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128};
+
+// The bitwise complement version of kBitmask
+static constexpr uint8_t kFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127};
+
+// Bitmask selecting the (k - 1) preceding bits in a byte
+static constexpr uint8_t kPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127};
+static constexpr uint8_t kPrecedingWrappingBitmask[] = {255, 1, 3, 7, 15, 31, 63, 127};
+
+// The bitwise complement version of kPrecedingBitmask
+static constexpr uint8_t kTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128};
+
+static constexpr bool GetBit(const uint8_t* bits, uint64_t i) {
+ return (bits[i >> 3] >> (i & 0x07)) & 1;
+}
+
+// Gets the i-th bit from a byte. Should only be used with i <= 7.
+static constexpr bool GetBitFromByte(uint8_t byte, uint8_t i) {
+ return byte & kBitmask[i];
+}
+
+static inline void ClearBit(uint8_t* bits, int64_t i) {
+ bits[i / 8] &= kFlippedBitmask[i % 8];
+}
+
+static inline void SetBit(uint8_t* bits, int64_t i) { bits[i / 8] |= kBitmask[i % 8]; }
+
+static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
+ // https://graphics.stanford.edu/~seander/bithacks.html
+ // "Conditionally set or clear bits without branching"
+ // NOTE: this seems to confuse Valgrind as it reads from potentially
+ // uninitialized memory
+ bits[i / 8] ^= static_cast<uint8_t>(-static_cast<uint8_t>(bit_is_set) ^ bits[i / 8]) &
+ kBitmask[i % 8];
+}
+
+/// \brief set or clear a range of bits quickly
+ARROW_EXPORT
+void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);
+
+/// \brief Sets all bits in the bitmap to true
+ARROW_EXPORT
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// \brief Clears all bits in the bitmap (set to false)
+ARROW_EXPORT
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// Returns a mask with the lower i bits set to 1. If i >= sizeof(Word)*8,
+/// all-ones is returned.
+/// ex: PrecedingWordBitmask<uint8_t>(2) == 0x03
+/// ref: https://stackoverflow.com/a/59523400
+template <typename Word>
+constexpr Word PrecedingWordBitmask(unsigned int const i) {
+ return (static_cast<Word>(i < sizeof(Word) * 8) << (i & (sizeof(Word) * 8 - 1))) - 1;
+}
+static_assert(PrecedingWordBitmask<uint8_t>(0) == 0x00, "");
+static_assert(PrecedingWordBitmask<uint8_t>(4) == 0x0f, "");
+static_assert(PrecedingWordBitmask<uint8_t>(8) == 0xff, "");
+static_assert(PrecedingWordBitmask<uint16_t>(8) == 0x00ff, "");
+
+/// \brief Create a word with the low `n` bits from `low` and the high
+/// `sizeof(Word)*8 - n` bits from `high`.
+/// Pseudocode:
+///   Word ret;
+///   for (i = 0; i < sizeof(Word)*8; i++){
+///     ret[i] = i < n ? low[i] : high[i];
+///   }
+template <typename Word>
+constexpr Word SpliceWord(int n, Word low, Word high) {
+ return (high & ~PrecedingWordBitmask<Word>(n)) | (low & PrecedingWordBitmask<Word>(n));
+}
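+
+// Illustrative checks, mirroring the PrecedingWordBitmask asserts above:
+static_assert(SpliceWord<uint8_t>(0, 0x12, 0x34) == 0x34, "");
+static_assert(SpliceWord<uint8_t>(4, 0x0f, 0xf0) == 0xff, "");
+static_assert(SpliceWord<uint8_t>(8, 0x12, 0x34) == 0x12, "");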
+
+} // namespace BitUtil
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc
new file mode 100644
index 00000000000..33d1dee1957
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bitmap.h"
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "arrow/array/array_primitive.h"
+#include "arrow/buffer.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace internal {
+
+std::string Bitmap::ToString() const {
+ std::string out(length_ + ((length_ - 1) / 8), ' ');
+ for (int64_t i = 0; i < length_; ++i) {
+ out[i + (i / 8)] = GetBit(i) ? '1' : '0';
+ }
+ return out;
+}
+
+std::shared_ptr<BooleanArray> Bitmap::ToArray() const {
+ return std::make_shared<BooleanArray>(length_, buffer_, nullptr, 0, offset_);
+}
+
+std::string Bitmap::Diff(const Bitmap& other) const {
+ return ToArray()->Diff(*other.ToArray());
+}
+
+void Bitmap::CopyFrom(const Bitmap& other) {
+ ::arrow::internal::CopyBitmap(other.buffer_->data(), other.offset_, other.length_,
+ buffer_->mutable_data(), offset_);
+}
+
+void Bitmap::CopyFromInverted(const Bitmap& other) {
+ ::arrow::internal::InvertBitmap(other.buffer_->data(), other.offset_, other.length_,
+ buffer_->mutable_data(), offset_);
+}
+
+bool Bitmap::Equals(const Bitmap& other) const {
+ if (length_ != other.length_) {
+ return false;
+ }
+ return BitmapEquals(buffer_->data(), offset_, other.buffer_->data(), other.offset(),
+ length_);
+}
+
+int64_t Bitmap::BitLength(const Bitmap* bitmaps, size_t N) {
+ for (size_t i = 1; i < N; ++i) {
+ DCHECK_EQ(bitmaps[i].length(), bitmaps[0].length());
+ }
+ return bitmaps[0].length();
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h
new file mode 100644
index 00000000000..141f863c0b8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h
@@ -0,0 +1,461 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <bitset>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/string_builder.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class BooleanArray;
+
+namespace internal {
+
+class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
+ public util::EqualityComparable<Bitmap> {
+ public:
+ template <typename Word>
+ using View = util::basic_string_view<Word>;
+
+ Bitmap() = default;
+
+ Bitmap(std::shared_ptr<Buffer> buffer, int64_t offset, int64_t length)
+ : buffer_(std::move(buffer)), offset_(offset), length_(length) {}
+
+ Bitmap(const void* data, int64_t offset, int64_t length)
+ : buffer_(std::make_shared<Buffer>(static_cast<const uint8_t*>(data),
+ BitUtil::BytesForBits(offset + length))),
+ offset_(offset),
+ length_(length) {}
+
+ Bitmap(void* data, int64_t offset, int64_t length)
+ : buffer_(std::make_shared<MutableBuffer>(static_cast<uint8_t*>(data),
+ BitUtil::BytesForBits(offset + length))),
+ offset_(offset),
+ length_(length) {}
+
+ Bitmap Slice(int64_t offset) const {
+ return Bitmap(buffer_, offset_ + offset, length_ - offset);
+ }
+
+ Bitmap Slice(int64_t offset, int64_t length) const {
+ return Bitmap(buffer_, offset_ + offset, length);
+ }
+
+ std::string ToString() const;
+
+ bool Equals(const Bitmap& other) const;
+
+ std::string Diff(const Bitmap& other) const;
+
+ bool GetBit(int64_t i) const { return BitUtil::GetBit(buffer_->data(), i + offset_); }
+
+ bool operator[](int64_t i) const { return GetBit(i); }
+
+ void SetBitTo(int64_t i, bool v) const {
+ BitUtil::SetBitTo(buffer_->mutable_data(), i + offset_, v);
+ }
+
+ void SetBitsTo(bool v) {
+ BitUtil::SetBitsTo(buffer_->mutable_data(), offset_, length_, v);
+ }
+
+ void CopyFrom(const Bitmap& other);
+ void CopyFromInverted(const Bitmap& other);
+
+ /// \brief Visit bits from each bitmap as bitset<N>
+ ///
+ /// All bitmaps must have identical length.
+ template <size_t N, typename Visitor>
+ static void VisitBits(const Bitmap (&bitmaps)[N], Visitor&& visitor) {
+ int64_t bit_length = BitLength(bitmaps, N);
+ std::bitset<N> bits;
+ for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
+ for (size_t i = 0; i < N; ++i) {
+ bits[i] = bitmaps[i].GetBit(bit_i);
+ }
+ visitor(bits);
+ }
+ }
+
+ /// \brief Visit bits from each bitmap as bitset<N>
+ ///
+ /// All bitmaps must have identical length.
+ template <size_t N, typename Visitor>
+ static void VisitBits(const std::array<Bitmap, N>& bitmaps, Visitor&& visitor) {
+ int64_t bit_length = BitLength(bitmaps);
+ std::bitset<N> bits;
+ for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
+ for (size_t i = 0; i < N; ++i) {
+ bits[i] = bitmaps[i].GetBit(bit_i);
+ }
+ visitor(bits);
+ }
+ }
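+
+  // A usage sketch (hypothetical bitmaps of equal length):
+  //
+  //   Bitmap maps[2] = {left_validity, right_validity};
+  //   Bitmap::VisitBits(maps, [](const std::bitset<2>& bits) {
+  //     // bits[0] and bits[1] hold the current bit of each bitmap.
+  //   });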
+
+ /// \brief Visit words of bits from each bitmap as array<Word, N>
+ ///
+ /// All bitmaps must have identical length. The first bit in a visited bitmap
+ /// may be offset within the first visited word, but words will otherwise contain
+ /// densely packed bits loaded from the bitmap. That offset within the first word is
+ /// returned.
+ ///
+ /// TODO(bkietz) allow for early termination
+ // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+ // It also has a large prolog / epilog overhead and should be used
+ // carefully in other cases.
+ // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
+ // and BitmapUInt64Reader.
+ template <size_t N, typename Visitor,
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+ static int64_t VisitWords(const Bitmap (&bitmaps_arg)[N], Visitor&& visitor) {
+ constexpr int64_t kBitWidth = sizeof(Word) * 8;
+
+ // local, mutable variables which will be sliced/decremented to represent consumption:
+ Bitmap bitmaps[N];
+ int64_t offsets[N];
+ int64_t bit_length = BitLength(bitmaps_arg, N);
+ View<Word> words[N];
+ for (size_t i = 0; i < N; ++i) {
+ bitmaps[i] = bitmaps_arg[i];
+ offsets[i] = bitmaps[i].template word_offset<Word>();
+ assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
+ words[i] = bitmaps[i].template words<Word>();
+ }
+
+ auto consume = [&](int64_t consumed_bits) {
+ for (size_t i = 0; i < N; ++i) {
+ bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits);
+ offsets[i] = bitmaps[i].template word_offset<Word>();
+ assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
+ words[i] = bitmaps[i].template words<Word>();
+ }
+ bit_length -= consumed_bits;
+ };
+
+ std::array<Word, N> visited_words;
+ visited_words.fill(0);
+
+ if (bit_length <= kBitWidth * 2) {
+ // bitmaps fit into one or two words so don't bother with optimization
+ while (bit_length > 0) {
+ auto leading_bits = std::min(bit_length, kBitWidth);
+ SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
+ visitor(visited_words);
+ consume(leading_bits);
+ }
+ return 0;
+ }
+
+ int64_t max_offset = *std::max_element(offsets, offsets + N);
+ int64_t min_offset = *std::min_element(offsets, offsets + N);
+ if (max_offset > 0) {
+ // consume leading bits
+ auto leading_bits = kBitWidth - min_offset;
+ SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
+ visitor(visited_words);
+ consume(leading_bits);
+ }
+ assert(*std::min_element(offsets, offsets + N) == 0);
+
+ int64_t whole_word_count = bit_length / kBitWidth;
+ assert(whole_word_count >= 1);
+
+ if (min_offset == max_offset) {
+ // all offsets were identical, all leading bits have been consumed
+ assert(
+ std::all_of(offsets, offsets + N, [](int64_t offset) { return offset == 0; }));
+
+ for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
+ for (size_t i = 0; i < N; ++i) {
+ visited_words[i] = words[i][word_i];
+ }
+ visitor(visited_words);
+ }
+ consume(whole_word_count * kBitWidth);
+ } else {
+ // leading bits from potentially incomplete words have been consumed
+
+      // word_i such that words[i][word_i] and words[i][word_i + 1] lie entirely
+      // within the bitmap for all i
+ for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
+ for (size_t i = 0; i < N; ++i) {
+ if (offsets[i] == 0) {
+ visited_words[i] = words[i][word_i];
+ } else {
+ auto words0 = BitUtil::ToLittleEndian(words[i][word_i]);
+ auto words1 = BitUtil::ToLittleEndian(words[i][word_i + 1]);
+ visited_words[i] = BitUtil::FromLittleEndian(
+ (words0 >> offsets[i]) | (words1 << (kBitWidth - offsets[i])));
+ }
+ }
+ visitor(visited_words);
+ }
+ consume((whole_word_count - 1) * kBitWidth);
+
+ SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);
+
+ visitor(visited_words);
+ consume(kBitWidth);
+ }
+
+ // load remaining bits
+ if (bit_length > 0) {
+ SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
+ visitor(visited_words);
+ }
+
+ return min_offset;
+ }
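+
+  // A usage sketch (editor's addition): popcounting the intersection of two
+  // bitmaps word by word. Leading and trailing slack in the visited words is
+  // zero-padded by SafeLoadWords, so the popcount only sees valid bits. a_buf,
+  // b_buf and n are assumed valid.
+  //
+  //   Bitmap bms[2] = {Bitmap(a_buf, 0, n), Bitmap(b_buf, 0, n)};
+  //   int64_t both = 0;
+  //   Bitmap::VisitWords(bms, [&](const std::array<uint64_t, 2>& words) {
+  //     both += BitUtil::PopCount(words[0] & words[1]);
+  //   });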
+
+ template <size_t N, size_t M, typename ReaderT, typename WriterT, typename Visitor,
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+ static void RunVisitWordsAndWriteLoop(int64_t bit_length,
+ std::array<ReaderT, N>& readers,
+ std::array<WriterT, M>& writers,
+ Visitor&& visitor) {
+ constexpr int64_t kBitWidth = sizeof(Word) * 8;
+
+ std::array<Word, N> visited_words;
+ std::array<Word, M> output_words;
+
+    // every reader will have the same number of words, since all bitmaps have
+    // the same length
+    // TODO($JIRA) this will be inefficient in some cases. When offsets cross a
+    // Word boundary, every Word has to be assembled from 2 adjoining Words
+ auto n_words = readers[0].words();
+ bit_length -= n_words * kBitWidth;
+ while (n_words--) {
+ // first collect all words to visited_words array
+ for (size_t i = 0; i < N; i++) {
+ visited_words[i] = readers[i].NextWord();
+ }
+ visitor(visited_words, &output_words);
+ for (size_t i = 0; i < M; i++) {
+ writers[i].PutNextWord(output_words[i]);
+ }
+ }
+
+    // every reader will have the same number of trailing bytes, for the same
+    // reason. The trailing portion can span more than one word (ref:
+    // BitmapWordReader constructor); the remaining full/partial words still
+    // need to be written.
+
+ if (bit_length) {
+ // convert the word visitor lambda to a byte_visitor
+ auto byte_visitor = [&](const std::array<uint8_t, N>& in,
+ std::array<uint8_t, M>* out) {
+ std::array<Word, N> in_words;
+ std::array<Word, M> out_words;
+ std::copy(in.begin(), in.end(), in_words.begin());
+ visitor(in_words, &out_words);
+ for (size_t i = 0; i < M; i++) {
+ out->at(i) = static_cast<uint8_t>(out_words[i]);
+ }
+ };
+
+ std::array<uint8_t, N> visited_bytes;
+ std::array<uint8_t, M> output_bytes;
+ int n_bytes = readers[0].trailing_bytes();
+ while (n_bytes--) {
+ visited_bytes.fill(0);
+ output_bytes.fill(0);
+ int valid_bits;
+ for (size_t i = 0; i < N; i++) {
+ visited_bytes[i] = readers[i].NextTrailingByte(valid_bits);
+ }
+ byte_visitor(visited_bytes, &output_bytes);
+ for (size_t i = 0; i < M; i++) {
+ writers[i].PutNextTrailingByte(output_bytes[i], valid_bits);
+ }
+ }
+ }
+ }
+
+  /// \brief Visit words of bits from each input bitmap as array<Word, N> and collect
+  /// outputs into an array<Word, M>, to be written into the output bitmaps accordingly.
+  ///
+  /// All bitmaps must have identical length. The first bit in a visited bitmap
+  /// may be offset within the first visited word, but words will otherwise contain
+  /// densely packed bits loaded from the bitmap.
+  /// Visitor is expected to have the following signature:
+  /// [](const std::array<Word, N>& in_words, std::array<Word, M>* out_words){...}
+ ///
+ // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+ // It also has a large prolog / epilog overhead and should be used
+ // carefully in other cases.
+ // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
+ // and BitmapUInt64Reader.
+ template <size_t N, size_t M, typename Visitor,
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+ static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
+ std::array<Bitmap, M>* out_bitmaps_arg,
+ Visitor&& visitor) {
+ int64_t bit_length = BitLength(bitmaps_arg);
+ assert(bit_length == BitLength(*out_bitmaps_arg));
+
+    // if all input and output bitmaps are byte-aligned, use the specialized template
+ if (std::all_of(bitmaps_arg.begin(), bitmaps_arg.end(),
+ [](const Bitmap& b) { return b.offset_ % 8 == 0; }) &&
+ std::all_of(out_bitmaps_arg->begin(), out_bitmaps_arg->end(),
+ [](const Bitmap& b) { return b.offset_ % 8 == 0; })) {
+ std::array<BitmapWordReader<Word, /*may_have_byte_offset=*/false>, N> readers;
+ for (size_t i = 0; i < N; ++i) {
+ const Bitmap& in_bitmap = bitmaps_arg[i];
+ readers[i] = BitmapWordReader<Word, /*may_have_byte_offset=*/false>(
+ in_bitmap.buffer_->data(), in_bitmap.offset_, in_bitmap.length_);
+ }
+
+ std::array<BitmapWordWriter<Word, /*may_have_byte_offset=*/false>, M> writers;
+ for (size_t i = 0; i < M; ++i) {
+ const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+ writers[i] = BitmapWordWriter<Word, /*may_have_byte_offset=*/false>(
+ out_bitmap.buffer_->mutable_data(), out_bitmap.offset_, out_bitmap.length_);
+ }
+
+ RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
+ } else {
+ std::array<BitmapWordReader<Word>, N> readers;
+ for (size_t i = 0; i < N; ++i) {
+ const Bitmap& in_bitmap = bitmaps_arg[i];
+ readers[i] = BitmapWordReader<Word>(in_bitmap.buffer_->data(), in_bitmap.offset_,
+ in_bitmap.length_);
+ }
+
+ std::array<BitmapWordWriter<Word>, M> writers;
+ for (size_t i = 0; i < M; ++i) {
+ const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+ writers[i] = BitmapWordWriter<Word>(out_bitmap.buffer_->mutable_data(),
+ out_bitmap.offset_, out_bitmap.length_);
+ }
+
+ RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
+ }
+ }
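+
+  // A usage sketch (editor's addition): computing out = a & b through the
+  // visitor API. The output Bitmap is assumed to wrap a mutable buffer of n bits.
+  //
+  //   std::array<Bitmap, 2> in = {Bitmap(a_buf, 0, n), Bitmap(b_buf, 0, n)};
+  //   std::array<Bitmap, 1> out = {Bitmap(out_buf, 0, n)};
+  //   Bitmap::VisitWordsAndWrite(
+  //       in, &out,
+  //       [](const std::array<uint64_t, 2>& in_words,
+  //          std::array<uint64_t, 1>* out_words) {
+  //         out_words->at(0) = in_words[0] & in_words[1];
+  //       });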
+
+ const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
+
+ /// offset of first bit relative to buffer().data()
+ int64_t offset() const { return offset_; }
+
+ /// number of bits in this Bitmap
+ int64_t length() const { return length_; }
+
+ /// string_view of all bytes which contain any bit in this Bitmap
+ util::bytes_view bytes() const {
+ auto byte_offset = offset_ / 8;
+ auto byte_count = BitUtil::CeilDiv(offset_ + length_, 8) - byte_offset;
+ return util::bytes_view(buffer_->data() + byte_offset, byte_count);
+ }
+
+ private:
+ /// string_view of all Words which contain any bit in this Bitmap
+ ///
+ /// For example, given Word=uint16_t and a bitmap spanning bits [20, 36)
+ /// words() would span bits [16, 48).
+ ///
+ /// 0 16 32 48 64
+ /// |-------|-------|------|------| (buffer)
+ /// [ ] (bitmap)
+ /// |-------|------| (returned words)
+ ///
+ /// \warning The words may contain bytes which lie outside the buffer or are
+ /// uninitialized.
+ template <typename Word>
+ View<Word> words() const {
+ auto bytes_addr = reinterpret_cast<intptr_t>(bytes().data());
+ auto words_addr = bytes_addr - bytes_addr % sizeof(Word);
+ auto word_byte_count =
+ BitUtil::RoundUpToPowerOf2(static_cast<int64_t>(bytes_addr + bytes().size()),
+ static_cast<int64_t>(sizeof(Word))) -
+ words_addr;
+ return View<Word>(reinterpret_cast<const Word*>(words_addr),
+ word_byte_count / sizeof(Word));
+ }
+
+ /// offset of first bit relative to words<Word>().data()
+ template <typename Word>
+ int64_t word_offset() const {
+ return offset_ + 8 * (reinterpret_cast<intptr_t>(buffer_->data()) -
+ reinterpret_cast<intptr_t>(words<Word>().data()));
+ }
+
+ /// load words from bitmaps bitwise
+ template <size_t N, typename Word>
+ static void SafeLoadWords(const Bitmap (&bitmaps)[N], int64_t offset,
+ int64_t out_length, bool set_trailing_bits,
+ std::array<Word, N>* out) {
+ out->fill(0);
+
+ int64_t out_offset = set_trailing_bits ? sizeof(Word) * 8 - out_length : 0;
+
+ Bitmap slices[N], out_bitmaps[N];
+ for (size_t i = 0; i < N; ++i) {
+ slices[i] = bitmaps[i].Slice(offset, out_length);
+ out_bitmaps[i] = Bitmap(&out->at(i), out_offset, out_length);
+ }
+
+ int64_t bit_i = 0;
+ Bitmap::VisitBits(slices, [&](std::bitset<N> bits) {
+ for (size_t i = 0; i < N; ++i) {
+ out_bitmaps[i].SetBitTo(bit_i, bits[i]);
+ }
+ ++bit_i;
+ });
+ }
+
+ std::shared_ptr<BooleanArray> ToArray() const;
+
+ /// assert bitmaps have identical length and return that length
+ static int64_t BitLength(const Bitmap* bitmaps, size_t N);
+
+ template <size_t N>
+ static int64_t BitLength(const std::array<Bitmap, N>& bitmaps) {
+ for (size_t i = 1; i < N; ++i) {
+ assert(bitmaps[i].length() == bitmaps[0].length());
+ }
+ return bitmaps[0].length();
+ }
+
+ std::shared_ptr<Buffer> buffer_;
+ int64_t offset_ = 0, length_ = 0;
+};
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_builders.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_builders.cc
new file mode 100644
index 00000000000..9a91b7ac675
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_builders.cc
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bitmap_builders.h"
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+namespace internal {
+
+namespace {
+
+void FillBitsFromBytes(const std::vector<uint8_t>& bytes, uint8_t* bits) {
+ for (size_t i = 0; i < bytes.size(); ++i) {
+ if (bytes[i] > 0) {
+ BitUtil::SetBit(bits, i);
+ }
+ }
+}
+
+} // namespace
+
+Result<std::shared_ptr<Buffer>> BytesToBits(const std::vector<uint8_t>& bytes,
+ MemoryPool* pool) {
+  const int64_t num_bytes = BitUtil::BytesForBits(bytes.size());
+
+  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(num_bytes, pool));
+ uint8_t* out_buf = buffer->mutable_data();
+ memset(out_buf, 0, static_cast<size_t>(buffer->capacity()));
+ FillBitsFromBytes(bytes, out_buf);
+ return std::move(buffer);
+}
+
+Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length,
+ int64_t straggler_pos, bool value) {
+ if (straggler_pos < 0 || straggler_pos >= length) {
+ return Status::Invalid("invalid straggler_pos ", straggler_pos);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(BitUtil::BytesForBits(length), pool));
+
+ auto bitmap_data = buffer->mutable_data();
+ BitUtil::SetBitsTo(bitmap_data, 0, length, value);
+ BitUtil::SetBitTo(bitmap_data, straggler_pos, !value);
+ return std::move(buffer);
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_builders.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_builders.h
new file mode 100644
index 00000000000..5bd2ad44140
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_builders.h
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief Generate a Bitmap with all positions set to `value` except for the
+/// one at `straggler_pos`, which is set to `!value`.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length,
+ int64_t straggler_pos, bool value = true);
+
+/// \brief Convert vector of bytes to bitmap buffer
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BytesToBits(const std::vector<uint8_t>&,
+ MemoryPool* pool = default_memory_pool());
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h
new file mode 100644
index 00000000000..129fa913231
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+// A std::generate()-like function to write sequential bits into a bitmap area.
+// Bits preceding the bitmap area are preserved, bits following the bitmap
+// area may be clobbered.
+
+template <class Generator>
+void GenerateBits(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) {
+ if (length == 0) {
+ return;
+ }
+ uint8_t* cur = bitmap + start_offset / 8;
+ uint8_t bit_mask = BitUtil::kBitmask[start_offset % 8];
+ uint8_t current_byte = *cur & BitUtil::kPrecedingBitmask[start_offset % 8];
+
+ for (int64_t index = 0; index < length; ++index) {
+ const bool bit = g();
+ current_byte = bit ? (current_byte | bit_mask) : current_byte;
+ bit_mask = static_cast<uint8_t>(bit_mask << 1);
+ if (bit_mask == 0) {
+ bit_mask = 1;
+ *cur++ = current_byte;
+ current_byte = 0;
+ }
+ }
+ if (bit_mask != 1) {
+ *cur++ = current_byte;
+ }
+}
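+
+// A usage sketch (editor's addition): fill 10 bits, starting at bit offset 3
+// of `bitmap` (assumed to point at writable bitmap storage), with an
+// alternating pattern.
+//
+//   bool next = false;
+//   arrow::internal::GenerateBits(bitmap, /*start_offset=*/3, /*length=*/10,
+//                                 [&] { return next = !next; });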
+
+// Like GenerateBits(), but unrolls its main loop for higher performance.
+
+template <class Generator>
+void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
+ Generator&& g) {
+ static_assert(std::is_same<typename std::result_of<Generator && ()>::type, bool>::value,
+ "Functor passed to GenerateBitsUnrolled must return bool");
+
+ if (length == 0) {
+ return;
+ }
+ uint8_t current_byte;
+ uint8_t* cur = bitmap + start_offset / 8;
+ const uint64_t start_bit_offset = start_offset % 8;
+ uint8_t bit_mask = BitUtil::kBitmask[start_bit_offset];
+ int64_t remaining = length;
+
+ if (bit_mask != 0x01) {
+ current_byte = *cur & BitUtil::kPrecedingBitmask[start_bit_offset];
+ while (bit_mask != 0 && remaining > 0) {
+ current_byte |= g() * bit_mask;
+ bit_mask = static_cast<uint8_t>(bit_mask << 1);
+ --remaining;
+ }
+ *cur++ = current_byte;
+ }
+
+ int64_t remaining_bytes = remaining / 8;
+ uint8_t out_results[8];
+ while (remaining_bytes-- > 0) {
+ for (int i = 0; i < 8; ++i) {
+ out_results[i] = g();
+ }
+ *cur++ = (out_results[0] | out_results[1] << 1 | out_results[2] << 2 |
+ out_results[3] << 3 | out_results[4] << 4 | out_results[5] << 5 |
+ out_results[6] << 6 | out_results[7] << 7);
+ }
+
+ int64_t remaining_bits = remaining % 8;
+ if (remaining_bits) {
+ current_byte = 0;
+ bit_mask = 0x01;
+ while (remaining_bits-- > 0) {
+ current_byte |= g() * bit_mask;
+ bit_mask = static_cast<uint8_t>(bit_mask << 1);
+ }
+ *cur++ = current_byte;
+ }
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc
new file mode 100644
index 00000000000..63c8b008f4a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc
@@ -0,0 +1,387 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bitmap_ops.h"
+
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <memory>
+
+#include "arrow/buffer.h"
+#include "arrow/result.h"
+#include "arrow/util/align_util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace internal {
+
+int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) {
+ constexpr int64_t pop_len = sizeof(uint64_t) * 8;
+ DCHECK_GE(bit_offset, 0);
+ int64_t count = 0;
+
+ const auto p = BitmapWordAlign<pop_len / 8>(data, bit_offset, length);
+ for (int64_t i = bit_offset; i < bit_offset + p.leading_bits; ++i) {
+ if (BitUtil::GetBit(data, i)) {
+ ++count;
+ }
+ }
+
+ if (p.aligned_words > 0) {
+ // popcount as much as possible with the widest possible count
+ const uint64_t* u64_data = reinterpret_cast<const uint64_t*>(p.aligned_start);
+ DCHECK_EQ(reinterpret_cast<size_t>(u64_data) & 7, 0);
+ const uint64_t* end = u64_data + p.aligned_words;
+
+ constexpr int64_t kCountUnrollFactor = 4;
+ const int64_t words_rounded = BitUtil::RoundDown(p.aligned_words, kCountUnrollFactor);
+ int64_t count_unroll[kCountUnrollFactor] = {0};
+
+ // Unroll the loop for better performance
+ for (int64_t i = 0; i < words_rounded; i += kCountUnrollFactor) {
+ for (int64_t k = 0; k < kCountUnrollFactor; k++) {
+ count_unroll[k] += BitUtil::PopCount(u64_data[k]);
+ }
+ u64_data += kCountUnrollFactor;
+ }
+ for (int64_t k = 0; k < kCountUnrollFactor; k++) {
+ count += count_unroll[k];
+ }
+
+ // The trailing part
+ for (; u64_data < end; ++u64_data) {
+ count += BitUtil::PopCount(*u64_data);
+ }
+ }
+
+  // Account for leftover bits (in theory we could fall back to smaller
+ // versions of popcount but the code complexity is likely not worth it)
+ for (int64_t i = p.trailing_bit_offset; i < bit_offset + length; ++i) {
+ if (BitUtil::GetBit(data, i)) {
+ ++count;
+ }
+ }
+
+ return count;
+}
+
+enum class TransferMode : bool { Copy, Invert };
+
+template <TransferMode mode>
+void TransferBitmap(const uint8_t* data, int64_t offset, int64_t length,
+ int64_t dest_offset, uint8_t* dest) {
+ int64_t bit_offset = offset % 8;
+ int64_t dest_bit_offset = dest_offset % 8;
+
+ if (bit_offset || dest_bit_offset) {
+ auto reader = internal::BitmapWordReader<uint64_t>(data, offset, length);
+ auto writer = internal::BitmapWordWriter<uint64_t>(dest, dest_offset, length);
+
+ auto nwords = reader.words();
+ while (nwords--) {
+ auto word = reader.NextWord();
+ writer.PutNextWord(mode == TransferMode::Invert ? ~word : word);
+ }
+ auto nbytes = reader.trailing_bytes();
+ while (nbytes--) {
+ int valid_bits;
+ auto byte = reader.NextTrailingByte(valid_bits);
+ writer.PutNextTrailingByte(mode == TransferMode::Invert ? ~byte : byte, valid_bits);
+ }
+ } else if (length) {
+ int64_t num_bytes = BitUtil::BytesForBits(length);
+
+ // Shift by its byte offset
+ data += offset / 8;
+ dest += dest_offset / 8;
+
+ // Take care of the trailing bits in the last byte
+ // E.g., if trailing_bits = 5, last byte should be
+ // - low 3 bits: new bits from last byte of data buffer
+ // - high 5 bits: old bits from last byte of dest buffer
+ int64_t trailing_bits = num_bytes * 8 - length;
+ uint8_t trail_mask = (1U << (8 - trailing_bits)) - 1;
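+    // Worked example: length = 19 gives num_bytes = 3, trailing_bits = 5 and
+    // trail_mask = 0b00000111, i.e. the low 3 bits of the last byte come from
+    // data and the high 5 bits of dest are preserved.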
+ uint8_t last_data;
+
+ if (mode == TransferMode::Invert) {
+ for (int64_t i = 0; i < num_bytes - 1; i++) {
+ dest[i] = static_cast<uint8_t>(~(data[i]));
+ }
+ last_data = ~data[num_bytes - 1];
+ } else {
+ std::memcpy(dest, data, static_cast<size_t>(num_bytes - 1));
+ last_data = data[num_bytes - 1];
+ }
+
+ // Set last byte
+ dest[num_bytes - 1] &= ~trail_mask;
+ dest[num_bytes - 1] |= last_data & trail_mask;
+ }
+}
+
+template <TransferMode mode>
+Result<std::shared_ptr<Buffer>> TransferBitmap(MemoryPool* pool, const uint8_t* data,
+ int64_t offset, int64_t length) {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateEmptyBitmap(length, pool));
+ uint8_t* dest = buffer->mutable_data();
+
+ TransferBitmap<mode>(data, offset, length, 0, dest);
+
+ // As we have freshly allocated this bitmap, we should take care of zeroing the
+ // remaining bits.
+ int64_t num_bytes = BitUtil::BytesForBits(length);
+ int64_t bits_to_zero = num_bytes * 8 - length;
+ for (int64_t i = length; i < length + bits_to_zero; ++i) {
+ // Both branches may copy extra bits - unsetting to match specification.
+ BitUtil::ClearBit(dest, i);
+ }
+ return buffer;
+}
+
+void CopyBitmap(const uint8_t* data, int64_t offset, int64_t length, uint8_t* dest,
+ int64_t dest_offset) {
+ TransferBitmap<TransferMode::Copy>(data, offset, length, dest_offset, dest);
+}
+
+void InvertBitmap(const uint8_t* data, int64_t offset, int64_t length, uint8_t* dest,
+ int64_t dest_offset) {
+ TransferBitmap<TransferMode::Invert>(data, offset, length, dest_offset, dest);
+}
+
+Result<std::shared_ptr<Buffer>> CopyBitmap(MemoryPool* pool, const uint8_t* data,
+ int64_t offset, int64_t length) {
+ return TransferBitmap<TransferMode::Copy>(pool, data, offset, length);
+}
+
+Result<std::shared_ptr<Buffer>> InvertBitmap(MemoryPool* pool, const uint8_t* data,
+ int64_t offset, int64_t length) {
+ return TransferBitmap<TransferMode::Invert>(pool, data, offset, length);
+}
+
+bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length) {
+ if (left_offset % 8 == 0 && right_offset % 8 == 0) {
+ // byte aligned, can use memcmp
+ bool bytes_equal =
+ std::memcmp(left + left_offset / 8, right + right_offset / 8, length / 8) == 0;
+ if (!bytes_equal) {
+ return false;
+ }
+ for (int64_t i = (length / 8) * 8; i < length; ++i) {
+ if (BitUtil::GetBit(left, left_offset + i) !=
+ BitUtil::GetBit(right, right_offset + i)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Unaligned slow case
+ auto left_reader = internal::BitmapWordReader<uint64_t>(left, left_offset, length);
+ auto right_reader = internal::BitmapWordReader<uint64_t>(right, right_offset, length);
+
+ auto nwords = left_reader.words();
+ while (nwords--) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
+ }
+ auto nbytes = left_reader.trailing_bytes();
+ while (nbytes--) {
+ int valid_bits;
+ if (left_reader.NextTrailingByte(valid_bits) !=
+ right_reader.NextTrailingByte(valid_bits)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length) {
+ if (left == nullptr && right == nullptr) {
+ return true;
+ } else if (left != nullptr && right != nullptr) {
+ return BitmapEquals(left, left_offset, right, right_offset, length);
+ } else if (left != nullptr) {
+ return CountSetBits(left, left_offset, length) == length;
+ } else {
+ return CountSetBits(right, right_offset, length) == length;
+ }
+}
+
+bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
+ const std::shared_ptr<Buffer>& right, int64_t right_offset,
+ int64_t length) {
+ return OptionalBitmapEquals(left ? left->data() : nullptr, left_offset,
+ right ? right->data() : nullptr, right_offset, length);
+}
+
+namespace {
+
+template <template <typename> class BitOp>
+void AlignedBitmapOp(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, uint8_t* out, int64_t out_offset,
+ int64_t length) {
+ BitOp<uint8_t> op;
+ DCHECK_EQ(left_offset % 8, right_offset % 8);
+ DCHECK_EQ(left_offset % 8, out_offset % 8);
+
+ const int64_t nbytes = BitUtil::BytesForBits(length + left_offset % 8);
+ left += left_offset / 8;
+ right += right_offset / 8;
+ out += out_offset / 8;
+ for (int64_t i = 0; i < nbytes; ++i) {
+ out[i] = op(left[i], right[i]);
+ }
+}
+
+template <template <typename> class BitOp>
+void UnalignedBitmapOp(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, uint8_t* out, int64_t out_offset,
+ int64_t length) {
+ BitOp<uint64_t> op_word;
+ BitOp<uint8_t> op_byte;
+
+ auto left_reader = internal::BitmapWordReader<uint64_t>(left, left_offset, length);
+ auto right_reader = internal::BitmapWordReader<uint64_t>(right, right_offset, length);
+ auto writer = internal::BitmapWordWriter<uint64_t>(out, out_offset, length);
+
+ auto nwords = left_reader.words();
+ while (nwords--) {
+ writer.PutNextWord(op_word(left_reader.NextWord(), right_reader.NextWord()));
+ }
+ auto nbytes = left_reader.trailing_bytes();
+ while (nbytes--) {
+ int left_valid_bits, right_valid_bits;
+ uint8_t left_byte = left_reader.NextTrailingByte(left_valid_bits);
+ uint8_t right_byte = right_reader.NextTrailingByte(right_valid_bits);
+ DCHECK_EQ(left_valid_bits, right_valid_bits);
+ writer.PutNextTrailingByte(op_byte(left_byte, right_byte), left_valid_bits);
+ }
+}
+
+template <template <typename> class BitOp>
+void BitmapOp(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* dest) {
+ if ((out_offset % 8 == left_offset % 8) && (out_offset % 8 == right_offset % 8)) {
+    // Fast case: offsets agree modulo 8, so the op can be applied bytewise
+ AlignedBitmapOp<BitOp>(left, left_offset, right, right_offset, dest, out_offset,
+ length);
+ } else {
+ // Unaligned
+ UnalignedBitmapOp<BitOp>(left, left_offset, right, right_offset, dest, out_offset,
+ length);
+ }
+}
+
+template <template <typename> class BitOp>
+Result<std::shared_ptr<Buffer>> BitmapOp(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset) {
+ const int64_t phys_bits = length + out_offset;
+ ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateEmptyBitmap(phys_bits, pool));
+ BitmapOp<BitOp>(left, left_offset, right, right_offset, length, out_offset,
+ out_buffer->mutable_data());
+ return out_buffer;
+}
+
+} // namespace
+
+Result<std::shared_ptr<Buffer>> BitmapAnd(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset) {
+ return BitmapOp<std::bit_and>(pool, left, left_offset, right, right_offset, length,
+ out_offset);
+}
+
+void BitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
+ BitmapOp<std::bit_and>(left, left_offset, right, right_offset, length, out_offset, out);
+}
+
+Result<std::shared_ptr<Buffer>> BitmapOr(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset) {
+ return BitmapOp<std::bit_or>(pool, left, left_offset, right, right_offset, length,
+ out_offset);
+}
+
+void BitmapOr(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
+ BitmapOp<std::bit_or>(left, left_offset, right, right_offset, length, out_offset, out);
+}
+
+Result<std::shared_ptr<Buffer>> BitmapXor(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset) {
+ return BitmapOp<std::bit_xor>(pool, left, left_offset, right, right_offset, length,
+ out_offset);
+}
+
+void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
+ BitmapOp<std::bit_xor>(left, left_offset, right, right_offset, length, out_offset, out);
+}
+
+template <typename T>
+struct AndNotOp {
+ constexpr T operator()(const T& l, const T& r) const { return l & ~r; }
+};
+
+Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset) {
+ return BitmapOp<AndNotOp>(pool, left, left_offset, right, right_offset, length,
+ out_offset);
+}
+
+void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset,
+ uint8_t* out) {
+ BitmapOp<AndNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
+}
+
+template <typename T>
+struct OrNotOp {
+ constexpr T operator()(const T& l, const T& r) const { return l | ~r; }
+};
+
+Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset) {
+ return BitmapOp<OrNotOp>(pool, left, left_offset, right, right_offset, length,
+ out_offset);
+}
+
+void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
+ BitmapOp<OrNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h
new file mode 100644
index 00000000000..40a7797a239
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h
@@ -0,0 +1,206 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/result.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+
+namespace internal {
+
+// ----------------------------------------------------------------------
+// Bitmap utilities
+
+/// Copy a bit range of an existing bitmap
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+///
+/// \return a Result containing the new bitmap buffer
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> CopyBitmap(MemoryPool* pool, const uint8_t* bitmap,
+ int64_t offset, int64_t length);
+
+/// Copy a bit range of an existing bitmap into an existing bitmap
+///
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[in] dest_offset bit offset into the destination
+/// \param[out] dest the destination buffer, must have at least space for
+/// (dest_offset + length) bits
+ARROW_EXPORT
+void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
+ int64_t dest_offset);
+
+/// Invert a bit range of an existing bitmap into an existing bitmap
+///
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[in] dest_offset bit offset into the destination
+/// \param[out] dest the destination buffer, must have at least space for
+/// (dest_offset + length) bits
+ARROW_EXPORT
+void InvertBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
+ int64_t dest_offset);
+
+/// Invert a bit range of an existing bitmap
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+///
+/// \return a Result containing the new bitmap buffer
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> InvertBitmap(MemoryPool* pool, const uint8_t* bitmap,
+ int64_t offset, int64_t length);
+
+/// Compute the number of 1's in the given data array
+///
+/// \param[in] data a packed LSB-ordered bitmap as a byte array
+/// \param[in] bit_offset a bitwise offset into the bitmap
+/// \param[in] length the number of bits to inspect in the bitmap relative to
+/// the offset
+///
+/// \return The number of set (1) bits in the range
+ARROW_EXPORT
+int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length);
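+
+// A usage sketch (editor's addition): counting the valid entries of an array
+// given its validity bitmap (`validity` and `length` assumed valid).
+//
+//   int64_t valid = arrow::internal::CountSetBits(validity, /*bit_offset=*/0, length);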
+
+ARROW_EXPORT
+bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length);
+
+// Same as BitmapEquals, but considers a NULL bitmap pointer the same as an
+// all-ones bitmap.
+ARROW_EXPORT
+bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length);
+
+ARROW_EXPORT
+bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
+ const std::shared_ptr<Buffer>& right, int64_t right_offset,
+ int64_t length);
+
+/// \brief Do a "bitmap and" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapAnd(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset);
+
+/// \brief Do a "bitmap and" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap or" for the given bit length on right and left buffers
+/// starting at their respective bit-offsets and put the results in out_buffer
+/// starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapOr(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset);
+
+/// \brief Do a "bitmap or" for the given bit length on right and left buffers
+/// starting at their respective bit-offsets and put the results in out
+/// starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapOr(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap xor" for the given bit-length on right and left
+/// buffers starting at their respective bit-offsets and put the results in
+/// out_buffer starting at the given bit offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapXor(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset);
+
+/// \brief Do a "bitmap xor" for the given bit-length on right and left
+/// buffers starting at their respective bit-offsets and put the results in
+/// out starting at the given bit offset.
+ARROW_EXPORT
+void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap and not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset);
+
+/// \brief Do a "bitmap and not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap or not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset);
+
+/// \brief Do a "bitmap or not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h
new file mode 100644
index 00000000000..7c43747fafb
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h
@@ -0,0 +1,271 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/buffer.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+class BitmapReader {
+ public:
+ BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap), position_(0), length_(length) {
+ current_byte_ = 0;
+ byte_offset_ = start_offset / 8;
+ bit_offset_ = start_offset % 8;
+ if (length > 0) {
+ current_byte_ = bitmap[byte_offset_];
+ }
+ }
+
+ bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
+
+ bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
+
+ void Next() {
+ ++bit_offset_;
+ ++position_;
+ if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
+ bit_offset_ = 0;
+ ++byte_offset_;
+ if (ARROW_PREDICT_TRUE(position_ < length_)) {
+ current_byte_ = bitmap_[byte_offset_];
+ }
+ }
+ }
+
+ int64_t position() const { return position_; }
+
+ int64_t length() const { return length_; }
+
+ private:
+ const uint8_t* bitmap_;
+ int64_t position_;
+ int64_t length_;
+
+ uint8_t current_byte_;
+ int64_t byte_offset_;
+ int64_t bit_offset_;
+};
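+
+// A usage sketch (editor's addition): counting set bits with BitmapReader,
+// assuming a valid `bitmap` and `length`. For large inputs, CountSetBits in
+// bitmap_ops.h is the faster choice.
+//
+//   arrow::internal::BitmapReader reader(bitmap, /*start_offset=*/0, length);
+//   int64_t count = 0;
+//   for (int64_t i = 0; i < length; ++i) {
+//     count += reader.IsSet();
+//     reader.Next();
+//   }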
+
+// XXX Cannot name it BitmapWordReader because the name is already used
+// in bitmap_ops.cc
+
+class BitmapUInt64Reader {
+ public:
+ BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap + start_offset / 8),
+ num_carry_bits_(8 - start_offset % 8),
+ length_(length),
+ remaining_length_(length_) {
+ if (length_ > 0) {
+ // Load carry bits from the first byte's MSBs
+ if (length_ >= num_carry_bits_) {
+ carry_bits_ =
+ LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
+ } else {
+ carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
+ }
+ }
+ }
+
+ uint64_t NextWord() {
+ if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
+ // We can load a full word
+ uint64_t next_word = LoadFullWord();
+ // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
+ uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
+ carry_bits_ = next_word >> (64 - num_carry_bits_);
+ remaining_length_ -= 64;
+ return word;
+ } else if (remaining_length_ > num_carry_bits_) {
+ // We can load a partial word
+ uint64_t next_word =
+ LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_);
+ uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
+ carry_bits_ = next_word >> (64 - num_carry_bits_);
+ remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
+ return word;
+ } else {
+ remaining_length_ = 0;
+ return carry_bits_;
+ }
+ }
+
+ int64_t position() const { return length_ - remaining_length_; }
+
+ int64_t length() const { return length_; }
+
+ private:
+ uint64_t LoadFullWord() {
+ uint64_t word;
+ memcpy(&word, bitmap_, 8);
+ bitmap_ += 8;
+ return BitUtil::ToLittleEndian(word);
+ }
+
+ uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
+ uint64_t word = 0;
+ const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
+ memcpy(&word, bitmap_, num_bytes);
+ bitmap_ += num_bytes;
+ return (BitUtil::ToLittleEndian(word) >> bit_offset) &
+ BitUtil::LeastSignificantBitMask(num_bits);
+ }
+
+ const uint8_t* bitmap_;
+ const int64_t num_carry_bits_; // in [1, 8]
+ const int64_t length_;
+ int64_t remaining_length_;
+ uint64_t carry_bits_;
+};
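+
+// A usage sketch (editor's addition): consuming a bitmap 64 bits at a time.
+// Bits past `length` in the final word read as zero.
+//
+//   arrow::internal::BitmapUInt64Reader reader(bitmap, /*start_offset=*/2, length);
+//   while (reader.position() < reader.length()) {
+//     uint64_t word = reader.NextWord();
+//     // ... process up to 64 bits ...
+//   }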
+
+// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
+// on sufficiently large inputs. However, it has a larger prolog / epilog overhead
+// and should probably not be used for small bitmaps.
+
+template <typename Word, bool may_have_byte_offset = true>
+class BitmapWordReader {
+ public:
+ BitmapWordReader() = default;
+ BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
+ : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+ bitmap_(bitmap + offset / 8),
+ bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) {
+ // decrement word count by one as we may touch two adjacent words in one iteration
+ nwords_ = length / (sizeof(Word) * 8) - 1;
+ if (nwords_ < 0) {
+ nwords_ = 0;
+ }
+ trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
+ trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
+
+ if (nwords_ > 0) {
+ current_word_ = load<Word>(bitmap_);
+ } else if (length > 0) {
+ current_byte_ = load<uint8_t>(bitmap_);
+ }
+ }
+
+ Word NextWord() {
+ bitmap_ += sizeof(Word);
+ const Word next_word = load<Word>(bitmap_);
+ Word word = current_word_;
+ if (may_have_byte_offset && offset_) {
+ // combine two adjacent words into one word
+ // |<------ next ----->|<---- current ---->|
+ // +-------------+-----+-------------+-----+
+ // | --- | A | B | --- |
+ // +-------------+-----+-------------+-----+
+ // | | offset
+ // v v
+ // +-----+-------------+
+ // | A | B |
+ // +-----+-------------+
+ // |<------ word ----->|
+ word >>= offset_;
+ word |= next_word << (sizeof(Word) * 8 - offset_);
+ }
+ current_word_ = next_word;
+ return word;
+ }
+
+ uint8_t NextTrailingByte(int& valid_bits) {
+ uint8_t byte;
+ assert(trailing_bits_ > 0);
+
+ if (trailing_bits_ <= 8) {
+ // last byte
+ valid_bits = trailing_bits_;
+ trailing_bits_ = 0;
+ byte = 0;
+ internal::BitmapReader reader(bitmap_, offset_, valid_bits);
+ for (int i = 0; i < valid_bits; ++i) {
+ byte >>= 1;
+ if (reader.IsSet()) {
+ byte |= 0x80;
+ }
+ reader.Next();
+ }
+ byte >>= (8 - valid_bits);
+ } else {
+ ++bitmap_;
+ const uint8_t next_byte = load<uint8_t>(bitmap_);
+ byte = current_byte_;
+ if (may_have_byte_offset && offset_) {
+ byte >>= offset_;
+ byte |= next_byte << (8 - offset_);
+ }
+ current_byte_ = next_byte;
+ trailing_bits_ -= 8;
+ trailing_bytes_--;
+ valid_bits = 8;
+ }
+ return byte;
+ }
+
+ int64_t words() const { return nwords_; }
+ int trailing_bytes() const { return trailing_bytes_; }
+
+ private:
+ int64_t offset_;
+ const uint8_t* bitmap_;
+
+ const uint8_t* bitmap_end_;
+ int64_t nwords_;
+ int trailing_bits_;
+ int trailing_bytes_;
+ union {
+ Word current_word_;
+ struct {
+#if ARROW_LITTLE_ENDIAN == 0
+ uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+ uint8_t current_byte_;
+ };
+ };
+
+ template <typename DType>
+ DType load(const uint8_t* bitmap) {
+ assert(bitmap + sizeof(DType) <= bitmap_end_);
+ return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+ }
+};
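+
+// A usage sketch (editor's addition), mirroring the read loop used by
+// TransferBitmap in bitmap_ops.cc:
+//
+//   arrow::internal::BitmapWordReader<uint64_t> reader(bitmap, offset, length);
+//   auto nwords = reader.words();
+//   while (nwords--) {
+//     uint64_t word = reader.NextWord();  // 64 valid bits
+//   }
+//   auto nbytes = reader.trailing_bytes();
+//   while (nbytes--) {
+//     int valid_bits;
+//     uint8_t byte = reader.NextTrailingByte(valid_bits);
+//     // only the low `valid_bits` bits of byte are meaningful
+//   }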
+
+/// \brief Index into a possibly non-existent bitmap
+struct OptionalBitIndexer {
+ const uint8_t* bitmap;
+ const int64_t offset;
+
+ explicit OptionalBitIndexer(const std::shared_ptr<Buffer>& buffer, int64_t offset = 0)
+ : bitmap(buffer == NULLPTR ? NULLPTR : buffer->data()), offset(offset) {}
+
+ bool operator[](int64_t i) const {
+ return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i);
+ }
+};
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h
new file mode 100644
index 00000000000..8a16993e052
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+
+namespace arrow {
+namespace internal {
+
+// A function that visits each bit in a bitmap and calls a visitor function with a
+// boolean representation of that bit. This is intended to be analogous to
+// GenerateBits.
+template <class Visitor>
+void VisitBits(const uint8_t* bitmap, int64_t start_offset, int64_t length,
+ Visitor&& visit) {
+ BitmapReader reader(bitmap, start_offset, length);
+ for (int64_t index = 0; index < length; ++index) {
+ visit(reader.IsSet());
+ reader.Next();
+ }
+}
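+
+// A usage sketch (editor's addition): collecting bits into a std::vector<bool>,
+// assuming a valid `bitmap` and `length`.
+//
+//   std::vector<bool> out;
+//   out.reserve(length);
+//   arrow::internal::VisitBits(bitmap, /*start_offset=*/0, length,
+//                              [&](bool bit) { out.push_back(bit); });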
+
+// Like VisitBits(), but unrolls its main loop for better performance.
+template <class Visitor>
+void VisitBitsUnrolled(const uint8_t* bitmap, int64_t start_offset, int64_t length,
+ Visitor&& visit) {
+ if (length == 0) {
+ return;
+ }
+
+ // Start by visiting any bits preceding the first full byte.
+ int64_t num_bits_before_full_bytes =
+ BitUtil::RoundUpToMultipleOf8(start_offset) - start_offset;
+ // Truncate num_bits_before_full_bytes if it is greater than length.
+ if (num_bits_before_full_bytes > length) {
+ num_bits_before_full_bytes = length;
+ }
+  // Use the non-loop-unrolled VisitBits since we don't want to add branches
+ VisitBits<Visitor>(bitmap, start_offset, num_bits_before_full_bytes, visit);
+
+ // Shift the start pointer to the first full byte and compute the
+ // number of full bytes to be read.
+ const uint8_t* first_full_byte = bitmap + BitUtil::CeilDiv(start_offset, 8);
+ const int64_t num_full_bytes = (length - num_bits_before_full_bytes) / 8;
+
+ // Iterate over each full byte of the input bitmap and call the visitor in
+ // a loop-unrolled manner.
+ for (int64_t byte_index = 0; byte_index < num_full_bytes; ++byte_index) {
+ // Get the current bit-packed byte value from the bitmap.
+ const uint8_t byte = *(first_full_byte + byte_index);
+
+ // Execute the visitor function on each bit of the current byte.
+ visit(BitUtil::GetBitFromByte(byte, 0));
+ visit(BitUtil::GetBitFromByte(byte, 1));
+ visit(BitUtil::GetBitFromByte(byte, 2));
+ visit(BitUtil::GetBitFromByte(byte, 3));
+ visit(BitUtil::GetBitFromByte(byte, 4));
+ visit(BitUtil::GetBitFromByte(byte, 5));
+ visit(BitUtil::GetBitFromByte(byte, 6));
+ visit(BitUtil::GetBitFromByte(byte, 7));
+ }
+
+  // Visit any leftover bits in the last partial byte.
+ const int64_t num_bits_after_full_bytes = (length - num_bits_before_full_bytes) % 8;
+ VisitBits<Visitor>(first_full_byte + num_full_bytes, 0, num_bits_after_full_bytes,
+ visit);
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h
new file mode 100644
index 00000000000..d5c6d909df0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h
@@ -0,0 +1,285 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+class BitmapWriter {
+ // A sequential bitwise writer that preserves surrounding bit values.
+
+ public:
+ BitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap), position_(0), length_(length) {
+ byte_offset_ = start_offset / 8;
+ bit_mask_ = BitUtil::kBitmask[start_offset % 8];
+ if (length > 0) {
+ current_byte_ = bitmap[byte_offset_];
+ } else {
+ current_byte_ = 0;
+ }
+ }
+
+ void Set() { current_byte_ |= bit_mask_; }
+
+ void Clear() { current_byte_ &= bit_mask_ ^ 0xFF; }
+
+ void Next() {
+ bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
+ ++position_;
+ if (bit_mask_ == 0) {
+      // Finished this byte, need to advance
+ bit_mask_ = 0x01;
+ bitmap_[byte_offset_++] = current_byte_;
+ if (ARROW_PREDICT_TRUE(position_ < length_)) {
+ current_byte_ = bitmap_[byte_offset_];
+ }
+ }
+ }
+
+ void Finish() {
+    // Store the current byte unless we have already advanced past the bitmap storage
+ if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
+ bitmap_[byte_offset_] = current_byte_;
+ }
+ }
+
+ int64_t position() const { return position_; }
+
+ private:
+ uint8_t* bitmap_;
+ int64_t position_;
+ int64_t length_;
+
+ uint8_t current_byte_;
+ uint8_t bit_mask_;
+ int64_t byte_offset_;
+};
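+
+// Usage sketch for BitmapWriter (illustrative only; the destination bytes
+// must be initialized, since surrounding bits are read back and preserved):
+//
+//   uint8_t bits[2] = {0, 0};
+//   BitmapWriter writer(bits, /*start_offset=*/3, /*length=*/5);
+//   for (int i = 0; i < 5; ++i) {
+//     if (i % 2 == 0) {
+//       writer.Set();
+//     } else {
+//       writer.Clear();
+//     }
+//     writer.Next();
+//   }
+//   writer.Finish();  // bits 3..7 now hold the pattern 1,0,1,0,1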
+
+class FirstTimeBitmapWriter {
+ // Like BitmapWriter, but any bit values *following* the bits written
+ // might be clobbered. It is hence faster than BitmapWriter, and can
+ // also avoid false positives with Valgrind.
+
+ public:
+ FirstTimeBitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap), position_(0), length_(length) {
+ current_byte_ = 0;
+ byte_offset_ = start_offset / 8;
+ bit_mask_ = BitUtil::kBitmask[start_offset % 8];
+ if (length > 0) {
+ current_byte_ = bitmap[byte_offset_] & BitUtil::kPrecedingBitmask[start_offset % 8];
+ } else {
+ current_byte_ = 0;
+ }
+ }
+
+  /// Appends number_of_bits from word to the bitmap at the current position.
+ ///
+ /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed
+ /// to be unset (i.e. 0).
+ /// \param[in] number_of_bits The number of bits to append from word.
+ void AppendWord(uint64_t word, int64_t number_of_bits) {
+ if (ARROW_PREDICT_FALSE(number_of_bits == 0)) {
+ return;
+ }
+
+ // Location that the first byte needs to be written to.
+ uint8_t* append_position = bitmap_ + byte_offset_;
+
+ // Update state variables except for current_byte_ here.
+ position_ += number_of_bits;
+ int64_t bit_offset = BitUtil::CountTrailingZeros(static_cast<uint32_t>(bit_mask_));
+ bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8];
+ byte_offset_ += (bit_offset + number_of_bits) / 8;
+
+ if (bit_offset != 0) {
+ // We are in the middle of the byte. This code updates the byte and shifts
+ // bits appropriately within word so it can be memcpy'd below.
+ int64_t bits_to_carry = 8 - bit_offset;
+      // Carry over bits from word to current_byte_. We assume any extra bits
+      // in word are unset, so no additional accounting is needed when
+      // number_of_bits < bits_to_carry.
+ current_byte_ |= (word & BitUtil::kPrecedingBitmask[bits_to_carry]) << bit_offset;
+      // Check if everything has been transferred into current_byte_.
+ if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) {
+ return;
+ }
+ *append_position = current_byte_;
+ append_position++;
+ // Move the carry bits off of word.
+ word = word >> bits_to_carry;
+ number_of_bits -= bits_to_carry;
+ }
+ word = BitUtil::ToLittleEndian(word);
+ int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits);
+ std::memcpy(append_position, &word, bytes_for_word);
+ // At this point, the previous current_byte_ has been written to bitmap_.
+ // The new current_byte_ is either the last relevant byte in 'word'
+ // or cleared if the new position is byte aligned (i.e. a fresh byte).
+ if (bit_mask_ == 0x1) {
+ current_byte_ = 0;
+ } else {
+ current_byte_ = *(append_position + bytes_for_word - 1);
+ }
+ }
+
+ void Set() { current_byte_ |= bit_mask_; }
+
+  void Clear() {}  // No-op: bits start out unset, so there is nothing to clear.
+
+ void Next() {
+ bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
+ ++position_;
+ if (bit_mask_ == 0) {
+      // Finished this byte; advance to the next one
+ bit_mask_ = 0x01;
+ bitmap_[byte_offset_++] = current_byte_;
+ current_byte_ = 0;
+ }
+ }
+
+ void Finish() {
+    // Store the current byte if we didn't go past the bitmap storage
+ if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
+ bitmap_[byte_offset_] = current_byte_;
+ }
+ }
+
+ int64_t position() const { return position_; }
+
+ private:
+ uint8_t* bitmap_;
+ int64_t position_;
+ int64_t length_;
+
+ uint8_t current_byte_;
+ uint8_t bit_mask_;
+ int64_t byte_offset_;
+};
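+
+// Usage sketch for FirstTimeBitmapWriter (illustrative only): AppendWord lets
+// a caller splice in several bits at once; bytes past the written region may
+// be clobbered, so the destination need not be pre-cleared.
+//
+//   uint8_t bits[2];
+//   FirstTimeBitmapWriter writer(bits, /*start_offset=*/0, /*length=*/12);
+//   writer.AppendWord(0x5, /*number_of_bits=*/3);    // bits 0..2 = 1,0,1
+//   writer.AppendWord(0x1FF, /*number_of_bits=*/9);  // bits 3..11 all set
+//   writer.Finish();  // bits[0] == 0xFD, bits[1] == 0x0F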
+
+template <typename Word, bool may_have_byte_offset = true>
+class BitmapWordWriter {
+ public:
+ BitmapWordWriter() = default;
+ BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length)
+ : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+ bitmap_(bitmap + offset / 8),
+ bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)),
+ mask_((1U << offset_) - 1) {
+ if (offset_) {
+ if (length >= static_cast<int>(sizeof(Word) * 8)) {
+ current_word_ = load<Word>(bitmap_);
+ } else if (length > 0) {
+ current_byte_ = load<uint8_t>(bitmap_);
+ }
+ }
+ }
+
+ void PutNextWord(Word word) {
+ if (may_have_byte_offset && offset_) {
+      // Split one word across two adjacent words, leaving unused bits untouched:
+ // |<------ word ----->|
+ // +-----+-------------+
+ // | A | B |
+ // +-----+-------------+
+ // | |
+ // v v offset
+ // +-------------+-----+-------------+-----+
+ // | --- | A | B | --- |
+ // +-------------+-----+-------------+-----+
+ // |<------ next ----->|<---- current ---->|
+ word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
+ Word next_word = load<Word>(bitmap_ + sizeof(Word));
+ current_word_ = (current_word_ & mask_) | (word & ~mask_);
+ next_word = (next_word & ~mask_) | (word & mask_);
+ store<Word>(bitmap_, current_word_);
+ store<Word>(bitmap_ + sizeof(Word), next_word);
+ current_word_ = next_word;
+ } else {
+ store<Word>(bitmap_, word);
+ }
+ bitmap_ += sizeof(Word);
+ }
+
+ void PutNextTrailingByte(uint8_t byte, int valid_bits) {
+ if (valid_bits == 8) {
+ if (may_have_byte_offset && offset_) {
+ byte = (byte << offset_) | (byte >> (8 - offset_));
+ uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
+ current_byte_ = (current_byte_ & mask_) | (byte & ~mask_);
+ next_byte = (next_byte & ~mask_) | (byte & mask_);
+ store<uint8_t>(bitmap_, current_byte_);
+ store<uint8_t>(bitmap_ + 1, next_byte);
+ current_byte_ = next_byte;
+ } else {
+ store<uint8_t>(bitmap_, byte);
+ }
+ ++bitmap_;
+ } else {
+ assert(valid_bits > 0);
+ assert(valid_bits < 8);
+ assert(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits) <= bitmap_end_);
+ internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
+ for (int i = 0; i < valid_bits; ++i) {
+ (byte & 0x01) ? writer.Set() : writer.Clear();
+ writer.Next();
+ byte >>= 1;
+ }
+ writer.Finish();
+ }
+ }
+
+ private:
+ int64_t offset_;
+ uint8_t* bitmap_;
+
+ const uint8_t* bitmap_end_;
+ uint64_t mask_;
+ union {
+ Word current_word_;
+ struct {
+#if ARROW_LITTLE_ENDIAN == 0
+ uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+ uint8_t current_byte_;
+ };
+ };
+
+ template <typename DType>
+ DType load(const uint8_t* bitmap) {
+ assert(bitmap + sizeof(DType) <= bitmap_end_);
+ return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+ }
+
+ template <typename DType>
+ void store(uint8_t* bitmap, DType data) {
+ assert(bitmap + sizeof(DType) <= bitmap_end_);
+ util::SafeStore(bitmap, BitUtil::FromLittleEndian(data));
+ }
+};
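+
+// Usage sketch for BitmapWordWriter (illustrative only, byte-aligned case):
+// write one full 64-bit word of a 70-bit bitmap, then flush the remaining
+// six bits through the trailing-byte path.
+//
+//   uint8_t dest[9] = {0};
+//   BitmapWordWriter<uint64_t> writer(dest, /*offset=*/0, /*length=*/70);
+//   writer.PutNextWord(UINT64_C(0xDEADBEEFDEADBEEF));    // bits 0..63
+//   writer.PutNextTrailingByte(0x2A, /*valid_bits=*/6);  // bits 64..69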
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc
new file mode 100644
index 00000000000..d9cafd602a2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bpacking.h"
+#include "arrow/util/bpacking_default.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/dispatch.h"
+#include "arrow/util/logging.h"
+
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+#error #include "arrow/util/bpacking_avx2.h"
+#endif
+#if defined(ARROW_HAVE_RUNTIME_AVX512)
+#error #include "arrow/util/bpacking_avx512.h"
+#endif
+#if defined(ARROW_HAVE_NEON)
+#error #include "arrow/util/bpacking_neon.h"
+#endif
+
+namespace arrow {
+namespace internal {
+
+namespace {
+
+int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
+  // Round batch_size down to a whole number of 32-value blocks; the return
+  // value tells the caller how many values were actually unpacked.
+  batch_size = batch_size / 32 * 32;
+  int num_loops = batch_size / 32;
+
+ switch (num_bits) {
+ case 0:
+ for (int i = 0; i < num_loops; ++i) in = nullunpacker32(in, out + i * 32);
+ break;
+ case 1:
+ for (int i = 0; i < num_loops; ++i) in = unpack1_32(in, out + i * 32);
+ break;
+ case 2:
+ for (int i = 0; i < num_loops; ++i) in = unpack2_32(in, out + i * 32);
+ break;
+ case 3:
+ for (int i = 0; i < num_loops; ++i) in = unpack3_32(in, out + i * 32);
+ break;
+ case 4:
+ for (int i = 0; i < num_loops; ++i) in = unpack4_32(in, out + i * 32);
+ break;
+ case 5:
+ for (int i = 0; i < num_loops; ++i) in = unpack5_32(in, out + i * 32);
+ break;
+ case 6:
+ for (int i = 0; i < num_loops; ++i) in = unpack6_32(in, out + i * 32);
+ break;
+ case 7:
+ for (int i = 0; i < num_loops; ++i) in = unpack7_32(in, out + i * 32);
+ break;
+ case 8:
+ for (int i = 0; i < num_loops; ++i) in = unpack8_32(in, out + i * 32);
+ break;
+ case 9:
+ for (int i = 0; i < num_loops; ++i) in = unpack9_32(in, out + i * 32);
+ break;
+ case 10:
+ for (int i = 0; i < num_loops; ++i) in = unpack10_32(in, out + i * 32);
+ break;
+ case 11:
+ for (int i = 0; i < num_loops; ++i) in = unpack11_32(in, out + i * 32);
+ break;
+ case 12:
+ for (int i = 0; i < num_loops; ++i) in = unpack12_32(in, out + i * 32);
+ break;
+ case 13:
+ for (int i = 0; i < num_loops; ++i) in = unpack13_32(in, out + i * 32);
+ break;
+ case 14:
+ for (int i = 0; i < num_loops; ++i) in = unpack14_32(in, out + i * 32);
+ break;
+ case 15:
+ for (int i = 0; i < num_loops; ++i) in = unpack15_32(in, out + i * 32);
+ break;
+ case 16:
+ for (int i = 0; i < num_loops; ++i) in = unpack16_32(in, out + i * 32);
+ break;
+ case 17:
+ for (int i = 0; i < num_loops; ++i) in = unpack17_32(in, out + i * 32);
+ break;
+ case 18:
+ for (int i = 0; i < num_loops; ++i) in = unpack18_32(in, out + i * 32);
+ break;
+ case 19:
+ for (int i = 0; i < num_loops; ++i) in = unpack19_32(in, out + i * 32);
+ break;
+ case 20:
+ for (int i = 0; i < num_loops; ++i) in = unpack20_32(in, out + i * 32);
+ break;
+ case 21:
+ for (int i = 0; i < num_loops; ++i) in = unpack21_32(in, out + i * 32);
+ break;
+ case 22:
+ for (int i = 0; i < num_loops; ++i) in = unpack22_32(in, out + i * 32);
+ break;
+ case 23:
+ for (int i = 0; i < num_loops; ++i) in = unpack23_32(in, out + i * 32);
+ break;
+ case 24:
+ for (int i = 0; i < num_loops; ++i) in = unpack24_32(in, out + i * 32);
+ break;
+ case 25:
+ for (int i = 0; i < num_loops; ++i) in = unpack25_32(in, out + i * 32);
+ break;
+ case 26:
+ for (int i = 0; i < num_loops; ++i) in = unpack26_32(in, out + i * 32);
+ break;
+ case 27:
+ for (int i = 0; i < num_loops; ++i) in = unpack27_32(in, out + i * 32);
+ break;
+ case 28:
+ for (int i = 0; i < num_loops; ++i) in = unpack28_32(in, out + i * 32);
+ break;
+ case 29:
+ for (int i = 0; i < num_loops; ++i) in = unpack29_32(in, out + i * 32);
+ break;
+ case 30:
+ for (int i = 0; i < num_loops; ++i) in = unpack30_32(in, out + i * 32);
+ break;
+ case 31:
+ for (int i = 0; i < num_loops; ++i) in = unpack31_32(in, out + i * 32);
+ break;
+ case 32:
+ for (int i = 0; i < num_loops; ++i) in = unpack32_32(in, out + i * 32);
+ break;
+ default:
+ DCHECK(false) << "Unsupported num_bits";
+ }
+
+ return batch_size;
+}
+
+struct Unpack32DynamicFunction {
+ using FunctionType = decltype(&unpack32_default);
+
+ static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+ return {
+ { DispatchLevel::NONE, unpack32_default }
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ , { DispatchLevel::AVX2, unpack32_avx2 }
+#endif
+#if defined(ARROW_HAVE_RUNTIME_AVX512)
+ , { DispatchLevel::AVX512, unpack32_avx512 }
+#endif
+ };
+ }
+};
+
+} // namespace
+
+int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
+#if defined(ARROW_HAVE_NEON)
+ return unpack32_neon(in, out, batch_size, num_bits);
+#else
+ static DynamicDispatch<Unpack32DynamicFunction> dispatch;
+ return dispatch.func(in, out, batch_size, num_bits);
+#endif
+}
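+
+// Usage sketch (illustrative only): decode 64 values that were bit-packed at
+// 5 bits each (64 * 5 / 32 = 10 input words). batch_size is rounded down to
+// a multiple of 32, and the return value is the number of values decoded.
+//
+//   const uint32_t* packed = ...;  // at least 10 words of packed input
+//   uint32_t decoded[64];
+//   int n = unpack32(packed, decoded, /*batch_size=*/64, /*num_bits=*/5);
+//   // n == 64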
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h
new file mode 100644
index 00000000000..e5a4dbbed89
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/endian.h"
+#include "arrow/util/visibility.h"
+
+#include <stdint.h>
+
+namespace arrow {
+namespace internal {
+
+ARROW_EXPORT
+int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits);
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking_default.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking_default.h
new file mode 100644
index 00000000000..d2516effa4d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking_default.h
@@ -0,0 +1,4251 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file was modified from its original version for inclusion in parquet-cpp.
+// Original source:
+// https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp
+// The original copyright notice follows.
+
+// This code is released under the
+// Apache License Version 2.0 http://www.apache.org/licenses/.
+// (c) Daniel Lemire 2013
+
+#pragma once
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace internal {
+
+inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) & 1;
+ out++;
+ *out = (inl >> 1) & 1;
+ out++;
+ *out = (inl >> 2) & 1;
+ out++;
+ *out = (inl >> 3) & 1;
+ out++;
+ *out = (inl >> 4) & 1;
+ out++;
+ *out = (inl >> 5) & 1;
+ out++;
+ *out = (inl >> 6) & 1;
+ out++;
+ *out = (inl >> 7) & 1;
+ out++;
+ *out = (inl >> 8) & 1;
+ out++;
+ *out = (inl >> 9) & 1;
+ out++;
+ *out = (inl >> 10) & 1;
+ out++;
+ *out = (inl >> 11) & 1;
+ out++;
+ *out = (inl >> 12) & 1;
+ out++;
+ *out = (inl >> 13) & 1;
+ out++;
+ *out = (inl >> 14) & 1;
+ out++;
+ *out = (inl >> 15) & 1;
+ out++;
+ *out = (inl >> 16) & 1;
+ out++;
+ *out = (inl >> 17) & 1;
+ out++;
+ *out = (inl >> 18) & 1;
+ out++;
+ *out = (inl >> 19) & 1;
+ out++;
+ *out = (inl >> 20) & 1;
+ out++;
+ *out = (inl >> 21) & 1;
+ out++;
+ *out = (inl >> 22) & 1;
+ out++;
+ *out = (inl >> 23) & 1;
+ out++;
+ *out = (inl >> 24) & 1;
+ out++;
+ *out = (inl >> 25) & 1;
+ out++;
+ *out = (inl >> 26) & 1;
+ out++;
+ *out = (inl >> 27) & 1;
+ out++;
+ *out = (inl >> 28) & 1;
+ out++;
+ *out = (inl >> 29) & 1;
+ out++;
+ *out = (inl >> 30) & 1;
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 2);
+ out++;
+ *out = (inl >> 2) % (1U << 2);
+ out++;
+ *out = (inl >> 4) % (1U << 2);
+ out++;
+ *out = (inl >> 6) % (1U << 2);
+ out++;
+ *out = (inl >> 8) % (1U << 2);
+ out++;
+ *out = (inl >> 10) % (1U << 2);
+ out++;
+ *out = (inl >> 12) % (1U << 2);
+ out++;
+ *out = (inl >> 14) % (1U << 2);
+ out++;
+ *out = (inl >> 16) % (1U << 2);
+ out++;
+ *out = (inl >> 18) % (1U << 2);
+ out++;
+ *out = (inl >> 20) % (1U << 2);
+ out++;
+ *out = (inl >> 22) % (1U << 2);
+ out++;
+ *out = (inl >> 24) % (1U << 2);
+ out++;
+ *out = (inl >> 26) % (1U << 2);
+ out++;
+ *out = (inl >> 28) % (1U << 2);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 2);
+ out++;
+ *out = (inl >> 2) % (1U << 2);
+ out++;
+ *out = (inl >> 4) % (1U << 2);
+ out++;
+ *out = (inl >> 6) % (1U << 2);
+ out++;
+ *out = (inl >> 8) % (1U << 2);
+ out++;
+ *out = (inl >> 10) % (1U << 2);
+ out++;
+ *out = (inl >> 12) % (1U << 2);
+ out++;
+ *out = (inl >> 14) % (1U << 2);
+ out++;
+ *out = (inl >> 16) % (1U << 2);
+ out++;
+ *out = (inl >> 18) % (1U << 2);
+ out++;
+ *out = (inl >> 20) % (1U << 2);
+ out++;
+ *out = (inl >> 22) % (1U << 2);
+ out++;
+ *out = (inl >> 24) % (1U << 2);
+ out++;
+ *out = (inl >> 26) % (1U << 2);
+ out++;
+ *out = (inl >> 28) % (1U << 2);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 3);
+ out++;
+ *out = (inl >> 3) % (1U << 3);
+ out++;
+ *out = (inl >> 6) % (1U << 3);
+ out++;
+ *out = (inl >> 9) % (1U << 3);
+ out++;
+ *out = (inl >> 12) % (1U << 3);
+ out++;
+ *out = (inl >> 15) % (1U << 3);
+ out++;
+ *out = (inl >> 18) % (1U << 3);
+ out++;
+ *out = (inl >> 21) % (1U << 3);
+ out++;
+ *out = (inl >> 24) % (1U << 3);
+ out++;
+ *out = (inl >> 27) % (1U << 3);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (3 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 3);
+ out++;
+ *out = (inl >> 4) % (1U << 3);
+ out++;
+ *out = (inl >> 7) % (1U << 3);
+ out++;
+ *out = (inl >> 10) % (1U << 3);
+ out++;
+ *out = (inl >> 13) % (1U << 3);
+ out++;
+ *out = (inl >> 16) % (1U << 3);
+ out++;
+ *out = (inl >> 19) % (1U << 3);
+ out++;
+ *out = (inl >> 22) % (1U << 3);
+ out++;
+ *out = (inl >> 25) % (1U << 3);
+ out++;
+ *out = (inl >> 28) % (1U << 3);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (3 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 3);
+ out++;
+ *out = (inl >> 5) % (1U << 3);
+ out++;
+ *out = (inl >> 8) % (1U << 3);
+ out++;
+ *out = (inl >> 11) % (1U << 3);
+ out++;
+ *out = (inl >> 14) % (1U << 3);
+ out++;
+ *out = (inl >> 17) % (1U << 3);
+ out++;
+ *out = (inl >> 20) % (1U << 3);
+ out++;
+ *out = (inl >> 23) % (1U << 3);
+ out++;
+ *out = (inl >> 26) % (1U << 3);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ out++;
+
+ return in;
+}
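+
+// Worked example of the carry pattern above (illustrative): with 3-bit
+// packing, the 11th value straddles the first word boundary. `(inl >> 30)`
+// yields its low 2 bits from word 0, and after the next load,
+// `*out |= (inl % (1U << 1)) << (3 - 1)` merges in its high bit from the
+// low bit of word 1.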
+
+inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 4);
+ out++;
+ *out = (inl >> 4) % (1U << 4);
+ out++;
+ *out = (inl >> 8) % (1U << 4);
+ out++;
+ *out = (inl >> 12) % (1U << 4);
+ out++;
+ *out = (inl >> 16) % (1U << 4);
+ out++;
+ *out = (inl >> 20) % (1U << 4);
+ out++;
+ *out = (inl >> 24) % (1U << 4);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 4);
+ out++;
+ *out = (inl >> 4) % (1U << 4);
+ out++;
+ *out = (inl >> 8) % (1U << 4);
+ out++;
+ *out = (inl >> 12) % (1U << 4);
+ out++;
+ *out = (inl >> 16) % (1U << 4);
+ out++;
+ *out = (inl >> 20) % (1U << 4);
+ out++;
+ *out = (inl >> 24) % (1U << 4);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 4);
+ out++;
+ *out = (inl >> 4) % (1U << 4);
+ out++;
+ *out = (inl >> 8) % (1U << 4);
+ out++;
+ *out = (inl >> 12) % (1U << 4);
+ out++;
+ *out = (inl >> 16) % (1U << 4);
+ out++;
+ *out = (inl >> 20) % (1U << 4);
+ out++;
+ *out = (inl >> 24) % (1U << 4);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 4);
+ out++;
+ *out = (inl >> 4) % (1U << 4);
+ out++;
+ *out = (inl >> 8) % (1U << 4);
+ out++;
+ *out = (inl >> 12) % (1U << 4);
+ out++;
+ *out = (inl >> 16) % (1U << 4);
+ out++;
+ *out = (inl >> 20) % (1U << 4);
+ out++;
+ *out = (inl >> 24) % (1U << 4);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 5);
+ out++;
+ *out = (inl >> 5) % (1U << 5);
+ out++;
+ *out = (inl >> 10) % (1U << 5);
+ out++;
+ *out = (inl >> 15) % (1U << 5);
+ out++;
+ *out = (inl >> 20) % (1U << 5);
+ out++;
+ *out = (inl >> 25) % (1U << 5);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (5 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 5);
+ out++;
+ *out = (inl >> 8) % (1U << 5);
+ out++;
+ *out = (inl >> 13) % (1U << 5);
+ out++;
+ *out = (inl >> 18) % (1U << 5);
+ out++;
+ *out = (inl >> 23) % (1U << 5);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (5 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 5);
+ out++;
+ *out = (inl >> 6) % (1U << 5);
+ out++;
+ *out = (inl >> 11) % (1U << 5);
+ out++;
+ *out = (inl >> 16) % (1U << 5);
+ out++;
+ *out = (inl >> 21) % (1U << 5);
+ out++;
+ *out = (inl >> 26) % (1U << 5);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (5 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 5);
+ out++;
+ *out = (inl >> 9) % (1U << 5);
+ out++;
+ *out = (inl >> 14) % (1U << 5);
+ out++;
+ *out = (inl >> 19) % (1U << 5);
+ out++;
+ *out = (inl >> 24) % (1U << 5);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (5 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 5);
+ out++;
+ *out = (inl >> 7) % (1U << 5);
+ out++;
+ *out = (inl >> 12) % (1U << 5);
+ out++;
+ *out = (inl >> 17) % (1U << 5);
+ out++;
+ *out = (inl >> 22) % (1U << 5);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 6);
+ out++;
+ *out = (inl >> 6) % (1U << 6);
+ out++;
+ *out = (inl >> 12) % (1U << 6);
+ out++;
+ *out = (inl >> 18) % (1U << 6);
+ out++;
+ *out = (inl >> 24) % (1U << 6);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (6 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 6);
+ out++;
+ *out = (inl >> 10) % (1U << 6);
+ out++;
+ *out = (inl >> 16) % (1U << 6);
+ out++;
+ *out = (inl >> 22) % (1U << 6);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (6 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 6);
+ out++;
+ *out = (inl >> 8) % (1U << 6);
+ out++;
+ *out = (inl >> 14) % (1U << 6);
+ out++;
+ *out = (inl >> 20) % (1U << 6);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 6);
+ out++;
+ *out = (inl >> 6) % (1U << 6);
+ out++;
+ *out = (inl >> 12) % (1U << 6);
+ out++;
+ *out = (inl >> 18) % (1U << 6);
+ out++;
+ *out = (inl >> 24) % (1U << 6);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (6 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 6);
+ out++;
+ *out = (inl >> 10) % (1U << 6);
+ out++;
+ *out = (inl >> 16) % (1U << 6);
+ out++;
+ *out = (inl >> 22) % (1U << 6);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (6 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 6);
+ out++;
+ *out = (inl >> 8) % (1U << 6);
+ out++;
+ *out = (inl >> 14) % (1U << 6);
+ out++;
+ *out = (inl >> 20) % (1U << 6);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 7);
+ out++;
+ *out = (inl >> 7) % (1U << 7);
+ out++;
+ *out = (inl >> 14) % (1U << 7);
+ out++;
+ *out = (inl >> 21) % (1U << 7);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (7 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 7);
+ out++;
+ *out = (inl >> 10) % (1U << 7);
+ out++;
+ *out = (inl >> 17) % (1U << 7);
+ out++;
+ *out = (inl >> 24) % (1U << 7);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (7 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 7);
+ out++;
+ *out = (inl >> 13) % (1U << 7);
+ out++;
+ *out = (inl >> 20) % (1U << 7);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (7 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 7);
+ out++;
+ *out = (inl >> 9) % (1U << 7);
+ out++;
+ *out = (inl >> 16) % (1U << 7);
+ out++;
+ *out = (inl >> 23) % (1U << 7);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (7 - 5);
+ out++;
+ *out = (inl >> 5) % (1U << 7);
+ out++;
+ *out = (inl >> 12) % (1U << 7);
+ out++;
+ *out = (inl >> 19) % (1U << 7);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (7 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 7);
+ out++;
+ *out = (inl >> 8) % (1U << 7);
+ out++;
+ *out = (inl >> 15) % (1U << 7);
+ out++;
+ *out = (inl >> 22) % (1U << 7);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (7 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 7);
+ out++;
+ *out = (inl >> 11) % (1U << 7);
+ out++;
+ *out = (inl >> 18) % (1U << 7);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 8);
+ out++;
+ *out = (inl >> 8) % (1U << 8);
+ out++;
+ *out = (inl >> 16) % (1U << 8);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 8);
+ out++;
+ *out = (inl >> 8) % (1U << 8);
+ out++;
+ *out = (inl >> 16) % (1U << 8);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 8);
+ out++;
+ *out = (inl >> 8) % (1U << 8);
+ out++;
+ *out = (inl >> 16) % (1U << 8);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 8);
+ out++;
+ *out = (inl >> 8) % (1U << 8);
+ out++;
+ *out = (inl >> 16) % (1U << 8);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 8);
+ out++;
+ *out = (inl >> 8) % (1U << 8);
+ out++;
+ *out = (inl >> 16) % (1U << 8);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 8);
+ out++;
+ *out = (inl >> 8) % (1U << 8);
+ out++;
+ *out = (inl >> 16) % (1U << 8);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 8);
+ out++;
+ *out = (inl >> 8) % (1U << 8);
+ out++;
+ *out = (inl >> 16) % (1U << 8);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 8);
+ out++;
+ *out = (inl >> 8) % (1U << 8);
+ out++;
+ *out = (inl >> 16) % (1U << 8);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 9);
+ out++;
+ *out = (inl >> 9) % (1U << 9);
+ out++;
+ *out = (inl >> 18) % (1U << 9);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (9 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 9);
+ out++;
+ *out = (inl >> 13) % (1U << 9);
+ out++;
+ *out = (inl >> 22) % (1U << 9);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (9 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 9);
+ out++;
+ *out = (inl >> 17) % (1U << 9);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (9 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 9);
+ out++;
+ *out = (inl >> 12) % (1U << 9);
+ out++;
+ *out = (inl >> 21) % (1U << 9);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (9 - 7);
+ out++;
+ *out = (inl >> 7) % (1U << 9);
+ out++;
+ *out = (inl >> 16) % (1U << 9);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (9 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 9);
+ out++;
+ *out = (inl >> 11) % (1U << 9);
+ out++;
+ *out = (inl >> 20) % (1U << 9);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (9 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 9);
+ out++;
+ *out = (inl >> 15) % (1U << 9);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (9 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 9);
+ out++;
+ *out = (inl >> 10) % (1U << 9);
+ out++;
+ *out = (inl >> 19) % (1U << 9);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (9 - 5);
+ out++;
+ *out = (inl >> 5) % (1U << 9);
+ out++;
+ *out = (inl >> 14) % (1U << 9);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 10);
+ out++;
+ *out = (inl >> 10) % (1U << 10);
+ out++;
+ *out = (inl >> 20) % (1U << 10);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (10 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 10);
+ out++;
+ *out = (inl >> 18) % (1U << 10);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (10 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 10);
+ out++;
+ *out = (inl >> 16) % (1U << 10);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (10 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 10);
+ out++;
+ *out = (inl >> 14) % (1U << 10);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (10 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 10);
+ out++;
+ *out = (inl >> 12) % (1U << 10);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 10);
+ out++;
+ *out = (inl >> 10) % (1U << 10);
+ out++;
+ *out = (inl >> 20) % (1U << 10);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (10 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 10);
+ out++;
+ *out = (inl >> 18) % (1U << 10);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (10 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 10);
+ out++;
+ *out = (inl >> 16) % (1U << 10);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (10 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 10);
+ out++;
+ *out = (inl >> 14) % (1U << 10);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (10 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 10);
+ out++;
+ *out = (inl >> 12) % (1U << 10);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 11);
+ out++;
+ *out = (inl >> 11) % (1U << 11);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (11 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 11);
+ out++;
+ *out = (inl >> 12) % (1U << 11);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (11 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 11);
+ out++;
+ *out = (inl >> 13) % (1U << 11);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (11 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 11);
+ out++;
+ *out = (inl >> 14) % (1U << 11);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (11 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 11);
+ out++;
+ *out = (inl >> 15) % (1U << 11);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (11 - 5);
+ out++;
+ *out = (inl >> 5) % (1U << 11);
+ out++;
+ *out = (inl >> 16) % (1U << 11);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (11 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 11);
+ out++;
+ *out = (inl >> 17) % (1U << 11);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (11 - 7);
+ out++;
+ *out = (inl >> 7) % (1U << 11);
+ out++;
+ *out = (inl >> 18) % (1U << 11);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (11 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 11);
+ out++;
+ *out = (inl >> 19) % (1U << 11);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (11 - 9);
+ out++;
+ *out = (inl >> 9) % (1U << 11);
+ out++;
+ *out = (inl >> 20) % (1U << 11);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (11 - 10);
+ out++;
+ *out = (inl >> 10) % (1U << 11);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 12);
+ out++;
+ *out = (inl >> 12) % (1U << 12);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (12 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 12);
+ out++;
+ *out = (inl >> 16) % (1U << 12);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (12 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 12);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 12);
+ out++;
+ *out = (inl >> 12) % (1U << 12);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (12 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 12);
+ out++;
+ *out = (inl >> 16) % (1U << 12);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (12 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 12);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 12);
+ out++;
+ *out = (inl >> 12) % (1U << 12);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (12 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 12);
+ out++;
+ *out = (inl >> 16) % (1U << 12);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (12 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 12);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 12);
+ out++;
+ *out = (inl >> 12) % (1U << 12);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (12 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 12);
+ out++;
+ *out = (inl >> 16) % (1U << 12);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (12 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 12);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 13);
+ out++;
+ *out = (inl >> 13) % (1U << 13);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (13 - 7);
+ out++;
+ *out = (inl >> 7) % (1U << 13);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (13 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 13);
+ out++;
+ *out = (inl >> 14) % (1U << 13);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (13 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 13);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (13 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 13);
+ out++;
+ *out = (inl >> 15) % (1U << 13);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (13 - 9);
+ out++;
+ *out = (inl >> 9) % (1U << 13);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (13 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 13);
+ out++;
+ *out = (inl >> 16) % (1U << 13);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (13 - 10);
+ out++;
+ *out = (inl >> 10) % (1U << 13);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (13 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 13);
+ out++;
+ *out = (inl >> 17) % (1U << 13);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 11)) << (13 - 11);
+ out++;
+ *out = (inl >> 11) % (1U << 13);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (13 - 5);
+ out++;
+ *out = (inl >> 5) % (1U << 13);
+ out++;
+ *out = (inl >> 18) % (1U << 13);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (13 - 12);
+ out++;
+ *out = (inl >> 12) % (1U << 13);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (13 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 13);
+ out++;
+ *out = (inl >> 19);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 14);
+ out++;
+ *out = (inl >> 14) % (1U << 14);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (14 - 10);
+ out++;
+ *out = (inl >> 10) % (1U << 14);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (14 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 14);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (14 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 14);
+ out++;
+ *out = (inl >> 16) % (1U << 14);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (14 - 12);
+ out++;
+ *out = (inl >> 12) % (1U << 14);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (14 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 14);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (14 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 14);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 14);
+ out++;
+ *out = (inl >> 14) % (1U << 14);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (14 - 10);
+ out++;
+ *out = (inl >> 10) % (1U << 14);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (14 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 14);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (14 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 14);
+ out++;
+ *out = (inl >> 16) % (1U << 14);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (14 - 12);
+ out++;
+ *out = (inl >> 12) % (1U << 14);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (14 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 14);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (14 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 14);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 15);
+ out++;
+ *out = (inl >> 15) % (1U << 15);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 13)) << (15 - 13);
+ out++;
+ *out = (inl >> 13) % (1U << 15);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 11)) << (15 - 11);
+ out++;
+ *out = (inl >> 11) % (1U << 15);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (15 - 9);
+ out++;
+ *out = (inl >> 9) % (1U << 15);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (15 - 7);
+ out++;
+ *out = (inl >> 7) % (1U << 15);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (15 - 5);
+ out++;
+ *out = (inl >> 5) % (1U << 15);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (15 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 15);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (15 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 15);
+ out++;
+ *out = (inl >> 16) % (1U << 15);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (15 - 14);
+ out++;
+ *out = (inl >> 14) % (1U << 15);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (15 - 12);
+ out++;
+ *out = (inl >> 12) % (1U << 15);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (15 - 10);
+ out++;
+ *out = (inl >> 10) % (1U << 15);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (15 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 15);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (15 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 15);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (15 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 15);
+ out++;
+ *out = (inl >> 19);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (15 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 15);
+ out++;
+ *out = (inl >> 17);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 17);
+ out++;
+ *out = (inl >> 17);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (17 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 17);
+ out++;
+ *out = (inl >> 19);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (17 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 17);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (17 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 17);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (17 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 17);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (17 - 10);
+ out++;
+ *out = (inl >> 10) % (1U << 17);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (17 - 12);
+ out++;
+ *out = (inl >> 12) % (1U << 17);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (17 - 14);
+ out++;
+ *out = (inl >> 14) % (1U << 17);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (17 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (17 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 17);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (17 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 17);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (17 - 5);
+ out++;
+ *out = (inl >> 5) % (1U << 17);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (17 - 7);
+ out++;
+ *out = (inl >> 7) % (1U << 17);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (17 - 9);
+ out++;
+ *out = (inl >> 9) % (1U << 17);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 11)) << (17 - 11);
+ out++;
+ *out = (inl >> 11) % (1U << 17);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 13)) << (17 - 13);
+ out++;
+ *out = (inl >> 13) % (1U << 17);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 15)) << (17 - 15);
+ out++;
+ *out = (inl >> 15);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (18 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 18);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (18 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 18);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (18 - 12);
+ out++;
+ *out = (inl >> 12) % (1U << 18);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (18 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (18 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 18);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (18 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 18);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (18 - 10);
+ out++;
+ *out = (inl >> 10) % (1U << 18);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (18 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (18 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 18);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (18 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 18);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (18 - 12);
+ out++;
+ *out = (inl >> 12) % (1U << 18);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (18 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (18 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 18);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (18 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 18);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (18 - 10);
+ out++;
+ *out = (inl >> 10) % (1U << 18);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (18 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ out++;
+
+ return in;
+}
+
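+// 19-bit case: 32 values decoded from 19 input words.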
+inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 19);
+ out++;
+ *out = (inl >> 19);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (19 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 19);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (19 - 12);
+ out++;
+ *out = (inl >> 12) % (1U << 19);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (19 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (19 - 5);
+ out++;
+ *out = (inl >> 5) % (1U << 19);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 11)) << (19 - 11);
+ out++;
+ *out = (inl >> 11) % (1U << 19);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 17)) << (19 - 17);
+ out++;
+ *out = (inl >> 17);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (19 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 19);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (19 - 10);
+ out++;
+ *out = (inl >> 10) % (1U << 19);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (19 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (19 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 19);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (19 - 9);
+ out++;
+ *out = (inl >> 9) % (1U << 19);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 15)) << (19 - 15);
+ out++;
+ *out = (inl >> 15);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (19 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 19);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (19 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 19);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (19 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (19 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 19);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (19 - 7);
+ out++;
+ *out = (inl >> 7) % (1U << 19);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 13)) << (19 - 13);
+ out++;
+ *out = (inl >> 13);
+ ++in;
+ out++;
+
+ return in;
+}
+
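+// 20-bit case: 32 values from 20 words; 8 values fill exactly 5 words, so the
+// same sub-pattern repeats four times.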
+inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (20 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 20);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (20 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (20 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 20);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (20 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (20 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 20);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (20 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (20 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 20);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (20 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (20 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 20);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (20 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (20 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 20);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (20 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (20 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 20);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (20 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (20 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 20);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (20 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ out++;
+
+ return in;
+}
+
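+// 21-bit case: 32 values decoded from 21 input words.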
+inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 21);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (21 - 10);
+ out++;
+ *out = (inl >> 10) % (1U << 21);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (21 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (21 - 9);
+ out++;
+ *out = (inl >> 9) % (1U << 21);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 19)) << (21 - 19);
+ out++;
+ *out = (inl >> 19);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (21 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 21);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (21 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (21 - 7);
+ out++;
+ *out = (inl >> 7) % (1U << 21);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 17)) << (21 - 17);
+ out++;
+ *out = (inl >> 17);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (21 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 21);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (21 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (21 - 5);
+ out++;
+ *out = (inl >> 5) % (1U << 21);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 15)) << (21 - 15);
+ out++;
+ *out = (inl >> 15);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (21 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 21);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (21 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (21 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 21);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 13)) << (21 - 13);
+ out++;
+ *out = (inl >> 13);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (21 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 21);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (21 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (21 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 21);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 11)) << (21 - 11);
+ out++;
+ *out = (inl >> 11);
+ ++in;
+ out++;
+
+ return in;
+}
+
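+// 22-bit case: 32 values from 22 words; 16 values fill exactly 11 words, so
+// the sub-pattern repeats twice.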
+inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (22 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (22 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 22);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (22 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (22 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 22);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (22 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (22 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 22);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (22 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (22 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 22);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (22 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (22 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (22 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (22 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 22);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (22 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (22 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 22);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (22 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (22 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 22);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (22 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (22 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 22);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (22 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (22 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ out++;
+
+ return in;
+}
+
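+// 23-bit case: 32 values decoded from 23 input words.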
+inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 23);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (23 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (23 - 5);
+ out++;
+ *out = (inl >> 5) % (1U << 23);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 19)) << (23 - 19);
+ out++;
+ *out = (inl >> 19);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (23 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (23 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 23);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 15)) << (23 - 15);
+ out++;
+ *out = (inl >> 15);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (23 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 23);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (23 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 11)) << (23 - 11);
+ out++;
+ *out = (inl >> 11);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (23 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 23);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (23 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (23 - 7);
+ out++;
+ *out = (inl >> 7) % (1U << 23);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 21)) << (23 - 21);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (23 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (23 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 23);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 17)) << (23 - 17);
+ out++;
+ *out = (inl >> 17);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (23 - 8);
+ out++;
+ *out = (inl >> 8) % (1U << 23);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 22)) << (23 - 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 13)) << (23 - 13);
+ out++;
+ *out = (inl >> 13);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (23 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 23);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (23 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (23 - 9);
+ out++;
+ *out = (inl >> 9);
+ ++in;
+ out++;
+
+ return in;
+}
+
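+// 24-bit case: 32 values from 24 words; 4 values fill exactly 3 words, so the
+// sub-pattern repeats eight times.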
+inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (24 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (24 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (24 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (24 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (24 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (24 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (24 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (24 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (24 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (24 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (24 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (24 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (24 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (24 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (24 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (24 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ out++;
+
+ return in;
+}
+
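+// 25-bit case: 32 values decoded from 25 input words.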
+inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 25);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (25 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 11)) << (25 - 11);
+ out++;
+ *out = (inl >> 11);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (25 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 25);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 22)) << (25 - 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 15)) << (25 - 15);
+ out++;
+ *out = (inl >> 15);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (25 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (25 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 25);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 19)) << (25 - 19);
+ out++;
+ *out = (inl >> 19);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (25 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (25 - 5);
+ out++;
+ *out = (inl >> 5) % (1U << 25);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 23)) << (25 - 23);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (25 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (25 - 9);
+ out++;
+ *out = (inl >> 9);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (25 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 25);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (25 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 13)) << (25 - 13);
+ out++;
+ *out = (inl >> 13);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (25 - 6);
+ out++;
+ *out = (inl >> 6) % (1U << 25);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (25 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 17)) << (25 - 17);
+ out++;
+ *out = (inl >> 17);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (25 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (25 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 25);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 21)) << (25 - 21);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (25 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (25 - 7);
+ out++;
+ *out = (inl >> 7);
+ ++in;
+ out++;
+
+ return in;
+}
+
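+// 26-bit case: 32 values from 26 words; 16 values fill exactly 13 words, so
+// the sub-pattern repeats twice.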
+inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 26);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (26 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (26 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (26 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (26 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 26);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 22)) << (26 - 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (26 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (26 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (26 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 26);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (26 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (26 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (26 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (26 - 6);
+ out++;
+ *out = (inl >> 6);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 26);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (26 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (26 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (26 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (26 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 26);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 22)) << (26 - 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (26 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (26 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (26 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 26);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (26 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (26 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (26 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (26 - 6);
+ out++;
+ *out = (inl >> 6);
+ ++in;
+ out++;
+
+ return in;
+}
+
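+// 27-bit case: 32 values decoded from 27 input words.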
+inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 27);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 22)) << (27 - 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 17)) << (27 - 17);
+ out++;
+ *out = (inl >> 17);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (27 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (27 - 7);
+ out++;
+ *out = (inl >> 7);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (27 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 27);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (27 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 19)) << (27 - 19);
+ out++;
+ *out = (inl >> 19);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (27 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (27 - 9);
+ out++;
+ *out = (inl >> 9);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (27 - 4);
+ out++;
+ *out = (inl >> 4) % (1U << 27);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 26)) << (27 - 26);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 21)) << (27 - 21);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (27 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 11)) << (27 - 11);
+ out++;
+ *out = (inl >> 11);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (27 - 6);
+ out++;
+ *out = (inl >> 6);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (27 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 27);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 23)) << (27 - 23);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (27 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 13)) << (27 - 13);
+ out++;
+ *out = (inl >> 13);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (27 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (27 - 3);
+ out++;
+ *out = (inl >> 3) % (1U << 27);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 25)) << (27 - 25);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (27 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 15)) << (27 - 15);
+ out++;
+ *out = (inl >> 15);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (27 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (27 - 5);
+ out++;
+ *out = (inl >> 5);
+ ++in;
+ out++;
+
+ return in;
+}
+
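+// 28-bit case: 32 values from 28 words; 8 values fill exactly 7 words, so the
+// sub-pattern repeats four times.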
+inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 28);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (28 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (28 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (28 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (28 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (28 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (28 - 4);
+ out++;
+ *out = (inl >> 4);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 28);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (28 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (28 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (28 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (28 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (28 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (28 - 4);
+ out++;
+ *out = (inl >> 4);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 28);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (28 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (28 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (28 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (28 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (28 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (28 - 4);
+ out++;
+ *out = (inl >> 4);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 28);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (28 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (28 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (28 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (28 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (28 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (28 - 4);
+ out++;
+ *out = (inl >> 4);
+ ++in;
+ out++;
+
+ return in;
+}
+
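+// 29-bit case: 32 values decoded from 29 input words.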
+inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 29);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 26)) << (29 - 26);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 23)) << (29 - 23);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (29 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 17)) << (29 - 17);
+ out++;
+ *out = (inl >> 17);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (29 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 11)) << (29 - 11);
+ out++;
+ *out = (inl >> 11);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (29 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (29 - 5);
+ out++;
+ *out = (inl >> 5);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (29 - 2);
+ out++;
+ *out = (inl >> 2) % (1U << 29);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 28)) << (29 - 28);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 25)) << (29 - 25);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 22)) << (29 - 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 19)) << (29 - 19);
+ out++;
+ *out = (inl >> 19);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (29 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 13)) << (29 - 13);
+ out++;
+ *out = (inl >> 13);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (29 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (29 - 7);
+ out++;
+ *out = (inl >> 7);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (29 - 4);
+ out++;
+ *out = (inl >> 4);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (29 - 1);
+ out++;
+ *out = (inl >> 1) % (1U << 29);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 27)) << (29 - 27);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (29 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 21)) << (29 - 21);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (29 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 15)) << (29 - 15);
+ out++;
+ *out = (inl >> 15);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (29 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (29 - 9);
+ out++;
+ *out = (inl >> 9);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (29 - 6);
+ out++;
+ *out = (inl >> 6);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (29 - 3);
+ out++;
+ *out = (inl >> 3);
+ ++in;
+ out++;
+
+ return in;
+}
+
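+// 30-bit case: 32 values from 30 words; 16 values fill exactly 15 words, so
+// the sub-pattern repeats twice.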
+inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 30);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 28)) << (30 - 28);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 26)) << (30 - 26);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (30 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 22)) << (30 - 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (30 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (30 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (30 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (30 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (30 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (30 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (30 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (30 - 6);
+ out++;
+ *out = (inl >> 6);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (30 - 4);
+ out++;
+ *out = (inl >> 4);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (30 - 2);
+ out++;
+ *out = (inl >> 2);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0) % (1U << 30);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 28)) << (30 - 28);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 26)) << (30 - 26);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (30 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 22)) << (30 - 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (30 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (30 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (30 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (30 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (30 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (30 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (30 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (30 - 6);
+ out++;
+ *out = (inl >> 6);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (30 - 4);
+ out++;
+ *out = (inl >> 4);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (30 - 2);
+ out++;
+ *out = (inl >> 2);
+ ++in;
+ out++;
+
+ return in;
+}
+
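+// 31-bit case: 32 values from 31 words; every value except the first and the
+// last straddles a word boundary.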
+inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0) % (1U << 31);
+ out++;
+ *out = (inl >> 31);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 30)) << (31 - 30);
+ out++;
+ *out = (inl >> 30);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 29)) << (31 - 29);
+ out++;
+ *out = (inl >> 29);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 28)) << (31 - 28);
+ out++;
+ *out = (inl >> 28);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 27)) << (31 - 27);
+ out++;
+ *out = (inl >> 27);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 26)) << (31 - 26);
+ out++;
+ *out = (inl >> 26);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 25)) << (31 - 25);
+ out++;
+ *out = (inl >> 25);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 24)) << (31 - 24);
+ out++;
+ *out = (inl >> 24);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 23)) << (31 - 23);
+ out++;
+ *out = (inl >> 23);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 22)) << (31 - 22);
+ out++;
+ *out = (inl >> 22);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 21)) << (31 - 21);
+ out++;
+ *out = (inl >> 21);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 20)) << (31 - 20);
+ out++;
+ *out = (inl >> 20);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 19)) << (31 - 19);
+ out++;
+ *out = (inl >> 19);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 18)) << (31 - 18);
+ out++;
+ *out = (inl >> 18);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 17)) << (31 - 17);
+ out++;
+ *out = (inl >> 17);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 16)) << (31 - 16);
+ out++;
+ *out = (inl >> 16);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 15)) << (31 - 15);
+ out++;
+ *out = (inl >> 15);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 14)) << (31 - 14);
+ out++;
+ *out = (inl >> 14);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 13)) << (31 - 13);
+ out++;
+ *out = (inl >> 13);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 12)) << (31 - 12);
+ out++;
+ *out = (inl >> 12);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 11)) << (31 - 11);
+ out++;
+ *out = (inl >> 11);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 10)) << (31 - 10);
+ out++;
+ *out = (inl >> 10);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 9)) << (31 - 9);
+ out++;
+ *out = (inl >> 9);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 8)) << (31 - 8);
+ out++;
+ *out = (inl >> 8);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 7)) << (31 - 7);
+ out++;
+ *out = (inl >> 7);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 6)) << (31 - 6);
+ out++;
+ *out = (inl >> 6);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 5)) << (31 - 5);
+ out++;
+ *out = (inl >> 5);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 4)) << (31 - 4);
+ out++;
+ *out = (inl >> 4);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 3)) << (31 - 3);
+ out++;
+ *out = (inl >> 3);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 2)) << (31 - 2);
+ out++;
+ *out = (inl >> 2);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out |= (inl % (1U << 1)) << (31 - 1);
+ out++;
+ *out = (inl >> 1);
+ ++in;
+ out++;
+
+ return in;
+}
+
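+// 32-bit case: each output is the corresponding input word (after
+// little-endian conversion); no masking or shifting is required.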
+inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) {
+ uint32_t inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ inl = util::SafeLoad(in);
+ inl = arrow::BitUtil::FromLittleEndian(inl);
+ out++;
+ *out = (inl >> 0);
+ ++in;
+ out++;
+
+ return in;
+}
+
+inline const uint32_t* nullunpacker32(const uint32_t* in, uint32_t* out) {
+ for (int k = 0; k < 32; ++k) {
+ out[k] = 0;
+ }
+ return in;
+}
+
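+// Illustrative note (assuming the rest of this file defines the matching
+// unpack1_32 .. unpack31_32 kernels in the same generated style): each
+// unpackN_32 consumes N packed little-endian words and emits 32 values of
+// bit width N. A hypothetical caller-side dispatch:
+//
+//   const uint32_t* in = packed;
+//   uint32_t out[32];
+//   switch (bit_width) {
+//     case 0:  in = nullunpacker32(in, out); break;
+//     case 32: in = unpack32_32(in, out); break;
+//     // ... the remaining widths follow the same pattern.
+//   }
+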
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h
new file mode 100644
index 00000000000..28dcce52bb8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h
@@ -0,0 +1,626 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/simd.h"
+#include "arrow/util/ubsan.h"
+
+#include <stdint.h>
+#include <algorithm>
+
+#ifdef ARROW_HAVE_SSE4_2
+// Enable SIMD for the ByteStreamSplit encoder/decoder.
+#define ARROW_HAVE_SIMD_SPLIT
+#endif // ARROW_HAVE_SSE4_2
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+#if defined(ARROW_HAVE_SSE4_2)
+template <typename T>
+void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+ const int64_t size = num_values * sizeof(T);
+ constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams;
+ const int64_t num_blocks = size / kBlockSize;
+ uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+ // First handle suffix.
+  // Handling it first helps catch cases where the SIMD-based processing
+  // overflows into the suffix, since a test would then almost surely fail.
+ const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+ for (int64_t i = num_processed_elements; i < num_values; ++i) {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+ }
+
+ // The blocks get processed hierarchically using the unpack intrinsics.
+ // Example with four streams:
+ // Stage 1: AAAA BBBB CCCC DDDD
+ // Stage 2: ACAC ACAC BDBD BDBD
+ // Stage 3: ABCD ABCD ABCD ABCD
+ __m128i stage[kNumStreamsLog2 + 1U][kNumStreams];
+ constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+ for (int64_t i = 0; i < num_blocks; ++i) {
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ stage[0][j] = _mm_loadu_si128(
+ reinterpret_cast<const __m128i*>(&data[i * sizeof(__m128i) + j * stride]));
+ }
+ for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+ for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+ stage[step + 1U][j * 2] =
+ _mm_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ stage[step + 1U][j * 2 + 1U] =
+ _mm_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ }
+ }
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(
+ &output_data[(i * kNumStreams + j) * sizeof(__m128i)]),
+ stage[kNumStreamsLog2][j]);
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ __m128i stage[3][kNumStreams];
+ __m128i final_result[kNumStreams];
+
+ const size_t size = num_values * sizeof(T);
+ constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams;
+ const size_t num_blocks = size / kBlockSize;
+ const __m128i* raw_values_sse = reinterpret_cast<const __m128i*>(raw_values);
+ __m128i* output_buffer_streams[kNumStreams];
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ output_buffer_streams[i] =
+ reinterpret_cast<__m128i*>(&output_buffer_raw[num_values * i]);
+ }
+
+ // First handle suffix.
+ const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+ for (size_t i = num_processed_elements; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+  // The current shuffling algorithm diverges for float and double types, but the
+  // compiler should be able to remove the branch since only one path is taken for
+  // each template instantiation.
+  // Example run for floats:
+  // Step 0, copy:
+  //   0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ...
+  // Step 1: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
+  //   0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ...
+  // Step 2: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
+  //   0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ...
+  // Step 3: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
+  //   0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ...
+  // Step 4: _mm_unpacklo_epi64 and _mm_unpackhi_epi64:
+  //   0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ...
+ for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+ // First copy the data to stage 0.
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ stage[0][i] = _mm_loadu_si128(&raw_values_sse[block_index * kNumStreams + i]);
+ }
+
+ // The shuffling of bytes is performed through the unpack intrinsics.
+    // In my measurements this gives better performance than an implementation
+    // that uses the shuffle intrinsics.
+ for (size_t stage_lvl = 0; stage_lvl < 2U; ++stage_lvl) {
+ for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+ stage[stage_lvl + 1][i * 2] =
+ _mm_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ stage[stage_lvl + 1][i * 2 + 1] =
+ _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ }
+ }
+ if (kNumStreams == 8U) {
+ // This is the path for double.
+ __m128i tmp[8];
+ for (size_t i = 0; i < 4; ++i) {
+ tmp[i * 2] = _mm_unpacklo_epi32(stage[2][i], stage[2][i + 4]);
+ tmp[i * 2 + 1] = _mm_unpackhi_epi32(stage[2][i], stage[2][i + 4]);
+ }
+
+ for (size_t i = 0; i < 4; ++i) {
+ final_result[i * 2] = _mm_unpacklo_epi32(tmp[i], tmp[i + 4]);
+ final_result[i * 2 + 1] = _mm_unpackhi_epi32(tmp[i], tmp[i + 4]);
+ }
+ } else {
+      // This is the path for float.
+ __m128i tmp[4];
+ for (size_t i = 0; i < 2; ++i) {
+ tmp[i * 2] = _mm_unpacklo_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
+ tmp[i * 2 + 1] = _mm_unpackhi_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
+ }
+ for (size_t i = 0; i < 2; ++i) {
+ final_result[i * 2] = _mm_unpacklo_epi64(tmp[i], tmp[i + 2]);
+ final_result[i * 2 + 1] = _mm_unpackhi_epi64(tmp[i], tmp[i + 2]);
+ }
+ }
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ _mm_storeu_si128(&output_buffer_streams[i][block_index], final_result[i]);
+ }
+ }
+}
+#endif // ARROW_HAVE_SSE4_2
+
+#if defined(ARROW_HAVE_AVX2)
+template <typename T>
+void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+ const int64_t size = num_values * sizeof(T);
+ constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams;
+ if (size < kBlockSize) // Back to SSE for small size
+ return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
+ const int64_t num_blocks = size / kBlockSize;
+ uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+ // First handle suffix.
+ const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+ for (int64_t i = num_processed_elements; i < num_values; ++i) {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+ }
+
+ // Processed hierarchically using unpack intrinsics, then permute intrinsics.
+ __m256i stage[kNumStreamsLog2 + 1U][kNumStreams];
+ __m256i final_result[kNumStreams];
+ constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+ for (int64_t i = 0; i < num_blocks; ++i) {
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ stage[0][j] = _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(&data[i * sizeof(__m256i) + j * stride]));
+ }
+
+ for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+ for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+ stage[step + 1U][j * 2] =
+ _mm256_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ stage[step + 1U][j * 2 + 1U] =
+ _mm256_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ }
+ }
+
+ if (kNumStreams == 8U) {
+ // path for double, 128i index:
+ // {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B},
+ // {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F},
+ final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00100000);
+ final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00100000);
+ final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b00100000);
+ final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b00100000);
+ final_result[4] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00110001);
+ final_result[5] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00110001);
+ final_result[6] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b00110001);
+ final_result[7] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b00110001);
+ } else {
+ // path for float, 128i index:
+ // {0x00, 0x04}, {0x01, 0x05}, {0x02, 0x06}, {0x03, 0x07}
+ final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00100000);
+ final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00100000);
+ final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00110001);
+ final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00110001);
+ }
+
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(
+ &output_data[(i * kNumStreams + j) * sizeof(__m256i)]),
+ final_result[j]);
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+  if (kNumStreams == 8U)  // Fall back to SSE: no AVX2 encode path for double yet.
+ return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+
+ const size_t size = num_values * sizeof(T);
+ constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams;
+ if (size < kBlockSize) // Back to SSE for small size
+ return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+ const size_t num_blocks = size / kBlockSize;
+ const __m256i* raw_values_simd = reinterpret_cast<const __m256i*>(raw_values);
+ __m256i* output_buffer_streams[kNumStreams];
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ output_buffer_streams[i] =
+ reinterpret_cast<__m256i*>(&output_buffer_raw[num_values * i]);
+ }
+
+ // First handle suffix.
+ const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+ for (size_t i = num_processed_elements; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+
+ // Path for float.
+  // 1. Processed hierarchically to a 32i block using the unpack intrinsics.
+ // 2. Pack 128i block using _mm256_permutevar8x32_epi32.
+ // 3. Pack final 256i block with _mm256_permute2x128_si256.
+ constexpr size_t kNumUnpack = 3U;
+ __m256i stage[kNumUnpack + 1][kNumStreams];
+ static const __m256i kPermuteMask =
+ _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
+ __m256i permute[kNumStreams];
+ __m256i final_result[kNumStreams];
+
+ for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ stage[0][i] = _mm256_loadu_si256(&raw_values_simd[block_index * kNumStreams + i]);
+ }
+
+ for (size_t stage_lvl = 0; stage_lvl < kNumUnpack; ++stage_lvl) {
+ for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+ stage[stage_lvl + 1][i * 2] =
+ _mm256_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ stage[stage_lvl + 1][i * 2 + 1] =
+ _mm256_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ }
+ }
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ permute[i] = _mm256_permutevar8x32_epi32(stage[kNumUnpack][i], kPermuteMask);
+ }
+
+ final_result[0] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00100000);
+ final_result[1] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00110001);
+ final_result[2] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00100000);
+ final_result[3] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00110001);
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ _mm256_storeu_si256(&output_buffer_streams[i][block_index], final_result[i]);
+ }
+ }
+}
+#endif // ARROW_HAVE_AVX2
+
+#if defined(ARROW_HAVE_AVX512)
+template <typename T>
+void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+ const int64_t size = num_values * sizeof(T);
+ constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
+ if (size < kBlockSize) // Back to AVX2 for small size
+ return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
+ const int64_t num_blocks = size / kBlockSize;
+ uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+ // First handle suffix.
+ const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+ for (int64_t i = num_processed_elements; i < num_values; ++i) {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+ }
+
+  // Processed hierarchically using the unpack intrinsics, then two shuffles.
+ __m512i stage[kNumStreamsLog2 + 1U][kNumStreams];
+ __m512i shuffle[kNumStreams];
+ __m512i final_result[kNumStreams];
+ constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+ for (int64_t i = 0; i < num_blocks; ++i) {
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ stage[0][j] = _mm512_loadu_si512(
+ reinterpret_cast<const __m512i*>(&data[i * sizeof(__m512i) + j * stride]));
+ }
+
+ for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+ for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+ stage[step + 1U][j * 2] =
+ _mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ stage[step + 1U][j * 2 + 1U] =
+ _mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ }
+ }
+
+ if (kNumStreams == 8U) {
+ // path for double, 128i index:
+ // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
+ // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
+ // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
+ // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
+ shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b01000100);
+ shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b01000100);
+ shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b01000100);
+ shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b01000100);
+ shuffle[4] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b11101110);
+ shuffle[5] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b11101110);
+ shuffle[6] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b11101110);
+ shuffle[7] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b11101110);
+
+ final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+ final_result[1] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+ final_result[2] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+ final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+ final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
+ final_result[5] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
+ final_result[6] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
+ final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
+ } else {
+ // path for float, 128i index:
+ // {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D}
+ // {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F},
+ shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b01000100);
+ shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b01000100);
+ shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b11101110);
+ shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b11101110);
+
+ final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+ final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+ final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+ final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+ }
+
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ _mm512_storeu_si512(reinterpret_cast<__m512i*>(
+ &output_data[(i * kNumStreams + j) * sizeof(__m512i)]),
+ final_result[j]);
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ const size_t size = num_values * sizeof(T);
+ constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams;
+ if (size < kBlockSize) // Back to AVX2 for small size
+ return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
+
+ const size_t num_blocks = size / kBlockSize;
+ const __m512i* raw_values_simd = reinterpret_cast<const __m512i*>(raw_values);
+ __m512i* output_buffer_streams[kNumStreams];
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ output_buffer_streams[i] =
+ reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]);
+ }
+
+ // First handle suffix.
+ const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+ for (size_t i = num_processed_elements; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+
+ constexpr size_t KNumUnpack = (kNumStreams == 8U) ? 2U : 3U;
+ __m512i final_result[kNumStreams];
+ __m512i unpack[KNumUnpack + 1][kNumStreams];
+ __m512i permutex[kNumStreams];
+ __m512i permutex_mask;
+ if (kNumStreams == 8U) {
+    // Use _mm512_set_epi32 here: _mm512_set_epi16 is missing on some old gcc versions.
+ permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006,
+ 0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004,
+ 0x001B0013, 0x000B0003, 0x001A0012, 0x000A0002,
+ 0x00190011, 0x00090001, 0x00180010, 0x00080000);
+ } else {
+ permutex_mask = _mm512_set_epi32(0x0F, 0x0B, 0x07, 0x03, 0x0E, 0x0A, 0x06, 0x02, 0x0D,
+ 0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00);
+ }
+
+ for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]);
+ }
+
+ for (size_t unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) {
+ for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+ unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8(
+ unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
+ unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8(
+ unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
+ }
+ }
+
+ if (kNumStreams == 8U) {
+ // path for double
+ // 1. unpack to epi16 block
+ // 2. permutexvar_epi16 to 128i block
+ // 3. shuffle 128i to final 512i target, index:
+ // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
+ // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
+ // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
+ // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
+ for (size_t i = 0; i < kNumStreams; ++i)
+ permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[KNumUnpack][i]);
+
+ __m512i shuffle[kNumStreams];
+ shuffle[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
+ shuffle[1] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b01000100);
+ shuffle[2] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
+ shuffle[3] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b11101110);
+ shuffle[4] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
+ shuffle[5] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b01000100);
+ shuffle[6] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
+ shuffle[7] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b11101110);
+
+ final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+ final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+ final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+ final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+ final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
+ final_result[5] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
+ final_result[6] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
+ final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
+ } else {
+ // Path for float.
+      // 1. Processed hierarchically to a 32i block using the unpack intrinsics.
+      // 2. Pack 128i blocks using _mm512_permutexvar_epi32.
+      // 3. Pack the final 512i block with _mm512_shuffle_i32x4.
+ for (size_t i = 0; i < kNumStreams; ++i)
+ permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[KNumUnpack][i]);
+
+ final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
+ final_result[1] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
+ final_result[2] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
+ final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
+ }
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ _mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]);
+ }
+ }
+}
+#endif // ARROW_HAVE_AVX512
+
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+template <typename T>
+void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
+ int64_t stride, T* out) {
+#if defined(ARROW_HAVE_AVX512)
+ return ByteStreamSplitDecodeAvx512(data, num_values, stride, out);
+#elif defined(ARROW_HAVE_AVX2)
+ return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
+#elif defined(ARROW_HAVE_SSE4_2)
+ return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
+#else
+#error "ByteStreamSplitDecodeSimd not implemented"
+#endif
+}
+
+template <typename T>
+void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+#if defined(ARROW_HAVE_AVX512)
+ return ByteStreamSplitEncodeAvx512<T>(raw_values, num_values, output_buffer_raw);
+#elif defined(ARROW_HAVE_AVX2)
+ return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
+#elif defined(ARROW_HAVE_SSE4_2)
+ return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+#else
+#error "ByteStreamSplitEncodeSimd not implemented"
+#endif
+}
+#endif
+
+template <typename T>
+void ByteStreamSplitEncodeScalar(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ for (size_t i = 0U; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+}
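+
+// For example (illustrative), encoding two floats {A, B} whose bytes are
+// A0..A3 and B0..B3 produces the stream-major layout
+//   A0 B0 | A1 B1 | A2 B2 | A3 B3
+// i.e. stream j holds byte j of every value, written at
+// output_buffer_raw[j * num_values + i].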
+
+template <typename T>
+void ByteStreamSplitDecodeScalar(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ auto output_buffer_raw = reinterpret_cast<uint8_t*>(out);
+
+ for (int64_t i = 0; i < num_values; ++i) {
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ output_buffer_raw[i * kNumStreams + b] = data[byte_index];
+ }
+ }
+}
+
+template <typename T>
+void inline ByteStreamSplitEncode(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+ return ByteStreamSplitEncodeSimd<T>(raw_values, num_values, output_buffer_raw);
+#else
+ return ByteStreamSplitEncodeScalar<T>(raw_values, num_values, output_buffer_raw);
+#endif
+}
+
+template <typename T>
+void inline ByteStreamSplitDecode(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+ return ByteStreamSplitDecodeSimd(data, num_values, stride, out);
+#else
+ return ByteStreamSplitDecodeScalar(data, num_values, stride, out);
+#endif
+}
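+
+// Usage sketch (illustrative; `values`, `encoded` and `decoded` are
+// hypothetical caller-owned buffers):
+//
+//   float values[100] = {...};
+//   uint8_t encoded[sizeof(values)];
+//   ByteStreamSplitEncode<float>(reinterpret_cast<const uint8_t*>(values),
+//                                /*num_values=*/100, encoded);
+//   float decoded[100];
+//   // For a contiguous encoding, the stride between byte streams is num_values.
+//   ByteStreamSplitDecode<float>(encoded, /*num_values=*/100, /*stride=*/100,
+//                                decoded);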
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc
new file mode 100644
index 00000000000..874b2c2c886
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc
@@ -0,0 +1,226 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/cancel.h"
+
+#include <atomic>
+#include <mutex>
+#include <sstream>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/util/atomic_shared_ptr.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+#if ATOMIC_INT_LOCK_FREE != 2
+#error Lock-free atomic int required for signal safety
+#endif
+
+using internal::ReinstateSignalHandler;
+using internal::SetSignalHandler;
+using internal::SignalHandler;
+
+// NOTE: We care mainly about making the common case (not cancelled) fast.
+
+struct StopSourceImpl {
+ std::atomic<int> requested_{0}; // will be -1 or signal number if requested
+ std::mutex mutex_;
+ Status cancel_error_;
+};
+
+StopSource::StopSource() : impl_(new StopSourceImpl) {}
+
+StopSource::~StopSource() = default;
+
+void StopSource::RequestStop() { RequestStop(Status::Cancelled("Operation cancelled")); }
+
+void StopSource::RequestStop(Status st) {
+ std::lock_guard<std::mutex> lock(impl_->mutex_);
+ DCHECK(!st.ok());
+ if (!impl_->requested_) {
+ impl_->requested_ = -1;
+ impl_->cancel_error_ = std::move(st);
+ }
+}
+
+void StopSource::RequestStopFromSignal(int signum) {
+ // Only async-signal-safe code allowed here
+ impl_->requested_.store(signum);
+}
+
+void StopSource::Reset() {
+ std::lock_guard<std::mutex> lock(impl_->mutex_);
+ impl_->cancel_error_ = Status::OK();
+ impl_->requested_.store(0);
+}
+
+StopToken StopSource::token() { return StopToken(impl_); }
+
+bool StopToken::IsStopRequested() const {
+ if (!impl_) {
+ return false;
+ }
+ return impl_->requested_.load() != 0;
+}
+
+Status StopToken::Poll() const {
+ if (!impl_) {
+ return Status::OK();
+ }
+ if (!impl_->requested_.load()) {
+ return Status::OK();
+ }
+
+ std::lock_guard<std::mutex> lock(impl_->mutex_);
+ if (impl_->cancel_error_.ok()) {
+ auto signum = impl_->requested_.load();
+ DCHECK_GT(signum, 0);
+ impl_->cancel_error_ = internal::CancelledFromSignal(signum, "Operation cancelled");
+ }
+ return impl_->cancel_error_;
+}
+
+namespace {
+
+struct SignalStopState {
+ struct SavedSignalHandler {
+ int signum;
+ SignalHandler handler;
+ };
+
+ Status RegisterHandlers(const std::vector<int>& signals) {
+ if (!saved_handlers_.empty()) {
+ return Status::Invalid("Signal handlers already registered");
+ }
+ for (int signum : signals) {
+ ARROW_ASSIGN_OR_RAISE(auto handler,
+ SetSignalHandler(signum, SignalHandler{&HandleSignal}));
+ saved_handlers_.push_back({signum, handler});
+ }
+ return Status::OK();
+ }
+
+ void UnregisterHandlers() {
+ auto handlers = std::move(saved_handlers_);
+ for (const auto& h : handlers) {
+ ARROW_CHECK_OK(SetSignalHandler(h.signum, h.handler).status());
+ }
+ }
+
+ ~SignalStopState() {
+ UnregisterHandlers();
+ Disable();
+ }
+
+ StopSource* stop_source() { return stop_source_.get(); }
+
+ bool enabled() { return stop_source_ != nullptr; }
+
+ void Enable() {
+ // Before creating a new StopSource, delete any lingering reference to
+ // the previous one in the trash can. See DoHandleSignal() for details.
+ EmptyTrashCan();
+ internal::atomic_store(&stop_source_, std::make_shared<StopSource>());
+ }
+
+ void Disable() { internal::atomic_store(&stop_source_, NullSource()); }
+
+ static SignalStopState* instance() { return &instance_; }
+
+ private:
+ // For readability
+ std::shared_ptr<StopSource> NullSource() { return nullptr; }
+
+ void EmptyTrashCan() { internal::atomic_store(&trash_can_, NullSource()); }
+
+ static void HandleSignal(int signum) { instance_.DoHandleSignal(signum); }
+
+ void DoHandleSignal(int signum) {
+ // async-signal-safe code only
+ auto source = internal::atomic_load(&stop_source_);
+ if (source) {
+ source->RequestStopFromSignal(signum);
+ // Disable() may have been called in the meantime, but we can't
+ // deallocate a shared_ptr here, so instead move it to a "trash can".
+ // This minimizes the possibility of running a deallocator here,
+ // however it doesn't entirely preclude it.
+ //
+ // Possible case:
+ // - a signal handler (A) starts running, fetches the current source
+ // - Disable() then Enable() are called, emptying the trash can and
+ // replacing the current source
+ // - a signal handler (B) starts running, fetches the current source
+ // - signal handler A resumes, moves its source (the old source) into
+ // the trash can (the only remaining reference)
+ // - signal handler B resumes, moves its source (the current source)
+ // into the trash can. This triggers deallocation of the old source,
+ // since the trash can had the only remaining reference to it.
+ //
+ // This case should be sufficiently unlikely, but we cannot entirely
+ // rule it out. The problem might be solved properly with a lock-free
+ // linked list of StopSources.
+ internal::atomic_store(&trash_can_, std::move(source));
+ }
+ ReinstateSignalHandler(signum, &HandleSignal);
+ }
+
+ std::shared_ptr<StopSource> stop_source_;
+ std::shared_ptr<StopSource> trash_can_;
+
+ std::vector<SavedSignalHandler> saved_handlers_;
+
+ static SignalStopState instance_;
+};
+
+SignalStopState SignalStopState::instance_{};
+
+} // namespace
+
+Result<StopSource*> SetSignalStopSource() {
+ auto stop_state = SignalStopState::instance();
+ if (stop_state->enabled()) {
+ return Status::Invalid("Signal stop source already set up");
+ }
+ stop_state->Enable();
+ return stop_state->stop_source();
+}
+
+void ResetSignalStopSource() {
+ auto stop_state = SignalStopState::instance();
+ DCHECK(stop_state->enabled());
+ stop_state->Disable();
+}
+
+Status RegisterCancellingSignalHandler(const std::vector<int>& signals) {
+ auto stop_state = SignalStopState::instance();
+ if (!stop_state->enabled()) {
+ return Status::Invalid("Signal stop source was not set up");
+ }
+ return stop_state->RegisterHandlers(signals);
+}
+
+void UnregisterCancellingSignalHandler() {
+ auto stop_state = SignalStopState::instance();
+ DCHECK(stop_state->enabled());
+ stop_state->UnregisterHandlers();
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h
new file mode 100644
index 00000000000..9e00f673a21
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class StopToken;
+
+struct StopSourceImpl;
+
+/// EXPERIMENTAL
+class ARROW_EXPORT StopSource {
+ public:
+ StopSource();
+ ~StopSource();
+
+ // Consumer API (the side that stops)
+ void RequestStop();
+ void RequestStop(Status error);
+ void RequestStopFromSignal(int signum);
+
+ StopToken token();
+
+ // For internal use only
+ void Reset();
+
+ protected:
+ std::shared_ptr<StopSourceImpl> impl_;
+};
+
+/// EXPERIMENTAL
+class ARROW_EXPORT StopToken {
+ public:
+ // Public for Cython
+ StopToken() {}
+
+ explicit StopToken(std::shared_ptr<StopSourceImpl> impl) : impl_(std::move(impl)) {}
+
+ // A trivial token that never propagates any stop request
+ static StopToken Unstoppable() { return StopToken(); }
+
+  // Producer API (the side that gets asked to stop)
+ Status Poll() const;
+ bool IsStopRequested() const;
+
+ protected:
+ std::shared_ptr<StopSourceImpl> impl_;
+};
+
+/// EXPERIMENTAL: Set a global StopSource that can receive signals
+///
+/// The only allowed order of calls is the following:
+/// - SetSignalStopSource()
+/// - any number of pairs of (RegisterCancellingSignalHandler,
+/// UnregisterCancellingSignalHandler) calls
+/// - ResetSignalStopSource()
+///
+/// Beware that these settings are process-wide. Typically, only one
+/// thread should call these APIs, even in a multithreaded setting.
+ARROW_EXPORT
+Result<StopSource*> SetSignalStopSource();
+
+/// EXPERIMENTAL: Reset the global signal-receiving StopSource
+///
+/// This will invalidate the pointer returned by SetSignalStopSource.
+ARROW_EXPORT
+void ResetSignalStopSource();
+
+/// EXPERIMENTAL: Register signal handler triggering the signal-receiving StopSource
+ARROW_EXPORT
+Status RegisterCancellingSignalHandler(const std::vector<int>& signals);
+
+/// EXPERIMENTAL: Unregister signal handler set up by RegisterCancellingSignalHandler
+ARROW_EXPORT
+void UnregisterCancellingSignalHandler();
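+
+// Usage sketch (illustrative; DoWork is a hypothetical caller function):
+//
+//   StopSource source;
+//   StopToken token = source.token();
+//   // Producer side: check for a stop request periodically...
+//   while (!token.IsStopRequested()) { DoWork(); }
+//   // ...or propagate the cancellation error as a Status:
+//   RETURN_NOT_OK(token.Poll());
+//   // Consumer side (e.g. another thread): request a stop.
+//   source.RequestStop();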
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/checked_cast.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/checked_cast.h
new file mode 100644
index 00000000000..97f6b61a1f8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/checked_cast.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+namespace arrow {
+namespace internal {
+
+template <typename OutputType, typename InputType>
+inline OutputType checked_cast(InputType&& value) {
+ static_assert(std::is_class<typename std::remove_pointer<
+ typename std::remove_reference<InputType>::type>::type>::value,
+ "checked_cast input type must be a class");
+ static_assert(std::is_class<typename std::remove_pointer<
+ typename std::remove_reference<OutputType>::type>::type>::value,
+ "checked_cast output type must be a class");
+#ifdef NDEBUG
+ return static_cast<OutputType>(value);
+#else
+ return dynamic_cast<OutputType>(value);
+#endif
+}
+
+template <class T, class U>
+std::shared_ptr<T> checked_pointer_cast(std::shared_ptr<U> r) noexcept {
+#ifdef NDEBUG
+ return std::static_pointer_cast<T>(std::move(r));
+#else
+ return std::dynamic_pointer_cast<T>(std::move(r));
+#endif
+}
+
+template <class T, class U>
+std::unique_ptr<T> checked_pointer_cast(std::unique_ptr<U> r) noexcept {
+#ifdef NDEBUG
+ return std::unique_ptr<T>(static_cast<T*>(r.release()));
+#else
+ return std::unique_ptr<T>(dynamic_cast<T*>(r.release()));
+#endif
+}
+
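+// Usage sketch (illustrative; Array/Int32Array stand in for any polymorphic
+// base and derived classes):
+//
+//   const Array& arr = ...;
+//   // Debug builds verify the downcast with dynamic_cast; NDEBUG builds
+//   // compile it to a plain static_cast.
+//   const auto& ints = checked_cast<const Int32Array&>(arr);
+//
+//   std::shared_ptr<Array> parr = ...;
+//   auto pints = checked_pointer_cast<Int32Array>(parr);
+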
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compare.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/compare.h
new file mode 100644
index 00000000000..6477bf139f5
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compare.h
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace util {
+
+/// CRTP helper for declaring equality comparison. Defines operator== and operator!=
+template <typename T>
+class EqualityComparable {
+ public:
+ ~EqualityComparable() {
+ static_assert(
+ std::is_same<decltype(std::declval<const T>().Equals(std::declval<const T>())),
+ bool>::value,
+ "EqualityComparable depends on the method T::Equals(const T&) const");
+ }
+
+ template <typename... Extra>
+ bool Equals(const std::shared_ptr<T>& other, Extra&&... extra) const {
+ if (other == NULLPTR) {
+ return false;
+ }
+ return cast().Equals(*other, std::forward<Extra>(extra)...);
+ }
+
+ struct PtrsEqual {
+ bool operator()(const std::shared_ptr<T>& l, const std::shared_ptr<T>& r) const {
+ return l->Equals(r);
+ }
+ };
+
+ bool operator==(const T& other) const { return cast().Equals(other); }
+ bool operator!=(const T& other) const { return !(cast() == other); }
+
+ private:
+ const T& cast() const { return static_cast<const T&>(*this); }
+};
+
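+// Usage sketch (illustrative; MyType is a hypothetical class). Deriving from
+// EqualityComparable<MyType> and defining Equals() yields == and != for free:
+//
+//   class MyType : public util::EqualityComparable<MyType> {
+//    public:
+//     bool Equals(const MyType& other) const { return value_ == other.value_; }
+//    private:
+//     int value_ = 0;
+//   };
+//
+//   MyType a, b;
+//   bool same = (a == b);  // dispatches to MyType::Equals
+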
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc
new file mode 100644
index 00000000000..8db199b4e76
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc
@@ -0,0 +1,261 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/compression.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/compression_internal.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace util {
+
+namespace {
+
+Status CheckSupportsCompressionLevel(Compression::type type) {
+ if (!Codec::SupportsCompressionLevel(type)) {
+ return Status::Invalid(
+ "The specified codec does not support the compression level parameter");
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+int Codec::UseDefaultCompressionLevel() { return kUseDefaultCompressionLevel; }
+
+Status Codec::Init() { return Status::OK(); }
+
+const std::string& Codec::GetCodecAsString(Compression::type t) {
+ static const std::string uncompressed = "uncompressed", snappy = "snappy",
+ gzip = "gzip", lzo = "lzo", brotli = "brotli",
+ lz4_raw = "lz4_raw", lz4 = "lz4", lz4_hadoop = "lz4_hadoop",
+ zstd = "zstd", bz2 = "bz2", unknown = "unknown";
+
+ switch (t) {
+ case Compression::UNCOMPRESSED:
+ return uncompressed;
+ case Compression::SNAPPY:
+ return snappy;
+ case Compression::GZIP:
+ return gzip;
+ case Compression::LZO:
+ return lzo;
+ case Compression::BROTLI:
+ return brotli;
+ case Compression::LZ4:
+ return lz4_raw;
+ case Compression::LZ4_FRAME:
+ return lz4;
+ case Compression::LZ4_HADOOP:
+ return lz4_hadoop;
+ case Compression::ZSTD:
+ return zstd;
+ case Compression::BZ2:
+ return bz2;
+ default:
+ return unknown;
+ }
+}
+
+Result<Compression::type> Codec::GetCompressionType(const std::string& name) {
+ if (name == "uncompressed") {
+ return Compression::UNCOMPRESSED;
+ } else if (name == "gzip") {
+ return Compression::GZIP;
+ } else if (name == "snappy") {
+ return Compression::SNAPPY;
+ } else if (name == "lzo") {
+ return Compression::LZO;
+ } else if (name == "brotli") {
+ return Compression::BROTLI;
+ } else if (name == "lz4_raw") {
+ return Compression::LZ4;
+ } else if (name == "lz4") {
+ return Compression::LZ4_FRAME;
+ } else if (name == "lz4_hadoop") {
+ return Compression::LZ4_HADOOP;
+ } else if (name == "zstd") {
+ return Compression::ZSTD;
+ } else if (name == "bz2") {
+ return Compression::BZ2;
+ } else {
+ return Status::Invalid("Unrecognized compression type: ", name);
+ }
+}
+
+bool Codec::SupportsCompressionLevel(Compression::type codec) {
+ switch (codec) {
+ case Compression::GZIP:
+ case Compression::BROTLI:
+ case Compression::ZSTD:
+ case Compression::BZ2:
+ return true;
+ default:
+ return false;
+ }
+}
+
+Result<int> Codec::MaximumCompressionLevel(Compression::type codec_type) {
+ RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
+ ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
+ return codec->maximum_compression_level();
+}
+
+Result<int> Codec::MinimumCompressionLevel(Compression::type codec_type) {
+ RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
+ ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
+ return codec->minimum_compression_level();
+}
+
+Result<int> Codec::DefaultCompressionLevel(Compression::type codec_type) {
+ RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
+ ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
+ return codec->default_compression_level();
+}
+
+Result<std::unique_ptr<Codec>> Codec::Create(Compression::type codec_type,
+ int compression_level) {
+ if (!IsAvailable(codec_type)) {
+ if (codec_type == Compression::LZO) {
+ return Status::NotImplemented("LZO codec not implemented");
+ }
+
+ auto name = GetCodecAsString(codec_type);
+ if (name == "unknown") {
+ return Status::Invalid("Unrecognized codec");
+ }
+
+ return Status::NotImplemented("Support for codec '", GetCodecAsString(codec_type),
+ "' not built");
+ }
+
+ if (compression_level != kUseDefaultCompressionLevel &&
+ !SupportsCompressionLevel(codec_type)) {
+ return Status::Invalid("Codec '", GetCodecAsString(codec_type),
+ "' doesn't support setting a compression level.");
+ }
+
+ std::unique_ptr<Codec> codec;
+ switch (codec_type) {
+ case Compression::UNCOMPRESSED:
+ return nullptr;
+ case Compression::SNAPPY:
+#ifdef ARROW_WITH_SNAPPY
+ codec = internal::MakeSnappyCodec();
+#endif
+ break;
+ case Compression::GZIP:
+#ifdef ARROW_WITH_ZLIB
+ codec = internal::MakeGZipCodec(compression_level);
+#endif
+ break;
+ case Compression::BROTLI:
+#ifdef ARROW_WITH_BROTLI
+ codec = internal::MakeBrotliCodec(compression_level);
+#endif
+ break;
+ case Compression::LZ4:
+#ifdef ARROW_WITH_LZ4
+ codec = internal::MakeLz4RawCodec();
+#endif
+ break;
+ case Compression::LZ4_FRAME:
+#ifdef ARROW_WITH_LZ4
+ codec = internal::MakeLz4FrameCodec();
+#endif
+ break;
+ case Compression::LZ4_HADOOP:
+#ifdef ARROW_WITH_LZ4
+ codec = internal::MakeLz4HadoopRawCodec();
+#endif
+ break;
+ case Compression::ZSTD:
+#ifdef ARROW_WITH_ZSTD
+ codec = internal::MakeZSTDCodec(compression_level);
+#endif
+ break;
+ case Compression::BZ2:
+#ifdef ARROW_WITH_BZ2
+ codec = internal::MakeBZ2Codec(compression_level);
+#endif
+ break;
+ default:
+ break;
+ }
+
+ DCHECK_NE(codec, nullptr);
+ RETURN_NOT_OK(codec->Init());
+ return std::move(codec);
+}
+
+bool Codec::IsAvailable(Compression::type codec_type) {
+ switch (codec_type) {
+ case Compression::UNCOMPRESSED:
+ return true;
+ case Compression::SNAPPY:
+#ifdef ARROW_WITH_SNAPPY
+ return true;
+#else
+ return false;
+#endif
+ case Compression::GZIP:
+#ifdef ARROW_WITH_ZLIB
+ return true;
+#else
+ return false;
+#endif
+ case Compression::LZO:
+ return false;
+ case Compression::BROTLI:
+#ifdef ARROW_WITH_BROTLI
+ return true;
+#else
+ return false;
+#endif
+ case Compression::LZ4:
+ case Compression::LZ4_FRAME:
+ case Compression::LZ4_HADOOP:
+#ifdef ARROW_WITH_LZ4
+ return true;
+#else
+ return false;
+#endif
+ case Compression::ZSTD:
+#ifdef ARROW_WITH_ZSTD
+ return true;
+#else
+ return false;
+#endif
+ case Compression::BZ2:
+#ifdef ARROW_WITH_BZ2
+ return true;
+#else
+ return false;
+#endif
+ default:
+ return false;
+ }
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h
new file mode 100644
index 00000000000..0832e82a606
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h
@@ -0,0 +1,202 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+constexpr int kUseDefaultCompressionLevel = std::numeric_limits<int>::min();
+
+/// \brief Streaming compressor interface
+///
+class ARROW_EXPORT Compressor {
+ public:
+ virtual ~Compressor() = default;
+
+ struct CompressResult {
+ int64_t bytes_read;
+ int64_t bytes_written;
+ };
+ struct FlushResult {
+ int64_t bytes_written;
+ bool should_retry;
+ };
+ struct EndResult {
+ int64_t bytes_written;
+ bool should_retry;
+ };
+
+ /// \brief Compress some input.
+ ///
+ /// If bytes_read is 0 on return, then a larger output buffer should be supplied.
+ virtual Result<CompressResult> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_len, uint8_t* output) = 0;
+
+ /// \brief Flush part of the compressed output.
+ ///
+ /// If should_retry is true on return, Flush() should be called again
+ /// with a larger buffer.
+ virtual Result<FlushResult> Flush(int64_t output_len, uint8_t* output) = 0;
+
+ /// \brief End compressing, doing whatever is necessary to end the stream.
+ ///
+ /// If should_retry is true on return, End() should be called again
+ /// with a larger buffer. Otherwise, the Compressor should not be used anymore.
+ ///
+ /// End() implies Flush().
+ virtual Result<EndResult> End(int64_t output_len, uint8_t* output) = 0;
+
+ // XXX add methods for buffer size heuristics?
+};
+
+/// \brief Streaming decompressor interface
+///
+class ARROW_EXPORT Decompressor {
+ public:
+ virtual ~Decompressor() = default;
+
+ struct DecompressResult {
+ // XXX is need_more_output necessary? (Brotli?)
+ int64_t bytes_read;
+ int64_t bytes_written;
+ bool need_more_output;
+ };
+
+ /// \brief Decompress some input.
+ ///
+ /// If need_more_output is true on return, a larger output buffer needs
+ /// to be supplied.
+ virtual Result<DecompressResult> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_len, uint8_t* output) = 0;
+
+ /// \brief Return whether the compressed stream is finished.
+ ///
+ /// This is a heuristic. If true is returned, then it is guaranteed
+ /// that the stream is finished. If false is returned, however, it may
+ /// simply be that the underlying library isn't able to provide the information.
+ virtual bool IsFinished() = 0;
+
+ /// \brief Reinitialize decompressor, making it ready for a new compressed stream.
+ virtual Status Reset() = 0;
+
+ // XXX add methods for buffer size heuristics?
+};
+
+/// \brief Compression codec
+class ARROW_EXPORT Codec {
+ public:
+ virtual ~Codec() = default;
+
+ /// \brief Return special value to indicate that a codec implementation
+ /// should use its default compression level
+ static int UseDefaultCompressionLevel();
+
+ /// \brief Return a string name for compression type
+ static const std::string& GetCodecAsString(Compression::type t);
+
+  /// \brief Return compression type for name (all lower case)
+ static Result<Compression::type> GetCompressionType(const std::string& name);
+
+ /// \brief Create a codec for the given compression algorithm
+ static Result<std::unique_ptr<Codec>> Create(
+ Compression::type codec, int compression_level = kUseDefaultCompressionLevel);
+
+ /// \brief Return true if support for indicated codec has been enabled
+ static bool IsAvailable(Compression::type codec);
+
+ /// \brief Return true if indicated codec supports setting a compression level
+ static bool SupportsCompressionLevel(Compression::type codec);
+
+ /// \brief Return the smallest supported compression level for the codec
+ /// Note: This function creates a temporary Codec instance
+ static Result<int> MinimumCompressionLevel(Compression::type codec);
+
+ /// \brief Return the largest supported compression level for the codec
+ /// Note: This function creates a temporary Codec instance
+ static Result<int> MaximumCompressionLevel(Compression::type codec);
+
+ /// \brief Return the default compression level
+ /// Note: This function creates a temporary Codec instance
+ static Result<int> DefaultCompressionLevel(Compression::type codec);
+
+ /// \brief Return the smallest supported compression level
+ virtual int minimum_compression_level() const = 0;
+
+ /// \brief Return the largest supported compression level
+ virtual int maximum_compression_level() const = 0;
+
+ /// \brief Return the default compression level
+ virtual int default_compression_level() const = 0;
+
+ /// \brief One-shot decompression function
+ ///
+  /// output_buffer_len must be correct, and must therefore be obtained in advance.
+ /// The actual decompressed length is returned.
+ ///
+ /// \note One-shot decompression is not always compatible with streaming
+ /// compression. Depending on the codec (e.g. LZ4), different formats may
+ /// be used.
+ virtual Result<int64_t> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len,
+ uint8_t* output_buffer) = 0;
+
+ /// \brief One-shot compression function
+ ///
+ /// output_buffer_len must first have been computed using MaxCompressedLen().
+ /// The actual compressed length is returned.
+ ///
+ /// \note One-shot compression is not always compatible with streaming
+ /// decompression. Depending on the codec (e.g. LZ4), different formats may
+ /// be used.
+ virtual Result<int64_t> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) = 0;
+
+ virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) = 0;
+
+ /// \brief Create a streaming compressor instance
+ virtual Result<std::shared_ptr<Compressor>> MakeCompressor() = 0;
+
+  /// \brief Create a streaming decompressor instance
+ virtual Result<std::shared_ptr<Decompressor>> MakeDecompressor() = 0;
+
+ /// \brief This Codec's compression type
+ virtual Compression::type compression_type() const = 0;
+
+ /// \brief The name of this Codec's compression type
+ const std::string& name() const { return GetCodecAsString(compression_type()); }
+
+ /// \brief This Codec's compression level, if applicable
+ virtual int compression_level() const { return UseDefaultCompressionLevel(); }
+
+ private:
+ /// \brief Initializes the codec's resources.
+ virtual Status Init();
+};
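+
+// A minimal one-shot usage sketch (hypothetical caller code):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(Compression::GZIP));
+//   const int64_t max_len = codec->MaxCompressedLen(input_len, input);
+//   std::vector<uint8_t> compressed(max_len);
+//   ARROW_ASSIGN_OR_RAISE(
+//       int64_t actual_len,
+//       codec->Compress(input_len, input, max_len, compressed.data()));
+//   compressed.resize(actual_len);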
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc
new file mode 100644
index 00000000000..cb547c2c8cf
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc
@@ -0,0 +1,245 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/compression_internal.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include <brotli/decode.h>
+#include <brotli/encode.h>
+#include <brotli/types.h>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+namespace {
+
+class BrotliDecompressor : public Decompressor {
+ public:
+ ~BrotliDecompressor() override {
+ if (state_ != nullptr) {
+ BrotliDecoderDestroyInstance(state_);
+ }
+ }
+
+ Status Init() {
+ state_ = BrotliDecoderCreateInstance(nullptr, nullptr, nullptr);
+ if (state_ == nullptr) {
+ return BrotliError("Brotli init failed");
+ }
+ return Status::OK();
+ }
+
+ Status Reset() override {
+ if (state_ != nullptr) {
+ BrotliDecoderDestroyInstance(state_);
+ }
+ return Init();
+ }
+
+ Result<DecompressResult> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_len, uint8_t* output) override {
+ auto avail_in = static_cast<size_t>(input_len);
+ auto avail_out = static_cast<size_t>(output_len);
+ BrotliDecoderResult ret;
+
+ ret = BrotliDecoderDecompressStream(state_, &avail_in, &input, &avail_out, &output,
+ nullptr /* total_out */);
+ if (ret == BROTLI_DECODER_RESULT_ERROR) {
+ return BrotliError(BrotliDecoderGetErrorCode(state_), "Brotli decompress failed: ");
+ }
+ return DecompressResult{static_cast<int64_t>(input_len - avail_in),
+ static_cast<int64_t>(output_len - avail_out),
+ (ret == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT)};
+ }
+
+ bool IsFinished() override { return BrotliDecoderIsFinished(state_); }
+
+ protected:
+ Status BrotliError(const char* msg) { return Status::IOError(msg); }
+
+ Status BrotliError(BrotliDecoderErrorCode code, const char* prefix_msg) {
+ return Status::IOError(prefix_msg, BrotliDecoderErrorString(code));
+ }
+
+ BrotliDecoderState* state_ = nullptr;
+};
+
+// ----------------------------------------------------------------------
+// Brotli compressor implementation
+
+class BrotliCompressor : public Compressor {
+ public:
+ explicit BrotliCompressor(int compression_level)
+ : compression_level_(compression_level) {}
+
+ ~BrotliCompressor() override {
+ if (state_ != nullptr) {
+ BrotliEncoderDestroyInstance(state_);
+ }
+ }
+
+ Status Init() {
+ state_ = BrotliEncoderCreateInstance(nullptr, nullptr, nullptr);
+ if (state_ == nullptr) {
+ return BrotliError("Brotli init failed");
+ }
+ if (!BrotliEncoderSetParameter(state_, BROTLI_PARAM_QUALITY, compression_level_)) {
+ return BrotliError("Brotli set compression level failed");
+ }
+ return Status::OK();
+ }
+
+ Result<CompressResult> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_len, uint8_t* output) override {
+ auto avail_in = static_cast<size_t>(input_len);
+ auto avail_out = static_cast<size_t>(output_len);
+ BROTLI_BOOL ret;
+
+ ret = BrotliEncoderCompressStream(state_, BROTLI_OPERATION_PROCESS, &avail_in, &input,
+ &avail_out, &output, nullptr /* total_out */);
+ if (!ret) {
+ return BrotliError("Brotli compress failed");
+ }
+ return CompressResult{static_cast<int64_t>(input_len - avail_in),
+ static_cast<int64_t>(output_len - avail_out)};
+ }
+
+ Result<FlushResult> Flush(int64_t output_len, uint8_t* output) override {
+ size_t avail_in = 0;
+ const uint8_t* next_in = nullptr;
+ auto avail_out = static_cast<size_t>(output_len);
+ BROTLI_BOOL ret;
+
+ ret = BrotliEncoderCompressStream(state_, BROTLI_OPERATION_FLUSH, &avail_in, &next_in,
+ &avail_out, &output, nullptr /* total_out */);
+ if (!ret) {
+ return BrotliError("Brotli flush failed");
+ }
+ return FlushResult{static_cast<int64_t>(output_len - avail_out),
+ !!BrotliEncoderHasMoreOutput(state_)};
+ }
+
+ Result<EndResult> End(int64_t output_len, uint8_t* output) override {
+ size_t avail_in = 0;
+ const uint8_t* next_in = nullptr;
+ auto avail_out = static_cast<size_t>(output_len);
+ BROTLI_BOOL ret;
+
+ ret =
+ BrotliEncoderCompressStream(state_, BROTLI_OPERATION_FINISH, &avail_in, &next_in,
+ &avail_out, &output, nullptr /* total_out */);
+ if (!ret) {
+ return BrotliError("Brotli end failed");
+ }
+ bool should_retry = !!BrotliEncoderHasMoreOutput(state_);
+ DCHECK_EQ(should_retry, !BrotliEncoderIsFinished(state_));
+ return EndResult{static_cast<int64_t>(output_len - avail_out), should_retry};
+ }
+
+ protected:
+ Status BrotliError(const char* msg) { return Status::IOError(msg); }
+
+ BrotliEncoderState* state_ = nullptr;
+
+ private:
+ const int compression_level_;
+};
+
+// ----------------------------------------------------------------------
+// Brotli codec implementation
+
+class BrotliCodec : public Codec {
+ public:
+ explicit BrotliCodec(int compression_level)
+ : compression_level_(compression_level == kUseDefaultCompressionLevel
+ ? kBrotliDefaultCompressionLevel
+ : compression_level) {}
+
+ Result<int64_t> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ DCHECK_GE(input_len, 0);
+ DCHECK_GE(output_buffer_len, 0);
+ std::size_t output_size = static_cast<size_t>(output_buffer_len);
+ if (BrotliDecoderDecompress(static_cast<size_t>(input_len), input, &output_size,
+ output_buffer) != BROTLI_DECODER_RESULT_SUCCESS) {
+ return Status::IOError("Corrupt brotli compressed data.");
+ }
+ return output_size;
+ }
+
+ int64_t MaxCompressedLen(int64_t input_len,
+ const uint8_t* ARROW_ARG_UNUSED(input)) override {
+ DCHECK_GE(input_len, 0);
+ return BrotliEncoderMaxCompressedSize(static_cast<size_t>(input_len));
+ }
+
+ Result<int64_t> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ DCHECK_GE(input_len, 0);
+ DCHECK_GE(output_buffer_len, 0);
+ std::size_t output_size = static_cast<size_t>(output_buffer_len);
+ if (BrotliEncoderCompress(compression_level_, BROTLI_DEFAULT_WINDOW,
+ BROTLI_DEFAULT_MODE, static_cast<size_t>(input_len), input,
+ &output_size, output_buffer) == BROTLI_FALSE) {
+ return Status::IOError("Brotli compression failure.");
+ }
+ return output_size;
+ }
+
+ Result<std::shared_ptr<Compressor>> MakeCompressor() override {
+ auto ptr = std::make_shared<BrotliCompressor>(compression_level_);
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+ }
+
+ Result<std::shared_ptr<Decompressor>> MakeDecompressor() override {
+ auto ptr = std::make_shared<BrotliDecompressor>();
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+ }
+
+ Compression::type compression_type() const override { return Compression::BROTLI; }
+
+ int compression_level() const override { return compression_level_; }
+ int minimum_compression_level() const override { return BROTLI_MIN_QUALITY; }
+ int maximum_compression_level() const override { return BROTLI_MAX_QUALITY; }
+ int default_compression_level() const override {
+ return kBrotliDefaultCompressionLevel;
+ }
+
+ private:
+ const int compression_level_;
+};
+
+} // namespace
+
+std::unique_ptr<Codec> MakeBrotliCodec(int compression_level) {
+ return std::unique_ptr<Codec>(new BrotliCodec(compression_level));
+}
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_internal.h
new file mode 100644
index 00000000000..268672e14e2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_internal.h
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/util/compression.h" // IWYU pragma: export
+
+namespace arrow {
+namespace util {
+
+// ----------------------------------------------------------------------
+// Internal Codec factories
+
+namespace internal {
+
+// Brotli compression quality is max (11) by default, which is slow.
+// We use 8 as a default as it is the best trade-off for Parquet workload.
+constexpr int kBrotliDefaultCompressionLevel = 8;
+
+// Brotli codec.
+std::unique_ptr<Codec> MakeBrotliCodec(
+ int compression_level = kBrotliDefaultCompressionLevel);
+
+// BZ2 codec.
+constexpr int kBZ2DefaultCompressionLevel = 9;
+std::unique_ptr<Codec> MakeBZ2Codec(int compression_level = kBZ2DefaultCompressionLevel);
+
+// GZip
+constexpr int kGZipDefaultCompressionLevel = 9;
+
+struct GZipFormat {
+ enum type {
+ ZLIB,
+ DEFLATE,
+ GZIP,
+ };
+};
+
+std::unique_ptr<Codec> MakeGZipCodec(int compression_level = kGZipDefaultCompressionLevel,
+ GZipFormat::type format = GZipFormat::GZIP);
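+
+// For example (hypothetical usage): MakeGZipCodec(6, GZipFormat::DEFLATE)
+// yields a raw-deflate codec at level 6. Code outside this module should go
+// through Codec::Create() rather than these internal factories.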
+
+// Snappy
+std::unique_ptr<Codec> MakeSnappyCodec();
+
+// Lz4 "raw" format codec.
+std::unique_ptr<Codec> MakeLz4RawCodec();
+
+// Lz4 "Hadoop" format codec (== Lz4 raw codec prefixed with lengths header)
+std::unique_ptr<Codec> MakeLz4HadoopRawCodec();
+
+// Lz4 frame format codec.
+std::unique_ptr<Codec> MakeLz4FrameCodec();
+
+// ZSTD codec.
+
+// XXX level = 1 probably doesn't compress very much
+constexpr int kZSTDDefaultCompressionLevel = 1;
+
+std::unique_ptr<Codec> MakeZSTDCodec(
+ int compression_level = kZSTDDefaultCompressionLevel);
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc
new file mode 100644
index 00000000000..c783e405590
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc
@@ -0,0 +1,495 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/compression.h"
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include <lz4.h>
+#include <lz4frame.h>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+
+#ifndef LZ4F_HEADER_SIZE_MAX
+#define LZ4F_HEADER_SIZE_MAX 19
+#endif
+
+namespace arrow {
+namespace util {
+
+namespace {
+
+static Status LZ4Error(LZ4F_errorCode_t ret, const char* prefix_msg) {
+ return Status::IOError(prefix_msg, LZ4F_getErrorName(ret));
+}
+
+static LZ4F_preferences_t DefaultPreferences() {
+ LZ4F_preferences_t prefs;
+ memset(&prefs, 0, sizeof(prefs));
+ return prefs;
+}
+
+// ----------------------------------------------------------------------
+// Lz4 frame decompressor implementation
+
+class LZ4Decompressor : public Decompressor {
+ public:
+ LZ4Decompressor() {}
+
+ ~LZ4Decompressor() override {
+ if (ctx_ != nullptr) {
+ ARROW_UNUSED(LZ4F_freeDecompressionContext(ctx_));
+ }
+ }
+
+ Status Init() {
+ LZ4F_errorCode_t ret;
+ finished_ = false;
+
+ ret = LZ4F_createDecompressionContext(&ctx_, LZ4F_VERSION);
+ if (LZ4F_isError(ret)) {
+ return LZ4Error(ret, "LZ4 init failed: ");
+ } else {
+ return Status::OK();
+ }
+ }
+
+ Status Reset() override {
+#if defined(LZ4_VERSION_NUMBER) && LZ4_VERSION_NUMBER >= 10800
+ // LZ4F_resetDecompressionContext appeared in 1.8.0
+ DCHECK_NE(ctx_, nullptr);
+ LZ4F_resetDecompressionContext(ctx_);
+ finished_ = false;
+ return Status::OK();
+#else
+ if (ctx_ != nullptr) {
+ ARROW_UNUSED(LZ4F_freeDecompressionContext(ctx_));
+ }
+ return Init();
+#endif
+ }
+
+ Result<DecompressResult> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_len, uint8_t* output) override {
+ auto src = input;
+ auto dst = output;
+ auto src_size = static_cast<size_t>(input_len);
+ auto dst_capacity = static_cast<size_t>(output_len);
+ size_t ret;
+
+ ret =
+ LZ4F_decompress(ctx_, dst, &dst_capacity, src, &src_size, nullptr /* options */);
+ if (LZ4F_isError(ret)) {
+ return LZ4Error(ret, "LZ4 decompress failed: ");
+ }
+ finished_ = (ret == 0);
+ return DecompressResult{static_cast<int64_t>(src_size),
+ static_cast<int64_t>(dst_capacity),
+ (src_size == 0 && dst_capacity == 0)};
+ }
+
+ bool IsFinished() override { return finished_; }
+
+ protected:
+ LZ4F_decompressionContext_t ctx_ = nullptr;
+ bool finished_;
+};
+
+// ----------------------------------------------------------------------
+// Lz4 frame compressor implementation
+
+class LZ4Compressor : public Compressor {
+ public:
+ LZ4Compressor() {}
+
+ ~LZ4Compressor() override {
+ if (ctx_ != nullptr) {
+ ARROW_UNUSED(LZ4F_freeCompressionContext(ctx_));
+ }
+ }
+
+ Status Init() {
+ LZ4F_errorCode_t ret;
+ prefs_ = DefaultPreferences();
+ first_time_ = true;
+
+ ret = LZ4F_createCompressionContext(&ctx_, LZ4F_VERSION);
+ if (LZ4F_isError(ret)) {
+ return LZ4Error(ret, "LZ4 init failed: ");
+ } else {
+ return Status::OK();
+ }
+ }
+
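+// Lazily writes the LZ4F frame header on the first call to Compress(),
+// Flush() or End(); bails out with `output_too_small` if the output buffer
+// cannot hold a maximum-size (LZ4F_HEADER_SIZE_MAX) header.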
+#define BEGIN_COMPRESS(dst, dst_capacity, output_too_small) \
+ if (first_time_) { \
+ if (dst_capacity < LZ4F_HEADER_SIZE_MAX) { \
+ /* Output too small to write LZ4F header */ \
+ return (output_too_small); \
+ } \
+ ret = LZ4F_compressBegin(ctx_, dst, dst_capacity, &prefs_); \
+ if (LZ4F_isError(ret)) { \
+ return LZ4Error(ret, "LZ4 compress begin failed: "); \
+ } \
+ first_time_ = false; \
+ dst += ret; \
+ dst_capacity -= ret; \
+ bytes_written += static_cast<int64_t>(ret); \
+ }
+
+ Result<CompressResult> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_len, uint8_t* output) override {
+ auto src = input;
+ auto dst = output;
+ auto src_size = static_cast<size_t>(input_len);
+ auto dst_capacity = static_cast<size_t>(output_len);
+ size_t ret;
+ int64_t bytes_written = 0;
+
+ BEGIN_COMPRESS(dst, dst_capacity, (CompressResult{0, 0}));
+
+ if (dst_capacity < LZ4F_compressBound(src_size, &prefs_)) {
+ // Output too small to compress into
+ return CompressResult{0, bytes_written};
+ }
+ ret = LZ4F_compressUpdate(ctx_, dst, dst_capacity, src, src_size,
+ nullptr /* options */);
+ if (LZ4F_isError(ret)) {
+ return LZ4Error(ret, "LZ4 compress update failed: ");
+ }
+ bytes_written += static_cast<int64_t>(ret);
+ DCHECK_LE(bytes_written, output_len);
+ return CompressResult{input_len, bytes_written};
+ }
+
+ Result<FlushResult> Flush(int64_t output_len, uint8_t* output) override {
+ auto dst = output;
+ auto dst_capacity = static_cast<size_t>(output_len);
+ size_t ret;
+ int64_t bytes_written = 0;
+
+ BEGIN_COMPRESS(dst, dst_capacity, (FlushResult{0, true}));
+
+ if (dst_capacity < LZ4F_compressBound(0, &prefs_)) {
+ // Output too small to flush into
+ return FlushResult{bytes_written, true};
+ }
+
+ ret = LZ4F_flush(ctx_, dst, dst_capacity, nullptr /* options */);
+ if (LZ4F_isError(ret)) {
+ return LZ4Error(ret, "LZ4 flush failed: ");
+ }
+ bytes_written += static_cast<int64_t>(ret);
+ DCHECK_LE(bytes_written, output_len);
+ return FlushResult{bytes_written, false};
+ }
+
+ Result<EndResult> End(int64_t output_len, uint8_t* output) override {
+ auto dst = output;
+ auto dst_capacity = static_cast<size_t>(output_len);
+ size_t ret;
+ int64_t bytes_written = 0;
+
+ BEGIN_COMPRESS(dst, dst_capacity, (EndResult{0, true}));
+
+ if (dst_capacity < LZ4F_compressBound(0, &prefs_)) {
+ // Output too small to end frame into
+ return EndResult{bytes_written, true};
+ }
+
+ ret = LZ4F_compressEnd(ctx_, dst, dst_capacity, nullptr /* options */);
+ if (LZ4F_isError(ret)) {
+ return LZ4Error(ret, "LZ4 end failed: ");
+ }
+ bytes_written += static_cast<int64_t>(ret);
+ DCHECK_LE(bytes_written, output_len);
+ return EndResult{bytes_written, false};
+ }
+
+#undef BEGIN_COMPRESS
+
+ protected:
+ LZ4F_compressionContext_t ctx_ = nullptr;
+ LZ4F_preferences_t prefs_;
+ bool first_time_;
+};
+
+// ----------------------------------------------------------------------
+// Lz4 frame codec implementation
+
+class Lz4FrameCodec : public Codec {
+ public:
+ Lz4FrameCodec() : prefs_(DefaultPreferences()) {}
+
+ int64_t MaxCompressedLen(int64_t input_len,
+ const uint8_t* ARROW_ARG_UNUSED(input)) override {
+ return static_cast<int64_t>(
+ LZ4F_compressFrameBound(static_cast<size_t>(input_len), &prefs_));
+ }
+
+ Result<int64_t> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ auto output_len =
+ LZ4F_compressFrame(output_buffer, static_cast<size_t>(output_buffer_len), input,
+ static_cast<size_t>(input_len), &prefs_);
+ if (LZ4F_isError(output_len)) {
+ return LZ4Error(output_len, "Lz4 compression failure: ");
+ }
+ return static_cast<int64_t>(output_len);
+ }
+
+ Result<int64_t> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ ARROW_ASSIGN_OR_RAISE(auto decomp, MakeDecompressor());
+
+ int64_t total_bytes_written = 0;
+ while (!decomp->IsFinished() && input_len != 0) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto res,
+ decomp->Decompress(input_len, input, output_buffer_len, output_buffer));
+ input += res.bytes_read;
+ input_len -= res.bytes_read;
+ output_buffer += res.bytes_written;
+ output_buffer_len -= res.bytes_written;
+ total_bytes_written += res.bytes_written;
+ if (res.need_more_output) {
+ return Status::IOError("Lz4 decompression buffer too small");
+ }
+ }
+ if (!decomp->IsFinished()) {
+ return Status::IOError("Lz4 compressed input contains less than one frame");
+ }
+ if (input_len != 0) {
+ return Status::IOError("Lz4 compressed input contains more than one frame");
+ }
+ return total_bytes_written;
+ }
+
+ Result<std::shared_ptr<Compressor>> MakeCompressor() override {
+ auto ptr = std::make_shared<LZ4Compressor>();
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+ }
+
+ Result<std::shared_ptr<Decompressor>> MakeDecompressor() override {
+ auto ptr = std::make_shared<LZ4Decompressor>();
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+ }
+
+ Compression::type compression_type() const override { return Compression::LZ4_FRAME; }
+ int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int default_compression_level() const override { return kUseDefaultCompressionLevel; }
+
+ protected:
+ const LZ4F_preferences_t prefs_;
+};
+
+// ----------------------------------------------------------------------
+// Lz4 "raw" codec implementation
+
+class Lz4Codec : public Codec {
+ public:
+ Result<int64_t> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ int64_t decompressed_size = LZ4_decompress_safe(
+ reinterpret_cast<const char*>(input), reinterpret_cast<char*>(output_buffer),
+ static_cast<int>(input_len), static_cast<int>(output_buffer_len));
+ if (decompressed_size < 0) {
+ return Status::IOError("Corrupt Lz4 compressed data.");
+ }
+ return decompressed_size;
+ }
+
+ int64_t MaxCompressedLen(int64_t input_len,
+ const uint8_t* ARROW_ARG_UNUSED(input)) override {
+ return LZ4_compressBound(static_cast<int>(input_len));
+ }
+
+ Result<int64_t> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ int64_t output_len = LZ4_compress_default(
+ reinterpret_cast<const char*>(input), reinterpret_cast<char*>(output_buffer),
+ static_cast<int>(input_len), static_cast<int>(output_buffer_len));
+ if (output_len == 0) {
+ return Status::IOError("Lz4 compression failure.");
+ }
+ return output_len;
+ }
+
+ Result<std::shared_ptr<Compressor>> MakeCompressor() override {
+ return Status::NotImplemented(
+ "Streaming compression unsupported with LZ4 raw format. "
+ "Try using LZ4 frame format instead.");
+ }
+
+ Result<std::shared_ptr<Decompressor>> MakeDecompressor() override {
+ return Status::NotImplemented(
+ "Streaming decompression unsupported with LZ4 raw format. "
+ "Try using LZ4 frame format instead.");
+ }
+
+ Compression::type compression_type() const override { return Compression::LZ4; }
+ int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int default_compression_level() const override { return kUseDefaultCompressionLevel; }
+};
+
+// ----------------------------------------------------------------------
+// Lz4 Hadoop "raw" codec implementation
+
+class Lz4HadoopCodec : public Lz4Codec {
+ public:
+ Result<int64_t> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ const int64_t decompressed_size =
+ TryDecompressHadoop(input_len, input, output_buffer_len, output_buffer);
+ if (decompressed_size != kNotHadoop) {
+ return decompressed_size;
+ }
+    // Fall back on raw LZ4 codec (for files produced by earlier versions of Parquet C++)
+ return Lz4Codec::Decompress(input_len, input, output_buffer_len, output_buffer);
+ }
+
+ int64_t MaxCompressedLen(int64_t input_len,
+ const uint8_t* ARROW_ARG_UNUSED(input)) override {
+ return kPrefixLength + Lz4Codec::MaxCompressedLen(input_len, nullptr);
+ }
+
+ Result<int64_t> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ if (output_buffer_len < kPrefixLength) {
+ return Status::Invalid("Output buffer too small for Lz4HadoopCodec compression");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ int64_t output_len,
+ Lz4Codec::Compress(input_len, input, output_buffer_len - kPrefixLength,
+ output_buffer + kPrefixLength));
+
+ // Prepend decompressed size in bytes and compressed size in bytes
+ // to be compatible with Hadoop Lz4Codec
+ const uint32_t decompressed_size =
+ BitUtil::ToBigEndian(static_cast<uint32_t>(input_len));
+ const uint32_t compressed_size =
+ BitUtil::ToBigEndian(static_cast<uint32_t>(output_len));
+ SafeStore(output_buffer, decompressed_size);
+ SafeStore(output_buffer + sizeof(uint32_t), compressed_size);
+
+ return kPrefixLength + output_len;
+ }
+
+ Result<std::shared_ptr<Compressor>> MakeCompressor() override {
+ return Status::NotImplemented(
+ "Streaming compression unsupported with LZ4 Hadoop raw format. "
+ "Try using LZ4 frame format instead.");
+ }
+
+ Result<std::shared_ptr<Decompressor>> MakeDecompressor() override {
+ return Status::NotImplemented(
+ "Streaming decompression unsupported with LZ4 Hadoop raw format. "
+ "Try using LZ4 frame format instead.");
+ }
+
+ Compression::type compression_type() const override { return Compression::LZ4_HADOOP; }
+
+ protected:
+ // Offset starting at which page data can be read/written
+ static const int64_t kPrefixLength = sizeof(uint32_t) * 2;
+
+ static const int64_t kNotHadoop = -1;
+
+ int64_t TryDecompressHadoop(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) {
+ // Parquet files written with the Hadoop Lz4Codec use their own framing.
+ // The input buffer can contain an arbitrary number of "frames", each
+ // with the following structure:
+ // - bytes 0..3: big-endian uint32_t representing the frame decompressed size
+ // - bytes 4..7: big-endian uint32_t representing the frame compressed size
+ // - bytes 8...: frame compressed data
+ //
+ // The Hadoop Lz4Codec source code can be found here:
+ // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/Lz4Codec.cc
+ int64_t total_decompressed_size = 0;
+
+ while (input_len >= kPrefixLength) {
+ const uint32_t expected_decompressed_size =
+ BitUtil::FromBigEndian(SafeLoadAs<uint32_t>(input));
+ const uint32_t expected_compressed_size =
+ BitUtil::FromBigEndian(SafeLoadAs<uint32_t>(input + sizeof(uint32_t)));
+ input += kPrefixLength;
+ input_len -= kPrefixLength;
+
+ if (input_len < expected_compressed_size) {
+ // Not enough bytes for Hadoop "frame"
+ return kNotHadoop;
+ }
+ if (output_buffer_len < expected_decompressed_size) {
+ // Not enough bytes to hold advertised output => probably not Hadoop
+ return kNotHadoop;
+ }
+ // Try decompressing and compare with expected decompressed length
+ auto maybe_decompressed_size = Lz4Codec::Decompress(
+ expected_compressed_size, input, output_buffer_len, output_buffer);
+ if (!maybe_decompressed_size.ok() ||
+ *maybe_decompressed_size != expected_decompressed_size) {
+ return kNotHadoop;
+ }
+ input += expected_compressed_size;
+ input_len -= expected_compressed_size;
+ output_buffer += expected_decompressed_size;
+ output_buffer_len -= expected_decompressed_size;
+ total_decompressed_size += expected_decompressed_size;
+ }
+
+ if (input_len == 0) {
+ return total_decompressed_size;
+ } else {
+ return kNotHadoop;
+ }
+ }
+};
+
+} // namespace
+
+namespace internal {
+
+std::unique_ptr<Codec> MakeLz4FrameCodec() {
+ return std::unique_ptr<Codec>(new Lz4FrameCodec());
+}
+
+std::unique_ptr<Codec> MakeLz4HadoopRawCodec() {
+ return std::unique_ptr<Codec>(new Lz4HadoopCodec());
+}
+
+std::unique_ptr<Codec> MakeLz4RawCodec() {
+ return std::unique_ptr<Codec>(new Lz4Codec());
+}
+
+} // namespace internal
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc
new file mode 100644
index 00000000000..3756f957d04
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/compression_internal.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include <snappy.h>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+
+using std::size_t;
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+namespace {
+
+// ----------------------------------------------------------------------
+// Snappy implementation
+
+class SnappyCodec : public Codec {
+ public:
+ Result<int64_t> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ size_t decompressed_size;
+ if (!snappy::GetUncompressedLength(reinterpret_cast<const char*>(input),
+ static_cast<size_t>(input_len),
+ &decompressed_size)) {
+ return Status::IOError("Corrupt snappy compressed data.");
+ }
+ if (output_buffer_len < static_cast<int64_t>(decompressed_size)) {
+ return Status::Invalid("Output buffer size (", output_buffer_len, ") must be ",
+ decompressed_size, " or larger.");
+ }
+ if (!snappy::RawUncompress(reinterpret_cast<const char*>(input),
+ static_cast<size_t>(input_len),
+ reinterpret_cast<char*>(output_buffer))) {
+ return Status::IOError("Corrupt snappy compressed data.");
+ }
+ return static_cast<int64_t>(decompressed_size);
+ }
+
+ int64_t MaxCompressedLen(int64_t input_len,
+ const uint8_t* ARROW_ARG_UNUSED(input)) override {
+ DCHECK_GE(input_len, 0);
+ return snappy::MaxCompressedLength(static_cast<size_t>(input_len));
+ }
+
+ Result<int64_t> Compress(int64_t input_len, const uint8_t* input,
+ int64_t ARROW_ARG_UNUSED(output_buffer_len),
+ uint8_t* output_buffer) override {
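+    // snappy::RawCompress() writes at most MaxCompressedLength(input_len)
+    // bytes, which MaxCompressedLen() above tells callers to allocate, so
+    // output_buffer_len is deliberately unused here.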
+ size_t output_size;
+ snappy::RawCompress(reinterpret_cast<const char*>(input),
+ static_cast<size_t>(input_len),
+ reinterpret_cast<char*>(output_buffer), &output_size);
+ return static_cast<int64_t>(output_size);
+ }
+
+ Result<std::shared_ptr<Compressor>> MakeCompressor() override {
+ return Status::NotImplemented("Streaming compression unsupported with Snappy");
+ }
+
+ Result<std::shared_ptr<Decompressor>> MakeDecompressor() override {
+ return Status::NotImplemented("Streaming decompression unsupported with Snappy");
+ }
+
+ Compression::type compression_type() const override { return Compression::SNAPPY; }
+ int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int default_compression_level() const override { return kUseDefaultCompressionLevel; }
+};
+
+} // namespace
+
+std::unique_ptr<Codec> MakeSnappyCodec() {
+ return std::unique_ptr<Codec>(new SnappyCodec());
+}
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc
new file mode 100644
index 00000000000..e9cb2470ee2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc
@@ -0,0 +1,507 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/compression_internal.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+
+#include <zconf.h>
+#include <zlib.h>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+namespace {
+
+// ----------------------------------------------------------------------
+// gzip implementation
+
+// These magic values are documented in zlib.h's commentary for
+// deflateInit2() and inflateInit2(), but zlib defines no named constants
+// for them.
+
+// Maximum window size
+constexpr int WINDOW_BITS = 15;
+
+// Added to the window bits to emit a gzip header and trailer.
+constexpr int GZIP_CODEC = 16;
+
+// Added to the window bits to auto-detect zlib or gzip format from the
+// header.
+constexpr int DETECT_CODEC = 32;
+
+constexpr int kGZipMinCompressionLevel = 1;
+constexpr int kGZipMaxCompressionLevel = 9;
+
+int CompressionWindowBitsForFormat(GZipFormat::type format) {
+ int window_bits = WINDOW_BITS;
+ switch (format) {
+ case GZipFormat::DEFLATE:
+ window_bits = -window_bits;
+ break;
+ case GZipFormat::GZIP:
+ window_bits += GZIP_CODEC;
+ break;
+ case GZipFormat::ZLIB:
+ break;
+ }
+ return window_bits;
+}
+
+int DecompressionWindowBitsForFormat(GZipFormat::type format) {
+ if (format == GZipFormat::DEFLATE) {
+ return -WINDOW_BITS;
+ } else {
+ /* If not deflate, autodetect format from header */
+ return WINDOW_BITS | DETECT_CODEC;
+ }
+}
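+
+// For example, with WINDOW_BITS == 15: compression uses -15 for raw deflate
+// and 15 + 16 == 31 for a gzip wrapper, while decompression uses
+// 15 | 32 == 47 so that zlib auto-detects a zlib or gzip header.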
+
+Status ZlibErrorPrefix(const char* prefix_msg, const char* msg) {
+ return Status::IOError(prefix_msg, (msg) ? msg : "(unknown error)");
+}
+
+// ----------------------------------------------------------------------
+// gzip decompressor implementation
+
+class GZipDecompressor : public Decompressor {
+ public:
+ explicit GZipDecompressor(GZipFormat::type format)
+ : format_(format), initialized_(false), finished_(false) {}
+
+ ~GZipDecompressor() override {
+ if (initialized_) {
+ inflateEnd(&stream_);
+ }
+ }
+
+ Status Init() {
+ DCHECK(!initialized_);
+ memset(&stream_, 0, sizeof(stream_));
+ finished_ = false;
+
+ int ret;
+ int window_bits = DecompressionWindowBitsForFormat(format_);
+ if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) {
+ return ZlibError("zlib inflateInit failed: ");
+ } else {
+ initialized_ = true;
+ return Status::OK();
+ }
+ }
+
+ Status Reset() override {
+ DCHECK(initialized_);
+ finished_ = false;
+ int ret;
+ if ((ret = inflateReset(&stream_)) != Z_OK) {
+ return ZlibError("zlib inflateReset failed: ");
+ } else {
+ return Status::OK();
+ }
+ }
+
+ Result<DecompressResult> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_len, uint8_t* output) override {
+ static constexpr auto input_limit =
+ static_cast<int64_t>(std::numeric_limits<uInt>::max());
+ stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input));
+ stream_.avail_in = static_cast<uInt>(std::min(input_len, input_limit));
+ stream_.next_out = reinterpret_cast<Bytef*>(output);
+ stream_.avail_out = static_cast<uInt>(std::min(output_len, input_limit));
+ int ret;
+
+ ret = inflate(&stream_, Z_SYNC_FLUSH);
+ if (ret == Z_DATA_ERROR || ret == Z_STREAM_ERROR || ret == Z_MEM_ERROR) {
+ return ZlibError("zlib inflate failed: ");
+ }
+ if (ret == Z_NEED_DICT) {
+ return ZlibError("zlib inflate failed (need preset dictionary): ");
+ }
+ finished_ = (ret == Z_STREAM_END);
+ if (ret == Z_BUF_ERROR) {
+ // No progress was possible
+ return DecompressResult{0, 0, true};
+ } else {
+ ARROW_CHECK(ret == Z_OK || ret == Z_STREAM_END);
+ // Some progress has been made
+ return DecompressResult{input_len - stream_.avail_in,
+ output_len - stream_.avail_out, false};
+ }
+ }
+
+ bool IsFinished() override { return finished_; }
+
+ protected:
+ Status ZlibError(const char* prefix_msg) {
+ return ZlibErrorPrefix(prefix_msg, stream_.msg);
+ }
+
+ z_stream stream_;
+ GZipFormat::type format_;
+ bool initialized_;
+ bool finished_;
+};
+
+// ----------------------------------------------------------------------
+// gzip compressor implementation
+
+class GZipCompressor : public Compressor {
+ public:
+ explicit GZipCompressor(int compression_level)
+ : initialized_(false), compression_level_(compression_level) {}
+
+ ~GZipCompressor() override {
+ if (initialized_) {
+ deflateEnd(&stream_);
+ }
+ }
+
+ Status Init(GZipFormat::type format) {
+ DCHECK(!initialized_);
+ memset(&stream_, 0, sizeof(stream_));
+
+ int ret;
+ // Initialize to run specified format
+ int window_bits = CompressionWindowBitsForFormat(format);
+ if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits,
+ compression_level_, Z_DEFAULT_STRATEGY)) != Z_OK) {
+ return ZlibError("zlib deflateInit failed: ");
+ } else {
+ initialized_ = true;
+ return Status::OK();
+ }
+ }
+
+ Result<CompressResult> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_len, uint8_t* output) override {
+ DCHECK(initialized_) << "Called on non-initialized stream";
+
+ static constexpr auto input_limit =
+ static_cast<int64_t>(std::numeric_limits<uInt>::max());
+
+ stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input));
+ stream_.avail_in = static_cast<uInt>(std::min(input_len, input_limit));
+ stream_.next_out = reinterpret_cast<Bytef*>(output);
+ stream_.avail_out = static_cast<uInt>(std::min(output_len, input_limit));
+
+ int64_t ret = 0;
+ ret = deflate(&stream_, Z_NO_FLUSH);
+ if (ret == Z_STREAM_ERROR) {
+ return ZlibError("zlib compress failed: ");
+ }
+ if (ret == Z_OK) {
+ // Some progress has been made
+ return CompressResult{input_len - stream_.avail_in, output_len - stream_.avail_out};
+ } else {
+ // No progress was possible
+ ARROW_CHECK_EQ(ret, Z_BUF_ERROR);
+ return CompressResult{0, 0};
+ }
+ }
+
+ Result<FlushResult> Flush(int64_t output_len, uint8_t* output) override {
+ DCHECK(initialized_) << "Called on non-initialized stream";
+
+ static constexpr auto input_limit =
+ static_cast<int64_t>(std::numeric_limits<uInt>::max());
+
+ stream_.avail_in = 0;
+ stream_.next_out = reinterpret_cast<Bytef*>(output);
+ stream_.avail_out = static_cast<uInt>(std::min(output_len, input_limit));
+
+ int64_t ret = 0;
+ ret = deflate(&stream_, Z_SYNC_FLUSH);
+ if (ret == Z_STREAM_ERROR) {
+ return ZlibError("zlib flush failed: ");
+ }
+ int64_t bytes_written;
+ if (ret == Z_OK) {
+ bytes_written = output_len - stream_.avail_out;
+ } else {
+ ARROW_CHECK_EQ(ret, Z_BUF_ERROR);
+ bytes_written = 0;
+ }
+ // "If deflate returns with avail_out == 0, this function must be called
+ // again with the same value of the flush parameter and more output space
+ // (updated avail_out), until the flush is complete (deflate returns
+ // with non-zero avail_out)."
+ // "Note that Z_BUF_ERROR is not fatal, and deflate() can be called again
+ // with more input and more output space to continue compressing."
+ return FlushResult{bytes_written, stream_.avail_out == 0};
+ }
+
+ Result<EndResult> End(int64_t output_len, uint8_t* output) override {
+ DCHECK(initialized_) << "Called on non-initialized stream";
+
+ static constexpr auto input_limit =
+ static_cast<int64_t>(std::numeric_limits<uInt>::max());
+
+ stream_.avail_in = 0;
+ stream_.next_out = reinterpret_cast<Bytef*>(output);
+ stream_.avail_out = static_cast<uInt>(std::min(output_len, input_limit));
+
+ int64_t ret = 0;
+ ret = deflate(&stream_, Z_FINISH);
+ if (ret == Z_STREAM_ERROR) {
+ return ZlibError("zlib flush failed: ");
+ }
+ int64_t bytes_written = output_len - stream_.avail_out;
+ if (ret == Z_STREAM_END) {
+ // Flush complete, we can now end the stream
+ initialized_ = false;
+ ret = deflateEnd(&stream_);
+ if (ret == Z_OK) {
+ return EndResult{bytes_written, false};
+ } else {
+ return ZlibError("zlib end failed: ");
+ }
+ } else {
+      // Not everything could be flushed; retry End() with a larger output buffer.
+ return EndResult{bytes_written, true};
+ }
+ }
+
+ protected:
+ Status ZlibError(const char* prefix_msg) {
+ return ZlibErrorPrefix(prefix_msg, stream_.msg);
+ }
+
+ z_stream stream_;
+ bool initialized_;
+ int compression_level_;
+};
+
+// ----------------------------------------------------------------------
+// gzip codec implementation
+
+class GZipCodec : public Codec {
+ public:
+ explicit GZipCodec(int compression_level, GZipFormat::type format)
+ : format_(format),
+ compressor_initialized_(false),
+ decompressor_initialized_(false) {
+ compression_level_ = compression_level == kUseDefaultCompressionLevel
+ ? kGZipDefaultCompressionLevel
+ : compression_level;
+ }
+
+ ~GZipCodec() override {
+ EndCompressor();
+ EndDecompressor();
+ }
+
+ Result<std::shared_ptr<Compressor>> MakeCompressor() override {
+ auto ptr = std::make_shared<GZipCompressor>(compression_level_);
+ RETURN_NOT_OK(ptr->Init(format_));
+ return ptr;
+ }
+
+ Result<std::shared_ptr<Decompressor>> MakeDecompressor() override {
+ auto ptr = std::make_shared<GZipDecompressor>(format_);
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+ }
+
+ Status InitCompressor() {
+ EndDecompressor();
+ memset(&stream_, 0, sizeof(stream_));
+
+ int ret;
+ // Initialize to run specified format
+ int window_bits = CompressionWindowBitsForFormat(format_);
+ if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits,
+ compression_level_, Z_DEFAULT_STRATEGY)) != Z_OK) {
+ return ZlibErrorPrefix("zlib deflateInit failed: ", stream_.msg);
+ }
+ compressor_initialized_ = true;
+ return Status::OK();
+ }
+
+ void EndCompressor() {
+ if (compressor_initialized_) {
+ (void)deflateEnd(&stream_);
+ }
+ compressor_initialized_ = false;
+ }
+
+ Status InitDecompressor() {
+ EndCompressor();
+ memset(&stream_, 0, sizeof(stream_));
+ int ret;
+
+ // Initialize to run either deflate or zlib/gzip format
+ int window_bits = DecompressionWindowBitsForFormat(format_);
+ if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) {
+ return ZlibErrorPrefix("zlib inflateInit failed: ", stream_.msg);
+ }
+ decompressor_initialized_ = true;
+ return Status::OK();
+ }
+
+ void EndDecompressor() {
+ if (decompressor_initialized_) {
+ (void)inflateEnd(&stream_);
+ }
+ decompressor_initialized_ = false;
+ }
+
+ Result<int64_t> Decompress(int64_t input_length, const uint8_t* input,
+ int64_t output_buffer_length, uint8_t* output) override {
+ if (!decompressor_initialized_) {
+ RETURN_NOT_OK(InitDecompressor());
+ }
+ if (output_buffer_length == 0) {
+ // The zlib library does not allow *output to be NULL, even when
+ // output_buffer_length is 0 (inflate() will return Z_STREAM_ERROR). We don't
+ // consider this an error, so bail early if no output is expected. Note that we
+ // don't signal an error if the input actually contains compressed data.
+ return 0;
+ }
+
+ // Reset the stream for this block
+ if (inflateReset(&stream_) != Z_OK) {
+ return ZlibErrorPrefix("zlib inflateReset failed: ", stream_.msg);
+ }
+
+ int ret = 0;
+    // gzip can run in streaming mode or non-streaming mode. We only
+    // support the non-streaming use case where we present it the entire
+    // compressed input and a buffer big enough to contain the entire
+    // decompressed output. In the case where we don't know the output size,
+    // we just make a bigger buffer and try the non-streaming mode
+    // from the beginning again.
+ while (ret != Z_STREAM_END) {
+ stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input));
+ stream_.avail_in = static_cast<uInt>(input_length);
+ stream_.next_out = reinterpret_cast<Bytef*>(output);
+ stream_.avail_out = static_cast<uInt>(output_buffer_length);
+
+ // We know the output size. In this case, we can use Z_FINISH
+ // which is more efficient.
+ ret = inflate(&stream_, Z_FINISH);
+ if (ret == Z_STREAM_END || ret != Z_OK) break;
+
+ // Failure, buffer was too small
+ return Status::IOError("Too small a buffer passed to GZipCodec. InputLength=",
+ input_length, " OutputLength=", output_buffer_length);
+ }
+
+ // Failure for some other reason
+ if (ret != Z_STREAM_END) {
+ return ZlibErrorPrefix("GZipCodec failed: ", stream_.msg);
+ }
+
+ return stream_.total_out;
+ }
+
+ int64_t MaxCompressedLen(int64_t input_length,
+ const uint8_t* ARROW_ARG_UNUSED(input)) override {
+ // Must be in compression mode
+ if (!compressor_initialized_) {
+ Status s = InitCompressor();
+ ARROW_CHECK_OK(s);
+ }
+ int64_t max_len = deflateBound(&stream_, static_cast<uLong>(input_length));
+ // ARROW-3514: return a more pessimistic estimate to account for bugs
+ // in old zlib versions.
+ return max_len + 12;
+ }
+
+ Result<int64_t> Compress(int64_t input_length, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output) override {
+ if (!compressor_initialized_) {
+ RETURN_NOT_OK(InitCompressor());
+ }
+ stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input));
+ stream_.avail_in = static_cast<uInt>(input_length);
+ stream_.next_out = reinterpret_cast<Bytef*>(output);
+ stream_.avail_out = static_cast<uInt>(output_buffer_len);
+
+ int64_t ret = 0;
+ if ((ret = deflate(&stream_, Z_FINISH)) != Z_STREAM_END) {
+ if (ret == Z_OK) {
+ // Will return Z_OK (and stream.msg NOT set) if stream.avail_out is too
+ // small
+ return Status::IOError("zlib deflate failed, output buffer too small");
+ }
+
+ return ZlibErrorPrefix("zlib deflate failed: ", stream_.msg);
+ }
+
+ if (deflateReset(&stream_) != Z_OK) {
+ return ZlibErrorPrefix("zlib deflateReset failed: ", stream_.msg);
+ }
+
+ // Actual output length
+ return output_buffer_len - stream_.avail_out;
+ }
+
+ Status Init() override {
+ const Status init_compressor_status = InitCompressor();
+ if (!init_compressor_status.ok()) {
+ return init_compressor_status;
+ }
+ return InitDecompressor();
+ }
+
+ Compression::type compression_type() const override { return Compression::GZIP; }
+
+ int compression_level() const override { return compression_level_; }
+ int minimum_compression_level() const override { return kGZipMinCompressionLevel; }
+ int maximum_compression_level() const override { return kGZipMaxCompressionLevel; }
+ int default_compression_level() const override { return kGZipDefaultCompressionLevel; }
+
+ private:
+  // zlib is stateful, so the z_stream state variable must be initialized
+  // before use.
+ z_stream stream_;
+
+  // Realistically, this will always be GZIP, but we leave the option open to
+  // configure it.
+ GZipFormat::type format_;
+
+ // These variables are mutually exclusive. When the codec is in "compressor"
+ // state, compressor_initialized_ is true while decompressor_initialized_ is
+ // false. When it's decompressing, the opposite is true.
+ //
+  // Admittedly, this is slightly hacky, but the alternative is having separate
+  // Compressor and Decompressor classes. If this ever becomes an issue, we can
+  // perform the refactoring then.
+ bool compressor_initialized_;
+ bool decompressor_initialized_;
+ int compression_level_;
+};
+
+} // namespace
+
+std::unique_ptr<Codec> MakeGZipCodec(int compression_level, GZipFormat::type format) {
+ return std::unique_ptr<Codec>(new GZipCodec(compression_level, format));
+}
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc
new file mode 100644
index 00000000000..e15ecb4e1fe
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc
@@ -0,0 +1,249 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/compression_internal.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include <zstd.h>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+
+using std::size_t;
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+namespace {
+
+Status ZSTDError(size_t ret, const char* prefix_msg) {
+ return Status::IOError(prefix_msg, ZSTD_getErrorName(ret));
+}
+
+// ----------------------------------------------------------------------
+// ZSTD decompressor implementation
+
+class ZSTDDecompressor : public Decompressor {
+ public:
+ ZSTDDecompressor() : stream_(ZSTD_createDStream()) {}
+
+ ~ZSTDDecompressor() override { ZSTD_freeDStream(stream_); }
+
+ Status Init() {
+ finished_ = false;
+ size_t ret = ZSTD_initDStream(stream_);
+ if (ZSTD_isError(ret)) {
+ return ZSTDError(ret, "ZSTD init failed: ");
+ } else {
+ return Status::OK();
+ }
+ }
+
+ Result<DecompressResult> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_len, uint8_t* output) override {
+ ZSTD_inBuffer in_buf;
+ ZSTD_outBuffer out_buf;
+
+ in_buf.src = input;
+ in_buf.size = static_cast<size_t>(input_len);
+ in_buf.pos = 0;
+ out_buf.dst = output;
+ out_buf.size = static_cast<size_t>(output_len);
+ out_buf.pos = 0;
+
+ size_t ret;
+ ret = ZSTD_decompressStream(stream_, &out_buf, &in_buf);
+ if (ZSTD_isError(ret)) {
+ return ZSTDError(ret, "ZSTD decompress failed: ");
+ }
+ finished_ = (ret == 0);
+ return DecompressResult{static_cast<int64_t>(in_buf.pos),
+ static_cast<int64_t>(out_buf.pos),
+ in_buf.pos == 0 && out_buf.pos == 0};
+ }
+
+ Status Reset() override { return Init(); }
+
+ bool IsFinished() override { return finished_; }
+
+ protected:
+ ZSTD_DStream* stream_;
+ bool finished_;
+};
+
+// ----------------------------------------------------------------------
+// ZSTD compressor implementation
+
+class ZSTDCompressor : public Compressor {
+ public:
+ explicit ZSTDCompressor(int compression_level)
+ : stream_(ZSTD_createCStream()), compression_level_(compression_level) {}
+
+ ~ZSTDCompressor() override { ZSTD_freeCStream(stream_); }
+
+ Status Init() {
+ size_t ret = ZSTD_initCStream(stream_, compression_level_);
+ if (ZSTD_isError(ret)) {
+ return ZSTDError(ret, "ZSTD init failed: ");
+ } else {
+ return Status::OK();
+ }
+ }
+
+ Result<CompressResult> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_len, uint8_t* output) override {
+ ZSTD_inBuffer in_buf;
+ ZSTD_outBuffer out_buf;
+
+ in_buf.src = input;
+ in_buf.size = static_cast<size_t>(input_len);
+ in_buf.pos = 0;
+ out_buf.dst = output;
+ out_buf.size = static_cast<size_t>(output_len);
+ out_buf.pos = 0;
+
+ size_t ret;
+ ret = ZSTD_compressStream(stream_, &out_buf, &in_buf);
+ if (ZSTD_isError(ret)) {
+ return ZSTDError(ret, "ZSTD compress failed: ");
+ }
+ return CompressResult{static_cast<int64_t>(in_buf.pos),
+ static_cast<int64_t>(out_buf.pos)};
+ }
+
+ Result<FlushResult> Flush(int64_t output_len, uint8_t* output) override {
+ ZSTD_outBuffer out_buf;
+
+ out_buf.dst = output;
+ out_buf.size = static_cast<size_t>(output_len);
+ out_buf.pos = 0;
+
+ size_t ret;
+ ret = ZSTD_flushStream(stream_, &out_buf);
+ if (ZSTD_isError(ret)) {
+ return ZSTDError(ret, "ZSTD flush failed: ");
+ }
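+    // ZSTD_flushStream() returns the number of bytes still buffered
+    // internally; a nonzero value tells the caller to call Flush() again.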
+ return FlushResult{static_cast<int64_t>(out_buf.pos), ret > 0};
+ }
+
+ Result<EndResult> End(int64_t output_len, uint8_t* output) override {
+ ZSTD_outBuffer out_buf;
+
+ out_buf.dst = output;
+ out_buf.size = static_cast<size_t>(output_len);
+ out_buf.pos = 0;
+
+ size_t ret;
+ ret = ZSTD_endStream(stream_, &out_buf);
+ if (ZSTD_isError(ret)) {
+ return ZSTDError(ret, "ZSTD end failed: ");
+ }
+ return EndResult{static_cast<int64_t>(out_buf.pos), ret > 0};
+ }
+
+ protected:
+ ZSTD_CStream* stream_;
+
+ private:
+ int compression_level_;
+};
+
+// ----------------------------------------------------------------------
+// ZSTD codec implementation
+
+class ZSTDCodec : public Codec {
+ public:
+ explicit ZSTDCodec(int compression_level)
+ : compression_level_(compression_level == kUseDefaultCompressionLevel
+ ? kZSTDDefaultCompressionLevel
+ : compression_level) {}
+
+ Result<int64_t> Decompress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ if (output_buffer == nullptr) {
+ // We may pass a NULL 0-byte output buffer but some zstd versions demand
+ // a valid pointer: https://github.com/facebook/zstd/issues/1385
+ static uint8_t empty_buffer;
+ DCHECK_EQ(output_buffer_len, 0);
+ output_buffer = &empty_buffer;
+ }
+
+ size_t ret = ZSTD_decompress(output_buffer, static_cast<size_t>(output_buffer_len),
+ input, static_cast<size_t>(input_len));
+ if (ZSTD_isError(ret)) {
+ return ZSTDError(ret, "ZSTD decompression failed: ");
+ }
+ if (static_cast<int64_t>(ret) != output_buffer_len) {
+ return Status::IOError("Corrupt ZSTD compressed data.");
+ }
+ return static_cast<int64_t>(ret);
+ }
+
+ int64_t MaxCompressedLen(int64_t input_len,
+ const uint8_t* ARROW_ARG_UNUSED(input)) override {
+ DCHECK_GE(input_len, 0);
+ return ZSTD_compressBound(static_cast<size_t>(input_len));
+ }
+
+ Result<int64_t> Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override {
+ size_t ret = ZSTD_compress(output_buffer, static_cast<size_t>(output_buffer_len),
+ input, static_cast<size_t>(input_len), compression_level_);
+ if (ZSTD_isError(ret)) {
+ return ZSTDError(ret, "ZSTD compression failed: ");
+ }
+ return static_cast<int64_t>(ret);
+ }
+
+ Result<std::shared_ptr<Compressor>> MakeCompressor() override {
+ auto ptr = std::make_shared<ZSTDCompressor>(compression_level_);
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+ }
+
+ Result<std::shared_ptr<Decompressor>> MakeDecompressor() override {
+ auto ptr = std::make_shared<ZSTDDecompressor>();
+ RETURN_NOT_OK(ptr->Init());
+ return ptr;
+ }
+
+ Compression::type compression_type() const override { return Compression::ZSTD; }
+ int minimum_compression_level() const override { return ZSTD_minCLevel(); }
+ int maximum_compression_level() const override { return ZSTD_maxCLevel(); }
+ int default_compression_level() const override { return kZSTDDefaultCompressionLevel; }
+
+ int compression_level() const override { return compression_level_; }
+
+ private:
+ const int compression_level_;
+};
+
+} // namespace
+
+std::unique_ptr<Codec> MakeZSTDCodec(int compression_level) {
+ return std::unique_ptr<Codec>(new ZSTDCodec(compression_level));
+}
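+
+// One-shot usage sketch (illustrative only; 'input' and 'input_len' are
+// hypothetical): size the output with MaxCompressedLen() before Compress().
+//
+//   ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(Compression::ZSTD, 3));
+//   const int64_t max_len = codec->MaxCompressedLen(input_len, input);
+//   std::vector<uint8_t> out(max_len);
+//   ARROW_ASSIGN_OR_RAISE(int64_t actual,
+//                         codec->Compress(input_len, input, max_len, out.data()));
+//   out.resize(actual);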
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc
new file mode 100644
index 00000000000..d803521a2d9
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc
@@ -0,0 +1,563 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala (incubating) as of 2016-01-29.
+
+#include "arrow/util/cpu_info.h"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef _MSC_VER
+#include <unistd.h>
+#endif
+
+#ifdef _WIN32
+#include <immintrin.h>
+#include <intrin.h>
+#include <array>
+#include <bitset>
+
+#include "arrow/util/windows_compatibility.h"
+#endif
+
+#include <algorithm>
+#include <cctype>
+#include <cerrno>
+#include <cstdint>
+#include <fstream>
+#include <memory>
+#include <mutex>
+#include <string>
+
+#include "arrow/result.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/string.h"
+
+namespace arrow {
+namespace internal {
+
+namespace {
+
+using std::max;
+
+constexpr int64_t kDefaultL1CacheSize = 32 * 1024; // Level 1: 32k
+constexpr int64_t kDefaultL2CacheSize = 256 * 1024; // Level 2: 256k
+constexpr int64_t kDefaultL3CacheSize = 3072 * 1024; // Level 3: 3M
+
+#if defined(__MINGW64_VERSION_MAJOR) && __MINGW64_VERSION_MAJOR < 5
+void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) {
+ __asm__ __volatile__("cpuid"
+ : "=a"(CPUInfo[0]), "=b"(CPUInfo[1]), "=c"(CPUInfo[2]),
+ "=d"(CPUInfo[3])
+ : "a"(function_id), "c"(subfunction_id));
+}
+
+int64_t _xgetbv(int xcr) {
+ int out = 0;
+ __asm__ __volatile__("xgetbv" : "=a"(out) : "c"(xcr) : "%edx");
+ return out;
+}
+#endif
+
+#ifdef __APPLE__
+util::optional<int64_t> IntegerSysCtlByName(const char* name) {
+ size_t len = sizeof(int64_t);
+ int64_t data = 0;
+ if (sysctlbyname(name, &data, &len, nullptr, 0) == 0) {
+ return data;
+ }
+ // ENOENT is the official errno value for non-existing sysctls,
+ // but EINVAL and ENOTSUP have been seen in the wild.
+ if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) {
+ auto st = IOErrorFromErrno(errno, "sysctlbyname failed for '", name, "'");
+ ARROW_LOG(WARNING) << st.ToString();
+ }
+ return util::nullopt;
+}
+#endif
+
+#if defined(__GNUC__) && defined(__linux__) && defined(__aarch64__)
+// There is no direct instruction to query cache sizes on Arm64 like '__cpuid' on
+// x86, so read them from '/sys/devices/system/cpu/cpu0/cache/index*/size', where:
+// index0: L1 Dcache
+// index1: L1 Icache
+// index2: L2 cache
+// index3: L3 cache
+const char* kL1CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index0/size";
+const char* kL2CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index2/size";
+const char* kL3CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index3/size";
+
+int64_t GetArm64CacheSize(const char* filename, int64_t default_size = -1) {
+ char* content = nullptr;
+ char* last_char = nullptr;
+ size_t file_len = 0;
+
+ // Read cache file to 'content' for getting cache size.
+ FILE* cache_file = fopen(filename, "r");
+ if (cache_file == nullptr) {
+ return default_size;
+ }
+ int res = getline(&content, &file_len, cache_file);
+ fclose(cache_file);
+ if (res == -1) {
+ return default_size;
+ }
+ std::unique_ptr<char, decltype(&free)> content_guard(content, &free);
+
+ errno = 0;
+ const auto cardinal_num = strtoull(content, &last_char, 0);
+ if (errno != 0) {
+ return default_size;
+ }
+ // kB, MB, or GB
+ int64_t multip = 1;
+ switch (*last_char) {
+ case 'g':
+ case 'G':
+ multip *= 1024; // fall through
+ case 'm':
+ case 'M':
+ multip *= 1024; // fall through
+ case 'k':
+ case 'K':
+ multip *= 1024;
+ }
+ return cardinal_num * multip;
+}
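+
+// Example (hypothetical file contents): a sysfs size file holding "32K\n"
+// parses as cardinal_num = 32 with *last_char == 'K', so the fallthrough above
+// multiplies once and the function returns 32 * 1024 = 32768 bytes.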
+#endif
+
+#if !defined(_WIN32) && !defined(__APPLE__)
+struct {
+ std::string name;
+ int64_t flag;
+} flag_mappings[] = {
+#if (defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64))
+ {"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1},
+ {"sse4_2", CpuInfo::SSE4_2}, {"popcnt", CpuInfo::POPCNT},
+ {"avx", CpuInfo::AVX}, {"avx2", CpuInfo::AVX2},
+ {"avx512f", CpuInfo::AVX512F}, {"avx512cd", CpuInfo::AVX512CD},
+ {"avx512vl", CpuInfo::AVX512VL}, {"avx512dq", CpuInfo::AVX512DQ},
+ {"avx512bw", CpuInfo::AVX512BW}, {"bmi1", CpuInfo::BMI1},
+ {"bmi2", CpuInfo::BMI2},
+#endif
+#if defined(__aarch64__)
+ {"asimd", CpuInfo::ASIMD},
+#endif
+};
+const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
+
+// Helper function to parse for hardware flags.
+// 'values' contains a list of space-separated flags. Check whether the flags we
+// care about are present.
+// Returns a bitmap of the recognized flags.
+int64_t ParseCPUFlags(const std::string& values) {
+ int64_t flags = 0;
+ for (int i = 0; i < num_flags; ++i) {
+ if (values.find(flag_mappings[i].name) != std::string::npos) {
+ flags |= flag_mappings[i].flag;
+ }
+ }
+ return flags;
+}
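+
+// Example (hypothetical /proc/cpuinfo excerpt): a line with values
+// "fpu sse4_1 sse4_2 popcnt avx avx2" makes ParseCPUFlags() return
+// CpuInfo::SSE4_1 | CpuInfo::SSE4_2 | CpuInfo::POPCNT | CpuInfo::AVX |
+// CpuInfo::AVX2; unrecognized flags such as "fpu" are ignored.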
+#endif
+
+#ifdef _WIN32
+bool RetrieveCacheSize(int64_t* cache_sizes) {
+ if (!cache_sizes) {
+ return false;
+ }
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr;
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr;
+ DWORD buffer_size = 0;
+ size_t offset = 0;
+ typedef BOOL(WINAPI * GetLogicalProcessorInformationFuncPointer)(void*, void*);
+ GetLogicalProcessorInformationFuncPointer func_pointer =
+ (GetLogicalProcessorInformationFuncPointer)GetProcAddress(
+ GetModuleHandle("kernel32"), "GetLogicalProcessorInformation");
+
+ if (!func_pointer) {
+ return false;
+ }
+
+ // Get buffer size
+ if (func_pointer(buffer, &buffer_size) && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+ return false;
+
+ buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(buffer_size);
+
+ if (!buffer) {
+ return false;
+ }
+ if (!func_pointer(buffer, &buffer_size)) {
+ free(buffer); // avoid leaking the buffer on failure
+ return false;
+ }
+
+ buffer_position = buffer;
+ while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= buffer_size) {
+ if (RelationCache == buffer_position->Relationship) {
+ PCACHE_DESCRIPTOR cache = &buffer_position->Cache;
+ if (cache->Level >= 1 && cache->Level <= 3) {
+ cache_sizes[cache->Level - 1] += cache->Size;
+ }
+ }
+ offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+ buffer_position++;
+ }
+
+ if (buffer) {
+ free(buffer);
+ }
+ return true;
+}
+
+// Source: https://en.wikipedia.org/wiki/CPUID
+bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name,
+ CpuInfo::Vendor* vendor) {
+ if (!hardware_flags || !model_name || !vendor) {
+ return false;
+ }
+ int register_EAX_id = 1;
+ int highest_valid_id = 0;
+ int highest_extended_valid_id = 0;
+ std::bitset<32> features_ECX;
+ std::array<int, 4> cpu_info;
+
+ // Get highest valid id
+ __cpuid(cpu_info.data(), 0);
+ highest_valid_id = cpu_info[0];
+ // HEX of "GenuineIntel": 47656E75 696E6549 6E74656C
+ // HEX of "AuthenticAMD": 41757468 656E7469 63414D44
+ if (cpu_info[1] == 0x756e6547 && cpu_info[2] == 0x49656e69 &&
+ cpu_info[3] == 0x6c65746e) {
+ *vendor = CpuInfo::Vendor::Intel;
+ } else if (cpu_info[1] == 0x68747541 && cpu_info[2] == 0x69746e65 &&
+ cpu_info[3] == 0x444d4163) {
+ *vendor = CpuInfo::Vendor::AMD;
+ }
+
+ if (highest_valid_id <= register_EAX_id) return false;
+
+ // EAX=1: Processor Info and Feature Bits
+ __cpuidex(cpu_info.data(), register_EAX_id, 0);
+ features_ECX = cpu_info[2];
+
+ // Get highest extended id
+ __cpuid(cpu_info.data(), 0x80000000);
+ highest_extended_valid_id = cpu_info[0];
+
+ // Retrieve CPU model name
+ if (highest_extended_valid_id >= static_cast<int>(0x80000004)) {
+ model_name->clear();
+ for (int i = 0x80000002; i <= static_cast<int>(0x80000004); ++i) {
+ __cpuidex(cpu_info.data(), i, 0);
+ *model_name +=
+ std::string(reinterpret_cast<char*>(cpu_info.data()), sizeof(cpu_info));
+ }
+ }
+
+ bool zmm_enabled = false;
+ if (features_ECX[27]) { // OSXSAVE
+ // Query if the OS supports saving ZMM registers when switching contexts
+ int64_t xcr0 = _xgetbv(0);
+ zmm_enabled = (xcr0 & 0xE0) == 0xE0;
+ }
+
+ if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3;
+ if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1;
+ if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2;
+ if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT;
+ if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX;
+
+ // cpuid with EAX=7, ECX=0: Extended Features
+ register_EAX_id = 7;
+ if (highest_valid_id > register_EAX_id) {
+ __cpuidex(cpu_info.data(), register_EAX_id, 0);
+ std::bitset<32> features_EBX = cpu_info[1];
+
+ if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1;
+ if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2;
+ if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2;
+ // ARROW-11427: only use AVX512 if enabled by the OS
+ if (zmm_enabled) {
+ if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F;
+ if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ;
+ if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD;
+ if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW;
+ if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL;
+ }
+ }
+
+ return true;
+}
+#endif
+
+} // namespace
+
+CpuInfo::CpuInfo()
+ : hardware_flags_(0),
+ num_cores_(1),
+ model_name_("unknown"),
+ vendor_(Vendor::Unknown) {}
+
+static std::unique_ptr<CpuInfo> g_cpu_info;
+static std::once_flag cpuinfo_initialized;
+
+CpuInfo* CpuInfo::GetInstance() {
+ std::call_once(cpuinfo_initialized, []() {
+ g_cpu_info.reset(new CpuInfo);
+ g_cpu_info->Init();
+ });
+ return g_cpu_info.get();
+}
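+
+// Typical call-site sketch (illustrative only):
+//
+//   const CpuInfo* info = CpuInfo::GetInstance();
+//   if (info->IsSupported(CpuInfo::AVX2)) {
+//     // dispatch to an AVX2-specialized kernel
+//   }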
+
+void CpuInfo::Init() {
+ std::string line;
+ std::string name;
+ std::string value;
+
+ float max_mhz = 0;
+ int num_cores = 0;
+
+ memset(&cache_sizes_, 0, sizeof(cache_sizes_));
+
+#ifdef _WIN32
+ SYSTEM_INFO system_info;
+ GetSystemInfo(&system_info);
+ num_cores = system_info.dwNumberOfProcessors;
+
+ LARGE_INTEGER performance_frequency;
+ if (QueryPerformanceFrequency(&performance_frequency)) {
+ max_mhz = static_cast<float>(performance_frequency.QuadPart);
+ }
+#elif defined(__APPLE__)
+ // On macOS, get CPU information from system information base
+ struct SysCtlCpuFeature {
+ const char* name;
+ int64_t flag;
+ };
+ std::vector<SysCtlCpuFeature> features = {
+#if defined(__aarch64__)
+ // ARM64 (note that this is exposed under Rosetta as well)
+ {"hw.optional.neon", ASIMD},
+#else
+ // x86
+ {"hw.optional.sse4_2", SSSE3 | SSE4_1 | SSE4_2 | POPCNT},
+ {"hw.optional.avx1_0", AVX},
+ {"hw.optional.avx2_0", AVX2},
+ {"hw.optional.bmi1", BMI1},
+ {"hw.optional.bmi2", BMI2},
+ {"hw.optional.avx512f", AVX512F},
+ {"hw.optional.avx512cd", AVX512CD},
+ {"hw.optional.avx512dq", AVX512DQ},
+ {"hw.optional.avx512bw", AVX512BW},
+ {"hw.optional.avx512vl", AVX512VL},
+#endif
+ };
+ for (const auto& feature : features) {
+ auto v = IntegerSysCtlByName(feature.name);
+ if (v.value_or(0)) {
+ hardware_flags_ |= feature.flag;
+ }
+ }
+#else
+ // Read from /proc/cpuinfo
+ std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
+ while (cpuinfo) {
+ std::getline(cpuinfo, line);
+ size_t colon = line.find(':');
+ if (colon != std::string::npos) {
+ name = TrimString(line.substr(0, colon - 1));
+ value = TrimString(line.substr(colon + 1, std::string::npos));
+ if (name.compare("flags") == 0 || name.compare("Features") == 0) {
+ hardware_flags_ |= ParseCPUFlags(value);
+ } else if (name.compare("cpu MHz") == 0) {
+ // Every core will report a different speed. We'll take the max, assuming
+ // that when the process is running, the cores will not be in a lower power state.
+ // TODO: is there a more robust way to do this, such as
+ // Windows' QueryPerformanceFrequency()?
+ float mhz = static_cast<float>(atof(value.c_str()));
+ max_mhz = max(mhz, max_mhz);
+ } else if (name.compare("processor") == 0) {
+ ++num_cores;
+ } else if (name.compare("model name") == 0) {
+ model_name_ = value;
+ } else if (name.compare("vendor_id") == 0) {
+ if (value.compare("GenuineIntel") == 0) {
+ vendor_ = Vendor::Intel;
+ } else if (value.compare("AuthenticAMD") == 0) {
+ vendor_ = Vendor::AMD;
+ }
+ }
+ }
+ }
+ if (cpuinfo.is_open()) cpuinfo.close();
+#endif
+
+#ifdef __APPLE__
+ // On macOS, get cache size from system information base
+ SetDefaultCacheSize();
+ auto c = IntegerSysCtlByName("hw.l1dcachesize");
+ if (c.has_value()) {
+ cache_sizes_[0] = *c;
+ }
+ c = IntegerSysCtlByName("hw.l2cachesize");
+ if (c.has_value()) {
+ cache_sizes_[1] = *c;
+ }
+ c = IntegerSysCtlByName("hw.l3cachesize");
+ if (c.has_value()) {
+ cache_sizes_[2] = *c;
+ }
+#elif defined(_WIN32)
+ if (!RetrieveCacheSize(cache_sizes_)) {
+ SetDefaultCacheSize();
+ }
+ RetrieveCPUInfo(&hardware_flags_, &model_name_, &vendor_);
+#else
+ SetDefaultCacheSize();
+#endif
+
+ if (max_mhz != 0) {
+ cycles_per_ms_ = static_cast<int64_t>(max_mhz);
+#ifndef _WIN32
+ cycles_per_ms_ *= 1000;
+#endif
+ } else {
+ cycles_per_ms_ = 1000000;
+ }
+ original_hardware_flags_ = hardware_flags_;
+
+ if (num_cores > 0) {
+ num_cores_ = num_cores;
+ } else {
+ num_cores_ = 1;
+ }
+
+ // Parse the user simd level
+ ParseUserSimdLevel();
+}
+
+void CpuInfo::VerifyCpuRequirements() {
+#ifdef ARROW_HAVE_SSE4_2
+ if (!IsSupported(CpuInfo::SSSE3)) {
+ DCHECK(false) << "CPU does not support the Supplemental SSE3 instruction set";
+ }
+#endif
+#if defined(ARROW_HAVE_NEON)
+ if (!IsSupported(CpuInfo::ASIMD)) {
+ DCHECK(false) << "CPU does not support the Armv8 Neon instruction set";
+ }
+#endif
+}
+
+bool CpuInfo::CanUseSSE4_2() const {
+#if defined(ARROW_HAVE_SSE4_2)
+ return IsSupported(CpuInfo::SSE4_2);
+#else
+ return false;
+#endif
+}
+
+void CpuInfo::EnableFeature(int64_t flag, bool enable) {
+ if (!enable) {
+ hardware_flags_ &= ~flag;
+ } else {
+ // Can't turn something on that can't be supported
+ DCHECK_NE(original_hardware_flags_ & flag, 0);
+ hardware_flags_ |= flag;
+ }
+}
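+
+// Example (testing scenario, illustrative only): mask a detected feature so
+// that scalar fallback paths can be exercised.
+//
+//   CpuInfo::GetInstance()->EnableFeature(CpuInfo::AVX512, /*enable=*/false);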
+
+int64_t CpuInfo::hardware_flags() { return hardware_flags_; }
+
+int64_t CpuInfo::CacheSize(CacheLevel level) { return cache_sizes_[level]; }
+
+int64_t CpuInfo::cycles_per_ms() { return cycles_per_ms_; }
+
+int CpuInfo::num_cores() { return num_cores_; }
+
+std::string CpuInfo::model_name() { return model_name_; }
+
+void CpuInfo::SetDefaultCacheSize() {
+#if defined(_SC_LEVEL1_DCACHE_SIZE) && !defined(__aarch64__)
+ // Call sysconf to query for the cache sizes
+ cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE);
+ cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE);
+ cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE);
+ ARROW_UNUSED(kDefaultL1CacheSize);
+ ARROW_UNUSED(kDefaultL2CacheSize);
+ ARROW_UNUSED(kDefaultL3CacheSize);
+#elif defined(__GNUC__) && defined(__linux__) && defined(__aarch64__)
+ cache_sizes_[0] = GetArm64CacheSize(kL1CacheSizeFile, kDefaultL1CacheSize);
+ cache_sizes_[1] = GetArm64CacheSize(kL2CacheSizeFile, kDefaultL2CacheSize);
+ cache_sizes_[2] = GetArm64CacheSize(kL3CacheSizeFile, kDefaultL3CacheSize);
+#else
+ // Provide reasonable default values if no info
+ cache_sizes_[0] = kDefaultL1CacheSize;
+ cache_sizes_[1] = kDefaultL2CacheSize;
+ cache_sizes_[2] = kDefaultL3CacheSize;
+#endif
+}
+
+void CpuInfo::ParseUserSimdLevel() {
+ auto maybe_env_var = GetEnvVar("ARROW_USER_SIMD_LEVEL");
+ if (!maybe_env_var.ok()) {
+ // No user settings
+ return;
+ }
+ std::string s = *std::move(maybe_env_var);
+ std::transform(s.begin(), s.end(), s.begin(),
+ [](unsigned char c) { return std::toupper(c); });
+
+ int level = USER_SIMD_MAX;
+ // Parse the level
+ if (s == "AVX512") {
+ level = USER_SIMD_AVX512;
+ } else if (s == "AVX2") {
+ level = USER_SIMD_AVX2;
+ } else if (s == "AVX") {
+ level = USER_SIMD_AVX;
+ } else if (s == "SSE4_2") {
+ level = USER_SIMD_SSE4_2;
+ } else if (s == "NONE") {
+ level = USER_SIMD_NONE;
+ } else if (!s.empty()) {
+ ARROW_LOG(WARNING) << "Invalid value for ARROW_USER_SIMD_LEVEL: " << s;
+ }
+
+ // Disable features above the requested level
+ if (level < USER_SIMD_AVX512) { // Disable all AVX512 features
+ EnableFeature(AVX512, false);
+ }
+ if (level < USER_SIMD_AVX2) { // Disable all AVX2 features
+ EnableFeature(AVX2 | BMI2, false);
+ }
+ if (level < USER_SIMD_AVX) { // Disable all AVX features
+ EnableFeature(AVX, false);
+ }
+ if (level < USER_SIMD_SSE4_2) { // Disable all SSE4_2 features
+ EnableFeature(SSE4_2 | BMI1, false);
+ }
+}
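+
+// Example (illustrative only): with ARROW_USER_SIMD_LEVEL=AVX in the
+// environment, the code above disables AVX512, AVX2 and BMI2, while SSE4.2,
+// BMI1 and AVX itself remain enabled (if detected).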
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h
new file mode 100644
index 00000000000..83819c25519
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala (incubating) as of 2016-01-29. Pared down to a minimal
+// set of functions needed for Apache Arrow / Apache parquet-cpp
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// CpuInfo is an interface for querying CPU information at runtime. The caller can
+/// ask for the sizes of the caches and what hardware features are supported.
+/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and
+/// /sys/devices).
+class ARROW_EXPORT CpuInfo {
+ public:
+ static constexpr int64_t SSSE3 = (1 << 1);
+ static constexpr int64_t SSE4_1 = (1 << 2);
+ static constexpr int64_t SSE4_2 = (1 << 3);
+ static constexpr int64_t POPCNT = (1 << 4);
+ static constexpr int64_t ASIMD = (1 << 5);
+ static constexpr int64_t AVX = (1 << 6);
+ static constexpr int64_t AVX2 = (1 << 7);
+ static constexpr int64_t AVX512F = (1 << 8);
+ static constexpr int64_t AVX512CD = (1 << 9);
+ static constexpr int64_t AVX512VL = (1 << 10);
+ static constexpr int64_t AVX512DQ = (1 << 11);
+ static constexpr int64_t AVX512BW = (1 << 12);
+ static constexpr int64_t BMI1 = (1 << 13);
+ static constexpr int64_t BMI2 = (1 << 14);
+
+ /// The typical AVX512 subset consists of AVX512F, AVX512BW, AVX512VL, AVX512CD and AVX512DQ
+ static constexpr int64_t AVX512 = AVX512F | AVX512CD | AVX512VL | AVX512DQ | AVX512BW;
+
+ /// Cache enums for L1 (data), L2 and L3
+ enum CacheLevel {
+ L1_CACHE = 0,
+ L2_CACHE = 1,
+ L3_CACHE = 2,
+ };
+
+ enum class Vendor : int { Unknown = 0, Intel, AMD };
+
+ static CpuInfo* GetInstance();
+
+ /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error
+ /// and terminate.
+ void VerifyCpuRequirements();
+
+ /// Returns all the flags for this cpu
+ int64_t hardware_flags();
+
+ /// \brief Returns whether or not the given feature is enabled.
+ ///
+ /// IsSupported() is true iff IsDetected() is also true and the feature
+ /// wasn't disabled by the user (for example by setting the ARROW_USER_SIMD_LEVEL
+ /// environment variable).
+ bool IsSupported(int64_t flags) const { return (hardware_flags_ & flags) == flags; }
+
+ /// Returns whether or not the given feature is available on the CPU.
+ bool IsDetected(int64_t flags) const {
+ return (original_hardware_flags_ & flags) == flags;
+ }
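+
+ // Example (illustrative only): on an AVX2-capable machine started with
+ // ARROW_USER_SIMD_LEVEL=SSE4_2, IsDetected(AVX2) remains true while
+ // IsSupported(AVX2) returns false.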
+
+ /// \brief The processor supports SSE4.2 and the Arrow libraries are built
+ /// with support for it
+ bool CanUseSSE4_2() const;
+
+ /// Toggle a hardware feature on and off. It is not valid to turn on a feature
+ /// that the underlying hardware cannot support. This is useful for testing.
+ void EnableFeature(int64_t flag, bool enable);
+
+ /// Returns the size of the cache in bytes at this cache level
+ int64_t CacheSize(CacheLevel level);
+
+ /// Returns the number of cpu cycles per millisecond
+ int64_t cycles_per_ms();
+
+ /// Returns the number of cores (including hyper-threaded) on this machine.
+ int num_cores();
+
+ /// Returns the model name of the cpu (e.g. Intel i7-2600)
+ std::string model_name();
+
+ /// Returns the vendor of the cpu.
+ Vendor vendor() const { return vendor_; }
+
+ bool HasEfficientBmi2() const {
+ // BMI2 (pext, pdep) is only efficient on Intel X86 processors.
+ return vendor() == Vendor::Intel && IsSupported(BMI2);
+ }
+
+ private:
+ CpuInfo();
+
+ enum UserSimdLevel {
+ USER_SIMD_NONE = 0,
+ USER_SIMD_SSE4_2,
+ USER_SIMD_AVX,
+ USER_SIMD_AVX2,
+ USER_SIMD_AVX512,
+ USER_SIMD_MAX,
+ };
+
+ void Init();
+
+ /// Inits CPU cache size variables with default values
+ void SetDefaultCacheSize();
+
+ /// Parse the SIMD level by ARROW_USER_SIMD_LEVEL env
+ void ParseUserSimdLevel();
+
+ int64_t hardware_flags_;
+ int64_t original_hardware_flags_;
+ int64_t cache_sizes_[L3_CACHE + 1];
+ int64_t cycles_per_ms_;
+ int num_cores_;
+ std::string model_name_;
+ Vendor vendor_;
+};
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc
new file mode 100644
index 00000000000..7aefd1ab9cd
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc
@@ -0,0 +1,932 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <array>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <limits>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+#include "arrow/status.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/formatting.h"
+#include "arrow/util/int128_internal.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/value_parsing.h"
+
+namespace arrow {
+
+using internal::SafeLeftShift;
+using internal::SafeSignedAdd;
+using internal::uint128_t;
+
+Decimal128::Decimal128(const std::string& str) : Decimal128() {
+ *this = Decimal128::FromString(str).ValueOrDie();
+}
+
+static constexpr auto kInt64DecimalDigits =
+ static_cast<size_t>(std::numeric_limits<int64_t>::digits10);
+
+static constexpr uint64_t kUInt64PowersOfTen[kInt64DecimalDigits + 1] = {
+ // clang-format off
+ 1ULL,
+ 10ULL,
+ 100ULL,
+ 1000ULL,
+ 10000ULL,
+ 100000ULL,
+ 1000000ULL,
+ 10000000ULL,
+ 100000000ULL,
+ 1000000000ULL,
+ 10000000000ULL,
+ 100000000000ULL,
+ 1000000000000ULL,
+ 10000000000000ULL,
+ 100000000000000ULL,
+ 1000000000000000ULL,
+ 10000000000000000ULL,
+ 100000000000000000ULL,
+ 1000000000000000000ULL
+ // clang-format on
+};
+
+static constexpr float kFloatPowersOfTen[2 * 38 + 1] = {
+ 1e-38f, 1e-37f, 1e-36f, 1e-35f, 1e-34f, 1e-33f, 1e-32f, 1e-31f, 1e-30f, 1e-29f,
+ 1e-28f, 1e-27f, 1e-26f, 1e-25f, 1e-24f, 1e-23f, 1e-22f, 1e-21f, 1e-20f, 1e-19f,
+ 1e-18f, 1e-17f, 1e-16f, 1e-15f, 1e-14f, 1e-13f, 1e-12f, 1e-11f, 1e-10f, 1e-9f,
+ 1e-8f, 1e-7f, 1e-6f, 1e-5f, 1e-4f, 1e-3f, 1e-2f, 1e-1f, 1e0f, 1e1f,
+ 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f, 1e8f, 1e9f, 1e10f, 1e11f,
+ 1e12f, 1e13f, 1e14f, 1e15f, 1e16f, 1e17f, 1e18f, 1e19f, 1e20f, 1e21f,
+ 1e22f, 1e23f, 1e24f, 1e25f, 1e26f, 1e27f, 1e28f, 1e29f, 1e30f, 1e31f,
+ 1e32f, 1e33f, 1e34f, 1e35f, 1e36f, 1e37f, 1e38f};
+
+static constexpr double kDoublePowersOfTen[2 * 38 + 1] = {
+ 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29, 1e-28,
+ 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17,
+ 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6,
+ 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5,
+ 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16,
+ 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27,
+ 1e28, 1e29, 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38};
+
+// On the Windows R toolchain, INFINITY is of type double instead of float
+static constexpr float kFloatInf = std::numeric_limits<float>::infinity();
+static constexpr float kFloatPowersOfTen76[2 * 76 + 1] = {
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1e-45f, 1e-44f, 1e-43f, 1e-42f,
+ 1e-41f, 1e-40f, 1e-39f, 1e-38f, 1e-37f, 1e-36f, 1e-35f,
+ 1e-34f, 1e-33f, 1e-32f, 1e-31f, 1e-30f, 1e-29f, 1e-28f,
+ 1e-27f, 1e-26f, 1e-25f, 1e-24f, 1e-23f, 1e-22f, 1e-21f,
+ 1e-20f, 1e-19f, 1e-18f, 1e-17f, 1e-16f, 1e-15f, 1e-14f,
+ 1e-13f, 1e-12f, 1e-11f, 1e-10f, 1e-9f, 1e-8f, 1e-7f,
+ 1e-6f, 1e-5f, 1e-4f, 1e-3f, 1e-2f, 1e-1f, 1e0f,
+ 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f,
+ 1e8f, 1e9f, 1e10f, 1e11f, 1e12f, 1e13f, 1e14f,
+ 1e15f, 1e16f, 1e17f, 1e18f, 1e19f, 1e20f, 1e21f,
+ 1e22f, 1e23f, 1e24f, 1e25f, 1e26f, 1e27f, 1e28f,
+ 1e29f, 1e30f, 1e31f, 1e32f, 1e33f, 1e34f, 1e35f,
+ 1e36f, 1e37f, 1e38f, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf};
+
+static constexpr double kDoublePowersOfTen76[2 * 76 + 1] = {
+ 1e-76, 1e-75, 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, 1e-65,
+ 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57, 1e-56, 1e-55, 1e-54, 1e-53,
+ 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41,
+ 1e-40, 1e-39, 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29,
+ 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17,
+ 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5,
+ 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
+ 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
+ 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31,
+ 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, 1e43,
+ 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, 1e52, 1e53, 1e54, 1e55,
+ 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67,
+ 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76};
+
+namespace {
+
+template <typename Real, typename Derived>
+struct DecimalRealConversion {
+ static Result<Decimal128> FromPositiveReal(Real real, int32_t precision,
+ int32_t scale) {
+ auto x = real;
+ if (scale >= -38 && scale <= 38) {
+ x *= Derived::powers_of_ten()[scale + 38];
+ } else {
+ x *= std::pow(static_cast<Real>(10), static_cast<Real>(scale));
+ }
+ x = std::nearbyint(x);
+ const auto max_abs = Derived::powers_of_ten()[precision + 38];
+ if (x <= -max_abs || x >= max_abs) {
+ return Status::Invalid("Cannot convert ", real,
+ " to Decimal128(precision = ", precision,
+ ", scale = ", scale, "): overflow");
+ }
+ // Extract high and low bits
+ const auto high = std::floor(std::ldexp(x, -64));
+ const auto low = x - std::ldexp(high, 64);
+
+ DCHECK_GE(high, -9.223372036854776e+18); // -2**63
+ DCHECK_LT(high, 9.223372036854776e+18); // 2**63
+ DCHECK_GE(low, 0);
+ DCHECK_LT(low, 1.8446744073709552e+19); // 2**64
+ return Decimal128(static_cast<int64_t>(high), static_cast<uint64_t>(low));
+ }
+
+ static Result<Decimal128> FromReal(Real x, int32_t precision, int32_t scale) {
+ DCHECK_GT(precision, 0);
+ DCHECK_LE(precision, 38);
+
+ if (!std::isfinite(x)) {
+ return Status::Invalid("Cannot convert ", x, " to Decimal128");
+ }
+ if (x < 0) {
+ ARROW_ASSIGN_OR_RAISE(auto dec, FromPositiveReal(-x, precision, scale));
+ return dec.Negate();
+ } else {
+ // Includes negative zero
+ return FromPositiveReal(x, precision, scale);
+ }
+ }
+
+ static Real ToRealPositive(const Decimal128& decimal, int32_t scale) {
+ Real x = static_cast<Real>(decimal.high_bits()) * Derived::two_to_64();
+ x += static_cast<Real>(decimal.low_bits());
+ if (scale >= -38 && scale <= 38) {
+ x *= Derived::powers_of_ten()[-scale + 38];
+ } else {
+ x *= std::pow(static_cast<Real>(10), static_cast<Real>(-scale));
+ }
+ return x;
+ }
+
+ static Real ToReal(Decimal128 decimal, int32_t scale) {
+ if (decimal.high_bits() < 0) {
+ // Convert the absolute value to avoid precision loss
+ decimal.Negate();
+ return -ToRealPositive(decimal, scale);
+ } else {
+ return ToRealPositive(decimal, scale);
+ }
+ }
+};
+
+struct DecimalFloatConversion
+ : public DecimalRealConversion<float, DecimalFloatConversion> {
+ static constexpr const float* powers_of_ten() { return kFloatPowersOfTen; }
+
+ static constexpr float two_to_64() { return 1.8446744e+19f; }
+};
+
+struct DecimalDoubleConversion
+ : public DecimalRealConversion<double, DecimalDoubleConversion> {
+ static constexpr const double* powers_of_ten() { return kDoublePowersOfTen; }
+
+ static constexpr double two_to_64() { return 1.8446744073709552e+19; }
+};
+
+} // namespace
+
+Result<Decimal128> Decimal128::FromReal(float x, int32_t precision, int32_t scale) {
+ return DecimalFloatConversion::FromReal(x, precision, scale);
+}
+
+Result<Decimal128> Decimal128::FromReal(double x, int32_t precision, int32_t scale) {
+ return DecimalDoubleConversion::FromReal(x, precision, scale);
+}
+
+float Decimal128::ToFloat(int32_t scale) const {
+ return DecimalFloatConversion::ToReal(*this, scale);
+}
+
+double Decimal128::ToDouble(int32_t scale) const {
+ return DecimalDoubleConversion::ToReal(*this, scale);
+}
+
+template <size_t n>
+static void AppendLittleEndianArrayToString(const std::array<uint64_t, n>& array,
+ std::string* result) {
+ const auto most_significant_non_zero =
+ std::find_if(array.rbegin(), array.rend(), [](uint64_t v) { return v != 0; });
+ if (most_significant_non_zero == array.rend()) {
+ result->push_back('0');
+ return;
+ }
+
+ size_t most_significant_elem_idx = &*most_significant_non_zero - array.data();
+ std::array<uint64_t, n> copy = array;
+ constexpr uint32_t k1e9 = 1000000000U;
+ constexpr size_t kNumBits = n * 64;
+ // Segments will contain the array split into groups that map to decimal digits,
+ // in little endian order. Each segment will hold at most 9 decimal digits.
+ // For example, if the input represents 9876543210123456789, then segments will be
+ // [123456789, 876543210, 9].
+ // The max number of segments needed = ceil(kNumBits * log(2) / log(1e9))
+ // = ceil(kNumBits / 29.897352854) <= ceil(kNumBits / 29).
+ std::array<uint32_t, (kNumBits + 28) / 29> segments;
+ size_t num_segments = 0;
+ uint64_t* most_significant_elem = &copy[most_significant_elem_idx];
+ do {
+ // Compute remainder = copy % 1e9 and copy = copy / 1e9.
+ uint32_t remainder = 0;
+ uint64_t* elem = most_significant_elem;
+ do {
+ // Compute dividend = (remainder << 32) | *elem (a virtual 96-bit integer);
+ // *elem = dividend / 1e9;
+ // remainder = dividend % 1e9.
+ uint32_t hi = static_cast<uint32_t>(*elem >> 32);
+ uint32_t lo = static_cast<uint32_t>(*elem & BitUtil::LeastSignificantBitMask(32));
+ uint64_t dividend_hi = (static_cast<uint64_t>(remainder) << 32) | hi;
+ uint64_t quotient_hi = dividend_hi / k1e9;
+ remainder = static_cast<uint32_t>(dividend_hi % k1e9);
+ uint64_t dividend_lo = (static_cast<uint64_t>(remainder) << 32) | lo;
+ uint64_t quotient_lo = dividend_lo / k1e9;
+ remainder = static_cast<uint32_t>(dividend_lo % k1e9);
+ *elem = (quotient_hi << 32) | quotient_lo;
+ } while (elem-- != copy.data());
+
+ segments[num_segments++] = remainder;
+ } while (*most_significant_elem != 0 || most_significant_elem-- != copy.data());
+
+ size_t old_size = result->size();
+ size_t new_size = old_size + num_segments * 9;
+ result->resize(new_size, '0');
+ char* output = &result->at(old_size);
+ const uint32_t* segment = &segments[num_segments - 1];
+ internal::StringFormatter<UInt32Type> format;
+ // First segment is formatted as-is.
+ format(*segment, [&output](util::string_view formatted) {
+ memcpy(output, formatted.data(), formatted.size());
+ output += formatted.size();
+ });
+ while (segment != segments.data()) {
+ --segment;
+ // Right-pad formatted segment such that e.g. 123 is formatted as "000000123".
+ output += 9;
+ format(*segment, [output](util::string_view formatted) {
+ memcpy(output - formatted.size(), formatted.data(), formatted.size());
+ });
+ }
+ result->resize(output - result->data());
+}
+
+std::string Decimal128::ToIntegerString() const {
+ std::string result;
+ if (high_bits() < 0) {
+ result.push_back('-');
+ Decimal128 abs = *this;
+ abs.Negate();
+ AppendLittleEndianArrayToString<2>(
+ {abs.low_bits(), static_cast<uint64_t>(abs.high_bits())}, &result);
+ } else {
+ AppendLittleEndianArrayToString<2>({low_bits(), static_cast<uint64_t>(high_bits())},
+ &result);
+ }
+ return result;
+}
+
+Decimal128::operator int64_t() const {
+ DCHECK(high_bits() == 0 || high_bits() == -1)
+ << "Trying to cast a Decimal128 greater than the value range of a "
+ "int64_t. high_bits_ must be equal to 0 or -1, got: "
+ << high_bits();
+ return static_cast<int64_t>(low_bits());
+}
+
+static void AdjustIntegerStringWithScale(int32_t scale, std::string* str) {
+ if (scale == 0) {
+ return;
+ }
+ DCHECK(str != nullptr);
+ DCHECK(!str->empty());
+ const bool is_negative = str->front() == '-';
+ const auto is_negative_offset = static_cast<int32_t>(is_negative);
+ const auto len = static_cast<int32_t>(str->size());
+ const int32_t num_digits = len - is_negative_offset;
+ const int32_t adjusted_exponent = num_digits - 1 - scale;
+
+ /// Note that the -6 is taken from the Java BigDecimal documentation.
+ if (scale < 0 || adjusted_exponent < -6) {
+ // Example 1:
+ // Precondition: *str = "123", is_negative_offset = 0, num_digits = 3, scale = -2,
+ // adjusted_exponent = 4
+ // After inserting decimal point: *str = "1.23"
+ // After appending exponent: *str = "1.23E+4"
+ // Example 2:
+ // Precondition: *str = "-123", is_negative_offset = 1, num_digits = 3, scale = 9,
+ // adjusted_exponent = -7
+ // After inserting decimal point: *str = "-1.23"
+ // After appending exponent: *str = "-1.23E-7"
+ str->insert(str->begin() + 1 + is_negative_offset, '.');
+ str->push_back('E');
+ if (adjusted_exponent >= 0) {
+ str->push_back('+');
+ }
+ internal::StringFormatter<Int32Type> format;
+ format(adjusted_exponent, [str](util::string_view formatted) {
+ str->append(formatted.data(), formatted.size());
+ });
+ return;
+ }
+
+ if (num_digits > scale) {
+ const auto n = static_cast<size_t>(len - scale);
+ // Example 1:
+ // Precondition: *str = "123", len = num_digits = 3, scale = 1, n = 2
+ // After inserting decimal point: *str = "12.3"
+ // Example 2:
+ // Precondition: *str = "-123", len = 4, num_digits = 3, scale = 1, n = 3
+ // After inserting decimal point: *str = "-12.3"
+ str->insert(str->begin() + n, '.');
+ return;
+ }
+
+ // Example 1:
+ // Precondition: *str = "123", is_negative_offset = 0, num_digits = 3, scale = 4
+ // After insert: *str = "000123"
+ // After setting decimal point: *str = "0.0123"
+ // Example 2:
+ // Precondition: *str = "-123", is_negative_offset = 1, num_digits = 3, scale = 4
+ // After insert: *str = "-000123"
+ // After setting decimal point: *str = "-0.0123"
+ str->insert(is_negative_offset, scale - num_digits + 2, '0');
+ str->at(is_negative_offset + 1) = '.';
+}
+
+std::string Decimal128::ToString(int32_t scale) const {
+ std::string str(ToIntegerString());
+ AdjustIntegerStringWithScale(scale, &str);
+ return str;
+}
+
+// Iterates over the input and, for each group of at most kInt64DecimalDigits
+// digits, multiplies 'out' by the appropriate power of 10 and then adds the
+// group parsed as a uint64.
+static inline void ShiftAndAdd(const util::string_view& input, uint64_t out[],
+ size_t out_size) {
+ for (size_t posn = 0; posn < input.size();) {
+ const size_t group_size = std::min(kInt64DecimalDigits, input.size() - posn);
+ const uint64_t multiple = kUInt64PowersOfTen[group_size];
+ uint64_t chunk = 0;
+ ARROW_CHECK(
+ internal::ParseValue<UInt64Type>(input.data() + posn, group_size, &chunk));
+
+ for (size_t i = 0; i < out_size; ++i) {
+ uint128_t tmp = out[i];
+ tmp *= multiple;
+ tmp += chunk;
+ out[i] = static_cast<uint64_t>(tmp & 0xFFFFFFFFFFFFFFFFULL);
+ chunk = static_cast<uint64_t>(tmp >> 64);
+ }
+ posn += group_size;
+ }
+}
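+
+// Worked example (illustrative only): for input "12345678901234567890" and
+// out_size = 2, the first 18-digit group sets out[0] = 123456789012345678;
+// the trailing group "90" then computes out[0] * 100 + 90 through a 128-bit
+// intermediate, so any carry would propagate into out[1].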
+
+namespace {
+
+struct DecimalComponents {
+ util::string_view whole_digits;
+ util::string_view fractional_digits;
+ int32_t exponent = 0;
+ char sign = 0;
+ bool has_exponent = false;
+};
+
+inline bool IsSign(char c) { return c == '-' || c == '+'; }
+
+inline bool IsDot(char c) { return c == '.'; }
+
+inline bool IsDigit(char c) { return c >= '0' && c <= '9'; }
+
+inline bool StartsExponent(char c) { return c == 'e' || c == 'E'; }
+
+inline size_t ParseDigitsRun(const char* s, size_t start, size_t size,
+ util::string_view* out) {
+ size_t pos;
+ for (pos = start; pos < size; ++pos) {
+ if (!IsDigit(s[pos])) {
+ break;
+ }
+ }
+ *out = util::string_view(s + start, pos - start);
+ return pos;
+}
+
+bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* out) {
+ size_t pos = 0;
+
+ if (size == 0) {
+ return false;
+ }
+ // Sign of the number
+ if (IsSign(s[pos])) {
+ out->sign = *(s + pos);
+ ++pos;
+ }
+ // First run of digits
+ pos = ParseDigitsRun(s, pos, size, &out->whole_digits);
+ if (pos == size) {
+ return !out->whole_digits.empty();
+ }
+ // Optional dot (if given in fractional form)
+ bool has_dot = IsDot(s[pos]);
+ if (has_dot) {
+ // Second run of digits
+ ++pos;
+ pos = ParseDigitsRun(s, pos, size, &out->fractional_digits);
+ }
+ if (out->whole_digits.empty() && out->fractional_digits.empty()) {
+ // Need at least some digits (whole or fractional)
+ return false;
+ }
+ if (pos == size) {
+ return true;
+ }
+ // Optional exponent
+ if (StartsExponent(s[pos])) {
+ ++pos;
+ if (pos != size && s[pos] == '+') {
+ ++pos;
+ }
+ out->has_exponent = true;
+ return internal::ParseValue<Int32Type>(s + pos, size - pos, &(out->exponent));
+ }
+ return pos == size;
+}
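+
+// Example (illustrative only): for s = "-12.34E+5", ParseDecimalComponents()
+// yields sign = '-', whole_digits = "12", fractional_digits = "34",
+// has_exponent = true and exponent = 5.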
+
+inline Status ToArrowStatus(DecimalStatus dstatus, int num_bits) {
+ switch (dstatus) {
+ case DecimalStatus::kSuccess:
+ return Status::OK();
+
+ case DecimalStatus::kDivideByZero:
+ return Status::Invalid("Division by 0 in Decimal", num_bits);
+
+ case DecimalStatus::kOverflow:
+ return Status::Invalid("Overflow occurred during Decimal", num_bits, " operation.");
+
+ case DecimalStatus::kRescaleDataLoss:
+ return Status::Invalid("Rescaling Decimal", num_bits,
+ " value would cause data loss");
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+Status Decimal128::FromString(const util::string_view& s, Decimal128* out,
+ int32_t* precision, int32_t* scale) {
+ if (s.empty()) {
+ return Status::Invalid("Empty string cannot be converted to decimal");
+ }
+
+ DecimalComponents dec;
+ if (!ParseDecimalComponents(s.data(), s.size(), &dec)) {
+ return Status::Invalid("The string '", s, "' is not a valid decimal number");
+ }
+
+ // Count number of significant digits (without leading zeros)
+ size_t first_non_zero = dec.whole_digits.find_first_not_of('0');
+ size_t significant_digits = dec.fractional_digits.size();
+ if (first_non_zero != std::string::npos) {
+ significant_digits += dec.whole_digits.size() - first_non_zero;
+ }
+ int32_t parsed_precision = static_cast<int32_t>(significant_digits);
+
+ int32_t parsed_scale = 0;
+ if (dec.has_exponent) {
+ auto adjusted_exponent = dec.exponent;
+ auto len = static_cast<int32_t>(significant_digits);
+ parsed_scale = -adjusted_exponent + len - 1;
+ } else {
+ parsed_scale = static_cast<int32_t>(dec.fractional_digits.size());
+ }
+
+ if (out != nullptr) {
+ std::array<uint64_t, 2> little_endian_array = {0, 0};
+ ShiftAndAdd(dec.whole_digits, little_endian_array.data(), little_endian_array.size());
+ ShiftAndAdd(dec.fractional_digits, little_endian_array.data(),
+ little_endian_array.size());
+ *out =
+ Decimal128(static_cast<int64_t>(little_endian_array[1]), little_endian_array[0]);
+ if (parsed_scale < 0) {
+ *out *= GetScaleMultiplier(-parsed_scale);
+ }
+
+ if (dec.sign == '-') {
+ out->Negate();
+ }
+ }
+
+ if (parsed_scale < 0) {
+ parsed_precision -= parsed_scale;
+ parsed_scale = 0;
+ }
+
+ if (precision != nullptr) {
+ *precision = parsed_precision;
+ }
+ if (scale != nullptr) {
+ *scale = parsed_scale;
+ }
+
+ return Status::OK();
+}
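+
+// Usage sketch (illustrative only):
+//
+//   Decimal128 value;
+//   int32_t precision, scale;
+//   RETURN_NOT_OK(Decimal128::FromString("12.34", &value, &precision, &scale));
+//   // value holds the unscaled integer 1234, precision == 4, scale == 2.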
+
+Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision,
+ int32_t* scale) {
+ return FromString(util::string_view(s), out, precision, scale);
+}
+
+Status Decimal128::FromString(const char* s, Decimal128* out, int32_t* precision,
+ int32_t* scale) {
+ return FromString(util::string_view(s), out, precision, scale);
+}
+
+Result<Decimal128> Decimal128::FromString(const util::string_view& s) {
+ Decimal128 out;
+ RETURN_NOT_OK(FromString(s, &out, nullptr, nullptr));
+ return std::move(out);
+}
+
+Result<Decimal128> Decimal128::FromString(const std::string& s) {
+ return FromString(util::string_view(s));
+}
+
+Result<Decimal128> Decimal128::FromString(const char* s) {
+ return FromString(util::string_view(s));
+}
+
+// Helper function used by Decimal128::FromBigEndian
+static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) {
+ // We don't bounds check the length here because this is called by
+ // FromBigEndian, which has a Decimal128 as its out parameter and already
+ // checks the length of the bytes, passing only lengths between zero and
+ // eight.
+ uint64_t result = 0;
+ // Use memcpy instead of special-casing the length and doing the conversion
+ // in 16- and 32-bit parts, which could cause unaligned memory accesses on
+ // certain platforms
+ memcpy(reinterpret_cast<uint8_t*>(&result) + 8 - length, bytes, length);
+ return ::arrow::BitUtil::FromBigEndian(result);
+}
+
+Result<Decimal128> Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length) {
+ static constexpr int32_t kMinDecimalBytes = 1;
+ static constexpr int32_t kMaxDecimalBytes = 16;
+
+ int64_t high, low;
+
+ if (ARROW_PREDICT_FALSE(length < kMinDecimalBytes || length > kMaxDecimalBytes)) {
+ return Status::Invalid("Length of byte array passed to Decimal128::FromBigEndian ",
+ "was ", length, ", but must be between ", kMinDecimalBytes,
+ " and ", kMaxDecimalBytes);
+ }
+
+ // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the
+ // sign bit.
+ const bool is_negative = static_cast<int8_t>(bytes[0]) < 0;
+
+ // 1. Extract the high bytes
+ // Stop byte of the high bytes
+ const int32_t high_bits_offset = std::max(0, length - 8);
+ const auto high_bits = UInt64FromBigEndian(bytes, high_bits_offset);
+
+ if (high_bits_offset == 8) {
+ // Avoid undefined shift by 64 below
+ high = high_bits;
+ } else {
+ high = -1 * (is_negative && length < kMaxDecimalBytes);
+ // Shift left enough bits to make room for the incoming bits
+ high = SafeLeftShift(high, high_bits_offset * CHAR_BIT);
+ // OR in the extracted bits, preserving the sign extension in the upper bits
+ high |= high_bits;
+ }
+
+ // 2. Extract the low bytes
+ // Stop byte of the low bytes
+ const int32_t low_bits_offset = std::min(length, 8);
+ const auto low_bits =
+ UInt64FromBigEndian(bytes + high_bits_offset, length - high_bits_offset);
+
+ if (low_bits_offset == 8) {
+ // Avoid undefined shift by 64 below
+ low = low_bits;
+ } else {
+ // Sign extend the low bits if necessary
+ low = -1 * (is_negative && length < 8);
+ // Shift left enough bits to make room for the incoming bits
+ low = SafeLeftShift(low, low_bits_offset * CHAR_BIT);
+ // OR in the extracted bits, preserving the sign extension in the upper bits
+ low |= low_bits;
+ }
+
+ return Decimal128(high, static_cast<uint64_t>(low));
+}
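+
+// Worked example (illustrative only): bytes = {0x80, 0x01} with length = 2 has
+// the sign bit set, so both words sign-extend; the low bits become
+// 0xFFFF...8001 and the result is the two's-complement value -32767.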
+
+Status Decimal128::ToArrowStatus(DecimalStatus dstatus) const {
+ return arrow::ToArrowStatus(dstatus, 128);
+}
+
+std::ostream& operator<<(std::ostream& os, const Decimal128& decimal) {
+ os << decimal.ToIntegerString();
+ return os;
+}
+
+Decimal256::Decimal256(const std::string& str) : Decimal256() {
+ *this = Decimal256::FromString(str).ValueOrDie();
+}
+
+std::string Decimal256::ToIntegerString() const {
+ std::string result;
+ if (static_cast<int64_t>(little_endian_array()[3]) < 0) {
+ result.push_back('-');
+ Decimal256 abs = *this;
+ abs.Negate();
+ AppendLittleEndianArrayToString(abs.little_endian_array(), &result);
+ } else {
+ AppendLittleEndianArrayToString(little_endian_array(), &result);
+ }
+ return result;
+}
+
+std::string Decimal256::ToString(int32_t scale) const {
+ std::string str(ToIntegerString());
+ AdjustIntegerStringWithScale(scale, &str);
+ return str;
+}
+
+Status Decimal256::FromString(const util::string_view& s, Decimal256* out,
+ int32_t* precision, int32_t* scale) {
+ if (s.empty()) {
+ return Status::Invalid("Empty string cannot be converted to decimal");
+ }
+
+ DecimalComponents dec;
+ if (!ParseDecimalComponents(s.data(), s.size(), &dec)) {
+ return Status::Invalid("The string '", s, "' is not a valid decimal number");
+ }
+
+ // Count number of significant digits (without leading zeros)
+ size_t first_non_zero = dec.whole_digits.find_first_not_of('0');
+ size_t significant_digits = dec.fractional_digits.size();
+ if (first_non_zero != std::string::npos) {
+ significant_digits += dec.whole_digits.size() - first_non_zero;
+ }
+
+ if (precision != nullptr) {
+ *precision = static_cast<int32_t>(significant_digits);
+ }
+
+ if (scale != nullptr) {
+ if (dec.has_exponent) {
+ auto adjusted_exponent = dec.exponent;
+ auto len = static_cast<int32_t>(significant_digits);
+ *scale = -adjusted_exponent + len - 1;
+ } else {
+ *scale = static_cast<int32_t>(dec.fractional_digits.size());
+ }
+ }
+
+ if (out != nullptr) {
+ std::array<uint64_t, 4> little_endian_array = {0, 0, 0, 0};
+ ShiftAndAdd(dec.whole_digits, little_endian_array.data(), little_endian_array.size());
+ ShiftAndAdd(dec.fractional_digits, little_endian_array.data(),
+ little_endian_array.size());
+ *out = Decimal256(little_endian_array);
+
+ if (dec.sign == '-') {
+ out->Negate();
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Decimal256::FromString(const std::string& s, Decimal256* out, int32_t* precision,
+ int32_t* scale) {
+ return FromString(util::string_view(s), out, precision, scale);
+}
+
+Status Decimal256::FromString(const char* s, Decimal256* out, int32_t* precision,
+ int32_t* scale) {
+ return FromString(util::string_view(s), out, precision, scale);
+}
+
+Result<Decimal256> Decimal256::FromString(const util::string_view& s) {
+ Decimal256 out;
+ RETURN_NOT_OK(FromString(s, &out, nullptr, nullptr));
+ return std::move(out);
+}
+
+Result<Decimal256> Decimal256::FromString(const std::string& s) {
+ return FromString(util::string_view(s));
+}
+
+Result<Decimal256> Decimal256::FromString(const char* s) {
+ return FromString(util::string_view(s));
+}
+
+Result<Decimal256> Decimal256::FromBigEndian(const uint8_t* bytes, int32_t length) {
+ static constexpr int32_t kMinDecimalBytes = 1;
+ static constexpr int32_t kMaxDecimalBytes = 32;
+
+ std::array<uint64_t, 4> little_endian_array;
+
+ if (ARROW_PREDICT_FALSE(length < kMinDecimalBytes || length > kMaxDecimalBytes)) {
+ return Status::Invalid("Length of byte array passed to Decimal128::FromBigEndian ",
+ "was ", length, ", but must be between ", kMinDecimalBytes,
+ " and ", kMaxDecimalBytes);
+ }
+
+ // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the
+ // sign bit.
+ const bool is_negative = static_cast<int8_t>(bytes[0]) < 0;
+
+ for (int word_idx = 0; word_idx < 4; word_idx++) {
+ const int32_t word_length = std::min(length, static_cast<int32_t>(sizeof(uint64_t)));
+
+ if (word_length == 8) {
+ // Full words can be assigned as-is (the shift below would be UB for a full word).
+ little_endian_array[word_idx] =
+ UInt64FromBigEndian(bytes + length - word_length, word_length);
+ } else {
+ // Sign extend the word if necessary
+ uint64_t word = -1 * is_negative;
+ if (length > 0) {
+ // Incorporate the actual values if present.
+ // Shift left enough bits to make room for the incoming bytes
+ word = SafeLeftShift(word, word_length * CHAR_BIT);
+ // OR in the extracted bits, preserving the sign extension in the upper bits
+ word |= UInt64FromBigEndian(bytes + length - word_length, word_length);
+ }
+ little_endian_array[word_idx] = word;
+ }
+ // Move on to the next word.
+ length -= word_length;
+ }
+
+ return Decimal256(little_endian_array);
+}
+
+Status Decimal256::ToArrowStatus(DecimalStatus dstatus) const {
+ return arrow::ToArrowStatus(dstatus, 256);
+}
+
+namespace {
+
+template <typename Real, typename Derived>
+struct Decimal256RealConversion {
+ static Result<Decimal256> FromPositiveReal(Real real, int32_t precision,
+ int32_t scale) {
+ auto x = real;
+ if (scale >= -76 && scale <= 76) {
+ x *= Derived::powers_of_ten()[scale + 76];
+ } else {
+ x *= std::pow(static_cast<Real>(10), static_cast<Real>(scale));
+ }
+ x = std::nearbyint(x);
+ const auto max_abs = Derived::powers_of_ten()[precision + 76];
+ if (x >= max_abs) {
+ return Status::Invalid("Cannot convert ", real,
+ " to Decimal256(precision = ", precision,
+ ", scale = ", scale, "): overflow");
+ }
+ // Extract parts
+ const auto part3 = std::floor(std::ldexp(x, -192));
+ x -= std::ldexp(part3, 192);
+ const auto part2 = std::floor(std::ldexp(x, -128));
+ x -= std::ldexp(part2, 128);
+ const auto part1 = std::floor(std::ldexp(x, -64));
+ x -= std::ldexp(part1, 64);
+ const auto part0 = x;
+
+ DCHECK_GE(part3, 0);
+ DCHECK_LT(part3, 1.8446744073709552e+19); // 2**64
+ DCHECK_GE(part2, 0);
+ DCHECK_LT(part2, 1.8446744073709552e+19); // 2**64
+ DCHECK_GE(part1, 0);
+ DCHECK_LT(part1, 1.8446744073709552e+19); // 2**64
+ DCHECK_GE(part0, 0);
+ DCHECK_LT(part0, 1.8446744073709552e+19); // 2**64
+ return Decimal256(std::array<uint64_t, 4>{
+ static_cast<uint64_t>(part0), static_cast<uint64_t>(part1),
+ static_cast<uint64_t>(part2), static_cast<uint64_t>(part3)});
+ }
+
+ static Result<Decimal256> FromReal(Real x, int32_t precision, int32_t scale) {
+ DCHECK_GT(precision, 0);
+ DCHECK_LE(precision, 76);
+
+ if (!std::isfinite(x)) {
+ return Status::Invalid("Cannot convert ", x, " to Decimal256");
+ }
+ if (x < 0) {
+ ARROW_ASSIGN_OR_RAISE(auto dec, FromPositiveReal(-x, precision, scale));
+ return dec.Negate();
+ } else {
+ // Includes negative zero
+ return FromPositiveReal(x, precision, scale);
+ }
+ }
+
+ static Real ToRealPositive(const Decimal256& decimal, int32_t scale) {
+ DCHECK_GE(decimal, 0);
+ Real x = 0;
+ const auto& parts = decimal.little_endian_array();
+ x += Derived::two_to_192(static_cast<Real>(parts[3]));
+ x += Derived::two_to_128(static_cast<Real>(parts[2]));
+ x += Derived::two_to_64(static_cast<Real>(parts[1]));
+ x += static_cast<Real>(parts[0]);
+ if (scale >= -76 && scale <= 76) {
+ x *= Derived::powers_of_ten()[-scale + 76];
+ } else {
+ x *= std::pow(static_cast<Real>(10), static_cast<Real>(-scale));
+ }
+ return x;
+ }
+
+ static Real ToReal(Decimal256 decimal, int32_t scale) {
+ if (decimal.little_endian_array()[3] & (1ULL << 63)) {
+ // Convert the absolute value to avoid precision loss
+ decimal.Negate();
+ return -ToRealPositive(decimal, scale);
+ } else {
+ return ToRealPositive(decimal, scale);
+ }
+ }
+};
+
+struct Decimal256FloatConversion
+ : public Decimal256RealConversion<float, Decimal256FloatConversion> {
+ static constexpr const float* powers_of_ten() { return kFloatPowersOfTen76; }
+
+ static float two_to_64(float x) { return x * 1.8446744e+19f; }
+ static float two_to_128(float x) { return x == 0 ? 0 : INFINITY; }
+ static float two_to_192(float x) { return x == 0 ? 0 : INFINITY; }
+};
+
+struct Decimal256DoubleConversion
+ : public Decimal256RealConversion<double, Decimal256DoubleConversion> {
+ static constexpr const double* powers_of_ten() { return kDoublePowersOfTen76; }
+
+ static double two_to_64(double x) { return x * 1.8446744073709552e+19; }
+ static double two_to_128(double x) { return x * 3.402823669209385e+38; }
+ static double two_to_192(double x) { return x * 6.277101735386681e+57; }
+};
+
+} // namespace
+
+Result<Decimal256> Decimal256::FromReal(float x, int32_t precision, int32_t scale) {
+ return Decimal256FloatConversion::FromReal(x, precision, scale);
+}
+
+Result<Decimal256> Decimal256::FromReal(double x, int32_t precision, int32_t scale) {
+ return Decimal256DoubleConversion::FromReal(x, precision, scale);
+}
+
+float Decimal256::ToFloat(int32_t scale) const {
+ return Decimal256FloatConversion::ToReal(*this, scale);
+}
+
+double Decimal256::ToDouble(int32_t scale) const {
+ return Decimal256DoubleConversion::ToReal(*this, scale);
+}
+
+std::ostream& operator<<(std::ostream& os, const Decimal256& decimal) {
+ os << decimal.ToIntegerString();
+ return os;
+}
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h
new file mode 100644
index 00000000000..4a158728833
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h
@@ -0,0 +1,291 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/basic_decimal.h"
+#include "arrow/util/string_view.h"
+
+namespace arrow {
+
+/// Represents a signed 128-bit integer in two's complement.
+/// Calculations wrap around and overflow is ignored.
+/// The max decimal precision that can be safely represented is
+/// 38 significant digits.
+///
+/// For a discussion of the algorithms, see Knuth's volume 2
+/// (Seminumerical Algorithms), section 4.3.1.
+///
+/// Adapted from the Apache ORC C++ implementation
+///
+/// The implementation is split into two parts:
+///
+/// 1. BasicDecimal128
+/// - can be safely compiled to IR without references to libstdc++.
+/// 2. Decimal128
+/// - has additional functionality on top of BasicDecimal128 to deal with
+/// strings and streams.
+class ARROW_EXPORT Decimal128 : public BasicDecimal128 {
+ public:
+ /// \cond FALSE
+ // (need to avoid a duplicate definition in Sphinx)
+ using BasicDecimal128::BasicDecimal128;
+ /// \endcond
+
+ /// \brief Constructor creates a Decimal128 from a BasicDecimal128.
+ constexpr Decimal128(const BasicDecimal128& value) noexcept // NOLINT runtime/explicit
+ : BasicDecimal128(value) {}
+
+ /// \brief Parse the number from a base 10 string representation.
+ explicit Decimal128(const std::string& value);
+
+ /// \brief Empty constructor creates a Decimal128 with a value of 0.
+ // This is required on some older compilers.
+ constexpr Decimal128() noexcept : BasicDecimal128() {}
+
+ /// Divide this number by the given divisor and return the result.
+ ///
+ /// This operation is not destructive.
+ /// The quotient rounds toward zero and the remainder has the sign of the
+ /// dividend. Signs work like:
+ ///   21 / 5 -> 4, 1
+ ///   -21 / 5 -> -4, -1
+ ///   21 / -5 -> -4, 1
+ ///   -21 / -5 -> 4, -1
+ /// \param[in] divisor the number to divide by
+ /// \return the pair of the quotient and the remainder
+ Result<std::pair<Decimal128, Decimal128>> Divide(const Decimal128& divisor) const {
+ std::pair<Decimal128, Decimal128> result;
+ auto dstatus = BasicDecimal128::Divide(divisor, &result.first, &result.second);
+ ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
+ return std::move(result);
+ }
+
+ /// \brief Convert the Decimal128 value to a base 10 decimal string with the given
+ /// scale.
+ std::string ToString(int32_t scale) const;
+
+ /// \brief Convert the value to an integer string
+ std::string ToIntegerString() const;
+
+ /// \brief Cast this value to an int64_t.
+ explicit operator int64_t() const;
+
+ /// \brief Convert a decimal string to a Decimal128 value, and optionally return
+ /// the precision and scale through any non-null out-parameters.
+ static Status FromString(const util::string_view& s, Decimal128* out,
+ int32_t* precision, int32_t* scale = NULLPTR);
+ static Status FromString(const std::string& s, Decimal128* out, int32_t* precision,
+ int32_t* scale = NULLPTR);
+ static Status FromString(const char* s, Decimal128* out, int32_t* precision,
+ int32_t* scale = NULLPTR);
+ static Result<Decimal128> FromString(const util::string_view& s);
+ static Result<Decimal128> FromString(const std::string& s);
+ static Result<Decimal128> FromString(const char* s);
+
+ static Result<Decimal128> FromReal(double real, int32_t precision, int32_t scale);
+ static Result<Decimal128> FromReal(float real, int32_t precision, int32_t scale);
+
+ /// \brief Convert from a big-endian byte representation. The length must be
+ /// between 1 and 16.
+ /// \return error status if the length is an invalid value
+ static Result<Decimal128> FromBigEndian(const uint8_t* data, int32_t length);
+
+ /// \brief Convert Decimal128 from one scale to another
+ Result<Decimal128> Rescale(int32_t original_scale, int32_t new_scale) const {
+ Decimal128 out;
+ auto dstatus = BasicDecimal128::Rescale(original_scale, new_scale, &out);
+ ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
+ return std::move(out);
+ }
+
+ /// \brief Convert to a signed integer
+ template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
+ Result<T> ToInteger() const {
+ constexpr auto min_value = std::numeric_limits<T>::min();
+ constexpr auto max_value = std::numeric_limits<T>::max();
+ const auto& self = *this;
+ if (self < min_value || self > max_value) {
+ return Status::Invalid("Invalid cast from Decimal128 to ", sizeof(T),
+ " byte integer");
+ }
+ return static_cast<T>(low_bits());
+ }
+
+ /// \brief Convert to a signed integer
+ template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
+ Status ToInteger(T* out) const {
+ return ToInteger<T>().Value(out);
+ }
+
+ /// \brief Convert to a floating-point number (scaled)
+ float ToFloat(int32_t scale) const;
+ /// \brief Convert to a floating-point number (scaled)
+ double ToDouble(int32_t scale) const;
+
+ /// \brief Convert to a floating-point number (scaled)
+ template <typename T>
+ T ToReal(int32_t scale) const {
+ return ToRealConversion<T>::ToReal(*this, scale);
+ }
+
+ friend ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
+ const Decimal128& decimal);
+
+ private:
+ /// Converts internal error code to Status
+ Status ToArrowStatus(DecimalStatus dstatus) const;
+
+ template <typename T>
+ struct ToRealConversion {};
+};
+
+template <>
+struct Decimal128::ToRealConversion<float> {
+ static float ToReal(const Decimal128& dec, int32_t scale) { return dec.ToFloat(scale); }
+};
+
+template <>
+struct Decimal128::ToRealConversion<double> {
+ static double ToReal(const Decimal128& dec, int32_t scale) {
+ return dec.ToDouble(scale);
+ }
+};
+
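+// Editor's sketch (illustrative only, not part of the API above): parse,
+// rescale and print a decimal value; error handling elided.
+//
+//   ARROW_ASSIGN_OR_RAISE(Decimal128 d, Decimal128::FromString("123.45"));
+//   // "123.45" parses with precision 5 and scale 2.
+//   ARROW_ASSIGN_OR_RAISE(Decimal128 r, d.Rescale(/*original_scale=*/2,
+//                                                 /*new_scale=*/4));
+//   std::string s = r.ToString(/*scale=*/4);  // "123.4500"
+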
+/// Represents a signed 256-bit integer in two's complement.
+/// The max decimal precision that can be safely represented is
+/// 76 significant digits.
+///
+/// The implementation is split into two parts:
+///
+/// 1. BasicDecimal256
+/// - can be safely compiled to IR without references to libstdc++.
+/// 2. Decimal256
+/// - (TODO) has additional functionality on top of BasicDecimal256 to deal with
+/// strings and streams.
+class ARROW_EXPORT Decimal256 : public BasicDecimal256 {
+ public:
+ /// \cond FALSE
+ // (need to avoid a duplicate definition in Sphinx)
+ using BasicDecimal256::BasicDecimal256;
+ /// \endcond
+
+ /// \brief Constructor creates a Decimal256 from a BasicDecimal256.
+ constexpr Decimal256(const BasicDecimal256& value) noexcept : BasicDecimal256(value) {}
+
+ /// \brief Parse the number from a base 10 string representation.
+ explicit Decimal256(const std::string& value);
+
+ /// \brief Empty constructor creates a Decimal256 with a value of 0.
+ // This is required on some older compilers.
+ constexpr Decimal256() noexcept : BasicDecimal256() {}
+
+ /// \brief Convert the Decimal256 value to a base 10 decimal string with the given
+ /// scale.
+ std::string ToString(int32_t scale) const;
+
+ /// \brief Convert the value to an integer string
+ std::string ToIntegerString() const;
+
+ /// \brief Convert a decimal string to a Decimal256 value, and optionally return
+ /// the precision and scale through any non-null out-parameters.
+ static Status FromString(const util::string_view& s, Decimal256* out,
+ int32_t* precision, int32_t* scale = NULLPTR);
+ static Status FromString(const std::string& s, Decimal256* out, int32_t* precision,
+ int32_t* scale = NULLPTR);
+ static Status FromString(const char* s, Decimal256* out, int32_t* precision,
+ int32_t* scale = NULLPTR);
+ static Result<Decimal256> FromString(const util::string_view& s);
+ static Result<Decimal256> FromString(const std::string& s);
+ static Result<Decimal256> FromString(const char* s);
+
+ /// \brief Convert Decimal256 from one scale to another
+ Result<Decimal256> Rescale(int32_t original_scale, int32_t new_scale) const {
+ Decimal256 out;
+ auto dstatus = BasicDecimal256::Rescale(original_scale, new_scale, &out);
+ ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
+ return std::move(out);
+ }
+
+ /// Divide this number by the given divisor and return the result.
+ ///
+ /// This operation is not destructive.
+ /// The quotient rounds toward zero and the remainder has the sign of the
+ /// dividend. Signs work like:
+ ///   21 / 5 -> 4, 1
+ ///   -21 / 5 -> -4, -1
+ ///   21 / -5 -> -4, 1
+ ///   -21 / -5 -> 4, -1
+ /// \param[in] divisor the number to divide by
+ /// \return the pair of the quotient and the remainder
+ Result<std::pair<Decimal256, Decimal256>> Divide(const Decimal256& divisor) const {
+ std::pair<Decimal256, Decimal256> result;
+ auto dstatus = BasicDecimal256::Divide(divisor, &result.first, &result.second);
+ ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
+ return std::move(result);
+ }
+
+ /// \brief Convert from a big-endian byte representation. The length must be
+ /// between 1 and 32.
+ /// \return error status if the length is an invalid value
+ static Result<Decimal256> FromBigEndian(const uint8_t* data, int32_t length);
+
+ static Result<Decimal256> FromReal(double real, int32_t precision, int32_t scale);
+ static Result<Decimal256> FromReal(float real, int32_t precision, int32_t scale);
+
+ /// \brief Convert to a floating-point number (scaled).
+ /// May return infinity in case of overflow.
+ float ToFloat(int32_t scale) const;
+ /// \brief Convert to a floating-point number (scaled)
+ double ToDouble(int32_t scale) const;
+
+ /// \brief Convert to a floating-point number (scaled)
+ template <typename T>
+ T ToReal(int32_t scale) const {
+ return ToRealConversion<T>::ToReal(*this, scale);
+ }
+
+ friend ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
+ const Decimal256& decimal);
+
+ private:
+ /// Converts internal error code to Status
+ Status ToArrowStatus(DecimalStatus dstatus) const;
+
+ template <typename T>
+ struct ToRealConversion {};
+};
+
+template <>
+struct Decimal256::ToRealConversion<float> {
+ static float ToReal(const Decimal256& dec, int32_t scale) { return dec.ToFloat(scale); }
+};
+
+template <>
+struct Decimal256::ToRealConversion<double> {
+ static double ToReal(const Decimal256& dec, int32_t scale) {
+ return dec.ToDouble(scale);
+ }
+};
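+
+// Editor's note: Decimal256::ToReal<T> routes through the specializations
+// above, e.g.
+//
+//   double v = dec.ToReal<double>(/*scale=*/2);  // same as dec.ToDouble(2)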
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc
new file mode 100644
index 00000000000..fe1b6ea3126
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc
@@ -0,0 +1,193 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/delimiting.h"
+#include "arrow/buffer.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+BoundaryFinder::~BoundaryFinder() {}
+
+namespace {
+
+Status StraddlingTooLarge() {
+ return Status::Invalid(
+ "straddling object straddles two block boundaries (try to increase block size?)");
+}
+
+class NewlineBoundaryFinder : public BoundaryFinder {
+ public:
+ Status FindFirst(util::string_view partial, util::string_view block,
+ int64_t* out_pos) override {
+ auto pos = block.find_first_of(newline_delimiters);
+ if (pos == util::string_view::npos) {
+ *out_pos = kNoDelimiterFound;
+ } else {
+ auto end = block.find_first_not_of(newline_delimiters, pos);
+ if (end == util::string_view::npos) {
+ end = block.length();
+ }
+ *out_pos = static_cast<int64_t>(end);
+ }
+ return Status::OK();
+ }
+
+ Status FindLast(util::string_view block, int64_t* out_pos) override {
+ auto pos = block.find_last_of(newline_delimiters);
+ if (pos == util::string_view::npos) {
+ *out_pos = kNoDelimiterFound;
+ } else {
+ auto end = block.find_first_not_of(newline_delimiters, pos);
+ if (end == util::string_view::npos) {
+ end = block.length();
+ }
+ *out_pos = static_cast<int64_t>(end);
+ }
+ return Status::OK();
+ }
+
+ Status FindNth(util::string_view partial, util::string_view block, int64_t count,
+ int64_t* out_pos, int64_t* num_found) override {
+ DCHECK(partial.find_first_of(newline_delimiters) == util::string_view::npos);
+
+ int64_t found = 0;
+ int64_t pos = kNoDelimiterFound;
+
+ auto cur_pos = block.find_first_of(newline_delimiters);
+ while (cur_pos != util::string_view::npos) {
+ if (block[cur_pos] == '\r' && cur_pos + 1 < block.length() &&
+ block[cur_pos + 1] == '\n') {
+ cur_pos += 2;
+ } else {
+ ++cur_pos;
+ }
+
+ pos = static_cast<int64_t>(cur_pos);
+ if (++found >= count) {
+ break;
+ }
+
+ cur_pos = block.find_first_of(newline_delimiters, cur_pos);
+ }
+
+ *out_pos = pos;
+ *num_found = found;
+ return Status::OK();
+ }
+
+ protected:
+ static constexpr const char* newline_delimiters = "\r\n";
+};
+
+} // namespace
+
+std::shared_ptr<BoundaryFinder> MakeNewlineBoundaryFinder() {
+ return std::make_shared<NewlineBoundaryFinder>();
+}
+
+Chunker::~Chunker() {}
+
+Chunker::Chunker(std::shared_ptr<BoundaryFinder> delimiter)
+ : boundary_finder_(delimiter) {}
+
+Status Chunker::Process(std::shared_ptr<Buffer> block, std::shared_ptr<Buffer>* whole,
+ std::shared_ptr<Buffer>* partial) {
+ int64_t last_pos = -1;
+ RETURN_NOT_OK(boundary_finder_->FindLast(util::string_view(*block), &last_pos));
+ if (last_pos == BoundaryFinder::kNoDelimiterFound) {
+ // No delimiter found
+ *whole = SliceBuffer(block, 0, 0);
+ *partial = block;
+ return Status::OK();
+ } else {
+ *whole = SliceBuffer(block, 0, last_pos);
+ *partial = SliceBuffer(block, last_pos);
+ }
+ return Status::OK();
+}
+
+Status Chunker::ProcessWithPartial(std::shared_ptr<Buffer> partial,
+ std::shared_ptr<Buffer> block,
+ std::shared_ptr<Buffer>* completion,
+ std::shared_ptr<Buffer>* rest) {
+ if (partial->size() == 0) {
+ // If partial is empty, don't bother looking for completion
+ *completion = SliceBuffer(block, 0, 0);
+ *rest = block;
+ return Status::OK();
+ }
+ int64_t first_pos = -1;
+ RETURN_NOT_OK(boundary_finder_->FindFirst(util::string_view(*partial),
+ util::string_view(*block), &first_pos));
+ if (first_pos == BoundaryFinder::kNoDelimiterFound) {
+ // No delimiter in block => the current object is too large for block size
+ return StraddlingTooLarge();
+ } else {
+ *completion = SliceBuffer(block, 0, first_pos);
+ *rest = SliceBuffer(block, first_pos);
+ return Status::OK();
+ }
+}
+
+Status Chunker::ProcessFinal(std::shared_ptr<Buffer> partial,
+ std::shared_ptr<Buffer> block,
+ std::shared_ptr<Buffer>* completion,
+ std::shared_ptr<Buffer>* rest) {
+ if (partial->size() == 0) {
+ // If partial is empty, don't bother looking for completion
+ *completion = SliceBuffer(block, 0, 0);
+ *rest = block;
+ return Status::OK();
+ }
+ int64_t first_pos = -1;
+ RETURN_NOT_OK(boundary_finder_->FindFirst(util::string_view(*partial),
+ util::string_view(*block), &first_pos));
+ if (first_pos == BoundaryFinder::kNoDelimiterFound) {
+ // No delimiter in block => it's entirely a completion of partial
+ *completion = block;
+ *rest = SliceBuffer(block, 0, 0);
+ } else {
+ *completion = SliceBuffer(block, 0, first_pos);
+ *rest = SliceBuffer(block, first_pos);
+ }
+ return Status::OK();
+}
+
+Status Chunker::ProcessSkip(std::shared_ptr<Buffer> partial,
+ std::shared_ptr<Buffer> block, bool final, int64_t* count,
+ std::shared_ptr<Buffer>* rest) {
+ DCHECK_GT(*count, 0);
+ int64_t pos;
+ int64_t num_found;
+ ARROW_RETURN_NOT_OK(boundary_finder_->FindNth(
+ util::string_view(*partial), util::string_view(*block), *count, &pos, &num_found));
+ if (pos == BoundaryFinder::kNoDelimiterFound) {
+ return StraddlingTooLarge();
+ }
+ if (ARROW_PREDICT_FALSE(final && *count > num_found && block->size() != pos)) {
+ // Skip the last row in the final block which does not have a delimiter
+ ++num_found;
+ *rest = SliceBuffer(block, 0, 0);
+ } else {
+ *rest = SliceBuffer(block, pos);
+ }
+ *count -= num_found;
+ return Status::OK();
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h
new file mode 100644
index 00000000000..b4b868340db
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h
@@ -0,0 +1,181 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+
+class ARROW_EXPORT BoundaryFinder {
+ public:
+ BoundaryFinder() = default;
+
+ virtual ~BoundaryFinder();
+
+ /// \brief Find the position of the first delimiter inside block
+ ///
+ /// `partial` is taken to be the beginning of the delimited data, and
+ /// `block` its continuation. Also, `partial` doesn't contain a delimiter.
+ ///
+ /// The returned `out_pos` is relative to `block`'s start and should point
+ /// to the first character after the first delimiter.
+ /// `out_pos` will be -1 if no delimiter is found.
+ virtual Status FindFirst(util::string_view partial, util::string_view block,
+ int64_t* out_pos) = 0;
+
+ /// \brief Find the position of the last delimiter inside block
+ ///
+ /// The returned `out_pos` is relative to `block`'s start and should point
+ /// to the first character after the last delimiter.
+ /// `out_pos` will be -1 if no delimiter is found.
+ virtual Status FindLast(util::string_view block, int64_t* out_pos) = 0;
+
+ /// \brief Find the position of the Nth delimiter inside the block
+ ///
+ /// `partial` is taken to be the beginning of the delimited data, and
+ /// `block` its continuation. Also, `partial` doesn't contain a delimiter.
+ ///
+ /// The returned `out_pos` is relative to `block`'s start and should point
+ /// to the first character after the Nth delimiter (or after the last
+ /// delimiter found, if fewer than `count` are present).
+ /// `out_pos` will be -1 if no delimiter is found.
+ ///
+ /// The returned `num_found` is the number of delimiters actually found
+ virtual Status FindNth(util::string_view partial, util::string_view block,
+ int64_t count, int64_t* out_pos, int64_t* num_found) = 0;
+
+ static constexpr int64_t kNoDelimiterFound = -1;
+
+ protected:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(BoundaryFinder);
+};
+
+ARROW_EXPORT
+std::shared_ptr<BoundaryFinder> MakeNewlineBoundaryFinder();
+
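+// Editor's sketch of the contract above, using the newline finder returned
+// by MakeNewlineBoundaryFinder():
+//
+//   block = "abc\ndef\r\ngh"
+//   FindFirst("", block, &pos)  // pos == 4 (just past "abc\n")
+//   FindLast(block, &pos)       // pos == 9 (just past "def\r\n")
+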
+/// \brief A reusable block-based chunker for delimited data
+///
+/// The chunker takes a block of delimited data and helps carve a sub-block
+/// which begins and ends on delimiters (suitable for consumption by parsers
+/// which can only parse whole objects).
+class ARROW_EXPORT Chunker {
+ public:
+ explicit Chunker(std::shared_ptr<BoundaryFinder> delimiter);
+ ~Chunker();
+
+ /// \brief Carve up a chunk in a block of data to contain only whole objects
+ ///
+ /// Pre-conditions:
+ /// - `block` is the start of a valid block of delimited data
+ /// (i.e. starts just after a delimiter)
+ ///
+ /// Post-conditions:
+ /// - block == whole + partial
+ /// - `whole` is a valid block of delimited data
+ /// (i.e. starts just after a delimiter and ends with a delimiter)
+ /// - `partial` doesn't contain an entire delimited object
+ ///   (in other words, `partial` is generally small)
+ ///
+ /// This method will look for the last delimiter in `block` and may
+ /// therefore be costly.
+ ///
+ /// \param[in] block data to be chunked
+ /// \param[out] whole subrange of block containing whole delimited objects
+ /// \param[out] partial subrange of block starting with a partial delimited object
+ Status Process(std::shared_ptr<Buffer> block, std::shared_ptr<Buffer>* whole,
+ std::shared_ptr<Buffer>* partial);
+
+ /// \brief Carve the completion of a partial object out of a block
+ ///
+ /// Pre-conditions:
+ /// - `partial` is the start of a valid block of delimited data
+ /// (i.e. starts just after a delimiter)
+ /// - `block` follows `partial` in file order
+ ///
+ /// Post-conditions:
+ /// - block == completion + rest
+ /// - `partial + completion` is a valid block of delimited data
+ /// (i.e. starts just after a delimiter and ends with a delimiter)
+ /// - `completion` doesn't contain an entire delimited object
+ ///   (in other words, `completion` is generally small)
+ ///
+ /// This method will look for the first delimiter in `block` and should
+ /// therefore be reasonably cheap.
+ ///
+ /// \param[in] partial incomplete delimited data
+ /// \param[in] block delimited data following partial
+ /// \param[out] completion subrange of block containing the completion of partial
+ /// \param[out] rest subrange of block containing what completion does not cover
+ Status ProcessWithPartial(std::shared_ptr<Buffer> partial,
+ std::shared_ptr<Buffer> block,
+ std::shared_ptr<Buffer>* completion,
+ std::shared_ptr<Buffer>* rest);
+
+ /// \brief Like ProcessWithPartial, but for the last block of a file
+ ///
+ /// This method allows for a final delimited object without a trailing delimiter
+ /// (ProcessWithPartial would return an error in that case).
+ ///
+ /// Pre-conditions:
+ /// - `partial` is the start of a valid block of delimited data
+ /// - `block` follows `partial` in file order and is the last data block
+ ///
+ /// Post-conditions:
+ /// - block == completion + rest
+ /// - `partial + completion` is a valid block of delimited data
+ /// - `completion` doesn't contain an entire delimited object
+ ///   (in other words, `completion` is generally small)
+ ///
+ Status ProcessFinal(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
+ std::shared_ptr<Buffer>* completion, std::shared_ptr<Buffer>* rest);
+
+ /// \brief Skip the given number of rows
+ /// Pre-conditions:
+ /// - `partial` is the start of a valid block of delimited data
+ /// (i.e. starts just after a delimiter)
+ /// - `block` follows `partial` in file order
+ ///
+ /// Post-conditions:
+ /// - `count` is updated to indicate the number of rows that still need to be skipped
+ ///  - If `count` is still > 0, `rest` is an incomplete block that should become a
+ ///    future `partial`
+ ///  - Otherwise, `rest` holds one or more valid blocks of delimited data that still
+ ///    need to be parsed
+ ///
+ /// \param[in] partial incomplete delimited data
+ /// \param[in] block delimited data following partial
+ /// \param[in] final whether this is the final chunk
+ /// \param[in,out] count number of rows that need to be skipped
+ /// \param[out] rest subrange of block containing what was not skipped
+ Status ProcessSkip(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
+ bool final, int64_t* count, std::shared_ptr<Buffer>* rest);
+
+ protected:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker);
+
+ std::shared_ptr<BoundaryFinder> boundary_finder_;
+};
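+
+// Editor's sketch of a typical driver loop; `ReadNextBlock` and `ParseWhole`
+// are hypothetical placeholders, not part of this header:
+//
+//   Chunker chunker(MakeNewlineBoundaryFinder());
+//   std::shared_ptr<Buffer> partial = /* empty */, block, completion, rest, whole;
+//   while ((block = ReadNextBlock()) != nullptr) {
+//     RETURN_NOT_OK(chunker.ProcessWithPartial(partial, block, &completion, &rest));
+//     RETURN_NOT_OK(chunker.Process(rest, &whole, &partial));
+//     ParseWhole(completion, whole);  // partial + completion, then whole objects
+//   }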
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/dispatch.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/dispatch.h
new file mode 100644
index 00000000000..fae9293f9e7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/dispatch.h
@@ -0,0 +1,115 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/util/cpu_info.h"
+
+namespace arrow {
+namespace internal {
+
+enum class DispatchLevel : int {
+ // These dispatch levels, corresponding to instruction set features,
+ // are sorted in increasing order of preference.
+ NONE = 0,
+ SSE4_2,
+ AVX2,
+ AVX512,
+ NEON,
+ MAX
+};
+
+/*
+ A facility for dynamic dispatch according to available DispatchLevel.
+
+ Typical use:
+
+ static void my_function_default(...);
+ static void my_function_avx2(...);
+
+ struct MyDynamicFunction {
+ using FunctionType = decltype(&my_function_default);
+
+ static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+ return {
+ { DispatchLevel::NONE, my_function_default }
+ #if defined(ARROW_HAVE_RUNTIME_AVX2)
+ , { DispatchLevel::AVX2, my_function_avx2 }
+ #endif
+ };
+ }
+ };
+
+ void my_function(...) {
+ static DynamicDispatch<MyDynamicFunction> dispatch;
+ return dispatch.func(...);
+ }
+*/
+template <typename DynamicFunction>
+class DynamicDispatch {
+ protected:
+ using FunctionType = typename DynamicFunction::FunctionType;
+ using Implementation = std::pair<DispatchLevel, FunctionType>;
+
+ public:
+ DynamicDispatch() { Resolve(DynamicFunction::implementations()); }
+
+ FunctionType func = {};
+
+ protected:
+ // Use the Implementation with the highest DispatchLevel
+ void Resolve(const std::vector<Implementation>& implementations) {
+ Implementation cur{DispatchLevel::NONE, {}};
+
+ for (const auto& impl : implementations) {
+ if (impl.first >= cur.first && IsSupported(impl.first)) {
+ // Higher (or same) level than current
+ cur = impl;
+ }
+ }
+
+ if (!cur.second) {
+ Status::Invalid("No appropriate implementation found").Abort();
+ }
+ func = cur.second;
+ }
+
+ private:
+ bool IsSupported(DispatchLevel level) const {
+ static const auto cpu_info = arrow::internal::CpuInfo::GetInstance();
+
+ switch (level) {
+ case DispatchLevel::NONE:
+ return true;
+ case DispatchLevel::SSE4_2:
+ return cpu_info->IsSupported(CpuInfo::SSE4_2);
+ case DispatchLevel::AVX2:
+ return cpu_info->IsSupported(CpuInfo::AVX2);
+ case DispatchLevel::AVX512:
+ return cpu_info->IsSupported(CpuInfo::AVX512);
+ default:
+ return false;
+ }
+ }
+};
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/double_conversion.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/double_conversion.h
new file mode 100644
index 00000000000..bd99c0618db
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/double_conversion.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "contrib/libs/double-conversion/double-conversion.h" // IWYU pragma: export
+
+namespace arrow {
+namespace util {
+namespace double_conversion {
+
+using ::double_conversion::DoubleToStringConverter;
+using ::double_conversion::StringBuilder;
+using ::double_conversion::StringToDoubleConverter;
+
+} // namespace double_conversion
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h
new file mode 100644
index 00000000000..0cb2e44d275
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h
@@ -0,0 +1,181 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef _WIN32
+#define ARROW_LITTLE_ENDIAN 1
+#else
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <machine/endian.h> // IWYU pragma: keep
+#elif defined(sun) || defined(__sun)
+#include <sys/byteorder.h> // IWYU pragma: keep
+#else
+#include <endian.h> // IWYU pragma: keep
+#endif
+#
+#ifndef __BYTE_ORDER__
+#error "__BYTE_ORDER__ not defined"
+#endif
+#
+#ifndef __ORDER_LITTLE_ENDIAN__
+#error "__ORDER_LITTLE_ENDIAN__ not defined"
+#endif
+#
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define ARROW_LITTLE_ENDIAN 1
+#else
+#define ARROW_LITTLE_ENDIAN 0
+#endif
+#endif
+
+#if defined(_MSC_VER)
+#include <intrin.h> // IWYU pragma: keep
+#define ARROW_BYTE_SWAP64 _byteswap_uint64
+#define ARROW_BYTE_SWAP32 _byteswap_ulong
+#else
+#define ARROW_BYTE_SWAP64 __builtin_bswap64
+#define ARROW_BYTE_SWAP32 __builtin_bswap32
+#endif
+
+#include "arrow/util/type_traits.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace BitUtil {
+
+//
+// Byte-swap 16-bit, 32-bit and 64-bit values
+//
+
+// Swap the byte order (i.e. endianness)
+static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); }
+static inline uint64_t ByteSwap(uint64_t value) {
+ return static_cast<uint64_t>(ARROW_BYTE_SWAP64(value));
+}
+static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); }
+static inline uint32_t ByteSwap(uint32_t value) {
+ return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
+}
+static inline int16_t ByteSwap(int16_t value) {
+ constexpr auto m = static_cast<int16_t>(0xff);
+ return static_cast<int16_t>(((value >> 8) & m) | ((value & m) << 8));
+}
+static inline uint16_t ByteSwap(uint16_t value) {
+ return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
+}
+static inline uint8_t ByteSwap(uint8_t value) { return value; }
+static inline int8_t ByteSwap(int8_t value) { return value; }
+static inline double ByteSwap(double value) {
+ const uint64_t swapped = ARROW_BYTE_SWAP64(util::SafeCopy<uint64_t>(value));
+ return util::SafeCopy<double>(swapped);
+}
+static inline float ByteSwap(float value) {
+ const uint32_t swapped = ARROW_BYTE_SWAP32(util::SafeCopy<uint32_t>(value));
+ return util::SafeCopy<float>(swapped);
+}
+
+// Write the swapped bytes into dst. src and dst must not overlap.
+static inline void ByteSwap(void* dst, const void* src, int len) {
+ switch (len) {
+ case 1:
+ *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
+ return;
+ case 2:
+ *reinterpret_cast<int16_t*>(dst) = ByteSwap(*reinterpret_cast<const int16_t*>(src));
+ return;
+ case 4:
+ *reinterpret_cast<int32_t*>(dst) = ByteSwap(*reinterpret_cast<const int32_t*>(src));
+ return;
+ case 8:
+ *reinterpret_cast<int64_t*>(dst) = ByteSwap(*reinterpret_cast<const int64_t*>(src));
+ return;
+ default:
+ break;
+ }
+
+ auto d = reinterpret_cast<uint8_t*>(dst);
+ auto s = reinterpret_cast<const uint8_t*>(src);
+ for (int i = 0; i < len; ++i) {
+ d[i] = s[len - i - 1];
+ }
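+ // Editor's note: 1.8446744e+19f is 2^64; 2^128 and 2^192 exceed float's
+ // maximum (~3.4e38), so any nonzero value saturates to infinity.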
+}
+
+// Convert to little/big endian format from the machine's native endian format.
+#if ARROW_LITTLE_ENDIAN
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToBigEndian(T value) {
+ return ByteSwap(value);
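+ // Editor's note: the constants are 2^64, 2^128 and 2^192, all powers of two
+ // that are exactly representable within double's range.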
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToLittleEndian(T value) {
+ return value;
+}
+#else
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToBigEndian(T value) {
+ return value;
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToLittleEndian(T value) {
+ return ByteSwap(value);
+}
+#endif
+
+// Convert from big/little endian format to the machine's native endian format.
+#if ARROW_LITTLE_ENDIAN
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromBigEndian(T value) {
+ return ByteSwap(value);
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromLittleEndian(T value) {
+ return value;
+}
+#else
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromBigEndian(T value) {
+ return value;
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromLittleEndian(T value) {
+ return ByteSwap(value);
+}
+#endif
+
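+// Editor's sketch: on a little-endian machine,
+//
+//   uint32_t x = 0x01020304;
+//   uint32_t be = ToBigEndian(x);    // stored bytes reversed: 0x04030201
+//   uint32_t rt = FromBigEndian(be); // == x again
+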
+} // namespace BitUtil
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc
new file mode 100644
index 00000000000..c16d42ce5cf
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/formatting.h"
+#include "arrow/util/config.h"
+#include "arrow/util/double_conversion.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using util::double_conversion::DoubleToStringConverter;
+
+static constexpr int kMinBufferSize = DoubleToStringConverter::kBase10MaximalLength + 1;
+
+namespace internal {
+namespace detail {
+
+const char digit_pairs[] =
+ "0001020304050607080910111213141516171819"
+ "2021222324252627282930313233343536373839"
+ "4041424344454647484950515253545556575859"
+ "6061626364656667686970717273747576777879"
+ "8081828384858687888990919293949596979899";
+
+} // namespace detail
+
+struct FloatToStringFormatter::Impl {
+ Impl()
+ : converter_(DoubleToStringConverter::EMIT_POSITIVE_EXPONENT_SIGN, "inf", "nan",
+ 'e', -6, 10, 6, 0) {}
+
+ Impl(int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
+ int decimal_in_shortest_low, int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode)
+ : converter_(flags, inf_symbol, nan_symbol, exp_character, decimal_in_shortest_low,
+ decimal_in_shortest_high, max_leading_padding_zeroes_in_precision_mode,
+ max_trailing_padding_zeroes_in_precision_mode) {}
+
+ DoubleToStringConverter converter_;
+};
+
+FloatToStringFormatter::FloatToStringFormatter() : impl_(new Impl()) {}
+
+FloatToStringFormatter::FloatToStringFormatter(
+ int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
+ int decimal_in_shortest_low, int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode)
+ : impl_(new Impl(flags, inf_symbol, nan_symbol, exp_character,
+ decimal_in_shortest_low, decimal_in_shortest_high,
+ max_leading_padding_zeroes_in_precision_mode,
+ max_trailing_padding_zeroes_in_precision_mode)) {}
+
+FloatToStringFormatter::~FloatToStringFormatter() {}
+
+int FloatToStringFormatter::FormatFloat(float v, char* out_buffer, int out_size) {
+ DCHECK_GE(out_size, kMinBufferSize);
+ // StringBuilder checks bounds in debug mode for us
+ util::double_conversion::StringBuilder builder(out_buffer, out_size);
+ bool result = impl_->converter_.ToShortestSingle(v, &builder);
+ DCHECK(result);
+ ARROW_UNUSED(result);
+ return builder.position();
+}
+
+int FloatToStringFormatter::FormatFloat(double v, char* out_buffer, int out_size) {
+ DCHECK_GE(out_size, kMinBufferSize);
+ util::double_conversion::StringBuilder builder(out_buffer, out_size);
+ bool result = impl_->converter_.ToShortest(v, &builder);
+ DCHECK(result);
+ ARROW_UNUSED(result);
+ return builder.position();
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h
new file mode 100644
index 00000000000..566c9795f83
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h
@@ -0,0 +1,426 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This is a private header for number-to-string formatting utilities
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <chrono>
+#include <limits>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/double_conversion.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/time.h"
+#include "arrow/util/visibility.h"
+#include "arrow/vendored/datetime.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief The entry point for conversion to strings.
+template <typename ARROW_TYPE, typename Enable = void>
+class StringFormatter;
+
+template <typename T>
+struct is_formattable {
+ template <typename U, typename = typename StringFormatter<U>::value_type>
+ static std::true_type Test(U*);
+
+ template <typename U>
+ static std::false_type Test(...);
+
+ static constexpr bool value = decltype(Test<T>(NULLPTR))::value;
+};
+
+template <typename T, typename R = void>
+using enable_if_formattable = enable_if_t<is_formattable<T>::value, R>;
+
+template <typename Appender>
+using Return = decltype(std::declval<Appender>()(util::string_view{}));
+
+/////////////////////////////////////////////////////////////////////////
+// Boolean formatting
+
+template <>
+class StringFormatter<BooleanType> {
+ public:
+ explicit StringFormatter(const std::shared_ptr<DataType>& = NULLPTR) {}
+
+ using value_type = bool;
+
+ template <typename Appender>
+ Return<Appender> operator()(bool value, Appender&& append) {
+ if (value) {
+ const char string[] = "true";
+ return append(util::string_view(string));
+ } else {
+ const char string[] = "false";
+ return append(util::string_view(string));
+ }
+ }
+};
+
+/////////////////////////////////////////////////////////////////////////
+// Integer formatting
+
+namespace detail {
+
+// A 2x100 direct table mapping integers in [0..99] to their decimal representations.
+ARROW_EXPORT extern const char digit_pairs[];
+
+// Based on fmtlib's format_int class:
+// Write digits from right to left into a stack-allocated buffer
+inline void FormatOneChar(char c, char** cursor) { *--*cursor = c; }
+
+template <typename Int>
+void FormatOneDigit(Int value, char** cursor) {
+ assert(value >= 0 && value <= 9);
+ FormatOneChar(static_cast<char>('0' + value), cursor);
+}
+
+template <typename Int>
+void FormatTwoDigits(Int value, char** cursor) {
+ assert(value >= 0 && value <= 99);
+ auto digit_pair = &digit_pairs[value * 2];
+ FormatOneChar(digit_pair[1], cursor);
+ FormatOneChar(digit_pair[0], cursor);
+}
+
+template <typename Int>
+void FormatAllDigits(Int value, char** cursor) {
+ assert(value >= 0);
+ while (value >= 100) {
+ FormatTwoDigits(value % 100, cursor);
+ value /= 100;
+ }
+
+ if (value >= 10) {
+ FormatTwoDigits(value, cursor);
+ } else {
+ FormatOneDigit(value, cursor);
+ }
+}
+
+template <typename Int>
+void FormatAllDigitsLeftPadded(Int value, size_t pad, char pad_char, char** cursor) {
+ auto end = *cursor - pad;
+ FormatAllDigits(value, cursor);
+ while (*cursor > end) {
+ FormatOneChar(pad_char, cursor);
+ }
+}
+
+template <size_t BUFFER_SIZE>
+util::string_view ViewDigitBuffer(const std::array<char, BUFFER_SIZE>& buffer,
+ char* cursor) {
+ auto buffer_end = buffer.data() + BUFFER_SIZE;
+ return {cursor, static_cast<size_t>(buffer_end - cursor)};
+}
+
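+// Editor's note: compute |value| in the unsigned type; ~v + 1 negates in
+// unsigned arithmetic and is well defined even for the most negative value.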
+template <typename Int, typename UInt = typename std::make_unsigned<Int>::type>
+constexpr UInt Abs(Int value) {
+ return value < 0 ? ~static_cast<UInt>(value) + 1 : static_cast<UInt>(value);
+}
+
+template <typename Int>
+constexpr size_t Digits10(Int value) {
+ return value <= 9 ? 1 : Digits10(value / 10) + 1;
+}
+
+} // namespace detail
+
+template <typename ARROW_TYPE>
+class IntToStringFormatterMixin {
+ public:
+ explicit IntToStringFormatterMixin(const std::shared_ptr<DataType>& = NULLPTR) {}
+
+ using value_type = typename ARROW_TYPE::c_type;
+
+ template <typename Appender>
+ Return<Appender> operator()(value_type value, Appender&& append) {
+ constexpr size_t buffer_size =
+ detail::Digits10(std::numeric_limits<value_type>::max()) + 1;
+
+ std::array<char, buffer_size> buffer;
+ char* cursor = buffer.data() + buffer_size;
+ detail::FormatAllDigits(detail::Abs(value), &cursor);
+ if (value < 0) {
+ detail::FormatOneChar('-', &cursor);
+ }
+ return append(detail::ViewDigitBuffer(buffer, cursor));
+ }
+};
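+
+// Editor's sketch: the mixin hands the digits to an appender callable
+// without heap allocation,
+//
+//   StringFormatter<Int32Type> format;
+//   format(-1234, [](util::string_view v) { /* v == "-1234" */ });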
+
+template <>
+class StringFormatter<Int8Type> : public IntToStringFormatterMixin<Int8Type> {
+ using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<Int16Type> : public IntToStringFormatterMixin<Int16Type> {
+ using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<Int32Type> : public IntToStringFormatterMixin<Int32Type> {
+ using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<Int64Type> : public IntToStringFormatterMixin<Int64Type> {
+ using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<UInt8Type> : public IntToStringFormatterMixin<UInt8Type> {
+ using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<UInt16Type> : public IntToStringFormatterMixin<UInt16Type> {
+ using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<UInt32Type> : public IntToStringFormatterMixin<UInt32Type> {
+ using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<UInt64Type> : public IntToStringFormatterMixin<UInt64Type> {
+ using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+/////////////////////////////////////////////////////////////////////////
+// Floating-point formatting
+
+class ARROW_EXPORT FloatToStringFormatter {
+ public:
+ FloatToStringFormatter();
+ FloatToStringFormatter(int flags, const char* inf_symbol, const char* nan_symbol,
+ char exp_character, int decimal_in_shortest_low,
+ int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode);
+ ~FloatToStringFormatter();
+
+ // Returns the number of characters written
+ int FormatFloat(float v, char* out_buffer, int out_size);
+ int FormatFloat(double v, char* out_buffer, int out_size);
+
+ protected:
+ struct Impl;
+ std::unique_ptr<Impl> impl_;
+};
+
+template <typename ARROW_TYPE>
+class FloatToStringFormatterMixin : public FloatToStringFormatter {
+ public:
+ using value_type = typename ARROW_TYPE::c_type;
+
+ static constexpr int buffer_size = 50;
+
+ explicit FloatToStringFormatterMixin(const std::shared_ptr<DataType>& = NULLPTR) {}
+
+ FloatToStringFormatterMixin(int flags, const char* inf_symbol, const char* nan_symbol,
+ char exp_character, int decimal_in_shortest_low,
+ int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode)
+ : FloatToStringFormatter(flags, inf_symbol, nan_symbol, exp_character,
+ decimal_in_shortest_low, decimal_in_shortest_high,
+ max_leading_padding_zeroes_in_precision_mode,
+ max_trailing_padding_zeroes_in_precision_mode) {}
+
+ template <typename Appender>
+ Return<Appender> operator()(value_type value, Appender&& append) {
+ char buffer[buffer_size];
+ int size = FormatFloat(value, buffer, buffer_size);
+ return append(util::string_view(buffer, size));
+ }
+};
+
+template <>
+class StringFormatter<FloatType> : public FloatToStringFormatterMixin<FloatType> {
+ public:
+ using FloatToStringFormatterMixin::FloatToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<DoubleType> : public FloatToStringFormatterMixin<DoubleType> {
+ public:
+ using FloatToStringFormatterMixin::FloatToStringFormatterMixin;
+};
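+
+// Editor's sketch: the same appender protocol applies, with shortest
+// round-trippable output,
+//
+//   StringFormatter<DoubleType> format;
+//   format(0.1, [](util::string_view v) { /* v == "0.1" */ });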
+
+/////////////////////////////////////////////////////////////////////////
+// Temporal formatting
+
+namespace detail {
+
+template <typename V>
+constexpr size_t BufferSizeYYYY_MM_DD() {
+ return detail::Digits10(9999) + 1 + detail::Digits10(12) + 1 + detail::Digits10(31);
+}
+
+inline void FormatYYYY_MM_DD(arrow_vendored::date::year_month_day ymd, char** cursor) {
+ FormatTwoDigits(static_cast<unsigned>(ymd.day()), cursor);
+ FormatOneChar('-', cursor);
+ FormatTwoDigits(static_cast<unsigned>(ymd.month()), cursor);
+ FormatOneChar('-', cursor);
+ auto year = static_cast<int>(ymd.year());
+ assert(year <= 9999);
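+ // Editor's note: digits are written right to left, so the low two year
+ // digits are emitted before the century.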
+ FormatTwoDigits(year % 100, cursor);
+ FormatTwoDigits(year / 100, cursor);
+}
+
+template <typename Duration>
+constexpr size_t BufferSizeHH_MM_SS() {
+ return detail::Digits10(23) + 1 + detail::Digits10(59) + 1 + detail::Digits10(59) + 1 +
+ detail::Digits10(Duration::period::den) - 1;
+}
+
+template <typename Duration>
+void FormatHH_MM_SS(arrow_vendored::date::hh_mm_ss<Duration> hms, char** cursor) {
+ constexpr size_t subsecond_digits = Digits10(Duration::period::den) - 1;
+ if (subsecond_digits != 0) {
+ FormatAllDigitsLeftPadded(hms.subseconds().count(), subsecond_digits, '0', cursor);
+ FormatOneChar('.', cursor);
+ }
+ FormatTwoDigits(hms.seconds().count(), cursor);
+ FormatOneChar(':', cursor);
+ FormatTwoDigits(hms.minutes().count(), cursor);
+ FormatOneChar(':', cursor);
+ FormatTwoDigits(hms.hours().count(), cursor);
+}
+
+} // namespace detail
+
+template <>
+class StringFormatter<DurationType> : public IntToStringFormatterMixin<DurationType> {
+ using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <typename T>
+class StringFormatter<T, enable_if_date<T>> {
+ public:
+ using value_type = typename T::c_type;
+
+ explicit StringFormatter(const std::shared_ptr<DataType>& = NULLPTR) {}
+
+ template <typename Appender>
+ Return<Appender> operator()(value_type value, Appender&& append) {
+ arrow_vendored::date::days since_epoch;
+ if (T::type_id == Type::DATE32) {
+ since_epoch = arrow_vendored::date::days{value};
+ } else {
+ since_epoch = std::chrono::duration_cast<arrow_vendored::date::days>(
+ std::chrono::milliseconds{value});
+ }
+
+ arrow_vendored::date::sys_days timepoint_days{since_epoch};
+
+ constexpr size_t buffer_size = detail::BufferSizeYYYY_MM_DD<value_type>();
+
+ std::array<char, buffer_size> buffer;
+ char* cursor = buffer.data() + buffer_size;
+
+ detail::FormatYYYY_MM_DD(arrow_vendored::date::year_month_day{timepoint_days},
+ &cursor);
+ return append(detail::ViewDigitBuffer(buffer, cursor));
+ }
+};
+
+template <typename T>
+class StringFormatter<T, enable_if_time<T>> {
+ public:
+ using value_type = typename T::c_type;
+
+ explicit StringFormatter(const std::shared_ptr<DataType>& type)
+ : unit_(checked_cast<const T&>(*type).unit()) {}
+
+ template <typename Duration, typename Appender>
+ Return<Appender> operator()(Duration, value_type count, Appender&& append) {
+ Duration since_midnight{count};
+
+ constexpr size_t buffer_size = detail::BufferSizeHH_MM_SS<Duration>();
+
+ std::array<char, buffer_size> buffer;
+ char* cursor = buffer.data() + buffer_size;
+
+ detail::FormatHH_MM_SS(arrow_vendored::date::make_time(since_midnight), &cursor);
+ return append(detail::ViewDigitBuffer(buffer, cursor));
+ }
+
+ template <typename Appender>
+ Return<Appender> operator()(value_type value, Appender&& append) {
+ return util::VisitDuration(unit_, *this, value, std::forward<Appender>(append));
+ }
+
+ private:
+ TimeUnit::type unit_;
+};
+
+template <>
+class StringFormatter<TimestampType> {
+ public:
+ using value_type = int64_t;
+
+ explicit StringFormatter(const std::shared_ptr<DataType>& type)
+ : unit_(checked_cast<const TimestampType&>(*type).unit()) {}
+
+ template <typename Duration, typename Appender>
+ Return<Appender> operator()(Duration, value_type count, Appender&& append) {
+ Duration since_epoch{count};
+
+ arrow_vendored::date::sys_days timepoint_days{
+ arrow_vendored::date::floor<arrow_vendored::date::days>(since_epoch)};
+
+ Duration since_midnight = since_epoch - timepoint_days.time_since_epoch();
+
+ constexpr size_t buffer_size = detail::BufferSizeYYYY_MM_DD<value_type>() + 1 +
+ detail::BufferSizeHH_MM_SS<Duration>();
+
+ std::array<char, buffer_size> buffer;
+ char* cursor = buffer.data() + buffer_size;
+
+ detail::FormatHH_MM_SS(arrow_vendored::date::make_time(since_midnight), &cursor);
+ detail::FormatOneChar(' ', &cursor);
+ detail::FormatYYYY_MM_DD(arrow_vendored::date::year_month_day{timepoint_days},
+ &cursor);
+ return append(detail::ViewDigitBuffer(buffer, cursor));
+ }
+
+ template <typename Appender>
+ Return<Appender> operator()(value_type value, Appender&& append) {
+ return util::VisitDuration(unit_, *this, value, std::forward<Appender>(append));
+ }
+
+ private:
+ TimeUnit::type unit_;
+};
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h
new file mode 100644
index 00000000000..9da79046fec
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <tuple>
+#include <type_traits>
+
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+struct Empty {
+ static Result<Empty> ToResult(Status s) {
+ if (ARROW_PREDICT_TRUE(s.ok())) {
+ return Empty{};
+ }
+ return s;
+ }
+};
+
+/// Helper struct for examining lambdas and other callables.
+/// TODO(ARROW-12655) support function pointers
+struct call_traits {
+ public:
+ template <typename R, typename... A>
+ static std::false_type is_overloaded_impl(R(A...));
+
+ template <typename F>
+ static std::false_type is_overloaded_impl(decltype(&F::operator())*);
+
+ template <typename F>
+ static std::true_type is_overloaded_impl(...);
+
+ template <typename F, typename R, typename... A>
+ static R return_type_impl(R (F::*)(A...));
+
+ template <typename F, typename R, typename... A>
+ static R return_type_impl(R (F::*)(A...) const);
+
+ template <std::size_t I, typename F, typename R, typename... A>
+ static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
+ R (F::*)(A...));
+
+ template <std::size_t I, typename F, typename R, typename... A>
+ static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
+ R (F::*)(A...) const);
+
+ template <std::size_t I, typename F, typename R, typename... A>
+ static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
+ R (F::*)(A...) &&);
+
+ template <typename F, typename R, typename... A>
+ static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...));
+
+ template <typename F, typename R, typename... A>
+ static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...)
+ const);
+
+ template <typename F, typename R, typename... A>
+ static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...) &&);
+
+ /// bool constant indicating whether F is a callable with more than one possible
+ /// signature. Will be true_type for objects which define multiple operator() or which
+ /// define a template operator()
+ template <typename F>
+ using is_overloaded =
+ decltype(is_overloaded_impl<typename std::decay<F>::type>(NULLPTR));
+
+ template <typename F, typename T = void>
+ using enable_if_overloaded = typename std::enable_if<is_overloaded<F>::value, T>::type;
+
+ template <typename F, typename T = void>
+ using disable_if_overloaded =
+ typename std::enable_if<!is_overloaded<F>::value, T>::type;
+
+ /// If F is not overloaded, the argument types of its call operator can be
+ /// extracted via call_traits::argument_type<Index, F>
+ template <std::size_t I, typename F>
+ using argument_type = decltype(argument_type_impl<I>(&std::decay<F>::type::operator()));
+
+ template <typename F>
+ using argument_count = decltype(argument_count_impl(&std::decay<F>::type::operator()));
+
+ template <typename F>
+ using return_type = decltype(return_type_impl(&std::decay<F>::type::operator()));
+
+ template <typename F, typename T, typename RT = T>
+ using enable_if_return =
+ typename std::enable_if<std::is_same<return_type<F>, T>::value, RT>;
+
+ template <typename T, typename R = void>
+ using enable_if_empty = typename std::enable_if<std::is_same<T, Empty>::value, R>::type;
+
+ template <typename T, typename R = void>
+ using enable_if_not_empty =
+ typename std::enable_if<!std::is_same<T, Empty>::value, R>::type;
+};
+
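+// Editor's sketch of the introspection above:
+//
+//   auto fn = [](int x, double) { return x != 0; };
+//   static_assert(!call_traits::is_overloaded<decltype(fn)>::value, "");
+//   static_assert(std::is_same<call_traits::return_type<decltype(fn)>,
+//                              bool>::value, "");
+//   static_assert(std::is_same<call_traits::argument_type<0, decltype(fn)>,
+//                              int>::value, "");
+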
+/// A type erased callable object which may only be invoked once.
+/// It can be constructed from any lambda which matches the provided call signature.
+/// Invoking it results in destruction of the lambda, freeing any state/references
+/// immediately. Invoking a default-constructed FnOnce, or one which has already
+/// been invoked, will segfault.
+template <typename Signature>
+class FnOnce;
+
+template <typename R, typename... A>
+class FnOnce<R(A...)> {
+ public:
+ FnOnce() = default;
+
+ template <typename Fn,
+ typename = typename std::enable_if<std::is_convertible<
+ typename std::result_of<Fn && (A...)>::type, R>::value>::type>
+ FnOnce(Fn fn) : impl_(new FnImpl<Fn>(std::move(fn))) { // NOLINT runtime/explicit
+ }
+
+ explicit operator bool() const { return impl_ != NULLPTR; }
+
+ R operator()(A... a) && {
+ auto bye = std::move(impl_);
+ return bye->invoke(std::forward<A&&>(a)...);
+ }
+
+ private:
+ struct Impl {
+ virtual ~Impl() = default;
+ virtual R invoke(A&&... a) = 0;
+ };
+
+ template <typename Fn>
+ struct FnImpl : Impl {
+ explicit FnImpl(Fn fn) : fn_(std::move(fn)) {}
+ R invoke(A&&... a) override { return std::move(fn_)(std::forward<A&&>(a)...); }
+ Fn fn_;
+ };
+
+ std::unique_ptr<Impl> impl_;
+};
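+
+// Illustrative FnOnce usage (a sketch, not from the original source):
+//
+//   FnOnce<int(int)> add_one = [](int x) { return x + 1; };
+//   int result = std::move(add_one)(41);  // invocation consumes the callable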
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc
new file mode 100644
index 00000000000..f288a15be3f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc
@@ -0,0 +1,421 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/future.h"
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <numeric>
+
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/thread_pool.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+// Shared mutex for all FutureWaiter instances.
+// This simplifies lock management compared to a per-waiter mutex.
+// The locking order is: global waiter mutex, then per-future mutex.
+//
+// It is unlikely that many waiter instances are alive at once, so this
+// should ideally not limit scalability.
+static std::mutex global_waiter_mutex;
+
+const double FutureWaiter::kInfinity = HUGE_VAL;
+
+class FutureWaiterImpl : public FutureWaiter {
+ public:
+ FutureWaiterImpl(Kind kind, std::vector<FutureImpl*> futures)
+ : signalled_(false),
+ kind_(kind),
+ futures_(std::move(futures)),
+ one_failed_(-1),
+ fetch_pos_(0) {
+ finished_futures_.reserve(futures_.size());
+
+ // Observe the current state of futures and add waiters to receive future
+ // state changes, atomically per future.
+ // We need to lock ourselves, because as soon as SetWaiter() is called,
+ // a FutureImpl may call MarkFutureFinished() from another thread
+ // before this constructor finishes.
+ std::unique_lock<std::mutex> lock(global_waiter_mutex);
+
+ for (int i = 0; i < static_cast<int>(futures_.size()); ++i) {
+ const auto state = futures_[i]->SetWaiter(this, i);
+ if (IsFutureFinished(state)) {
+ finished_futures_.push_back(i);
+ }
+      if (state == FutureState::FAILURE) {
+ one_failed_ = i;
+ }
+ }
+
+ // Maybe signal the waiter, if the ending condition is already satisfied
+ if (ShouldSignal()) {
+ // No need to notify non-existent Wait() calls
+ signalled_ = true;
+ }
+ }
+
+ ~FutureWaiterImpl() override {
+ for (auto future : futures_) {
+ future->RemoveWaiter(this);
+ }
+ }
+
+ // Is the ending condition satisfied?
+ bool ShouldSignal() {
+ bool do_signal = false;
+ switch (kind_) {
+ case ANY:
+ do_signal = (finished_futures_.size() > 0);
+ break;
+ case ALL:
+ do_signal = (finished_futures_.size() == futures_.size());
+ break;
+ case ALL_OR_FIRST_FAILED:
+ do_signal = (finished_futures_.size() == futures_.size()) || one_failed_ >= 0;
+ break;
+ case ITERATE:
+ do_signal = (finished_futures_.size() > static_cast<size_t>(fetch_pos_));
+ break;
+ }
+ return do_signal;
+ }
+
+ void Signal() {
+ signalled_ = true;
+ cv_.notify_one();
+ }
+
+ void DoWaitUnlocked(std::unique_lock<std::mutex>* lock) {
+ cv_.wait(*lock, [this] { return signalled_.load(); });
+ }
+
+ bool DoWait() {
+ if (signalled_) {
+ return true;
+ }
+ std::unique_lock<std::mutex> lock(global_waiter_mutex);
+ DoWaitUnlocked(&lock);
+ return true;
+ }
+
+ template <class Rep, class Period>
+ bool DoWait(const std::chrono::duration<Rep, Period>& duration) {
+ if (signalled_) {
+ return true;
+ }
+ std::unique_lock<std::mutex> lock(global_waiter_mutex);
+ cv_.wait_for(lock, duration, [this] { return signalled_.load(); });
+ return signalled_.load();
+ }
+
+ void DoMarkFutureFinishedUnlocked(int future_num, FutureState state) {
+ finished_futures_.push_back(future_num);
+ if (state != FutureState::SUCCESS) {
+ one_failed_ = future_num;
+ }
+ if (!signalled_ && ShouldSignal()) {
+ Signal();
+ }
+ }
+
+ int DoWaitAndFetchOne() {
+ std::unique_lock<std::mutex> lock(global_waiter_mutex);
+
+ DCHECK_EQ(kind_, ITERATE);
+ DoWaitUnlocked(&lock);
+ DCHECK_LT(static_cast<size_t>(fetch_pos_), finished_futures_.size());
+ if (static_cast<size_t>(fetch_pos_) == finished_futures_.size() - 1) {
+ signalled_ = false;
+ }
+ return finished_futures_[fetch_pos_++];
+ }
+
+ std::vector<int> DoMoveFinishedFutures() {
+ std::unique_lock<std::mutex> lock(global_waiter_mutex);
+
+ return std::move(finished_futures_);
+ }
+
+ protected:
+ std::condition_variable cv_;
+ std::atomic<bool> signalled_;
+
+ Kind kind_;
+ std::vector<FutureImpl*> futures_;
+ std::vector<int> finished_futures_;
+ int one_failed_;
+ int fetch_pos_;
+};
+
+namespace {
+
+FutureWaiterImpl* GetConcreteWaiter(FutureWaiter* waiter) {
+ return checked_cast<FutureWaiterImpl*>(waiter);
+}
+
+} // namespace
+
+FutureWaiter::FutureWaiter() = default;
+
+FutureWaiter::~FutureWaiter() = default;
+
+std::unique_ptr<FutureWaiter> FutureWaiter::Make(Kind kind,
+ std::vector<FutureImpl*> futures) {
+ return std::unique_ptr<FutureWaiter>(new FutureWaiterImpl(kind, std::move(futures)));
+}
+
+void FutureWaiter::MarkFutureFinishedUnlocked(int future_num, FutureState state) {
+ // Called by FutureImpl on state changes
+ GetConcreteWaiter(this)->DoMarkFutureFinishedUnlocked(future_num, state);
+}
+
+bool FutureWaiter::Wait(double seconds) {
+ if (seconds == kInfinity) {
+ return GetConcreteWaiter(this)->DoWait();
+ } else {
+ return GetConcreteWaiter(this)->DoWait(std::chrono::duration<double>(seconds));
+ }
+}
+
+int FutureWaiter::WaitAndFetchOne() {
+ return GetConcreteWaiter(this)->DoWaitAndFetchOne();
+}
+
+std::vector<int> FutureWaiter::MoveFinishedFutures() {
+ return GetConcreteWaiter(this)->DoMoveFinishedFutures();
+}
+
+class ConcreteFutureImpl : public FutureImpl {
+ public:
+ FutureState DoSetWaiter(FutureWaiter* w, int future_num) {
+ std::unique_lock<std::mutex> lock(mutex_);
+
+ // Atomically load state at the time of adding the waiter, to avoid
+ // missed or duplicate events in the caller
+ ARROW_CHECK_EQ(waiter_, nullptr)
+ << "Only one Waiter allowed per Future at any given time";
+ waiter_ = w;
+ waiter_arg_ = future_num;
+ return state_.load();
+ }
+
+ void DoRemoveWaiter(FutureWaiter* w) {
+ std::unique_lock<std::mutex> lock(mutex_);
+
+ ARROW_CHECK_EQ(waiter_, w);
+ waiter_ = nullptr;
+ }
+
+ void DoMarkFinished() { DoMarkFinishedOrFailed(FutureState::SUCCESS); }
+
+ void DoMarkFailed() { DoMarkFinishedOrFailed(FutureState::FAILURE); }
+
+ void CheckOptions(const CallbackOptions& opts) {
+ if (opts.should_schedule != ShouldSchedule::Never) {
+ DCHECK_NE(opts.executor, nullptr)
+ << "An executor must be specified when adding a callback that might schedule";
+ }
+ }
+
+ void AddCallback(Callback callback, CallbackOptions opts) {
+ CheckOptions(opts);
+ std::unique_lock<std::mutex> lock(mutex_);
+ CallbackRecord callback_record{std::move(callback), opts};
+ if (IsFutureFinished(state_)) {
+ lock.unlock();
+ RunOrScheduleCallback(std::move(callback_record), /*in_add_callback=*/true);
+ } else {
+ callbacks_.push_back(std::move(callback_record));
+ }
+ }
+
+ bool TryAddCallback(const std::function<Callback()>& callback_factory,
+ CallbackOptions opts) {
+ CheckOptions(opts);
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (IsFutureFinished(state_)) {
+ return false;
+ } else {
+ callbacks_.push_back({callback_factory(), opts});
+ return true;
+ }
+ }
+
+ bool ShouldScheduleCallback(const CallbackRecord& callback_record,
+ bool in_add_callback) {
+ switch (callback_record.options.should_schedule) {
+ case ShouldSchedule::Never:
+ return false;
+ case ShouldSchedule::Always:
+ return true;
+ case ShouldSchedule::IfUnfinished:
+ return !in_add_callback;
+ case ShouldSchedule::IfDifferentExecutor:
+ return !callback_record.options.executor->OwnsThisThread();
+ default:
+ DCHECK(false) << "Unrecognized ShouldSchedule option";
+ return false;
+ }
+ }
+
+ void RunOrScheduleCallback(CallbackRecord&& callback_record, bool in_add_callback) {
+ if (ShouldScheduleCallback(callback_record, in_add_callback)) {
+ struct CallbackTask {
+ void operator()() { std::move(callback)(*self); }
+
+ Callback callback;
+ std::shared_ptr<FutureImpl> self;
+ };
+ // Need to keep `this` alive until the callback has a chance to be scheduled.
+ CallbackTask task{std::move(callback_record.callback), shared_from_this()};
+ DCHECK_OK(callback_record.options.executor->Spawn(std::move(task)));
+ } else {
+ std::move(callback_record.callback)(*this);
+ }
+ }
+
+ void DoMarkFinishedOrFailed(FutureState state) {
+ {
+ // Lock the hypothetical waiter first, and the future after.
+ // This matches the locking order done in FutureWaiter constructor.
+ std::unique_lock<std::mutex> waiter_lock(global_waiter_mutex);
+ std::unique_lock<std::mutex> lock(mutex_);
+
+ DCHECK(!IsFutureFinished(state_)) << "Future already marked finished";
+ state_ = state;
+ if (waiter_ != nullptr) {
+ waiter_->MarkFutureFinishedUnlocked(waiter_arg_, state);
+ }
+ }
+ cv_.notify_all();
+
+ // run callbacks, lock not needed since the future is finished by this
+ // point so nothing else can modify the callbacks list and it is safe
+ // to iterate.
+ //
+ // In fact, it is important not to hold the locks because the callback
+ // may be slow or do its own locking on other resources
+ for (auto& callback_record : callbacks_) {
+ RunOrScheduleCallback(std::move(callback_record), /*in_add_callback=*/false);
+ }
+ callbacks_.clear();
+ }
+
+ void DoWait() {
+ std::unique_lock<std::mutex> lock(mutex_);
+
+ cv_.wait(lock, [this] { return IsFutureFinished(state_); });
+ }
+
+ bool DoWait(double seconds) {
+ std::unique_lock<std::mutex> lock(mutex_);
+
+ cv_.wait_for(lock, std::chrono::duration<double>(seconds),
+ [this] { return IsFutureFinished(state_); });
+ return IsFutureFinished(state_);
+ }
+
+ std::mutex mutex_;
+ std::condition_variable cv_;
+ FutureWaiter* waiter_ = nullptr;
+ int waiter_arg_ = -1;
+};
+
+namespace {
+
+ConcreteFutureImpl* GetConcreteFuture(FutureImpl* future) {
+ return checked_cast<ConcreteFutureImpl*>(future);
+}
+
+} // namespace
+
+std::unique_ptr<FutureImpl> FutureImpl::Make() {
+ return std::unique_ptr<FutureImpl>(new ConcreteFutureImpl());
+}
+
+std::unique_ptr<FutureImpl> FutureImpl::MakeFinished(FutureState state) {
+ std::unique_ptr<ConcreteFutureImpl> ptr(new ConcreteFutureImpl());
+ ptr->state_ = state;
+ return std::move(ptr);
+}
+
+FutureImpl::FutureImpl() : state_(FutureState::PENDING) {}
+
+FutureState FutureImpl::SetWaiter(FutureWaiter* w, int future_num) {
+ return GetConcreteFuture(this)->DoSetWaiter(w, future_num);
+}
+
+void FutureImpl::RemoveWaiter(FutureWaiter* w) {
+ GetConcreteFuture(this)->DoRemoveWaiter(w);
+}
+
+void FutureImpl::Wait() { GetConcreteFuture(this)->DoWait(); }
+
+bool FutureImpl::Wait(double seconds) { return GetConcreteFuture(this)->DoWait(seconds); }
+
+void FutureImpl::MarkFinished() { GetConcreteFuture(this)->DoMarkFinished(); }
+
+void FutureImpl::MarkFailed() { GetConcreteFuture(this)->DoMarkFailed(); }
+
+void FutureImpl::AddCallback(Callback callback, CallbackOptions opts) {
+ GetConcreteFuture(this)->AddCallback(std::move(callback), opts);
+}
+
+bool FutureImpl::TryAddCallback(const std::function<Callback()>& callback_factory,
+ CallbackOptions opts) {
+ return GetConcreteFuture(this)->TryAddCallback(callback_factory, opts);
+}
+
+Future<> AllComplete(const std::vector<Future<>>& futures) {
+ struct State {
+ explicit State(int64_t n_futures) : mutex(), n_remaining(n_futures) {}
+
+ std::mutex mutex;
+ std::atomic<size_t> n_remaining;
+ };
+
+ if (futures.empty()) {
+ return Future<>::MakeFinished();
+ }
+
+ auto state = std::make_shared<State>(futures.size());
+ auto out = Future<>::Make();
+ for (const auto& future : futures) {
+ future.AddCallback([state, out](const Status& status) mutable {
+ if (!status.ok()) {
+ std::unique_lock<std::mutex> lock(state->mutex);
+ if (!out.is_finished()) {
+ out.MarkFinished(status);
+ }
+ return;
+ }
+ if (state->n_remaining.fetch_sub(1) != 1) return;
+ out.MarkFinished();
+ });
+ }
+ return out;
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h
new file mode 100644
index 00000000000..d9e0a939f25
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h
@@ -0,0 +1,957 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cmath>
+#include <functional>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+template <typename>
+struct EnsureFuture;
+
+namespace detail {
+
+template <typename>
+struct is_future : std::false_type {};
+
+template <typename T>
+struct is_future<Future<T>> : std::true_type {};
+
+template <typename Signature>
+using result_of_t = typename std::result_of<Signature>::type;
+
+// Helper to find the synchronous counterpart for a Future
+template <typename T>
+struct SyncType {
+ using type = Result<T>;
+};
+
+template <>
+struct SyncType<internal::Empty> {
+ using type = Status;
+};
+
+template <typename Fn>
+using first_arg_is_status =
+ std::is_same<typename std::decay<internal::call_traits::argument_type<0, Fn>>::type,
+ Status>;
+
+template <typename Fn, typename Then, typename Else,
+ typename Count = internal::call_traits::argument_count<Fn>>
+using if_has_no_args = typename std::conditional<Count::value == 0, Then, Else>::type;
+
+/// Creates a callback that can be added to a future to mark a `dest` future finished
+template <typename Source, typename Dest, bool SourceEmpty = Source::is_empty,
+ bool DestEmpty = Dest::is_empty>
+struct MarkNextFinished {};
+
+/// If the source and dest are both empty we can pass on the status
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, true, true> {
+ void operator()(const Status& status) && { next.MarkFinished(status); }
+ Dest next;
+};
+
+/// If the source is not empty but the dest is then we can take the
+/// status out of the result
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, false, true> {
+ void operator()(const Result<typename Source::ValueType>& res) && {
+ next.MarkFinished(internal::Empty::ToResult(res.status()));
+ }
+ Dest next;
+};
+
+/// If neither are empty we pass on the result
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, false, false> {
+ void operator()(const Result<typename Source::ValueType>& res) && {
+ next.MarkFinished(res);
+ }
+ Dest next;
+};
+
+/// Helper that contains information about how to apply a continuation
+struct ContinueFuture {
+ template <typename Return>
+ struct ForReturnImpl;
+
+ template <typename Return>
+ using ForReturn = typename ForReturnImpl<Return>::type;
+
+ template <typename Signature>
+ using ForSignature = ForReturn<result_of_t<Signature>>;
+
+ // If the callback returns void then we return Future<> that always finishes OK.
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<std::is_void<ContinueResult>::value>::type operator()(
+ NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
+ next.MarkFinished();
+ }
+
+ /// If the callback returns a non-future then we return Future<T>
+ /// and mark the future finished with the callback result. It will get promoted
+ /// to Result<T> as part of MarkFinished if it isn't already.
+ ///
+ /// If the callback returns Status and we return Future<> then also send the callback
+ /// result as-is to the destination future.
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<
+ !std::is_void<ContinueResult>::value && !is_future<ContinueResult>::value &&
+ (!NextFuture::is_empty || std::is_same<ContinueResult, Status>::value)>::type
+ operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...));
+ }
+
+ /// If the callback returns a Result and the next future is Future<> then we mark
+ /// the future finished with the callback result.
+ ///
+ /// It may seem odd that the next future is Future<> when the callback returns a
+ /// result but this can occur if the OnFailure callback returns a result while the
+ /// OnSuccess callback is void/Status (e.g. you would get this calling the one-arg
+ /// version of Then with an OnSuccess callback that returns void)
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<!std::is_void<ContinueResult>::value &&
+ !is_future<ContinueResult>::value && NextFuture::is_empty &&
+ !std::is_same<ContinueResult, Status>::value>::type
+ operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...).status());
+ }
+
+ /// If the callback returns a Future<T> then we return Future<T>. We create a new
+ /// future and add a callback to the future given to us by the user that forwards the
+ /// result to the future we just created
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<is_future<ContinueResult>::value>::type operator()(
+ NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ ContinueResult signal_to_complete_next =
+ std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
+ MarkNextFinished<ContinueResult, NextFuture> callback{std::move(next)};
+ signal_to_complete_next.AddCallback(std::move(callback));
+ }
+
+ /// Helpers to conditionally ignore arguments to ContinueFunc
+ template <typename ContinueFunc, typename NextFuture, typename... Args>
+ void IgnoringArgsIf(std::true_type, NextFuture&& next, ContinueFunc&& f,
+ Args&&...) const {
+ operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f));
+ }
+ template <typename ContinueFunc, typename NextFuture, typename... Args>
+ void IgnoringArgsIf(std::false_type, NextFuture&& next, ContinueFunc&& f,
+ Args&&... a) const {
+ operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f),
+ std::forward<Args>(a)...);
+ }
+};
+
+/// Helper struct which tells us what kind of Future gets returned from `Then` based on
+/// the return type of the OnSuccess callback
+template <>
+struct ContinueFuture::ForReturnImpl<void> {
+ using type = Future<>;
+};
+
+template <>
+struct ContinueFuture::ForReturnImpl<Status> {
+ using type = Future<>;
+};
+
+template <typename R>
+struct ContinueFuture::ForReturnImpl {
+ using type = Future<R>;
+};
+
+template <typename T>
+struct ContinueFuture::ForReturnImpl<Result<T>> {
+ using type = Future<T>;
+};
+
+template <typename T>
+struct ContinueFuture::ForReturnImpl<Future<T>> {
+ using type = Future<T>;
+};
+
+} // namespace detail
+
+/// A Future's execution or completion status
+enum class FutureState : int8_t { PENDING, SUCCESS, FAILURE };
+
+inline bool IsFutureFinished(FutureState state) { return state != FutureState::PENDING; }
+
+/// \brief Describe whether the callback should be scheduled or run synchronously
+enum class ShouldSchedule {
+ /// Always run the callback synchronously (the default)
+ Never = 0,
+ /// Schedule a new task only if the future is not finished when the
+ /// callback is added
+ IfUnfinished = 1,
+ /// Always schedule the callback as a new task
+ Always = 2,
+ /// Schedule a new task only if it would run on an executor other than
+ /// the specified executor.
+ IfDifferentExecutor = 3,
+};
+
+/// \brief Options that control how a continuation is run
+struct CallbackOptions {
+ /// Describe whether the callback should be run synchronously or scheduled
+ ShouldSchedule should_schedule = ShouldSchedule::Never;
+ /// If the callback is scheduled then this is the executor it should be scheduled
+ /// on. If this is NULL then should_schedule must be Never
+ internal::Executor* executor = NULLPTR;
+
+ static CallbackOptions Defaults() { return {}; }
+};
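+
+// A configuration sketch (illustrative; `executor` is assumed to be a valid
+// arrow::internal::Executor*, e.g. obtained from a thread pool, and
+// `callback` a suitable continuation):
+//
+//   CallbackOptions opts;
+//   opts.should_schedule = ShouldSchedule::Always;
+//   opts.executor = executor;  // must be non-null unless should_schedule is Never
+//   fut.AddCallback(std::move(callback), opts);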
+
+// Untyped private implementation
+class ARROW_EXPORT FutureImpl : public std::enable_shared_from_this<FutureImpl> {
+ public:
+ FutureImpl();
+ virtual ~FutureImpl() = default;
+
+ FutureState state() { return state_.load(); }
+
+ static std::unique_ptr<FutureImpl> Make();
+ static std::unique_ptr<FutureImpl> MakeFinished(FutureState state);
+
+ // Future API
+ void MarkFinished();
+ void MarkFailed();
+ void Wait();
+ bool Wait(double seconds);
+ template <typename ValueType>
+ Result<ValueType>* CastResult() const {
+ return static_cast<Result<ValueType>*>(result_.get());
+ }
+
+ using Callback = internal::FnOnce<void(const FutureImpl& impl)>;
+ void AddCallback(Callback callback, CallbackOptions opts);
+ bool TryAddCallback(const std::function<Callback()>& callback_factory,
+ CallbackOptions opts);
+
+ // Waiter API
+ inline FutureState SetWaiter(FutureWaiter* w, int future_num);
+ inline void RemoveWaiter(FutureWaiter* w);
+
+ std::atomic<FutureState> state_{FutureState::PENDING};
+
+ // Type erased storage for arbitrary results
+ // XXX small objects could be stored inline instead of boxed in a pointer
+ using Storage = std::unique_ptr<void, void (*)(void*)>;
+ Storage result_{NULLPTR, NULLPTR};
+
+ struct CallbackRecord {
+ Callback callback;
+ CallbackOptions options;
+ };
+ std::vector<CallbackRecord> callbacks_;
+};
+
+// An object that waits on multiple futures at once. Only one waiter
+// can be registered for each future at any time.
+class ARROW_EXPORT FutureWaiter {
+ public:
+ enum Kind : int8_t { ANY, ALL, ALL_OR_FIRST_FAILED, ITERATE };
+
+ // HUGE_VAL isn't constexpr on Windows
+ // https://social.msdn.microsoft.com/Forums/vstudio/en-US/47e8b9ff-b205-4189-968e-ee3bc3e2719f/constexpr-compile-error?forum=vclanguage
+ static const double kInfinity;
+
+ static std::unique_ptr<FutureWaiter> Make(Kind kind, std::vector<FutureImpl*> futures);
+
+ template <typename FutureType>
+ static std::unique_ptr<FutureWaiter> Make(Kind kind,
+ const std::vector<FutureType>& futures) {
+ return Make(kind, ExtractFutures(futures));
+ }
+
+ virtual ~FutureWaiter();
+
+ bool Wait(double seconds = kInfinity);
+ int WaitAndFetchOne();
+
+ std::vector<int> MoveFinishedFutures();
+
+ protected:
+ // Extract FutureImpls from Futures
+ template <typename FutureType,
+ typename Enable = std::enable_if<!std::is_pointer<FutureType>::value>>
+ static std::vector<FutureImpl*> ExtractFutures(const std::vector<FutureType>& futures) {
+ std::vector<FutureImpl*> base_futures(futures.size());
+ for (int i = 0; i < static_cast<int>(futures.size()); ++i) {
+ base_futures[i] = futures[i].impl_.get();
+ }
+ return base_futures;
+ }
+
+ // Extract FutureImpls from Future pointers
+ template <typename FutureType>
+ static std::vector<FutureImpl*> ExtractFutures(
+ const std::vector<FutureType*>& futures) {
+ std::vector<FutureImpl*> base_futures(futures.size());
+ for (int i = 0; i < static_cast<int>(futures.size()); ++i) {
+ base_futures[i] = futures[i]->impl_.get();
+ }
+ return base_futures;
+ }
+
+ FutureWaiter();
+ ARROW_DISALLOW_COPY_AND_ASSIGN(FutureWaiter);
+
+ inline void MarkFutureFinishedUnlocked(int future_num, FutureState state);
+
+ friend class FutureImpl;
+ friend class ConcreteFutureImpl;
+};
+
+// ---------------------------------------------------------------------
+// Public API
+
+/// \brief EXPERIMENTAL A std::future-like class with more functionality.
+///
+/// A Future represents the results of a past or future computation.
+/// The Future API has two sides: a producer side and a consumer side.
+///
+/// The producer API allows creating a Future and setting its result or
+/// status, possibly after running a computation function.
+///
+/// The consumer API allows querying a Future's current state, wait for it
+/// to complete, or wait on multiple Futures at once (using WaitForAll,
+/// WaitForAny or AsCompletedIterator).
+template <typename T>
+class ARROW_MUST_USE_TYPE Future {
+ public:
+ using ValueType = T;
+ using SyncType = typename detail::SyncType<T>::type;
+ static constexpr bool is_empty = std::is_same<T, internal::Empty>::value;
+ // The default constructor creates an invalid Future. Use Future::Make()
+ // for a valid Future. This constructor is mostly for the convenience
+ // of being able to presize a vector of Futures.
+ Future() = default;
+
+ // Consumer API
+
+ bool is_valid() const { return impl_ != NULLPTR; }
+
+ /// \brief Return the Future's current state
+ ///
+ /// A return value of PENDING is only indicative, as the Future can complete
+ /// concurrently. A return value of FAILURE or SUCCESS is definitive, though.
+ FutureState state() const {
+ CheckValid();
+ return impl_->state();
+ }
+
+ /// \brief Whether the Future is finished
+ ///
+ /// A false return value is only indicative, as the Future can complete
+ /// concurrently. A true return value is definitive, though.
+ bool is_finished() const {
+ CheckValid();
+ return IsFutureFinished(impl_->state());
+ }
+
+ /// \brief Wait for the Future to complete and return its Result
+ const Result<ValueType>& result() const& {
+ Wait();
+ return *GetResult();
+ }
+
+  /// \brief Returns an rvalue reference to the result. This method is potentially
+  /// unsafe.
+  ///
+  /// The future is not the unique owner of the result; copies of a future will
+  /// also point to the same result. You must make sure that no other copies
+  /// of the future exist. Attempts to add callbacks after you move the result
+  /// will result in undefined behavior.
+ Result<ValueType>&& MoveResult() {
+ Wait();
+ return std::move(*GetResult());
+ }
+
+ /// \brief Wait for the Future to complete and return its Status
+ const Status& status() const { return result().status(); }
+
+ /// \brief Future<T> is convertible to Future<>, which views only the
+ /// Status of the original. Marking the returned Future Finished is not supported.
+ explicit operator Future<>() const {
+ Future<> status_future;
+ status_future.impl_ = impl_;
+ return status_future;
+ }
+
+ /// \brief Wait for the Future to complete
+ void Wait() const {
+ CheckValid();
+ if (!IsFutureFinished(impl_->state())) {
+ impl_->Wait();
+ }
+ }
+
+ /// \brief Wait for the Future to complete, or for the timeout to expire
+ ///
+ /// `true` is returned if the Future completed, `false` if the timeout expired.
+ /// Note a `false` value is only indicative, as the Future can complete
+ /// concurrently.
+ bool Wait(double seconds) const {
+ CheckValid();
+ if (IsFutureFinished(impl_->state())) {
+ return true;
+ }
+ return impl_->Wait(seconds);
+ }
+
+ // Producer API
+
+ /// \brief Producer API: mark Future finished
+ ///
+ /// The Future's result is set to `res`.
+ void MarkFinished(Result<ValueType> res) { DoMarkFinished(std::move(res)); }
+
+ /// \brief Mark a Future<> completed with the provided Status.
+ template <typename E = ValueType, typename = typename std::enable_if<
+ std::is_same<E, internal::Empty>::value>::type>
+ void MarkFinished(Status s = Status::OK()) {
+ return DoMarkFinished(E::ToResult(std::move(s)));
+ }
+
+ /// \brief Producer API: instantiate a valid Future
+ ///
+ /// The Future's state is initialized with PENDING. If you are creating a future with
+ /// this method you must ensure that future is eventually completed (with success or
+ /// failure). Creating a future, returning it, and never completing the future can lead
+ /// to memory leaks (for example, see Loop).
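+  ///
+  /// A minimal producer-side sketch (illustrative, not from the original docs):
+  ///
+  ///   auto fut = Future<int>::Make();
+  ///   // ... hand fut to a consumer, then eventually:
+  ///   fut.MarkFinished(42);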
+ static Future Make() {
+ Future fut;
+ fut.impl_ = FutureImpl::Make();
+ return fut;
+ }
+
+ /// \brief Producer API: instantiate a finished Future
+ static Future<ValueType> MakeFinished(Result<ValueType> res) {
+ Future<ValueType> fut;
+ fut.InitializeFromResult(std::move(res));
+ return fut;
+ }
+
+ /// \brief Make a finished Future<> with the provided Status.
+ template <typename E = ValueType, typename = typename std::enable_if<
+ std::is_same<E, internal::Empty>::value>::type>
+ static Future<> MakeFinished(Status s = Status::OK()) {
+ return MakeFinished(E::ToResult(std::move(s)));
+ }
+
+ struct WrapResultyOnComplete {
+ template <typename OnComplete>
+ struct Callback {
+ void operator()(const FutureImpl& impl) && {
+ std::move(on_complete)(*impl.CastResult<ValueType>());
+ }
+ OnComplete on_complete;
+ };
+ };
+
+ struct WrapStatusyOnComplete {
+ template <typename OnComplete>
+ struct Callback {
+ static_assert(std::is_same<internal::Empty, ValueType>::value,
+ "Only callbacks for Future<> should accept Status and not Result");
+
+ void operator()(const FutureImpl& impl) && {
+ std::move(on_complete)(impl.CastResult<ValueType>()->status());
+ }
+ OnComplete on_complete;
+ };
+ };
+
+ template <typename OnComplete>
+ using WrapOnComplete = typename std::conditional<
+ detail::first_arg_is_status<OnComplete>::value, WrapStatusyOnComplete,
+ WrapResultyOnComplete>::type::template Callback<OnComplete>;
+
+ /// \brief Consumer API: Register a callback to run when this future completes
+ ///
+  /// The callback should receive the result of the future (const Result<T>&).
+  /// For a void or statusy future this should be (const Status&).
+ ///
+ /// There is no guarantee to the order in which callbacks will run. In
+ /// particular, callbacks added while the future is being marked complete
+ /// may be executed immediately, ahead of, or even the same time as, other
+ /// callbacks that have been previously added.
+ ///
+ /// WARNING: callbacks may hold arbitrary references, including cyclic references.
+ /// Since callbacks will only be destroyed after they are invoked, this can lead to
+ /// memory leaks if a Future is never marked finished (abandoned):
+ ///
+ /// {
+ /// auto fut = Future<>::Make();
+  ///   fut.AddCallback([fut](const Status&) {});
+ /// }
+ ///
+ /// In this example `fut` falls out of scope but is not destroyed because it holds a
+ /// cyclic reference to itself through the callback.
+ template <typename OnComplete, typename Callback = WrapOnComplete<OnComplete>>
+ void AddCallback(OnComplete on_complete,
+ CallbackOptions opts = CallbackOptions::Defaults()) const {
+ // We know impl_ will not be dangling when invoking callbacks because at least one
+ // thread will be waiting for MarkFinished to return. Thus it's safe to keep a
+ // weak reference to impl_ here
+ impl_->AddCallback(Callback{std::move(on_complete)}, opts);
+ }
+
+ /// \brief Overload of AddCallback that will return false instead of running
+ /// synchronously
+ ///
+ /// This overload will guarantee the callback is never run synchronously. If the future
+ /// is already finished then it will simply return false. This can be useful to avoid
+ /// stack overflow in a situation where you have recursive Futures. For an example
+ /// see the Loop function
+ ///
+ /// Takes in a callback factory function to allow moving callbacks (the factory function
+ /// will only be called if the callback can successfully be added)
+ ///
+ /// Returns true if a callback was actually added and false if the callback failed
+ /// to add because the future was marked complete.
+ template <typename CallbackFactory,
+ typename OnComplete = detail::result_of_t<CallbackFactory()>,
+ typename Callback = WrapOnComplete<OnComplete>>
+ bool TryAddCallback(const CallbackFactory& callback_factory,
+ CallbackOptions opts = CallbackOptions::Defaults()) const {
+ return impl_->TryAddCallback([&]() { return Callback{callback_factory()}; }, opts);
+ }
+
+ template <typename OnSuccess, typename OnFailure>
+ struct ThenOnComplete {
+ static constexpr bool has_no_args =
+ internal::call_traits::argument_count<OnSuccess>::value == 0;
+
+ using ContinuedFuture = detail::ContinueFuture::ForSignature<
+ detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
+
+ static_assert(
+ std::is_same<detail::ContinueFuture::ForSignature<OnFailure && (const Status&)>,
+ ContinuedFuture>::value,
+ "OnSuccess and OnFailure must continue with the same future type");
+
+ struct DummyOnSuccess {
+ void operator()(const T&);
+ };
+ using OnSuccessArg = typename std::decay<internal::call_traits::argument_type<
+ 0, detail::if_has_no_args<OnSuccess, DummyOnSuccess, OnSuccess>>>::type;
+
+ static_assert(
+ !std::is_same<OnSuccessArg, typename EnsureResult<OnSuccessArg>::type>::value,
+ "OnSuccess' argument should not be a Result");
+
+ void operator()(const Result<T>& result) && {
+ detail::ContinueFuture continue_future;
+ if (ARROW_PREDICT_TRUE(result.ok())) {
+ // move on_failure to a(n immediately destroyed) temporary to free its resources
+ ARROW_UNUSED(OnFailure(std::move(on_failure)));
+ continue_future.IgnoringArgsIf(
+ detail::if_has_no_args<OnSuccess, std::true_type, std::false_type>{},
+ std::move(next), std::move(on_success), result.ValueOrDie());
+ } else {
+ ARROW_UNUSED(OnSuccess(std::move(on_success)));
+ continue_future(std::move(next), std::move(on_failure), result.status());
+ }
+ }
+
+ OnSuccess on_success;
+ OnFailure on_failure;
+ ContinuedFuture next;
+ };
+
+ template <typename OnSuccess>
+ struct PassthruOnFailure {
+ using ContinuedFuture = detail::ContinueFuture::ForSignature<
+ detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
+
+ Result<typename ContinuedFuture::ValueType> operator()(const Status& s) { return s; }
+ };
+
+ /// \brief Consumer API: Register a continuation to run when this future completes
+ ///
+ /// The continuation will run in the same thread that called MarkFinished (whatever
+ /// callback is registered with this function will run before MarkFinished returns).
+ /// Avoid long-running callbacks in favor of submitting a task to an Executor and
+ /// returning the future.
+ ///
+ /// Two callbacks are supported:
+  /// - OnSuccess, called with the result (const ValueType&) on successful completion.
+  ///              For an empty future it is called with no arguments.
+ /// - OnFailure, called with the error (const Status&) on failed completion.
+ /// This callback is optional and defaults to a passthru of any errors.
+ ///
+ /// Then() returns a Future whose ValueType is derived from the return type of the
+ /// callbacks. If a callback returns:
+  ///   - void, a Future<> will be returned which will complete successfully as soon
+  ///     as the callback runs.
+ /// - Status, a Future<> will be returned which will complete with the returned Status
+ /// as soon as the callback runs.
+ /// - V or Result<V>, a Future<V> will be returned which will complete with the result
+ /// of invoking the callback as soon as the callback runs.
+ /// - Future<V>, a Future<V> will be returned which will be marked complete when the
+ /// future returned by the callback completes (and will complete with the same
+ /// result).
+ ///
+ /// The continued Future type must be the same for both callbacks.
+ ///
+  /// Note that OnFailure can swallow errors, allowing continued Futures to successfully
+ /// complete even if this Future fails.
+ ///
+ /// If this future is already completed then the callback will be run immediately
+ /// and the returned future may already be marked complete.
+ ///
+ /// See AddCallback for general considerations when writing callbacks.
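+  ///
+  /// For example (a sketch, not from the original docs):
+  ///
+  ///   Future<int> fut = Future<int>::Make();
+  ///   Future<std::string> str_fut =
+  ///       fut.Then([](const int& i) { return std::to_string(i); });
+  ///   fut.MarkFinished(42);  // str_fut completes with "42"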
+ template <typename OnSuccess, typename OnFailure = PassthruOnFailure<OnSuccess>,
+ typename OnComplete = ThenOnComplete<OnSuccess, OnFailure>,
+ typename ContinuedFuture = typename OnComplete::ContinuedFuture>
+ ContinuedFuture Then(OnSuccess on_success, OnFailure on_failure = {},
+ CallbackOptions options = CallbackOptions::Defaults()) const {
+ auto next = ContinuedFuture::Make();
+ AddCallback(OnComplete{std::forward<OnSuccess>(on_success),
+ std::forward<OnFailure>(on_failure), next},
+ options);
+ return next;
+ }
+
+ /// \brief Implicit constructor to create a finished future from a value
+ Future(ValueType val) : Future() { // NOLINT runtime/explicit
+ impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+ SetResult(std::move(val));
+ }
+
+ /// \brief Implicit constructor to create a future from a Result, enabling use
+ /// of macros like ARROW_ASSIGN_OR_RAISE.
+ Future(Result<ValueType> res) : Future() { // NOLINT runtime/explicit
+ if (ARROW_PREDICT_TRUE(res.ok())) {
+ impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+ } else {
+ impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
+ }
+ SetResult(std::move(res));
+ }
+
+ /// \brief Implicit constructor to create a future from a Status, enabling use
+ /// of macros like ARROW_RETURN_NOT_OK.
+ Future(Status s) // NOLINT runtime/explicit
+ : Future(Result<ValueType>(std::move(s))) {}
+
+ protected:
+ void InitializeFromResult(Result<ValueType> res) {
+ if (ARROW_PREDICT_TRUE(res.ok())) {
+ impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+ } else {
+ impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
+ }
+ SetResult(std::move(res));
+ }
+
+ void Initialize() { impl_ = FutureImpl::Make(); }
+
+ Result<ValueType>* GetResult() const { return impl_->CastResult<ValueType>(); }
+
+ void SetResult(Result<ValueType> res) {
+ impl_->result_ = {new Result<ValueType>(std::move(res)),
+ [](void* p) { delete static_cast<Result<ValueType>*>(p); }};
+ }
+
+ void DoMarkFinished(Result<ValueType> res) {
+ SetResult(std::move(res));
+
+ if (ARROW_PREDICT_TRUE(GetResult()->ok())) {
+ impl_->MarkFinished();
+ } else {
+ impl_->MarkFailed();
+ }
+ }
+
+ void CheckValid() const {
+#ifndef NDEBUG
+ if (!is_valid()) {
+ Status::Invalid("Invalid Future (default-initialized?)").Abort();
+ }
+#endif
+ }
+
+ explicit Future(std::shared_ptr<FutureImpl> impl) : impl_(std::move(impl)) {}
+
+ std::shared_ptr<FutureImpl> impl_;
+
+ friend class FutureWaiter;
+ friend struct detail::ContinueFuture;
+
+ template <typename U>
+ friend class Future;
+ friend class WeakFuture<T>;
+
+ FRIEND_TEST(FutureRefTest, ChainRemoved);
+ FRIEND_TEST(FutureRefTest, TailRemoved);
+ FRIEND_TEST(FutureRefTest, HeadRemoved);
+};
+
+template <typename T>
+typename Future<T>::SyncType FutureToSync(const Future<T>& fut) {
+ return fut.result();
+}
+
+template <>
+inline typename Future<internal::Empty>::SyncType FutureToSync<internal::Empty>(
+ const Future<internal::Empty>& fut) {
+ return fut.status();
+}
+
+template <typename T>
+class WeakFuture {
+ public:
+ explicit WeakFuture(const Future<T>& future) : impl_(future.impl_) {}
+
+ Future<T> get() { return Future<T>{impl_.lock()}; }
+
+ private:
+ std::weak_ptr<FutureImpl> impl_;
+};
+
+/// If a Result<Future> holds an error instead of a Future, construct a finished Future
+/// holding that error.
+template <typename T>
+static Future<T> DeferNotOk(Result<Future<T>> maybe_future) {
+ if (ARROW_PREDICT_FALSE(!maybe_future.ok())) {
+ return Future<T>::MakeFinished(std::move(maybe_future).status());
+ }
+ return std::move(maybe_future).MoveValueUnsafe();
+}
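+
+// Typical use (a sketch; `executor` is assumed to be an internal::Executor*
+// whose Submit() returns Result<Future<T>>):
+//
+//   Future<int> fut = DeferNotOk(executor->Submit([] { return 42; }));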
+
+/// \brief Wait for all the futures to end, or for the given timeout to expire.
+///
+/// `true` is returned if all the futures completed before the timeout was reached,
+/// `false` otherwise.
+template <typename T>
+inline bool WaitForAll(const std::vector<Future<T>>& futures,
+ double seconds = FutureWaiter::kInfinity) {
+ auto waiter = FutureWaiter::Make(FutureWaiter::ALL, futures);
+ return waiter->Wait(seconds);
+}
+
+/// \brief Wait for all the futures to end, or for the given timeout to expire.
+///
+/// `true` is returned if all the futures completed before the timeout was reached,
+/// `false` otherwise.
+template <typename T>
+inline bool WaitForAll(const std::vector<Future<T>*>& futures,
+ double seconds = FutureWaiter::kInfinity) {
+ auto waiter = FutureWaiter::Make(FutureWaiter::ALL, futures);
+ return waiter->Wait(seconds);
+}
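+
+// For example (sketch; `futures` is a hypothetical std::vector<Future<int>>):
+//
+//   bool all_done = WaitForAll(futures, /*seconds=*/1.0);
+//   if (!all_done) { /* timed out; futures may still complete later */ }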
+
+/// \brief Create a Future which completes when all of `futures` complete.
+///
+/// The future's result is a vector of the results of `futures`.
+/// Note that this future will never be marked "failed"; failed results
+/// will be stored in the result vector alongside successful results.
+template <typename T>
+Future<std::vector<Result<T>>> All(std::vector<Future<T>> futures) {
+ struct State {
+ explicit State(std::vector<Future<T>> f)
+ : futures(std::move(f)), n_remaining(futures.size()) {}
+
+ std::vector<Future<T>> futures;
+ std::atomic<size_t> n_remaining;
+ };
+
+ if (futures.size() == 0) {
+ return {std::vector<Result<T>>{}};
+ }
+
+ auto state = std::make_shared<State>(std::move(futures));
+
+ auto out = Future<std::vector<Result<T>>>::Make();
+ for (const Future<T>& future : state->futures) {
+ future.AddCallback([state, out](const Result<T>&) mutable {
+ if (state->n_remaining.fetch_sub(1) != 1) return;
+
+ std::vector<Result<T>> results(state->futures.size());
+ for (size_t i = 0; i < results.size(); ++i) {
+ results[i] = state->futures[i].result();
+ }
+ out.MarkFinished(std::move(results));
+ });
+ }
+ return out;
+}
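+
+// For example (sketch; `futures` is a hypothetical std::vector<Future<int>>):
+//
+//   All(std::move(futures)).Then([](const std::vector<Result<int>>& results) {
+//     for (const auto& res : results) {
+//       // each res holds a value or the error of the corresponding future
+//     }
+//   });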
+
+template <>
+inline Future<>::Future(Status s) : Future(internal::Empty::ToResult(std::move(s))) {}
+
+/// \brief Create a Future which completes when all of `futures` complete.
+///
+/// The future will be marked complete if all `futures` complete
+/// successfully. Otherwise, it will be marked failed with the status of
+/// the first failing future.
+ARROW_EXPORT
+Future<> AllComplete(const std::vector<Future<>>& futures);
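+
+// For example (sketch; `tasks` is a hypothetical std::vector<Future<>>):
+//
+//   AllComplete(tasks).Then([]() { /* every task succeeded */ },
+//                           [](const Status& st) { /* first failure */ });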
+
+/// \brief Wait for one of the futures to end, or for the given timeout to expire.
+///
+/// The indices of all completed futures are returned. Note that some futures
+/// may not be in the returned set, but still complete concurrently.
+template <typename T>
+inline std::vector<int> WaitForAny(const std::vector<Future<T>>& futures,
+ double seconds = FutureWaiter::kInfinity) {
+ auto waiter = FutureWaiter::Make(FutureWaiter::ANY, futures);
+ waiter->Wait(seconds);
+ return waiter->MoveFinishedFutures();
+}
+
+/// \brief Wait for one of the futures to end, or for the given timeout to expire.
+///
+/// The indices of all completed futures are returned. Note that some futures
+/// may not be in the returned set, but still complete concurrently.
+template <typename T>
+inline std::vector<int> WaitForAny(const std::vector<Future<T>*>& futures,
+ double seconds = FutureWaiter::kInfinity) {
+ auto waiter = FutureWaiter::Make(FutureWaiter::ANY, futures);
+ waiter->Wait(seconds);
+ return waiter->MoveFinishedFutures();
+}
+
+struct Continue {
+ template <typename T>
+ operator util::optional<T>() && { // NOLINT explicit
+ return {};
+ }
+};
+
+template <typename T = internal::Empty>
+util::optional<T> Break(T break_value = {}) {
+ return util::optional<T>{std::move(break_value)};
+}
+
+template <typename T = internal::Empty>
+using ControlFlow = util::optional<T>;
+
+/// \brief Loop through an asynchronous sequence
+///
+/// \param[in] iterate A generator of Future<ControlFlow<BreakValue>>. On completion
+/// of each yielded future the resulting ControlFlow will be examined. A Break will
+/// terminate the loop, while a Continue will re-invoke `iterate`.
+///
+/// \return A future which will complete when a Future returned by iterate completes with
+/// a Break
+template <typename Iterate,
+ typename Control = typename detail::result_of_t<Iterate()>::ValueType,
+ typename BreakValueType = typename Control::value_type>
+Future<BreakValueType> Loop(Iterate iterate) {
+ struct Callback {
+ bool CheckForTermination(const Result<Control>& control_res) {
+ if (!control_res.ok()) {
+ break_fut.MarkFinished(control_res.status());
+ return true;
+ }
+ if (control_res->has_value()) {
+ break_fut.MarkFinished(**control_res);
+ return true;
+ }
+ return false;
+ }
+
+ void operator()(const Result<Control>& maybe_control) && {
+ if (CheckForTermination(maybe_control)) return;
+
+ auto control_fut = iterate();
+ while (true) {
+ if (control_fut.TryAddCallback([this]() { return *this; })) {
+ // Adding a callback succeeded; control_fut was not finished
+ // and we must wait to CheckForTermination.
+ return;
+ }
+ // Adding a callback failed; control_fut was finished and we
+ // can CheckForTermination immediately. This also avoids recursion and potential
+ // stack overflow.
+ if (CheckForTermination(control_fut.result())) return;
+
+ control_fut = iterate();
+ }
+ }
+
+ Iterate iterate;
+
+    // If a future returned by `iterate` is never completed then we will hang on to
+    // break_fut forever, even if the listener has given up listening on it. Instead we
+    // rely on the fact that a producer (the caller of Future<>::Make) is always
+    // responsible for completing the futures they create.
+    // TODO: Could avoid this kind of situation with "future abandonment", similar to Mesos.
+ Future<BreakValueType> break_fut;
+ };
+
+ auto break_fut = Future<BreakValueType>::Make();
+ auto control_fut = iterate();
+ control_fut.AddCallback(Callback{std::move(iterate), break_fut});
+
+ return break_fut;
+}
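+
+// A usage sketch for Loop (illustrative, not from the original source):
+//
+//   int counter = 0;
+//   Future<int> fut = Loop([&counter]() -> Future<ControlFlow<int>> {
+//     if (++counter == 3) {
+//       return Future<ControlFlow<int>>::MakeFinished(Break(counter));
+//     }
+//     return Future<ControlFlow<int>>::MakeFinished(ControlFlow<int>(Continue()));
+//   });
+//   // fut completes with the break value, 3.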
+
+inline Future<> ToFuture(Status status) {
+ return Future<>::MakeFinished(std::move(status));
+}
+
+template <typename T>
+Future<T> ToFuture(T value) {
+ return Future<T>::MakeFinished(std::move(value));
+}
+
+template <typename T>
+Future<T> ToFuture(Result<T> maybe_value) {
+ return Future<T>::MakeFinished(std::move(maybe_value));
+}
+
+template <typename T>
+Future<T> ToFuture(Future<T> fut) {
+ return std::move(fut);
+}
+
+template <typename T>
+struct EnsureFuture {
+ using type = decltype(ToFuture(std::declval<T>()));
+};
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/hash_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/hash_util.h
new file mode 100644
index 00000000000..dd1c38a7821
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/hash_util.h
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+
+namespace arrow {
+namespace internal {
+
+// ----------------------------------------------------------------------
+// BEGIN Hash utilities from Boost
+
+namespace detail {
+
+#if defined(_MSC_VER)
+#define ARROW_HASH_ROTL32(x, r) _rotl(x, r)
+#else
+#define ARROW_HASH_ROTL32(x, r) (x << r) | (x >> (32 - r))
+#endif
+
+template <typename SizeT>
+inline void hash_combine_impl(SizeT& seed, SizeT value) {
+ seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+inline void hash_combine_impl(uint32_t& h1, uint32_t k1) {
+ const uint32_t c1 = 0xcc9e2d51;
+ const uint32_t c2 = 0x1b873593;
+
+ k1 *= c1;
+ k1 = ARROW_HASH_ROTL32(k1, 15);
+ k1 *= c2;
+
+ h1 ^= k1;
+ h1 = ARROW_HASH_ROTL32(h1, 13);
+ h1 = h1 * 5 + 0xe6546b64;
+}
+
+#undef ARROW_HASH_ROTL32
+
+} // namespace detail
+
+template <class T>
+inline void hash_combine(std::size_t& seed, T const& v) {
+ std::hash<T> hasher;
+ return ::arrow::internal::detail::hash_combine_impl(seed, hasher(v));
+}
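+
+// For example (sketch): combining member hashes into a single seed.
+//
+//   std::size_t seed = 0;
+//   hash_combine(seed, std::string("key"));
+//   hash_combine(seed, 42);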
+
+// END Hash utilities from Boost
+// ----------------------------------------------------------------------
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h
new file mode 100644
index 00000000000..ac1adcfb13e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h
@@ -0,0 +1,886 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Private header, not to be exported
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/builder_binary.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_builders.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+
+#define XXH_INLINE_ALL
+
+#include "contrib/libs/xxhash/xxhash.h" // IWYU pragma: keep
+
+namespace arrow {
+namespace internal {
+
+// XXX would it help to have a 32-bit hash value on large datasets?
+typedef uint64_t hash_t;
+
+// Notes about the choice of a hash function.
+// - XXH3 is extremely fast on most data sizes, from small to huge;
+// faster even than HW CRC-based hashing schemes
+// - our custom hash function for tiny values (< 16 bytes) is still
+// significantly faster (~30%), at least on this machine and compiler
+
+template <uint64_t AlgNum>
+inline hash_t ComputeStringHash(const void* data, int64_t length);
+
+template <typename Scalar, uint64_t AlgNum>
+struct ScalarHelperBase {
+ static bool CompareScalars(Scalar u, Scalar v) { return u == v; }
+
+ static hash_t ComputeHash(const Scalar& value) {
+ // Generic hash computation for scalars. Simply apply the string hash
+ // to the bit representation of the value.
+
+ // XXX in the case of FP values, we'd like equal values to have the same hash,
+ // even if they have different bit representations...
+ return ComputeStringHash<AlgNum>(&value, sizeof(value));
+ }
+};
+
+template <typename Scalar, uint64_t AlgNum = 0, typename Enable = void>
+struct ScalarHelper : public ScalarHelperBase<Scalar, AlgNum> {};
+
+template <typename Scalar, uint64_t AlgNum>
+struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_integral<Scalar>::value>>
+ : public ScalarHelperBase<Scalar, AlgNum> {
+ // ScalarHelper specialization for integers
+
+ static hash_t ComputeHash(const Scalar& value) {
+ // Faster hash computation for integers.
+
+ // Two of xxhash's prime multipliers (which are chosen for their
+ // bit dispersion properties)
+ static constexpr uint64_t multipliers[] = {11400714785074694791ULL,
+ 14029467366897019727ULL};
+
+ // Multiplying by the prime number mixes the low bits into the high bits,
+ // then byte-swapping (which is a single CPU instruction) allows the
+ // combined high and low bits to participate in the initial hash table index.
+ auto h = static_cast<hash_t>(value);
+ return BitUtil::ByteSwap(multipliers[AlgNum] * h);
+ }
+};
+
+template <typename Scalar, uint64_t AlgNum>
+struct ScalarHelper<Scalar, AlgNum,
+ enable_if_t<std::is_same<util::string_view, Scalar>::value>>
+ : public ScalarHelperBase<Scalar, AlgNum> {
+ // ScalarHelper specialization for util::string_view
+
+ static hash_t ComputeHash(const util::string_view& value) {
+ return ComputeStringHash<AlgNum>(value.data(), static_cast<int64_t>(value.size()));
+ }
+};
+
+template <typename Scalar, uint64_t AlgNum>
+struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_floating_point<Scalar>::value>>
+ : public ScalarHelperBase<Scalar, AlgNum> {
+ // ScalarHelper specialization for reals
+
+ static bool CompareScalars(Scalar u, Scalar v) {
+ if (std::isnan(u)) {
+ // XXX should we do a bit-precise comparison?
+ return std::isnan(v);
+ }
+ return u == v;
+ }
+};
+
+template <uint64_t AlgNum = 0>
+hash_t ComputeStringHash(const void* data, int64_t length) {
+ if (ARROW_PREDICT_TRUE(length <= 16)) {
+ // Specialize for small hash strings, as they are quite common as
+ // hash table keys. Even XXH3 isn't quite as fast.
+ auto p = reinterpret_cast<const uint8_t*>(data);
+ auto n = static_cast<uint32_t>(length);
+ if (n <= 8) {
+ if (n <= 3) {
+ if (n == 0) {
+ return 1U;
+ }
+ uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1];
+ return ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
+ }
+ // 4 <= length <= 8
+ // We can read the string as two overlapping 32-bit ints, apply
+ // different hash functions to each of them in parallel, then XOR
+ // the results
+ uint32_t x, y;
+ hash_t hx, hy;
+ x = util::SafeLoadAs<uint32_t>(p + n - 4);
+ y = util::SafeLoadAs<uint32_t>(p);
+ hx = ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
+ hy = ScalarHelper<uint32_t, AlgNum ^ 1>::ComputeHash(y);
+ return n ^ hx ^ hy;
+ }
+ // 8 <= length <= 16
+ // Apply the same principle as above
+ uint64_t x, y;
+ hash_t hx, hy;
+ x = util::SafeLoadAs<uint64_t>(p + n - 8);
+ y = util::SafeLoadAs<uint64_t>(p);
+ hx = ScalarHelper<uint64_t, AlgNum>::ComputeHash(x);
+ hy = ScalarHelper<uint64_t, AlgNum ^ 1>::ComputeHash(y);
+ return n ^ hx ^ hy;
+ }
+
+#if XXH3_SECRET_SIZE_MIN != 136
+#error XXH3_SECRET_SIZE_MIN changed, please fix kXxh3Secrets
+#endif
+
+ // XXH3_64bits_withSeed generates a secret based on the seed, which is too slow.
+ // Instead, we use hard-coded random secrets. To maximize cache efficiency,
+ // they reuse the same memory area.
+ static constexpr unsigned char kXxh3Secrets[XXH3_SECRET_SIZE_MIN + 1] = {
+ 0xe7, 0x8b, 0x13, 0xf9, 0xfc, 0xb5, 0x8e, 0xef, 0x81, 0x48, 0x2c, 0xbf, 0xf9, 0x9f,
+ 0xc1, 0x1e, 0x43, 0x6d, 0xbf, 0xa6, 0x6d, 0xb5, 0x72, 0xbc, 0x97, 0xd8, 0x61, 0x24,
+ 0x0f, 0x12, 0xe3, 0x05, 0x21, 0xf7, 0x5c, 0x66, 0x67, 0xa5, 0x65, 0x03, 0x96, 0x26,
+ 0x69, 0xd8, 0x29, 0x20, 0xf8, 0xc7, 0xb0, 0x3d, 0xdd, 0x7d, 0x18, 0xa0, 0x60, 0x75,
+ 0x92, 0xa4, 0xce, 0xba, 0xc0, 0x77, 0xf4, 0xac, 0xb7, 0x03, 0x53, 0xf0, 0x98, 0xce,
+ 0xe6, 0x2b, 0x20, 0xc7, 0x82, 0x91, 0xab, 0xbf, 0x68, 0x5c, 0x62, 0x4d, 0x33, 0xa3,
+ 0xe1, 0xb3, 0xff, 0x97, 0x54, 0x4c, 0x44, 0x34, 0xb5, 0xb9, 0x32, 0x4c, 0x75, 0x42,
+ 0x89, 0x53, 0x94, 0xd4, 0x9f, 0x2b, 0x76, 0x4d, 0x4e, 0xe6, 0xfa, 0x15, 0x3e, 0xc1,
+ 0xdb, 0x71, 0x4b, 0x2c, 0x94, 0xf5, 0xfc, 0x8c, 0x89, 0x4b, 0xfb, 0xc1, 0x82, 0xa5,
+ 0x6a, 0x53, 0xf9, 0x4a, 0xba, 0xce, 0x1f, 0xc0, 0x97, 0x1a, 0x87};
+
+ static_assert(AlgNum < 2, "AlgNum too large");
+ static constexpr auto secret = kXxh3Secrets + AlgNum;
+ return XXH3_64bits_withSecret(data, static_cast<size_t>(length), secret,
+ XXH3_SECRET_SIZE_MIN);
+}
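+
+// Worked example (illustrative): for a 6-byte key, the small-string branch
+// above loads two overlapping 32-bit words, y = bytes [0..3] and
+// x = bytes [2..5], hashes each with a different multiplier (AlgNum vs.
+// AlgNum ^ 1), and combines them as n ^ hx ^ hy, so every input byte
+// influences the result at the cost of only two unaligned loads.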
+
+// XXX add a HashEq<ArrowType> struct with both hash and compare functions?
+
+// ----------------------------------------------------------------------
+// An open-addressing insert-only hash table (no deletes)
+
+template <typename Payload>
+class HashTable {
+ public:
+ static constexpr hash_t kSentinel = 0ULL;
+ static constexpr int64_t kLoadFactor = 2UL;
+
+ struct Entry {
+ hash_t h;
+ Payload payload;
+
+ // An entry is valid if the hash is different from the sentinel value
+ operator bool() const { return h != kSentinel; }
+ };
+
+ HashTable(MemoryPool* pool, uint64_t capacity) : entries_builder_(pool) {
+ DCHECK_NE(pool, nullptr);
+ // Minimum of 32 elements
+ capacity = std::max<uint64_t>(capacity, 32UL);
+ capacity_ = BitUtil::NextPower2(capacity);
+ capacity_mask_ = capacity_ - 1;
+ size_ = 0;
+
+ DCHECK_OK(UpsizeBuffer(capacity_));
+ }
+
+ // Lookup with non-linear probing
+ // cmp_func should have signature bool(const Payload*).
+ // Return a (Entry*, found) pair.
+ template <typename CmpFunc>
+ std::pair<Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) {
+ auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
+ std::forward<CmpFunc>(cmp_func));
+ return {&entries_[p.first], p.second};
+ }
+
+ template <typename CmpFunc>
+ std::pair<const Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) const {
+ auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
+ std::forward<CmpFunc>(cmp_func));
+ return {&entries_[p.first], p.second};
+ }
+
+ Status Insert(Entry* entry, hash_t h, const Payload& payload) {
+ // Ensure entry is empty before inserting
+ assert(!*entry);
+ entry->h = FixHash(h);
+ entry->payload = payload;
+ ++size_;
+
+ if (ARROW_PREDICT_FALSE(NeedUpsizing())) {
+ // Resize less frequently since it is expensive
+ return Upsize(capacity_ * kLoadFactor * 2);
+ }
+ return Status::OK();
+ }
+
+ uint64_t size() const { return size_; }
+
+ // Visit all non-empty entries in the table
+ // The visit_func should have signature void(const Entry*)
+ template <typename VisitFunc>
+ void VisitEntries(VisitFunc&& visit_func) const {
+ for (uint64_t i = 0; i < capacity_; i++) {
+ const auto& entry = entries_[i];
+ if (entry) {
+ visit_func(&entry);
+ }
+ }
+ }
+
+ protected:
+ // NoCompare is for when the value is known not to exist in the table
+ enum CompareKind { DoCompare, NoCompare };
+
+ // The workhorse lookup function
+ template <CompareKind CKind, typename CmpFunc>
+ std::pair<uint64_t, bool> Lookup(hash_t h, const Entry* entries, uint64_t size_mask,
+ CmpFunc&& cmp_func) const {
+ static constexpr uint8_t perturb_shift = 5;
+
+ uint64_t index, perturb;
+ const Entry* entry;
+
+ h = FixHash(h);
+ index = h & size_mask;
+ perturb = (h >> perturb_shift) + 1U;
+
+ while (true) {
+ entry = &entries[index];
+ if (CompareEntry<CKind, CmpFunc>(h, entry, std::forward<CmpFunc>(cmp_func))) {
+ // Found
+ return {index, true};
+ }
+ if (entry->h == kSentinel) {
+ // Empty slot
+ return {index, false};
+ }
+
+ // Perturbation logic inspired from CPython's set / dict object.
+ // The goal is that all 64 bits of the unmasked hash value eventually
+ // participate in the probing sequence, to minimize clustering.
+ index = (index + perturb) & size_mask;
+ perturb = (perturb >> perturb_shift) + 1U;
+ }
+ }
+
+ template <CompareKind CKind, typename CmpFunc>
+ bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const {
+ if (CKind == NoCompare) {
+ return false;
+ } else {
+ return entry->h == h && cmp_func(&entry->payload);
+ }
+ }
+
+ bool NeedUpsizing() const {
+ // Keep the load factor <= 1/2
+ return size_ * kLoadFactor >= capacity_;
+ }
+
+ Status UpsizeBuffer(uint64_t capacity) {
+ RETURN_NOT_OK(entries_builder_.Resize(capacity));
+ entries_ = entries_builder_.mutable_data();
+ memset(static_cast<void*>(entries_), 0, capacity * sizeof(Entry));
+
+ return Status::OK();
+ }
+
+ Status Upsize(uint64_t new_capacity) {
+ assert(new_capacity > capacity_);
+ uint64_t new_mask = new_capacity - 1;
+ assert((new_capacity & new_mask) == 0); // it's a power of two
+
+ // Stash old entries and seal builder, effectively resetting the Buffer
+ const Entry* old_entries = entries_;
+ ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));
+ // Allocate new buffer
+ RETURN_NOT_OK(UpsizeBuffer(new_capacity));
+
+ for (uint64_t i = 0; i < capacity_; i++) {
+ const auto& entry = old_entries[i];
+ if (entry) {
+ // Dummy compare function will not be called
+ auto p = Lookup<NoCompare>(entry.h, entries_, new_mask,
+ [](const Payload*) { return false; });
+ // Lookup<NoCompare> (and CompareEntry<NoCompare>) ensure that an
+        // empty slot is always returned
+ assert(!p.second);
+ entries_[p.first] = entry;
+ }
+ }
+ capacity_ = new_capacity;
+ capacity_mask_ = new_mask;
+
+ return Status::OK();
+ }
+
+ hash_t FixHash(hash_t h) const { return (h == kSentinel) ? 42U : h; }
+
+ // The number of slots available in the hash table array.
+ uint64_t capacity_;
+ uint64_t capacity_mask_;
+ // The number of used slots in the hash table array.
+ uint64_t size_;
+
+ Entry* entries_;
+ TypedBufferBuilder<Entry> entries_builder_;
+};
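+
+// Minimal usage sketch (hypothetical caller; real callers compare payloads
+// in cmp_func rather than accepting any entry with an equal hash):
+//
+//   struct Payload { int32_t memo_index; };
+//   HashTable<Payload> table(default_memory_pool(), /*capacity=*/32);
+//   hash_t h = ComputeStringHash<0>("key", 3);
+//   auto p = table.Lookup(h, [](const Payload*) { return true; });
+//   if (!p.second) {
+//     DCHECK_OK(table.Insert(p.first, h, Payload{0}));
+//   }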
+
+// XXX typedef memo_index_t int32_t ?
+
+constexpr int32_t kKeyNotFound = -1;
+
+// ----------------------------------------------------------------------
+// A base class for memoization table.
+
+class MemoTable {
+ public:
+ virtual ~MemoTable() = default;
+
+ virtual int32_t size() const = 0;
+};
+
+// ----------------------------------------------------------------------
+// A memoization table for memory-cheap scalar values.
+
+// The memoization table remembers the insertion index of each key and
+// allows looking it up.
+
+template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
+class ScalarMemoTable : public MemoTable {
+ public:
+ explicit ScalarMemoTable(MemoryPool* pool, int64_t entries = 0)
+ : hash_table_(pool, static_cast<uint64_t>(entries)) {}
+
+ int32_t Get(const Scalar& value) const {
+ auto cmp_func = [value](const Payload* payload) -> bool {
+ return ScalarHelper<Scalar, 0>::CompareScalars(payload->value, value);
+ };
+ hash_t h = ComputeHash(value);
+ auto p = hash_table_.Lookup(h, cmp_func);
+ if (p.second) {
+ return p.first->payload.memo_index;
+ } else {
+ return kKeyNotFound;
+ }
+ }
+
+ template <typename Func1, typename Func2>
+ Status GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found,
+ int32_t* out_memo_index) {
+ auto cmp_func = [value](const Payload* payload) -> bool {
+ return ScalarHelper<Scalar, 0>::CompareScalars(value, payload->value);
+ };
+ hash_t h = ComputeHash(value);
+ auto p = hash_table_.Lookup(h, cmp_func);
+ int32_t memo_index;
+ if (p.second) {
+ memo_index = p.first->payload.memo_index;
+ on_found(memo_index);
+ } else {
+ memo_index = size();
+ RETURN_NOT_OK(hash_table_.Insert(p.first, h, {value, memo_index}));
+ on_not_found(memo_index);
+ }
+ *out_memo_index = memo_index;
+ return Status::OK();
+ }
+
+ Status GetOrInsert(const Scalar& value, int32_t* out_memo_index) {
+ return GetOrInsert(
+ value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
+ }
+
+ int32_t GetNull() const { return null_index_; }
+
+ template <typename Func1, typename Func2>
+ int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
+ int32_t memo_index = GetNull();
+ if (memo_index != kKeyNotFound) {
+ on_found(memo_index);
+ } else {
+ null_index_ = memo_index = size();
+ on_not_found(memo_index);
+ }
+ return memo_index;
+ }
+
+ int32_t GetOrInsertNull() {
+ return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
+ }
+
+  // The number of entries in the memo table, plus one if null was added
+ // (which is also 1 + the largest memo index)
+ int32_t size() const override {
+ return static_cast<int32_t>(hash_table_.size()) + (GetNull() != kKeyNotFound);
+ }
+
+ // Copy values starting from index `start` into `out_data`
+ void CopyValues(int32_t start, Scalar* out_data) const {
+ hash_table_.VisitEntries([=](const HashTableEntry* entry) {
+ int32_t index = entry->payload.memo_index - start;
+ if (index >= 0) {
+ out_data[index] = entry->payload.value;
+ }
+ });
+ // Zero-initialize the null entry
+ if (null_index_ != kKeyNotFound) {
+ int32_t index = null_index_ - start;
+ if (index >= 0) {
+ out_data[index] = Scalar{};
+ }
+ }
+ }
+
+ void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }
+
+ protected:
+ struct Payload {
+ Scalar value;
+ int32_t memo_index;
+ };
+
+ using HashTableType = HashTableTemplateType<Payload>;
+ using HashTableEntry = typename HashTableType::Entry;
+ HashTableType hash_table_;
+ int32_t null_index_ = kKeyNotFound;
+
+ hash_t ComputeHash(const Scalar& value) const {
+ return ScalarHelper<Scalar, 0>::ComputeHash(value);
+ }
+};
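+
+// Minimal usage sketch (hypothetical caller): memoizing int64 values and
+// recovering the distinct values in insertion order.
+//
+//   ScalarMemoTable<int64_t> table(default_memory_pool());
+//   int32_t index = -1;
+//   DCHECK_OK(table.GetOrInsert(int64_t{42}, &index));  // index == 0
+//   DCHECK_OK(table.GetOrInsert(int64_t{7}, &index));   // index == 1
+//   DCHECK_OK(table.GetOrInsert(int64_t{42}, &index));  // index == 0 again
+//   std::vector<int64_t> values(table.size());
+//   table.CopyValues(values.data());  // values == {42, 7}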
+
+// ----------------------------------------------------------------------
+// A memoization table for small scalar values, using direct indexing
+
+template <typename Scalar, typename Enable = void>
+struct SmallScalarTraits {};
+
+template <>
+struct SmallScalarTraits<bool> {
+ static constexpr int32_t cardinality = 2;
+
+ static uint32_t AsIndex(bool value) { return value ? 1 : 0; }
+};
+
+template <typename Scalar>
+struct SmallScalarTraits<Scalar, enable_if_t<std::is_integral<Scalar>::value>> {
+ using Unsigned = typename std::make_unsigned<Scalar>::type;
+
+ static constexpr int32_t cardinality = 1U + std::numeric_limits<Unsigned>::max();
+
+ static uint32_t AsIndex(Scalar value) { return static_cast<Unsigned>(value); }
+};
+
+template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
+class SmallScalarMemoTable : public MemoTable {
+ public:
+ explicit SmallScalarMemoTable(MemoryPool* pool, int64_t entries = 0) {
+ std::fill(value_to_index_, value_to_index_ + cardinality + 1, kKeyNotFound);
+ index_to_value_.reserve(cardinality);
+ }
+
+ int32_t Get(const Scalar value) const {
+ auto value_index = AsIndex(value);
+ return value_to_index_[value_index];
+ }
+
+ template <typename Func1, typename Func2>
+ Status GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found,
+ int32_t* out_memo_index) {
+ auto value_index = AsIndex(value);
+ auto memo_index = value_to_index_[value_index];
+ if (memo_index == kKeyNotFound) {
+ memo_index = static_cast<int32_t>(index_to_value_.size());
+ index_to_value_.push_back(value);
+ value_to_index_[value_index] = memo_index;
+ DCHECK_LT(memo_index, cardinality + 1);
+ on_not_found(memo_index);
+ } else {
+ on_found(memo_index);
+ }
+ *out_memo_index = memo_index;
+ return Status::OK();
+ }
+
+ Status GetOrInsert(const Scalar value, int32_t* out_memo_index) {
+ return GetOrInsert(
+ value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
+ }
+
+ int32_t GetNull() const { return value_to_index_[cardinality]; }
+
+ template <typename Func1, typename Func2>
+ int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
+ auto memo_index = GetNull();
+ if (memo_index == kKeyNotFound) {
+ memo_index = value_to_index_[cardinality] = size();
+ index_to_value_.push_back(0);
+ on_not_found(memo_index);
+ } else {
+ on_found(memo_index);
+ }
+ return memo_index;
+ }
+
+ int32_t GetOrInsertNull() {
+ return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
+ }
+
+ // The number of entries in the memo table
+ // (which is also 1 + the largest memo index)
+ int32_t size() const override { return static_cast<int32_t>(index_to_value_.size()); }
+
+ // Copy values starting from index `start` into `out_data`
+ void CopyValues(int32_t start, Scalar* out_data) const {
+ DCHECK_GE(start, 0);
+ DCHECK_LE(static_cast<size_t>(start), index_to_value_.size());
+    // index_to_value_.data() is a Scalar*, so index in elements, not bytes
+    memcpy(out_data, index_to_value_.data() + start, (size() - start) * sizeof(Scalar));
+ }
+
+ void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }
+
+ const std::vector<Scalar>& values() const { return index_to_value_; }
+
+ protected:
+ static constexpr auto cardinality = SmallScalarTraits<Scalar>::cardinality;
+ static_assert(cardinality <= 256, "cardinality too large for direct-addressed table");
+
+ uint32_t AsIndex(Scalar value) const {
+ return SmallScalarTraits<Scalar>::AsIndex(value);
+ }
+
+ // The last index is reserved for the null element.
+ int32_t value_to_index_[cardinality + 1];
+ std::vector<Scalar> index_to_value_;
+};
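+
+// Illustrative note: for Scalar = uint8_t the table above is a plain
+// 257-slot array (256 values plus the reserved null slot), so lookups are
+// a single array access with no hashing at all:
+//
+//   SmallScalarMemoTable<uint8_t> table(default_memory_pool());
+//   int32_t index = -1;
+//   DCHECK_OK(table.GetOrInsert(uint8_t{200}, &index));  // index == 0
+//   int32_t found = table.Get(uint8_t{200});             // found == 0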
+
+// ----------------------------------------------------------------------
+// A memoization table for variable-sized binary data.
+
+template <typename BinaryBuilderT>
+class BinaryMemoTable : public MemoTable {
+ public:
+ using builder_offset_type = typename BinaryBuilderT::offset_type;
+ explicit BinaryMemoTable(MemoryPool* pool, int64_t entries = 0,
+ int64_t values_size = -1)
+ : hash_table_(pool, static_cast<uint64_t>(entries)), binary_builder_(pool) {
+ const int64_t data_size = (values_size < 0) ? entries * 4 : values_size;
+ DCHECK_OK(binary_builder_.Resize(entries));
+ DCHECK_OK(binary_builder_.ReserveData(data_size));
+ }
+
+ int32_t Get(const void* data, builder_offset_type length) const {
+ hash_t h = ComputeStringHash<0>(data, length);
+ auto p = Lookup(h, data, length);
+ if (p.second) {
+ return p.first->payload.memo_index;
+ } else {
+ return kKeyNotFound;
+ }
+ }
+
+ int32_t Get(const util::string_view& value) const {
+ return Get(value.data(), static_cast<builder_offset_type>(value.length()));
+ }
+
+ template <typename Func1, typename Func2>
+ Status GetOrInsert(const void* data, builder_offset_type length, Func1&& on_found,
+ Func2&& on_not_found, int32_t* out_memo_index) {
+ hash_t h = ComputeStringHash<0>(data, length);
+ auto p = Lookup(h, data, length);
+ int32_t memo_index;
+ if (p.second) {
+ memo_index = p.first->payload.memo_index;
+ on_found(memo_index);
+ } else {
+ memo_index = size();
+ // Insert string value
+ RETURN_NOT_OK(binary_builder_.Append(static_cast<const char*>(data), length));
+ // Insert hash entry
+ RETURN_NOT_OK(
+ hash_table_.Insert(const_cast<HashTableEntry*>(p.first), h, {memo_index}));
+
+ on_not_found(memo_index);
+ }
+ *out_memo_index = memo_index;
+ return Status::OK();
+ }
+
+ template <typename Func1, typename Func2>
+ Status GetOrInsert(const util::string_view& value, Func1&& on_found,
+ Func2&& on_not_found, int32_t* out_memo_index) {
+ return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
+ std::forward<Func1>(on_found), std::forward<Func2>(on_not_found),
+ out_memo_index);
+ }
+
+ Status GetOrInsert(const void* data, builder_offset_type length,
+ int32_t* out_memo_index) {
+ return GetOrInsert(
+ data, length, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
+ }
+
+ Status GetOrInsert(const util::string_view& value, int32_t* out_memo_index) {
+ return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
+ out_memo_index);
+ }
+
+ int32_t GetNull() const { return null_index_; }
+
+ template <typename Func1, typename Func2>
+ int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
+ int32_t memo_index = GetNull();
+ if (memo_index == kKeyNotFound) {
+ memo_index = null_index_ = size();
+ DCHECK_OK(binary_builder_.AppendNull());
+ on_not_found(memo_index);
+ } else {
+ on_found(memo_index);
+ }
+ return memo_index;
+ }
+
+ int32_t GetOrInsertNull() {
+ return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
+ }
+
+ // The number of entries in the memo table
+ // (which is also 1 + the largest memo index)
+ int32_t size() const override {
+ return static_cast<int32_t>(hash_table_.size() + (GetNull() != kKeyNotFound));
+ }
+
+ int64_t values_size() const { return binary_builder_.value_data_length(); }
+
+ // Copy (n + 1) offsets starting from index `start` into `out_data`
+ template <class Offset>
+ void CopyOffsets(int32_t start, Offset* out_data) const {
+ DCHECK_LE(start, size());
+
+ const builder_offset_type* offsets = binary_builder_.offsets_data();
+ const builder_offset_type delta =
+ start < binary_builder_.length() ? offsets[start] : 0;
+ for (int32_t i = start; i < size(); ++i) {
+ const builder_offset_type adjusted_offset = offsets[i] - delta;
+ Offset cast_offset = static_cast<Offset>(adjusted_offset);
+ assert(static_cast<builder_offset_type>(cast_offset) ==
+ adjusted_offset); // avoid truncation
+ *out_data++ = cast_offset;
+ }
+
+    // Copy the last value, since BinaryBuilder only materializes it in Finish()
+ *out_data = static_cast<Offset>(binary_builder_.value_data_length() - delta);
+ }
+
+ template <class Offset>
+ void CopyOffsets(Offset* out_data) const {
+ CopyOffsets(0, out_data);
+ }
+
+ // Copy values starting from index `start` into `out_data`
+ void CopyValues(int32_t start, uint8_t* out_data) const {
+ CopyValues(start, -1, out_data);
+ }
+
+ // Same as above, but check output size in debug mode
+ void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const {
+ DCHECK_LE(start, size());
+
+ // The absolute byte offset of `start` value in the binary buffer.
+ const builder_offset_type offset = binary_builder_.offset(start);
+ const auto length = binary_builder_.value_data_length() - static_cast<size_t>(offset);
+
+ if (out_size != -1) {
+ assert(static_cast<int64_t>(length) <= out_size);
+ }
+
+ auto view = binary_builder_.GetView(start);
+ memcpy(out_data, view.data(), length);
+ }
+
+ void CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); }
+
+ void CopyValues(int64_t out_size, uint8_t* out_data) const {
+ CopyValues(0, out_size, out_data);
+ }
+
+ void CopyFixedWidthValues(int32_t start, int32_t width_size, int64_t out_size,
+ uint8_t* out_data) const {
+ // This method exists to cope with the fact that the BinaryMemoTable does
+ // not know the fixed width when inserting the null value. The data
+    // buffer holds a zero-length string for the null value (if any).
+ //
+ // Thus, the method will properly inject an empty value of the proper width
+ // in the output buffer.
+ //
+ if (start >= size()) {
+ return;
+ }
+
+ int32_t null_index = GetNull();
+ if (null_index < start) {
+ // Nothing to skip, proceed as usual.
+ CopyValues(start, out_size, out_data);
+ return;
+ }
+
+ builder_offset_type left_offset = binary_builder_.offset(start);
+
+    // Ensure that the data length is exactly width_size bytes short of the
+    // expected output size (n_values * width_size).
+#ifndef NDEBUG
+ int64_t data_length = values_size() - static_cast<size_t>(left_offset);
+ assert(data_length + width_size == out_size);
+ ARROW_UNUSED(data_length);
+#endif
+
+ auto in_data = binary_builder_.value_data() + left_offset;
+    // The null entry has zero length in the data, so slice the data in two
+    // and leave a width_size gap in out_data: [part_1][width_size][part_2]
+ auto null_data_offset = binary_builder_.offset(null_index);
+ auto left_size = null_data_offset - left_offset;
+ if (left_size > 0) {
+      memcpy(out_data, in_data, left_size);  // in_data already starts at left_offset
+ }
+ // Zero-initialize the null entry
+ memset(out_data + left_size, 0, width_size);
+
+ auto right_size = values_size() - static_cast<size_t>(null_data_offset);
+ if (right_size > 0) {
+ // skip the null fixed size value.
+ auto out_offset = left_size + width_size;
+ assert(out_data + out_offset + right_size == out_data + out_size);
+      memcpy(out_data + out_offset, in_data + left_size, right_size);
+ }
+ }
+
+ // Visit the stored values in insertion order.
+ // The visitor function should have the signature `void(util::string_view)`
+ // or `void(const util::string_view&)`.
+ template <typename VisitFunc>
+ void VisitValues(int32_t start, VisitFunc&& visit) const {
+ for (int32_t i = start; i < size(); ++i) {
+ visit(binary_builder_.GetView(i));
+ }
+ }
+
+ protected:
+ struct Payload {
+ int32_t memo_index;
+ };
+
+ using HashTableType = HashTable<Payload>;
+ using HashTableEntry = typename HashTable<Payload>::Entry;
+ HashTableType hash_table_;
+ BinaryBuilderT binary_builder_;
+
+ int32_t null_index_ = kKeyNotFound;
+
+ std::pair<const HashTableEntry*, bool> Lookup(hash_t h, const void* data,
+ builder_offset_type length) const {
+ auto cmp_func = [=](const Payload* payload) {
+ util::string_view lhs = binary_builder_.GetView(payload->memo_index);
+ util::string_view rhs(static_cast<const char*>(data), length);
+ return lhs == rhs;
+ };
+ return hash_table_.Lookup(h, cmp_func);
+ }
+};
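+
+// Minimal usage sketch (hypothetical caller): memoizing strings and dumping
+// dictionary offsets and values, much as a dictionary builder would.
+//
+//   BinaryMemoTable<BinaryBuilder> table(default_memory_pool());
+//   int32_t index = -1;
+//   DCHECK_OK(table.GetOrInsert(util::string_view("foo"), &index));  // 0
+//   DCHECK_OK(table.GetOrInsert(util::string_view("bar"), &index));  // 1
+//   std::vector<int32_t> offsets(table.size() + 1);
+//   table.CopyOffsets(offsets.data());  // offsets == {0, 3, 6}
+//   std::vector<uint8_t> data(table.values_size());
+//   table.CopyValues(data.data());      // data == "foobar"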
+
+template <typename T, typename Enable = void>
+struct HashTraits {};
+
+template <>
+struct HashTraits<BooleanType> {
+ using MemoTableType = SmallScalarMemoTable<bool>;
+};
+
+template <typename T>
+struct HashTraits<T, enable_if_8bit_int<T>> {
+ using c_type = typename T::c_type;
+ using MemoTableType = SmallScalarMemoTable<typename T::c_type>;
+};
+
+template <typename T>
+struct HashTraits<T, enable_if_t<has_c_type<T>::value && !is_8bit_int<T>::value>> {
+ using c_type = typename T::c_type;
+ using MemoTableType = ScalarMemoTable<c_type, HashTable>;
+};
+
+template <typename T>
+struct HashTraits<T, enable_if_t<has_string_view<T>::value &&
+ !std::is_base_of<LargeBinaryType, T>::value>> {
+ using MemoTableType = BinaryMemoTable<BinaryBuilder>;
+};
+
+template <typename T>
+struct HashTraits<T, enable_if_decimal<T>> {
+ using MemoTableType = BinaryMemoTable<BinaryBuilder>;
+};
+
+template <typename T>
+struct HashTraits<T, enable_if_t<std::is_base_of<LargeBinaryType, T>::value>> {
+ using MemoTableType = BinaryMemoTable<LargeBinaryBuilder>;
+};
+
+template <typename MemoTableType>
+static inline Status ComputeNullBitmap(MemoryPool* pool, const MemoTableType& memo_table,
+ int64_t start_offset, int64_t* null_count,
+ std::shared_ptr<Buffer>* null_bitmap) {
+ int64_t dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
+ int64_t null_index = memo_table.GetNull();
+
+ *null_count = 0;
+ *null_bitmap = nullptr;
+
+ if (null_index != kKeyNotFound && null_index >= start_offset) {
+ null_index -= start_offset;
+ *null_count = 1;
+ ARROW_ASSIGN_OR_RAISE(*null_bitmap,
+ internal::BitmapAllButOne(pool, dict_length, null_index));
+ }
+
+ return Status::OK();
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h
new file mode 100644
index 00000000000..1d494671a9f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "arrow/util/config.h"
+#include "arrow/util/macros.h"
+
+#ifndef ARROW_USE_NATIVE_INT128
+#include <boost/multiprecision/cpp_int.hpp>
+#endif
+
+namespace arrow {
+namespace internal {
+
+// NOTE: __int128_t and boost::multiprecision::int128_t are not interchangeable.
+// For example, __int128_t does not have any member function, and does not have
+// operator<<(std::ostream, __int128_t). On the other hand, the behavior of
+// boost::multiprecision::int128_t might be surprising with some configs (e.g.,
+// static_cast<uint64_t>(boost::multiprecision::uint128_t) might return
+// ~uint64_t{0} instead of the lower 64 bits of the input).
+// Try to minimize the usage of int128_t and uint128_t.
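+// If the low 64 bits of a 128-bit value are needed portably, masking before
+// the cast keeps the conversion exact under both implementations
+// (illustrative sketch):
+//
+//   uint64_t Low64Bits(uint128_t x) {
+//     return static_cast<uint64_t>(x & uint128_t(~uint64_t(0)));
+//   }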
+#ifdef ARROW_USE_NATIVE_INT128
+using int128_t = __int128_t;
+using uint128_t = __uint128_t;
+#else
+using boost::multiprecision::int128_t;
+using boost::multiprecision::uint128_t;
+#endif
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc
new file mode 100644
index 00000000000..24c5fe56eff
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc
@@ -0,0 +1,952 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/int_util.h"
+
+#include <algorithm>
+#include <cstring>
+#include <limits>
+
+#include "arrow/array/data.h"
+#include "arrow/datum.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace internal {
+
+using internal::checked_cast;
+
+static constexpr uint64_t max_uint8 =
+ static_cast<uint64_t>(std::numeric_limits<uint8_t>::max());
+static constexpr uint64_t max_uint16 =
+ static_cast<uint64_t>(std::numeric_limits<uint16_t>::max());
+static constexpr uint64_t max_uint32 =
+ static_cast<uint64_t>(std::numeric_limits<uint32_t>::max());
+static constexpr uint64_t max_uint64 = std::numeric_limits<uint64_t>::max();
+
+static constexpr uint64_t mask_uint8 = ~0xffULL;
+static constexpr uint64_t mask_uint16 = ~0xffffULL;
+static constexpr uint64_t mask_uint32 = ~0xffffffffULL;
+
+//
+// Unsigned integer width detection
+//
+
+static const uint64_t max_uints[] = {0, max_uint8, max_uint16, 0, max_uint32,
+ 0, 0, 0, max_uint64};
+
+// Check if we would need to expand the underlying storage type
+static inline uint8_t ExpandedUIntWidth(uint64_t val, uint8_t current_width) {
+ // Optimize for the common case where width doesn't change
+ if (ARROW_PREDICT_TRUE(val <= max_uints[current_width])) {
+ return current_width;
+ }
+ if (current_width == 1 && val <= max_uint8) {
+ return 1;
+ } else if (current_width <= 2 && val <= max_uint16) {
+ return 2;
+ } else if (current_width <= 4 && val <= max_uint32) {
+ return 4;
+ } else {
+ return 8;
+ }
+}
+
+uint8_t DetectUIntWidth(const uint64_t* values, int64_t length, uint8_t min_width) {
+ uint8_t width = min_width;
+ if (min_width < 8) {
+ auto p = values;
+ const auto end = p + length;
+ while (p <= end - 16) {
+ // This is probably SIMD-izable
+ auto u = p[0];
+ auto v = p[1];
+ auto w = p[2];
+ auto x = p[3];
+ u |= p[4];
+ v |= p[5];
+ w |= p[6];
+ x |= p[7];
+ u |= p[8];
+ v |= p[9];
+ w |= p[10];
+ x |= p[11];
+ u |= p[12];
+ v |= p[13];
+ w |= p[14];
+ x |= p[15];
+ p += 16;
+ width = ExpandedUIntWidth(u | v | w | x, width);
+ if (ARROW_PREDICT_FALSE(width == 8)) {
+ break;
+ }
+ }
+ if (p <= end - 8) {
+ auto u = p[0];
+ auto v = p[1];
+ auto w = p[2];
+ auto x = p[3];
+ u |= p[4];
+ v |= p[5];
+ w |= p[6];
+ x |= p[7];
+ p += 8;
+ width = ExpandedUIntWidth(u | v | w | x, width);
+ }
+ while (p < end) {
+ width = ExpandedUIntWidth(*p++, width);
+ }
+ }
+ return width;
+}
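+
+// Worked example (illustrative): DetectUIntWidth over {1, 300, 70000}
+// returns 4, since 70000 exceeds max_uint16 but fits in 32 bits. The
+// 16-element inner loop ORs values together so ExpandedUIntWidth runs once
+// per batch instead of once per element.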
+
+uint8_t DetectUIntWidth(const uint64_t* values, const uint8_t* valid_bytes,
+ int64_t length, uint8_t min_width) {
+ if (valid_bytes == nullptr) {
+ return DetectUIntWidth(values, length, min_width);
+ }
+ uint8_t width = min_width;
+ if (min_width < 8) {
+ auto p = values;
+ const auto end = p + length;
+ auto b = valid_bytes;
+
+#define MASK(p, b, i) ((p)[i] * ((b)[i] != 0))
+
+ while (p <= end - 8) {
+      // This is probably SIMD-izable
+ auto u = MASK(p, b, 0);
+ auto v = MASK(p, b, 1);
+ auto w = MASK(p, b, 2);
+ auto x = MASK(p, b, 3);
+ u |= MASK(p, b, 4);
+ v |= MASK(p, b, 5);
+ w |= MASK(p, b, 6);
+ x |= MASK(p, b, 7);
+ b += 8;
+ p += 8;
+ width = ExpandedUIntWidth(u | v | w | x, width);
+ if (ARROW_PREDICT_FALSE(width == 8)) {
+ break;
+ }
+ }
+ uint64_t mask = 0;
+ while (p < end) {
+ mask |= MASK(p, b, 0);
+ ++b;
+ ++p;
+ }
+ width = ExpandedUIntWidth(mask, width);
+
+#undef MASK
+ }
+ return width;
+}
+
+//
+// Signed integer width detection
+//
+
+uint8_t DetectIntWidth(const int64_t* values, int64_t length, uint8_t min_width) {
+ if (min_width == 8) {
+ return min_width;
+ }
+ uint8_t width = min_width;
+
+ auto p = values;
+ const auto end = p + length;
+ // Strategy: to determine whether `x` is between -0x80 and 0x7f,
+ // we determine whether `x + 0x80` is between 0x00 and 0xff. The
+ // latter can be done with a simple AND mask with ~0xff and, more
+ // importantly, can be computed in a single step over multiple ORed
+ // values (so we can branch once every N items instead of once every item).
+ // This strategy could probably lend itself to explicit SIMD-ization,
+ // if more performance is needed.
+ constexpr uint64_t addend8 = 0x80ULL;
+ constexpr uint64_t addend16 = 0x8000ULL;
+ constexpr uint64_t addend32 = 0x80000000ULL;
+
+ auto test_one_item = [&](uint64_t addend, uint64_t test_mask) -> bool {
+ auto v = *p++;
+ if (ARROW_PREDICT_FALSE(((v + addend) & test_mask) != 0)) {
+ --p;
+ return false;
+ } else {
+ return true;
+ }
+ };
+
+ auto test_four_items = [&](uint64_t addend, uint64_t test_mask) -> bool {
+ auto mask = (p[0] + addend) | (p[1] + addend) | (p[2] + addend) | (p[3] + addend);
+ p += 4;
+ if (ARROW_PREDICT_FALSE((mask & test_mask) != 0)) {
+ p -= 4;
+ return false;
+ } else {
+ return true;
+ }
+ };
+
+ if (width == 1) {
+ while (p <= end - 4) {
+ if (!test_four_items(addend8, mask_uint8)) {
+ width = 2;
+ goto width2;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend8, mask_uint8)) {
+ width = 2;
+ goto width2;
+ }
+ }
+ return 1;
+ }
+width2:
+ if (width == 2) {
+ while (p <= end - 4) {
+ if (!test_four_items(addend16, mask_uint16)) {
+ width = 4;
+ goto width4;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend16, mask_uint16)) {
+ width = 4;
+ goto width4;
+ }
+ }
+ return 2;
+ }
+width4:
+ if (width == 4) {
+ while (p <= end - 4) {
+ if (!test_four_items(addend32, mask_uint32)) {
+ width = 8;
+ goto width8;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend32, mask_uint32)) {
+ width = 8;
+ goto width8;
+ }
+ }
+ return 4;
+ }
+width8:
+ return 8;
+}
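+
+// Worked example (illustrative) of the biasing trick above: for v = -5,
+// v + 0x80 == 0x7B and (0x7B & ~0xffULL) == 0, so -5 fits in one byte;
+// for v = 200, v + 0x80 == 0x148 and (0x148 & ~0xffULL) != 0, so the
+// width must grow to at least 2 bytes.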
+
+uint8_t DetectIntWidth(const int64_t* values, const uint8_t* valid_bytes, int64_t length,
+ uint8_t min_width) {
+ if (valid_bytes == nullptr) {
+ return DetectIntWidth(values, length, min_width);
+ }
+
+ if (min_width == 8) {
+ return min_width;
+ }
+ uint8_t width = min_width;
+
+ auto p = values;
+ const auto end = p + length;
+ auto b = valid_bytes;
+ // Strategy is similar to the no-nulls case above, but we also
+ // have to zero any incoming items that have a zero validity byte.
+ constexpr uint64_t addend8 = 0x80ULL;
+ constexpr uint64_t addend16 = 0x8000ULL;
+ constexpr uint64_t addend32 = 0x80000000ULL;
+
+#define MASK(p, b, addend, i) (((p)[i] + (addend)) * ((b)[i] != 0))
+
+ auto test_one_item = [&](uint64_t addend, uint64_t test_mask) -> bool {
+ auto v = MASK(p, b, addend, 0);
+ ++b;
+ ++p;
+ if (ARROW_PREDICT_FALSE((v & test_mask) != 0)) {
+ --b;
+ --p;
+ return false;
+ } else {
+ return true;
+ }
+ };
+
+ auto test_eight_items = [&](uint64_t addend, uint64_t test_mask) -> bool {
+ auto mask1 = MASK(p, b, addend, 0) | MASK(p, b, addend, 1) | MASK(p, b, addend, 2) |
+ MASK(p, b, addend, 3);
+ auto mask2 = MASK(p, b, addend, 4) | MASK(p, b, addend, 5) | MASK(p, b, addend, 6) |
+ MASK(p, b, addend, 7);
+ b += 8;
+ p += 8;
+ if (ARROW_PREDICT_FALSE(((mask1 | mask2) & test_mask) != 0)) {
+ b -= 8;
+ p -= 8;
+ return false;
+ } else {
+ return true;
+ }
+ };
+
+#undef MASK
+
+ if (width == 1) {
+ while (p <= end - 8) {
+ if (!test_eight_items(addend8, mask_uint8)) {
+ width = 2;
+ goto width2;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend8, mask_uint8)) {
+ width = 2;
+ goto width2;
+ }
+ }
+ return 1;
+ }
+width2:
+ if (width == 2) {
+ while (p <= end - 8) {
+ if (!test_eight_items(addend16, mask_uint16)) {
+ width = 4;
+ goto width4;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend16, mask_uint16)) {
+ width = 4;
+ goto width4;
+ }
+ }
+ return 2;
+ }
+width4:
+ if (width == 4) {
+ while (p <= end - 8) {
+ if (!test_eight_items(addend32, mask_uint32)) {
+ width = 8;
+ goto width8;
+ }
+ }
+ while (p < end) {
+ if (!test_one_item(addend32, mask_uint32)) {
+ width = 8;
+ goto width8;
+ }
+ }
+ return 4;
+ }
+width8:
+ return 8;
+}
+
+template <typename Source, typename Dest>
+static inline void CastIntsInternal(const Source* src, Dest* dest, int64_t length) {
+ while (length >= 4) {
+ dest[0] = static_cast<Dest>(src[0]);
+ dest[1] = static_cast<Dest>(src[1]);
+ dest[2] = static_cast<Dest>(src[2]);
+ dest[3] = static_cast<Dest>(src[3]);
+ length -= 4;
+ src += 4;
+ dest += 4;
+ }
+ while (length > 0) {
+ *dest++ = static_cast<Dest>(*src++);
+ --length;
+ }
+}
+
+void DowncastInts(const int64_t* source, int8_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastInts(const int64_t* source, int16_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastInts(const int64_t* source, int32_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastInts(const int64_t* source, int64_t* dest, int64_t length) {
+ memcpy(dest, source, length * sizeof(int64_t));
+}
+
+void DowncastUInts(const uint64_t* source, uint8_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastUInts(const uint64_t* source, uint16_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) {
+  memcpy(dest, source, length * sizeof(uint64_t));
+}
+
+void UpcastInts(const int32_t* source, int64_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
+template <typename InputInt, typename OutputInt>
+void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length,
+ const int32_t* transpose_map) {
+ while (length >= 4) {
+ dest[0] = static_cast<OutputInt>(transpose_map[src[0]]);
+ dest[1] = static_cast<OutputInt>(transpose_map[src[1]]);
+ dest[2] = static_cast<OutputInt>(transpose_map[src[2]]);
+ dest[3] = static_cast<OutputInt>(transpose_map[src[3]]);
+ length -= 4;
+ src += 4;
+ dest += 4;
+ }
+ while (length > 0) {
+ *dest++ = static_cast<OutputInt>(transpose_map[*src++]);
+ --length;
+ }
+}
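+
+// Usage sketch (hypothetical values): remapping dictionary indices, where
+// transpose_map[old_index] == new_index.
+//
+//   const int8_t src[] = {0, 2, 1, 2};
+//   const int32_t transpose_map[] = {5, 6, 7};
+//   int32_t dest[4];
+//   TransposeInts(src, dest, /*length=*/4, transpose_map);
+//   // dest == {5, 7, 6, 7}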
+
+#define INSTANTIATE(SRC, DEST) \
+ template ARROW_EXPORT void TransposeInts( \
+ const SRC* source, DEST* dest, int64_t length, const int32_t* transpose_map);
+
+#define INSTANTIATE_ALL_DEST(DEST) \
+ INSTANTIATE(uint8_t, DEST) \
+ INSTANTIATE(int8_t, DEST) \
+ INSTANTIATE(uint16_t, DEST) \
+ INSTANTIATE(int16_t, DEST) \
+ INSTANTIATE(uint32_t, DEST) \
+ INSTANTIATE(int32_t, DEST) \
+ INSTANTIATE(uint64_t, DEST) \
+ INSTANTIATE(int64_t, DEST)
+
+#define INSTANTIATE_ALL() \
+ INSTANTIATE_ALL_DEST(uint8_t) \
+ INSTANTIATE_ALL_DEST(int8_t) \
+ INSTANTIATE_ALL_DEST(uint16_t) \
+ INSTANTIATE_ALL_DEST(int16_t) \
+ INSTANTIATE_ALL_DEST(uint32_t) \
+ INSTANTIATE_ALL_DEST(int32_t) \
+ INSTANTIATE_ALL_DEST(uint64_t) \
+ INSTANTIATE_ALL_DEST(int64_t)
+
+INSTANTIATE_ALL()
+
+#undef INSTANTIATE
+#undef INSTANTIATE_ALL
+#undef INSTANTIATE_ALL_DEST
+
+namespace {
+
+template <typename SrcType>
+struct TransposeIntsDest {
+ const SrcType* src;
+ uint8_t* dest;
+ int64_t dest_offset;
+ int64_t length;
+ const int32_t* transpose_map;
+
+ template <typename T>
+ enable_if_integer<T, Status> Visit(const T&) {
+ using DestType = typename T::c_type;
+ TransposeInts(src, reinterpret_cast<DestType*>(dest) + dest_offset, length,
+ transpose_map);
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("TransposeInts received non-integer dest_type");
+ }
+
+ Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
+};
+
+struct TransposeIntsSrc {
+ const uint8_t* src;
+ uint8_t* dest;
+ int64_t src_offset;
+ int64_t dest_offset;
+ int64_t length;
+ const int32_t* transpose_map;
+ const DataType& dest_type;
+
+ template <typename T>
+ enable_if_integer<T, Status> Visit(const T&) {
+ using SrcType = typename T::c_type;
+ return TransposeIntsDest<SrcType>{reinterpret_cast<const SrcType*>(src) + src_offset,
+ dest, dest_offset, length,
+ transpose_map}(dest_type);
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("TransposeInts received non-integer dest_type");
+ }
+
+ Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
+};
+
+} // namespace
+
+Status TransposeInts(const DataType& src_type, const DataType& dest_type,
+ const uint8_t* src, uint8_t* dest, int64_t src_offset,
+ int64_t dest_offset, int64_t length, const int32_t* transpose_map) {
+ TransposeIntsSrc transposer{src, dest, src_offset, dest_offset,
+ length, transpose_map, dest_type};
+ return transposer(src_type);
+}
+
+template <typename T>
+static std::string FormatInt(T val) {
+ return std::to_string(val);
+}
+
+template <typename IndexCType, bool IsSigned = std::is_signed<IndexCType>::value>
+static Status CheckIndexBoundsImpl(const ArrayData& indices, uint64_t upper_limit) {
+  // For unsigned integers, if the upper limit exceeds the maximum value
+  // representable by the index type (notably for UINT8 / UINT16), then
+  // there is no need to boundscheck.
+ if (!IsSigned &&
+ upper_limit > static_cast<uint64_t>(std::numeric_limits<IndexCType>::max())) {
+ return Status::OK();
+ }
+
+ const IndexCType* indices_data = indices.GetValues<IndexCType>(1);
+ const uint8_t* bitmap = nullptr;
+ if (indices.buffers[0]) {
+ bitmap = indices.buffers[0]->data();
+ }
+ auto IsOutOfBounds = [&](IndexCType val) -> bool {
+ return ((IsSigned && val < 0) ||
+ (val >= 0 && static_cast<uint64_t>(val) >= upper_limit));
+ };
+ return VisitSetBitRuns(
+ bitmap, indices.offset, indices.length, [&](int64_t offset, int64_t length) {
+ bool block_out_of_bounds = false;
+ for (int64_t i = 0; i < length; ++i) {
+ block_out_of_bounds |= IsOutOfBounds(indices_data[offset + i]);
+ }
+ if (ARROW_PREDICT_FALSE(block_out_of_bounds)) {
+ for (int64_t i = 0; i < length; ++i) {
+ if (IsOutOfBounds(indices_data[offset + i])) {
+ return Status::IndexError("Index ", FormatInt(indices_data[offset + i]),
+ " out of bounds");
+ }
+ }
+ }
+ return Status::OK();
+ });
+}
+
+/// \brief Branchless boundschecking of the indices. Processes batches of
+/// indices at a time and short-circuits when encountering an out-of-bounds
+/// index in a batch.
+Status CheckIndexBounds(const ArrayData& indices, uint64_t upper_limit) {
+ switch (indices.type->id()) {
+ case Type::INT8:
+ return CheckIndexBoundsImpl<int8_t>(indices, upper_limit);
+ case Type::INT16:
+ return CheckIndexBoundsImpl<int16_t>(indices, upper_limit);
+ case Type::INT32:
+ return CheckIndexBoundsImpl<int32_t>(indices, upper_limit);
+ case Type::INT64:
+ return CheckIndexBoundsImpl<int64_t>(indices, upper_limit);
+ case Type::UINT8:
+ return CheckIndexBoundsImpl<uint8_t>(indices, upper_limit);
+ case Type::UINT16:
+ return CheckIndexBoundsImpl<uint16_t>(indices, upper_limit);
+ case Type::UINT32:
+ return CheckIndexBoundsImpl<uint32_t>(indices, upper_limit);
+ case Type::UINT64:
+ return CheckIndexBoundsImpl<uint64_t>(indices, upper_limit);
+ default:
+ return Status::Invalid("Invalid index type for boundschecking");
+ }
+}
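+
+// Usage sketch (hypothetical caller): validating take/dictionary indices
+// against the length of the array being indexed into.
+//
+//   // `indices` is an ArrayData of some integer type
+//   Status st = CheckIndexBounds(indices, /*upper_limit=*/values_length);
+//   // st is an IndexError naming the first out-of-bounds index, if any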
+
+// ----------------------------------------------------------------------
+// Utilities for casting from one integer type to another
+
+namespace {
+
+template <typename InType, typename CType = typename InType::c_type>
+Status IntegersInRange(const Datum& datum, CType bound_lower, CType bound_upper) {
+ if (std::numeric_limits<CType>::lowest() >= bound_lower &&
+ std::numeric_limits<CType>::max() <= bound_upper) {
+ return Status::OK();
+ }
+
+ auto IsOutOfBounds = [&](CType val) -> bool {
+ return val < bound_lower || val > bound_upper;
+ };
+ auto IsOutOfBoundsMaybeNull = [&](CType val, bool is_valid) -> bool {
+ return is_valid && (val < bound_lower || val > bound_upper);
+ };
+ auto GetErrorMessage = [&](CType val) {
+ return Status::Invalid("Integer value ", FormatInt(val),
+ " not in range: ", FormatInt(bound_lower), " to ",
+ FormatInt(bound_upper));
+ };
+
+ if (datum.kind() == Datum::SCALAR) {
+ const auto& scalar = datum.scalar_as<typename TypeTraits<InType>::ScalarType>();
+ if (IsOutOfBoundsMaybeNull(scalar.value, scalar.is_valid)) {
+ return GetErrorMessage(scalar.value);
+ }
+ return Status::OK();
+ }
+
+ const ArrayData& indices = *datum.array();
+ const CType* indices_data = indices.GetValues<CType>(1);
+ const uint8_t* bitmap = nullptr;
+ if (indices.buffers[0]) {
+ bitmap = indices.buffers[0]->data();
+ }
+ OptionalBitBlockCounter indices_bit_counter(bitmap, indices.offset, indices.length);
+ int64_t position = 0;
+ int64_t offset_position = indices.offset;
+ while (position < indices.length) {
+ BitBlockCount block = indices_bit_counter.NextBlock();
+ bool block_out_of_bounds = false;
+ if (block.popcount == block.length) {
+ // Fast path: branchless
+ int64_t i = 0;
+ for (int64_t chunk = 0; chunk < block.length / 8; ++chunk) {
+ // Let the compiler unroll this
+ for (int j = 0; j < 8; ++j) {
+ block_out_of_bounds |= IsOutOfBounds(indices_data[i++]);
+ }
+ }
+ for (; i < block.length; ++i) {
+ block_out_of_bounds |= IsOutOfBounds(indices_data[i]);
+ }
+ } else if (block.popcount > 0) {
+ // Indices have nulls, must only boundscheck non-null values
+ int64_t i = 0;
+ for (int64_t chunk = 0; chunk < block.length / 8; ++chunk) {
+ // Let the compiler unroll this
+ for (int j = 0; j < 8; ++j) {
+ block_out_of_bounds |= IsOutOfBoundsMaybeNull(
+ indices_data[i], BitUtil::GetBit(bitmap, offset_position + i));
+ ++i;
+ }
+ }
+ for (; i < block.length; ++i) {
+ block_out_of_bounds |= IsOutOfBoundsMaybeNull(
+ indices_data[i], BitUtil::GetBit(bitmap, offset_position + i));
+ }
+ }
+ if (ARROW_PREDICT_FALSE(block_out_of_bounds)) {
+ if (indices.GetNullCount() > 0) {
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (IsOutOfBoundsMaybeNull(indices_data[i],
+ BitUtil::GetBit(bitmap, offset_position + i))) {
+ return GetErrorMessage(indices_data[i]);
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (IsOutOfBounds(indices_data[i])) {
+ return GetErrorMessage(indices_data[i]);
+ }
+ }
+ }
+ }
+ indices_data += block.length;
+ position += block.length;
+ offset_position += block.length;
+ }
+ return Status::OK();
+}
+
+template <typename Type>
+Status CheckIntegersInRangeImpl(const Datum& datum, const Scalar& bound_lower,
+ const Scalar& bound_upper) {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ return IntegersInRange<Type>(datum, checked_cast<const ScalarType&>(bound_lower).value,
+ checked_cast<const ScalarType&>(bound_upper).value);
+}
+
+} // namespace
+
+Status CheckIntegersInRange(const Datum& datum, const Scalar& bound_lower,
+ const Scalar& bound_upper) {
+ Type::type type_id = datum.type()->id();
+
+ if (bound_lower.type->id() != type_id || bound_upper.type->id() != type_id ||
+ !bound_lower.is_valid || !bound_upper.is_valid) {
+ return Status::Invalid("Scalar bound types must be non-null and same type as data");
+ }
+
+ switch (type_id) {
+ case Type::INT8:
+ return CheckIntegersInRangeImpl<Int8Type>(datum, bound_lower, bound_upper);
+ case Type::INT16:
+ return CheckIntegersInRangeImpl<Int16Type>(datum, bound_lower, bound_upper);
+ case Type::INT32:
+ return CheckIntegersInRangeImpl<Int32Type>(datum, bound_lower, bound_upper);
+ case Type::INT64:
+ return CheckIntegersInRangeImpl<Int64Type>(datum, bound_lower, bound_upper);
+ case Type::UINT8:
+ return CheckIntegersInRangeImpl<UInt8Type>(datum, bound_lower, bound_upper);
+ case Type::UINT16:
+ return CheckIntegersInRangeImpl<UInt16Type>(datum, bound_lower, bound_upper);
+ case Type::UINT32:
+ return CheckIntegersInRangeImpl<UInt32Type>(datum, bound_lower, bound_upper);
+ case Type::UINT64:
+ return CheckIntegersInRangeImpl<UInt64Type>(datum, bound_lower, bound_upper);
+ default:
+ return Status::TypeError("Invalid index type for boundschecking");
+ }
+}
+
+namespace {
+
+template <typename O, typename I, typename Enable = void>
+struct is_number_downcast {
+ static constexpr bool value = false;
+};
+
+template <typename O, typename I>
+struct is_number_downcast<
+ O, I, enable_if_t<is_number_type<O>::value && is_number_type<I>::value>> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
+ static constexpr bool value =
+ ((!std::is_same<O, I>::value) &&
+ // Both types are of the same sign-ness.
+ ((std::is_signed<O_T>::value == std::is_signed<I_T>::value) &&
+ // Both types are of the same integral-ness.
+ (std::is_floating_point<O_T>::value == std::is_floating_point<I_T>::value)) &&
+ // Smaller output size
+ (sizeof(O_T) < sizeof(I_T)));
+};
+
+template <typename O, typename I, typename Enable = void>
+struct is_number_upcast {
+ static constexpr bool value = false;
+};
+
+template <typename O, typename I>
+struct is_number_upcast<
+ O, I, enable_if_t<is_number_type<O>::value && is_number_type<I>::value>> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
+ static constexpr bool value =
+ ((!std::is_same<O, I>::value) &&
+ // Both types are of the same sign-ness.
+ ((std::is_signed<O_T>::value == std::is_signed<I_T>::value) &&
+ // Both types are of the same integral-ness.
+ (std::is_floating_point<O_T>::value == std::is_floating_point<I_T>::value)) &&
+ // Larger output size
+ (sizeof(O_T) > sizeof(I_T)));
+};
+
+template <typename O, typename I, typename Enable = void>
+struct is_integral_signed_to_unsigned {
+ static constexpr bool value = false;
+};
+
+template <typename O, typename I>
+struct is_integral_signed_to_unsigned<
+ O, I, enable_if_t<is_integer_type<O>::value && is_integer_type<I>::value>> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
+ static constexpr bool value =
+ ((!std::is_same<O, I>::value) &&
+ ((std::is_unsigned<O_T>::value && std::is_signed<I_T>::value)));
+};
+
+template <typename O, typename I, typename Enable = void>
+struct is_integral_unsigned_to_signed {
+ static constexpr bool value = false;
+};
+
+template <typename O, typename I>
+struct is_integral_unsigned_to_signed<
+ O, I, enable_if_t<is_integer_type<O>::value && is_integer_type<I>::value>> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
+ static constexpr bool value =
+ ((!std::is_same<O, I>::value) &&
+ ((std::is_signed<O_T>::value && std::is_unsigned<I_T>::value)));
+};
+
+// This set of functions SafeMinimum/SafeMaximum would be simplified with
+// C++17 and `if constexpr`.
+
+// clang-format doesn't handle this construct properly. Thus the macro, but it
+// also improves readability.
+//
+// The effective return type of the function is always `I::c_type`, this is
+// just how enable_if works with functions.
+#define RET_TYPE(TRAIT) enable_if_t<TRAIT<O, I>::value, typename I::c_type>
+
+template <typename O, typename I>
+constexpr RET_TYPE(std::is_same) SafeMinimum() {
+ using out_type = typename O::c_type;
+
+ return std::numeric_limits<out_type>::lowest();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(std::is_same) SafeMaximum() {
+ using out_type = typename O::c_type;
+
+ return std::numeric_limits<out_type>::max();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_number_downcast) SafeMinimum() {
+ using out_type = typename O::c_type;
+
+ return std::numeric_limits<out_type>::lowest();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_number_downcast) SafeMaximum() {
+ using out_type = typename O::c_type;
+
+ return std::numeric_limits<out_type>::max();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_number_upcast) SafeMinimum() {
+ using in_type = typename I::c_type;
+ return std::numeric_limits<in_type>::lowest();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_number_upcast) SafeMaximum() {
+ using in_type = typename I::c_type;
+ return std::numeric_limits<in_type>::max();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_unsigned_to_signed) SafeMinimum() {
+ return 0;
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_unsigned_to_signed) SafeMaximum() {
+ using in_type = typename I::c_type;
+ using out_type = typename O::c_type;
+
+ // Equality is missing because in_type::max() > out_type::max() when types
+ // are of the same width.
+ return static_cast<in_type>(sizeof(in_type) < sizeof(out_type)
+ ? std::numeric_limits<in_type>::max()
+ : std::numeric_limits<out_type>::max());
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_signed_to_unsigned) SafeMinimum() {
+ return 0;
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_signed_to_unsigned) SafeMaximum() {
+ using in_type = typename I::c_type;
+ using out_type = typename O::c_type;
+
+ return static_cast<in_type>(sizeof(in_type) <= sizeof(out_type)
+ ? std::numeric_limits<in_type>::max()
+ : std::numeric_limits<out_type>::max());
+}
+
+#undef RET_TYPE
+
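+// Worked example (illustrative): casting int16 data to uint8 yields
+// SafeMinimum<UInt8Type, Int16Type>() == 0 (signed -> unsigned) and
+// SafeMaximum<UInt8Type, Int16Type>() == 255, so IntegersCanFit() below
+// checks that every int16 value lies in [0, 255].
+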
+#define GET_MIN_MAX_CASE(TYPE, OUT_TYPE) \
+ case Type::TYPE: \
+ *min = SafeMinimum<OUT_TYPE, InType>(); \
+ *max = SafeMaximum<OUT_TYPE, InType>(); \
+ break
+
+template <typename InType, typename T = typename InType::c_type>
+void GetSafeMinMax(Type::type out_type, T* min, T* max) {
+ switch (out_type) {
+ GET_MIN_MAX_CASE(INT8, Int8Type);
+ GET_MIN_MAX_CASE(INT16, Int16Type);
+ GET_MIN_MAX_CASE(INT32, Int32Type);
+ GET_MIN_MAX_CASE(INT64, Int64Type);
+ GET_MIN_MAX_CASE(UINT8, UInt8Type);
+ GET_MIN_MAX_CASE(UINT16, UInt16Type);
+ GET_MIN_MAX_CASE(UINT32, UInt32Type);
+ GET_MIN_MAX_CASE(UINT64, UInt64Type);
+ default:
+ break;
+ }
+}
+
+template <typename Type, typename CType = typename Type::c_type,
+ typename ScalarType = typename TypeTraits<Type>::ScalarType>
+Status IntegersCanFitImpl(const Datum& datum, const DataType& target_type) {
+ CType bound_min{}, bound_max{};
+ GetSafeMinMax<Type>(target_type.id(), &bound_min, &bound_max);
+ return CheckIntegersInRange(datum, ScalarType(bound_min), ScalarType(bound_max));
+}
+
+} // namespace
+
+Status IntegersCanFit(const Datum& datum, const DataType& target_type) {
+ if (!is_integer(target_type.id())) {
+ return Status::Invalid("Target type is not an integer type: ", target_type);
+ }
+
+ switch (datum.type()->id()) {
+ case Type::INT8:
+ return IntegersCanFitImpl<Int8Type>(datum, target_type);
+ case Type::INT16:
+ return IntegersCanFitImpl<Int16Type>(datum, target_type);
+ case Type::INT32:
+ return IntegersCanFitImpl<Int32Type>(datum, target_type);
+ case Type::INT64:
+ return IntegersCanFitImpl<Int64Type>(datum, target_type);
+ case Type::UINT8:
+ return IntegersCanFitImpl<UInt8Type>(datum, target_type);
+ case Type::UINT16:
+ return IntegersCanFitImpl<UInt16Type>(datum, target_type);
+ case Type::UINT32:
+ return IntegersCanFitImpl<UInt32Type>(datum, target_type);
+ case Type::UINT64:
+ return IntegersCanFitImpl<UInt64Type>(datum, target_type);
+ default:
+ return Status::TypeError("Invalid index type for boundschecking");
+ }
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h
new file mode 100644
index 00000000000..bf9226cdf12
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h
@@ -0,0 +1,117 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class DataType;
+struct ArrayData;
+struct Datum;
+struct Scalar;
+
+namespace internal {
+
+ARROW_EXPORT
+uint8_t DetectUIntWidth(const uint64_t* values, int64_t length, uint8_t min_width = 1);
+
+ARROW_EXPORT
+uint8_t DetectUIntWidth(const uint64_t* values, const uint8_t* valid_bytes,
+ int64_t length, uint8_t min_width = 1);
+
+ARROW_EXPORT
+uint8_t DetectIntWidth(const int64_t* values, int64_t length, uint8_t min_width = 1);
+
+ARROW_EXPORT
+uint8_t DetectIntWidth(const int64_t* values, const uint8_t* valid_bytes, int64_t length,
+ uint8_t min_width = 1);
+
+ARROW_EXPORT
+void DowncastInts(const int64_t* source, int8_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastInts(const int64_t* source, int16_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastInts(const int64_t* source, int32_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastInts(const int64_t* source, int64_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastUInts(const uint64_t* source, uint8_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastUInts(const uint64_t* source, uint16_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length);
+
+ARROW_EXPORT
+void UpcastInts(const int32_t* source, int64_t* dest, int64_t length);
+
+template <typename InputInt, typename OutputInt>
+inline typename std::enable_if<(sizeof(InputInt) >= sizeof(OutputInt))>::type CastInts(
+ const InputInt* source, OutputInt* dest, int64_t length) {
+ DowncastInts(source, dest, length);
+}
+
+template <typename InputInt, typename OutputInt>
+inline typename std::enable_if<(sizeof(InputInt) < sizeof(OutputInt))>::type CastInts(
+ const InputInt* source, OutputInt* dest, int64_t length) {
+ UpcastInts(source, dest, length);
+}
+
+template <typename InputInt, typename OutputInt>
+ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length,
+ const int32_t* transpose_map);
+
+ARROW_EXPORT
+Status TransposeInts(const DataType& src_type, const DataType& dest_type,
+ const uint8_t* src, uint8_t* dest, int64_t src_offset,
+ int64_t dest_offset, int64_t length, const int32_t* transpose_map);
+
+/// \brief Do vectorized boundschecking of integer-type array indices. The
+/// indices must be nonnegative and strictly less than the passed upper
+/// limit (which is usually the length of an array that is being indexed-into).
+ARROW_EXPORT
+Status CheckIndexBounds(const ArrayData& indices, uint64_t upper_limit);
+
+/// \brief Boundscheck integer values to determine if they are all between the
+/// passed upper and lower limits (inclusive). Upper and lower bounds must be
+/// of the same type as the data and are not currently cast.
+ARROW_EXPORT
+Status CheckIntegersInRange(const Datum& datum, const Scalar& bound_lower,
+ const Scalar& bound_upper);
+
+/// \brief Use CheckIntegersInRange to determine whether the passed integers
+/// can fit safely in the passed integer type. This helps quickly determine if
+/// integer narrowing (e.g. int64->int32) is safe to do.
+ARROW_EXPORT
+Status IntegersCanFit(const Datum& datum, const DataType& target_type);
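+
+// A minimal sketch of the intended use (assuming an existing int64 Datum
+// `datum`; names are illustrative only):
+//
+//   RETURN_NOT_OK(IntegersCanFit(datum, *int32()));
+//   // ... the values can now be narrowed to int32 safely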
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h
new file mode 100644
index 00000000000..4136706629f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h
@@ -0,0 +1,153 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+// "safe-math.h" includes <intsafe.h> from the Windows headers.
+#include "arrow/util/windows_compatibility.h"
+#include "arrow/vendored/portable-snippets/safe-math.h"
+// clang-format off (avoid include reordering)
+#include "arrow/util/windows_fixup.h"
+// clang-format on
+
+namespace arrow {
+namespace internal {
+
+// Define functions AddWithOverflow, SubtractWithOverflow, MultiplyWithOverflow
+// with the signature `bool(T u, T v, T* out)` where T is an integer type.
+// On overflow, these functions return true. Otherwise, false is returned
+// and `out` is updated with the result of the operation.
+
+#define OP_WITH_OVERFLOW(_func_name, _psnip_op, _type, _psnip_type) \
+ static inline bool _func_name(_type u, _type v, _type* out) { \
+ return !psnip_safe_##_psnip_type##_##_psnip_op(out, u, v); \
+ }
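+
+// For instance, OP_WITH_OVERFLOW(AddWithOverflow, add, int32_t, int32)
+// expands to roughly:
+//
+//   static inline bool AddWithOverflow(int32_t u, int32_t v, int32_t* out) {
+//     return !psnip_safe_int32_add(out, u, v);
+//   }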
+
+#define OPS_WITH_OVERFLOW(_func_name, _psnip_op) \
+ OP_WITH_OVERFLOW(_func_name, _psnip_op, int8_t, int8) \
+ OP_WITH_OVERFLOW(_func_name, _psnip_op, int16_t, int16) \
+ OP_WITH_OVERFLOW(_func_name, _psnip_op, int32_t, int32) \
+ OP_WITH_OVERFLOW(_func_name, _psnip_op, int64_t, int64) \
+ OP_WITH_OVERFLOW(_func_name, _psnip_op, uint8_t, uint8) \
+ OP_WITH_OVERFLOW(_func_name, _psnip_op, uint16_t, uint16) \
+ OP_WITH_OVERFLOW(_func_name, _psnip_op, uint32_t, uint32) \
+ OP_WITH_OVERFLOW(_func_name, _psnip_op, uint64_t, uint64)
+
+OPS_WITH_OVERFLOW(AddWithOverflow, add)
+OPS_WITH_OVERFLOW(SubtractWithOverflow, sub)
+OPS_WITH_OVERFLOW(MultiplyWithOverflow, mul)
+OPS_WITH_OVERFLOW(DivideWithOverflow, div)
+
+#undef OP_WITH_OVERFLOW
+#undef OPS_WITH_OVERFLOW
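+
+// Hypothetical usage sketch: check for overflow before trusting the result.
+//
+//   int64_t sum;
+//   if (AddWithOverflow(a, b, &sum)) {
+//     return Status::Invalid("integer overflow in addition");
+//   }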
+
+// Define function NegateWithOverflow with the signature `bool(T u, T* out)`
+// where T is a signed integer type. On overflow, these functions return true.
+// Otherwise, false is returned and `out` is updated with the result of the
+// operation.
+
+#define UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, _type, _psnip_type) \
+ static inline bool _func_name(_type u, _type* out) { \
+ return !psnip_safe_##_psnip_type##_##_psnip_op(out, u); \
+ }
+
+#define SIGNED_UNARY_OPS_WITH_OVERFLOW(_func_name, _psnip_op) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int8_t, int8) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int16_t, int16) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int32_t, int32) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int64_t, int64)
+
+SIGNED_UNARY_OPS_WITH_OVERFLOW(NegateWithOverflow, neg)
+
+#undef UNARY_OP_WITH_OVERFLOW
+#undef SIGNED_UNARY_OPS_WITH_OVERFLOW
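+
+// As with the binary helpers above, e.g. NegateWithOverflow(INT32_MIN, &out)
+// returns true because -INT32_MIN is not representable in int32_t.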
+
+/// Signed addition with well-defined behaviour on overflow (as unsigned)
+template <typename SignedInt>
+SignedInt SafeSignedAdd(SignedInt u, SignedInt v) {
+ using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+ return static_cast<SignedInt>(static_cast<UnsignedInt>(u) +
+ static_cast<UnsignedInt>(v));
+}
+
+/// Signed subtraction with well-defined behaviour on overflow (as unsigned)
+template <typename SignedInt>
+SignedInt SafeSignedSubtract(SignedInt u, SignedInt v) {
+ using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+ return static_cast<SignedInt>(static_cast<UnsignedInt>(u) -
+ static_cast<UnsignedInt>(v));
+}
+
+/// Signed negation with well-defined behaviour on overflow (as unsigned)
+template <typename SignedInt>
+SignedInt SafeSignedNegate(SignedInt u) {
+ using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+ return static_cast<SignedInt>(~static_cast<UnsignedInt>(u) + 1);
+}
+
+/// Signed left shift with well-defined behaviour on negative numbers or overflow
+template <typename SignedInt, typename Shift>
+SignedInt SafeLeftShift(SignedInt u, Shift shift) {
+ using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+ return static_cast<SignedInt>(static_cast<UnsignedInt>(u) << shift);
+}
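+
+// For example, SafeSignedAdd<int32_t>(INT32_MAX, 1) wraps around to INT32_MIN
+// (two's complement semantics) instead of invoking undefined behaviour.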
+
+/// Upcast an integer to the largest possible width (currently 64 bits)
+
+template <typename Integer>
+typename std::enable_if<
+ std::is_integral<Integer>::value && std::is_signed<Integer>::value, int64_t>::type
+UpcastInt(Integer v) {
+ return v;
+}
+
+template <typename Integer>
+typename std::enable_if<
+ std::is_integral<Integer>::value && std::is_unsigned<Integer>::value, uint64_t>::type
+UpcastInt(Integer v) {
+ return v;
+}
+
+static inline Status CheckSliceParams(int64_t object_length, int64_t slice_offset,
+ int64_t slice_length, const char* object_name) {
+ if (ARROW_PREDICT_FALSE(slice_offset < 0)) {
+ return Status::Invalid("Negative ", object_name, " slice offset");
+ }
+ if (ARROW_PREDICT_FALSE(slice_length < 0)) {
+ return Status::Invalid("Negative ", object_name, " slice length");
+ }
+ int64_t offset_plus_length;
+ if (ARROW_PREDICT_FALSE(
+ internal::AddWithOverflow(slice_offset, slice_length, &offset_plus_length))) {
+ return Status::Invalid(object_name, " slice would overflow");
+ }
+  if (ARROW_PREDICT_FALSE(offset_plus_length > object_length)) {
+ return Status::Invalid(object_name, " slice would exceed ", object_name, " length");
+ }
+ return Status::OK();
+}
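+
+// Hypothetical usage sketch, e.g. when validating Array::Slice() arguments:
+//
+//   RETURN_NOT_OK(CheckSliceParams(array.length(), offset, length, "array"));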
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc
new file mode 100644
index 00000000000..f6566ea7e36
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc
@@ -0,0 +1,1685 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Ensure 64-bit off_t for platforms where it matters
+#ifdef _FILE_OFFSET_BITS
+#undef _FILE_OFFSET_BITS
+#endif
+
+#define _FILE_OFFSET_BITS 64
+
+#if defined(sun) || defined(__sun)
+// According to https://bugs.python.org/issue1759169#msg82201, __EXTENSIONS__
+// is the best way to enable modern POSIX APIs, such as posix_madvise(), on Solaris.
+// (see also
+// https://github.com/illumos/illumos-gate/blob/master/usr/src/uts/common/sys/mman.h)
+#undef __EXTENSIONS__
+#define __EXTENSIONS__
+#endif
+
+#include "arrow/util/windows_compatibility.h" // IWYU pragma: keep
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include <fcntl.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h> // IWYU pragma: keep
+
+// ----------------------------------------------------------------------
+// file compatibility stuff
+
+#ifdef _WIN32
+#include <io.h>
+#include <share.h>
+#else // POSIX-like platforms
+#include <dirent.h>
+#endif
+
+#ifdef _WIN32
+#include "arrow/io/mman.h"
+#undef Realloc
+#undef Free
+#else // POSIX-like platforms
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+
+// define max read/write count
+#ifdef _WIN32
+#define ARROW_MAX_IO_CHUNKSIZE INT32_MAX
+#else
+
+#ifdef __APPLE__
+// Due to a macOS bug, read/write calls larger than INT32_MAX bytes can fail,
+// so we cap the maximum I/O chunk size
+#define ARROW_MAX_IO_CHUNKSIZE INT32_MAX
+#else
+// Per the BUGS notes in the Linux read(2)/write(2) manpages, a single call
+// transfers at most 0x7ffff000 bytes
+#define ARROW_MAX_IO_CHUNKSIZE 0x7ffff000
+#endif
+
+#endif
+
+#include "arrow/buffer.h"
+#include "arrow/result.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h"
+
+// For filename conversion
+#if defined(_WIN32)
+#include "arrow/util/utf8.h"
+#endif
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace internal {
+
+namespace {
+
+template <typename CharT>
+std::basic_string<CharT> ReplaceChars(std::basic_string<CharT> s, CharT find, CharT rep) {
+ if (find != rep) {
+ for (size_t i = 0; i < s.length(); ++i) {
+ if (s[i] == find) {
+ s[i] = rep;
+ }
+ }
+ }
+ return s;
+}
+
+Result<NativePathString> StringToNative(const std::string& s) {
+#if _WIN32
+ return ::arrow::util::UTF8ToWideString(s);
+#else
+ return s;
+#endif
+}
+
+#if _WIN32
+Result<std::string> NativeToString(const NativePathString& ws) {
+ return ::arrow::util::WideStringToUTF8(ws);
+}
+#endif
+
+#if _WIN32
+const wchar_t kNativeSep = L'\\';
+const wchar_t kGenericSep = L'/';
+const wchar_t* kAllSeps = L"\\/";
+#else
+const char kNativeSep = '/';
+const char kGenericSep = '/';
+const char* kAllSeps = "/";
+#endif
+
+NativePathString NativeSlashes(NativePathString s) {
+ return ReplaceChars(std::move(s), kGenericSep, kNativeSep);
+}
+
+NativePathString GenericSlashes(NativePathString s) {
+ return ReplaceChars(std::move(s), kNativeSep, kGenericSep);
+}
+
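+// Return the parent path, leaving trailing and repeated separators out of the
+// result. Illustrative examples (using '/' as the separator):
+//   NativeParent("a/b/c")  -> "a/b"
+//   NativeParent("a/b/c/") -> "a/b"
+//   NativeParent("/a")     -> "/"
+//   NativeParent("a")      -> "a"   (no separator: path returned unchanged)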
+NativePathString NativeParent(const NativePathString& s) {
+ auto last_sep = s.find_last_of(kAllSeps);
+ if (last_sep == s.length() - 1) {
+ // Last separator is a trailing separator, skip all trailing separators
+ // and try again
+ auto before_last_seps = s.find_last_not_of(kAllSeps);
+ if (before_last_seps == NativePathString::npos) {
+ // Only separators in path
+ return s;
+ }
+ last_sep = s.find_last_of(kAllSeps, before_last_seps);
+ }
+ if (last_sep == NativePathString::npos) {
+ // No (other) separator in path
+ return s;
+ }
+ // There may be multiple contiguous separators, skip all of them
+ auto before_last_seps = s.find_last_not_of(kAllSeps, last_sep);
+ if (before_last_seps == NativePathString::npos) {
+ // All separators are at start of string, keep them all
+ return s.substr(0, last_sep + 1);
+ } else {
+ return s.substr(0, before_last_seps + 1);
+ }
+}
+
+Status ValidatePath(const std::string& s) {
+ if (s.find_first_of('\0') != std::string::npos) {
+ return Status::Invalid("Embedded NUL char in path: '", s, "'");
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+std::string ErrnoMessage(int errnum) { return std::strerror(errnum); }
+
+#if _WIN32
+std::string WinErrorMessage(int errnum) {
+ char buf[1024];
+ auto nchars = FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, errnum, 0, buf, sizeof(buf), NULL);
+ if (nchars == 0) {
+ // Fallback
+ std::stringstream ss;
+ ss << "Windows error #" << errnum;
+ return ss.str();
+ }
+ return std::string(buf, nchars);
+}
+#endif
+
+namespace {
+
+const char kErrnoDetailTypeId[] = "arrow::ErrnoDetail";
+
+class ErrnoDetail : public StatusDetail {
+ public:
+ explicit ErrnoDetail(int errnum) : errnum_(errnum) {}
+
+ const char* type_id() const override { return kErrnoDetailTypeId; }
+
+ std::string ToString() const override {
+ std::stringstream ss;
+ ss << "[errno " << errnum_ << "] " << ErrnoMessage(errnum_);
+ return ss.str();
+ }
+
+ int errnum() const { return errnum_; }
+
+ protected:
+ int errnum_;
+};
+
+#if _WIN32
+const char kWinErrorDetailTypeId[] = "arrow::WinErrorDetail";
+
+class WinErrorDetail : public StatusDetail {
+ public:
+ explicit WinErrorDetail(int errnum) : errnum_(errnum) {}
+
+ const char* type_id() const override { return kWinErrorDetailTypeId; }
+
+ std::string ToString() const override {
+ std::stringstream ss;
+ ss << "[Windows error " << errnum_ << "] " << WinErrorMessage(errnum_);
+ return ss.str();
+ }
+
+ int errnum() const { return errnum_; }
+
+ protected:
+ int errnum_;
+};
+#endif
+
+const char kSignalDetailTypeId[] = "arrow::SignalDetail";
+
+class SignalDetail : public StatusDetail {
+ public:
+ explicit SignalDetail(int signum) : signum_(signum) {}
+
+ const char* type_id() const override { return kSignalDetailTypeId; }
+
+ std::string ToString() const override {
+ std::stringstream ss;
+ ss << "received signal " << signum_;
+ return ss.str();
+ }
+
+ int signum() const { return signum_; }
+
+ protected:
+ int signum_;
+};
+
+} // namespace
+
+std::shared_ptr<StatusDetail> StatusDetailFromErrno(int errnum) {
+ return std::make_shared<ErrnoDetail>(errnum);
+}
+
+#if _WIN32
+std::shared_ptr<StatusDetail> StatusDetailFromWinError(int errnum) {
+ return std::make_shared<WinErrorDetail>(errnum);
+}
+#endif
+
+std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum) {
+ return std::make_shared<SignalDetail>(signum);
+}
+
+int ErrnoFromStatus(const Status& status) {
+ const auto detail = status.detail();
+ if (detail != nullptr && detail->type_id() == kErrnoDetailTypeId) {
+ return checked_cast<const ErrnoDetail&>(*detail).errnum();
+ }
+ return 0;
+}
+
+int WinErrorFromStatus(const Status& status) {
+#if _WIN32
+ const auto detail = status.detail();
+ if (detail != nullptr && detail->type_id() == kWinErrorDetailTypeId) {
+ return checked_cast<const WinErrorDetail&>(*detail).errnum();
+ }
+#endif
+ return 0;
+}
+
+int SignalFromStatus(const Status& status) {
+ const auto detail = status.detail();
+ if (detail != nullptr && detail->type_id() == kSignalDetailTypeId) {
+ return checked_cast<const SignalDetail&>(*detail).signum();
+ }
+ return 0;
+}
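+
+// Hypothetical usage sketch: statuses built via IOErrorFromErrno() carry an
+// ErrnoDetail that callers can query back (unrelated statuses yield 0).
+//
+//   Status st = IOErrorFromErrno(ENOENT, "Cannot open file");
+//   DCHECK_EQ(ErrnoFromStatus(st), ENOENT);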
+
+//
+// PlatformFilename implementation
+//
+
+struct PlatformFilename::Impl {
+ Impl() = default;
+ explicit Impl(NativePathString p) : native_(NativeSlashes(std::move(p))) {}
+
+ NativePathString native_;
+
+ // '/'-separated
+ NativePathString generic() const { return GenericSlashes(native_); }
+};
+
+PlatformFilename::PlatformFilename() : impl_(new Impl{}) {}
+
+PlatformFilename::~PlatformFilename() {}
+
+PlatformFilename::PlatformFilename(Impl impl) : impl_(new Impl(std::move(impl))) {}
+
+PlatformFilename::PlatformFilename(const PlatformFilename& other)
+ : PlatformFilename(Impl{other.impl_->native_}) {}
+
+PlatformFilename::PlatformFilename(PlatformFilename&& other)
+ : impl_(std::move(other.impl_)) {}
+
+PlatformFilename& PlatformFilename::operator=(const PlatformFilename& other) {
+ this->impl_.reset(new Impl{other.impl_->native_});
+ return *this;
+}
+
+PlatformFilename& PlatformFilename::operator=(PlatformFilename&& other) {
+ this->impl_ = std::move(other.impl_);
+ return *this;
+}
+
+PlatformFilename::PlatformFilename(const NativePathString& path)
+ : PlatformFilename(Impl{path}) {}
+
+PlatformFilename::PlatformFilename(const NativePathString::value_type* path)
+ : PlatformFilename(NativePathString(path)) {}
+
+bool PlatformFilename::operator==(const PlatformFilename& other) const {
+ return impl_->native_ == other.impl_->native_;
+}
+
+bool PlatformFilename::operator!=(const PlatformFilename& other) const {
+ return impl_->native_ != other.impl_->native_;
+}
+
+const NativePathString& PlatformFilename::ToNative() const { return impl_->native_; }
+
+std::string PlatformFilename::ToString() const {
+#if _WIN32
+ auto result = NativeToString(impl_->generic());
+ if (!result.ok()) {
+ std::stringstream ss;
+ ss << "<Unrepresentable filename: " << result.status().ToString() << ">";
+ return ss.str();
+ }
+ return *std::move(result);
+#else
+ return impl_->generic();
+#endif
+}
+
+PlatformFilename PlatformFilename::Parent() const {
+ return PlatformFilename(NativeParent(ToNative()));
+}
+
+Result<PlatformFilename> PlatformFilename::FromString(const std::string& file_name) {
+ RETURN_NOT_OK(ValidatePath(file_name));
+ ARROW_ASSIGN_OR_RAISE(auto ns, StringToNative(file_name));
+ return PlatformFilename(std::move(ns));
+}
+
+PlatformFilename PlatformFilename::Join(const PlatformFilename& child) const {
+ if (impl_->native_.empty() || impl_->native_.back() == kNativeSep) {
+ return PlatformFilename(Impl{impl_->native_ + child.impl_->native_});
+ } else {
+ return PlatformFilename(Impl{impl_->native_ + kNativeSep + child.impl_->native_});
+ }
+}
+
+Result<PlatformFilename> PlatformFilename::Join(const std::string& child_name) const {
+ ARROW_ASSIGN_OR_RAISE(auto child, PlatformFilename::FromString(child_name));
+ return Join(child);
+}
+
+//
+// Filesystem access routines
+//
+
+namespace {
+
+Result<bool> DoCreateDir(const PlatformFilename& dir_path, bool create_parents) {
+#ifdef _WIN32
+ const auto s = dir_path.ToNative().c_str();
+ if (CreateDirectoryW(s, nullptr)) {
+ return true;
+ }
+ int errnum = GetLastError();
+ if (errnum == ERROR_ALREADY_EXISTS) {
+ const auto attrs = GetFileAttributesW(s);
+ if (attrs == INVALID_FILE_ATTRIBUTES || !(attrs & FILE_ATTRIBUTE_DIRECTORY)) {
+ // Note we propagate the original error, not the GetFileAttributesW() error
+ return IOErrorFromWinError(ERROR_ALREADY_EXISTS, "Cannot create directory '",
+ dir_path.ToString(), "': non-directory entry exists");
+ }
+ return false;
+ }
+ if (create_parents && errnum == ERROR_PATH_NOT_FOUND) {
+ auto parent_path = dir_path.Parent();
+ if (parent_path != dir_path) {
+ RETURN_NOT_OK(DoCreateDir(parent_path, create_parents));
+ return DoCreateDir(dir_path, false); // Retry
+ }
+ }
+  return IOErrorFromWinError(errnum, "Cannot create directory '",
+                             dir_path.ToString(), "'");
+#else
+ const auto s = dir_path.ToNative().c_str();
+ if (mkdir(s, S_IRWXU | S_IRWXG | S_IRWXO) == 0) {
+ return true;
+ }
+ if (errno == EEXIST) {
+ struct stat st;
+ if (stat(s, &st) || !S_ISDIR(st.st_mode)) {
+ // Note we propagate the original errno, not the stat() errno
+ return IOErrorFromErrno(EEXIST, "Cannot create directory '", dir_path.ToString(),
+ "': non-directory entry exists");
+ }
+ return false;
+ }
+ if (create_parents && errno == ENOENT) {
+ auto parent_path = dir_path.Parent();
+ if (parent_path != dir_path) {
+ RETURN_NOT_OK(DoCreateDir(parent_path, create_parents));
+ return DoCreateDir(dir_path, false); // Retry
+ }
+ }
+ return IOErrorFromErrno(errno, "Cannot create directory '", dir_path.ToString(), "'");
+#endif
+}
+
+} // namespace
+
+Result<bool> CreateDir(const PlatformFilename& dir_path) {
+ return DoCreateDir(dir_path, false);
+}
+
+Result<bool> CreateDirTree(const PlatformFilename& dir_path) {
+ return DoCreateDir(dir_path, true);
+}
+
+#ifdef _WIN32
+
+namespace {
+
+void FindHandleDeleter(HANDLE* handle) {
+ if (!FindClose(*handle)) {
+ ARROW_LOG(WARNING) << "Cannot close directory handle: "
+ << WinErrorMessage(GetLastError());
+ }
+}
+
+std::wstring PathWithoutTrailingSlash(const PlatformFilename& fn) {
+ std::wstring path = fn.ToNative();
+ while (!path.empty() && path.back() == kNativeSep) {
+ path.pop_back();
+ }
+ return path;
+}
+
+Result<std::vector<WIN32_FIND_DATAW>> ListDirInternal(const PlatformFilename& dir_path) {
+ WIN32_FIND_DATAW find_data;
+ std::wstring pattern = PathWithoutTrailingSlash(dir_path) + L"\\*.*";
+ HANDLE handle = FindFirstFileW(pattern.c_str(), &find_data);
+ if (handle == INVALID_HANDLE_VALUE) {
+ return IOErrorFromWinError(GetLastError(), "Cannot list directory '",
+ dir_path.ToString(), "'");
+ }
+
+ std::unique_ptr<HANDLE, decltype(&FindHandleDeleter)> handle_guard(&handle,
+ FindHandleDeleter);
+
+ std::vector<WIN32_FIND_DATAW> results;
+ do {
+ // Skip "." and ".."
+ if (find_data.cFileName[0] == L'.') {
+ if (find_data.cFileName[1] == L'\0' ||
+ (find_data.cFileName[1] == L'.' && find_data.cFileName[2] == L'\0')) {
+ continue;
+ }
+ }
+ results.push_back(find_data);
+ } while (FindNextFileW(handle, &find_data));
+
+ int errnum = GetLastError();
+ if (errnum != ERROR_NO_MORE_FILES) {
+    return IOErrorFromWinError(errnum, "Cannot list directory '",
+                               dir_path.ToString(), "'");
+ }
+ return results;
+}
+
+Status FindOneFile(const PlatformFilename& fn, WIN32_FIND_DATAW* find_data,
+ bool* exists = nullptr) {
+ HANDLE handle = FindFirstFileW(PathWithoutTrailingSlash(fn).c_str(), find_data);
+ if (handle == INVALID_HANDLE_VALUE) {
+ int errnum = GetLastError();
+ if (exists == nullptr ||
+ (errnum != ERROR_PATH_NOT_FOUND && errnum != ERROR_FILE_NOT_FOUND)) {
+      return IOErrorFromWinError(errnum, "Cannot get information for path '",
+                                 fn.ToString(), "'");
+ }
+ *exists = false;
+ } else {
+ if (exists != nullptr) {
+ *exists = true;
+ }
+ FindHandleDeleter(&handle);
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+Result<std::vector<PlatformFilename>> ListDir(const PlatformFilename& dir_path) {
+ ARROW_ASSIGN_OR_RAISE(auto entries, ListDirInternal(dir_path));
+
+ std::vector<PlatformFilename> results;
+ results.reserve(entries.size());
+ for (const auto& entry : entries) {
+ results.emplace_back(std::wstring(entry.cFileName));
+ }
+ return results;
+}
+
+#else
+
+Result<std::vector<PlatformFilename>> ListDir(const PlatformFilename& dir_path) {
+ DIR* dir = opendir(dir_path.ToNative().c_str());
+ if (dir == nullptr) {
+ return IOErrorFromErrno(errno, "Cannot list directory '", dir_path.ToString(), "'");
+ }
+
+ auto dir_deleter = [](DIR* dir) -> void {
+ if (closedir(dir) != 0) {
+ ARROW_LOG(WARNING) << "Cannot close directory handle: " << ErrnoMessage(errno);
+ }
+ };
+ std::unique_ptr<DIR, decltype(dir_deleter)> dir_guard(dir, dir_deleter);
+
+ std::vector<PlatformFilename> results;
+ errno = 0;
+ struct dirent* entry = readdir(dir);
+ while (entry != nullptr) {
+ std::string path = entry->d_name;
+ if (path != "." && path != "..") {
+ results.emplace_back(std::move(path));
+ }
+ entry = readdir(dir);
+ }
+ if (errno != 0) {
+ return IOErrorFromErrno(errno, "Cannot list directory '", dir_path.ToString(), "'");
+ }
+ return results;
+}
+
+#endif
+
+namespace {
+
+#ifdef _WIN32
+
+Status DeleteDirTreeInternal(const PlatformFilename& dir_path);
+
+// Remove a directory entry that's always a directory
+Status DeleteDirEntryDir(const PlatformFilename& path, const WIN32_FIND_DATAW& entry,
+ bool remove_top_dir = true) {
+ if ((entry.dwFileAttributes & FILE_ATTRIBUTE_REPARSE_POINT) == 0) {
+ // It's a directory that doesn't have a reparse point => recurse
+ RETURN_NOT_OK(DeleteDirTreeInternal(path));
+ }
+ if (remove_top_dir) {
+ // Remove now empty directory or reparse point (e.g. symlink to dir)
+ if (!RemoveDirectoryW(path.ToNative().c_str())) {
+ return IOErrorFromWinError(GetLastError(), "Cannot delete directory entry '",
+ path.ToString(), "': ");
+ }
+ }
+ return Status::OK();
+}
+
+Status DeleteDirEntry(const PlatformFilename& path, const WIN32_FIND_DATAW& entry) {
+ if ((entry.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0) {
+ return DeleteDirEntryDir(path, entry);
+ }
+ // It's a non-directory entry, most likely a regular file
+ if (!DeleteFileW(path.ToNative().c_str())) {
+ return IOErrorFromWinError(GetLastError(), "Cannot delete file '", path.ToString(),
+ "': ");
+ }
+ return Status::OK();
+}
+
+Status DeleteDirTreeInternal(const PlatformFilename& dir_path) {
+ ARROW_ASSIGN_OR_RAISE(auto entries, ListDirInternal(dir_path));
+ for (const auto& entry : entries) {
+ PlatformFilename path = dir_path.Join(PlatformFilename(entry.cFileName));
+ RETURN_NOT_OK(DeleteDirEntry(path, entry));
+ }
+ return Status::OK();
+}
+
+Result<bool> DeleteDirContents(const PlatformFilename& dir_path, bool allow_not_found,
+ bool remove_top_dir) {
+ bool exists = true;
+ WIN32_FIND_DATAW entry;
+ if (allow_not_found) {
+ RETURN_NOT_OK(FindOneFile(dir_path, &entry, &exists));
+ } else {
+ // Will raise if dir_path does not exist
+ RETURN_NOT_OK(FindOneFile(dir_path, &entry));
+ }
+ if (exists) {
+ RETURN_NOT_OK(DeleteDirEntryDir(dir_path, entry, remove_top_dir));
+ }
+ return exists;
+}
+
+#else // POSIX
+
+Status LinkStat(const PlatformFilename& path, struct stat* lst, bool* exists = nullptr) {
+ if (lstat(path.ToNative().c_str(), lst) != 0) {
+ if (exists == nullptr || (errno != ENOENT && errno != ENOTDIR && errno != ELOOP)) {
+ return IOErrorFromErrno(errno, "Cannot get information for path '", path.ToString(),
+ "'");
+ }
+ *exists = false;
+ } else if (exists != nullptr) {
+ *exists = true;
+ }
+ return Status::OK();
+}
+
+Status DeleteDirTreeInternal(const PlatformFilename& dir_path);
+
+Status DeleteDirEntryDir(const PlatformFilename& path, const struct stat& lst,
+ bool remove_top_dir = true) {
+ if (!S_ISLNK(lst.st_mode)) {
+ // Not a symlink => delete contents recursively
+ DCHECK(S_ISDIR(lst.st_mode));
+ RETURN_NOT_OK(DeleteDirTreeInternal(path));
+ if (remove_top_dir && rmdir(path.ToNative().c_str()) != 0) {
+ return IOErrorFromErrno(errno, "Cannot delete directory entry '", path.ToString(),
+ "'");
+ }
+ } else {
+ // Remove symlink
+ if (remove_top_dir && unlink(path.ToNative().c_str()) != 0) {
+ return IOErrorFromErrno(errno, "Cannot delete directory entry '", path.ToString(),
+ "'");
+ }
+ }
+ return Status::OK();
+}
+
+Status DeleteDirEntry(const PlatformFilename& path, const struct stat& lst) {
+ if (S_ISDIR(lst.st_mode)) {
+ return DeleteDirEntryDir(path, lst);
+ }
+ if (unlink(path.ToNative().c_str()) != 0) {
+ return IOErrorFromErrno(errno, "Cannot delete directory entry '", path.ToString(),
+ "'");
+ }
+ return Status::OK();
+}
+
+Status DeleteDirTreeInternal(const PlatformFilename& dir_path) {
+ ARROW_ASSIGN_OR_RAISE(auto children, ListDir(dir_path));
+ for (const auto& child : children) {
+ struct stat lst;
+ PlatformFilename full_path = dir_path.Join(child);
+ RETURN_NOT_OK(LinkStat(full_path, &lst));
+ RETURN_NOT_OK(DeleteDirEntry(full_path, lst));
+ }
+ return Status::OK();
+}
+
+Result<bool> DeleteDirContents(const PlatformFilename& dir_path, bool allow_not_found,
+ bool remove_top_dir) {
+ bool exists = true;
+ struct stat lst;
+ if (allow_not_found) {
+ RETURN_NOT_OK(LinkStat(dir_path, &lst, &exists));
+ } else {
+ // Will raise if dir_path does not exist
+ RETURN_NOT_OK(LinkStat(dir_path, &lst));
+ }
+ if (exists) {
+ if (!S_ISDIR(lst.st_mode) && !S_ISLNK(lst.st_mode)) {
+ return Status::IOError("Cannot delete directory '", dir_path.ToString(),
+ "': not a directory");
+ }
+ RETURN_NOT_OK(DeleteDirEntryDir(dir_path, lst, remove_top_dir));
+ }
+ return exists;
+}
+
+#endif
+
+} // namespace
+
+Result<bool> DeleteDirContents(const PlatformFilename& dir_path, bool allow_not_found) {
+ return DeleteDirContents(dir_path, allow_not_found, /*remove_top_dir=*/false);
+}
+
+Result<bool> DeleteDirTree(const PlatformFilename& dir_path, bool allow_not_found) {
+ return DeleteDirContents(dir_path, allow_not_found, /*remove_top_dir=*/true);
+}
+
+Result<bool> DeleteFile(const PlatformFilename& file_path, bool allow_not_found) {
+#ifdef _WIN32
+ if (DeleteFileW(file_path.ToNative().c_str())) {
+ return true;
+ } else {
+ int errnum = GetLastError();
+ if (!allow_not_found || errnum != ERROR_FILE_NOT_FOUND) {
+      return IOErrorFromWinError(errnum, "Cannot delete file '",
+                                 file_path.ToString(), "'");
+ }
+ }
+#else
+ if (unlink(file_path.ToNative().c_str()) == 0) {
+ return true;
+ } else {
+ if (!allow_not_found || errno != ENOENT) {
+ return IOErrorFromErrno(errno, "Cannot delete file '", file_path.ToString(), "'");
+ }
+ }
+#endif
+ return false;
+}
+
+Result<bool> FileExists(const PlatformFilename& path) {
+#ifdef _WIN32
+ if (GetFileAttributesW(path.ToNative().c_str()) != INVALID_FILE_ATTRIBUTES) {
+ return true;
+ } else {
+ int errnum = GetLastError();
+ if (errnum != ERROR_PATH_NOT_FOUND && errnum != ERROR_FILE_NOT_FOUND) {
+      return IOErrorFromWinError(errnum, "Failed getting information for path '",
+                                 path.ToString(), "'");
+ }
+ return false;
+ }
+#else
+ struct stat st;
+ if (stat(path.ToNative().c_str(), &st) == 0) {
+ return true;
+ } else {
+ if (errno != ENOENT && errno != ENOTDIR) {
+ return IOErrorFromErrno(errno, "Failed getting information for path '",
+ path.ToString(), "'");
+ }
+ return false;
+ }
+#endif
+}
+
+//
+// Functions for creating file descriptors
+//
+
+#define CHECK_LSEEK(retval) \
+ if ((retval) == -1) return Status::IOError("lseek failed");
+
+static inline int64_t lseek64_compat(int fd, int64_t pos, int whence) {
+#if defined(_WIN32)
+ return _lseeki64(fd, pos, whence);
+#else
+ return lseek(fd, pos, whence);
+#endif
+}
+
+static inline Result<int> CheckFileOpResult(int fd_ret, int errno_actual,
+ const PlatformFilename& file_name,
+ const char* opname) {
+ if (fd_ret == -1) {
+#ifdef _WIN32
+ int winerr = GetLastError();
+ if (winerr != ERROR_SUCCESS) {
+      return IOErrorFromWinError(winerr, "Failed to ", opname, " file '",
+                                 file_name.ToString(), "'");
+ }
+#endif
+ return IOErrorFromErrno(errno_actual, "Failed to ", opname, " file '",
+ file_name.ToString(), "'");
+ }
+ return fd_ret;
+}
+
+Result<int> FileOpenReadable(const PlatformFilename& file_name) {
+ int fd, errno_actual;
+#if defined(_WIN32)
+ SetLastError(0);
+ HANDLE file_handle = CreateFileW(file_name.ToNative().c_str(), GENERIC_READ,
+ FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+ OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+
+ DWORD last_error = GetLastError();
+ if (last_error == ERROR_SUCCESS) {
+ errno_actual = 0;
+ fd = _open_osfhandle(reinterpret_cast<intptr_t>(file_handle),
+ _O_RDONLY | _O_BINARY | _O_NOINHERIT);
+ } else {
+ return IOErrorFromWinError(last_error, "Failed to open local file '",
+ file_name.ToString(), "'");
+ }
+#else
+ fd = open(file_name.ToNative().c_str(), O_RDONLY);
+ errno_actual = errno;
+
+ if (fd >= 0) {
+ // open(O_RDONLY) succeeds on directories, check for it
+ struct stat st;
+ int ret = fstat(fd, &st);
+ if (ret == -1) {
+ ARROW_UNUSED(FileClose(fd));
+ // Will propagate error below
+ } else if (S_ISDIR(st.st_mode)) {
+ ARROW_UNUSED(FileClose(fd));
+ return Status::IOError("Cannot open for reading: path '", file_name.ToString(),
+ "' is a directory");
+ }
+ }
+#endif
+
+ return CheckFileOpResult(fd, errno_actual, file_name, "open local");
+}
+
+Result<int> FileOpenWritable(const PlatformFilename& file_name, bool write_only,
+ bool truncate, bool append) {
+ int fd, errno_actual;
+
+#if defined(_WIN32)
+ SetLastError(0);
+ int oflag = _O_CREAT | _O_BINARY | _O_NOINHERIT;
+ DWORD desired_access = GENERIC_WRITE;
+ DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
+ DWORD creation_disposition = OPEN_ALWAYS;
+
+ if (append) {
+ oflag |= _O_APPEND;
+ }
+
+ if (truncate) {
+ oflag |= _O_TRUNC;
+ creation_disposition = CREATE_ALWAYS;
+ }
+
+ if (write_only) {
+ oflag |= _O_WRONLY;
+ } else {
+ oflag |= _O_RDWR;
+ desired_access |= GENERIC_READ;
+ }
+
+ HANDLE file_handle =
+ CreateFileW(file_name.ToNative().c_str(), desired_access, share_mode, NULL,
+ creation_disposition, FILE_ATTRIBUTE_NORMAL, NULL);
+
+ DWORD last_error = GetLastError();
+ if (last_error == ERROR_SUCCESS || last_error == ERROR_ALREADY_EXISTS) {
+ errno_actual = 0;
+ fd = _open_osfhandle(reinterpret_cast<intptr_t>(file_handle), oflag);
+ } else {
+ return IOErrorFromWinError(last_error, "Failed to open local file '",
+ file_name.ToString(), "'");
+ }
+#else
+ int oflag = O_CREAT;
+
+ if (truncate) {
+ oflag |= O_TRUNC;
+ }
+ if (append) {
+ oflag |= O_APPEND;
+ }
+
+ if (write_only) {
+ oflag |= O_WRONLY;
+ } else {
+ oflag |= O_RDWR;
+ }
+
+ fd = open(file_name.ToNative().c_str(), oflag, 0666);
+ errno_actual = errno;
+#endif
+
+ RETURN_NOT_OK(CheckFileOpResult(fd, errno_actual, file_name, "open local"));
+ if (append) {
+ // Seek to end, as O_APPEND does not necessarily do it
+ auto ret = lseek64_compat(fd, 0, SEEK_END);
+ if (ret == -1) {
+ ARROW_UNUSED(FileClose(fd));
+ return Status::IOError("lseek failed");
+ }
+ }
+ return fd;
+}
+
+Result<int64_t> FileTell(int fd) {
+ int64_t current_pos;
+#if defined(_WIN32)
+ current_pos = _telli64(fd);
+ if (current_pos == -1) {
+ return Status::IOError("_telli64 failed");
+ }
+#else
+ current_pos = lseek64_compat(fd, 0, SEEK_CUR);
+ CHECK_LSEEK(current_pos);
+#endif
+ return current_pos;
+}
+
+Result<Pipe> CreatePipe() {
+ int ret;
+ int fd[2];
+#if defined(_WIN32)
+ ret = _pipe(fd, 4096, _O_BINARY);
+#else
+ ret = pipe(fd);
+#endif
+
+ if (ret == -1) {
+ return IOErrorFromErrno(errno, "Error creating pipe");
+ }
+ return Pipe{fd[0], fd[1]};
+}
+
+static Status StatusFromMmapErrno(const char* prefix) {
+#ifdef _WIN32
+ errno = __map_mman_error(GetLastError(), EPERM);
+#endif
+ return IOErrorFromErrno(errno, prefix);
+}
+
+namespace {
+
+int64_t GetPageSizeInternal() {
+#if defined(__APPLE__)
+ return getpagesize();
+#elif defined(_WIN32)
+ SYSTEM_INFO si;
+ GetSystemInfo(&si);
+ return si.dwPageSize;
+#else
+ errno = 0;
+ const auto ret = sysconf(_SC_PAGESIZE);
+ if (ret == -1) {
+ ARROW_LOG(FATAL) << "sysconf(_SC_PAGESIZE) failed: " << ErrnoMessage(errno);
+ }
+ return static_cast<int64_t>(ret);
+#endif
+}
+
+} // namespace
+
+int64_t GetPageSize() {
+ static const int64_t kPageSize = GetPageSizeInternal(); // cache it
+ return kPageSize;
+}
+
+//
+// Compatible way to remap a memory map
+//
+
+Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes,
+ void** new_addr) {
+ // should only be called with writable files
+ *new_addr = MAP_FAILED;
+#ifdef _WIN32
+ // flags are ignored on windows
+ HANDLE fm, h;
+
+ if (!UnmapViewOfFile(addr)) {
+ return StatusFromMmapErrno("UnmapViewOfFile failed");
+ }
+
+ h = reinterpret_cast<HANDLE>(_get_osfhandle(fildes));
+ if (h == INVALID_HANDLE_VALUE) {
+ return StatusFromMmapErrno("Cannot get file handle");
+ }
+
+ uint64_t new_size64 = new_size;
+ LONG new_size_low = static_cast<LONG>(new_size64 & 0xFFFFFFFFUL);
+ LONG new_size_high = static_cast<LONG>((new_size64 >> 32) & 0xFFFFFFFFUL);
+
+ SetFilePointer(h, new_size_low, &new_size_high, FILE_BEGIN);
+ SetEndOfFile(h);
+ fm = CreateFileMapping(h, NULL, PAGE_READWRITE, 0, 0, "");
+ if (fm == NULL) {
+ return StatusFromMmapErrno("CreateFileMapping failed");
+ }
+ *new_addr = MapViewOfFile(fm, FILE_MAP_WRITE, 0, 0, new_size);
+ CloseHandle(fm);
+  if (*new_addr == NULL) {  // check the mapped address, not the out-pointer
+ return StatusFromMmapErrno("MapViewOfFile failed");
+ }
+ return Status::OK();
+#elif defined(__linux__)
+ if (ftruncate(fildes, new_size) == -1) {
+ return StatusFromMmapErrno("ftruncate failed");
+ }
+ *new_addr = mremap(addr, old_size, new_size, MREMAP_MAYMOVE);
+ if (*new_addr == MAP_FAILED) {
+ return StatusFromMmapErrno("mremap failed");
+ }
+ return Status::OK();
+#else
+ // we have to close the mmap first, truncate the file to the new size
+ // and recreate the mmap
+ if (munmap(addr, old_size) == -1) {
+ return StatusFromMmapErrno("munmap failed");
+ }
+ if (ftruncate(fildes, new_size) == -1) {
+ return StatusFromMmapErrno("ftruncate failed");
+ }
+  // we set READ / WRITE flags on the new map, since we could only have
+  // enlarged a RW map in the first place
+ *new_addr = mmap(NULL, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, fildes, 0);
+ if (*new_addr == MAP_FAILED) {
+ return StatusFromMmapErrno("mmap failed");
+ }
+ return Status::OK();
+#endif
+}
+
+Status MemoryAdviseWillNeed(const std::vector<MemoryRegion>& regions) {
+ const auto page_size = static_cast<size_t>(GetPageSize());
+ DCHECK_GT(page_size, 0);
+ const size_t page_mask = ~(page_size - 1);
+ DCHECK_EQ(page_mask & page_size, page_size);
+
+ auto align_region = [=](const MemoryRegion& region) -> MemoryRegion {
+ const auto addr = reinterpret_cast<uintptr_t>(region.addr);
+ const auto aligned_addr = addr & page_mask;
+ DCHECK_LT(addr - aligned_addr, page_size);
+ return {reinterpret_cast<void*>(aligned_addr),
+ region.size + static_cast<size_t>(addr - aligned_addr)};
+ };
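+  // e.g. with 4096-byte pages, {addr=0x12345, size=100} is widened to
+  // {addr=0x12000, size=100 + 0x345} so the advised range stays page-aligned.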
+
+#ifdef _WIN32
+ // PrefetchVirtualMemory() is available on Windows 8 or later
+ struct PrefetchEntry { // Like WIN32_MEMORY_RANGE_ENTRY
+ void* VirtualAddress;
+ size_t NumberOfBytes;
+
+ PrefetchEntry(const MemoryRegion& region) // NOLINT runtime/explicit
+ : VirtualAddress(region.addr), NumberOfBytes(region.size) {}
+ };
+ using PrefetchVirtualMemoryFunc = BOOL (*)(HANDLE, ULONG_PTR, PrefetchEntry*, ULONG);
+ static const auto prefetch_virtual_memory = reinterpret_cast<PrefetchVirtualMemoryFunc>(
+ GetProcAddress(GetModuleHandleW(L"kernel32.dll"), "PrefetchVirtualMemory"));
+ if (prefetch_virtual_memory != nullptr) {
+ std::vector<PrefetchEntry> entries;
+ entries.reserve(regions.size());
+ for (const auto& region : regions) {
+ if (region.size != 0) {
+ entries.emplace_back(align_region(region));
+ }
+ }
+ if (!entries.empty() &&
+ !prefetch_virtual_memory(GetCurrentProcess(),
+ static_cast<ULONG_PTR>(entries.size()), entries.data(),
+ 0)) {
+ return IOErrorFromWinError(GetLastError(), "PrefetchVirtualMemory failed");
+ }
+ }
+ return Status::OK();
+#elif defined(POSIX_MADV_WILLNEED)
+ for (const auto& region : regions) {
+ if (region.size != 0) {
+ const auto aligned = align_region(region);
+ int err = posix_madvise(aligned.addr, aligned.size, POSIX_MADV_WILLNEED);
+ // EBADF can be returned on Linux in the following cases:
+ // - the kernel version is older than 3.9
+ // - the kernel was compiled with CONFIG_SWAP disabled (ARROW-9577)
+ if (err != 0 && err != EBADF) {
+ return IOErrorFromErrno(err, "posix_madvise failed");
+ }
+ }
+ }
+ return Status::OK();
+#else
+ return Status::OK();
+#endif
+}
+
+//
+// Closing files
+//
+
+Status FileClose(int fd) {
+ int ret;
+
+#if defined(_WIN32)
+ ret = static_cast<int>(_close(fd));
+#else
+ ret = static_cast<int>(close(fd));
+#endif
+
+ if (ret == -1) {
+ return Status::IOError("error closing file");
+ }
+ return Status::OK();
+}
+
+//
+// Seeking and telling
+//
+
+Status FileSeek(int fd, int64_t pos, int whence) {
+ int64_t ret = lseek64_compat(fd, pos, whence);
+ CHECK_LSEEK(ret);
+ return Status::OK();
+}
+
+Status FileSeek(int fd, int64_t pos) { return FileSeek(fd, pos, SEEK_SET); }
+
+Result<int64_t> FileGetSize(int fd) {
+#if defined(_WIN32)
+ struct __stat64 st;
+#else
+ struct stat st;
+#endif
+ st.st_size = -1;
+
+#if defined(_WIN32)
+ int ret = _fstat64(fd, &st);
+#else
+ int ret = fstat(fd, &st);
+#endif
+
+ if (ret == -1) {
+ return Status::IOError("error stat()ing file");
+ }
+ if (st.st_size == 0) {
+ // Maybe the file doesn't support getting its size, double-check by
+ // trying to tell() (seekable files usually have a size, while
+ // non-seekable files don't)
+ RETURN_NOT_OK(FileTell(fd));
+ } else if (st.st_size < 0) {
+ return Status::IOError("error getting file size");
+ }
+ return st.st_size;
+}
+
+//
+// Reading data
+//
+
+static inline int64_t pread_compat(int fd, void* buf, int64_t nbytes, int64_t pos) {
+#if defined(_WIN32)
+ HANDLE handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
+ DWORD dwBytesRead = 0;
+ OVERLAPPED overlapped = {0};
+ overlapped.Offset = static_cast<uint32_t>(pos);
+ overlapped.OffsetHigh = static_cast<uint32_t>(pos >> 32);
+
+ // Note: ReadFile() will update the file position
+ BOOL bRet =
+ ReadFile(handle, buf, static_cast<uint32_t>(nbytes), &dwBytesRead, &overlapped);
+ if (bRet || GetLastError() == ERROR_HANDLE_EOF) {
+ return dwBytesRead;
+ } else {
+ return -1;
+ }
+#else
+ return static_cast<int64_t>(
+ pread(fd, buf, static_cast<size_t>(nbytes), static_cast<off_t>(pos)));
+#endif
+}
+
+Result<int64_t> FileRead(int fd, uint8_t* buffer, int64_t nbytes) {
+ int64_t bytes_read = 0;
+
+ while (bytes_read < nbytes) {
+ int64_t chunksize =
+ std::min(static_cast<int64_t>(ARROW_MAX_IO_CHUNKSIZE), nbytes - bytes_read);
+#if defined(_WIN32)
+ int64_t ret =
+ static_cast<int64_t>(_read(fd, buffer, static_cast<uint32_t>(chunksize)));
+#else
+ int64_t ret = static_cast<int64_t>(read(fd, buffer, static_cast<size_t>(chunksize)));
+#endif
+
+ if (ret == -1) {
+ return IOErrorFromErrno(errno, "Error reading bytes from file");
+ }
+ if (ret == 0) {
+ // EOF
+ break;
+ }
+ buffer += ret;
+ bytes_read += ret;
+ }
+ return bytes_read;
+}
+
+Result<int64_t> FileReadAt(int fd, uint8_t* buffer, int64_t position, int64_t nbytes) {
+ int64_t bytes_read = 0;
+
+ while (bytes_read < nbytes) {
+ int64_t chunksize =
+ std::min(static_cast<int64_t>(ARROW_MAX_IO_CHUNKSIZE), nbytes - bytes_read);
+ int64_t ret = pread_compat(fd, buffer, chunksize, position);
+
+ if (ret == -1) {
+ return IOErrorFromErrno(errno, "Error reading bytes from file");
+ }
+ if (ret == 0) {
+ // EOF
+ break;
+ }
+ buffer += ret;
+ position += ret;
+ bytes_read += ret;
+ }
+ return bytes_read;
+}
+
+//
+// Writing data
+//
+
+Status FileWrite(int fd, const uint8_t* buffer, const int64_t nbytes) {
+ int ret = 0;
+ int64_t bytes_written = 0;
+
+ while (ret != -1 && bytes_written < nbytes) {
+ int64_t chunksize =
+ std::min(static_cast<int64_t>(ARROW_MAX_IO_CHUNKSIZE), nbytes - bytes_written);
+#if defined(_WIN32)
+ ret = static_cast<int>(
+ _write(fd, buffer + bytes_written, static_cast<uint32_t>(chunksize)));
+#else
+ ret = static_cast<int>(
+ write(fd, buffer + bytes_written, static_cast<size_t>(chunksize)));
+#endif
+
+ if (ret != -1) {
+ bytes_written += ret;
+ }
+ }
+
+ if (ret == -1) {
+ return IOErrorFromErrno(errno, "Error writing bytes to file");
+ }
+ return Status::OK();
+}
+
+Status FileTruncate(int fd, const int64_t size) {
+ int ret, errno_actual;
+
+#ifdef _WIN32
+ errno_actual = _chsize_s(fd, static_cast<size_t>(size));
+ ret = errno_actual == 0 ? 0 : -1;
+#else
+ ret = ftruncate(fd, static_cast<size_t>(size));
+ errno_actual = errno;
+#endif
+
+ if (ret == -1) {
+    return IOErrorFromErrno(errno_actual, "Error truncating file");
+ }
+ return Status::OK();
+}
+
+//
+// Environment variables
+//
+
+Result<std::string> GetEnvVar(const char* name) {
+#ifdef _WIN32
+ // On Windows, getenv() reads an early copy of the process' environment
+ // which doesn't get updated when SetEnvironmentVariable() is called.
+ constexpr int32_t bufsize = 2000;
+ char c_str[bufsize];
+ auto res = GetEnvironmentVariableA(name, c_str, bufsize);
+ if (res >= bufsize) {
+ return Status::CapacityError("environment variable value too long");
+ } else if (res == 0) {
+ return Status::KeyError("environment variable undefined");
+ }
+ return std::string(c_str);
+#else
+ char* c_str = getenv(name);
+ if (c_str == nullptr) {
+ return Status::KeyError("environment variable undefined");
+ }
+ return std::string(c_str);
+#endif
+}
+
+Result<std::string> GetEnvVar(const std::string& name) { return GetEnvVar(name.c_str()); }
+
+#ifdef _WIN32
+Result<NativePathString> GetEnvVarNative(const std::string& name) {
+ NativePathString w_name;
+ constexpr int32_t bufsize = 2000;
+ wchar_t w_str[bufsize];
+
+ ARROW_ASSIGN_OR_RAISE(w_name, StringToNative(name));
+ auto res = GetEnvironmentVariableW(w_name.c_str(), w_str, bufsize);
+ if (res >= bufsize) {
+ return Status::CapacityError("environment variable value too long");
+ } else if (res == 0) {
+ return Status::KeyError("environment variable undefined");
+ }
+ return NativePathString(w_str);
+}
+
+Result<NativePathString> GetEnvVarNative(const char* name) {
+ return GetEnvVarNative(std::string(name));
+}
+
+#else
+
+Result<NativePathString> GetEnvVarNative(const std::string& name) {
+ return GetEnvVar(name);
+}
+
+Result<NativePathString> GetEnvVarNative(const char* name) { return GetEnvVar(name); }
+#endif
+
+Status SetEnvVar(const char* name, const char* value) {
+#ifdef _WIN32
+ if (SetEnvironmentVariableA(name, value)) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("failed setting environment variable");
+ }
+#else
+ if (setenv(name, value, 1) == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("failed setting environment variable");
+ }
+#endif
+}
+
+Status SetEnvVar(const std::string& name, const std::string& value) {
+ return SetEnvVar(name.c_str(), value.c_str());
+}
+
+Status DelEnvVar(const char* name) {
+#ifdef _WIN32
+ if (SetEnvironmentVariableA(name, nullptr)) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("failed deleting environment variable");
+ }
+#else
+ if (unsetenv(name) == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("failed deleting environment variable");
+ }
+#endif
+}
+
+Status DelEnvVar(const std::string& name) { return DelEnvVar(name.c_str()); }
+
+//
+// Temporary directories
+//
+
+namespace {
+
+#if _WIN32
+NativePathString GetWindowsDirectoryPath() {
+ auto size = GetWindowsDirectoryW(nullptr, 0);
+ ARROW_CHECK_GT(size, 0) << "GetWindowsDirectoryW failed";
+ std::vector<wchar_t> w_str(size);
+ size = GetWindowsDirectoryW(w_str.data(), size);
+ ARROW_CHECK_GT(size, 0) << "GetWindowsDirectoryW failed";
+ return {w_str.data(), size};
+}
+#endif
+
+// Return a list of preferred locations for temporary files
+std::vector<NativePathString> GetPlatformTemporaryDirs() {
+ struct TempDirSelector {
+ std::string env_var;
+ NativePathString path_append;
+ };
+
+ std::vector<TempDirSelector> selectors;
+ NativePathString fallback_tmp;
+
+#if _WIN32
+ selectors = {
+ {"TMP", L""}, {"TEMP", L""}, {"LOCALAPPDATA", L"Temp"}, {"USERPROFILE", L"Temp"}};
+ fallback_tmp = GetWindowsDirectoryPath();
+
+#else
+ selectors = {{"TMPDIR", ""}, {"TMP", ""}, {"TEMP", ""}, {"TEMPDIR", ""}};
+#ifdef __ANDROID__
+ fallback_tmp = "/data/local/tmp";
+#else
+ fallback_tmp = "/tmp";
+#endif
+#endif
+
+ std::vector<NativePathString> temp_dirs;
+ for (const auto& sel : selectors) {
+ auto result = GetEnvVarNative(sel.env_var);
+ if (result.status().IsKeyError()) {
+ // Environment variable absent, skip
+ continue;
+ }
+ if (!result.ok()) {
+ ARROW_LOG(WARNING) << "Failed getting env var '" << sel.env_var
+ << "': " << result.status().ToString();
+ continue;
+ }
+ NativePathString p = *std::move(result);
+ if (p.empty()) {
+ // Environment variable set to empty string, skip
+ continue;
+ }
+ if (sel.path_append.empty()) {
+ temp_dirs.push_back(p);
+ } else {
+ temp_dirs.push_back(p + kNativeSep + sel.path_append);
+ }
+ }
+ temp_dirs.push_back(fallback_tmp);
+ return temp_dirs;
+}
+
+std::string MakeRandomName(int num_chars) {
+ static const std::string chars = "0123456789abcdefghijklmnopqrstuvwxyz";
+ std::default_random_engine gen(
+ static_cast<std::default_random_engine::result_type>(GetRandomSeed()));
+ std::uniform_int_distribution<int> dist(0, static_cast<int>(chars.length() - 1));
+
+ std::string s;
+ s.reserve(num_chars);
+ for (int i = 0; i < num_chars; ++i) {
+ s += chars[dist(gen)];
+ }
+ return s;
+}
+
+} // namespace
+
+Result<std::unique_ptr<TemporaryDir>> TemporaryDir::Make(const std::string& prefix) {
+ const int kNumChars = 8;
+
+ NativePathString base_name;
+
+ auto MakeBaseName = [&]() {
+ std::string suffix = MakeRandomName(kNumChars);
+ return StringToNative(prefix + suffix);
+ };
+
+ auto TryCreatingDirectory =
+ [&](const NativePathString& base_dir) -> Result<std::unique_ptr<TemporaryDir>> {
+ Status st;
+ for (int attempt = 0; attempt < 3; ++attempt) {
+ PlatformFilename fn(base_dir + kNativeSep + base_name + kNativeSep);
+ auto result = CreateDir(fn);
+ if (!result.ok()) {
+ // Probably a permissions error or a non-existing base_dir
+ return nullptr;
+ }
+ if (*result) {
+ return std::unique_ptr<TemporaryDir>(new TemporaryDir(std::move(fn)));
+ }
+ // The random name already exists in base_dir, try with another name
+ st = Status::IOError("Path already exists: '", fn.ToString(), "'");
+ ARROW_ASSIGN_OR_RAISE(base_name, MakeBaseName());
+ }
+ return st;
+ };
+
+ ARROW_ASSIGN_OR_RAISE(base_name, MakeBaseName());
+
+ auto base_dirs = GetPlatformTemporaryDirs();
+ DCHECK_NE(base_dirs.size(), 0);
+
+ for (const auto& base_dir : base_dirs) {
+ ARROW_ASSIGN_OR_RAISE(auto ptr, TryCreatingDirectory(base_dir));
+ if (ptr) {
+ return std::move(ptr);
+ }
+ // Cannot create in this directory, try the next one
+ }
+
+ return Status::IOError(
+ "Cannot create temporary subdirectory in any "
+ "of the platform temporary directories");
+}
+
+TemporaryDir::TemporaryDir(PlatformFilename&& path) : path_(std::move(path)) {}
+
+TemporaryDir::~TemporaryDir() {
+ Status st = DeleteDirTree(path_).status();
+ if (!st.ok()) {
+ ARROW_LOG(WARNING) << "When trying to delete temporary directory: " << st;
+ }
+}
+
+SignalHandler::SignalHandler() : SignalHandler(static_cast<Callback>(nullptr)) {}
+
+SignalHandler::SignalHandler(Callback cb) {
+#if ARROW_HAVE_SIGACTION
+ sa_.sa_handler = cb;
+ sa_.sa_flags = 0;
+ sigemptyset(&sa_.sa_mask);
+#else
+ cb_ = cb;
+#endif
+}
+
+#if ARROW_HAVE_SIGACTION
+SignalHandler::SignalHandler(const struct sigaction& sa) {
+ memcpy(&sa_, &sa, sizeof(sa));
+}
+#endif
+
+SignalHandler::Callback SignalHandler::callback() const {
+#if ARROW_HAVE_SIGACTION
+ return sa_.sa_handler;
+#else
+ return cb_;
+#endif
+}
+
+#if ARROW_HAVE_SIGACTION
+const struct sigaction& SignalHandler::action() const { return sa_; }
+#endif
+
+Result<SignalHandler> GetSignalHandler(int signum) {
+#if ARROW_HAVE_SIGACTION
+ struct sigaction sa;
+ int ret = sigaction(signum, nullptr, &sa);
+ if (ret != 0) {
+ // TODO more detailed message using errno
+ return Status::IOError("sigaction call failed");
+ }
+ return SignalHandler(sa);
+#else
+ // To read the old handler, set the signal handler to something else temporarily
+ SignalHandler::Callback cb = signal(signum, SIG_IGN);
+ if (cb == SIG_ERR || signal(signum, cb) == SIG_ERR) {
+ // TODO more detailed message using errno
+ return Status::IOError("signal call failed");
+ }
+ return SignalHandler(cb);
+#endif
+}
+
+Result<SignalHandler> SetSignalHandler(int signum, const SignalHandler& handler) {
+#if ARROW_HAVE_SIGACTION
+ struct sigaction old_sa;
+ int ret = sigaction(signum, &handler.action(), &old_sa);
+ if (ret != 0) {
+ // TODO more detailed message using errno
+ return Status::IOError("sigaction call failed");
+ }
+ return SignalHandler(old_sa);
+#else
+ SignalHandler::Callback cb = signal(signum, handler.callback());
+ if (cb == SIG_ERR) {
+ // TODO more detailed message using errno
+ return Status::IOError("signal call failed");
+ }
+ return SignalHandler(cb);
+#endif
+}
+
+void ReinstateSignalHandler(int signum, SignalHandler::Callback handler) {
+#if !ARROW_HAVE_SIGACTION
+ // Cannot report any errors from signal() (but there shouldn't be any)
+ signal(signum, handler);
+#endif
+}
+
+Status SendSignal(int signum) {
+ if (raise(signum) == 0) {
+ return Status::OK();
+ }
+ if (errno == EINVAL) {
+ return Status::Invalid("Invalid signal number ", signum);
+ }
+ return IOErrorFromErrno(errno, "Failed to raise signal");
+}
+
+Status SendSignalToThread(int signum, uint64_t thread_id) {
+#ifdef _WIN32
+ return Status::NotImplemented("Cannot send signal to specific thread on Windows");
+#else
+ // Have to use a C-style cast because pthread_t can be a pointer *or* integer type
+ int r = pthread_kill((pthread_t)thread_id, signum); // NOLINT readability-casting
+ if (r == 0) {
+ return Status::OK();
+ }
+ if (r == EINVAL) {
+ return Status::Invalid("Invalid signal number ", signum);
+ }
+ return IOErrorFromErrno(r, "Failed to raise signal");
+#endif
+}
+
+namespace {
+
+int64_t GetPid() {
+#ifdef _WIN32
+ return GetCurrentProcessId();
+#else
+ return getpid();
+#endif
+}
+
+std::mt19937_64 GetSeedGenerator() {
+ // Initialize Mersenne Twister PRNG with a true random seed.
+  // Make sure to mix in the process id to minimize the risk of clashes
+  // during parallel testing.
+#ifdef ARROW_VALGRIND
+ // Valgrind can crash, hang or enter an infinite loop on std::random_device,
+ // use a crude initializer instead.
+ const uint8_t dummy = 0;
+ ARROW_UNUSED(dummy);
+ std::mt19937_64 seed_gen(reinterpret_cast<uintptr_t>(&dummy) ^
+ static_cast<uintptr_t>(GetPid()));
+#else
+ std::random_device true_random;
+ std::mt19937_64 seed_gen(static_cast<uint64_t>(true_random()) ^
+ (static_cast<uint64_t>(true_random()) << 32) ^
+ static_cast<uint64_t>(GetPid()));
+#endif
+ return seed_gen;
+}
+
+} // namespace
+
+int64_t GetRandomSeed() {
+  // The process-global seed generator aims to avoid calling std::random_device
+ // unless truly necessary (it can block on some systems, see ARROW-10287).
+ static auto seed_gen = GetSeedGenerator();
+ return static_cast<int64_t>(seed_gen());
+}
+
+uint64_t GetThreadId() {
+ uint64_t equiv{0};
+ // std::thread::id is trivially copyable as per C++ spec,
+ // so type punning as a uint64_t should work
+ static_assert(sizeof(std::thread::id) <= sizeof(uint64_t),
+ "std::thread::id can't fit into uint64_t");
+ const auto tid = std::this_thread::get_id();
+ memcpy(&equiv, reinterpret_cast<const void*>(&tid), sizeof(tid));
+ return equiv;
+}
+
+uint64_t GetOptionalThreadId() {
+  auto tid = GetThreadId();
+  // Avoid returning 0: (tid - 1) wraps around to UINT64_MAX
+  return (tid == 0) ? tid - 1 : tid;
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h
new file mode 100644
index 00000000000..4255dd37105
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h
@@ -0,0 +1,349 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifndef _WIN32
+#define ARROW_HAVE_SIGACTION 1
+#endif
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#if ARROW_HAVE_SIGACTION
+#include <signal.h> // Needed for struct sigaction
+#endif
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/windows_fixup.h"
+
+namespace arrow {
+namespace internal {
+
+// NOTE: 8-bit path strings on Windows are encoded using UTF-8.
+// Using MBCS would fail encoding some paths.
+
+#if defined(_WIN32)
+using NativePathString = std::wstring;
+#else
+using NativePathString = std::string;
+#endif
+
+class ARROW_EXPORT PlatformFilename {
+ public:
+ struct Impl;
+
+ ~PlatformFilename();
+ PlatformFilename();
+ PlatformFilename(const PlatformFilename&);
+ PlatformFilename(PlatformFilename&&);
+ PlatformFilename& operator=(const PlatformFilename&);
+ PlatformFilename& operator=(PlatformFilename&&);
+ explicit PlatformFilename(const NativePathString& path);
+ explicit PlatformFilename(const NativePathString::value_type* path);
+
+ const NativePathString& ToNative() const;
+ std::string ToString() const;
+
+ PlatformFilename Parent() const;
+
+ // These functions can fail for character encoding reasons.
+ static Result<PlatformFilename> FromString(const std::string& file_name);
+ Result<PlatformFilename> Join(const std::string& child_name) const;
+
+ PlatformFilename Join(const PlatformFilename& child_name) const;
+
+ bool operator==(const PlatformFilename& other) const;
+ bool operator!=(const PlatformFilename& other) const;
+
+ // Made public to avoid the proliferation of friend declarations.
+ const Impl* impl() const { return impl_.get(); }
+
+ private:
+ std::unique_ptr<Impl> impl_;
+
+ explicit PlatformFilename(Impl impl);
+};
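+
+// A minimal usage sketch (hypothetical paths, error handling via macros):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto dir, PlatformFilename::FromString("/tmp/data"));
+//   ARROW_ASSIGN_OR_RAISE(auto file, dir.Join("part-0.bin"));
+//   // file.ToNative() is suitable for OS calls, file.ToString() for messages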
+
+/// Create a directory if it doesn't exist.
+///
+/// Return whether the directory was created.
+ARROW_EXPORT
+Result<bool> CreateDir(const PlatformFilename& dir_path);
+
+/// Create a directory and its parents if they don't exist.
+///
+/// Return whether the directory was created.
+ARROW_EXPORT
+Result<bool> CreateDirTree(const PlatformFilename& dir_path);
+
+/// Delete a directory's contents (but not the directory itself) if it exists.
+///
+/// Return whether the directory existed.
+ARROW_EXPORT
+Result<bool> DeleteDirContents(const PlatformFilename& dir_path,
+ bool allow_not_found = true);
+
+/// Delete a directory tree if it exists.
+///
+/// Return whether the directory existed.
+ARROW_EXPORT
+Result<bool> DeleteDirTree(const PlatformFilename& dir_path, bool allow_not_found = true);
+
+// Non-recursively list the contents of the given directory.
+// The returned names are the children's base names, not including dir_path.
+ARROW_EXPORT
+Result<std::vector<PlatformFilename>> ListDir(const PlatformFilename& dir_path);
+
+/// Delete a file if it exists.
+///
+/// Return whether the file existed.
+ARROW_EXPORT
+Result<bool> DeleteFile(const PlatformFilename& file_path, bool allow_not_found = true);
+
+/// Return whether a file exists.
+ARROW_EXPORT
+Result<bool> FileExists(const PlatformFilename& path);
+
+/// Open a file for reading and return a file descriptor.
+ARROW_EXPORT
+Result<int> FileOpenReadable(const PlatformFilename& file_name);
+
+/// Open a file for writing and return a file descriptor.
+ARROW_EXPORT
+Result<int> FileOpenWritable(const PlatformFilename& file_name, bool write_only = true,
+ bool truncate = true, bool append = false);
+
+/// Read from current file position. Return number of bytes read.
+ARROW_EXPORT
+Result<int64_t> FileRead(int fd, uint8_t* buffer, int64_t nbytes);
+/// Read from given file position. Return number of bytes read.
+ARROW_EXPORT
+Result<int64_t> FileReadAt(int fd, uint8_t* buffer, int64_t position, int64_t nbytes);
+
+ARROW_EXPORT
+Status FileWrite(int fd, const uint8_t* buffer, const int64_t nbytes);
+ARROW_EXPORT
+Status FileTruncate(int fd, const int64_t size);
+
+ARROW_EXPORT
+Status FileSeek(int fd, int64_t pos);
+ARROW_EXPORT
+Status FileSeek(int fd, int64_t pos, int whence);
+ARROW_EXPORT
+Result<int64_t> FileTell(int fd);
+ARROW_EXPORT
+Result<int64_t> FileGetSize(int fd);
+
+ARROW_EXPORT
+Status FileClose(int fd);
+
+struct Pipe {
+ int rfd;
+ int wfd;
+};
+
+ARROW_EXPORT
+Result<Pipe> CreatePipe();
+
+ARROW_EXPORT
+int64_t GetPageSize();
+
+struct MemoryRegion {
+ void* addr;
+ size_t size;
+};
+
+ARROW_EXPORT
+Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes,
+ void** new_addr);
+ARROW_EXPORT
+Status MemoryAdviseWillNeed(const std::vector<MemoryRegion>& regions);
+
+ARROW_EXPORT
+Result<std::string> GetEnvVar(const char* name);
+ARROW_EXPORT
+Result<std::string> GetEnvVar(const std::string& name);
+ARROW_EXPORT
+Result<NativePathString> GetEnvVarNative(const char* name);
+ARROW_EXPORT
+Result<NativePathString> GetEnvVarNative(const std::string& name);
+
+ARROW_EXPORT
+Status SetEnvVar(const char* name, const char* value);
+ARROW_EXPORT
+Status SetEnvVar(const std::string& name, const std::string& value);
+ARROW_EXPORT
+Status DelEnvVar(const char* name);
+ARROW_EXPORT
+Status DelEnvVar(const std::string& name);
+
+ARROW_EXPORT
+std::string ErrnoMessage(int errnum);
+#if _WIN32
+ARROW_EXPORT
+std::string WinErrorMessage(int errnum);
+#endif
+
+ARROW_EXPORT
+std::shared_ptr<StatusDetail> StatusDetailFromErrno(int errnum);
+#if _WIN32
+ARROW_EXPORT
+std::shared_ptr<StatusDetail> StatusDetailFromWinError(int errnum);
+#endif
+ARROW_EXPORT
+std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum);
+
+template <typename... Args>
+Status StatusFromErrno(int errnum, StatusCode code, Args&&... args) {
+ return Status::FromDetailAndArgs(code, StatusDetailFromErrno(errnum),
+ std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+Status IOErrorFromErrno(int errnum, Args&&... args) {
+ return StatusFromErrno(errnum, StatusCode::IOError, std::forward<Args>(args)...);
+}
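+
+// Usage sketch (illustrative, POSIX-only; OpenOrError is a hypothetical
+// helper, not part of this header):
+//
+//   Result<int> OpenOrError(const PlatformFilename& path) {
+//     int fd = open(path.ToNative().c_str(), O_RDONLY);  // needs <fcntl.h>
+//     if (fd == -1) {
+//       return IOErrorFromErrno(errno, "Failed to open ", path.ToString());
+//     }
+//     return fd;
+//   }
+//
+// The original errno can later be recovered with ErrnoFromStatus().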
+
+#if _WIN32
+template <typename... Args>
+Status StatusFromWinError(int errnum, StatusCode code, Args&&... args) {
+ return Status::FromDetailAndArgs(code, StatusDetailFromWinError(errnum),
+ std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+Status IOErrorFromWinError(int errnum, Args&&... args) {
+ return StatusFromWinError(errnum, StatusCode::IOError, std::forward<Args>(args)...);
+}
+#endif
+
+template <typename... Args>
+Status StatusFromSignal(int signum, StatusCode code, Args&&... args) {
+ return Status::FromDetailAndArgs(code, StatusDetailFromSignal(signum),
+ std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+Status CancelledFromSignal(int signum, Args&&... args) {
+ return StatusFromSignal(signum, StatusCode::Cancelled, std::forward<Args>(args)...);
+}
+
+ARROW_EXPORT
+int ErrnoFromStatus(const Status&);
+
+// Always returns 0 on non-Windows platforms (for Python).
+ARROW_EXPORT
+int WinErrorFromStatus(const Status&);
+
+ARROW_EXPORT
+int SignalFromStatus(const Status&);
+
+class ARROW_EXPORT TemporaryDir {
+ public:
+ ~TemporaryDir();
+
+ /// '/'-terminated path to the temporary dir
+ const PlatformFilename& path() { return path_; }
+
+ /// Create a temporary subdirectory in the system temporary dir,
+ /// named starting with `prefix`.
+ static Result<std::unique_ptr<TemporaryDir>> Make(const std::string& prefix);
+
+ private:
+ PlatformFilename path_;
+
+ explicit TemporaryDir(PlatformFilename&&);
+};
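+
+// Usage sketch (illustrative only; assumes ARROW_ASSIGN_OR_RAISE from
+// arrow/result.h and a calling function that returns Status or Result):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto tmp, TemporaryDir::Make("arrow-scratch-"));
+//   ARROW_ASSIGN_OR_RAISE(auto subdir, tmp->path().Join("data"));
+//   ARROW_ASSIGN_OR_RAISE(bool created, CreateDir(subdir));
+//   // `created` is false if the directory already existed.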
+
+class ARROW_EXPORT SignalHandler {
+ public:
+ typedef void (*Callback)(int);
+
+ SignalHandler();
+ explicit SignalHandler(Callback cb);
+#if ARROW_HAVE_SIGACTION
+ explicit SignalHandler(const struct sigaction& sa);
+#endif
+
+ Callback callback() const;
+#if ARROW_HAVE_SIGACTION
+ const struct sigaction& action() const;
+#endif
+
+ protected:
+#if ARROW_HAVE_SIGACTION
+  // Storing the full sigaction allows restoring the entire signal handling
+  // configuration.
+ struct sigaction sa_;
+#else
+ Callback cb_;
+#endif
+};
+
+/// \brief Return the current handler for the given signal number.
+ARROW_EXPORT
+Result<SignalHandler> GetSignalHandler(int signum);
+
+/// \brief Set a new handler for the given signal number.
+///
+/// The old signal handler is returned.
+ARROW_EXPORT
+Result<SignalHandler> SetSignalHandler(int signum, const SignalHandler& handler);
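+
+// Usage sketch (illustrative only; MyCallback is a hypothetical function with
+// the SignalHandler::Callback signature):
+//
+//   ARROW_ASSIGN_OR_RAISE(SignalHandler old_handler,
+//                         SetSignalHandler(SIGINT, SignalHandler(&MyCallback)));
+//   // ... section during which MyCallback handles SIGINT ...
+//   ARROW_RETURN_NOT_OK(SetSignalHandler(SIGINT, old_handler).status());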
+
+/// \brief Reinstate the signal handler
+///
+/// For use in signal handlers. This is needed on platforms without sigaction()
+/// such as Windows, as the default signal handler is restored there as
+/// soon as a signal is raised.
+ARROW_EXPORT
+void ReinstateSignalHandler(int signum, SignalHandler::Callback handler);
+
+/// \brief Send a signal to the current process
+///
+/// The thread which will receive the signal is unspecified.
+ARROW_EXPORT
+Status SendSignal(int signum);
+
+/// \brief Send a signal to the given thread
+///
+/// This function isn't supported on Windows.
+ARROW_EXPORT
+Status SendSignalToThread(int signum, uint64_t thread_id);
+
+/// \brief Get an unpredictable random seed
+///
+/// This function may be slightly costly, so it should only be used to initialize
+/// a PRNG, not to generate a large number of random values.
+/// It is better to use this function than std::random_device, unless
+/// absolutely necessary (e.g. to generate a cryptographic secret).
+ARROW_EXPORT
+int64_t GetRandomSeed();
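+
+// Usage sketch (illustrative only): seed a standard PRNG once, then draw from
+// the PRNG rather than calling this repeatedly.
+//
+//   std::mt19937_64 rng(static_cast<uint64_t>(GetRandomSeed()));  // <random>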
+
+/// \brief Get the current thread id
+///
+/// In addition to having the same properties as std::thread::id, the returned value
+/// is a regular integer value, which is more convenient than an opaque type.
+ARROW_EXPORT
+uint64_t GetThreadId();
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h
new file mode 100644
index 00000000000..2f42803d26f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h
@@ -0,0 +1,568 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <functional>
+#include <memory>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+template <typename T>
+class Iterator;
+
+template <typename T>
+struct IterationTraits {
+ /// \brief a reserved value which indicates the end of iteration. By
+ /// default this is NULLPTR since most iterators yield pointer types.
+ /// Specialize IterationTraits if different end semantics are required.
+ ///
+ /// Note: This should not be used to determine if a given value is a
+ /// terminal value. Use IsIterationEnd (which uses IsEnd) instead. This
+ /// is only for returning terminal values.
+ static T End() { return T(NULLPTR); }
+
+  /// \brief Checks to see if the value is a terminal value.
+  /// A method is used here since T is not necessarily comparable in many
+  /// cases even though it has a distinct final value.
+ static bool IsEnd(const T& val) { return val == End(); }
+};
+
+template <typename T>
+T IterationEnd() {
+ return IterationTraits<T>::End();
+}
+
+template <typename T>
+bool IsIterationEnd(const T& val) {
+ return IterationTraits<T>::IsEnd(val);
+}
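+
+// Sketch of a user-provided specialization (illustrative only): a sequence of
+// non-negative ints can reserve -1 as its terminal value, since the default
+// End() of T(NULLPTR) does not apply to int.
+//
+//   template <>
+//   struct arrow::IterationTraits<int> {
+//     static int End() { return -1; }
+//     static bool IsEnd(const int& val) { return val == -1; }
+//   };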
+
+template <typename T>
+struct IterationTraits<util::optional<T>> {
+ /// \brief by default when iterating through a sequence of optional,
+ /// nullopt indicates the end of iteration.
+ /// Specialize IterationTraits if different end semantics are required.
+ static util::optional<T> End() { return util::nullopt; }
+
+ /// \brief by default when iterating through a sequence of optional,
+ /// nullopt (!has_value()) indicates the end of iteration.
+ /// Specialize IterationTraits if different end semantics are required.
+ static bool IsEnd(const util::optional<T>& val) { return !val.has_value(); }
+
+  // TODO(bkietz) The range-for loop over Iterator<optional<T>> yields
+  // Result<optional<T>>, which is unnecessary (since only the unyielded end
+  // optional is nullopt). Add IterationTraits::GetRangeElement() to handle this case.
+};
+
+/// \brief A generic Iterator that can return errors
+template <typename T>
+class Iterator : public util::EqualityComparable<Iterator<T>> {
+ public:
+ /// \brief Iterator may be constructed from any type which has a member function
+ /// with signature Result<T> Next();
+  /// End of iterator is signalled by returning IterationTraits<T>::End();
+ ///
+ /// The argument is moved or copied to the heap and kept in a unique_ptr<void>. Only
+ /// its destructor and its Next method (which are stored in function pointers) are
+ /// referenced after construction.
+ ///
+ /// This approach is used to dodge MSVC linkage hell (ARROW-6244, ARROW-6558) when using
+ /// an abstract template base class: instead of being inlined as usual for a template
+ /// function the base's virtual destructor will be exported, leading to multiple
+ /// definition errors when linking to any other TU where the base is instantiated.
+ template <typename Wrapped>
+ explicit Iterator(Wrapped has_next)
+ : ptr_(new Wrapped(std::move(has_next)), Delete<Wrapped>), next_(Next<Wrapped>) {}
+
+ Iterator() : ptr_(NULLPTR, [](void*) {}) {}
+
+ /// \brief Return the next element of the sequence, IterationTraits<T>::End() when the
+ /// iteration is completed. Calling this on a default constructed Iterator
+ /// will result in undefined behavior.
+ Result<T> Next() { return next_(ptr_.get()); }
+
+ /// Pass each element of the sequence to a visitor. Will return any error status
+ /// returned by the visitor, terminating iteration.
+ template <typename Visitor>
+ Status Visit(Visitor&& visitor) {
+ for (;;) {
+ ARROW_ASSIGN_OR_RAISE(auto value, Next());
+
+ if (IsIterationEnd(value)) break;
+
+ ARROW_RETURN_NOT_OK(visitor(std::move(value)));
+ }
+
+ return Status::OK();
+ }
+
+ /// Iterators will only compare equal if they are both null.
+ /// Equality comparability is required to make an Iterator of Iterators
+ /// (to check for the end condition).
+ bool Equals(const Iterator& other) const { return ptr_ == other.ptr_; }
+
+ explicit operator bool() const { return ptr_ != NULLPTR; }
+
+ class RangeIterator {
+ public:
+ RangeIterator() : value_(IterationTraits<T>::End()) {}
+
+ explicit RangeIterator(Iterator i)
+ : value_(IterationTraits<T>::End()),
+ iterator_(std::make_shared<Iterator>(std::move(i))) {
+ Next();
+ }
+
+ bool operator!=(const RangeIterator& other) const { return value_ != other.value_; }
+
+ RangeIterator& operator++() {
+ Next();
+ return *this;
+ }
+
+ Result<T> operator*() {
+ ARROW_RETURN_NOT_OK(value_.status());
+
+ auto value = std::move(value_);
+ value_ = IterationTraits<T>::End();
+ return value;
+ }
+
+ private:
+ void Next() {
+ if (!value_.ok()) {
+ value_ = IterationTraits<T>::End();
+ return;
+ }
+ value_ = iterator_->Next();
+ }
+
+ Result<T> value_;
+ std::shared_ptr<Iterator> iterator_;
+ };
+
+ RangeIterator begin() { return RangeIterator(std::move(*this)); }
+
+ RangeIterator end() { return RangeIterator(); }
+
+ /// \brief Move every element of this iterator into a vector.
+ Result<std::vector<T>> ToVector() {
+ std::vector<T> out;
+ for (auto maybe_element : *this) {
+ ARROW_ASSIGN_OR_RAISE(auto element, maybe_element);
+ out.push_back(std::move(element));
+ }
+ // ARROW-8193: On gcc-4.8 without the explicit move it tries to use the
+ // copy constructor, which may be deleted on the elements of type T
+ return std::move(out);
+ }
+
+ private:
+ /// Implementation of deleter for ptr_: Casts from void* to the wrapped type and
+ /// deletes that.
+ template <typename HasNext>
+ static void Delete(void* ptr) {
+ delete static_cast<HasNext*>(ptr);
+ }
+
+ /// Implementation of Next: Casts from void* to the wrapped type and invokes that
+ /// type's Next member function.
+ template <typename HasNext>
+ static Result<T> Next(void* ptr) {
+ return static_cast<HasNext*>(ptr)->Next();
+ }
+
+ /// ptr_ is a unique_ptr to void with a custom deleter: a function pointer which first
+ /// casts from void* to a pointer to the wrapped type then deletes that.
+ std::unique_ptr<void, void (*)(void*)> ptr_;
+
+ /// next_ is a function pointer which first casts from void* to a pointer to the wrapped
+ /// type then invokes its Next member function.
+ Result<T> (*next_)(void*) = NULLPTR;
+};
+
+template <typename T>
+struct TransformFlow {
+ using YieldValueType = T;
+
+ TransformFlow(YieldValueType value, bool ready_for_next)
+ : finished_(false),
+ ready_for_next_(ready_for_next),
+ yield_value_(std::move(value)) {}
+ TransformFlow(bool finished, bool ready_for_next)
+ : finished_(finished), ready_for_next_(ready_for_next), yield_value_() {}
+
+ bool HasValue() const { return yield_value_.has_value(); }
+ bool Finished() const { return finished_; }
+ bool ReadyForNext() const { return ready_for_next_; }
+ T Value() const { return *yield_value_; }
+
+ bool finished_ = false;
+ bool ready_for_next_ = false;
+ util::optional<YieldValueType> yield_value_;
+};
+
+struct TransformFinish {
+ template <typename T>
+ operator TransformFlow<T>() && { // NOLINT explicit
+ return TransformFlow<T>(true, true);
+ }
+};
+
+struct TransformSkip {
+ template <typename T>
+ operator TransformFlow<T>() && { // NOLINT explicit
+ return TransformFlow<T>(false, true);
+ }
+};
+
+template <typename T>
+TransformFlow<T> TransformYield(T value = {}, bool ready_for_next = true) {
+ return TransformFlow<T>(std::move(value), ready_for_next);
+}
+
+template <typename T, typename V>
+using Transformer = std::function<Result<TransformFlow<V>>(T)>;
+
+template <typename T, typename V>
+class TransformIterator {
+ public:
+ explicit TransformIterator(Iterator<T> it, Transformer<T, V> transformer)
+ : it_(std::move(it)),
+ transformer_(std::move(transformer)),
+ last_value_(),
+ finished_() {}
+
+ Result<V> Next() {
+ while (!finished_) {
+ ARROW_ASSIGN_OR_RAISE(util::optional<V> next, Pump());
+ if (next.has_value()) {
+ return std::move(*next);
+ }
+ ARROW_ASSIGN_OR_RAISE(last_value_, it_.Next());
+ }
+ return IterationTraits<V>::End();
+ }
+
+ private:
+ // Calls the transform function on the current value. Can return in several ways
+ // * If the next value is requested (e.g. skip) it will return an empty optional
+ // * If an invalid status is encountered that will be returned
+ // * If finished it will return IterationTraits<V>::End()
+ // * If a value is returned by the transformer that will be returned
+ Result<util::optional<V>> Pump() {
+ if (!finished_ && last_value_.has_value()) {
+ auto next_res = transformer_(*last_value_);
+ if (!next_res.ok()) {
+ finished_ = true;
+ return next_res.status();
+ }
+ auto next = *next_res;
+ if (next.ReadyForNext()) {
+ if (IsIterationEnd(*last_value_)) {
+ finished_ = true;
+ }
+ last_value_.reset();
+ }
+ if (next.Finished()) {
+ finished_ = true;
+ }
+ if (next.HasValue()) {
+ return next.Value();
+ }
+ }
+ if (finished_) {
+ return IterationTraits<V>::End();
+ }
+ return util::nullopt;
+ }
+
+ Iterator<T> it_;
+ Transformer<T, V> transformer_;
+ util::optional<T> last_value_;
+ bool finished_ = false;
+};
+
+/// \brief Transforms an iterator according to a transformer, returning a new Iterator.
+///
+/// The transformer will be called on each element of the source iterator and for each
+/// call it can yield a value, skip, or finish the iteration. When yielding a value the
+/// transformer can choose to consume the source item (the default, ready_for_next = true)
+/// or to keep it and it will be called again on the same value.
+///
+/// This is essentially a more generic form of the map operation that can return 0, 1, or
+/// many values for each of the source items.
+///
+/// The transformer will be exposed to the end of the source sequence
+/// (IterationTraits::End) in case it needs to emit some final item(s).
+///
+/// Any invalid status returned by the transformer will be returned immediately.
+template <typename T, typename V>
+Iterator<V> MakeTransformedIterator(Iterator<T> it, Transformer<T, V> op) {
+ return Iterator<V>(TransformIterator<T, V>(std::move(it), std::move(op)));
+}
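+
+// Usage sketch (illustrative only; `source` is an existing Iterator<int> and
+// an IterationTraits<int> specialization like the one sketched above is
+// assumed): a fused filter + map that drops odd values and doubles even ones.
+//
+//   Transformer<int, int> even_doubler =
+//       [](int x) -> Result<TransformFlow<int>> {
+//     if (IsIterationEnd(x)) return TransformFinish();
+//     if (x % 2 != 0) return TransformSkip();
+//     return TransformYield(x * 2);
+//   };
+//   Iterator<int> doubled =
+//       MakeTransformedIterator(std::move(source), even_doubler);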
+
+template <typename T>
+struct IterationTraits<Iterator<T>> {
+ // The end condition for an Iterator of Iterators is a default constructed (null)
+ // Iterator.
+ static Iterator<T> End() { return Iterator<T>(); }
+ static bool IsEnd(const Iterator<T>& val) { return !val; }
+};
+
+template <typename Fn, typename T>
+class FunctionIterator {
+ public:
+ explicit FunctionIterator(Fn fn) : fn_(std::move(fn)) {}
+
+ Result<T> Next() { return fn_(); }
+
+ private:
+ Fn fn_;
+};
+
+/// \brief Construct an Iterator which invokes a callable on Next()
+template <typename Fn,
+ typename Ret = typename internal::call_traits::return_type<Fn>::ValueType>
+Iterator<Ret> MakeFunctionIterator(Fn fn) {
+ return Iterator<Ret>(FunctionIterator<Fn, Ret>(std::move(fn)));
+}
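+
+// Usage sketch (illustrative only): an iterator over 0..9 backed by a mutable
+// lambda; nullopt is the terminal value per IterationTraits<util::optional<T>>.
+//
+//   int i = 0;
+//   auto it = MakeFunctionIterator(
+//       [i]() mutable -> Result<util::optional<int>> {
+//         if (i < 10) return util::optional<int>(i++);
+//         return util::optional<int>();  // end of iteration
+//       });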
+
+template <typename T>
+Iterator<T> MakeEmptyIterator() {
+ return MakeFunctionIterator([]() -> Result<T> { return IterationTraits<T>::End(); });
+}
+
+template <typename T>
+Iterator<T> MakeErrorIterator(Status s) {
+ return MakeFunctionIterator([s]() -> Result<T> {
+ ARROW_RETURN_NOT_OK(s);
+ return IterationTraits<T>::End();
+ });
+}
+
+/// \brief Simple iterator which yields the elements of a std::vector
+template <typename T>
+class VectorIterator {
+ public:
+ explicit VectorIterator(std::vector<T> v) : elements_(std::move(v)) {}
+
+ Result<T> Next() {
+ if (i_ == elements_.size()) {
+ return IterationTraits<T>::End();
+ }
+ return std::move(elements_[i_++]);
+ }
+
+ private:
+ std::vector<T> elements_;
+ size_t i_ = 0;
+};
+
+template <typename T>
+Iterator<T> MakeVectorIterator(std::vector<T> v) {
+ return Iterator<T>(VectorIterator<T>(std::move(v)));
+}
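+
+// Usage sketch (illustrative only): iterate a vector of shared_ptr, for which
+// the default terminal value of NULLPTR applies (so a null element would end
+// the iteration early).
+//
+//   std::vector<std::shared_ptr<int>> v = {std::make_shared<int>(1),
+//                                          std::make_shared<int>(2)};
+//   for (auto maybe_elem : MakeVectorIterator(std::move(v))) {
+//     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<int> elem, maybe_elem);
+//     // ... use *elem ...
+//   }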
+
+/// \brief Simple iterator which yields *pointers* to the elements of a std::vector<T>.
+/// This is provided to support T where IterationTraits<T>::End is not specialized
+template <typename T>
+class VectorPointingIterator {
+ public:
+ explicit VectorPointingIterator(std::vector<T> v) : elements_(std::move(v)) {}
+
+ Result<T*> Next() {
+ if (i_ == elements_.size()) {
+ return NULLPTR;
+ }
+ return &elements_[i_++];
+ }
+
+ private:
+ std::vector<T> elements_;
+ size_t i_ = 0;
+};
+
+template <typename T>
+Iterator<T*> MakeVectorPointingIterator(std::vector<T> v) {
+ return Iterator<T*>(VectorPointingIterator<T>(std::move(v)));
+}
+
+/// \brief MapIterator takes ownership of an iterator and a function to apply
+/// on every element. The mapped function is not allowed to fail.
+template <typename Fn, typename I, typename O>
+class MapIterator {
+ public:
+ explicit MapIterator(Fn map, Iterator<I> it)
+ : map_(std::move(map)), it_(std::move(it)) {}
+
+ Result<O> Next() {
+ ARROW_ASSIGN_OR_RAISE(I i, it_.Next());
+
+ if (IsIterationEnd(i)) {
+ return IterationTraits<O>::End();
+ }
+
+ return map_(std::move(i));
+ }
+
+ private:
+ Fn map_;
+ Iterator<I> it_;
+};
+
+/// \brief MapIterator takes ownership of an iterator and a function to apply
+/// on every element. The mapped function is not allowed to fail.
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+ typename To = internal::call_traits::return_type<Fn>>
+Iterator<To> MakeMapIterator(Fn map, Iterator<From> it) {
+ return Iterator<To>(MapIterator<Fn, From, To>(std::move(map), std::move(it)));
+}
+
+/// \brief Like MapIterator, but where the function can fail.
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+ typename To = typename internal::call_traits::return_type<Fn>::ValueType>
+Iterator<To> MakeMaybeMapIterator(Fn map, Iterator<From> it) {
+ return Iterator<To>(MapIterator<Fn, From, To>(std::move(map), std::move(it)));
+}
+
+struct FilterIterator {
+ enum Action { ACCEPT, REJECT };
+
+ template <typename To>
+ static Result<std::pair<To, Action>> Reject() {
+ return std::make_pair(IterationTraits<To>::End(), REJECT);
+ }
+
+ template <typename To>
+ static Result<std::pair<To, Action>> Accept(To out) {
+ return std::make_pair(std::move(out), ACCEPT);
+ }
+
+ template <typename To>
+ static Result<std::pair<To, Action>> MaybeAccept(Result<To> maybe_out) {
+ return std::move(maybe_out).Map(Accept<To>);
+ }
+
+ template <typename To>
+ static Result<std::pair<To, Action>> Error(Status s) {
+ return s;
+ }
+
+ template <typename Fn, typename From, typename To>
+ class Impl {
+ public:
+ explicit Impl(Fn filter, Iterator<From> it) : filter_(filter), it_(std::move(it)) {}
+
+ Result<To> Next() {
+ To out = IterationTraits<To>::End();
+ Action action;
+
+ for (;;) {
+ ARROW_ASSIGN_OR_RAISE(From i, it_.Next());
+
+ if (IsIterationEnd(i)) {
+ return IterationTraits<To>::End();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(std::tie(out, action), filter_(std::move(i)));
+
+ if (action == ACCEPT) return out;
+ }
+ }
+
+ private:
+ Fn filter_;
+ Iterator<From> it_;
+ };
+};
+
+/// \brief Like MapIterator, but where the function can fail or reject elements.
+template <
+ typename Fn, typename From = typename internal::call_traits::argument_type<0, Fn>,
+ typename Ret = typename internal::call_traits::return_type<Fn>::ValueType,
+ typename To = typename std::tuple_element<0, Ret>::type,
+ typename Enable = typename std::enable_if<std::is_same<
+ typename std::tuple_element<1, Ret>::type, FilterIterator::Action>::value>::type>
+Iterator<To> MakeFilterIterator(Fn filter, Iterator<From> it) {
+ return Iterator<To>(
+ FilterIterator::Impl<Fn, From, To>(std::move(filter), std::move(it)));
+}
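+
+// Usage sketch (illustrative only; `source` is an existing Iterator<int> and
+// an IterationTraits<int> specialization is assumed, as above): keep only the
+// strictly positive values.
+//
+//   auto positive = MakeFilterIterator(
+//       [](int x) -> Result<std::pair<int, FilterIterator::Action>> {
+//         if (x > 0) return FilterIterator::Accept(x);
+//         return FilterIterator::Reject<int>();
+//       },
+//       std::move(source));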
+
+/// \brief FlattenIterator takes an iterator generating iterators and yields a
+/// unified iterator that flattens/concatenates in a single stream.
+template <typename T>
+class FlattenIterator {
+ public:
+ explicit FlattenIterator(Iterator<Iterator<T>> it) : parent_(std::move(it)) {}
+
+ Result<T> Next() {
+ if (IsIterationEnd(child_)) {
+ // Pop from parent's iterator.
+ ARROW_ASSIGN_OR_RAISE(child_, parent_.Next());
+
+ // Check if final iteration reached.
+ if (IsIterationEnd(child_)) {
+ return IterationTraits<T>::End();
+ }
+
+ return Next();
+ }
+
+ // Pop from child_ and check for depletion.
+ ARROW_ASSIGN_OR_RAISE(T out, child_.Next());
+ if (IsIterationEnd(out)) {
+ // Reset state such that we pop from parent on the recursive call
+ child_ = IterationTraits<Iterator<T>>::End();
+
+ return Next();
+ }
+
+ return out;
+ }
+
+ private:
+ Iterator<Iterator<T>> parent_;
+ Iterator<T> child_ = IterationTraits<Iterator<T>>::End();
+};
+
+template <typename T>
+Iterator<T> MakeFlattenIterator(Iterator<Iterator<T>> it) {
+ return Iterator<T>(FlattenIterator<T>(std::move(it)));
+}
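+
+// Usage sketch (illustrative only; `first` and `second` are existing
+// Iterator<util::optional<int>> values): concatenate them by flattening.
+//
+//   std::vector<Iterator<util::optional<int>>> parts;
+//   parts.push_back(std::move(first));
+//   parts.push_back(std::move(second));
+//   auto all = MakeFlattenIterator(MakeVectorIterator(std::move(parts)));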
+
+template <typename Reader>
+Iterator<typename Reader::ValueType> MakeIteratorFromReader(
+ const std::shared_ptr<Reader>& reader) {
+ return MakeFunctionIterator([reader] { return reader->Next(); });
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc
new file mode 100644
index 00000000000..ad3b686a9bd
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc
@@ -0,0 +1,274 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/sort.h"
+
+using std::size_t;
+
+namespace arrow {
+
+static std::vector<std::string> UnorderedMapKeys(
+ const std::unordered_map<std::string, std::string>& map) {
+ std::vector<std::string> keys;
+ keys.reserve(map.size());
+ for (const auto& pair : map) {
+ keys.push_back(pair.first);
+ }
+ return keys;
+}
+
+static std::vector<std::string> UnorderedMapValues(
+ const std::unordered_map<std::string, std::string>& map) {
+ std::vector<std::string> values;
+ values.reserve(map.size());
+ for (const auto& pair : map) {
+ values.push_back(pair.second);
+ }
+ return values;
+}
+
+KeyValueMetadata::KeyValueMetadata() : keys_(), values_() {}
+
+KeyValueMetadata::KeyValueMetadata(
+ const std::unordered_map<std::string, std::string>& map)
+ : keys_(UnorderedMapKeys(map)), values_(UnorderedMapValues(map)) {
+ ARROW_CHECK_EQ(keys_.size(), values_.size());
+}
+
+KeyValueMetadata::KeyValueMetadata(std::vector<std::string> keys,
+ std::vector<std::string> values)
+ : keys_(std::move(keys)), values_(std::move(values)) {
+  ARROW_CHECK_EQ(keys_.size(), values_.size());
+}
+
+std::shared_ptr<KeyValueMetadata> KeyValueMetadata::Make(
+ std::vector<std::string> keys, std::vector<std::string> values) {
+ return std::make_shared<KeyValueMetadata>(std::move(keys), std::move(values));
+}
+
+void KeyValueMetadata::ToUnorderedMap(
+ std::unordered_map<std::string, std::string>* out) const {
+ DCHECK_NE(out, nullptr);
+ const int64_t n = size();
+ out->reserve(n);
+ for (int64_t i = 0; i < n; ++i) {
+ out->insert(std::make_pair(key(i), value(i)));
+ }
+}
+
+void KeyValueMetadata::Append(const std::string& key, const std::string& value) {
+ keys_.push_back(key);
+ values_.push_back(value);
+}
+
+Result<std::string> KeyValueMetadata::Get(const std::string& key) const {
+ auto index = FindKey(key);
+ if (index < 0) {
+ return Status::KeyError(key);
+ } else {
+ return value(index);
+ }
+}
+
+Status KeyValueMetadata::Delete(int64_t index) {
+ keys_.erase(keys_.begin() + index);
+ values_.erase(values_.begin() + index);
+ return Status::OK();
+}
+
+Status KeyValueMetadata::DeleteMany(std::vector<int64_t> indices) {
+ std::sort(indices.begin(), indices.end());
+ const int64_t size = static_cast<int64_t>(keys_.size());
+ indices.push_back(size);
+
+  // Compact in a single pass: after the i-th deleted index, each kept entry in
+  // the following run is shifted left by (i + 1) slots.
+  int64_t shift = 0;
+ for (int64_t i = 0; i < static_cast<int64_t>(indices.size() - 1); ++i) {
+ ++shift;
+ const auto start = indices[i] + 1;
+ const auto stop = indices[i + 1];
+ DCHECK_GE(start, 0);
+ DCHECK_LE(start, size);
+ DCHECK_GE(stop, 0);
+ DCHECK_LE(stop, size);
+ for (int64_t index = start; index < stop; ++index) {
+ keys_[index - shift] = std::move(keys_[index]);
+ values_[index - shift] = std::move(values_[index]);
+ }
+ }
+ keys_.resize(size - shift);
+ values_.resize(size - shift);
+ return Status::OK();
+}
+
+Status KeyValueMetadata::Delete(const std::string& key) {
+ auto index = FindKey(key);
+ if (index < 0) {
+ return Status::KeyError(key);
+ } else {
+ return Delete(index);
+ }
+}
+
+Status KeyValueMetadata::Set(const std::string& key, const std::string& value) {
+ auto index = FindKey(key);
+ if (index < 0) {
+ Append(key, value);
+ } else {
+ keys_[index] = key;
+ values_[index] = value;
+ }
+ return Status::OK();
+}
+
+bool KeyValueMetadata::Contains(const std::string& key) const {
+ return FindKey(key) >= 0;
+}
+
+void KeyValueMetadata::reserve(int64_t n) {
+ DCHECK_GE(n, 0);
+ const auto m = static_cast<size_t>(n);
+ keys_.reserve(m);
+ values_.reserve(m);
+}
+
+int64_t KeyValueMetadata::size() const {
+ DCHECK_EQ(keys_.size(), values_.size());
+ return static_cast<int64_t>(keys_.size());
+}
+
+const std::string& KeyValueMetadata::key(int64_t i) const {
+ DCHECK_GE(i, 0);
+ DCHECK_LT(static_cast<size_t>(i), keys_.size());
+ return keys_[i];
+}
+
+const std::string& KeyValueMetadata::value(int64_t i) const {
+ DCHECK_GE(i, 0);
+ DCHECK_LT(static_cast<size_t>(i), values_.size());
+ return values_[i];
+}
+
+std::vector<std::pair<std::string, std::string>> KeyValueMetadata::sorted_pairs() const {
+ std::vector<std::pair<std::string, std::string>> pairs;
+ pairs.reserve(size());
+
+ auto indices = internal::ArgSort(keys_);
+ for (const auto i : indices) {
+ pairs.emplace_back(keys_[i], values_[i]);
+ }
+ return pairs;
+}
+
+int KeyValueMetadata::FindKey(const std::string& key) const {
+ for (size_t i = 0; i < keys_.size(); ++i) {
+ if (keys_[i] == key) {
+ return static_cast<int>(i);
+ }
+ }
+ return -1;
+}
+
+std::shared_ptr<KeyValueMetadata> KeyValueMetadata::Copy() const {
+ return std::make_shared<KeyValueMetadata>(keys_, values_);
+}
+
+std::shared_ptr<KeyValueMetadata> KeyValueMetadata::Merge(
+ const KeyValueMetadata& other) const {
+ std::unordered_set<std::string> observed_keys;
+ std::vector<std::string> result_keys;
+ std::vector<std::string> result_values;
+
+ result_keys.reserve(keys_.size());
+ result_values.reserve(keys_.size());
+
+ for (int64_t i = 0; i < other.size(); ++i) {
+ const auto& key = other.key(i);
+ auto it = observed_keys.find(key);
+ if (it == observed_keys.end()) {
+ result_keys.push_back(key);
+ result_values.push_back(other.value(i));
+ observed_keys.insert(key);
+ }
+ }
+ for (size_t i = 0; i < keys_.size(); ++i) {
+ auto it = observed_keys.find(keys_[i]);
+ if (it == observed_keys.end()) {
+ result_keys.push_back(keys_[i]);
+ result_values.push_back(values_[i]);
+ observed_keys.insert(keys_[i]);
+ }
+ }
+
+ return std::make_shared<KeyValueMetadata>(std::move(result_keys),
+ std::move(result_values));
+}
+
+bool KeyValueMetadata::Equals(const KeyValueMetadata& other) const {
+ if (size() != other.size()) {
+ return false;
+ }
+
+ auto indices = internal::ArgSort(keys_);
+ auto other_indices = internal::ArgSort(other.keys_);
+
+ for (int64_t i = 0; i < size(); ++i) {
+ auto j = indices[i];
+ auto k = other_indices[i];
+ if (keys_[j] != other.keys_[k] || values_[j] != other.values_[k]) {
+ return false;
+ }
+ }
+ return true;
+}
+
+std::string KeyValueMetadata::ToString() const {
+ std::stringstream buffer;
+
+ buffer << "\n-- metadata --";
+ for (int64_t i = 0; i < size(); ++i) {
+ buffer << "\n" << keys_[i] << ": " << values_[i];
+ }
+
+ return buffer.str();
+}
+
+std::shared_ptr<KeyValueMetadata> key_value_metadata(
+ const std::unordered_map<std::string, std::string>& pairs) {
+ return std::make_shared<KeyValueMetadata>(pairs);
+}
+
+std::shared_ptr<KeyValueMetadata> key_value_metadata(std::vector<std::string> keys,
+ std::vector<std::string> values) {
+ return std::make_shared<KeyValueMetadata>(std::move(keys), std::move(values));
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h
new file mode 100644
index 00000000000..d42ab78f667
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief A container for key-value pair type metadata. Not thread-safe.
+class ARROW_EXPORT KeyValueMetadata {
+ public:
+ KeyValueMetadata();
+ KeyValueMetadata(std::vector<std::string> keys, std::vector<std::string> values);
+ explicit KeyValueMetadata(const std::unordered_map<std::string, std::string>& map);
+ virtual ~KeyValueMetadata() = default;
+
+ static std::shared_ptr<KeyValueMetadata> Make(std::vector<std::string> keys,
+ std::vector<std::string> values);
+
+ void ToUnorderedMap(std::unordered_map<std::string, std::string>* out) const;
+ void Append(const std::string& key, const std::string& value);
+
+ Result<std::string> Get(const std::string& key) const;
+ bool Contains(const std::string& key) const;
+ // Note that deleting may invalidate known indices
+ Status Delete(const std::string& key);
+ Status Delete(int64_t index);
+ Status DeleteMany(std::vector<int64_t> indices);
+ Status Set(const std::string& key, const std::string& value);
+
+ void reserve(int64_t n);
+
+ int64_t size() const;
+ const std::string& key(int64_t i) const;
+ const std::string& value(int64_t i) const;
+ const std::vector<std::string>& keys() const { return keys_; }
+ const std::vector<std::string>& values() const { return values_; }
+
+ std::vector<std::pair<std::string, std::string>> sorted_pairs() const;
+
+ /// \brief Perform linear search for key, returning -1 if not found
+ int FindKey(const std::string& key) const;
+
+ std::shared_ptr<KeyValueMetadata> Copy() const;
+
+ /// \brief Return a new KeyValueMetadata by combining the passed metadata
+ /// with this KeyValueMetadata. Colliding keys will be overridden by the
+ /// passed metadata. Assumes keys in both containers are unique
+ std::shared_ptr<KeyValueMetadata> Merge(const KeyValueMetadata& other) const;
+
+ bool Equals(const KeyValueMetadata& other) const;
+ std::string ToString() const;
+
+ private:
+ std::vector<std::string> keys_;
+ std::vector<std::string> values_;
+
+ ARROW_DISALLOW_COPY_AND_ASSIGN(KeyValueMetadata);
+};
+
+/// \brief Create a KeyValueMetadata instance
+///
+/// \param pairs key-value mapping
+std::shared_ptr<KeyValueMetadata> ARROW_EXPORT
+key_value_metadata(const std::unordered_map<std::string, std::string>& pairs);
+
+/// \brief Create a KeyValueMetadata instance
+///
+/// \param keys sequence of metadata keys
+/// \param values sequence of corresponding metadata values
+std::shared_ptr<KeyValueMetadata> ARROW_EXPORT
+key_value_metadata(std::vector<std::string> keys, std::vector<std::string> values);
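+
+// Usage sketch (illustrative only):
+//
+//   auto metadata = arrow::key_value_metadata({"origin", "schema_version"},
+//                                             {"s3://bucket/file", "2"});
+//   ARROW_ASSIGN_OR_RAISE(std::string version, metadata->Get("schema_version"));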
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc
new file mode 100644
index 00000000000..65359b44081
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc
@@ -0,0 +1,256 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/logging.h"
+
+#ifdef ARROW_WITH_BACKTRACE
+#include <execinfo.h>
+#endif
+#include <cstdlib>
+#include <iostream>
+
+#ifdef ARROW_USE_GLOG
+
+#include <signal.h>
+#include <vector>
+
+#error #include "glog/logging.h"
+
+// Restore our versions of DCHECK and friends, as GLog defines its own
+#undef DCHECK
+#undef DCHECK_OK
+#undef DCHECK_EQ
+#undef DCHECK_NE
+#undef DCHECK_LE
+#undef DCHECK_LT
+#undef DCHECK_GE
+#undef DCHECK_GT
+
+#define DCHECK ARROW_DCHECK
+#define DCHECK_OK ARROW_DCHECK_OK
+#define DCHECK_EQ ARROW_DCHECK_EQ
+#define DCHECK_NE ARROW_DCHECK_NE
+#define DCHECK_LE ARROW_DCHECK_LE
+#define DCHECK_LT ARROW_DCHECK_LT
+#define DCHECK_GE ARROW_DCHECK_GE
+#define DCHECK_GT ARROW_DCHECK_GT
+
+#endif
+
+namespace arrow {
+namespace util {
+
+// This code is adapted from
+// https://github.com/ray-project/ray/blob/master/src/ray/util/logging.cc.
+
+// This is the default implementation of arrow log,
+// which is independent of any libs.
+class CerrLog {
+ public:
+ explicit CerrLog(ArrowLogLevel severity) : severity_(severity), has_logged_(false) {}
+
+ virtual ~CerrLog() {
+ if (has_logged_) {
+ std::cerr << std::endl;
+ }
+ if (severity_ == ArrowLogLevel::ARROW_FATAL) {
+ PrintBackTrace();
+ std::abort();
+ }
+ }
+
+ std::ostream& Stream() {
+ has_logged_ = true;
+ return std::cerr;
+ }
+
+ template <class T>
+ CerrLog& operator<<(const T& t) {
+ if (severity_ != ArrowLogLevel::ARROW_DEBUG) {
+ has_logged_ = true;
+ std::cerr << t;
+ }
+ return *this;
+ }
+
+ protected:
+ const ArrowLogLevel severity_;
+ bool has_logged_;
+
+ void PrintBackTrace() {
+#ifdef ARROW_WITH_BACKTRACE
+ void* buffer[255];
+ const int calls = backtrace(buffer, static_cast<int>(sizeof(buffer) / sizeof(void*)));
+ backtrace_symbols_fd(buffer, calls, 1);
+#endif
+ }
+};
+
+#ifdef ARROW_USE_GLOG
+typedef google::LogMessage LoggingProvider;
+#else
+typedef CerrLog LoggingProvider;
+#endif
+
+ArrowLogLevel ArrowLog::severity_threshold_ = ArrowLogLevel::ARROW_INFO;
+// Keep the log directory.
+static std::unique_ptr<std::string> log_dir_;
+
+#ifdef ARROW_USE_GLOG
+
+// Glog's severity map.
+static int GetMappedSeverity(ArrowLogLevel severity) {
+ switch (severity) {
+ case ArrowLogLevel::ARROW_DEBUG:
+ return google::GLOG_INFO;
+ case ArrowLogLevel::ARROW_INFO:
+ return google::GLOG_INFO;
+ case ArrowLogLevel::ARROW_WARNING:
+ return google::GLOG_WARNING;
+ case ArrowLogLevel::ARROW_ERROR:
+ return google::GLOG_ERROR;
+ case ArrowLogLevel::ARROW_FATAL:
+ return google::GLOG_FATAL;
+ default:
+ ARROW_LOG(FATAL) << "Unsupported logging level: " << static_cast<int>(severity);
+ // This return won't be hit but compiler needs it.
+ return google::GLOG_FATAL;
+ }
+}
+
+#endif
+
+void ArrowLog::StartArrowLog(const std::string& app_name,
+ ArrowLogLevel severity_threshold,
+ const std::string& log_dir) {
+ severity_threshold_ = severity_threshold;
+  // InitGoogleLogging simply keeps the pointer it is given, so the app name
+  // passed to it must outlive the logging setup.
+  // We should avoid using a static string in a dynamic lib.
+ static std::unique_ptr<std::string> app_name_;
+ app_name_.reset(new std::string(app_name));
+ log_dir_.reset(new std::string(log_dir));
+#ifdef ARROW_USE_GLOG
+ int mapped_severity_threshold = GetMappedSeverity(severity_threshold_);
+ google::SetStderrLogging(mapped_severity_threshold);
+  // Enable the log file if log_dir is not empty.
+ if (!log_dir.empty()) {
+ auto dir_ends_with_slash = log_dir;
+ if (log_dir[log_dir.length() - 1] != '/') {
+ dir_ends_with_slash += "/";
+ }
+ auto app_name_without_path = app_name;
+ if (app_name.empty()) {
+ app_name_without_path = "DefaultApp";
+ } else {
+ // Find the app name without the path.
+ size_t pos = app_name.rfind('/');
+ if (pos != app_name.npos && pos + 1 < app_name.length()) {
+ app_name_without_path = app_name.substr(pos + 1);
+ }
+ }
+    // If InitGoogleLogging is called but SetLogDestination is not, logs are
+    // written to /tmp in addition to stderr. So if log_dir is not provided,
+    // we do not call InitGoogleLogging at all.
+ google::InitGoogleLogging(app_name_->c_str());
+ google::SetLogFilenameExtension(app_name_without_path.c_str());
+ for (int i = static_cast<int>(severity_threshold_);
+ i <= static_cast<int>(ArrowLogLevel::ARROW_FATAL); ++i) {
+ int level = GetMappedSeverity(static_cast<ArrowLogLevel>(i));
+ google::SetLogDestination(level, dir_ends_with_slash.c_str());
+ }
+ }
+#endif
+}
+
+void ArrowLog::UninstallSignalAction() {
+#ifdef ARROW_USE_GLOG
+ ARROW_LOG(DEBUG) << "Uninstall signal handlers.";
+ // This signal list comes from glog's signalhandler.cc.
+ // https://github.com/google/glog/blob/master/src/signalhandler.cc#L58-L70
+ std::vector<int> installed_signals({SIGSEGV, SIGILL, SIGFPE, SIGABRT, SIGTERM});
+#ifdef WIN32
+ for (int signal_num : installed_signals) {
+ ARROW_CHECK(signal(signal_num, SIG_DFL) != SIG_ERR);
+ }
+#else
+ struct sigaction sig_action;
+ memset(&sig_action, 0, sizeof(sig_action));
+ sigemptyset(&sig_action.sa_mask);
+ sig_action.sa_handler = SIG_DFL;
+ for (int signal_num : installed_signals) {
+ ARROW_CHECK(sigaction(signal_num, &sig_action, NULL) == 0);
+ }
+#endif
+#endif
+}
+
+void ArrowLog::ShutDownArrowLog() {
+#ifdef ARROW_USE_GLOG
+ if (!log_dir_->empty()) {
+ google::ShutdownGoogleLogging();
+ }
+#endif
+}
+
+void ArrowLog::InstallFailureSignalHandler() {
+#ifdef ARROW_USE_GLOG
+ google::InstallFailureSignalHandler();
+#endif
+}
+
+bool ArrowLog::IsLevelEnabled(ArrowLogLevel log_level) {
+ return log_level >= severity_threshold_;
+}
+
+ArrowLog::ArrowLog(const char* file_name, int line_number, ArrowLogLevel severity)
+  // glog does not have a DEBUG level; we emulate it using is_enabled_.
+ : logging_provider_(nullptr), is_enabled_(severity >= severity_threshold_) {
+#ifdef ARROW_USE_GLOG
+ if (is_enabled_) {
+ logging_provider_ =
+ new google::LogMessage(file_name, line_number, GetMappedSeverity(severity));
+ }
+#else
+ auto logging_provider = new CerrLog(severity);
+ *logging_provider << file_name << ":" << line_number << ": ";
+ logging_provider_ = logging_provider;
+#endif
+}
+
+std::ostream& ArrowLog::Stream() {
+ auto logging_provider = reinterpret_cast<LoggingProvider*>(logging_provider_);
+#ifdef ARROW_USE_GLOG
+ // Before calling this function, user should check IsEnabled.
+ // When IsEnabled == false, logging_provider_ will be empty.
+ return logging_provider->stream();
+#else
+ return logging_provider->Stream();
+#endif
+}
+
+bool ArrowLog::IsEnabled() const { return is_enabled_; }
+
+ArrowLog::~ArrowLog() {
+ if (logging_provider_ != nullptr) {
+ delete reinterpret_cast<LoggingProvider*>(logging_provider_);
+ logging_provider_ = nullptr;
+ }
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h
new file mode 100644
index 00000000000..15a0188ab76
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h
@@ -0,0 +1,259 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef GANDIVA_IR
+
+// The LLVM IR code doesn't have an NDEBUG mode. And, it shouldn't include references to
+// streams or stdc++. So, making the DCHECK calls void in that case.
+
+#define ARROW_IGNORE_EXPR(expr) ((void)(expr))
+
+#define DCHECK(condition) ARROW_IGNORE_EXPR(condition)
+#define DCHECK_OK(status) ARROW_IGNORE_EXPR(status)
+#define DCHECK_EQ(val1, val2) ARROW_IGNORE_EXPR(val1)
+#define DCHECK_NE(val1, val2) ARROW_IGNORE_EXPR(val1)
+#define DCHECK_LE(val1, val2) ARROW_IGNORE_EXPR(val1)
+#define DCHECK_LT(val1, val2) ARROW_IGNORE_EXPR(val1)
+#define DCHECK_GE(val1, val2) ARROW_IGNORE_EXPR(val1)
+#define DCHECK_GT(val1, val2) ARROW_IGNORE_EXPR(val1)
+
+#else // !GANDIVA_IR
+
+#include <memory>
+#include <ostream>
+#include <string>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+enum class ArrowLogLevel : int {
+ ARROW_DEBUG = -1,
+ ARROW_INFO = 0,
+ ARROW_WARNING = 1,
+ ARROW_ERROR = 2,
+ ARROW_FATAL = 3
+};
+
+#define ARROW_LOG_INTERNAL(level) ::arrow::util::ArrowLog(__FILE__, __LINE__, level)
+#define ARROW_LOG(level) ARROW_LOG_INTERNAL(::arrow::util::ArrowLogLevel::ARROW_##level)
+
+#define ARROW_IGNORE_EXPR(expr) ((void)(expr))
+
+#define ARROW_CHECK(condition) \
+ ARROW_PREDICT_TRUE(condition) \
+ ? ARROW_IGNORE_EXPR(0) \
+ : ::arrow::util::Voidify() & \
+ ::arrow::util::ArrowLog(__FILE__, __LINE__, \
+ ::arrow::util::ArrowLogLevel::ARROW_FATAL) \
+ << " Check failed: " #condition " "
+
+// If 'to_call' returns a bad status, CHECK immediately with a logged message
+// of 'msg' followed by the status.
+#define ARROW_CHECK_OK_PREPEND(to_call, msg) \
+ do { \
+ ::arrow::Status _s = (to_call); \
+ ARROW_CHECK(_s.ok()) << "Operation failed: " << ARROW_STRINGIFY(to_call) << "\n" \
+ << (msg) << ": " << _s.ToString(); \
+ } while (false)
+
+// If the status is bad, CHECK immediately, appending the status to the
+// logged message.
+#define ARROW_CHECK_OK(s) ARROW_CHECK_OK_PREPEND(s, "Bad status")
+
+#define ARROW_CHECK_EQ(val1, val2) ARROW_CHECK((val1) == (val2))
+#define ARROW_CHECK_NE(val1, val2) ARROW_CHECK((val1) != (val2))
+#define ARROW_CHECK_LE(val1, val2) ARROW_CHECK((val1) <= (val2))
+#define ARROW_CHECK_LT(val1, val2) ARROW_CHECK((val1) < (val2))
+#define ARROW_CHECK_GE(val1, val2) ARROW_CHECK((val1) >= (val2))
+#define ARROW_CHECK_GT(val1, val2) ARROW_CHECK((val1) > (val2))
+
+#ifdef NDEBUG
+#define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_WARNING
+
+// CAUTION: DCHECK_OK() always evaluates its argument, but other DCHECK*() macros
+// only do so in debug mode.
+
+#define ARROW_DCHECK(condition) \
+ while (false) ARROW_IGNORE_EXPR(condition); \
+ while (false) ::arrow::util::detail::NullLog()
+#define ARROW_DCHECK_OK(s) \
+ ARROW_IGNORE_EXPR(s); \
+ while (false) ::arrow::util::detail::NullLog()
+#define ARROW_DCHECK_EQ(val1, val2) \
+ while (false) ARROW_IGNORE_EXPR(val1); \
+ while (false) ARROW_IGNORE_EXPR(val2); \
+ while (false) ::arrow::util::detail::NullLog()
+#define ARROW_DCHECK_NE(val1, val2) \
+ while (false) ARROW_IGNORE_EXPR(val1); \
+ while (false) ARROW_IGNORE_EXPR(val2); \
+ while (false) ::arrow::util::detail::NullLog()
+#define ARROW_DCHECK_LE(val1, val2) \
+ while (false) ARROW_IGNORE_EXPR(val1); \
+ while (false) ARROW_IGNORE_EXPR(val2); \
+ while (false) ::arrow::util::detail::NullLog()
+#define ARROW_DCHECK_LT(val1, val2) \
+ while (false) ARROW_IGNORE_EXPR(val1); \
+ while (false) ARROW_IGNORE_EXPR(val2); \
+ while (false) ::arrow::util::detail::NullLog()
+#define ARROW_DCHECK_GE(val1, val2) \
+ while (false) ARROW_IGNORE_EXPR(val1); \
+ while (false) ARROW_IGNORE_EXPR(val2); \
+ while (false) ::arrow::util::detail::NullLog()
+#define ARROW_DCHECK_GT(val1, val2) \
+ while (false) ARROW_IGNORE_EXPR(val1); \
+ while (false) ARROW_IGNORE_EXPR(val2); \
+ while (false) ::arrow::util::detail::NullLog()
+
+#else
+#define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_FATAL
+
+#define ARROW_DCHECK ARROW_CHECK
+#define ARROW_DCHECK_OK ARROW_CHECK_OK
+#define ARROW_DCHECK_EQ ARROW_CHECK_EQ
+#define ARROW_DCHECK_NE ARROW_CHECK_NE
+#define ARROW_DCHECK_LE ARROW_CHECK_LE
+#define ARROW_DCHECK_LT ARROW_CHECK_LT
+#define ARROW_DCHECK_GE ARROW_CHECK_GE
+#define ARROW_DCHECK_GT ARROW_CHECK_GT
+
+#endif // NDEBUG
+
+#define DCHECK ARROW_DCHECK
+#define DCHECK_OK ARROW_DCHECK_OK
+#define DCHECK_EQ ARROW_DCHECK_EQ
+#define DCHECK_NE ARROW_DCHECK_NE
+#define DCHECK_LE ARROW_DCHECK_LE
+#define DCHECK_LT ARROW_DCHECK_LT
+#define DCHECK_GE ARROW_DCHECK_GE
+#define DCHECK_GT ARROW_DCHECK_GT
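+
+// Usage sketch (illustrative only; `ptr`, `batch` and `expected_rows` are
+// placeholders): CHECK macros log and abort on failure, DCHECK macros compile
+// to no-ops under NDEBUG (except DCHECK_OK, which still evaluates its argument).
+//
+//   ARROW_CHECK(ptr != nullptr) << "unexpected null pointer";
+//   DCHECK_EQ(batch.num_rows(), expected_rows);
+//   ARROW_LOG(WARNING) << "falling back to slow path";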
+
+// This code is adapted from
+// https://github.com/ray-project/ray/blob/master/src/ray/util/logging.h.
+
+// To make the logging lib pluggable with other logging libs and keep the
+// implementation hidden from the user, ArrowLog is only a declaration;
+// the implementation is hidden in the logging.cc file.
+// In logging.cc, we can choose different log libs using different macros.
+
+// This is also a null log which does not output anything.
+class ARROW_EXPORT ArrowLogBase {
+ public:
+ virtual ~ArrowLogBase() {}
+
+ virtual bool IsEnabled() const { return false; }
+
+ template <typename T>
+ ArrowLogBase& operator<<(const T& t) {
+ if (IsEnabled()) {
+ Stream() << t;
+ }
+ return *this;
+ }
+
+ protected:
+ virtual std::ostream& Stream() = 0;
+};
+
+class ARROW_EXPORT ArrowLog : public ArrowLogBase {
+ public:
+ ArrowLog(const char* file_name, int line_number, ArrowLogLevel severity);
+ ~ArrowLog() override;
+
+ /// Return whether or not current logging instance is enabled.
+ ///
+ /// \return True if logging is enabled and false otherwise.
+ bool IsEnabled() const override;
+
+ /// The init function of arrow log for a program which should be called only once.
+ ///
+ /// \param appName The app name which starts the log.
+ /// \param severity_threshold Logging threshold for the program.
+ /// \param logDir Logging output file name. If empty, the log won't output to file.
+ static void StartArrowLog(const std::string& appName,
+ ArrowLogLevel severity_threshold = ArrowLogLevel::ARROW_INFO,
+ const std::string& logDir = "");
+
+  /// The shutdown function of arrow log; it should be paired with StartArrowLog.
+ static void ShutDownArrowLog();
+
+  /// Install the failure signal handler to output the call stack on crash.
+ /// If glog is not installed, this function won't do anything.
+ static void InstallFailureSignalHandler();
+
+ /// Uninstall the signal actions installed by InstallFailureSignalHandler.
+ static void UninstallSignalAction();
+
+ /// Return whether or not the log level is enabled in current setting.
+ ///
+ /// \param log_level The input log level to test.
+ /// \return True if input log level is not lower than the threshold.
+ static bool IsLevelEnabled(ArrowLogLevel log_level);
+
+ private:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(ArrowLog);
+
+  // Hide the implementation of the log provider behind a void*.
+  // Otherwise, library users would have to define the same macros in order to
+  // select the correct header file.
+ void* logging_provider_;
+ /// True if log messages should be logged and false if they should be ignored.
+ bool is_enabled_;
+
+ static ArrowLogLevel severity_threshold_;
+
+ protected:
+ std::ostream& Stream() override;
+};
+
+// This class makes ARROW_CHECK compile by converting the << expression to void.
+// This class is copied from glog.
+class ARROW_EXPORT Voidify {
+ public:
+ Voidify() {}
+ // This has to be an operator with a precedence lower than << but
+ // higher than ?:
+ void operator&(ArrowLogBase&) {}
+};
+
+namespace detail {
+
+/// \brief A helper for the nil log sink.
+///
+/// Using this helper is analogous to sending log messages to /dev/null:
+/// nothing gets logged.
+class NullLog {
+ public:
+ /// The no-op output operator.
+ ///
+  /// \param[in] t The object to send into the nil sink.
+  /// \return Reference to the updated object.
+ template <class T>
+ NullLog& operator<<(const T& t) {
+ return *this;
+ }
+};
+
+} // namespace detail
+} // namespace util
+} // namespace arrow
+
+#endif // GANDIVA_IR
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/macros.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/macros.h
new file mode 100644
index 00000000000..548cc041ec8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/macros.h
@@ -0,0 +1,185 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#define ARROW_EXPAND(x) x
+#define ARROW_STRINGIFY(x) #x
+#define ARROW_CONCAT(x, y) x##y
+
+// From Google gutil
+#ifndef ARROW_DISALLOW_COPY_AND_ASSIGN
+#define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+ TypeName(const TypeName&) = delete; \
+ void operator=(const TypeName&) = delete
+#endif
+
+#ifndef ARROW_DEFAULT_MOVE_AND_ASSIGN
+#define ARROW_DEFAULT_MOVE_AND_ASSIGN(TypeName) \
+ TypeName(TypeName&&) = default; \
+ TypeName& operator=(TypeName&&) = default
+#endif
+
+#define ARROW_UNUSED(x) (void)(x)
+#define ARROW_ARG_UNUSED(x)
+//
+// GCC can be told that a certain branch is not likely to be taken (for
+// instance, a CHECK failure), and use that information in static analysis.
+// Giving it this information can help it optimize for the common case in
+// the absence of better information (ie. -fprofile-arcs).
+//
+#if defined(__GNUC__)
+#define ARROW_PREDICT_FALSE(x) (__builtin_expect(!!(x), 0))
+#define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
+#define ARROW_NORETURN __attribute__((noreturn))
+#define ARROW_NOINLINE __attribute__((noinline))
+#define ARROW_PREFETCH(addr) __builtin_prefetch(addr)
+#elif defined(_MSC_VER)
+#define ARROW_NORETURN __declspec(noreturn)
+#define ARROW_NOINLINE __declspec(noinline)
+#define ARROW_PREDICT_FALSE(x) (x)
+#define ARROW_PREDICT_TRUE(x) (x)
+#define ARROW_PREFETCH(addr)
+#else
+#define ARROW_NORETURN
+#define ARROW_PREDICT_FALSE(x) (x)
+#define ARROW_PREDICT_TRUE(x) (x)
+#define ARROW_PREFETCH(addr)
+#endif
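+
+// An illustrative sketch of how these hints are typically used on error paths
+// (an editor addition, not upstream code; Status::IndexError is assumed from
+// arrow/status.h):
+//
+//   if (ARROW_PREDICT_FALSE(index >= length)) {
+//     return Status::IndexError("index out of bounds");
+//   }
+//   // ... fast path continues ...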
+
+#if (defined(__GNUC__) || defined(__APPLE__))
+#define ARROW_MUST_USE_RESULT __attribute__((warn_unused_result))
+#elif defined(_MSC_VER)
+#define ARROW_MUST_USE_RESULT
+#else
+#define ARROW_MUST_USE_RESULT
+#endif
+
+#if defined(__clang__)
+// Only clang supports warn_unused_result as a type annotation.
+#define ARROW_MUST_USE_TYPE ARROW_MUST_USE_RESULT
+#else
+#define ARROW_MUST_USE_TYPE
+#endif
+
+// ----------------------------------------------------------------------
+// C++/CLI support macros (see ARROW-1134)
+
+#ifndef NULLPTR
+
+#ifdef __cplusplus_cli
+#define NULLPTR __nullptr
+#else
+#define NULLPTR nullptr
+#endif
+
+#endif // ifndef NULLPTR
+
+// ----------------------------------------------------------------------
+
+// clang-format off
+// [[deprecated]] is only available since C++14; use this for the time being.
+// This macro takes an optional deprecation message.
+#ifdef __COVERITY__
+# define ARROW_DEPRECATED(...)
+# define ARROW_DEPRECATED_USING(...)
+#elif __cplusplus > 201103L
+# define ARROW_DEPRECATED(...) [[deprecated(__VA_ARGS__)]]
+# define ARROW_DEPRECATED_USING(...) ARROW_DEPRECATED(__VA_ARGS__)
+#else
+# ifdef __GNUC__
+# define ARROW_DEPRECATED(...) __attribute__((deprecated(__VA_ARGS__)))
+# define ARROW_DEPRECATED_USING(...) ARROW_DEPRECATED(__VA_ARGS__)
+# elif defined(_MSC_VER)
+# define ARROW_DEPRECATED(...) __declspec(deprecated(__VA_ARGS__))
+# define ARROW_DEPRECATED_USING(...)
+# else
+# define ARROW_DEPRECATED(...)
+# define ARROW_DEPRECATED_USING(...)
+# endif
+#endif
+// clang-format on
+
+// ----------------------------------------------------------------------
+
+// Macros to disable padding.
+// These macros are portable across different compilers and platforms.
+// [https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355]
+#if !defined(MANUALLY_ALIGNED_STRUCT)
+#if defined(_MSC_VER)
+#define MANUALLY_ALIGNED_STRUCT(alignment) \
+ __pragma(pack(1)); \
+ struct __declspec(align(alignment))
+#define STRUCT_END(name, size) \
+ __pragma(pack()); \
+ static_assert(sizeof(name) == size, "compiler breaks packing rules")
+#elif defined(__GNUC__) || defined(__clang__)
+#define MANUALLY_ALIGNED_STRUCT(alignment) \
+ _Pragma("pack(1)") struct __attribute__((aligned(alignment)))
+#define STRUCT_END(name, size) \
+ _Pragma("pack()") static_assert(sizeof(name) == size, "compiler breaks packing rules")
+#else
+#error Unknown compiler, please define structure alignment macros
+#endif
+#endif // !defined(MANUALLY_ALIGNED_STRUCT)
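+
+// A minimal usage sketch (an editor addition, not part of the upstream
+// header; `Point` is a hypothetical struct):
+//
+//   MANUALLY_ALIGNED_STRUCT(4) Point {
+//     int32_t x;
+//     int32_t y;
+//   };
+//   STRUCT_END(Point, 8);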
+
+// ----------------------------------------------------------------------
+// Convenience macro disabling a particular UBSan check in a function
+
+#if defined(__clang__)
+#define ARROW_DISABLE_UBSAN(feature) __attribute__((no_sanitize(feature)))
+#else
+#define ARROW_DISABLE_UBSAN(feature)
+#endif
+
+// ----------------------------------------------------------------------
+// Machine information
+
+#if INTPTR_MAX == INT64_MAX
+#define ARROW_BITNESS 64
+#elif INTPTR_MAX == INT32_MAX
+#define ARROW_BITNESS 32
+#else
+#error Unexpected INTPTR_MAX
+#endif
+
+// ----------------------------------------------------------------------
+// From googletest
+// (also in parquet-cpp)
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class. For example:
+//
+// class MyClass {
+// private:
+// void MyMethod();
+// FRIEND_TEST(MyClassTest, MyMethod);
+// };
+//
+// class MyClassTest : public testing::Test {
+// // ...
+// };
+//
+// TEST_F(MyClassTest, MyMethod) {
+// // Can call MyClass::MyMethod() here.
+// }
+
+#define FRIEND_TEST(test_case_name, test_name) \
+ friend class test_case_name##_##test_name##_Test
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/make_unique.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/make_unique.h
new file mode 100644
index 00000000000..850e20409b9
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/make_unique.h
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+namespace arrow {
+namespace internal {
+
+template <typename T, typename... A>
+typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type make_unique(
+ A&&... args) {
+ return std::unique_ptr<T>(new T(std::forward<A>(args)...));
+}
+
+template <typename T>
+typename std::enable_if<std::is_array<T>::value && std::extent<T>::value == 0,
+ std::unique_ptr<T>>::type
+make_unique(std::size_t n) {
+ using value_type = typename std::remove_extent<T>::type;
+ return std::unique_ptr<value_type[]>(new value_type[n]);
+}
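+
+// Illustrative usage (an editor sketch; `Widget` and its constructor
+// arguments are hypothetical):
+//
+//   auto widget = arrow::internal::make_unique<Widget>(arg1, arg2);
+//   auto buffer = arrow::internal::make_unique<uint8_t[]>(1024);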
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/memory.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/memory.cc
new file mode 100644
index 00000000000..e91009d5860
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/memory.cc
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <vector>
+
+#include "arrow/util/logging.h"
+#include "arrow/util/memory.h"
+#include "arrow/util/thread_pool.h"
+
+namespace arrow {
+namespace internal {
+
+inline uint8_t* pointer_logical_and(const uint8_t* address, uintptr_t bits) {
+ uintptr_t value = reinterpret_cast<uintptr_t>(address);
+ return reinterpret_cast<uint8_t*>(value & bits);
+}
+
+// This function exists solely to avoid a crash in 32-bit MinGW-w64.
+// See also: https://sourceforge.net/p/mingw-w64/bugs/767/
+void* wrap_memcpy(void* dst, const void* src, size_t n) { return memcpy(dst, src, n); }
+
+void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes,
+ uintptr_t block_size, int num_threads) {
+ // XXX This function is really using `num_threads + 1` threads.
+ auto pool = GetCpuThreadPool();
+
+ uint8_t* left = pointer_logical_and(src + block_size - 1, ~(block_size - 1));
+ uint8_t* right = pointer_logical_and(src + nbytes, ~(block_size - 1));
+ int64_t num_blocks = (right - left) / block_size;
+
+ // Update right address
+ right = right - (num_blocks % num_threads) * block_size;
+
+ // Now we divide these blocks between available threads. The remainder is
+ // handled separately.
+ size_t chunk_size = (right - left) / num_threads;
+ int64_t prefix = left - src;
+ int64_t suffix = src + nbytes - right;
+ // Now the data layout is | prefix | k * num_threads * block_size | suffix |.
+ // We have chunk_size = k * block_size, therefore the data layout is
+ // | prefix | num_threads * chunk_size | suffix |.
+ // Each thread gets a "chunk" of k blocks.
+
+ // Start all parallel memcpy tasks and handle leftovers while threads run.
+ std::vector<Future<void*>> futures;
+
+ for (int i = 0; i < num_threads; i++) {
+ futures.push_back(*pool->Submit(wrap_memcpy, dst + prefix + i * chunk_size,
+ left + i * chunk_size, chunk_size));
+ }
+ memcpy(dst, src, prefix);
+ memcpy(dst + prefix + num_threads * chunk_size, right, suffix);
+
+ for (auto& fut : futures) {
+ ARROW_CHECK_OK(fut.status());
+ }
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/memory.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/memory.h
new file mode 100644
index 00000000000..4250d0694b7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/memory.h
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+// A helper function for doing memcpy with multiple threads. This is required
+// to saturate the memory bandwidth of modern CPUs.
+void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes,
+ uintptr_t block_size, int num_threads);
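+
+// Illustrative call (an editor sketch; `dst` and `src` are hypothetical
+// buffers of at least `nbytes` bytes):
+//
+//   parallel_memcopy(dst, src, /*nbytes=*/1 << 26,
+//                    /*block_size=*/1 << 20, /*num_threads=*/4);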
+
+// A helper function for checking if two wrapped objects implementing `Equals`
+// are equal.
+template <typename T>
+bool SharedPtrEquals(const std::shared_ptr<T>& left, const std::shared_ptr<T>& right) {
+ if (left == right) return true;
+ if (left == NULLPTR || right == NULLPTR) return false;
+ return left->Equals(*right);
+}
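+
+// Illustrative usage (an editor sketch; `a` and `b` are hypothetical
+// std::shared_ptr<Schema> values):
+//
+//   bool same = arrow::internal::SharedPtrEquals(a, b);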
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.cc
new file mode 100644
index 00000000000..7456d7889d8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.cc
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/mutex.h"
+
+#include <mutex>
+
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace util {
+
+struct Mutex::Impl {
+ std::mutex mutex_;
+};
+
+Mutex::Guard::Guard(Mutex* locked)
+ : locked_(locked, [](Mutex* locked) {
+ DCHECK(!locked->impl_->mutex_.try_lock());
+ locked->impl_->mutex_.unlock();
+ }) {}
+
+Mutex::Guard Mutex::TryLock() {
+ DCHECK_NE(impl_, nullptr);
+ if (impl_->mutex_.try_lock()) {
+ return Guard{this};
+ }
+ return Guard{};
+}
+
+Mutex::Guard Mutex::Lock() {
+ DCHECK_NE(impl_, nullptr);
+ impl_->mutex_.lock();
+ return Guard{this};
+}
+
+Mutex::Mutex() : impl_(new Impl, [](Impl* impl) { delete impl; }) {}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h
new file mode 100644
index 00000000000..6c80be380ae
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// A wrapper around std::mutex since we can't use it directly in
+/// public headers due to C++/CLI.
+/// https://docs.microsoft.com/en-us/cpp/standard-library/mutex#remarks
+class ARROW_EXPORT Mutex {
+ public:
+ Mutex();
+ Mutex(Mutex&&) = default;
+ Mutex& operator=(Mutex&&) = default;
+
+ /// A Guard is falsy if a lock could not be acquired.
+ class ARROW_EXPORT Guard {
+ public:
+ Guard() : locked_(NULLPTR, [](Mutex* /* mutex */) {}) {}
+ Guard(Guard&&) = default;
+ Guard& operator=(Guard&&) = default;
+
+ explicit operator bool() const { return bool(locked_); }
+
+ void Unlock() { locked_.reset(); }
+
+ private:
+ explicit Guard(Mutex* locked);
+
+ std::unique_ptr<Mutex, void (*)(Mutex*)> locked_;
+ friend Mutex;
+ };
+
+ Guard TryLock();
+ Guard Lock();
+
+ private:
+ struct Impl;
+ std::unique_ptr<Impl, void (*)(Impl*)> impl_;
+};
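+
+// Illustrative usage (an editor sketch, not part of the upstream header):
+//
+//   arrow::util::Mutex mutex;
+//   {
+//     auto guard = mutex.Lock();  // blocks until the lock is acquired
+//     // ... critical section ...
+//   }  // unlocked when guard goes out of scope
+//
+//   if (auto guard = mutex.TryLock()) {
+//     // lock acquired; the Guard is truthy
+//   }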
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/optional.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/optional.h
new file mode 100644
index 00000000000..b824b499bb8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/optional.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <optional>
+
+namespace arrow {
+namespace util {
+
+template <typename T>
+using optional = std::optional<T>;
+
+using std::bad_optional_access;
+using std::make_optional;
+using std::nullopt;
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h
new file mode 100644
index 00000000000..80f60fbdb36
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/vector.h"
+
+namespace arrow {
+namespace internal {
+
+// A parallelizer that takes a `Status(int)` function and calls it with
+// arguments between 0 and `num_tasks - 1`, on an arbitrary number of threads.
+
+template <class FUNCTION>
+Status ParallelFor(int num_tasks, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
+ std::vector<Future<>> futures(num_tasks);
+
+ for (int i = 0; i < num_tasks; ++i) {
+ ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i));
+ }
+ auto st = Status::OK();
+ for (auto& fut : futures) {
+ st &= fut.status();
+ }
+ return st;
+}
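+
+// Illustrative usage (an editor sketch; `num_chunks` and `ProcessChunk`, a
+// Status(int) callable, are hypothetical):
+//
+//   RETURN_NOT_OK(ParallelFor(num_chunks, ProcessChunk));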
+
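+// A parallelizer that takes a `Result<R>(int index, T item)` function and
+// calls it once per input item, on an arbitrary number of threads, returning
+// a future that resolves to the vector of all results.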
+template <class FUNCTION, typename T,
+ typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
+Future<std::vector<R>> ParallelForAsync(
+ std::vector<T> inputs, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
+ std::vector<Future<R>> futures(inputs.size());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i, std::move(inputs[i])));
+ }
+ return All(std::move(futures))
+ .Then([](const std::vector<Result<R>>& results) -> Result<std::vector<R>> {
+ return UnwrapOrRaise(results);
+ });
+}
+
+// A parallelizer that takes a `Status(int)` function and calls it with
+// arguments between 0 and `num_tasks - 1`, in sequence or in parallel,
+// depending on the input boolean.
+
+template <class FUNCTION>
+Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
+ if (use_threads) {
+ return ParallelFor(num_tasks, std::forward<FUNCTION>(func), executor);
+ } else {
+ for (int i = 0; i < num_tasks; ++i) {
+ RETURN_NOT_OK(func(i));
+ }
+ return Status::OK();
+ }
+}
+
+// A parallelizer that takes a `Result<R>(int index, T item)` function and
+// calls it with each item from the input array, in sequence or in parallel,
+// depending on the input boolean.
+
+template <class FUNCTION, typename T,
+ typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
+Future<std::vector<R>> OptionalParallelForAsync(
+ bool use_threads, std::vector<T> inputs, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
+ if (use_threads) {
+ return ParallelForAsync(std::move(inputs), std::forward<FUNCTION>(func), executor);
+ } else {
+ std::vector<R> result(inputs.size());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(result[i], func(i, inputs[i]));
+ }
+ return result;
+ }
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h
new file mode 100644
index 00000000000..6c71fa6e155
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/vendored/ProducerConsumerQueue.h"
+
+namespace arrow {
+namespace util {
+
+template <typename T>
+using SpscQueue = arrow_vendored::folly::ProducerConsumerQueue<T>;
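+
+// Illustrative single-producer/single-consumer usage (an editor sketch; it
+// relies on the vendored folly queue's write()/read() interface):
+//
+//   SpscQueue<int> queue(/*size=*/16);
+//   queue.write(42);    // called from the producer thread
+//   int value;
+//   queue.read(value);  // called from the consumer thread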
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/range.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/range.h
new file mode 100644
index 00000000000..ea0fb0eeaab
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/range.h
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+namespace arrow {
+namespace internal {
+
+/// Create a vector containing the values from start up to stop
+template <typename T>
+std::vector<T> Iota(T start, T stop) {
+ if (start > stop) {
+ return {};
+ }
+ std::vector<T> result(static_cast<size_t>(stop - start));
+ std::iota(result.begin(), result.end(), start);
+ return result;
+}
+
+/// Create a vector containing the values from 0 up to length
+template <typename T>
+std::vector<T> Iota(T length) {
+ return Iota(static_cast<T>(0), length);
+}
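+
+// For example (illustrative): Iota(3, 7) yields {3, 4, 5, 6} and
+// Iota(4) yields {0, 1, 2, 3}.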
+
+/// Create a range from a callable, which takes a single index parameter and
+/// returns the value of the iterator at that index, and a length.
+/// Only iterators obtained from the same range should be compared; the
+/// behaviour is generally similar to that of other STL containers.
+template <typename Generator>
+class LazyRange {
+ private:
+ // callable which generates the values
+ // has to be defined at the beginning of the class for type deduction
+ const Generator gen_;
+ // the length of the range
+ int64_t length_;
+#ifdef _MSC_VER
+  // workaround for VS2010 not supporting decltype properly
+ // see https://stackoverflow.com/questions/21782846/decltype-for-class-member-function
+ static Generator gen_static_;
+#endif
+
+ public:
+#ifdef _MSC_VER
+ using return_type = decltype(gen_static_(0));
+#else
+ using return_type = decltype(gen_(0));
+#endif
+
+ /// Construct a new range from a callable and length
+ LazyRange(Generator gen, int64_t length) : gen_(gen), length_(length) {}
+
+ // Class of the dependent iterator, created implicitly by begin and end
+ class RangeIter {
+ public:
+ using difference_type = int64_t;
+ using value_type = return_type;
+ using reference = const value_type&;
+ using pointer = const value_type*;
+ using iterator_category = std::forward_iterator_tag;
+
+#ifdef _MSC_VER
+ // msvc complains about unchecked iterators,
+ // see https://stackoverflow.com/questions/21655496/error-c4996-checked-iterators
+ using _Unchecked_type = typename LazyRange<Generator>::RangeIter;
+#endif
+
+ RangeIter() = delete;
+ RangeIter(const RangeIter& other) = default;
+ RangeIter& operator=(const RangeIter& other) = default;
+
+ RangeIter(const LazyRange<Generator>& range, int64_t index)
+ : range_(&range), index_(index) {}
+
+ const return_type operator*() const { return range_->gen_(index_); }
+
+ RangeIter operator+(difference_type length) const {
+ return RangeIter(*range_, index_ + length);
+ }
+
+ // pre-increment
+ RangeIter& operator++() {
+ ++index_;
+ return *this;
+ }
+
+ // post-increment
+ RangeIter operator++(int) {
+ auto copy = RangeIter(*this);
+ ++index_;
+ return copy;
+ }
+
+ bool operator==(const typename LazyRange<Generator>::RangeIter& other) const {
+ return this->index_ == other.index_ && this->range_ == other.range_;
+ }
+
+ bool operator!=(const typename LazyRange<Generator>::RangeIter& other) const {
+ return this->index_ != other.index_ || this->range_ != other.range_;
+ }
+
+ int64_t operator-(const typename LazyRange<Generator>::RangeIter& other) const {
+ return this->index_ - other.index_;
+ }
+
+ bool operator<(const typename LazyRange<Generator>::RangeIter& other) const {
+ return this->index_ < other.index_;
+ }
+
+ private:
+ // parent range reference
+ const LazyRange* range_;
+ // current index
+ int64_t index_;
+ };
+
+ friend class RangeIter;
+
+ // Create a new begin const iterator
+ RangeIter begin() { return RangeIter(*this, 0); }
+
+ // Create a new end const iterator
+ RangeIter end() { return RangeIter(*this, length_); }
+};
+
+/// Helper function to create a lazy range from a callable (e.g. a lambda) and a length
+template <typename Generator>
+LazyRange<Generator> MakeLazyRange(Generator&& gen, int64_t length) {
+ return LazyRange<Generator>(std::forward<Generator>(gen), length);
+}
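+
+// Illustrative usage (an editor sketch, not part of the upstream header):
+//
+//   auto squares = MakeLazyRange([](int64_t i) { return i * i; }, 5);
+//   for (auto v : squares) {
+//     // visits 0, 1, 4, 9, 16 without materializing a vector
+//   }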
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h
new file mode 100644
index 00000000000..0440a2eb563
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <tuple>
+#include <utility>
+
+#include "arrow/type_traits.h"
+#include "arrow/util/string_view.h"
+
+namespace arrow {
+namespace internal {
+
+template <size_t...>
+struct index_sequence {};
+
+template <size_t N, size_t Head = N, size_t... Tail>
+struct make_index_sequence_impl;
+
+template <size_t N>
+using make_index_sequence = typename make_index_sequence_impl<N>::type;
+
+template <typename... T>
+using index_sequence_for = make_index_sequence<sizeof...(T)>;
+
+template <size_t N, size_t... I>
+struct make_index_sequence_impl<N, 0, I...> {
+ using type = index_sequence<I...>;
+};
+
+template <size_t N, size_t H, size_t... I>
+struct make_index_sequence_impl : make_index_sequence_impl<N, H - 1, H - 1, I...> {};
+
+static_assert(std::is_same<index_sequence<>, make_index_sequence<0>>::value, "");
+static_assert(std::is_same<index_sequence<0, 1, 2>, make_index_sequence<3>>::value, "");
+
+template <typename...>
+struct all_same : std::true_type {};
+
+template <typename One>
+struct all_same<One> : std::true_type {};
+
+template <typename Same, typename... Rest>
+struct all_same<Same, Same, Rest...> : all_same<Same, Rest...> {};
+
+template <typename One, typename Other, typename... Rest>
+struct all_same<One, Other, Rest...> : std::false_type {};
+
+template <size_t... I, typename... T, typename Fn>
+void ForEachTupleMemberImpl(const std::tuple<T...>& tup, Fn&& fn, index_sequence<I...>) {
+ (void)std::make_tuple((fn(std::get<I>(tup), I), std::ignore)...);
+}
+
+template <typename... T, typename Fn>
+void ForEachTupleMember(const std::tuple<T...>& tup, Fn&& fn) {
+ ForEachTupleMemberImpl(tup, fn, index_sequence_for<T...>());
+}
+
+template <typename C, typename T>
+struct DataMemberProperty {
+ using Class = C;
+ using Type = T;
+
+ constexpr const Type& get(const Class& obj) const { return obj.*ptr_; }
+
+ void set(Class* obj, Type value) const { (*obj).*ptr_ = std::move(value); }
+
+ constexpr util::string_view name() const { return name_; }
+
+ util::string_view name_;
+ Type Class::*ptr_;
+};
+
+template <typename Class, typename Type>
+constexpr DataMemberProperty<Class, Type> DataMember(util::string_view name,
+ Type Class::*ptr) {
+ return {name, ptr};
+}
+
+template <typename... Properties>
+struct PropertyTuple {
+ template <typename Fn>
+ void ForEach(Fn&& fn) const {
+ ForEachTupleMember(props_, fn);
+ }
+
+ static_assert(all_same<typename Properties::Class...>::value,
+ "All properties must be properties of the same class");
+
+ size_t size() const { return sizeof...(Properties); }
+
+ std::tuple<Properties...> props_;
+};
+
+template <typename... Properties>
+PropertyTuple<Properties...> MakeProperties(Properties... props) {
+ return {std::make_tuple(props...)};
+}
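+
+// Illustrative usage (an editor sketch; `Point` is a hypothetical struct):
+//
+//   struct Point {
+//     int32_t x;
+//     int32_t y;
+//   };
+//   static const auto kPointProperties = MakeProperties(
+//       DataMember("x", &Point::x), DataMember("y", &Point::y));
+//
+//   // kPointProperties.ForEach(fn) invokes fn(property, index) for each
+//   // declared data member.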
+
+template <typename Enum>
+struct EnumTraits {};
+
+template <typename Enum, Enum... Values>
+struct BasicEnumTraits {
+ using CType = typename std::underlying_type<Enum>::type;
+ using Type = typename CTypeTraits<CType>::ArrowType;
+ static std::array<Enum, sizeof...(Values)> values() { return {Values...}; }
+};
+
+template <typename T, typename Enable = void>
+struct has_enum_traits : std::false_type {};
+
+template <typename T>
+struct has_enum_traits<T, void_t<typename EnumTraits<T>::Type>> : std::true_type {};
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h
new file mode 100644
index 00000000000..68d29930666
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h
@@ -0,0 +1,826 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Imported from Apache Impala (incubating) on 2016-01-29 and modified for use
+// in parquet-cpp, Arrow
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace util {
+
+/// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs
+/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed
+/// (literal encoding).
+/// For both types of runs, there is a byte-aligned indicator which encodes the length
+/// of the run and the type of the run.
+/// This encoding has the benefit that when there aren't any long enough runs, values
+/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and
+/// the run length are byte aligned. This allows for very efficient decoding
+/// implementations.
+/// The encoding is:
+/// encoded-block := run*
+/// run := literal-run | repeated-run
+/// literal-run := literal-indicator < literal bytes >
+/// repeated-run := repeated-indicator < repeated value. padded to byte boundary >
+/// literal-indicator := varint_encode( number_of_groups << 1 | 1)
+/// repeated-indicator := varint_encode( number_of_repetitions << 1 )
+//
+/// Each run is preceded by a varint. The varint's least significant bit is
+/// used to indicate whether the run is a literal run or a repeated run. The rest
+/// of the varint is used to determine the length of the run (e.g. how many
+/// times the value repeats).
+//
+/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
+/// in groups of 8), so that no matter the bit-width of the value, the sequence will end
+/// on a byte boundary without padding.
+/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than
+/// the actual number of encoded ints. (This means that the total number of encoded values
+/// cannot be determined from the encoded data, since the number of values in the last
+/// group may not be a multiple of 8). For the last group of literal runs, we pad
+/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side
+/// without the need for additional checks.
+//
+/// There is a break-even point at which run-length encoding becomes more
+/// storage efficient. For 1 bit-width values, that point is 8 values: they
+/// require 2 bytes for either the repeated encoding or the literal encoding.
+/// This value can always be computed based on the bit-width.
+/// TODO: think about how to use this for strings. The bit packing isn't quite the same.
+//
+/// Examples with bit-width 1 (e.g. encoding booleans):
+/// ----------------------------------------
+/// 100 1s followed by 100 0s:
+/// <varint(100 << 1)> <1, padded to 1 byte> <varint(100 << 1)> <0, padded to 1 byte>
+/// - (total 4 bytes)
+//
+/// alternating 1s and 0s (200 total):
+/// 200 ints = 25 groups of 8
+/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
+/// (total 26 bytes, 1 byte overhead)
+//
+
+/// Decoder class for RLE encoded data.
+class RleDecoder {
+ public:
+ /// Create a decoder object. buffer/buffer_len is the decoded data.
+ /// bit_width is the width of each value (before encoding).
+ RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
+ : bit_reader_(buffer, buffer_len),
+ bit_width_(bit_width),
+ current_value_(0),
+ repeat_count_(0),
+ literal_count_(0) {
+ DCHECK_GE(bit_width_, 0);
+ DCHECK_LE(bit_width_, 64);
+ }
+
+ RleDecoder() : bit_width_(-1) {}
+
+ void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
+ DCHECK_GE(bit_width, 0);
+ DCHECK_LE(bit_width, 64);
+ bit_reader_.Reset(buffer, buffer_len);
+ bit_width_ = bit_width;
+ current_value_ = 0;
+ repeat_count_ = 0;
+ literal_count_ = 0;
+ }
+
+ /// Gets the next value. Returns false if there are no more.
+ template <typename T>
+ bool Get(T* val);
+
+ /// Gets a batch of values. Returns the number of decoded elements.
+ template <typename T>
+ int GetBatch(T* values, int batch_size);
+
+ /// Like GetBatch but add spacing for null entries
+ template <typename T>
+ int GetBatchSpaced(int batch_size, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, T* out);
+
+ /// Like GetBatch but the values are then decoded using the provided dictionary
+ template <typename T>
+ int GetBatchWithDict(const T* dictionary, int32_t dictionary_length, T* values,
+ int batch_size);
+
+ /// Like GetBatchWithDict but add spacing for null entries
+ ///
+ /// Null entries will be zero-initialized in `values` to avoid leaking
+ /// private data.
+ template <typename T>
+ int GetBatchWithDictSpaced(const T* dictionary, int32_t dictionary_length, T* values,
+ int batch_size, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset);
+
+ protected:
+ BitUtil::BitReader bit_reader_;
+ /// Number of bits needed to encode the value. Must be between 0 and 64.
+ int bit_width_;
+ uint64_t current_value_;
+ int32_t repeat_count_;
+ int32_t literal_count_;
+
+ private:
+ /// Fills literal_count_ and repeat_count_ with next values. Returns false if there
+ /// are no more.
+ template <typename T>
+ bool NextCounts();
+
+ /// Utility methods for retrieving spaced values.
+ template <typename T, typename RunType, typename Converter>
+ int GetSpaced(Converter converter, int batch_size, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset, T* out);
+};
+
+/// Class to incrementally build the rle data. This class does not allocate any memory.
+/// The encoding has two modes: encoding repeated runs and literal runs.
+/// If the run is sufficiently short, it is more efficient to encode as a literal run.
+/// This class does so by buffering 8 values at a time. If they are not all the same
+/// they are added to the literal run. If they are the same, they are added to the
+/// repeated run. When we switch modes, the previous run is flushed out.
+class RleEncoder {
+ public:
+ /// buffer/buffer_len: preallocated output buffer.
+ /// bit_width: max number of bits for value.
+ /// TODO: consider adding a min_repeated_run_length so the caller can control
+ /// when values should be encoded as repeated runs. Currently this is derived
+  /// based on the bit_width, which can determine a storage-optimal choice.
+ /// TODO: allow 0 bit_width (and have dict encoder use it)
+ RleEncoder(uint8_t* buffer, int buffer_len, int bit_width)
+ : bit_width_(bit_width), bit_writer_(buffer, buffer_len) {
+ DCHECK_GE(bit_width_, 0);
+ DCHECK_LE(bit_width_, 64);
+ max_run_byte_size_ = MinBufferSize(bit_width);
+ DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough.";
+ Clear();
+ }
+
+ /// Returns the minimum buffer size needed to use the encoder for 'bit_width'
+ /// This is the maximum length of a single run for 'bit_width'.
+ /// It is not valid to pass a buffer less than this length.
+ static int MinBufferSize(int bit_width) {
+ /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values.
+ int max_literal_run_size =
+ 1 +
+ static_cast<int>(BitUtil::BytesForBits(MAX_VALUES_PER_LITERAL_RUN * bit_width));
+ /// Up to kMaxVlqByteLength indicator and a single 'bit_width' value.
+ int max_repeated_run_size = BitUtil::BitReader::kMaxVlqByteLength +
+ static_cast<int>(BitUtil::BytesForBits(bit_width));
+ return std::max(max_literal_run_size, max_repeated_run_size);
+ }
+
+ /// Returns the maximum byte size it could take to encode 'num_values'.
+ static int MaxBufferSize(int bit_width, int num_values) {
+ // For a bit_width > 1, the worst case is the repetition of "literal run of length 8
+ // and then a repeated run of length 8".
+ // 8 values per smallest run, 8 bits per byte
+ int bytes_per_run = bit_width;
+ int num_runs = static_cast<int>(BitUtil::CeilDiv(num_values, 8));
+ int literal_max_size = num_runs + num_runs * bytes_per_run;
+
+ // In the very worst case scenario, the data is a concatenation of repeated
+ // runs of 8 values. Repeated run has a 1 byte varint followed by the
+ // bit-packed repeated value
+ int min_repeated_run_size = 1 + static_cast<int>(BitUtil::BytesForBits(bit_width));
+ int repeated_max_size =
+ static_cast<int>(BitUtil::CeilDiv(num_values, 8)) * min_repeated_run_size;
+
+ return std::max(literal_max_size, repeated_max_size);
+ }
+
+ /// Encode value. Returns true if the value fits in buffer, false otherwise.
+ /// This value must be representable with bit_width_ bits.
+ bool Put(uint64_t value);
+
+ /// Flushes any pending values to the underlying buffer.
+ /// Returns the total number of bytes written
+ int Flush();
+
+ /// Resets all the state in the encoder.
+ void Clear();
+
+ /// Returns pointer to underlying buffer
+ uint8_t* buffer() { return bit_writer_.buffer(); }
+ int32_t len() { return bit_writer_.bytes_written(); }
+
+ private:
+ /// Flushes any buffered values. If this is part of a repeated run, this is largely
+ /// a no-op.
+ /// If it is part of a literal run, this will call FlushLiteralRun, which writes
+ /// out the buffered literal values.
+  /// If 'done' is true, the current run is written out even if it would
+  /// normally have been buffered further. This should only happen at the end,
+  /// once the encoder has received all values, even though the run would
+  /// normally continue to be buffered.
+ void FlushBufferedValues(bool done);
+
+ /// Flushes literal values to the underlying buffer. If update_indicator_byte,
+ /// then the current literal run is complete and the indicator byte is updated.
+ void FlushLiteralRun(bool update_indicator_byte);
+
+ /// Flushes a repeated run to the underlying buffer.
+ void FlushRepeatedRun();
+
+ /// Checks and sets buffer_full_. This must be called after flushing a run to
+ /// make sure there are enough bytes remaining to encode the next run.
+ void CheckBufferFull();
+
+ /// The maximum number of values in a single literal run
+ /// (number of groups encodable by a 1-byte indicator * 8)
+ static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8;
+
+ /// Number of bits needed to encode the value. Must be between 0 and 64.
+ const int bit_width_;
+
+ /// Underlying buffer.
+ BitUtil::BitWriter bit_writer_;
+
+ /// If true, the buffer is full and subsequent Put()'s will fail.
+ bool buffer_full_;
+
+ /// The maximum byte size a single run can take.
+ int max_run_byte_size_;
+
+ /// We need to buffer at most 8 values for literals. This happens when the
+ /// bit_width is 1 (so 8 values fit in one byte).
+ /// TODO: generalize this to other bit widths
+ int64_t buffered_values_[8];
+
+ /// Number of values in buffered_values_
+ int num_buffered_values_;
+
+ /// The current (also last) value that was written and the count of how
+ /// many times in a row that value has been seen. This is maintained even
+  /// if we are in a literal run. If repeat_count_ gets high enough, we switch
+ /// to encoding repeated runs.
+ uint64_t current_value_;
+ int repeat_count_;
+
+ /// Number of literals in the current run. This does not include the literals
+ /// that might be in buffered_values_. Only after we've got a group big enough
+  /// can we decide whether they should be part of the literal_count_ or repeat_count_.
+ int literal_count_;
+
+ /// Pointer to a byte in the underlying buffer that stores the indicator byte.
+ /// This is reserved as soon as we need a literal run but the value is written
+ /// when the literal run is complete.
+ uint8_t* literal_indicator_byte_;
+};
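+
+// Illustrative round trip (an editor sketch, not part of the upstream
+// header):
+//
+//   uint8_t buffer[128];  // >= RleEncoder::MinBufferSize(1)
+//   RleEncoder encoder(buffer, sizeof(buffer), /*bit_width=*/1);
+//   for (int i = 0; i < 100; ++i) encoder.Put(1);
+//   int encoded_len = encoder.Flush();
+//
+//   RleDecoder decoder(buffer, encoded_len, /*bit_width=*/1);
+//   bool value = false;
+//   decoder.Get(&value);  // value is now true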
+
+template <typename T>
+inline bool RleDecoder::Get(T* val) {
+ return GetBatch(val, 1) == 1;
+}
+
+template <typename T>
+inline int RleDecoder::GetBatch(T* values, int batch_size) {
+ DCHECK_GE(bit_width_, 0);
+ int values_read = 0;
+
+ auto* out = values;
+
+ while (values_read < batch_size) {
+ int remaining = batch_size - values_read;
+
+ if (repeat_count_ > 0) { // Repeated value case.
+ int repeat_batch = std::min(remaining, repeat_count_);
+ std::fill(out, out + repeat_batch, static_cast<T>(current_value_));
+
+ repeat_count_ -= repeat_batch;
+ values_read += repeat_batch;
+ out += repeat_batch;
+ } else if (literal_count_ > 0) {
+ int literal_batch = std::min(remaining, literal_count_);
+ int actual_read = bit_reader_.GetBatch(bit_width_, out, literal_batch);
+ if (actual_read != literal_batch) {
+ return values_read;
+ }
+
+ literal_count_ -= literal_batch;
+ values_read += literal_batch;
+ out += literal_batch;
+ } else {
+ if (!NextCounts<T>()) return values_read;
+ }
+ }
+
+ return values_read;
+}
+
+template <typename T, typename RunType, typename Converter>
+inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset,
+ T* out) {
+ if (ARROW_PREDICT_FALSE(null_count == batch_size)) {
+ converter.FillZero(out, out + batch_size);
+ return batch_size;
+ }
+
+ DCHECK_GE(bit_width_, 0);
+ int values_read = 0;
+ int values_remaining = batch_size - null_count;
+
+ // Assume no bits to start.
+ arrow::internal::BitRunReader bit_reader(valid_bits, valid_bits_offset,
+ /*length=*/batch_size);
+ arrow::internal::BitRun valid_run = bit_reader.NextRun();
+ while (values_read < batch_size) {
+ if (ARROW_PREDICT_FALSE(valid_run.length == 0)) {
+ valid_run = bit_reader.NextRun();
+ }
+
+ DCHECK_GT(batch_size, 0);
+ DCHECK_GT(valid_run.length, 0);
+
+ if (valid_run.set) {
+ if ((repeat_count_ == 0) && (literal_count_ == 0)) {
+ if (!NextCounts<RunType>()) return values_read;
+ DCHECK((repeat_count_ > 0) ^ (literal_count_ > 0));
+ }
+
+ if (repeat_count_ > 0) {
+ int repeat_batch = 0;
+        // Consume the entire repeat count, incrementing repeat_batch so that
+        // it becomes the total of nulls + values consumed. We only need the
+        // total count because we can fill in the same value for nulls and
+        // non-nulls. This proves to be a big efficiency win.
+ while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
+ DCHECK_GT(valid_run.length, 0);
+ if (valid_run.set) {
+ int update_size = std::min(static_cast<int>(valid_run.length), repeat_count_);
+ repeat_count_ -= update_size;
+ repeat_batch += update_size;
+ valid_run.length -= update_size;
+ values_remaining -= update_size;
+ } else {
+ // We can consume all nulls here because we would do so on
+ // the next loop anyways.
+ repeat_batch += static_cast<int>(valid_run.length);
+ valid_run.length = 0;
+ }
+ if (valid_run.length == 0) {
+ valid_run = bit_reader.NextRun();
+ }
+ }
+ RunType current_value = static_cast<RunType>(current_value_);
+ if (ARROW_PREDICT_FALSE(!converter.IsValid(current_value))) {
+ return values_read;
+ }
+ converter.Fill(out, out + repeat_batch, current_value);
+ out += repeat_batch;
+ values_read += repeat_batch;
+ } else if (literal_count_ > 0) {
+ int literal_batch = std::min(values_remaining, literal_count_);
+ DCHECK_GT(literal_batch, 0);
+
+ // Decode the literals
+ constexpr int kBufferSize = 1024;
+ RunType indices[kBufferSize];
+ literal_batch = std::min(literal_batch, kBufferSize);
+ int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch);
+ if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) {
+ return values_read;
+ }
+ if (!converter.IsValid(indices, /*length=*/actual_read)) {
+ return values_read;
+ }
+ int skipped = 0;
+ int literals_read = 0;
+ while (literals_read < literal_batch) {
+ if (valid_run.set) {
+ int update_size = std::min(literal_batch - literals_read,
+ static_cast<int>(valid_run.length));
+ converter.Copy(out, indices + literals_read, update_size);
+ literals_read += update_size;
+ out += update_size;
+ valid_run.length -= update_size;
+ } else {
+ converter.FillZero(out, out + valid_run.length);
+ out += valid_run.length;
+ skipped += static_cast<int>(valid_run.length);
+ valid_run.length = 0;
+ }
+ if (valid_run.length == 0) {
+ valid_run = bit_reader.NextRun();
+ }
+ }
+ literal_count_ -= literal_batch;
+ values_remaining -= literal_batch;
+ values_read += literal_batch + skipped;
+ }
+ } else {
+ converter.FillZero(out, out + valid_run.length);
+ out += valid_run.length;
+ values_read += static_cast<int>(valid_run.length);
+ valid_run.length = 0;
+ }
+ }
+ DCHECK_EQ(valid_run.length, 0);
+ DCHECK_EQ(values_remaining, 0);
+ return values_read;
+}
+
+// Converter for GetSpaced that handles runs that get returned
+// directly as output.
+template <typename T>
+struct PlainRleConverter {
+ T kZero = {};
+ inline bool IsValid(const T& values) const { return true; }
+ inline bool IsValid(const T* values, int32_t length) const { return true; }
+ inline void Fill(T* begin, T* end, const T& run_value) const {
+ std::fill(begin, end, run_value);
+ }
+ inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); }
+ inline void Copy(T* out, const T* values, int length) const {
+ std::memcpy(out, values, length * sizeof(T));
+ }
+};
+
+template <typename T>
+inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset, T* out) {
+ if (null_count == 0) {
+ return GetBatch<T>(out, batch_size);
+ }
+
+ PlainRleConverter<T> converter;
+ arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset,
+ batch_size);
+
+ int total_processed = 0;
+ int processed = 0;
+ arrow::internal::BitBlockCount block;
+
+ do {
+ block = block_counter.NextFourWords();
+ if (block.length == 0) {
+ break;
+ }
+ if (block.AllSet()) {
+ processed = GetBatch<T>(out, block.length);
+ } else if (block.NoneSet()) {
+ converter.FillZero(out, out + block.length);
+ processed = block.length;
+ } else {
+ processed = GetSpaced<T, /*RunType=*/T, PlainRleConverter<T>>(
+ converter, block.length, block.length - block.popcount, valid_bits,
+ valid_bits_offset, out);
+ }
+ total_processed += processed;
+ out += block.length;
+ valid_bits_offset += block.length;
+ } while (processed == block.length);
+ return total_processed;
+}
+
+static inline bool IndexInRange(int32_t idx, int32_t dictionary_length) {
+ return idx >= 0 && idx < dictionary_length;
+}
+
+// Converter for GetSpaced that handles runs of returned dictionary
+// indices.
+template <typename T>
+struct DictionaryConverter {
+ T kZero = {};
+ const T* dictionary;
+ int32_t dictionary_length;
+
+ inline bool IsValid(int32_t value) { return IndexInRange(value, dictionary_length); }
+
+ inline bool IsValid(const int32_t* values, int32_t length) const {
+ using IndexType = int32_t;
+ IndexType min_index = std::numeric_limits<IndexType>::max();
+ IndexType max_index = std::numeric_limits<IndexType>::min();
+ for (int x = 0; x < length; x++) {
+ min_index = std::min(values[x], min_index);
+ max_index = std::max(values[x], max_index);
+ }
+
+ return IndexInRange(min_index, dictionary_length) &&
+ IndexInRange(max_index, dictionary_length);
+ }
+ inline void Fill(T* begin, T* end, const int32_t& run_value) const {
+ std::fill(begin, end, dictionary[run_value]);
+ }
+ inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); }
+
+ inline void Copy(T* out, const int32_t* values, int length) const {
+ for (int x = 0; x < length; x++) {
+ out[x] = dictionary[values[x]];
+ }
+ }
+};
+
+template <typename T>
+inline int RleDecoder::GetBatchWithDict(const T* dictionary, int32_t dictionary_length,
+ T* values, int batch_size) {
+ // Per https://github.com/apache/parquet-format/blob/master/Encodings.md,
+ // the maximum dictionary index width in Parquet is 32 bits.
+ using IndexType = int32_t;
+ DictionaryConverter<T> converter;
+ converter.dictionary = dictionary;
+ converter.dictionary_length = dictionary_length;
+
+ DCHECK_GE(bit_width_, 0);
+ int values_read = 0;
+
+ auto* out = values;
+
+ while (values_read < batch_size) {
+ int remaining = batch_size - values_read;
+
+ if (repeat_count_ > 0) {
+ auto idx = static_cast<IndexType>(current_value_);
+ if (ARROW_PREDICT_FALSE(!IndexInRange(idx, dictionary_length))) {
+ return values_read;
+ }
+ T val = dictionary[idx];
+
+ int repeat_batch = std::min(remaining, repeat_count_);
+ std::fill(out, out + repeat_batch, val);
+
+ /* Upkeep counters */
+ repeat_count_ -= repeat_batch;
+ values_read += repeat_batch;
+ out += repeat_batch;
+ } else if (literal_count_ > 0) {
+ constexpr int kBufferSize = 1024;
+ IndexType indices[kBufferSize];
+
+ int literal_batch = std::min(remaining, literal_count_);
+ literal_batch = std::min(literal_batch, kBufferSize);
+
+ int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch);
+ if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) {
+ return values_read;
+ }
+ if (ARROW_PREDICT_FALSE(!converter.IsValid(indices, /*length=*/literal_batch))) {
+ return values_read;
+ }
+ converter.Copy(out, indices, literal_batch);
+
+ /* Upkeep counters */
+ literal_count_ -= literal_batch;
+ values_read += literal_batch;
+ out += literal_batch;
+ } else {
+ if (!NextCounts<IndexType>()) return values_read;
+ }
+ }
+
+ return values_read;
+}
+
+template <typename T>
+inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary,
+ int32_t dictionary_length, T* out,
+ int batch_size, int null_count,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) {
+ if (null_count == 0) {
+ return GetBatchWithDict<T>(dictionary, dictionary_length, out, batch_size);
+ }
+ arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset,
+ batch_size);
+ using IndexType = int32_t;
+ DictionaryConverter<T> converter;
+ converter.dictionary = dictionary;
+ converter.dictionary_length = dictionary_length;
+
+ int total_processed = 0;
+ int processed = 0;
+ arrow::internal::BitBlockCount block;
+ do {
+ block = block_counter.NextFourWords();
+ if (block.length == 0) {
+ break;
+ }
+ if (block.AllSet()) {
+ processed = GetBatchWithDict<T>(dictionary, dictionary_length, out, block.length);
+ } else if (block.NoneSet()) {
+ converter.FillZero(out, out + block.length);
+ processed = block.length;
+ } else {
+ processed = GetSpaced<T, /*RunType=*/IndexType, DictionaryConverter<T>>(
+ converter, block.length, block.length - block.popcount, valid_bits,
+ valid_bits_offset, out);
+ }
+ total_processed += processed;
+ out += block.length;
+ valid_bits_offset += block.length;
+ } while (processed == block.length);
+ return total_processed;
+}
+
+template <typename T>
+bool RleDecoder::NextCounts() {
+ // Read the next run's indicator int, it could be a literal or repeated run.
+ // The int is encoded as a vlq-encoded value.
+ uint32_t indicator_value = 0;
+ if (!bit_reader_.GetVlqInt(&indicator_value)) return false;
+
+ // lsb indicates if it is a literal run or repeated run
+ bool is_literal = indicator_value & 1;
+ uint32_t count = indicator_value >> 1;
+ if (is_literal) {
+ if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX) / 8)) {
+ return false;
+ }
+ literal_count_ = count * 8;
+ } else {
+ if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX))) {
+ return false;
+ }
+ repeat_count_ = count;
+ T value = {};
+ if (!bit_reader_.GetAligned<T>(static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)),
+ &value)) {
+ return false;
+ }
+ current_value_ = static_cast<uint64_t>(value);
+ }
+ return true;
+}
+
+/// This function buffers input values 8 at a time. After seeing all 8 values,
+/// it decides whether they should be encoded as a literal or repeated run.
+inline bool RleEncoder::Put(uint64_t value) {
+ DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_));
+ if (ARROW_PREDICT_FALSE(buffer_full_)) return false;
+
+ if (ARROW_PREDICT_TRUE(current_value_ == value)) {
+ ++repeat_count_;
+ if (repeat_count_ > 8) {
+ // This is just a continuation of the current run, no need to buffer the
+ // values.
+ // Note that this is the fast path for long repeated runs.
+ return true;
+ }
+ } else {
+ if (repeat_count_ >= 8) {
+ // We had a run that was long enough but it has ended. Flush the
+ // current repeated run.
+ DCHECK_EQ(literal_count_, 0);
+ FlushRepeatedRun();
+ }
+ repeat_count_ = 1;
+ current_value_ = value;
+ }
+
+ buffered_values_[num_buffered_values_] = value;
+ if (++num_buffered_values_ == 8) {
+ DCHECK_EQ(literal_count_ % 8, 0);
+ FlushBufferedValues(false);
+ }
+ return true;
+}
+
+inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) {
+ if (literal_indicator_byte_ == NULL) {
+ // The literal indicator byte has not been reserved yet, get one now.
+ literal_indicator_byte_ = bit_writer_.GetNextBytePtr();
+ DCHECK(literal_indicator_byte_ != NULL);
+ }
+
+ // Write all the buffered values as bit packed literals
+ for (int i = 0; i < num_buffered_values_; ++i) {
+ bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_);
+ DCHECK(success) << "There is a bug in using CheckBufferFull()";
+ }
+ num_buffered_values_ = 0;
+
+ if (update_indicator_byte) {
+ // At this point we need to write the indicator byte for the literal run.
+ // We only reserve one byte, to allow for streaming writes of literal values.
+ // The logic makes sure we flush literal runs often enough to not overrun
+ // the 1 byte.
+ DCHECK_EQ(literal_count_ % 8, 0);
+ int num_groups = literal_count_ / 8;
+ int32_t indicator_value = (num_groups << 1) | 1;
+ DCHECK_EQ(indicator_value & 0xFFFFFF00, 0);
+ *literal_indicator_byte_ = static_cast<uint8_t>(indicator_value);
+    literal_indicator_byte_ = nullptr;
+ literal_count_ = 0;
+ CheckBufferFull();
+ }
+}
+
+inline void RleEncoder::FlushRepeatedRun() {
+ DCHECK_GT(repeat_count_, 0);
+ bool result = true;
+ // The lsb of 0 indicates this is a repeated run
+ int32_t indicator_value = repeat_count_ << 1 | 0;
+ result &= bit_writer_.PutVlqInt(indicator_value);
+ result &= bit_writer_.PutAligned(current_value_,
+ static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)));
+ DCHECK(result);
+ num_buffered_values_ = 0;
+ repeat_count_ = 0;
+ CheckBufferFull();
+}
+
+/// Flush the values that have been buffered. At this point we decide whether
+/// we need to switch between the run types or continue the current one.
+inline void RleEncoder::FlushBufferedValues(bool done) {
+ if (repeat_count_ >= 8) {
+ // Clear the buffered values. They are part of the repeated run now and we
+ // don't want to flush them out as literals.
+ num_buffered_values_ = 0;
+ if (literal_count_ != 0) {
+ // There was a current literal run. All the values in it have been flushed
+ // but we still need to update the indicator byte.
+ DCHECK_EQ(literal_count_ % 8, 0);
+ DCHECK_EQ(repeat_count_, 8);
+ FlushLiteralRun(true);
+ }
+ DCHECK_EQ(literal_count_, 0);
+ return;
+ }
+
+ literal_count_ += num_buffered_values_;
+ DCHECK_EQ(literal_count_ % 8, 0);
+ int num_groups = literal_count_ / 8;
+ if (num_groups + 1 >= (1 << 6)) {
+ // We need to start a new literal run because the indicator byte we've reserved
+ // cannot store more values.
+    DCHECK(literal_indicator_byte_ != nullptr);
+ FlushLiteralRun(true);
+ } else {
+ FlushLiteralRun(done);
+ }
+ repeat_count_ = 0;
+}
+
+inline int RleEncoder::Flush() {
+ if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) {
+ bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ ||
+ num_buffered_values_ == 0);
+ // There is something pending, figure out if it's a repeated or literal run
+ if (repeat_count_ > 0 && all_repeat) {
+ FlushRepeatedRun();
+ } else {
+ DCHECK_EQ(literal_count_ % 8, 0);
+ // Buffer the last group of literals to 8 by padding with 0s.
+ for (; num_buffered_values_ != 0 && num_buffered_values_ < 8;
+ ++num_buffered_values_) {
+ buffered_values_[num_buffered_values_] = 0;
+ }
+ literal_count_ += num_buffered_values_;
+ FlushLiteralRun(true);
+ repeat_count_ = 0;
+ }
+ }
+ bit_writer_.Flush();
+ DCHECK_EQ(num_buffered_values_, 0);
+ DCHECK_EQ(literal_count_, 0);
+ DCHECK_EQ(repeat_count_, 0);
+
+ return bit_writer_.bytes_written();
+}
+
+inline void RleEncoder::CheckBufferFull() {
+ int bytes_written = bit_writer_.bytes_written();
+ if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) {
+ buffer_full_ = true;
+ }
+}
+
+inline void RleEncoder::Clear() {
+ buffer_full_ = false;
+ current_value_ = 0;
+ repeat_count_ = 0;
+ num_buffered_values_ = 0;
+ literal_count_ = 0;
+  literal_indicator_byte_ = nullptr;
+ bit_writer_.Clear();
+}
+
+} // namespace util
+} // namespace arrow
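For reference, a minimal round-trip sketch of the encoder and decoder implemented above. It assumes these classes are declared in arrow/util/rle_encoding.h with the upstream shapes RleEncoder(uint8_t* buffer, int buffer_len, int bit_width), RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width), and RleDecoder::Get(T*); treat those signatures as assumptions about this vendored copy, not guarantees.

#include <cstdint>
#include <iostream>
#include <vector>

#include "arrow/util/rle_encoding.h"

int main() {
  constexpr int kBitWidth = 3;  // all values below fit in 3 bits
  std::vector<uint8_t> buffer(64);
  arrow::util::RleEncoder encoder(buffer.data(), static_cast<int>(buffer.size()),
                                  kBitWidth);
  // 16 identical values exercise the repeated-run fast path in Put(); the 8
  // distinct values that follow are flushed as one bit-packed literal group.
  for (int i = 0; i < 16; ++i) encoder.Put(5);
  for (uint64_t v : {1, 2, 3, 4, 6, 7, 0, 2}) encoder.Put(v);
  const int encoded_len = encoder.Flush();

  arrow::util::RleDecoder decoder(buffer.data(), encoded_len, kBitWidth);
  uint64_t value;
  for (int i = 0; i < 24 && decoder.Get(&value); ++i) {
    std::cout << value << " ";  // prints 5 sixteen times, then 1 2 3 4 6 7 0 2
  }
}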
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/simd.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/simd.h
new file mode 100644
index 00000000000..259641dd456
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/simd.h
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef _MSC_VER
+// MSVC x86_64/arm64
+
+#if defined(_M_AMD64) || defined(_M_X64)
+#include <intrin.h>
+#elif defined(_M_ARM64)
+#include <arm64_neon.h>
+#endif
+
+#else
+// gcc/clang (possibly others)
+
+#if defined(ARROW_HAVE_BMI2)
+#include <x86intrin.h>
+#endif
+
+#if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_AVX512)
+#include <immintrin.h>
+#elif defined(ARROW_HAVE_SSE4_2)
+#include <nmmintrin.h>
+#endif
+
+#ifdef ARROW_HAVE_NEON
+#include <arm_neon.h>
+#endif
+
+#ifdef ARROW_HAVE_ARMV8_CRC
+#include <arm_acle.h>
+#endif
+
+#endif
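This header only funnels the platform intrinsics includes; the ARROW_HAVE_* macros are defined by the build system. A hedged sketch of the intended usage pattern, guarding a vectorized fast path behind the same macro with a scalar fallback (SumInt32 is an illustrative name, not an Arrow function):

#include <cstdint>

#include "arrow/util/simd.h"

// Sums n int32 values. For the sake of the sketch this assumes the partial
// sums fit in the 32-bit lanes before the final horizontal reduction.
int64_t SumInt32(const int32_t* data, int64_t n) {
  int64_t sum = 0;
  int64_t i = 0;
#if defined(ARROW_HAVE_AVX2)
  __m256i acc = _mm256_setzero_si256();
  for (; i + 8 <= n; i += 8) {
    const __m256i v =
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data + i));
    acc = _mm256_add_epi32(acc, v);
  }
  alignas(32) int32_t lanes[8];
  _mm256_store_si256(reinterpret_cast<__m256i*>(lanes), acc);
  for (int32_t lane : lanes) sum += lane;
#endif
  for (; i < n; ++i) sum += data[i];  // scalar tail (or the whole loop without AVX2)
  return sum;
}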
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/sort.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/sort.h
new file mode 100644
index 00000000000..cdffe0b2317
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/sort.h
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+namespace arrow {
+namespace internal {
+
+template <typename T, typename Cmp = std::less<T>>
+std::vector<int64_t> ArgSort(const std::vector<T>& values, Cmp&& cmp = {}) {
+ std::vector<int64_t> indices(values.size());
+ std::iota(indices.begin(), indices.end(), 0);
+ std::sort(indices.begin(), indices.end(),
+ [&](int64_t i, int64_t j) -> bool { return cmp(values[i], values[j]); });
+ return indices;
+}
+
+template <typename T>
+size_t Permute(const std::vector<int64_t>& indices, std::vector<T>* values) {
+ if (indices.size() <= 1) {
+ return indices.size();
+ }
+
+ // mask indicating which of values are in the correct location
+ std::vector<bool> sorted(indices.size(), false);
+
+ size_t cycle_count = 0;
+
+ for (auto cycle_start = sorted.begin(); cycle_start != sorted.end();
+ cycle_start = std::find(cycle_start, sorted.end(), false)) {
+ ++cycle_count;
+
+ // position in which an element belongs WRT sort
+ auto sort_into = static_cast<int64_t>(cycle_start - sorted.begin());
+
+ if (indices[sort_into] == sort_into) {
+ // trivial cycle
+ sorted[sort_into] = true;
+ continue;
+ }
+
+ // resolve this cycle
+ const auto end = sort_into;
+ for (int64_t take_from = indices[sort_into]; take_from != end;
+ take_from = indices[sort_into]) {
+ std::swap(values->at(sort_into), values->at(take_from));
+ sorted[sort_into] = true;
+ sort_into = take_from;
+ }
+ sorted[sort_into] = true;
+ }
+
+ return cycle_count;
+}
+
+} // namespace internal
+} // namespace arrow
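A short usage sketch of the two helpers above: ArgSort computes a gather order without touching its input, and Permute applies that order in place, returning the number of permutation cycles it resolved.

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

#include "arrow/util/sort.h"

int main() {
  std::vector<std::string> names{"cherry", "apple", "banana"};
  // indices == {1, 2, 0}: names[indices[k]] visits the values in sorted order.
  std::vector<int64_t> indices = arrow::internal::ArgSort(names);
  // Rearranges names in place so that names[k] == old names[indices[k]].
  size_t cycles = arrow::internal::Permute(indices, &names);
  assert((names == std::vector<std::string>{"apple", "banana", "cherry"}));
  assert(cycles == 1);  // one non-trivial 3-cycle
}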
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h
new file mode 100644
index 00000000000..8265e1d22ae
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/bit_run_reader.h"
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+/// \brief Compress a spaced buffer (one with slots for null entries) into a
+/// dense buffer, dropping the null entries.
+/// \param[in] src the source buffer
+/// \param[in] num_values the size of the source buffer, including null slots
+/// \param[in] valid_bits bitmap data indicating the positions of valid slots
+/// \param[in] valid_bits_offset offset into valid_bits
+/// \param[out] output the dense output buffer
+/// \return The number of valid values written to the output.
+template <typename T>
+inline int SpacedCompress(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, T* output) {
+ int num_valid_values = 0;
+
+ arrow::internal::SetBitRunReader reader(valid_bits, valid_bits_offset, num_values);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ std::memcpy(output + num_valid_values, src + run.position, run.length * sizeof(T));
+ num_valid_values += static_cast<int32_t>(run.length);
+ }
+
+ return num_valid_values;
+}
+
+/// \brief Relocate the values in a dense buffer into the positions of non-null
+/// values, as indicated by a validity bitmap.
+///
+/// \param[in, out] buffer the in-place buffer
+/// \param[in] num_values total size of the buffer, including null slots
+/// \param[in] null_count number of null slots
+/// \param[in] valid_bits bitmap data indicating the positions of valid slots
+/// \param[in] valid_bits_offset offset into valid_bits
+/// \return The number of values expanded, including nulls.
+template <typename T>
+inline int SpacedExpand(T* buffer, int num_values, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset) {
+ // Point to end as we add the spacing from the back.
+ int idx_decode = num_values - null_count;
+
+ // Depending on the number of nulls, some of the value slots in buffer may
+ // be uninitialized, and this will cause valgrind warnings / potentially UB
+ std::memset(static_cast<void*>(buffer + idx_decode), 0, null_count * sizeof(T));
+ if (idx_decode == 0) {
+ // All nulls, nothing more to do
+ return num_values;
+ }
+
+ arrow::internal::ReverseSetBitRunReader reader(valid_bits, valid_bits_offset,
+ num_values);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ idx_decode -= static_cast<int32_t>(run.length);
+ assert(idx_decode >= 0);
+ std::memmove(buffer + run.position, buffer + idx_decode, run.length * sizeof(T));
+ }
+
+  // If this fails, the caller passed an incorrect null_count
+ assert(idx_decode == 0);
+ return num_values;
+}
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
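A small sketch of both directions with four slots, of which slot 1 is null (bitmap 0b1101). Note that SpacedExpand only guarantees the contents of the valid slots; the null slot ends up with unspecified leftovers.

#include <cassert>
#include <cstdint>

#include "arrow/util/spaced.h"

int main() {
  const uint8_t valid_bits = 0x0D;          // 0b1101: slots 0, 2, 3 are valid
  const int32_t spaced[4] = {7, -1, 8, 9};  // -1 occupies the null slot
  int32_t dense[3];
  int n = arrow::util::internal::SpacedCompress<int32_t>(
      spaced, 4, &valid_bits, /*valid_bits_offset=*/0, dense);
  assert(n == 3 && dense[0] == 7 && dense[1] == 8 && dense[2] == 9);

  int32_t buffer[4] = {7, 8, 9, 0};  // dense values at the front
  arrow::util::internal::SpacedExpand<int32_t>(buffer, 4, /*null_count=*/1,
                                               &valid_bits, 0);
  // Valid slots 0, 2, 3 now hold 7, 8, 9; slot 1 is unspecified.
  assert(buffer[0] == 7 && buffer[2] == 8 && buffer[3] == 9);
}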
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc
new file mode 100644
index 00000000000..d922311df1c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc
@@ -0,0 +1,191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/string.h"
+
+#include <algorithm>
+#include <cctype>
+#include <memory>
+
+#include "arrow/status.h"
+
+namespace arrow {
+
+static const char* kAsciiTable = "0123456789ABCDEF";
+
+std::string HexEncode(const uint8_t* data, size_t length) {
+ std::string hex_string;
+ hex_string.reserve(length * 2);
+ for (size_t j = 0; j < length; ++j) {
+ // Convert to 2 base16 digits
+ hex_string.push_back(kAsciiTable[data[j] >> 4]);
+ hex_string.push_back(kAsciiTable[data[j] & 15]);
+ }
+ return hex_string;
+}
+
+std::string Escape(const char* data, size_t length) {
+ std::string escaped_string;
+ escaped_string.reserve(length);
+ for (size_t j = 0; j < length; ++j) {
+ switch (data[j]) {
+ case '"':
+ escaped_string += R"(\")";
+ break;
+ case '\\':
+ escaped_string += R"(\\)";
+ break;
+ case '\t':
+ escaped_string += R"(\t)";
+ break;
+ case '\r':
+ escaped_string += R"(\r)";
+ break;
+ case '\n':
+ escaped_string += R"(\n)";
+ break;
+ default:
+ escaped_string.push_back(data[j]);
+ }
+ }
+ return escaped_string;
+}
+
+std::string HexEncode(const char* data, size_t length) {
+ return HexEncode(reinterpret_cast<const uint8_t*>(data), length);
+}
+
+std::string HexEncode(util::string_view str) { return HexEncode(str.data(), str.size()); }
+
+std::string Escape(util::string_view str) { return Escape(str.data(), str.size()); }
+
+Status ParseHexValue(const char* data, uint8_t* out) {
+ char c1 = data[0];
+ char c2 = data[1];
+
+ const char* kAsciiTableEnd = kAsciiTable + 16;
+ const char* pos1 = std::lower_bound(kAsciiTable, kAsciiTableEnd, c1);
+ const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTableEnd, c2);
+
+ // Error checking
+ if (pos1 == kAsciiTableEnd || pos2 == kAsciiTableEnd || *pos1 != c1 || *pos2 != c2) {
+ return Status::Invalid("Encountered non-hex digit");
+ }
+
+ *out = static_cast<uint8_t>((pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable));
+ return Status::OK();
+}
+
+namespace internal {
+
+std::vector<util::string_view> SplitString(util::string_view v, char delimiter) {
+ std::vector<util::string_view> parts;
+ size_t start = 0, end;
+ while (true) {
+ end = v.find(delimiter, start);
+ parts.push_back(v.substr(start, end - start));
+ if (end == std::string::npos) {
+ break;
+ }
+ start = end + 1;
+ }
+ return parts;
+}
+
+template <typename StringLike>
+static std::string JoinStringLikes(const std::vector<StringLike>& strings,
+ util::string_view delimiter) {
+ if (strings.size() == 0) {
+ return "";
+ }
+ std::string out = std::string(strings.front());
+ for (size_t i = 1; i < strings.size(); ++i) {
+ out.append(delimiter.begin(), delimiter.end());
+ out.append(strings[i].begin(), strings[i].end());
+ }
+ return out;
+}
+
+std::string JoinStrings(const std::vector<util::string_view>& strings,
+ util::string_view delimiter) {
+ return JoinStringLikes(strings, delimiter);
+}
+
+std::string JoinStrings(const std::vector<std::string>& strings,
+ util::string_view delimiter) {
+ return JoinStringLikes(strings, delimiter);
+}
+
+static constexpr bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
+
+std::string TrimString(std::string value) {
+ size_t ltrim_chars = 0;
+ while (ltrim_chars < value.size() && IsWhitespace(value[ltrim_chars])) {
+ ++ltrim_chars;
+ }
+ value.erase(0, ltrim_chars);
+ size_t rtrim_chars = 0;
+ while (rtrim_chars < value.size() &&
+ IsWhitespace(value[value.size() - 1 - rtrim_chars])) {
+ ++rtrim_chars;
+ }
+ value.erase(value.size() - rtrim_chars, rtrim_chars);
+ return value;
+}
+
+bool AsciiEqualsCaseInsensitive(util::string_view left, util::string_view right) {
+ // TODO: ASCII validation
+ if (left.size() != right.size()) {
+ return false;
+ }
+ for (size_t i = 0; i < left.size(); ++i) {
+ if (std::tolower(static_cast<unsigned char>(left[i])) !=
+ std::tolower(static_cast<unsigned char>(right[i]))) {
+ return false;
+ }
+ }
+ return true;
+}
+
+std::string AsciiToLower(util::string_view value) {
+ // TODO: ASCII validation
+ std::string result = std::string(value);
+ std::transform(result.begin(), result.end(), result.begin(),
+ [](unsigned char c) { return std::tolower(c); });
+ return result;
+}
+
+std::string AsciiToUpper(util::string_view value) {
+ // TODO: ASCII validation
+ std::string result = std::string(value);
+ std::transform(result.begin(), result.end(), result.begin(),
+ [](unsigned char c) { return std::toupper(c); });
+ return result;
+}
+
+util::optional<std::string> Replace(util::string_view s, util::string_view token,
+ util::string_view replacement) {
+ size_t token_start = s.find(token);
+ if (token_start == std::string::npos) {
+ return util::nullopt;
+ }
+ return s.substr(0, token_start).to_string() + replacement.to_string() +
+ s.substr(token_start + token.size()).to_string();
+}
+
+} // namespace internal
+} // namespace arrow
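A brief sketch exercising the helpers defined above; the expected values follow directly from the implementations in this hunk.

#include <cassert>
#include <cstdint>

#include "arrow/status.h"
#include "arrow/util/string.h"

int main() {
  const uint8_t data[] = {0x01, 0xAB};
  assert(arrow::HexEncode(data, 2) == "01AB");

  uint8_t byte = 0;
  assert(arrow::ParseHexValue("AB", &byte).ok() && byte == 0xAB);
  assert(!arrow::ParseHexValue("ZZ", &byte).ok());  // non-hex digit

  // Empty fields between consecutive delimiters are preserved.
  auto parts = arrow::internal::SplitString("a,b,,c", ',');
  assert(parts.size() == 4 && parts[2].empty());
  assert(arrow::internal::JoinStrings(parts, "|") == "a|b||c");
}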
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h
new file mode 100644
index 00000000000..68b8a54e313
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "arrow/util/optional.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Status;
+
+ARROW_EXPORT std::string HexEncode(const uint8_t* data, size_t length);
+
+ARROW_EXPORT std::string Escape(const char* data, size_t length);
+
+ARROW_EXPORT std::string HexEncode(const char* data, size_t length);
+
+ARROW_EXPORT std::string HexEncode(util::string_view str);
+
+ARROW_EXPORT std::string Escape(util::string_view str);
+
+ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out);
+
+namespace internal {
+
+/// \brief Split a string with a delimiter
+ARROW_EXPORT
+std::vector<util::string_view> SplitString(util::string_view v, char delim);
+
+/// \brief Join strings with a delimiter
+ARROW_EXPORT
+std::string JoinStrings(const std::vector<util::string_view>& strings,
+ util::string_view delimiter);
+
+/// \brief Join strings with a delimiter
+ARROW_EXPORT
+std::string JoinStrings(const std::vector<std::string>& strings,
+ util::string_view delimiter);
+
+/// \brief Trim whitespace from left and right sides of string
+ARROW_EXPORT
+std::string TrimString(std::string value);
+
+ARROW_EXPORT
+bool AsciiEqualsCaseInsensitive(util::string_view left, util::string_view right);
+
+ARROW_EXPORT
+std::string AsciiToLower(util::string_view value);
+
+ARROW_EXPORT
+std::string AsciiToUpper(util::string_view value);
+
+/// \brief Search for the first instance of a token and replace it or return nullopt if
+/// the token is not found.
+ARROW_EXPORT
+util::optional<std::string> Replace(util::string_view s, util::string_view token,
+ util::string_view replacement);
+
+} // namespace internal
+} // namespace arrow
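And a sketch of the remaining helpers declared here; note that Replace substitutes only the first occurrence of the token and returns nullopt when it is absent.

#include <cassert>

#include "arrow/util/string.h"

int main() {
  using namespace arrow::internal;
  assert(TrimString("  hello\t ") == "hello");
  assert(AsciiToUpper("arrow") == "ARROW");
  assert(AsciiEqualsCaseInsensitive("Arrow", "aRROW"));

  // Only the first '.' is replaced.
  auto replaced = Replace("a.b.c", ".", "::");
  assert(replaced.has_value() && *replaced == "a::b.c");
  assert(!Replace("abc", ".", "::").has_value());
}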
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/string_builder.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/string_builder.cc
new file mode 100644
index 00000000000..625ae007534
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/string_builder.cc
@@ -0,0 +1,40 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/string_builder.h"
+
+#include <sstream>
+
+#include "arrow/util/make_unique.h"
+
+namespace arrow {
+
+using internal::make_unique;
+
+namespace util {
+namespace detail {
+
+StringStreamWrapper::StringStreamWrapper()
+ : sstream_(make_unique<std::ostringstream>()), ostream_(*sstream_) {}
+
+StringStreamWrapper::~StringStreamWrapper() {}
+
+std::string StringStreamWrapper::str() { return sstream_->str(); }
+
+} // namespace detail
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/string_builder.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/string_builder.h
new file mode 100644
index 00000000000..7c05ccd51f7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/string_builder.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+namespace detail {
+
+class ARROW_EXPORT StringStreamWrapper {
+ public:
+ StringStreamWrapper();
+ ~StringStreamWrapper();
+
+ std::ostream& stream() { return ostream_; }
+ std::string str();
+
+ protected:
+ std::unique_ptr<std::ostringstream> sstream_;
+ std::ostream& ostream_;
+};
+
+} // namespace detail
+
+template <typename Head>
+void StringBuilderRecursive(std::ostream& stream, Head&& head) {
+ stream << head;
+}
+
+template <typename Head, typename... Tail>
+void StringBuilderRecursive(std::ostream& stream, Head&& head, Tail&&... tail) {
+ StringBuilderRecursive(stream, std::forward<Head>(head));
+ StringBuilderRecursive(stream, std::forward<Tail>(tail)...);
+}
+
+template <typename... Args>
+std::string StringBuilder(Args&&... args) {
+ detail::StringStreamWrapper ss;
+ StringBuilderRecursive(ss.stream(), std::forward<Args>(args)...);
+ return ss.str();
+}
+
+/// CRTP helper for declaring string representation. Defines operator<<
+template <typename T>
+class ToStringOstreamable {
+ public:
+ ~ToStringOstreamable() {
+ static_assert(
+ std::is_same<decltype(std::declval<const T>().ToString()), std::string>::value,
+ "ToStringOstreamable depends on the method T::ToString() const");
+ }
+
+ private:
+ const T& cast() const { return static_cast<const T&>(*this); }
+
+ friend inline std::ostream& operator<<(std::ostream& os, const ToStringOstreamable& t) {
+ return os << t.cast().ToString();
+ }
+};
+
+} // namespace util
+} // namespace arrow
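A usage sketch: StringBuilder streams each argument into a single ostringstream, and ToStringOstreamable derives operator<< from a ToString() method via CRTP. The Point type is illustrative, not part of Arrow.

#include <iostream>
#include <string>

#include "arrow/util/string_builder.h"

struct Point : arrow::util::ToStringOstreamable<Point> {
  Point(int x, int y) : x_(x), y_(y) {}
  // The CRTP base's static_assert requires exactly this signature.
  std::string ToString() const {
    return arrow::util::StringBuilder("(", x_, ", ", y_, ")");
  }
  int x_, y_;
};

int main() {
  std::cout << arrow::util::StringBuilder("answer=", 42, '!') << "\n";  // answer=42!
  std::cout << Point(3, 4) << "\n";  // (3, 4), via the derived operator<<
}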
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/string_view.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/string_view.h
new file mode 100644
index 00000000000..4a51c2ebd9e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/string_view.h
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#define nssv_CONFIG_SELECT_STRING_VIEW nssv_STRING_VIEW_NONSTD
+
+#include <cstdint>
+#include <string>
+
+#include "arrow/vendored/string_view.hpp" // IWYU pragma: export
+
+namespace arrow {
+namespace util {
+
+using nonstd::string_view;
+
+template <class Char, class Traits = std::char_traits<Char>>
+using basic_string_view = nonstd::basic_string_view<Char, Traits>;
+
+using bytes_view = basic_string_view<uint8_t>;
+
+} // namespace util
+} // namespace arrow
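A two-assert sketch of the aliases: they behave like C++17 std::string_view, with bytes_view viewing raw bytes through the same vendored implementation.

#include <cassert>
#include <cstdint>

#include "arrow/util/string_view.h"

int main() {
  arrow::util::string_view sv = "hello world";
  assert(sv.substr(0, 5) == "hello");

  const uint8_t bytes[] = {1, 2, 3};
  arrow::util::bytes_view bv(bytes, 3);  // non-owning view over the array
  assert(bv.size() == 3 && bv[0] == 1);
}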
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc
new file mode 100644
index 00000000000..7e8ab64b703
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc
@@ -0,0 +1,224 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/task_group.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <cstdint>
+#include <mutex>
+#include <utility>
+
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/thread_pool.h"
+
+namespace arrow {
+namespace internal {
+
+namespace {
+
+////////////////////////////////////////////////////////////////////////
+// Serial TaskGroup implementation
+
+class SerialTaskGroup : public TaskGroup {
+ public:
+ explicit SerialTaskGroup(StopToken stop_token) : stop_token_(std::move(stop_token)) {}
+
+ void AppendReal(FnOnce<Status()> task) override {
+ DCHECK(!finished_);
+ if (stop_token_.IsStopRequested()) {
+ status_ &= stop_token_.Poll();
+ return;
+ }
+ if (status_.ok()) {
+ status_ &= std::move(task)();
+ }
+ }
+
+ Status current_status() override { return status_; }
+
+ bool ok() const override { return status_.ok(); }
+
+ Status Finish() override {
+ if (!finished_) {
+ finished_ = true;
+ }
+ return status_;
+ }
+
+ Future<> FinishAsync() override { return Future<>::MakeFinished(Finish()); }
+
+ int parallelism() override { return 1; }
+
+ StopToken stop_token_;
+ Status status_;
+ bool finished_ = false;
+};
+
+////////////////////////////////////////////////////////////////////////
+// Threaded TaskGroup implementation
+
+class ThreadedTaskGroup : public TaskGroup {
+ public:
+ ThreadedTaskGroup(Executor* executor, StopToken stop_token)
+ : executor_(executor),
+ stop_token_(std::move(stop_token)),
+ nremaining_(0),
+ ok_(true) {}
+
+ ~ThreadedTaskGroup() override {
+ // Make sure all pending tasks are finished, so that dangling references
+ // to this don't persist.
+ ARROW_UNUSED(Finish());
+ }
+
+ void AppendReal(FnOnce<Status()> task) override {
+ DCHECK(!finished_);
+ if (stop_token_.IsStopRequested()) {
+ UpdateStatus(stop_token_.Poll());
+ return;
+ }
+
+ // The hot path is unlocked thanks to atomics
+ // Only if an error occurs is the lock taken
+ if (ok_.load(std::memory_order_acquire)) {
+ nremaining_.fetch_add(1, std::memory_order_acquire);
+
+ auto self = checked_pointer_cast<ThreadedTaskGroup>(shared_from_this());
+
+ struct Callable {
+ void operator()() {
+ if (self_->ok_.load(std::memory_order_acquire)) {
+ Status st;
+ if (stop_token_.IsStopRequested()) {
+ st = stop_token_.Poll();
+ } else {
+ // XXX what about exceptions?
+ st = std::move(task_)();
+ }
+ self_->UpdateStatus(std::move(st));
+ }
+ self_->OneTaskDone();
+ }
+
+ std::shared_ptr<ThreadedTaskGroup> self_;
+ FnOnce<Status()> task_;
+ StopToken stop_token_;
+ };
+
+ Status st =
+ executor_->Spawn(Callable{std::move(self), std::move(task), stop_token_});
+ UpdateStatus(std::move(st));
+ }
+ }
+
+ Status current_status() override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ return status_;
+ }
+
+ bool ok() const override { return ok_.load(); }
+
+ Status Finish() override {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (!finished_) {
+ cv_.wait(lock, [&]() { return nremaining_.load() == 0; });
+ // Current tasks may start other tasks, so only set this when done
+ finished_ = true;
+ }
+ return status_;
+ }
+
+ Future<> FinishAsync() override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (!completion_future_.has_value()) {
+ if (nremaining_.load() == 0) {
+ completion_future_ = Future<>::MakeFinished(status_);
+ } else {
+ completion_future_ = Future<>::Make();
+ }
+ }
+ return *completion_future_;
+ }
+
+ int parallelism() override { return executor_->GetCapacity(); }
+
+ protected:
+ void UpdateStatus(Status&& st) {
+ // Must be called unlocked, only locks on error
+ if (ARROW_PREDICT_FALSE(!st.ok())) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ ok_.store(false, std::memory_order_release);
+ status_ &= std::move(st);
+ }
+ }
+
+ void OneTaskDone() {
+ // Can be called unlocked thanks to atomics
+ auto nremaining = nremaining_.fetch_sub(1, std::memory_order_release) - 1;
+ DCHECK_GE(nremaining, 0);
+ if (nremaining == 0) {
+ // Take the lock so that ~ThreadedTaskGroup cannot destroy cv
+ // before cv.notify_one() has returned
+ std::unique_lock<std::mutex> lock(mutex_);
+ cv_.notify_one();
+ if (completion_future_.has_value()) {
+ // MarkFinished could be slow. We don't want to call it while we are holding
+ // the lock.
+ auto& future = *completion_future_;
+ const auto finished = completion_future_->is_finished();
+ const auto& status = status_;
+ // This will be redundant if the user calls Finish and not FinishAsync
+ if (!finished && !finished_) {
+ finished_ = true;
+ lock.unlock();
+ future.MarkFinished(status);
+ } else {
+ lock.unlock();
+ }
+ }
+ }
+ }
+
+ // These members are usable unlocked
+ Executor* executor_;
+ StopToken stop_token_;
+ std::atomic<int32_t> nremaining_;
+ std::atomic<bool> ok_;
+
+ // These members use locking
+ std::mutex mutex_;
+ std::condition_variable cv_;
+ Status status_;
+ bool finished_ = false;
+ util::optional<Future<>> completion_future_;
+};
+
+} // namespace
+
+std::shared_ptr<TaskGroup> TaskGroup::MakeSerial(StopToken stop_token) {
+ return std::shared_ptr<TaskGroup>(new SerialTaskGroup{stop_token});
+}
+
+std::shared_ptr<TaskGroup> TaskGroup::MakeThreaded(Executor* thread_pool,
+ StopToken stop_token) {
+ return std::shared_ptr<TaskGroup>(new ThreadedTaskGroup{thread_pool, stop_token});
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h
new file mode 100644
index 00000000000..3bb72f0d9cb
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief A group of related tasks
+///
+/// A TaskGroup executes tasks with the signature `Status()`.
+/// Execution can be serial or parallel, depending on the TaskGroup
+/// implementation. When Finish() returns, it is guaranteed that all
+/// tasks have finished, or at least one has errored.
+///
+/// Once an error has occurred, any tasks that are submitted to the task group
+/// will not run. The call to Append will simply return without scheduling the
+/// task.
+///
+/// If the task group is parallel, multiple tasks may be running at the same
+/// time when one of them fails. This puts the task group in a failure state
+/// (so additional tasks cannot be run), but it does not interrupt the tasks
+/// already running. Finish will not complete until all running tasks have
+/// finished, even if one task fails.
+///
+/// Once a task group has finished new tasks may not be added to it. If you need to start
+/// a new batch of work then you should create a new task group.
+class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this<TaskGroup> {
+ public:
+ /// Add a Status-returning function to execute. Execution order is
+ /// undefined. The function may be executed immediately or later.
+ template <typename Function>
+ void Append(Function&& func) {
+ return AppendReal(std::forward<Function>(func));
+ }
+
+ /// Wait for execution of all tasks (and subgroups) to be finished,
+ /// or for at least one task (or subgroup) to error out.
+ /// The returned Status propagates the error status of the first failing
+ /// task (or subgroup).
+ virtual Status Finish() = 0;
+
+ /// Returns a future that will complete the first time all tasks are finished.
+ /// This should be called only after all top level tasks
+ /// have been added to the task group.
+ ///
+  /// If you are using a TaskGroup asynchronously, there are a few considerations to
+  /// keep in mind. Tasks should not block on I/O (that defeats the purpose of using
+  /// futures), and they should not do nested locking, or you run the risk of tasks
+  /// getting stuck in the thread pool waiting for tasks that cannot be scheduled.
+ ///
+ /// Primarily this call is intended to help migrate existing work written with TaskGroup
+ /// in mind to using futures without having to do a complete conversion on the first
+ /// pass.
+ virtual Future<> FinishAsync() = 0;
+
+ /// The current aggregate error Status. Non-blocking, useful for stopping early.
+ virtual Status current_status() = 0;
+
+ /// Whether some tasks have already failed. Non-blocking, useful for stopping early.
+ virtual bool ok() const = 0;
+
+ /// How many tasks can typically be executed in parallel.
+ /// This is only a hint, useful for testing or debugging.
+ virtual int parallelism() = 0;
+
+ static std::shared_ptr<TaskGroup> MakeSerial(StopToken = StopToken::Unstoppable());
+ static std::shared_ptr<TaskGroup> MakeThreaded(internal::Executor*,
+ StopToken = StopToken::Unstoppable());
+
+ virtual ~TaskGroup() = default;
+
+ protected:
+ TaskGroup() = default;
+ ARROW_DISALLOW_COPY_AND_ASSIGN(TaskGroup);
+
+ virtual void AppendReal(FnOnce<Status()> task) = 0;
+};
+
+} // namespace internal
+} // namespace arrow
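A minimal sketch of the intended usage. It assumes ThreadPool::Make(int) from arrow/util/thread_pool.h returns an arrow::Result holding a shared_ptr to the pool, as upstream does; treat that as an assumption about this vendored copy.

#include <atomic>
#include <cassert>
#include <cstdint>

#include "arrow/status.h"
#include "arrow/util/task_group.h"
#include "arrow/util/thread_pool.h"

int main() {
  auto pool = *arrow::internal::ThreadPool::Make(/*threads=*/4);
  auto group = arrow::internal::TaskGroup::MakeThreaded(pool.get());

  std::atomic<int64_t> sum{0};
  for (int i = 1; i <= 100; ++i) {
    group->Append([i, &sum]() -> arrow::Status {
      sum += i;
      return arrow::Status::OK();
    });
  }
  // Finish() blocks until every appended task has run (or one has errored).
  assert(group->Finish().ok());
  assert(sum.load() == 5050);
}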
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc
new file mode 100644
index 00000000000..99b771ca0f2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc
@@ -0,0 +1,417 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/tdigest.h"
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <queue>
+#include <tuple>
+#include <vector>
+
+#include "arrow/status.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+namespace arrow {
+namespace internal {
+
+namespace {
+
+// a numerically stable lerp is unbelievably complex
+// but we are *approximating* the quantile, so let's keep it simple
+double Lerp(double a, double b, double t) { return a + t * (b - a); }
+
+// histogram bin
+struct Centroid {
+ double mean;
+ double weight; // # data points in this bin
+
+ // merge with another centroid
+ void Merge(const Centroid& centroid) {
+ weight += centroid.weight;
+ mean += (centroid.mean - mean) * centroid.weight / weight;
+ }
+};
+
+// scale function K0: linear function, as baseline
+struct ScalerK0 {
+ explicit ScalerK0(uint32_t delta) : delta_norm(delta / 2.0) {}
+
+ double K(double q) const { return delta_norm * q; }
+ double Q(double k) const { return k / delta_norm; }
+
+ const double delta_norm;
+};
+
+// scale function K1
+struct ScalerK1 {
+ explicit ScalerK1(uint32_t delta) : delta_norm(delta / (2.0 * M_PI)) {}
+
+ double K(double q) const { return delta_norm * std::asin(2 * q - 1); }
+ double Q(double k) const { return (std::sin(k / delta_norm) + 1) / 2; }
+
+ const double delta_norm;
+};
+
+// implements t-digest merging algorithm
+template <class T = ScalerK1>
+class TDigestMerger : private T {
+ public:
+ explicit TDigestMerger(uint32_t delta) : T(delta) { Reset(0, nullptr); }
+
+ void Reset(double total_weight, std::vector<Centroid>* tdigest) {
+ total_weight_ = total_weight;
+ tdigest_ = tdigest;
+ if (tdigest_) {
+ tdigest_->resize(0);
+ }
+ weight_so_far_ = 0;
+ weight_limit_ = -1; // trigger first centroid merge
+ }
+
+ // merge one centroid from a sorted centroid stream
+ void Add(const Centroid& centroid) {
+ auto& td = *tdigest_;
+ const double weight = weight_so_far_ + centroid.weight;
+ if (weight <= weight_limit_) {
+ td.back().Merge(centroid);
+ } else {
+ const double quantile = weight_so_far_ / total_weight_;
+ const double next_weight_limit = total_weight_ * this->Q(this->K(quantile) + 1);
+ // weight limit should be strictly increasing, until the last centroid
+ if (next_weight_limit <= weight_limit_) {
+ weight_limit_ = total_weight_;
+ } else {
+ weight_limit_ = next_weight_limit;
+ }
+ td.push_back(centroid); // should never exceed capacity and trigger reallocation
+ }
+ weight_so_far_ = weight;
+ }
+
+ // validate k-size of a tdigest
+ Status Validate(const std::vector<Centroid>& tdigest, double total_weight) const {
+ double q_prev = 0, k_prev = this->K(0);
+ for (size_t i = 0; i < tdigest.size(); ++i) {
+ const double q = q_prev + tdigest[i].weight / total_weight;
+ const double k = this->K(q);
+ if (tdigest[i].weight != 1 && (k - k_prev) > 1.001) {
+ return Status::Invalid("oversized centroid: ", k - k_prev);
+ }
+ k_prev = k;
+ q_prev = q;
+ }
+ return Status::OK();
+ }
+
+ private:
+ double total_weight_; // total weight of this tdigest
+ double weight_so_far_; // accumulated weight till current bin
+ double weight_limit_; // max accumulated weight to move to next bin
+ std::vector<Centroid>* tdigest_;
+};
+
+} // namespace
+
+class TDigest::TDigestImpl {
+ public:
+ explicit TDigestImpl(uint32_t delta)
+ : delta_(delta > 10 ? delta : 10), merger_(delta_) {
+ tdigests_[0].reserve(delta_);
+ tdigests_[1].reserve(delta_);
+ Reset();
+ }
+
+ void Reset() {
+ tdigests_[0].resize(0);
+ tdigests_[1].resize(0);
+ current_ = 0;
+ total_weight_ = 0;
+ min_ = std::numeric_limits<double>::max();
+ max_ = std::numeric_limits<double>::lowest();
+ merger_.Reset(0, nullptr);
+ }
+
+ Status Validate() const {
+ // check weight, centroid order
+ double total_weight = 0, prev_mean = std::numeric_limits<double>::lowest();
+ for (const auto& centroid : tdigests_[current_]) {
+ if (std::isnan(centroid.mean) || std::isnan(centroid.weight)) {
+ return Status::Invalid("NAN found in tdigest");
+ }
+ if (centroid.mean < prev_mean) {
+ return Status::Invalid("centroid mean decreases");
+ }
+ if (centroid.weight < 1) {
+ return Status::Invalid("invalid centroid weight");
+ }
+ prev_mean = centroid.mean;
+ total_weight += centroid.weight;
+ }
+ if (total_weight != total_weight_) {
+ return Status::Invalid("tdigest total weight mismatch");
+ }
+ // check if buffer expanded
+ if (tdigests_[0].capacity() > delta_ || tdigests_[1].capacity() > delta_) {
+ return Status::Invalid("oversized tdigest buffer");
+ }
+ // check k-size
+ return merger_.Validate(tdigests_[current_], total_weight_);
+ }
+
+ void Dump() const {
+ const auto& td = tdigests_[current_];
+ for (size_t i = 0; i < td.size(); ++i) {
+ std::cerr << i << ": mean = " << td[i].mean << ", weight = " << td[i].weight
+ << std::endl;
+ }
+ std::cerr << "min = " << min_ << ", max = " << max_ << std::endl;
+ }
+
+ // merge with other tdigests
+ void Merge(const std::vector<const TDigestImpl*>& tdigest_impls) {
+ // current and end iterator
+ using CentroidIter = std::vector<Centroid>::const_iterator;
+ using CentroidIterPair = std::pair<CentroidIter, CentroidIter>;
+ // use a min-heap to find next minimal centroid from all tdigests
+ auto centroid_gt = [](const CentroidIterPair& lhs, const CentroidIterPair& rhs) {
+ return lhs.first->mean > rhs.first->mean;
+ };
+ using CentroidQueue =
+ std::priority_queue<CentroidIterPair, std::vector<CentroidIterPair>,
+ decltype(centroid_gt)>;
+
+ // trivial dynamic memory allocated at runtime
+ std::vector<CentroidIterPair> queue_buffer;
+ queue_buffer.reserve(tdigest_impls.size() + 1);
+ CentroidQueue queue(std::move(centroid_gt), std::move(queue_buffer));
+
+ const auto& this_tdigest = tdigests_[current_];
+ if (this_tdigest.size() > 0) {
+ queue.emplace(this_tdigest.cbegin(), this_tdigest.cend());
+ }
+ for (const TDigestImpl* td : tdigest_impls) {
+ const auto& other_tdigest = td->tdigests_[td->current_];
+ if (other_tdigest.size() > 0) {
+ queue.emplace(other_tdigest.cbegin(), other_tdigest.cend());
+ total_weight_ += td->total_weight_;
+ min_ = std::min(min_, td->min_);
+ max_ = std::max(max_, td->max_);
+ }
+ }
+
+ merger_.Reset(total_weight_, &tdigests_[1 - current_]);
+ CentroidIter current_iter, end_iter;
+ // do k-way merge till one buffer left
+ while (queue.size() > 1) {
+ std::tie(current_iter, end_iter) = queue.top();
+ merger_.Add(*current_iter);
+ queue.pop();
+ if (++current_iter != end_iter) {
+ queue.emplace(current_iter, end_iter);
+ }
+ }
+ // merge last buffer
+ if (!queue.empty()) {
+ std::tie(current_iter, end_iter) = queue.top();
+ while (current_iter != end_iter) {
+ merger_.Add(*current_iter++);
+ }
+ }
+ merger_.Reset(0, nullptr);
+
+ current_ = 1 - current_;
+ }
+
+ // merge input data with current tdigest
+ void MergeInput(std::vector<double>& input) {
+ total_weight_ += input.size();
+
+ std::sort(input.begin(), input.end());
+ min_ = std::min(min_, input.front());
+ max_ = std::max(max_, input.back());
+
+ // pick next minimal centroid from input and tdigest, feed to merger
+ merger_.Reset(total_weight_, &tdigests_[1 - current_]);
+ const auto& td = tdigests_[current_];
+ uint32_t tdigest_index = 0, input_index = 0;
+ while (tdigest_index < td.size() && input_index < input.size()) {
+ if (td[tdigest_index].mean < input[input_index]) {
+ merger_.Add(td[tdigest_index++]);
+ } else {
+ merger_.Add(Centroid{input[input_index++], 1});
+ }
+ }
+ while (tdigest_index < td.size()) {
+ merger_.Add(td[tdigest_index++]);
+ }
+ while (input_index < input.size()) {
+ merger_.Add(Centroid{input[input_index++], 1});
+ }
+ merger_.Reset(0, nullptr);
+
+ input.resize(0);
+ current_ = 1 - current_;
+ }
+
+ double Quantile(double q) const {
+ const auto& td = tdigests_[current_];
+
+ if (q < 0 || q > 1 || td.size() == 0) {
+ return NAN;
+ }
+
+ const double index = q * total_weight_;
+ if (index <= 1) {
+ return min_;
+ } else if (index >= total_weight_ - 1) {
+ return max_;
+ }
+
+    // find the centroid that contains the index
+ uint32_t ci = 0;
+ double weight_sum = 0;
+ for (; ci < td.size(); ++ci) {
+ weight_sum += td[ci].weight;
+ if (index <= weight_sum) {
+ break;
+ }
+ }
+ DCHECK_LT(ci, td.size());
+
+ // deviation of index from the centroid center
+ double diff = index + td[ci].weight / 2 - weight_sum;
+
+  // the index happens to fall in a unit-weight centroid
+ if (td[ci].weight == 1 && std::abs(diff) < 0.5) {
+ return td[ci].mean;
+ }
+
+ // find adjacent centroids for interpolation
+ uint32_t ci_left = ci, ci_right = ci;
+ if (diff > 0) {
+ if (ci_right == td.size() - 1) {
+ // index larger than center of last bin
+ DCHECK_EQ(weight_sum, total_weight_);
+ const Centroid* c = &td[ci_right];
+ DCHECK_GE(c->weight, 2);
+ return Lerp(c->mean, max_, diff / (c->weight / 2));
+ }
+ ++ci_right;
+ } else {
+ if (ci_left == 0) {
+ // index smaller than center of first bin
+ const Centroid* c = &td[0];
+ DCHECK_GE(c->weight, 2);
+ return Lerp(min_, c->mean, index / (c->weight / 2));
+ }
+ --ci_left;
+ diff += td[ci_left].weight / 2 + td[ci_right].weight / 2;
+ }
+
+ // interpolate from adjacent centroids
+ diff /= (td[ci_left].weight / 2 + td[ci_right].weight / 2);
+ return Lerp(td[ci_left].mean, td[ci_right].mean, diff);
+ }
+
+ double Mean() const {
+ double sum = 0;
+ for (const auto& centroid : tdigests_[current_]) {
+ sum += centroid.mean * centroid.weight;
+ }
+ return total_weight_ == 0 ? NAN : sum / total_weight_;
+ }
+
+ double total_weight() const { return total_weight_; }
+
+ private:
+  // must be declared before merger_, see constructor initialization list
+ const uint32_t delta_;
+
+ TDigestMerger<> merger_;
+ double total_weight_;
+ double min_, max_;
+
+ // ping-pong buffer holds two tdigests, size = 2 * delta * sizeof(Centroid)
+ std::vector<Centroid> tdigests_[2];
+ // index of active tdigest buffer, 0 or 1
+ int current_;
+};
+
+TDigest::TDigest(uint32_t delta, uint32_t buffer_size) : impl_(new TDigestImpl(delta)) {
+ input_.reserve(buffer_size);
+ Reset();
+}
+
+TDigest::~TDigest() = default;
+TDigest::TDigest(TDigest&&) = default;
+TDigest& TDigest::operator=(TDigest&&) = default;
+
+void TDigest::Reset() {
+ input_.resize(0);
+ impl_->Reset();
+}
+
+Status TDigest::Validate() {
+ MergeInput();
+ return impl_->Validate();
+}
+
+void TDigest::Dump() {
+ MergeInput();
+ impl_->Dump();
+}
+
+void TDigest::Merge(std::vector<TDigest>* tdigests) {
+ MergeInput();
+
+ std::vector<const TDigestImpl*> tdigest_impls;
+ tdigest_impls.reserve(tdigests->size());
+ for (auto& td : *tdigests) {
+ td.MergeInput();
+ tdigest_impls.push_back(td.impl_.get());
+ }
+ impl_->Merge(tdigest_impls);
+}
+
+double TDigest::Quantile(double q) {
+ MergeInput();
+ return impl_->Quantile(q);
+}
+
+double TDigest::Mean() {
+ MergeInput();
+ return impl_->Mean();
+}
+
+bool TDigest::is_empty() const {
+ return input_.size() == 0 && impl_->total_weight() == 0;
+}
+
+void TDigest::MergeInput() {
+ if (input_.size() > 0) {
+ impl_->MergeInput(input_); // will mutate input_
+ }
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h
new file mode 100644
index 00000000000..ae42ce48e7d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// approximate quantiles from an arbitrary-length dataset with O(1) space,
+// based on 'Computing Extremely Accurate Quantiles Using t-Digests' by Dunning & Ertl
+// - https://arxiv.org/abs/1902.04023
+// - https://github.com/tdunning/t-digest
+
+#pragma once
+
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Status;
+
+namespace internal {
+
+class ARROW_EXPORT TDigest {
+ public:
+ explicit TDigest(uint32_t delta = 100, uint32_t buffer_size = 500);
+ ~TDigest();
+ TDigest(TDigest&&);
+ TDigest& operator=(TDigest&&);
+
+ // reset and re-use this tdigest
+ void Reset();
+
+ // validate data integrity
+ Status Validate();
+
+ // dump internal data, only for debug
+ void Dump();
+
+  // buffer a single data point, consuming the internal buffer if it is full
+  // this function is called intensively and is performance-critical
+  // call it only if you are sure no NAN exists in the input data
+ void Add(double value) {
+ DCHECK(!std::isnan(value)) << "cannot add NAN";
+ if (ARROW_PREDICT_FALSE(input_.size() == input_.capacity())) {
+ MergeInput();
+ }
+ input_.push_back(value);
+ }
+
+ // skip NAN on adding
+ template <typename T>
+ typename std::enable_if<std::is_floating_point<T>::value>::type NanAdd(T value) {
+ if (!std::isnan(value)) Add(value);
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_integral<T>::value>::type NanAdd(T value) {
+ Add(static_cast<double>(value));
+ }
+
+ // merge with other t-digests, called infrequently
+ void Merge(std::vector<TDigest>* tdigests);
+
+ // calculate quantile
+ double Quantile(double q);
+
+ double Min() { return Quantile(0); }
+ double Max() { return Quantile(1); }
+ double Mean();
+
+ // check if this tdigest contains no valid data points
+ bool is_empty() const;
+
+ private:
+ // merge input data with current tdigest
+ void MergeInput();
+
+ // input buffer, size = buffer_size * sizeof(double)
+ std::vector<double> input_;
+
+ // hide other members with pimpl
+ class TDigestImpl;
+ std::unique_ptr<TDigestImpl> impl_;
+};
+
+} // namespace internal
+} // namespace arrow
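A usage sketch of the public API: Add buffers points cheaply, and the query methods fold the buffered inputs into the digest before answering.

#include <iostream>

#include "arrow/util/tdigest.h"

int main() {
  arrow::internal::TDigest td;  // delta = 100, buffer_size = 500
  for (int i = 1; i <= 100000; ++i) {
    td.Add(i * 0.001);  // 0.001 .. 100.0, uniformly spaced
  }
  // Quantile() merges the buffer into the digest, then interpolates between
  // adjacent centroids.
  std::cout << "median ~ " << td.Quantile(0.5) << "\n";   // close to 50.0
  std::cout << "p99    ~ " << td.Quantile(0.99) << "\n";  // close to 99.0
  std::cout << "mean   ~ " << td.Mean() << "\n";          // close to 50.0005
}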
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc
new file mode 100644
index 00000000000..758295d01ed
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc
@@ -0,0 +1,442 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/thread_pool.h"
+
+#include <algorithm>
+#include <condition_variable>
+#include <deque>
+#include <list>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace internal {
+
+Executor::~Executor() = default;
+
+namespace {
+
+struct Task {
+ FnOnce<void()> callable;
+ StopToken stop_token;
+ Executor::StopCallback stop_callback;
+};
+
+} // namespace
+
+struct SerialExecutor::State {
+ std::deque<Task> task_queue;
+ std::mutex mutex;
+ std::condition_variable wait_for_tasks;
+ bool finished{false};
+};
+
+SerialExecutor::SerialExecutor() : state_(std::make_shared<State>()) {}
+
+SerialExecutor::~SerialExecutor() = default;
+
+Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce<void()> task,
+ StopToken stop_token, StopCallback&& stop_callback) {
+ // While the SerialExecutor runs tasks synchronously on its main thread,
+ // SpawnReal may be called from external threads (e.g. when transferring back
+ // from blocking I/O threads), so we need to keep the state alive *and* to
+ // lock its contents.
+ //
+ // Note that holding the lock while notifying the condition variable may
+ // not be sufficient, as some exit paths in the main thread are unlocked.
+ auto state = state_;
+ {
+ std::lock_guard<std::mutex> lk(state->mutex);
+ state->task_queue.push_back(
+ Task{std::move(task), std::move(stop_token), std::move(stop_callback)});
+ }
+ state->wait_for_tasks.notify_one();
+ return Status::OK();
+}
+
+void SerialExecutor::MarkFinished() {
+ // Same comment as SpawnReal above
+ auto state = state_;
+ {
+ std::lock_guard<std::mutex> lk(state->mutex);
+ state->finished = true;
+ }
+ state->wait_for_tasks.notify_one();
+}
+
+void SerialExecutor::RunLoop() {
+ // This is called from the SerialExecutor's main thread, so the
+ // state is guaranteed to be kept alive.
+ std::unique_lock<std::mutex> lk(state_->mutex);
+
+ while (!state_->finished) {
+ while (!state_->task_queue.empty()) {
+ Task task = std::move(state_->task_queue.front());
+ state_->task_queue.pop_front();
+ lk.unlock();
+ if (!task.stop_token.IsStopRequested()) {
+ std::move(task.callable)();
+ } else {
+ if (task.stop_callback) {
+ std::move(task.stop_callback)(task.stop_token.Poll());
+ }
+ // Can't break here because there may be cleanup tasks down the chain we still
+ // need to run.
+ }
+ lk.lock();
+ }
+ // In this case we must be waiting on work from external (e.g. I/O) executors. Wait
+ // for tasks to arrive (typically via transferred futures).
+ state_->wait_for_tasks.wait(
+ lk, [&] { return state_->finished || !state_->task_queue.empty(); });
+ }
+}
+
+struct ThreadPool::State {
+ State() = default;
+
+ // NOTE: in case locking becomes too expensive, we can investigate lock-free FIFOs
+ // such as https://github.com/cameron314/concurrentqueue
+
+ std::mutex mutex_;
+ std::condition_variable cv_;
+ std::condition_variable cv_shutdown_;
+
+ std::list<std::thread> workers_;
+ // Trashcan for finished threads
+ std::vector<std::thread> finished_workers_;
+ std::deque<Task> pending_tasks_;
+
+ // Desired number of threads
+ int desired_capacity_ = 0;
+
+ // Total number of tasks that are either queued or running
+ int tasks_queued_or_running_ = 0;
+
+ // Are we shutting down?
+ bool please_shutdown_ = false;
+ bool quick_shutdown_ = false;
+};
+
+// The worker loop is an independent function so that it can keep running
+// after the ThreadPool is destroyed.
+static void WorkerLoop(std::shared_ptr<ThreadPool::State> state,
+ std::list<std::thread>::iterator it) {
+ std::unique_lock<std::mutex> lock(state->mutex_);
+
+ // Since we hold the lock, `it` now points to the correct thread object
+ // (LaunchWorkersUnlocked has exited)
+ DCHECK_EQ(std::this_thread::get_id(), it->get_id());
+
+ // If too many threads, we should secede from the pool
+ const auto should_secede = [&]() -> bool {
+ return state->workers_.size() > static_cast<size_t>(state->desired_capacity_);
+ };
+
+ while (true) {
+ // By the time this thread is started, some tasks may have been pushed
+ // or shutdown could even have been requested. So we only wait on the
+ // condition variable at the end of the loop.
+
+ // Execute pending tasks if any
+ while (!state->pending_tasks_.empty() && !state->quick_shutdown_) {
+ // We check this opportunistically at each loop iteration since
+ // it releases the lock below.
+ if (should_secede()) {
+ break;
+ }
+
+ DCHECK_GE(state->tasks_queued_or_running_, 0);
+ {
+ Task task = std::move(state->pending_tasks_.front());
+ state->pending_tasks_.pop_front();
+ StopToken* stop_token = &task.stop_token;
+ lock.unlock();
+ if (!stop_token->IsStopRequested()) {
+ std::move(task.callable)();
+ } else {
+ if (task.stop_callback) {
+ std::move(task.stop_callback)(stop_token->Poll());
+ }
+ }
+ ARROW_UNUSED(std::move(task)); // release resources before waiting for lock
+ lock.lock();
+ }
+ state->tasks_queued_or_running_--;
+ }
+ // Now either the queue is empty *or* a quick shutdown was requested
+ if (state->please_shutdown_ || should_secede()) {
+ break;
+ }
+ // Wait for next wakeup
+ state->cv_.wait(lock);
+ }
+ DCHECK_GE(state->tasks_queued_or_running_, 0);
+
+ // We're done. Move our thread object to the trashcan of finished
+ // workers. This has two motivations:
+ // 1) the thread object doesn't get destroyed before this function finishes
+ // (but we could call thread::detach() instead)
+ // 2) we can explicitly join() the trashcan threads to make sure all OS threads
+ // are exited before the ThreadPool is destroyed. Otherwise subtle
+ // timing conditions can lead to false positives with Valgrind.
+ DCHECK_EQ(std::this_thread::get_id(), it->get_id());
+ state->finished_workers_.push_back(std::move(*it));
+ state->workers_.erase(it);
+ if (state->please_shutdown_) {
+ // Notify the function waiting in Shutdown().
+ state->cv_shutdown_.notify_one();
+ }
+}
+
+ThreadPool::ThreadPool()
+ : sp_state_(std::make_shared<ThreadPool::State>()),
+ state_(sp_state_.get()),
+ shutdown_on_destroy_(true) {
+#ifndef _WIN32
+ pid_ = getpid();
+#endif
+}
+
+ThreadPool::~ThreadPool() {
+ if (shutdown_on_destroy_) {
+ ARROW_UNUSED(Shutdown(false /* wait */));
+ }
+}
+
+void ThreadPool::ProtectAgainstFork() {
+#ifndef _WIN32
+ pid_t current_pid = getpid();
+ if (pid_ != current_pid) {
+ // Reinitialize internal state in child process after fork()
+    // Ideally we would use pthread_atfork(), but that doesn't allow
+ // storing an argument, hence we'd need to maintain a list of all
+ // existing ThreadPools.
+ int capacity = state_->desired_capacity_;
+
+ auto new_state = std::make_shared<ThreadPool::State>();
+ new_state->please_shutdown_ = state_->please_shutdown_;
+ new_state->quick_shutdown_ = state_->quick_shutdown_;
+
+ pid_ = current_pid;
+ sp_state_ = new_state;
+ state_ = sp_state_.get();
+
+ // Launch worker threads anew
+ if (!state_->please_shutdown_) {
+ ARROW_UNUSED(SetCapacity(capacity));
+ }
+ }
+#endif
+}
+
+Status ThreadPool::SetCapacity(int threads) {
+ ProtectAgainstFork();
+ std::unique_lock<std::mutex> lock(state_->mutex_);
+ if (state_->please_shutdown_) {
+ return Status::Invalid("operation forbidden during or after shutdown");
+ }
+ if (threads <= 0) {
+ return Status::Invalid("ThreadPool capacity must be > 0");
+ }
+ CollectFinishedWorkersUnlocked();
+
+ state_->desired_capacity_ = threads;
+ // See if we need to increase or decrease the number of running threads
+ const int required = std::min(static_cast<int>(state_->pending_tasks_.size()),
+ threads - static_cast<int>(state_->workers_.size()));
+ if (required > 0) {
+ // Some tasks are pending, spawn the number of needed threads immediately
+ LaunchWorkersUnlocked(required);
+ } else if (required < 0) {
+ // Excess threads are running, wake them so that they stop
+ state_->cv_.notify_all();
+ }
+ return Status::OK();
+}
+
+int ThreadPool::GetCapacity() {
+ ProtectAgainstFork();
+ std::unique_lock<std::mutex> lock(state_->mutex_);
+ return state_->desired_capacity_;
+}
+
+int ThreadPool::GetNumTasks() {
+ ProtectAgainstFork();
+ std::unique_lock<std::mutex> lock(state_->mutex_);
+ return state_->tasks_queued_or_running_;
+}
+
+int ThreadPool::GetActualCapacity() {
+ ProtectAgainstFork();
+ std::unique_lock<std::mutex> lock(state_->mutex_);
+ return static_cast<int>(state_->workers_.size());
+}
+
+Status ThreadPool::Shutdown(bool wait) {
+ ProtectAgainstFork();
+ std::unique_lock<std::mutex> lock(state_->mutex_);
+
+ if (state_->please_shutdown_) {
+ return Status::Invalid("Shutdown() already called");
+ }
+ state_->please_shutdown_ = true;
+ state_->quick_shutdown_ = !wait;
+ state_->cv_.notify_all();
+ state_->cv_shutdown_.wait(lock, [this] { return state_->workers_.empty(); });
+ if (!state_->quick_shutdown_) {
+ DCHECK_EQ(state_->pending_tasks_.size(), 0);
+ } else {
+ state_->pending_tasks_.clear();
+ }
+ CollectFinishedWorkersUnlocked();
+ return Status::OK();
+}
+
+void ThreadPool::CollectFinishedWorkersUnlocked() {
+ for (auto& thread : state_->finished_workers_) {
+ // Make sure OS thread has exited
+ thread.join();
+ }
+ state_->finished_workers_.clear();
+}
+
+thread_local ThreadPool* current_thread_pool_ = nullptr;
+
+bool ThreadPool::OwnsThisThread() { return current_thread_pool_ == this; }
+
+void ThreadPool::LaunchWorkersUnlocked(int threads) {
+ std::shared_ptr<State> state = sp_state_;
+
+ for (int i = 0; i < threads; i++) {
+ state_->workers_.emplace_back();
+ auto it = --(state_->workers_.end());
+ *it = std::thread([this, state, it] {
+ current_thread_pool_ = this;
+ WorkerLoop(state, it);
+ });
+ }
+}
+
+Status ThreadPool::SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken stop_token,
+ StopCallback&& stop_callback) {
+ {
+ ProtectAgainstFork();
+ std::lock_guard<std::mutex> lock(state_->mutex_);
+ if (state_->please_shutdown_) {
+ return Status::Invalid("operation forbidden during or after shutdown");
+ }
+ CollectFinishedWorkersUnlocked();
+ state_->tasks_queued_or_running_++;
+ if (static_cast<int>(state_->workers_.size()) < state_->tasks_queued_or_running_ &&
+ state_->desired_capacity_ > static_cast<int>(state_->workers_.size())) {
+      // We are below the desired capacity and short of workers: spin up a new worker
+ LaunchWorkersUnlocked(/*threads=*/1);
+ }
+ state_->pending_tasks_.push_back(
+ {std::move(task), std::move(stop_token), std::move(stop_callback)});
+ }
+ state_->cv_.notify_one();
+ return Status::OK();
+}
+
+Result<std::shared_ptr<ThreadPool>> ThreadPool::Make(int threads) {
+ auto pool = std::shared_ptr<ThreadPool>(new ThreadPool());
+ RETURN_NOT_OK(pool->SetCapacity(threads));
+ return pool;
+}
+
+Result<std::shared_ptr<ThreadPool>> ThreadPool::MakeEternal(int threads) {
+ ARROW_ASSIGN_OR_RAISE(auto pool, Make(threads));
+ // On Windows, the ThreadPool destructor may be called after non-main threads
+ // have been killed by the OS, and hang in a condition variable.
+ // On Unix, we want to avoid leak reports by Valgrind.
+#ifdef _WIN32
+ pool->shutdown_on_destroy_ = false;
+#endif
+ return pool;
+}
+
+// ----------------------------------------------------------------------
+// Global thread pool
+
+static int ParseOMPEnvVar(const char* name) {
+ // OMP_NUM_THREADS is a comma-separated list of positive integers.
+ // We are only interested in the first (top-level) number.
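+  // For example, "4,2,1" yields 4; an unset, empty or malformed variable
+  // yields 0, meaning "undetermined".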
+ auto result = GetEnvVar(name);
+ if (!result.ok()) {
+ return 0;
+ }
+ auto str = *std::move(result);
+ auto first_comma = str.find_first_of(',');
+ if (first_comma != std::string::npos) {
+ str = str.substr(0, first_comma);
+ }
+ try {
+ return std::max(0, std::stoi(str));
+ } catch (...) {
+ return 0;
+ }
+}
+
+int ThreadPool::DefaultCapacity() {
+ int capacity, limit;
+ capacity = ParseOMPEnvVar("OMP_NUM_THREADS");
+ if (capacity == 0) {
+ capacity = std::thread::hardware_concurrency();
+ }
+ limit = ParseOMPEnvVar("OMP_THREAD_LIMIT");
+ if (limit > 0) {
+ capacity = std::min(limit, capacity);
+ }
+ if (capacity == 0) {
+ ARROW_LOG(WARNING) << "Failed to determine the number of available threads, "
+ "using a hardcoded arbitrary value";
+ capacity = 4;
+ }
+ return capacity;
+}
+
+// Helper for the singleton pattern
+std::shared_ptr<ThreadPool> ThreadPool::MakeCpuThreadPool() {
+ auto maybe_pool = ThreadPool::MakeEternal(ThreadPool::DefaultCapacity());
+ if (!maybe_pool.ok()) {
+ maybe_pool.status().Abort("Failed to create global CPU thread pool");
+ }
+ return *std::move(maybe_pool);
+}
+
+ThreadPool* GetCpuThreadPool() {
+ static std::shared_ptr<ThreadPool> singleton = ThreadPool::MakeCpuThreadPool();
+ return singleton.get();
+}
+
+} // namespace internal
+
+int GetCpuThreadPoolCapacity() { return internal::GetCpuThreadPool()->GetCapacity(); }
+
+Status SetCpuThreadPoolCapacity(int threads) {
+ return internal::GetCpuThreadPool()->SetCapacity(threads);
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h
new file mode 100644
index 00000000000..9ac8e36a3d8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h
@@ -0,0 +1,398 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <cstdint>
+#include <memory>
+#include <queue>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/future.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+#if defined(_MSC_VER)
+// Disable harmless warning for decorated name length limit
+#pragma warning(disable : 4503)
+#endif
+
+namespace arrow {
+
+/// \brief Get the capacity of the global thread pool
+///
+/// Return the number of worker threads in the thread pool to which
+/// Arrow dispatches various CPU-bound tasks. This is an ideal number,
+/// not necessarily the exact number of threads at a given point in time.
+///
+/// You can change this number using SetCpuThreadPoolCapacity().
+ARROW_EXPORT int GetCpuThreadPoolCapacity();
+
+/// \brief Set the capacity of the global thread pool
+///
+/// Set the number of worker threads in the thread pool to which
+/// Arrow dispatches various CPU-bound tasks.
+///
+/// The current number is returned by GetCpuThreadPoolCapacity().
+ARROW_EXPORT Status SetCpuThreadPoolCapacity(int threads);
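+
+// A minimal usage sketch of the global pool knobs above:
+//
+//   ARROW_UNUSED(arrow::SetCpuThreadPoolCapacity(8));  // request 8 workers
+//   int capacity = arrow::GetCpuThreadPoolCapacity();  // now reports 8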
+
+namespace internal {
+
+// Hints about a task that may be used by an Executor.
+// They are ignored by the provided ThreadPool implementation.
+struct TaskHints {
+ // The lower, the more urgent
+ int32_t priority = 0;
+ // The IO transfer size in bytes
+ int64_t io_size = -1;
+ // The approximate CPU cost in number of instructions
+ int64_t cpu_cost = -1;
+ // An application-specific ID
+ int64_t external_id = -1;
+};
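+
+// For illustration only (the shipped ThreadPool ignores hints), a caller could
+// pass hints to Executor::Spawn like so:
+//
+//   TaskHints hints;
+//   hints.priority = -1;  // lower values are more urgent
+//   ARROW_UNUSED(executor->Spawn(hints, [] { /* work */ }));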
+
+class ARROW_EXPORT Executor {
+ public:
+ using StopCallback = internal::FnOnce<void(const Status&)>;
+
+ virtual ~Executor();
+
+ // Spawn a fire-and-forget task.
+ template <typename Function>
+ Status Spawn(Function&& func) {
+ return SpawnReal(TaskHints{}, std::forward<Function>(func), StopToken::Unstoppable(),
+ StopCallback{});
+ }
+ template <typename Function>
+ Status Spawn(Function&& func, StopToken stop_token) {
+ return SpawnReal(TaskHints{}, std::forward<Function>(func), std::move(stop_token),
+ StopCallback{});
+ }
+ template <typename Function>
+ Status Spawn(TaskHints hints, Function&& func) {
+ return SpawnReal(hints, std::forward<Function>(func), StopToken::Unstoppable(),
+ StopCallback{});
+ }
+ template <typename Function>
+ Status Spawn(TaskHints hints, Function&& func, StopToken stop_token) {
+ return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
+ StopCallback{});
+ }
+ template <typename Function>
+ Status Spawn(TaskHints hints, Function&& func, StopToken stop_token,
+ StopCallback stop_callback) {
+ return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
+ std::move(stop_callback));
+ }
+
+ // Transfers a future to this executor. Any continuations added to the
+ // returned future will run in this executor. Otherwise they would run
+ // on the same thread that called MarkFinished.
+ //
+ // This is necessary when (for example) an I/O task is completing a future.
+  // The continuations of that future should run on the CPU thread pool, keeping
+  // CPU-heavy work off the I/O thread pool. So the I/O task should transfer
+ // the future to the CPU executor before returning.
+ //
+ // By default this method will only transfer if the future is not already completed. If
+ // the future is already completed then any callback would be run synchronously and so
+ // no transfer is typically necessary. However, in cases where you want to force a
+ // transfer (e.g. to help the scheduler break up units of work across multiple cores)
+  // you can override this behavior with TransferAlways() below.
+ template <typename T>
+ Future<T> Transfer(Future<T> future) {
+ return DoTransfer(std::move(future), false);
+ }
+
+ // Overload of Transfer which will always schedule callbacks on new threads even if the
+ // future is finished when the callback is added.
+ //
+  // This can be useful in cases where you want to ensure parallelism.
+ template <typename T>
+ Future<T> TransferAlways(Future<T> future) {
+ return DoTransfer(std::move(future), true);
+ }
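+
+  // For example (a sketch; `io_future` stands for any future produced on an
+  // I/O thread and is not part of this API):
+  //
+  //   Future<int64_t> cpu_future = GetCpuThreadPool()->Transfer(io_future);
+  //   // continuations added to cpu_future now run on CPU pool threads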
+
+ // Submit a callable and arguments for execution. Return a future that
+  // will yield the callable's result value once it has run.
+ // The callable's arguments are copied before execution.
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(TaskHints hints, StopToken stop_token, Function&& func,
+ Args&&... args) {
+ using ValueType = typename FutureType::ValueType;
+
+ auto future = FutureType::Make();
+ auto task = std::bind(::arrow::detail::ContinueFuture{}, future,
+ std::forward<Function>(func), std::forward<Args>(args)...);
+ struct {
+ WeakFuture<ValueType> weak_fut;
+
+ void operator()(const Status& st) {
+ auto fut = weak_fut.get();
+ if (fut.is_valid()) {
+ fut.MarkFinished(st);
+ }
+ }
+ } stop_callback{WeakFuture<ValueType>(future)};
+ ARROW_RETURN_NOT_OK(SpawnReal(hints, std::move(task), std::move(stop_token),
+ std::move(stop_callback)));
+
+ return future;
+ }
+
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(StopToken stop_token, Function&& func, Args&&... args) {
+ return Submit(TaskHints{}, stop_token, std::forward<Function>(func),
+ std::forward<Args>(args)...);
+ }
+
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(TaskHints hints, Function&& func, Args&&... args) {
+ return Submit(std::move(hints), StopToken::Unstoppable(),
+ std::forward<Function>(func), std::forward<Args>(args)...);
+ }
+
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(Function&& func, Args&&... args) {
+ return Submit(TaskHints{}, StopToken::Unstoppable(), std::forward<Function>(func),
+ std::forward<Args>(args)...);
+ }
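+
+  // A minimal usage sketch (error handling elided; `executor` is any Executor*):
+  //
+  //   ARROW_ASSIGN_OR_RAISE(Future<int> fut,
+  //                         executor->Submit([](int x) { return x + 1; }, 41));
+  //   // fut completes with 42 once a worker has run the task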
+
+ // Return the level of parallelism (the number of tasks that may be executed
+ // concurrently). This may be an approximate number.
+ virtual int GetCapacity() = 0;
+
+ // Return true if the thread from which this function is called is owned by this
+ // Executor. Returns false if this Executor does not support this property.
+ virtual bool OwnsThisThread() { return false; }
+
+ protected:
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Executor);
+
+ Executor() = default;
+
+ template <typename T, typename FT = Future<T>, typename FTSync = typename FT::SyncType>
+ Future<T> DoTransfer(Future<T> future, bool always_transfer = false) {
+ auto transferred = Future<T>::Make();
+ if (always_transfer) {
+ CallbackOptions callback_options = CallbackOptions::Defaults();
+ callback_options.should_schedule = ShouldSchedule::Always;
+ callback_options.executor = this;
+ auto sync_callback = [transferred](const FTSync& result) mutable {
+ transferred.MarkFinished(result);
+ };
+ future.AddCallback(sync_callback, callback_options);
+ return transferred;
+ }
+
+ // We could use AddCallback's ShouldSchedule::IfUnfinished but we can save a bit of
+ // work by doing the test here.
+ auto callback = [this, transferred](const FTSync& result) mutable {
+ auto spawn_status =
+ Spawn([transferred, result]() mutable { transferred.MarkFinished(result); });
+ if (!spawn_status.ok()) {
+ transferred.MarkFinished(spawn_status);
+ }
+ };
+ auto callback_factory = [&callback]() { return callback; };
+ if (future.TryAddCallback(callback_factory)) {
+ return transferred;
+ }
+ // If the future is already finished and we aren't going to force spawn a thread
+ // then we don't need to add another layer of callback and can return the original
+ // future
+ return future;
+ }
+
+ // Subclassing API
+ virtual Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+ StopCallback&&) = 0;
+};
+
+/// \brief An executor implementation that runs all tasks on a single thread using an
+/// event loop.
+///
+/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
+/// fine but if one task needs to wait for another task it must be expressed as an
+/// asynchronous continuation.
+class ARROW_EXPORT SerialExecutor : public Executor {
+ public:
+ template <typename T = ::arrow::internal::Empty>
+ using TopLevelTask = internal::FnOnce<Future<T>(Executor*)>;
+
+ ~SerialExecutor() override;
+
+  int GetCapacity() override { return 1; }
+ Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+ StopCallback&&) override;
+
+ /// \brief Runs the TopLevelTask and any scheduled tasks
+ ///
+ /// The TopLevelTask (or one of the tasks it schedules) must either return an invalid
+ /// status or call the finish signal. Failure to do this will result in a deadlock. For
+ /// this reason it is preferable (if possible) to use the helper methods (below)
+  /// RunSynchronously/RunSerially, which delegate the responsibility to a Future
+  /// producer's existing obligation to always mark a future finished (which may
+  /// someday be aided by ARROW-12207).
+ template <typename T = internal::Empty, typename FT = Future<T>,
+ typename FTSync = typename FT::SyncType>
+ static FTSync RunInSerialExecutor(TopLevelTask<T> initial_task) {
+ Future<T> fut = SerialExecutor().Run<T>(std::move(initial_task));
+ return FutureToSync(fut);
+ }
+
+ private:
+ SerialExecutor();
+
+ // State uses mutex
+ struct State;
+ std::shared_ptr<State> state_;
+
+ template <typename T, typename FTSync = typename Future<T>::SyncType>
+ Future<T> Run(TopLevelTask<T> initial_task) {
+ auto final_fut = std::move(initial_task)(this);
+ if (final_fut.is_finished()) {
+ return final_fut;
+ }
+ final_fut.AddCallback([this](const FTSync&) { MarkFinished(); });
+ RunLoop();
+ return final_fut;
+ }
+ void RunLoop();
+ void MarkFinished();
+};
+
+/// An Executor implementation spawning tasks in FIFO manner on a fixed-size
+/// pool of worker threads.
+///
+/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
+/// fine but if one task needs to wait for another task it must be expressed as an
+/// asynchronous continuation.
+class ARROW_EXPORT ThreadPool : public Executor {
+ public:
+ // Construct a thread pool with the given number of worker threads
+ static Result<std::shared_ptr<ThreadPool>> Make(int threads);
+
+ // Like Make(), but takes care that the returned ThreadPool is compatible
+ // with destruction late at process exit.
+ static Result<std::shared_ptr<ThreadPool>> MakeEternal(int threads);
+
+ // Destroy thread pool; the pool will first be shut down
+ ~ThreadPool() override;
+
+ // Return the desired number of worker threads.
+ // The actual number of workers may lag a bit before being adjusted to
+ // match this value.
+ int GetCapacity() override;
+
+ bool OwnsThisThread() override;
+
+ // Return the number of tasks either running or in the queue.
+ int GetNumTasks();
+
+ // Dynamically change the number of worker threads.
+ //
+ // This function always returns immediately.
+ // If fewer threads are running than this number, new threads are spawned
+ // on-demand when needed for task execution.
+ // If more threads are running than this number, excess threads are reaped
+ // as soon as possible.
+ Status SetCapacity(int threads);
+
+ // Heuristic for the default capacity of a thread pool for CPU-bound tasks.
+ // This is exposed as a static method to help with testing.
+ static int DefaultCapacity();
+
+ // Shutdown the pool. Once the pool starts shutting down, new tasks
+ // cannot be submitted anymore.
+ // If "wait" is true, shutdown waits for all pending tasks to be finished.
+ // If "wait" is false, workers are stopped as soon as currently executing
+ // tasks are finished.
+ Status Shutdown(bool wait = true);
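+
+  // For example (a sketch of the full lifecycle; error handling elided):
+  //
+  //   ARROW_ASSIGN_OR_RAISE(auto pool, ThreadPool::Make(4));
+  //   ARROW_UNUSED(pool->Spawn([] { /* work */ }));
+  //   ARROW_UNUSED(pool->Shutdown(/*wait=*/true));  // drain pending tasks first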
+
+ struct State;
+
+ protected:
+ FRIEND_TEST(TestThreadPool, SetCapacity);
+ FRIEND_TEST(TestGlobalThreadPool, Capacity);
+ friend ARROW_EXPORT ThreadPool* GetCpuThreadPool();
+
+ ThreadPool();
+
+ Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+ StopCallback&&) override;
+
+ // Collect finished worker threads, making sure the OS threads have exited
+ void CollectFinishedWorkersUnlocked();
+ // Launch a given number of additional workers
+ void LaunchWorkersUnlocked(int threads);
+ // Get the current actual capacity
+ int GetActualCapacity();
+ // Reinitialize the thread pool if the pid changed
+ void ProtectAgainstFork();
+
+ static std::shared_ptr<ThreadPool> MakeCpuThreadPool();
+
+ std::shared_ptr<State> sp_state_;
+ State* state_;
+ bool shutdown_on_destroy_;
+#ifndef _WIN32
+ pid_t pid_;
+#endif
+};
+
+// Return the process-global thread pool for CPU-bound tasks.
+ARROW_EXPORT ThreadPool* GetCpuThreadPool();
+
+/// \brief Potentially run an async operation serially (if use_threads is false)
+/// \see RunSerially
+///
+/// If `use_threads` is true, the global CPU executor is used.
+/// If `use_threads` is false, a temporary SerialExecutor is used.
+/// `get_future` is called (from this thread) with the chosen executor and must
+/// return a future that will eventually finish. This function returns once the
+/// future has finished.
+template <typename Fut, typename ValueType = typename Fut::ValueType>
+typename Fut::SyncType RunSynchronously(FnOnce<Fut(Executor*)> get_future,
+ bool use_threads) {
+ if (use_threads) {
+ auto fut = std::move(get_future)(GetCpuThreadPool());
+ return FutureToSync(fut);
+ } else {
+ return SerialExecutor::RunInSerialExecutor<ValueType>(std::move(get_future));
+ }
+}
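+
+// For example (a sketch; `ReadAsync` is a hypothetical producer returning
+// Future<int64_t>):
+//
+//   Result<int64_t> n = RunSynchronously<Future<int64_t>>(
+//       [](Executor* executor) { return ReadAsync(executor); },
+//       /*use_threads=*/false);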
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/time.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/time.cc
new file mode 100644
index 00000000000..c285f075099
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/time.cc
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/time.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace util {
+
+// TimestampType -> TimestampType
+static const std::pair<DivideOrMultiply, int64_t> kTimestampConversionTable[4][4] = {
+ // TimestampType::SECOND
+ {{MULTIPLY, 1}, {MULTIPLY, 1000}, {MULTIPLY, 1000000}, {MULTIPLY, 1000000000}},
+ // TimestampType::MILLI
+ {{DIVIDE, 1000}, {MULTIPLY, 1}, {MULTIPLY, 1000}, {MULTIPLY, 1000000}},
+ // TimestampType::MICRO
+ {{DIVIDE, 1000000}, {DIVIDE, 1000}, {MULTIPLY, 1}, {MULTIPLY, 1000}},
+ // TimestampType::NANO
+ {{DIVIDE, 1000000000}, {DIVIDE, 1000000}, {DIVIDE, 1000}, {MULTIPLY, 1}},
+};
+
+std::pair<DivideOrMultiply, int64_t> GetTimestampConversion(TimeUnit::type in_unit,
+ TimeUnit::type out_unit) {
+ return kTimestampConversionTable[static_cast<int>(in_unit)][static_cast<int>(out_unit)];
+}
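+
+// For example, GetTimestampConversion(TimeUnit::SECOND, TimeUnit::MILLI) yields
+// {MULTIPLY, 1000}, while the reverse direction yields {DIVIDE, 1000}.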
+
+Result<int64_t> ConvertTimestampValue(const std::shared_ptr<DataType>& in,
+ const std::shared_ptr<DataType>& out,
+ int64_t value) {
+ auto op_factor =
+ GetTimestampConversion(checked_cast<const TimestampType&>(*in).unit(),
+ checked_cast<const TimestampType&>(*out).unit());
+
+ auto op = op_factor.first;
+ auto factor = op_factor.second;
+ switch (op) {
+ case MULTIPLY:
+ return value * factor;
+ case DIVIDE:
+ return value / factor;
+ }
+
+ // unreachable...
+ return 0;
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/time.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/time.h
new file mode 100644
index 00000000000..80b41f63c58
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/time.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+#include <utility>
+
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace util {
+
+enum DivideOrMultiply {
+ MULTIPLY,
+ DIVIDE,
+};
+
+ARROW_EXPORT
+std::pair<DivideOrMultiply, int64_t> GetTimestampConversion(TimeUnit::type in_unit,
+ TimeUnit::type out_unit);
+
+// Converts a Timestamp value into another Timestamp value.
+//
+// This function takes care of properly transforming from one unit to another.
+//
+// \param[in] in the input type. Must be TimestampType.
+// \param[in] out the output type. Must be TimestampType.
+// \param[in] value the input value.
+//
+// \return The converted value, or an error.
+ARROW_EXPORT Result<int64_t> ConvertTimestampValue(const std::shared_ptr<DataType>& in,
+ const std::shared_ptr<DataType>& out,
+ int64_t value);
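+
+// A usage sketch (both arguments must be timestamp types, as noted above):
+//
+//   auto converted = ConvertTimestampValue(timestamp(TimeUnit::SECOND),
+//                                          timestamp(TimeUnit::MILLI), 5);
+//   // -> Result<int64_t> holding 5000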
+
+template <typename Visitor, typename... Args>
+decltype(std::declval<Visitor>()(std::chrono::seconds{}, std::declval<Args&&>()...))
+VisitDuration(TimeUnit::type unit, Visitor&& visitor, Args&&... args) {
+ switch (unit) {
+ default:
+ case TimeUnit::SECOND:
+ break;
+ case TimeUnit::MILLI:
+ return visitor(std::chrono::milliseconds{}, std::forward<Args>(args)...);
+ case TimeUnit::MICRO:
+ return visitor(std::chrono::microseconds{}, std::forward<Args>(args)...);
+ case TimeUnit::NANO:
+ return visitor(std::chrono::nanoseconds{}, std::forward<Args>(args)...);
+ }
+ return visitor(std::chrono::seconds{}, std::forward<Args>(args)...);
+}
+
+/// Convert a count of seconds to the corresponding count in a different TimeUnit
+struct CastSecondsToUnitImpl {
+ template <typename Duration>
+ int64_t operator()(Duration, int64_t seconds) {
+ auto duration = std::chrono::duration_cast<Duration>(std::chrono::seconds{seconds});
+ return static_cast<int64_t>(duration.count());
+ }
+};
+
+inline int64_t CastSecondsToUnit(TimeUnit::type unit, int64_t seconds) {
+ return VisitDuration(unit, CastSecondsToUnitImpl{}, seconds);
+}
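+
+// For example, CastSecondsToUnit(TimeUnit::MILLI, 2) returns 2000.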
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.cc
new file mode 100644
index 00000000000..7fa7f852eb4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.cc
@@ -0,0 +1,211 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/trie.h"
+
+#include <iostream>
+#include <utility>
+
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace internal {
+
+Status Trie::Validate() const {
+ const auto n_nodes = static_cast<fast_index_type>(nodes_.size());
+ if (size_ > n_nodes) {
+ return Status::Invalid("Number of entries larger than number of nodes");
+ }
+ for (const auto& node : nodes_) {
+ if (node.found_index_ >= size_) {
+ return Status::Invalid("Found index >= size");
+ }
+ if (node.child_lookup_ != -1 &&
+ node.child_lookup_ * 256 >
+ static_cast<fast_index_type>(lookup_table_.size() - 256)) {
+ return Status::Invalid("Child lookup base doesn't point to 256 valid indices");
+ }
+ }
+ for (const auto index : lookup_table_) {
+ if (index >= n_nodes) {
+ return Status::Invalid("Child lookup index out of bounds");
+ }
+ }
+ return Status::OK();
+}
+
+void Trie::Dump(const Node* node, const std::string& indent) const {
+ std::cerr << "[\"" << node->substring_ << "\"]";
+ if (node->found_index_ >= 0) {
+ std::cerr << " *";
+ }
+ std::cerr << "\n";
+ if (node->child_lookup_ >= 0) {
+ auto child_indent = indent + " ";
+ std::cerr << child_indent << "|\n";
+ for (fast_index_type i = 0; i < 256; ++i) {
+ auto child_index = lookup_table_[node->child_lookup_ * 256 + i];
+ if (child_index >= 0) {
+ const Node* child = &nodes_[child_index];
+ std::cerr << child_indent << "|-> '" << static_cast<char>(i) << "' (" << i
+ << ") -> ";
+ Dump(child, child_indent);
+ }
+ }
+ }
+}
+
+void Trie::Dump() const { Dump(&nodes_[0], ""); }
+
+TrieBuilder::TrieBuilder() { trie_.nodes_.push_back(Trie::Node{-1, -1, ""}); }
+
+Status TrieBuilder::AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node) {
+ if (parent->child_lookup_ == -1) {
+ RETURN_NOT_OK(ExtendLookupTable(&parent->child_lookup_));
+ }
+ auto parent_lookup = parent->child_lookup_ * 256 + ch;
+
+ DCHECK_EQ(trie_.lookup_table_[parent_lookup], -1);
+ if (trie_.nodes_.size() >= static_cast<size_t>(kMaxIndex)) {
+ auto max_capacity = kMaxIndex;
+ return Status::CapacityError("TrieBuilder cannot contain more than ", max_capacity,
+ " child nodes");
+ }
+ trie_.nodes_.push_back(std::move(node));
+ trie_.lookup_table_[parent_lookup] = static_cast<index_type>(trie_.nodes_.size() - 1);
+ return Status::OK();
+}
+
+Status TrieBuilder::CreateChildNode(Trie::Node* parent, uint8_t ch,
+ util::string_view substring) {
+ const auto kMaxSubstringLength = Trie::kMaxSubstringLength;
+
+ while (substring.length() > kMaxSubstringLength) {
+ // Substring doesn't fit in node => create intermediate node
+ auto mid_node = Trie::Node{-1, -1, substring.substr(0, kMaxSubstringLength)};
+ RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(mid_node)));
+ // Recurse
+ parent = &trie_.nodes_.back();
+ ch = static_cast<uint8_t>(substring[kMaxSubstringLength]);
+ substring = substring.substr(kMaxSubstringLength + 1);
+ }
+
+ // Create final matching node
+ auto child_node = Trie::Node{trie_.size_, -1, substring};
+ RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(child_node)));
+ ++trie_.size_;
+ return Status::OK();
+}
+
+Status TrieBuilder::CreateChildNode(Trie::Node* parent, char ch,
+ util::string_view substring) {
+ return CreateChildNode(parent, static_cast<uint8_t>(ch), substring);
+}
+
+Status TrieBuilder::ExtendLookupTable(index_type* out_index) {
+ auto cur_size = trie_.lookup_table_.size();
+ auto cur_index = cur_size / 256;
+ if (cur_index > static_cast<size_t>(kMaxIndex)) {
+ return Status::CapacityError("TrieBuilder cannot extend lookup table further");
+ }
+ trie_.lookup_table_.resize(cur_size + 256, -1);
+ *out_index = static_cast<index_type>(cur_index);
+ return Status::OK();
+}
+
+Status TrieBuilder::SplitNode(fast_index_type node_index, fast_index_type split_at) {
+ Trie::Node* node = &trie_.nodes_[node_index];
+
+ DCHECK_LT(split_at, node->substring_length());
+
+ // Before:
+ // {node} -> [...]
+ // After:
+ // {node} -> [c] -> {out_node} -> [...]
+ auto child_node = Trie::Node{node->found_index_, node->child_lookup_,
+ node->substring_.substr(split_at + 1)};
+ auto ch = node->substring_[split_at];
+ node->child_lookup_ = -1;
+ node->found_index_ = -1;
+ node->substring_ = node->substring_.substr(0, split_at);
+ RETURN_NOT_OK(AppendChildNode(node, ch, std::move(child_node)));
+
+ return Status::OK();
+}
+
+Status TrieBuilder::Append(util::string_view s, bool allow_duplicate) {
+ // Find or create node for string
+ fast_index_type node_index = 0;
+ fast_index_type pos = 0;
+ fast_index_type remaining = static_cast<fast_index_type>(s.length());
+
+ while (true) {
+ Trie::Node* node = &trie_.nodes_[node_index];
+ const auto substring_length = node->substring_length();
+ const auto substring_data = node->substring_data();
+
+ for (fast_index_type i = 0; i < substring_length; ++i) {
+ if (remaining == 0) {
+ // New string too short => need to split node
+ RETURN_NOT_OK(SplitNode(node_index, i));
+ // Current node matches exactly
+ node = &trie_.nodes_[node_index];
+ node->found_index_ = trie_.size_++;
+ return Status::OK();
+ }
+ if (s[pos] != substring_data[i]) {
+ // Mismatching substring => need to split node
+ RETURN_NOT_OK(SplitNode(node_index, i));
+ // Create new node for mismatching char
+ node = &trie_.nodes_[node_index];
+ return CreateChildNode(node, s[pos], s.substr(pos + 1));
+ }
+ ++pos;
+ --remaining;
+ }
+ if (remaining == 0) {
+ // Node matches exactly
+ if (node->found_index_ >= 0) {
+ if (allow_duplicate) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("Duplicate entry in trie");
+ }
+ }
+ node->found_index_ = trie_.size_++;
+ return Status::OK();
+ }
+ // Lookup child using next input character
+ if (node->child_lookup_ == -1) {
+ // Need to extend lookup table for this node
+ RETURN_NOT_OK(ExtendLookupTable(&node->child_lookup_));
+ }
+ auto c = static_cast<uint8_t>(s[pos++]);
+ --remaining;
+ node_index = trie_.lookup_table_[node->child_lookup_ * 256 + c];
+ if (node_index == -1) {
+ // Child not found => need to create child node
+ return CreateChildNode(node, c, s.substr(pos));
+ }
+ node = &trie_.nodes_[node_index];
+ }
+}
+
+Trie TrieBuilder::Finish() { return std::move(trie_); }
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h
new file mode 100644
index 00000000000..b250cca647d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h
@@ -0,0 +1,245 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+// A non-zero-terminated small string class.
+// std::string usually has a small string optimization
+// (see review at https://shaharmike.com/cpp/std-string/)
+// but this one allows tight control and optimization of memory layout.
+template <uint8_t N>
+class SmallString {
+ public:
+ SmallString() : length_(0) {}
+
+ template <typename T>
+ SmallString(const T& v) { // NOLINT implicit constructor
+ *this = util::string_view(v);
+ }
+
+ SmallString& operator=(const util::string_view s) {
+#ifndef NDEBUG
+ CheckSize(s.size());
+#endif
+ length_ = static_cast<uint8_t>(s.size());
+ std::memcpy(data_, s.data(), length_);
+ return *this;
+ }
+
+ SmallString& operator=(const std::string& s) {
+ *this = util::string_view(s);
+ return *this;
+ }
+
+ SmallString& operator=(const char* s) {
+ *this = util::string_view(s);
+ return *this;
+ }
+
+ explicit operator util::string_view() const {
+ return util::string_view(data_, length_);
+ }
+
+ const char* data() const { return data_; }
+ size_t length() const { return length_; }
+ bool empty() const { return length_ == 0; }
+ char operator[](size_t pos) const {
+#ifndef NDEBUG
+    // Bounds check only in debug builds; assert() is a no-op under NDEBUG.
+    assert(pos < length_);
+#endif
+ return data_[pos];
+ }
+
+ SmallString substr(size_t pos) const {
+ return SmallString(util::string_view(*this).substr(pos));
+ }
+
+ SmallString substr(size_t pos, size_t count) const {
+ return SmallString(util::string_view(*this).substr(pos, count));
+ }
+
+ template <typename T>
+ bool operator==(T&& other) const {
+ return util::string_view(*this) == util::string_view(std::forward<T>(other));
+ }
+
+ template <typename T>
+ bool operator!=(T&& other) const {
+ return util::string_view(*this) != util::string_view(std::forward<T>(other));
+ }
+
+ protected:
+ uint8_t length_;
+ char data_[N];
+
+ void CheckSize(size_t n) { assert(n <= N); }
+};
+
+template <uint8_t N>
+std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) {
+ return os << util::string_view(str);
+}
+
+// A trie class for byte strings, optimized for small sets of short strings.
+// This class is immutable by design, use a TrieBuilder to construct it.
+class ARROW_EXPORT Trie {
+ using index_type = int16_t;
+ using fast_index_type = int_fast16_t;
+ static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
+
+ public:
+ Trie() : size_(0) {}
+ Trie(Trie&&) = default;
+ Trie& operator=(Trie&&) = default;
+
+ int32_t Find(util::string_view s) const {
+ const Node* node = &nodes_[0];
+ fast_index_type pos = 0;
+ if (s.length() > static_cast<size_t>(kMaxIndex)) {
+ return -1;
+ }
+ fast_index_type remaining = static_cast<fast_index_type>(s.length());
+
+ while (remaining > 0) {
+ auto substring_length = node->substring_length();
+ if (substring_length > 0) {
+ auto substring_data = node->substring_data();
+ if (remaining < substring_length) {
+ // Input too short
+ return -1;
+ }
+ for (fast_index_type i = 0; i < substring_length; ++i) {
+ if (s[pos++] != substring_data[i]) {
+ // Mismatching substring
+ return -1;
+ }
+ --remaining;
+ }
+ if (remaining == 0) {
+ // Matched node exactly
+ return node->found_index_;
+ }
+ }
+ // Lookup child using next input character
+ if (node->child_lookup_ == -1) {
+ // Input too long
+ return -1;
+ }
+ auto c = static_cast<uint8_t>(s[pos++]);
+ --remaining;
+ auto child_index = lookup_table_[node->child_lookup_ * 256 + c];
+ if (child_index == -1) {
+ // Child not found
+ return -1;
+ }
+ node = &nodes_[child_index];
+ }
+
+ // Input exhausted
+ if (node->substring_.empty()) {
+ // Matched node exactly
+ return node->found_index_;
+ } else {
+ return -1;
+ }
+ }
+
+ Status Validate() const;
+
+ void Dump() const;
+
+ protected:
+ static constexpr size_t kNodeSize = 16;
+ static constexpr auto kMaxSubstringLength =
+ kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t);
+
+ struct Node {
+ // If this node is a valid end of string, index of found string, otherwise -1
+ index_type found_index_;
+ // Base index for child lookup in lookup_table_ (-1 if no child nodes)
+ index_type child_lookup_;
+ // The substring for this node.
+ SmallString<kMaxSubstringLength> substring_;
+
+ fast_index_type substring_length() const {
+ return static_cast<fast_index_type>(substring_.length());
+ }
+ const char* substring_data() const { return substring_.data(); }
+ };
+
+ static_assert(sizeof(Node) == kNodeSize, "Unexpected node size");
+
+ ARROW_DISALLOW_COPY_AND_ASSIGN(Trie);
+
+ void Dump(const Node* node, const std::string& indent) const;
+
+ // Node table: entry 0 is the root node
+ std::vector<Node> nodes_;
+
+ // Indexed lookup structure: gives index in node table, or -1 if not found
+ std::vector<index_type> lookup_table_;
+
+ // Number of entries
+ index_type size_;
+
+ friend class TrieBuilder;
+};
+
+class ARROW_EXPORT TrieBuilder {
+ using index_type = Trie::index_type;
+ using fast_index_type = Trie::fast_index_type;
+
+ public:
+ TrieBuilder();
+ Status Append(util::string_view s, bool allow_duplicate = false);
+ Trie Finish();
+
+ protected:
+ // Extend the lookup table by 256 entries, return the index of the new span
+ Status ExtendLookupTable(index_type* out_lookup_index);
+ // Split the node given by the index at the substring index `split_at`
+ Status SplitNode(fast_index_type node_index, fast_index_type split_at);
+ // Append an already constructed child node to the parent
+ Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node);
+ // Create a matching child node from this parent
+ Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring);
+ Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring);
+
+ Trie trie_;
+
+ static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
+};
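+
+// A minimal usage sketch (error handling elided):
+//
+//   TrieBuilder builder;
+//   ARROW_UNUSED(builder.Append("true"));
+//   ARROW_UNUSED(builder.Append("false"));
+//   Trie trie = builder.Finish();
+//   int32_t found = trie.Find("false");    // 1, the insertion index
+//   int32_t missing = trie.Find("maybe");  // -1, not in the trie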
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h
new file mode 100644
index 00000000000..ca107c2c69d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+
+namespace internal {
+struct Empty;
+} // namespace internal
+
+template <typename T = internal::Empty>
+class WeakFuture;
+class FutureWaiter;
+
+class TimestampParser;
+
+namespace internal {
+
+class Executor;
+class TaskGroup;
+class ThreadPool;
+
+} // namespace internal
+
+struct Compression {
+ /// \brief Compression algorithm
+ enum type {
+ UNCOMPRESSED,
+ SNAPPY,
+ GZIP,
+ BROTLI,
+ ZSTD,
+ LZ4,
+ LZ4_FRAME,
+ LZO,
+ BZ2,
+ LZ4_HADOOP
+ };
+};
+
+namespace util {
+class Compressor;
+class Decompressor;
+class Codec;
+} // namespace util
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h
new file mode 100644
index 00000000000..80cc6297e39
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+namespace arrow {
+namespace internal {
+
+/// \brief Metafunction to allow checking if a type matches any of another set of types
+template <typename...>
+struct IsOneOf : std::false_type {}; /// Base case: nothing has matched
+
+template <typename T, typename U, typename... Args>
+struct IsOneOf<T, U, Args...> {
+ /// Recursive case: T == U or T matches any other types provided (not including U).
+ static constexpr bool value = std::is_same<T, U>::value || IsOneOf<T, Args...>::value;
+};
+
+/// \brief Shorthand for using IsOneOf + std::enable_if
+template <typename T, typename... Args>
+using EnableIfIsOneOf = typename std::enable_if<IsOneOf<T, Args...>::value, T>::type;
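+
+// For example, a function restricted to 32- and 64-bit signed integers could be
+// declared as (a sketch):
+//
+//   template <typename T>
+//   EnableIfIsOneOf<T, int32_t, int64_t> Twice(T value) { return value + value; }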
+
+/// \brief is_null_pointer from C++17
+template <typename T>
+struct is_null_pointer : std::is_same<std::nullptr_t, typename std::remove_cv<T>::type> {
+};
+
+#ifdef __GLIBCXX__
+
+// An aligned_union backport, because old libstdc++ versions don't include it.
+
+constexpr std::size_t max_size(std::size_t a, std::size_t b) { return (a > b) ? a : b; }
+
+template <typename...>
+struct max_size_traits;
+
+template <typename H, typename... T>
+struct max_size_traits<H, T...> {
+ static constexpr std::size_t max_sizeof() {
+ return max_size(sizeof(H), max_size_traits<T...>::max_sizeof());
+ }
+ static constexpr std::size_t max_alignof() {
+ return max_size(alignof(H), max_size_traits<T...>::max_alignof());
+ }
+};
+
+template <>
+struct max_size_traits<> {
+ static constexpr std::size_t max_sizeof() { return 0; }
+ static constexpr std::size_t max_alignof() { return 0; }
+};
+
+template <std::size_t Len, typename... T>
+struct aligned_union {
+ static constexpr std::size_t alignment_value = max_size_traits<T...>::max_alignof();
+ static constexpr std::size_t size_value =
+ max_size(Len, max_size_traits<T...>::max_sizeof());
+ using type = typename std::aligned_storage<size_value, alignment_value>::type;
+};
+
+#else
+
+template <std::size_t Len, typename... T>
+using aligned_union = std::aligned_union<Len, T...>;
+
+#endif
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/ubsan.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/ubsan.h
new file mode 100644
index 00000000000..2d4b513894b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/ubsan.h
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Contains utilities for making UBSan happy.
+
+#pragma once
+
+#include <cstring>
+#include <memory>
+#include <type_traits>
+
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace util {
+
+namespace internal {
+
+static uint8_t non_null_filler;
+
+} // namespace internal
+
+/// \brief Returns maybe_null if not null, otherwise a non-null pointer to arbitrary
+/// memory that shouldn't be dereferenced.
+///
+/// memset/memcpy have undefined behavior when a null pointer is passed as an argument;
+/// use this utility to wrap locations where this could happen.
+///
+/// Note: Flatbuffers has UBSan warnings if a zero length vector is passed.
+/// https://github.com/google/flatbuffers/pull/5355 is trying to resolve
+/// them.
+template <typename T>
+inline T* MakeNonNull(T* maybe_null) {
+ if (ARROW_PREDICT_TRUE(maybe_null != NULLPTR)) {
+ return maybe_null;
+ }
+
+ return reinterpret_cast<T*>(&internal::non_null_filler);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_trivial<T>::value, T>::type SafeLoadAs(
+ const uint8_t* unaligned) {
+ typename std::remove_const<T>::type ret;
+ std::memcpy(&ret, unaligned, sizeof(T));
+ return ret;
+}
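+
+// For example, reading a potentially misaligned value out of a raw byte buffer
+// (a sketch; `buf` is any valid const uint8_t*):
+//
+//   uint32_t v = SafeLoadAs<uint32_t>(buf + 3);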
+
+template <typename T>
+inline typename std::enable_if<std::is_trivial<T>::value, T>::type SafeLoad(
+ const T* unaligned) {
+ typename std::remove_const<T>::type ret;
+ std::memcpy(&ret, unaligned, sizeof(T));
+ return ret;
+}
+
+template <typename U, typename T>
+inline typename std::enable_if<std::is_trivial<T>::value && std::is_trivial<U>::value &&
+ sizeof(T) == sizeof(U),
+ U>::type
+SafeCopy(T value) {
+ typename std::remove_const<U>::type ret;
+ std::memcpy(&ret, &value, sizeof(T));
+ return ret;
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_trivial<T>::value, void>::type SafeStore(
+ void* unaligned, T value) {
+ std::memcpy(unaligned, &value, sizeof(T));
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc
new file mode 100644
index 00000000000..c19a7bc2eee
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc
@@ -0,0 +1,292 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/uri.h"
+
+#include <cstring>
+#include <sstream>
+#include <vector>
+
+#include "arrow/util/string_view.h"
+#include "arrow/util/value_parsing.h"
+#include "contrib/restricted/uriparser/include/uriparser/Uri.h"
+
+namespace arrow {
+namespace internal {
+
+namespace {
+
+util::string_view TextRangeToView(const UriTextRangeStructA& range) {
+ if (range.first == nullptr) {
+ return "";
+ } else {
+ return {range.first, static_cast<size_t>(range.afterLast - range.first)};
+ }
+}
+
+std::string TextRangeToString(const UriTextRangeStructA& range) {
+ return std::string(TextRangeToView(range));
+}
+
+// There can be a difference between an absent field and an empty field.
+// For example, in "unix:/tmp/foo", the host is absent, while in
+// "unix:///tmp/foo", the host is empty but present.
+// This function helps distinguish.
+bool IsTextRangeSet(const UriTextRangeStructA& range) { return range.first != nullptr; }
+
+#ifdef _WIN32
+bool IsDriveSpec(const util::string_view s) {
+ return (s.length() >= 2 && s[1] == ':' &&
+ ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z')));
+}
+#endif
+
+} // namespace
+
+std::string UriEscape(const std::string& s) {
+ if (s.empty()) {
+ // Avoid passing null pointer to uriEscapeExA
+ return s;
+ }
+ std::string escaped;
+ escaped.resize(3 * s.length());
+
+ auto end = uriEscapeExA(s.data(), s.data() + s.length(), &escaped[0],
+ /*spaceToPlus=*/URI_FALSE, /*normalizeBreaks=*/URI_FALSE);
+ escaped.resize(end - &escaped[0]);
+ return escaped;
+}
+
+std::string UriUnescape(const util::string_view s) {
+ std::string result(s);
+ if (!result.empty()) {
+ auto end = uriUnescapeInPlaceA(&result[0]);
+ result.resize(end - &result[0]);
+ }
+ return result;
+}
+
+std::string UriEncodeHost(const std::string& host) {
+ // Fairly naive check: if it contains a ':', it's IPv6 and needs
+ // brackets, else it's OK
+ if (host.find(":") != std::string::npos) {
+ std::string result = "[";
+ result += host;
+ result += ']';
+ return result;
+ } else {
+ return host;
+ }
+}
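+
+// For example, UriEncodeHost("::1") returns "[::1]", while
+// UriEncodeHost("localhost") is returned unchanged.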
+
+struct Uri::Impl {
+ Impl() : string_rep_(""), port_(-1) { memset(&uri_, 0, sizeof(uri_)); }
+
+ ~Impl() { uriFreeUriMembersA(&uri_); }
+
+ void Reset() {
+ uriFreeUriMembersA(&uri_);
+ memset(&uri_, 0, sizeof(uri_));
+ data_.clear();
+ string_rep_.clear();
+ path_segments_.clear();
+ port_ = -1;
+ }
+
+ const std::string& KeepString(const std::string& s) {
+ data_.push_back(s);
+ return data_.back();
+ }
+
+ UriUriA uri_;
+ // Keep alive strings that uriparser stores pointers to
+ std::vector<std::string> data_;
+ std::string string_rep_;
+ int32_t port_;
+ std::vector<util::string_view> path_segments_;
+ bool is_file_uri_;
+ bool is_absolute_path_;
+};
+
+Uri::Uri() : impl_(new Impl) {}
+
+Uri::~Uri() {}
+
+Uri::Uri(Uri&& u) : impl_(std::move(u.impl_)) {}
+
+Uri& Uri::operator=(Uri&& u) {
+ impl_ = std::move(u.impl_);
+ return *this;
+}
+
+std::string Uri::scheme() const { return TextRangeToString(impl_->uri_.scheme); }
+
+std::string Uri::host() const { return TextRangeToString(impl_->uri_.hostText); }
+
+bool Uri::has_host() const { return IsTextRangeSet(impl_->uri_.hostText); }
+
+std::string Uri::port_text() const { return TextRangeToString(impl_->uri_.portText); }
+
+int32_t Uri::port() const { return impl_->port_; }
+
+std::string Uri::username() const {
+ auto userpass = TextRangeToView(impl_->uri_.userInfo);
+ auto sep_pos = userpass.find_first_of(':');
+ if (sep_pos == util::string_view::npos) {
+ return UriUnescape(userpass);
+ } else {
+ return UriUnescape(userpass.substr(0, sep_pos));
+ }
+}
+
+std::string Uri::password() const {
+ auto userpass = TextRangeToView(impl_->uri_.userInfo);
+ auto sep_pos = userpass.find_first_of(':');
+ if (sep_pos == util::string_view::npos) {
+ return std::string();
+ } else {
+ return UriUnescape(userpass.substr(sep_pos + 1));
+ }
+}
+
+std::string Uri::path() const {
+ const auto& segments = impl_->path_segments_;
+
+ bool must_prepend_slash = impl_->is_absolute_path_;
+#ifdef _WIN32
+ // On Windows, "file:///C:/foo" should have path "C:/foo", not "/C:/foo",
+ // despite it being absolute.
+ // (see https://tools.ietf.org/html/rfc8089#page-13)
+ if (impl_->is_absolute_path_ && impl_->is_file_uri_ && segments.size() > 0 &&
+ IsDriveSpec(segments[0])) {
+ must_prepend_slash = false;
+ }
+#endif
+
+ std::stringstream ss;
+ if (must_prepend_slash) {
+ ss << "/";
+ }
+ bool first = true;
+ for (const auto& seg : segments) {
+ if (!first) {
+ ss << "/";
+ }
+ first = false;
+ ss << seg;
+ }
+ return std::move(ss).str();
+}
+
+std::string Uri::query_string() const { return TextRangeToString(impl_->uri_.query); }
+
+Result<std::vector<std::pair<std::string, std::string>>> Uri::query_items() const {
+ const auto& query = impl_->uri_.query;
+ UriQueryListA* query_list;
+ int item_count;
+ std::vector<std::pair<std::string, std::string>> items;
+
+ if (query.first == nullptr) {
+ return items;
+ }
+ if (uriDissectQueryMallocA(&query_list, &item_count, query.first, query.afterLast) !=
+ URI_SUCCESS) {
+ return Status::Invalid("Cannot parse query string: '", query_string(), "'");
+ }
+ std::unique_ptr<UriQueryListA, decltype(&uriFreeQueryListA)> query_guard(
+ query_list, uriFreeQueryListA);
+
+ items.reserve(item_count);
+ while (query_list != nullptr) {
+ if (query_list->value != nullptr) {
+ items.emplace_back(query_list->key, query_list->value);
+ } else {
+ items.emplace_back(query_list->key, "");
+ }
+ query_list = query_list->next;
+ }
+ return items;
+}
+
+const std::string& Uri::ToString() const { return impl_->string_rep_; }
+
+Status Uri::Parse(const std::string& uri_string) {
+ impl_->Reset();
+
+ const auto& s = impl_->KeepString(uri_string);
+ impl_->string_rep_ = s;
+ const char* error_pos;
+ if (uriParseSingleUriExA(&impl_->uri_, s.data(), s.data() + s.size(), &error_pos) !=
+ URI_SUCCESS) {
+ return Status::Invalid("Cannot parse URI: '", uri_string, "'");
+ }
+
+ const auto scheme = TextRangeToView(impl_->uri_.scheme);
+ if (scheme.empty()) {
+ return Status::Invalid("URI has empty scheme: '", uri_string, "'");
+ }
+ impl_->is_file_uri_ = (scheme == "file");
+
+ // Gather path segments
+ auto path_seg = impl_->uri_.pathHead;
+ while (path_seg != nullptr) {
+ impl_->path_segments_.push_back(TextRangeToView(path_seg->text));
+ path_seg = path_seg->next;
+ }
+
+ // Decide whether URI path is absolute
+ impl_->is_absolute_path_ = false;
+ if (impl_->uri_.absolutePath == URI_TRUE) {
+ impl_->is_absolute_path_ = true;
+ } else if (has_host() && impl_->path_segments_.size() > 0) {
+ // When there's a host (even empty), uriparser considers the path relative.
+    // Several URI parsing libraries (e.g. for Python) consider it absolute, though.
+ // For example, the path for "file:///tmp/foo" is "/tmp/foo", not "tmp/foo".
+ // Similarly, the path for "file://localhost/" is "/".
+ // However, the path for "file://localhost" is "".
+ impl_->is_absolute_path_ = true;
+ }
+#ifdef _WIN32
+ // There's an exception on Windows: "file:/C:foo/bar" is relative.
+ if (impl_->is_file_uri_ && impl_->path_segments_.size() > 0) {
+ const auto& first_seg = impl_->path_segments_[0];
+ if (IsDriveSpec(first_seg) && (first_seg.length() >= 3 && first_seg[2] != '/')) {
+ impl_->is_absolute_path_ = false;
+ }
+ }
+#endif
+
+ if (impl_->is_file_uri_ && !impl_->is_absolute_path_) {
+ return Status::Invalid("File URI cannot be relative: '", uri_string, "'");
+ }
+
+ // Parse port number
+ auto port_text = TextRangeToView(impl_->uri_.portText);
+ if (port_text.size()) {
+ uint16_t port_num;
+ if (!ParseValue<UInt16Type>(port_text.data(), port_text.size(), &port_num)) {
+ return Status::Invalid("Invalid port number '", port_text, "' in URI '", uri_string,
+ "'");
+ }
+ impl_->port_ = port_num;
+ }
+
+ return Status::OK();
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h
new file mode 100644
index 00000000000..b4ffbb04dec
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief A parsed URI
+class ARROW_EXPORT Uri {
+ public:
+ Uri();
+ ~Uri();
+ Uri(Uri&&);
+ Uri& operator=(Uri&&);
+
+ // XXX Should we use util::string_view instead? These functions are
+ // not performance-critical.
+
+ /// The URI scheme, such as "http", or the empty string if the URI has no
+ /// explicit scheme.
+ std::string scheme() const;
+
+ /// Whether the URI has an explicit host name. This may return true if
+ /// the URI has an empty host (e.g. "file:///tmp/foo"), while it returns
+  /// false if the URI has no host component at all (e.g. "file:/tmp/foo").
+ bool has_host() const;
+ /// The URI host name, such as "localhost", "127.0.0.1" or "::1", or the empty
+  /// string if the URI does not have a host component.
+ std::string host() const;
+
+  /// The URI port number, as a string such as "80", or the empty string if the URI
+ /// does not have a port number component.
+ std::string port_text() const;
+ /// The URI port parsed as an integer, or -1 if the URI does not have a port
+ /// number component.
+ int32_t port() const;
+
+ /// The username specified in the URI.
+ std::string username() const;
+ /// The password specified in the URI.
+ std::string password() const;
+
+ /// The URI path component.
+ std::string path() const;
+
+ /// The URI query string
+ std::string query_string() const;
+
+ /// The URI query items
+ ///
+ /// Note this API doesn't allow differentiating between an empty value
+  /// and a missing value, as in "a&b=1" vs. "a=&b=1".
+ Result<std::vector<std::pair<std::string, std::string>>> query_items() const;
+
+ /// Get the string representation of this URI.
+ const std::string& ToString() const;
+
+ /// Factory function to parse a URI from its string representation.
+ Status Parse(const std::string& uri_string);
+
+ private:
+ struct Impl;
+ std::unique_ptr<Impl> impl_;
+};
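+
+// Illustrative usage sketch (output values assume the parsing rules above):
+//
+//   Uri uri;
+//   ARROW_RETURN_NOT_OK(uri.Parse("https://user:[email protected]:8080/dir/file?a=1&b="));
+//   uri.scheme();        // "https"
+//   uri.host();          // "example.com"
+//   uri.port();          // 8080
+//   uri.path();          // "/dir/file"
+//   uri.query_string();  // "a=1&b="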
+
+/// Percent-encode the input string, for use e.g. as a URI query parameter.
+ARROW_EXPORT
+std::string UriEscape(const std::string& s);
+
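+/// Percent-decode the input string (the inverse of UriEscape).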
+ARROW_EXPORT
+std::string UriUnescape(const arrow::util::string_view s);
+
+/// Encode a host for use within a URI, such as "localhost",
+/// "127.0.0.1", or "[::1]".
+ARROW_EXPORT
+std::string UriEncodeHost(const std::string& host);
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc
new file mode 100644
index 00000000000..11394d2e64c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <iterator>
+#include <mutex>
+#include <stdexcept>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/utf8.h"
+#include "arrow/vendored/utfcpp/checked.h"
+
+// Can be defined by utfcpp
+#ifdef NOEXCEPT
+#undef NOEXCEPT
+#endif
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+// Copyright (c) 2008-2010 Bjoern Hoehrmann <[email protected]>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+
+// clang-format off
+const uint8_t utf8_small_table[] = { // NOLINT
+  // The first part of the table maps bytes to character classes
+  // to reduce the size of the transition table and create bitmasks.
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // NOLINT
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // NOLINT
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // NOLINT
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // NOLINT
+
+ // The second part is a transition table that maps a combination
+ // of a state of the automaton and a character class to a state.
+ // Character classes are between 0 and 11, states are multiples of 12.
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, // NOLINT
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, // NOLINT
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, // NOLINT
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, // NOLINT
+ 12,36,12,12,12,12,12,12,12,12,12,12, // NOLINT
+};
+// clang-format on
+
+uint16_t utf8_large_table[9 * 256] = {0xffff};
+
+const uint8_t utf8_byte_size_table[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
+
+static void InitializeLargeTable() {
+ for (uint32_t state = 0; state < 9; ++state) {
+ for (uint32_t byte = 0; byte < 256; ++byte) {
+ uint32_t byte_class = utf8_small_table[byte];
+ uint8_t next_state = utf8_small_table[256 + state * 12 + byte_class] / 12;
+ DCHECK_LT(next_state, 9);
+ utf8_large_table[state * 256 + byte] = static_cast<uint16_t>(next_state * 256);
+ }
+ }
+}
+
+ARROW_EXPORT void CheckUTF8Initialized() {
+ DCHECK_EQ(utf8_large_table[0], 0)
+ << "InitializeUTF8() must be called before calling UTF8 routines";
+}
+
+} // namespace internal
+
+static std::once_flag utf8_initialized;
+
+void InitializeUTF8() {
+ std::call_once(utf8_initialized, internal::InitializeLargeTable);
+}
+
+static const uint8_t kBOM[] = {0xEF, 0xBB, 0xBF};
+
+Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size) {
+ int64_t i;
+ for (i = 0; i < static_cast<int64_t>(sizeof(kBOM)); ++i) {
+ if (size == 0) {
+ if (i == 0) {
+ // Empty string
+ return data;
+ } else {
+ return Status::Invalid("UTF8 string too short (truncated byte order mark?)");
+ }
+ }
+ if (data[i] != kBOM[i]) {
+ // BOM not found
+ return data;
+ }
+ --size;
+ }
+ // BOM found
+ return data + i;
+}
+
+namespace {
+
+// Some platforms (such as old MinGWs) don't have the <codecvt> header,
+// so call into a vendored utf8 implementation instead.
+
+std::wstring UTF8ToWideStringInternal(const std::string& source) {
+ std::wstring ws;
+#if WCHAR_MAX > 0xFFFF
+ ::utf8::utf8to32(source.begin(), source.end(), std::back_inserter(ws));
+#else
+ ::utf8::utf8to16(source.begin(), source.end(), std::back_inserter(ws));
+#endif
+ return ws;
+}
+
+std::string WideStringToUTF8Internal(const std::wstring& source) {
+ std::string s;
+#if WCHAR_MAX > 0xFFFF
+ ::utf8::utf32to8(source.begin(), source.end(), std::back_inserter(s));
+#else
+ ::utf8::utf16to8(source.begin(), source.end(), std::back_inserter(s));
+#endif
+ return s;
+}
+
+} // namespace
+
+Result<std::wstring> UTF8ToWideString(const std::string& source) {
+ try {
+ return UTF8ToWideStringInternal(source);
+ } catch (std::exception& e) {
+ return Status::Invalid(e.what());
+ }
+}
+
+ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source) {
+ try {
+ return WideStringToUTF8Internal(source);
+ } catch (std::exception& e) {
+ return Status::Invalid(e.what());
+ }
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h
new file mode 100644
index 00000000000..0ec3538b95c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h
@@ -0,0 +1,570 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
+#error #include <xsimd/xsimd.hpp>
+#endif
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/simd.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+// Convert a UTF8 string to a wstring (either UTF16 or UTF32, depending
+// on the wchar_t width).
+ARROW_EXPORT Result<std::wstring> UTF8ToWideString(const std::string& source);
+
+// Similarly, convert a wstring to a UTF8 string.
+ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source);
+
+namespace internal {
+
+// Copyright (c) 2008-2010 Bjoern Hoehrmann <[email protected]>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+
+// A compact state table allowing UTF8 decoding using two dependent
+// lookups per byte. The first lookup determines the character class
+// and the second lookup reads the next state.
+// In this table states are multiples of 12.
+ARROW_EXPORT extern const uint8_t utf8_small_table[256 + 9 * 12];
+
+// Success / reject states when looked up in the small table
+static constexpr uint8_t kUTF8DecodeAccept = 0;
+static constexpr uint8_t kUTF8DecodeReject = 12;
+
+// An expanded state table allowing transitions using a single lookup
+// at the expense of a larger memory footprint (but on non-random data,
+// not all the table will end up accessed and cached).
+// In this table states are multiples of 256.
+ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256];
+
+ARROW_EXPORT extern const uint8_t utf8_byte_size_table[16];
+
+// Success / reject states when looked up in the large table
+static constexpr uint16_t kUTF8ValidateAccept = 0;
+static constexpr uint16_t kUTF8ValidateReject = 256;
+
+static inline uint8_t DecodeOneUTF8Byte(uint8_t byte, uint8_t state, uint32_t* codep) {
+ uint8_t type = utf8_small_table[byte];
+
+ *codep = (state != kUTF8DecodeAccept) ? (byte & 0x3fu) | (*codep << 6)
+ : (0xff >> type) & (byte);
+
+ state = utf8_small_table[256 + state + type];
+ return state;
+}
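+
+// Illustrative sketch of driving the DFA over a whole buffer; `DecodeAll` is a
+// hypothetical helper, not part of this header:
+//
+//   bool DecodeAll(const uint8_t* p, const uint8_t* end) {
+//     uint8_t state = kUTF8DecodeAccept;
+//     uint32_t codepoint = 0;
+//     while (p != end) {
+//       state = DecodeOneUTF8Byte(*p++, state, &codepoint);
+//       if (state == kUTF8DecodeReject) return false;
+//       // when state == kUTF8DecodeAccept, `codepoint` holds a full codepoint
+//     }
+//     return state == kUTF8DecodeAccept;  // reject trailing partial sequences
+//   }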
+
+static inline uint16_t ValidateOneUTF8Byte(uint8_t byte, uint16_t state) {
+ return utf8_large_table[state + byte];
+}
+
+ARROW_EXPORT void CheckUTF8Initialized();
+
+} // namespace internal
+
+// This function needs to be called before doing UTF8 validation.
+ARROW_EXPORT void InitializeUTF8();
+
+inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
+ static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL;
+ static constexpr uint32_t high_bits_32 = 0x80808080UL;
+ static constexpr uint16_t high_bits_16 = 0x8080U;
+ static constexpr uint8_t high_bits_8 = 0x80U;
+
+#ifndef NDEBUG
+ internal::CheckUTF8Initialized();
+#endif
+
+ while (size >= 8) {
+ // XXX This is doing an unaligned access. Contemporary architectures
+ // (x86-64, AArch64, PPC64) support it natively and often have good
+ // performance nevertheless.
+ uint64_t mask64 = SafeLoadAs<uint64_t>(data);
+ if (ARROW_PREDICT_TRUE((mask64 & high_bits_64) == 0)) {
+ // 8 bytes of pure ASCII, move forward
+ size -= 8;
+ data += 8;
+ continue;
+ }
+ // Non-ASCII run detected.
+ // We process at least 4 bytes, to avoid too many spurious 64-bit reads
+ // in case the non-ASCII bytes are at the end of the tested 64-bit word.
+ // We also only check for rejection at the end since that state is stable
+ // (once in reject state, we always remain in reject state).
+ // It is guaranteed that size >= 8 when arriving here, which allows
+ // us to avoid size checks.
+ uint16_t state = internal::kUTF8ValidateAccept;
+ // Byte 0
+ state = internal::ValidateOneUTF8Byte(*data++, state);
+ --size;
+ // Byte 1
+ state = internal::ValidateOneUTF8Byte(*data++, state);
+ --size;
+ // Byte 2
+ state = internal::ValidateOneUTF8Byte(*data++, state);
+ --size;
+ // Byte 3
+ state = internal::ValidateOneUTF8Byte(*data++, state);
+ --size;
+ // Byte 4
+ state = internal::ValidateOneUTF8Byte(*data++, state);
+ --size;
+ if (state == internal::kUTF8ValidateAccept) {
+ continue; // Got full char, switch back to ASCII detection
+ }
+ // Byte 5
+ state = internal::ValidateOneUTF8Byte(*data++, state);
+ --size;
+ if (state == internal::kUTF8ValidateAccept) {
+ continue; // Got full char, switch back to ASCII detection
+ }
+ // Byte 6
+ state = internal::ValidateOneUTF8Byte(*data++, state);
+ --size;
+ if (state == internal::kUTF8ValidateAccept) {
+ continue; // Got full char, switch back to ASCII detection
+ }
+ // Byte 7
+ state = internal::ValidateOneUTF8Byte(*data++, state);
+ --size;
+ if (state == internal::kUTF8ValidateAccept) {
+ continue; // Got full char, switch back to ASCII detection
+ }
+    // Not reaching kUTF8ValidateAccept within the last 4 transitions means rejection
+ assert(state == internal::kUTF8ValidateReject);
+ return false;
+ }
+
+ // Check if string tail is full ASCII (common case, fast)
+ if (size >= 4) {
+ uint32_t tail_mask = SafeLoadAs<uint32_t>(data + size - 4);
+ uint32_t head_mask = SafeLoadAs<uint32_t>(data);
+ if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_32) == 0)) {
+ return true;
+ }
+ } else if (size >= 2) {
+ uint16_t tail_mask = SafeLoadAs<uint16_t>(data + size - 2);
+ uint16_t head_mask = SafeLoadAs<uint16_t>(data);
+ if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_16) == 0)) {
+ return true;
+ }
+ } else if (size == 1) {
+ if (ARROW_PREDICT_TRUE((*data & high_bits_8) == 0)) {
+ return true;
+ }
+ } else {
+ /* size == 0 */
+ return true;
+ }
+
+ // Fall back to UTF8 validation of tail string.
+ // Note the state table is designed so that, once in the reject state,
+ // we remain in that state until the end. So we needn't check for
+ // rejection at each char (we don't gain much by short-circuiting here).
+ uint16_t state = internal::kUTF8ValidateAccept;
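+  // (each case below intentionally falls through to the next)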
+ switch (size) {
+ case 7:
+ state = internal::ValidateOneUTF8Byte(data[size - 7], state);
+ case 6:
+ state = internal::ValidateOneUTF8Byte(data[size - 6], state);
+ case 5:
+ state = internal::ValidateOneUTF8Byte(data[size - 5], state);
+ case 4:
+ state = internal::ValidateOneUTF8Byte(data[size - 4], state);
+ case 3:
+ state = internal::ValidateOneUTF8Byte(data[size - 3], state);
+ case 2:
+ state = internal::ValidateOneUTF8Byte(data[size - 2], state);
+ case 1:
+ state = internal::ValidateOneUTF8Byte(data[size - 1], state);
+ default:
+ break;
+ }
+ return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept);
+}
+
+inline bool ValidateUTF8(const util::string_view& str) {
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(str.data());
+ const size_t length = str.size();
+
+ return ValidateUTF8(data, length);
+}
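+
+// Illustrative call sequence (assuming the declarations above):
+//
+//   arrow::util::InitializeUTF8();  // required once before any validation
+//   bool ok = arrow::util::ValidateUTF8(arrow::util::string_view("caf\xc3\xa9"));
+//   // ok == true ("café" is valid UTF8)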
+
+inline bool ValidateAsciiSw(const uint8_t* data, int64_t len) {
+ uint8_t orall = 0;
+
+ if (len >= 16) {
+ uint64_t or1 = 0, or2 = 0;
+ const uint8_t* data2 = data + 8;
+
+ do {
+ or1 |= *(const uint64_t*)data;
+ or2 |= *(const uint64_t*)data2;
+ data += 16;
+ data2 += 16;
+ len -= 16;
+ } while (len >= 16);
+
+ orall = !((or1 | or2) & 0x8080808080808080ULL) - 1;
+ }
+
+ while (len--) {
+ orall |= *data++;
+ }
+
+ if (orall < 0x80) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
+inline bool ValidateAsciiSimd(const uint8_t* data, int64_t len) {
+ using simd_batch = xsimd::batch<int8_t, 16>;
+
+ if (len >= 32) {
+ const simd_batch zero(static_cast<int8_t>(0));
+ const uint8_t* data2 = data + 16;
+ simd_batch or1 = zero, or2 = zero;
+
+ while (len >= 32) {
+ or1 |= simd_batch(reinterpret_cast<const int8_t*>(data), xsimd::unaligned_mode{});
+ or2 |= simd_batch(reinterpret_cast<const int8_t*>(data2), xsimd::unaligned_mode{});
+ data += 32;
+ data2 += 32;
+ len -= 32;
+ }
+
+ // To test for upper bit in all bytes, test whether any of them is negative
+ or1 |= or2;
+ if (xsimd::any(or1 < zero)) {
+ return false;
+ }
+ }
+
+ return ValidateAsciiSw(data, len);
+}
+#endif // defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
+
+inline bool ValidateAscii(const uint8_t* data, int64_t len) {
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
+ return ValidateAsciiSimd(data, len);
+#else
+ return ValidateAsciiSw(data, len);
+#endif
+}
+
+inline bool ValidateAscii(const util::string_view& str) {
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(str.data());
+ const size_t length = str.size();
+
+ return ValidateAscii(data, length);
+}
+
+// Skip UTF8 byte order mark, if any.
+ARROW_EXPORT
+Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size);
+
+static constexpr uint32_t kMaxUnicodeCodepoint = 0x110000;
+
+// The byte size of a valid UTF8 character can be determined by looking at the
+// leading 4 bits of its first byte:
+// utf8_byte_size_table[0..7] --> pure ascii chars --> 1B length
+// utf8_byte_size_table[8..11] --> continuation bytes --> 1B length
+// utf8_byte_size_table[12,13] --> 2B long UTF8 chars
+// utf8_byte_size_table[14] --> 3B long UTF8 chars
+// utf8_byte_size_table[15] --> 4B long UTF8 chars
+// NOTE: Results for invalid/malformed UTF-8 sequences are undefined,
+// e.g. \xFF... returns 4B.
+static inline uint8_t ValidUtf8CodepointByteSize(const uint8_t* codeunit) {
+ return internal::utf8_byte_size_table[*codeunit >> 4];
+}
+
+static inline bool Utf8IsContinuation(const uint8_t codeunit) {
+ return (codeunit & 0xC0) == 0x80; // upper two bits should be 10
+}
+
+static inline bool Utf8Is2ByteStart(const uint8_t codeunit) {
+ return (codeunit & 0xE0) == 0xC0; // upper three bits should be 110
+}
+
+static inline bool Utf8Is3ByteStart(const uint8_t codeunit) {
+ return (codeunit & 0xF0) == 0xE0; // upper four bits should be 1110
+}
+
+static inline bool Utf8Is4ByteStart(const uint8_t codeunit) {
+ return (codeunit & 0xF8) == 0xF0; // upper five bits should be 11110
+}
+
+static inline uint8_t* UTF8Encode(uint8_t* str, uint32_t codepoint) {
+ if (codepoint < 0x80) {
+ *str++ = codepoint;
+ } else if (codepoint < 0x800) {
+ *str++ = 0xC0 + (codepoint >> 6);
+ *str++ = 0x80 + (codepoint & 0x3F);
+ } else if (codepoint < 0x10000) {
+ *str++ = 0xE0 + (codepoint >> 12);
+ *str++ = 0x80 + ((codepoint >> 6) & 0x3F);
+ *str++ = 0x80 + (codepoint & 0x3F);
+ } else {
+ // Assume proper codepoints are always passed
+ assert(codepoint < kMaxUnicodeCodepoint);
+ *str++ = 0xF0 + (codepoint >> 18);
+ *str++ = 0x80 + ((codepoint >> 12) & 0x3F);
+ *str++ = 0x80 + ((codepoint >> 6) & 0x3F);
+ *str++ = 0x80 + (codepoint & 0x3F);
+ }
+ return str;
+}
+
+static inline bool UTF8Decode(const uint8_t** data, uint32_t* codepoint) {
+ const uint8_t* str = *data;
+ if (*str < 0x80) { // ascii
+ *codepoint = *str++;
+ } else if (ARROW_PREDICT_FALSE(*str < 0xC0)) { // invalid non-ascii char
+ return false;
+ } else if (*str < 0xE0) {
+ uint8_t code_unit_1 = (*str++) & 0x1F; // take last 5 bits
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_2 = (*str++) & 0x3F; // take last 6 bits
+ *codepoint = (code_unit_1 << 6) + code_unit_2;
+ } else if (*str < 0xF0) {
+ uint8_t code_unit_1 = (*str++) & 0x0F; // take last 4 bits
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_2 = (*str++) & 0x3F; // take last 6 bits
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_3 = (*str++) & 0x3F; // take last 6 bits
+ *codepoint = (code_unit_1 << 12) + (code_unit_2 << 6) + code_unit_3;
+ } else if (*str < 0xF8) {
+ uint8_t code_unit_1 = (*str++) & 0x07; // take last 3 bits
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_2 = (*str++) & 0x3F; // take last 6 bits
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_3 = (*str++) & 0x3F; // take last 6 bits
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_4 = (*str++) & 0x3F; // take last 6 bits
+ *codepoint =
+ (code_unit_1 << 18) + (code_unit_2 << 12) + (code_unit_3 << 6) + code_unit_4;
+ } else { // invalid non-ascii char
+ return false;
+ }
+ *data = str;
+ return true;
+}
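+
+// Illustrative encode/decode round trip using the helpers above:
+//
+//   uint8_t buf[4];
+//   uint8_t* end = UTF8Encode(buf, 0x1F600);  // writes a 4-byte sequence
+//   const uint8_t* pos = buf;
+//   uint32_t cp = 0;
+//   bool ok = UTF8Decode(&pos, &cp);  // ok && cp == 0x1F600 && pos == end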
+
+static inline bool UTF8DecodeReverse(const uint8_t** data, uint32_t* codepoint) {
+ const uint8_t* str = *data;
+ if (*str < 0x80) { // ascii
+ *codepoint = *str--;
+ } else {
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_N = (*str--) & 0x3F; // take last 6 bits
+ if (Utf8Is2ByteStart(*str)) {
+ uint8_t code_unit_1 = (*str--) & 0x1F; // take last 5 bits
+ *codepoint = (code_unit_1 << 6) + code_unit_N;
+ } else {
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_Nmin1 = (*str--) & 0x3F; // take last 6 bits
+ if (Utf8Is3ByteStart(*str)) {
+ uint8_t code_unit_1 = (*str--) & 0x0F; // take last 4 bits
+ *codepoint = (code_unit_1 << 12) + (code_unit_Nmin1 << 6) + code_unit_N;
+ } else {
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_Nmin2 = (*str--) & 0x3F; // take last 6 bits
+ if (ARROW_PREDICT_TRUE(Utf8Is4ByteStart(*str))) {
+ uint8_t code_unit_1 = (*str--) & 0x07; // take last 3 bits
+ *codepoint = (code_unit_1 << 18) + (code_unit_Nmin2 << 12) +
+ (code_unit_Nmin1 << 6) + code_unit_N;
+ } else {
+ return false;
+ }
+ }
+ }
+ }
+ *data = str;
+ return true;
+}
+
+template <class UnaryOperation>
+static inline bool UTF8Transform(const uint8_t* first, const uint8_t* last,
+ uint8_t** destination, UnaryOperation&& unary_op) {
+ const uint8_t* i = first;
+ uint8_t* out = *destination;
+ while (i < last) {
+ uint32_t codepoint = 0;
+ if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ out = UTF8Encode(out, unary_op(codepoint));
+ }
+ *destination = out;
+ return true;
+}
+
+template <class Predicate>
+static inline bool UTF8FindIf(const uint8_t* first, const uint8_t* last,
+ Predicate&& predicate, const uint8_t** position) {
+ const uint8_t* i = first;
+ while (i < last) {
+ uint32_t codepoint = 0;
+ const uint8_t* current = i;
+ if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ if (predicate(codepoint)) {
+ *position = current;
+ return true;
+ }
+ }
+ *position = last;
+ return true;
+}
+
+// Same semantics as std::find_if using reverse iterators with the return value
+// having the same semantics as std::reverse_iterator<..>.base()
+// A reverse iterator physically points one past the element it refers to, e.g.:
+// &*ri == &*(ri.base() - 1)
+template <class Predicate>
+static inline bool UTF8FindIfReverse(const uint8_t* first, const uint8_t* last,
+ Predicate&& predicate, const uint8_t** position) {
+  // converts to a normal pointer (addressing the last byte)
+ const uint8_t* i = last - 1;
+ while (i >= first) {
+ uint32_t codepoint = 0;
+ const uint8_t* current = i;
+ if (ARROW_PREDICT_FALSE(!UTF8DecodeReverse(&i, &codepoint))) {
+ return false;
+ }
+ if (predicate(codepoint)) {
+      // converts the normal pointer back to 'reverse iterator' semantics.
+ *position = current + 1;
+ return true;
+ }
+ }
+  // Similar to how an end pointer points to one beyond the last element, reverse
+  // iterators point to the 'first' pointer to indicate out of range.
+ *position = first;
+ return true;
+}
+
+static inline bool UTF8AdvanceCodepoints(const uint8_t* first, const uint8_t* last,
+ const uint8_t** destination, int64_t n) {
+ return UTF8FindIf(
+ first, last,
+ [&](uint32_t codepoint) {
+ bool done = n == 0;
+ n--;
+ return done;
+ },
+ destination);
+}
+
+static inline bool UTF8AdvanceCodepointsReverse(const uint8_t* first, const uint8_t* last,
+ const uint8_t** destination, int64_t n) {
+ return UTF8FindIfReverse(
+ first, last,
+ [&](uint32_t codepoint) {
+ bool done = n == 0;
+ n--;
+ return done;
+ },
+ destination);
+}
+
+template <class UnaryFunction>
+static inline bool UTF8ForEach(const uint8_t* first, const uint8_t* last,
+ UnaryFunction&& f) {
+ const uint8_t* i = first;
+ while (i < last) {
+ uint32_t codepoint = 0;
+ if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ f(codepoint);
+ }
+ return true;
+}
+
+template <class UnaryFunction>
+static inline bool UTF8ForEach(const std::string& s, UnaryFunction&& f) {
+ return UTF8ForEach(reinterpret_cast<const uint8_t*>(s.data()),
+ reinterpret_cast<const uint8_t*>(s.data() + s.length()),
+ std::forward<UnaryFunction>(f));
+}
+
+template <class UnaryPredicate>
+static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* result,
+ UnaryPredicate&& predicate) {
+ const uint8_t* i = first;
+ while (i < last) {
+ uint32_t codepoint = 0;
+ if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+
+ if (!predicate(codepoint)) {
+ *result = false;
+ return true;
+ }
+ }
+ *result = true;
+ return true;
+}
+
+/// Count the number of codepoints in the given string (assuming it is valid UTF8).
+static inline int64_t UTF8Length(const uint8_t* first, const uint8_t* last) {
+ int64_t length = 0;
+ while (first != last) {
+ length += ((*first & 0xc0) != 0x80);
+ ++first;
+ }
+ return length;
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc
new file mode 100644
index 00000000000..3b147366636
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/value_parsing.h"
+
+#include <string>
+#include <utility>
+
+#include "contrib/restricted/fast_float/include/fast_float/fast_float.h"
+
+namespace arrow {
+namespace internal {
+
+bool StringToFloat(const char* s, size_t length, float* out) {
+ const auto res = fast_float::from_chars(s, s + length, *out);
+ return res.ec == std::errc() && res.ptr == s + length;
+}
+
+bool StringToFloat(const char* s, size_t length, double* out) {
+ const auto res = fast_float::from_chars(s, s + length, *out);
+ return res.ec == std::errc() && res.ptr == s + length;
+}
+
+// ----------------------------------------------------------------------
+// strptime-like parsing
+
+namespace {
+
+class StrptimeTimestampParser : public TimestampParser {
+ public:
+ explicit StrptimeTimestampParser(std::string format) : format_(std::move(format)) {}
+
+ bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
+ int64_t* out) const override {
+ return ParseTimestampStrptime(s, length, format_.c_str(),
+ /*ignore_time_in_day=*/false,
+ /*allow_trailing_chars=*/false, out_unit, out);
+ }
+
+ const char* kind() const override { return "strptime"; }
+
+ const char* format() const override { return format_.c_str(); }
+
+ private:
+ std::string format_;
+};
+
+class ISO8601Parser : public TimestampParser {
+ public:
+ ISO8601Parser() {}
+
+ bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
+ int64_t* out) const override {
+ return ParseTimestampISO8601(s, length, out_unit, out);
+ }
+
+ const char* kind() const override { return "iso8601"; }
+};
+
+} // namespace
+} // namespace internal
+
+const char* TimestampParser::format() const { return ""; }
+
+std::shared_ptr<TimestampParser> TimestampParser::MakeStrptime(std::string format) {
+ return std::make_shared<internal::StrptimeTimestampParser>(std::move(format));
+}
+
+std::shared_ptr<TimestampParser> TimestampParser::MakeISO8601() {
+ return std::make_shared<internal::ISO8601Parser>();
+}
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h
new file mode 100644
index 00000000000..00295d1b51f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h
@@ -0,0 +1,780 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This is a private header for string-to-number parsing utilities
+
+#pragma once
+
+#include <cassert>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <type_traits>
+
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/time.h"
+#include "arrow/util/visibility.h"
+#include "arrow/vendored/datetime.h"
+#include "arrow/vendored/strptime.h"
+
+namespace arrow {
+
+/// \brief A virtual string to timestamp parser
+class ARROW_EXPORT TimestampParser {
+ public:
+ virtual ~TimestampParser() = default;
+
+ virtual bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
+ int64_t* out) const = 0;
+
+ virtual const char* kind() const = 0;
+
+ virtual const char* format() const;
+
+ /// \brief Create a TimestampParser that recognizes strptime-like format strings
+ static std::shared_ptr<TimestampParser> MakeStrptime(std::string format);
+
+ /// \brief Create a TimestampParser that recognizes (locale-agnostic) ISO8601
+ /// timestamps
+ static std::shared_ptr<TimestampParser> MakeISO8601();
+};
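+
+// Illustrative usage sketch (assuming the factories above):
+//
+//   auto parser = TimestampParser::MakeISO8601();
+//   int64_t value = 0;
+//   const char* s = "1970-01-01 00:00:01";
+//   bool ok = (*parser)(s, std::strlen(s), TimeUnit::SECOND, &value);
+//   // ok && value == 1 (one second past the epoch)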
+
+namespace internal {
+
+/// \brief The entry point for conversion from strings.
+///
+/// Specializations of StringConverter for `ARROW_TYPE` must define:
+/// - A default constructible member type `value_type` which will be yielded on a
+/// successful parse.
+/// - The static member function `Convert`, callable with signature
+/// `(const ARROW_TYPE& t, const char* s, size_t length, value_type* out)`.
+/// `Convert` returns truthy for successful parses and assigns the parsed values to
+/// `*out`. Parameters required for parsing (for example a timestamp's TimeUnit)
+/// are acquired from the type parameter `t`.
+template <typename ARROW_TYPE, typename Enable = void>
+struct StringConverter;
+
+template <typename T>
+struct is_parseable {
+ template <typename U, typename = typename StringConverter<U>::value_type>
+ static std::true_type Test(U*);
+
+ template <typename U>
+ static std::false_type Test(...);
+
+ static constexpr bool value = decltype(Test<T>(NULLPTR))::value;
+};
+
+template <typename T, typename R = void>
+using enable_if_parseable = enable_if_t<is_parseable<T>::value, R>;
+
+template <>
+struct StringConverter<BooleanType> {
+ using value_type = bool;
+
+ static bool Convert(const BooleanType&, const char* s, size_t length, value_type* out) {
+ if (length == 1) {
+ // "0" or "1"?
+ if (s[0] == '0') {
+ *out = false;
+ return true;
+ }
+ if (s[0] == '1') {
+ *out = true;
+ return true;
+ }
+ return false;
+ }
+ if (length == 4) {
+ // "true"?
+ *out = true;
+ return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
+ (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
+ }
+ if (length == 5) {
+ // "false"?
+ *out = false;
+ return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
+ (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
+ (s[4] == 'e' || s[4] == 'E'));
+ }
+ return false;
+ }
+};
+
+// Ideas for faster float parsing:
+// - http://rapidjson.org/md_doc_internals.html#ParsingDouble
+// - https://github.com/google/double-conversion
+// - https://github.com/achan001/dtoa-fast
+// (the StringToFloat implementations in value_parsing.cc delegate to fast_float)
+
+ARROW_EXPORT
+bool StringToFloat(const char* s, size_t length, float* out);
+
+ARROW_EXPORT
+bool StringToFloat(const char* s, size_t length, double* out);
+
+template <>
+struct StringConverter<FloatType> {
+ using value_type = float;
+
+ static bool Convert(const FloatType&, const char* s, size_t length, value_type* out) {
+ return ARROW_PREDICT_TRUE(StringToFloat(s, length, out));
+ }
+};
+
+template <>
+struct StringConverter<DoubleType> {
+ using value_type = double;
+
+ static bool Convert(const DoubleType&, const char* s, size_t length, value_type* out) {
+ return ARROW_PREDICT_TRUE(StringToFloat(s, length, out));
+ }
+};
+
+// NOTE: HalfFloatType would require a half<->float conversion library
+
+inline uint8_t ParseDecimalDigit(char c) { return static_cast<uint8_t>(c - '0'); }
+
+#define PARSE_UNSIGNED_ITERATION(C_TYPE) \
+ if (length > 0) { \
+ uint8_t digit = ParseDecimalDigit(*s++); \
+ result = static_cast<C_TYPE>(result * 10U); \
+ length--; \
+ if (ARROW_PREDICT_FALSE(digit > 9U)) { \
+ /* Non-digit */ \
+ return false; \
+ } \
+ result = static_cast<C_TYPE>(result + digit); \
+ } else { \
+ break; \
+ }
+
+#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE) \
+ if (length > 0) { \
+ if (ARROW_PREDICT_FALSE(result > std::numeric_limits<C_TYPE>::max() / 10U)) { \
+ /* Overflow */ \
+ return false; \
+ } \
+ uint8_t digit = ParseDecimalDigit(*s++); \
+ result = static_cast<C_TYPE>(result * 10U); \
+ C_TYPE new_result = static_cast<C_TYPE>(result + digit); \
+ if (ARROW_PREDICT_FALSE(--length > 0)) { \
+ /* Too many digits */ \
+ return false; \
+ } \
+ if (ARROW_PREDICT_FALSE(digit > 9U)) { \
+ /* Non-digit */ \
+ return false; \
+ } \
+ if (ARROW_PREDICT_FALSE(new_result < result)) { \
+ /* Overflow */ \
+ return false; \
+ } \
+ result = new_result; \
+ }
+
+inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) {
+ uint8_t result = 0;
+
+ do {
+ PARSE_UNSIGNED_ITERATION(uint8_t);
+ PARSE_UNSIGNED_ITERATION(uint8_t);
+ PARSE_UNSIGNED_ITERATION_LAST(uint8_t);
+ } while (false);
+ *out = result;
+ return true;
+}
+
+inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) {
+ uint16_t result = 0;
+ do {
+ PARSE_UNSIGNED_ITERATION(uint16_t);
+ PARSE_UNSIGNED_ITERATION(uint16_t);
+ PARSE_UNSIGNED_ITERATION(uint16_t);
+ PARSE_UNSIGNED_ITERATION(uint16_t);
+ PARSE_UNSIGNED_ITERATION_LAST(uint16_t);
+ } while (false);
+ *out = result;
+ return true;
+}
+
+inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) {
+ uint32_t result = 0;
+ do {
+ PARSE_UNSIGNED_ITERATION(uint32_t);
+ PARSE_UNSIGNED_ITERATION(uint32_t);
+ PARSE_UNSIGNED_ITERATION(uint32_t);
+ PARSE_UNSIGNED_ITERATION(uint32_t);
+ PARSE_UNSIGNED_ITERATION(uint32_t);
+
+ PARSE_UNSIGNED_ITERATION(uint32_t);
+ PARSE_UNSIGNED_ITERATION(uint32_t);
+ PARSE_UNSIGNED_ITERATION(uint32_t);
+ PARSE_UNSIGNED_ITERATION(uint32_t);
+
+ PARSE_UNSIGNED_ITERATION_LAST(uint32_t);
+ } while (false);
+ *out = result;
+ return true;
+}
+
+inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {
+ uint64_t result = 0;
+ do {
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+ PARSE_UNSIGNED_ITERATION(uint64_t);
+
+ PARSE_UNSIGNED_ITERATION_LAST(uint64_t);
+ } while (false);
+ *out = result;
+ return true;
+}
+
+#undef PARSE_UNSIGNED_ITERATION
+#undef PARSE_UNSIGNED_ITERATION_LAST
+
+template <class ARROW_TYPE>
+struct StringToUnsignedIntConverterMixin {
+ using value_type = typename ARROW_TYPE::c_type;
+
+ static bool Convert(const ARROW_TYPE&, const char* s, size_t length, value_type* out) {
+ if (ARROW_PREDICT_FALSE(length == 0)) {
+ return false;
+ }
+ // Skip leading zeros
+ while (length > 0 && *s == '0') {
+ length--;
+ s++;
+ }
+ return ParseUnsigned(s, length, out);
+ }
+};
+
+template <>
+struct StringConverter<UInt8Type> : public StringToUnsignedIntConverterMixin<UInt8Type> {
+ using StringToUnsignedIntConverterMixin<UInt8Type>::StringToUnsignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<UInt16Type>
+ : public StringToUnsignedIntConverterMixin<UInt16Type> {
+ using StringToUnsignedIntConverterMixin<UInt16Type>::StringToUnsignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<UInt32Type>
+ : public StringToUnsignedIntConverterMixin<UInt32Type> {
+ using StringToUnsignedIntConverterMixin<UInt32Type>::StringToUnsignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<UInt64Type>
+ : public StringToUnsignedIntConverterMixin<UInt64Type> {
+ using StringToUnsignedIntConverterMixin<UInt64Type>::StringToUnsignedIntConverterMixin;
+};
+
+template <class ARROW_TYPE>
+struct StringToSignedIntConverterMixin {
+ using value_type = typename ARROW_TYPE::c_type;
+ using unsigned_type = typename std::make_unsigned<value_type>::type;
+
+ static bool Convert(const ARROW_TYPE&, const char* s, size_t length, value_type* out) {
+ static constexpr auto max_positive =
+ static_cast<unsigned_type>(std::numeric_limits<value_type>::max());
+ // Assuming two's complement
+ static constexpr unsigned_type max_negative = max_positive + 1;
+ bool negative = false;
+ unsigned_type unsigned_value = 0;
+
+ if (ARROW_PREDICT_FALSE(length == 0)) {
+ return false;
+ }
+ if (*s == '-') {
+ negative = true;
+ s++;
+ if (--length == 0) {
+ return false;
+ }
+ }
+ // Skip leading zeros
+ while (length > 0 && *s == '0') {
+ length--;
+ s++;
+ }
+ if (!ARROW_PREDICT_TRUE(ParseUnsigned(s, length, &unsigned_value))) {
+ return false;
+ }
+ if (negative) {
+ if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) {
+ return false;
+ }
+ // To avoid both compiler warnings (with unsigned negation)
+ // and undefined behaviour (with signed negation overflow),
+ // use the expanded formula for 2's complement negation.
+ *out = static_cast<value_type>(~unsigned_value + 1);
+ } else {
+ if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) {
+ return false;
+ }
+ *out = static_cast<value_type>(unsigned_value);
+ }
+ return true;
+ }
+};
+
+template <>
+struct StringConverter<Int8Type> : public StringToSignedIntConverterMixin<Int8Type> {
+ using StringToSignedIntConverterMixin<Int8Type>::StringToSignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<Int16Type> : public StringToSignedIntConverterMixin<Int16Type> {
+ using StringToSignedIntConverterMixin<Int16Type>::StringToSignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<Int32Type> : public StringToSignedIntConverterMixin<Int32Type> {
+ using StringToSignedIntConverterMixin<Int32Type>::StringToSignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64Type> {
+ using StringToSignedIntConverterMixin<Int64Type>::StringToSignedIntConverterMixin;
+};
+
+namespace detail {
+
+// Inline-able ISO-8601 parser
+
+using ts_type = TimestampType::c_type;
+
+template <typename Duration>
+static inline bool ParseYYYY_MM_DD(const char* s, Duration* since_epoch) {
+ uint16_t year = 0;
+ uint8_t month = 0;
+ uint8_t day = 0;
+ if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 4, &year))) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 5, 2, &month))) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 8, 2, &day))) {
+ return false;
+ }
+ arrow_vendored::date::year_month_day ymd{arrow_vendored::date::year{year},
+ arrow_vendored::date::month{month},
+ arrow_vendored::date::day{day}};
+ if (ARROW_PREDICT_FALSE(!ymd.ok())) return false;
+
+ *since_epoch = std::chrono::duration_cast<Duration>(
+ arrow_vendored::date::sys_days{ymd}.time_since_epoch());
+ return true;
+}
+
+template <typename Duration>
+static inline bool ParseHH(const char* s, Duration* out) {
+ uint8_t hours = 0;
+ if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(hours >= 24)) {
+ return false;
+ }
+ *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours));
+ return true;
+}
+
+template <typename Duration>
+static inline bool ParseHH_MM(const char* s, Duration* out) {
+ uint8_t hours = 0;
+ uint8_t minutes = 0;
+ if (ARROW_PREDICT_FALSE(s[2] != ':')) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(hours >= 24)) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+ return false;
+ }
+ *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
+ std::chrono::minutes(minutes));
+ return true;
+}
+
+template <typename Duration>
+static inline bool ParseHH_MM_SS(const char* s, Duration* out) {
+ uint8_t hours = 0;
+ uint8_t minutes = 0;
+ uint8_t seconds = 0;
+ if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 6, 2, &seconds))) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(hours >= 24)) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+ return false;
+ }
+ if (ARROW_PREDICT_FALSE(seconds >= 60)) {
+ return false;
+ }
+ *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
+ std::chrono::minutes(minutes) +
+ std::chrono::seconds(seconds));
+ return true;
+}
+
+static inline bool ParseSubSeconds(const char* s, size_t length, TimeUnit::type unit,
+ uint32_t* out) {
+ // The decimal point has been peeled off at this point
+
+ // Fail if number of decimal places provided exceeds what the unit can hold.
+ // Calculate how many trailing decimal places are omitted for the unit
+ // e.g. if 4 decimal places are provided and unit is MICRO, 2 are missing
+ size_t omitted = 0;
+ switch (unit) {
+ case TimeUnit::MILLI:
+ if (ARROW_PREDICT_FALSE(length > 3)) {
+ return false;
+ }
+ if (length < 3) {
+ omitted = 3 - length;
+ }
+ break;
+ case TimeUnit::MICRO:
+ if (ARROW_PREDICT_FALSE(length > 6)) {
+ return false;
+ }
+ if (length < 6) {
+ omitted = 6 - length;
+ }
+ break;
+ case TimeUnit::NANO:
+ if (ARROW_PREDICT_FALSE(length > 9)) {
+ return false;
+ }
+ if (length < 9) {
+ omitted = 9 - length;
+ }
+ break;
+ default:
+ return false;
+ }
+
+ if (ARROW_PREDICT_TRUE(omitted == 0)) {
+ return ParseUnsigned(s, length, out);
+ } else {
+ uint32_t subseconds;
+ bool success = ParseUnsigned(s, length, &subseconds);
+ if (ARROW_PREDICT_TRUE(success)) {
+ switch (omitted) {
+ case 1:
+ *out = subseconds * 10;
+ break;
+ case 2:
+ *out = subseconds * 100;
+ break;
+ case 3:
+ *out = subseconds * 1000;
+ break;
+ case 4:
+ *out = subseconds * 10000;
+ break;
+ case 5:
+ *out = subseconds * 100000;
+ break;
+ case 6:
+ *out = subseconds * 1000000;
+ break;
+ case 7:
+ *out = subseconds * 10000000;
+ break;
+ case 8:
+ *out = subseconds * 100000000;
+ break;
+ default:
+ // Impossible case
+ break;
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
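+
+// Illustrative scaling examples for ParseSubSeconds (unit == TimeUnit::MICRO):
+//   "123"     (3 digits) -> 123 * 1000 == 123000 microseconds
+//   "123456"  (6 digits) -> 123456 microseconds
+//   "1234567" (7 digits) -> rejected: too many digits for MICRO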
+
+} // namespace detail
+
+static inline bool ParseTimestampISO8601(const char* s, size_t length,
+ TimeUnit::type unit,
+ TimestampType::c_type* out) {
+ using seconds_type = std::chrono::duration<TimestampType::c_type>;
+
+ // We allow the following formats for all units:
+ // - "YYYY-MM-DD"
+ // - "YYYY-MM-DD[ T]hhZ?"
+ // - "YYYY-MM-DD[ T]hh:mmZ?"
+ // - "YYYY-MM-DD[ T]hh:mm:ssZ?"
+ //
+ // We allow the following formats for unit == MILLI, MICRO, or NANO:
+ // - "YYYY-MM-DD[ T]hh:mm:ss.s{1,3}Z?"
+ //
+ // We allow the following formats for unit == MICRO, or NANO:
+ // - "YYYY-MM-DD[ T]hh:mm:ss.s{4,6}Z?"
+ //
+ // We allow the following formats for unit == NANO:
+ // - "YYYY-MM-DD[ T]hh:mm:ss.s{7,9}Z?"
+ //
+ // UTC is always assumed, and the DataType's timezone is ignored.
+ //
+
+ if (ARROW_PREDICT_FALSE(length < 10)) return false;
+
+ seconds_type seconds_since_epoch;
+ if (ARROW_PREDICT_FALSE(!detail::ParseYYYY_MM_DD(s, &seconds_since_epoch))) {
+ return false;
+ }
+
+ if (length == 10) {
+ *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
+ return true;
+ }
+
+ if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
+ return false;
+ }
+
+ if (s[length - 1] == 'Z') {
+ --length;
+ }
+
+ seconds_type seconds_since_midnight;
+ switch (length) {
+ case 13: // YYYY-MM-DD[ T]hh
+ if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + 11, &seconds_since_midnight))) {
+ return false;
+ }
+ break;
+ case 16: // YYYY-MM-DD[ T]hh:mm
+ if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + 11, &seconds_since_midnight))) {
+ return false;
+ }
+ break;
+ case 19: // YYYY-MM-DD[ T]hh:mm:ss
+ case 21: // YYYY-MM-DD[ T]hh:mm:ss.s
+ case 22: // YYYY-MM-DD[ T]hh:mm:ss.ss
+ case 23: // YYYY-MM-DD[ T]hh:mm:ss.sss
+ case 24: // YYYY-MM-DD[ T]hh:mm:ss.ssss
+ case 25: // YYYY-MM-DD[ T]hh:mm:ss.sssss
+ case 26: // YYYY-MM-DD[ T]hh:mm:ss.ssssss
+ case 27: // YYYY-MM-DD[ T]hh:mm:ss.sssssss
+ case 28: // YYYY-MM-DD[ T]hh:mm:ss.ssssssss
+ case 29: // YYYY-MM-DD[ T]hh:mm:ss.sssssssss
+ if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s + 11, &seconds_since_midnight))) {
+ return false;
+ }
+ break;
+ default:
+ return false;
+ }
+
+ seconds_since_epoch += seconds_since_midnight;
+
+ if (length <= 19) {
+ *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
+ return true;
+ }
+
+ if (ARROW_PREDICT_FALSE(s[19] != '.')) {
+ return false;
+ }
+
+ uint32_t subseconds = 0;
+ if (ARROW_PREDICT_FALSE(
+ !detail::ParseSubSeconds(s + 20, length - 20, unit, &subseconds))) {
+ return false;
+ }
+
+ *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count()) + subseconds;
+ return true;
+}
+
+/// \brief Returns time since the UNIX epoch in the requested unit
+static inline bool ParseTimestampStrptime(const char* buf, size_t length,
+ const char* format, bool ignore_time_in_day,
+ bool allow_trailing_chars, TimeUnit::type unit,
+ int64_t* out) {
+ // NOTE: strptime() is more than 10x faster than arrow_vendored::date::parse().
+ // The buffer may not be nul-terminated
+ std::string clean_copy(buf, length);
+ struct tm result;
+ memset(&result, 0, sizeof(struct tm));
+#ifdef _WIN32
+ char* ret = arrow_strptime(clean_copy.c_str(), format, &result);
+#else
+ char* ret = strptime(clean_copy.c_str(), format, &result);
+#endif
+ if (ret == NULLPTR) {
+ return false;
+ }
+ if (!allow_trailing_chars && static_cast<size_t>(ret - clean_copy.c_str()) != length) {
+ return false;
+ }
+  // Compute the date part first; the time-in-day part is added below unless ignored
+ arrow_vendored::date::sys_seconds secs =
+ arrow_vendored::date::sys_days(arrow_vendored::date::year(result.tm_year + 1900) /
+ (result.tm_mon + 1) / result.tm_mday);
+ if (!ignore_time_in_day) {
+ secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) +
+ std::chrono::seconds(result.tm_sec));
+ }
+ *out = util::CastSecondsToUnit(unit, secs.time_since_epoch().count());
+ return true;
+}
+
+template <>
+struct StringConverter<TimestampType> {
+ using value_type = int64_t;
+
+ static bool Convert(const TimestampType& type, const char* s, size_t length,
+ value_type* out) {
+ return ParseTimestampISO8601(s, length, type.unit(), out);
+ }
+};
+
+template <>
+struct StringConverter<DurationType>
+ : public StringToSignedIntConverterMixin<DurationType> {
+ using StringToSignedIntConverterMixin<DurationType>::StringToSignedIntConverterMixin;
+};
+
+template <typename DATE_TYPE>
+struct StringConverter<DATE_TYPE, enable_if_date<DATE_TYPE>> {
+ using value_type = typename DATE_TYPE::c_type;
+
+ using duration_type =
+ typename std::conditional<std::is_same<DATE_TYPE, Date32Type>::value,
+ arrow_vendored::date::days,
+ std::chrono::milliseconds>::type;
+
+ static bool Convert(const DATE_TYPE& type, const char* s, size_t length,
+ value_type* out) {
+ if (length != 10) return false;
+
+ duration_type since_epoch;
+ if (ARROW_PREDICT_FALSE(!detail::ParseYYYY_MM_DD(s, &since_epoch))) {
+ return false;
+ }
+
+ *out = static_cast<value_type>(since_epoch.count());
+ return true;
+ }
+};
+
+template <typename TIME_TYPE>
+struct StringConverter<TIME_TYPE, enable_if_time<TIME_TYPE>> {
+ using value_type = typename TIME_TYPE::c_type;
+
+ static bool Convert(const TIME_TYPE& type, const char* s, size_t length,
+ value_type* out) {
+ if (length < 8) return false;
+ auto unit = type.unit();
+
+ std::chrono::seconds since_midnight;
+ if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s, &since_midnight))) {
+ return false;
+ }
+
+ *out = static_cast<value_type>(util::CastSecondsToUnit(unit, since_midnight.count()));
+
+    if (length == 8) {
+      return true;
+    }
+
+    if (ARROW_PREDICT_FALSE(s[8] != '.')) {
+      return false;
+    }
+
+ uint32_t subseconds_count = 0;
+ if (ARROW_PREDICT_FALSE(
+ !detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) {
+ return false;
+ }
+
+ *out += subseconds_count;
+ return true;
+ }
+};
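+
+// Usage sketch (illustrative names): "HH:MM:SS" parsed into seconds.
+//
+//   Time32Type time_type(TimeUnit::SECOND);
+//   int32_t seconds = 0;
+//   bool ok = ParseValue(time_type, "01:02:03", 8, &seconds);
+//   // ok == true; seconds == 3723 (1 * 3600 + 2 * 60 + 3)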
+
+/// \brief Convenience wrappers around internal::StringConverter.
+template <typename T>
+bool ParseValue(const T& type, const char* s, size_t length,
+ typename StringConverter<T>::value_type* out) {
+ return StringConverter<T>::Convert(type, s, length, out);
+}
+
+template <typename T>
+enable_if_parameter_free<T, bool> ParseValue(
+ const char* s, size_t length, typename StringConverter<T>::value_type* out) {
+ static T type;
+ return StringConverter<T>::Convert(type, s, length, out);
+}
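+
+// A minimal usage sketch (illustrative; "ts" and "value" are local names
+// invented for this comment, not part of the API):
+//
+//   TimestampType ts(TimeUnit::MILLI);
+//   int64_t value = 0;
+//   const char* input = "2021-03-04 12:34:56.789";
+//   bool ok = ParseValue(ts, input, strlen(input), &value);
+//   // ok == true; value holds milliseconds since the UNIX epoch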
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h
new file mode 100644
index 00000000000..b4b0d8f6f31
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <exception>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/type_traits.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief a std::variant-like discriminated union
+///
+/// Simplifications from std::variant:
+///
+/// - Strictly defaultable. The first type of T... should be nothrow default constructible
+/// and it will be used for default Variants.
+///
+/// - Never valueless_by_exception. std::variant supports a state outside those specified
+/// by T... to which it can return in the event that a constructor throws. If a Variant
+/// would become valueless_by_exception it will instead return to its default state.
+///
+/// - Strictly nothrow move constructible and assignable
+///
+/// - Less sophisticated type deduction. std::variant<bool, std::string>("hello") will
+/// intelligently construct std::string while Variant<bool, std::string>("hello") will
+/// construct bool.
+///
+/// - Either both copy constructible and assignable or neither (std::variant independently
+/// enables copy construction and copy assignment). Variant is copy constructible if
+/// each of T... is copy constructible and assignable.
+///
+/// - Slimmer interface; several members of std::variant are omitted.
+///
+/// - Throws no exceptions; if a bad_variant_access would be thrown Variant will instead
+/// segfault (nullptr dereference).
+///
+/// - Mutable visit takes a pointer instead of a mutable reference or rvalue
+///   reference, which conforms better to our code style.
+template <typename... T>
+class Variant;
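+
+// A brief usage sketch (illustrative names, not part of the API):
+//
+//   Variant<int, std::string> v;     // default-constructed: holds int{}
+//   v = std::string("hello");        // now holds the std::string alternative
+//   if (const std::string* s = v.get<std::string>()) {
+//     // *s == "hello"; v.get<int>() would return a null pointer here
+//   }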
+
+namespace detail {
+
+template <typename T, typename = void>
+struct is_equality_comparable : std::false_type {};
+
+template <typename T>
+struct is_equality_comparable<
+ T, typename std::enable_if<std::is_convertible<
+ decltype(std::declval<T>() == std::declval<T>()), bool>::value>::type>
+ : std::true_type {};
+
+template <bool C, typename T, typename E>
+using conditional_t = typename std::conditional<C, T, E>::type;
+
+template <typename T>
+struct type_constant {
+ using type = T;
+};
+
+template <typename...>
+struct first;
+
+template <typename H, typename... T>
+struct first<H, T...> {
+ using type = H;
+};
+
+template <typename T>
+using decay_t = typename std::decay<T>::type;
+
+template <bool...>
+struct all : std::true_type {};
+
+template <bool H, bool... T>
+struct all<H, T...> : conditional_t<H, all<T...>, std::false_type> {};
+
+struct delete_copy_constructor {
+ template <typename>
+ struct type {
+ type() = default;
+ type(const type& other) = delete;
+ type& operator=(const type& other) = delete;
+ };
+};
+
+struct explicit_copy_constructor {
+ template <typename Copyable>
+ struct type {
+ type() = default;
+ type(const type& other) { static_cast<const Copyable&>(other).copy_to(this); }
+ type& operator=(const type& other) {
+ static_cast<Copyable*>(this)->destroy();
+ static_cast<const Copyable&>(other).copy_to(this);
+ return *this;
+ }
+ };
+};
+
+template <typename... T>
+struct VariantStorage {
+ VariantStorage() = default;
+ VariantStorage(const VariantStorage&) {}
+ VariantStorage& operator=(const VariantStorage&) { return *this; }
+ VariantStorage(VariantStorage&&) noexcept {}
+ VariantStorage& operator=(VariantStorage&&) noexcept { return *this; }
+ ~VariantStorage() {
+ static_assert(offsetof(VariantStorage, data_) == 0,
+ "(void*)&VariantStorage::data_ == (void*)this");
+ }
+
+ typename arrow::internal::aligned_union<0, T...>::type data_;
+ uint8_t index_ = 0;
+};
+
+template <typename V, typename...>
+struct VariantImpl;
+
+template <typename... T>
+struct VariantImpl<Variant<T...>> : VariantStorage<T...> {
+ static void index_of() noexcept {}
+ void destroy() noexcept {}
+ void move_to(...) noexcept {}
+ void copy_to(...) const {}
+
+ template <typename R, typename Visitor>
+ [[noreturn]] R visit_const(Visitor&& /* visitor */) const {
+ std::terminate();
+ }
+ template <typename R, typename Visitor>
+ [[noreturn]] R visit_mutable(Visitor&& /* visitor */) {
+ std::terminate();
+ }
+};
+
+template <typename... M, typename H, typename... T>
+struct VariantImpl<Variant<M...>, H, T...> : VariantImpl<Variant<M...>, T...> {
+ using VariantType = Variant<M...>;
+ using Impl = VariantImpl<VariantType, T...>;
+
+ static constexpr uint8_t kIndex = sizeof...(M) - sizeof...(T) - 1;
+
+ VariantImpl() = default;
+
+ using VariantImpl<VariantType, T...>::VariantImpl;
+ using Impl::operator=;
+ using Impl::index_of;
+
+ explicit VariantImpl(H value) {
+ new (this) H(std::move(value));
+ this->index_ = kIndex;
+ }
+
+ VariantImpl& operator=(H value) {
+ static_cast<VariantType*>(this)->destroy();
+ new (this) H(std::move(value));
+ this->index_ = kIndex;
+ return *this;
+ }
+
+ H& cast_this() { return *reinterpret_cast<H*>(this); }
+ const H& cast_this() const { return *reinterpret_cast<const H*>(this); }
+
+ void move_to(VariantType* target) noexcept {
+ if (this->index_ == kIndex) {
+ new (target) H(std::move(cast_this()));
+ target->index_ = kIndex;
+ } else {
+ Impl::move_to(target);
+ }
+ }
+
+ // Templated to avoid instantiation in case H is not copy constructible
+ template <typename Void>
+ void copy_to(Void* generic_target) const {
+ const auto target = static_cast<VariantType*>(generic_target);
+ try {
+ if (this->index_ == kIndex) {
+ new (target) H(cast_this());
+ target->index_ = kIndex;
+ } else {
+ Impl::copy_to(target);
+ }
+ } catch (...) {
+ target->construct_default();
+ throw;
+ }
+ }
+
+ void destroy() noexcept {
+ if (this->index_ == kIndex) {
+ if (!std::is_trivially_destructible<H>::value) {
+ cast_this().~H();
+ }
+ } else {
+ Impl::destroy();
+ }
+ }
+
+ static constexpr std::integral_constant<uint8_t, kIndex> index_of(
+ const type_constant<H>&) {
+ return {};
+ }
+
+ template <typename R, typename Visitor>
+ R visit_const(Visitor&& visitor) const {
+ if (this->index_ == kIndex) {
+ return std::forward<Visitor>(visitor)(cast_this());
+ }
+ return Impl::template visit_const<R>(std::forward<Visitor>(visitor));
+ }
+
+ template <typename R, typename Visitor>
+ R visit_mutable(Visitor&& visitor) {
+ if (this->index_ == kIndex) {
+ return std::forward<Visitor>(visitor)(&cast_this());
+ }
+ return Impl::template visit_mutable<R>(std::forward<Visitor>(visitor));
+ }
+};
+
+} // namespace detail
+
+template <typename... T>
+class Variant : detail::VariantImpl<Variant<T...>, T...>,
+ detail::conditional_t<
+ detail::all<(std::is_copy_constructible<T>::value &&
+ std::is_copy_assignable<T>::value)...>::value,
+ detail::explicit_copy_constructor,
+ detail::delete_copy_constructor>::template type<Variant<T...>> {
+ template <typename U>
+ static constexpr uint8_t index_of() {
+ return Impl::index_of(detail::type_constant<U>{});
+ }
+
+ using Impl = detail::VariantImpl<Variant<T...>, T...>;
+
+ public:
+ using default_type = typename util::detail::first<T...>::type;
+
+ Variant() noexcept { construct_default(); }
+
+ Variant(const Variant& other) = default;
+ Variant& operator=(const Variant& other) = default;
+ Variant& operator=(Variant&& other) noexcept {
+ this->destroy();
+ other.move_to(this);
+ return *this;
+ }
+
+ using Impl::Impl;
+ using Impl::operator=;
+
+ Variant(Variant&& other) noexcept { other.move_to(this); }
+
+ ~Variant() {
+ static_assert(offsetof(Variant, data_) == 0, "(void*)&Variant::data_ == (void*)this");
+ this->destroy();
+ }
+
+ /// \brief Return the zero-based type index of the value held by the variant
+ uint8_t index() const noexcept { return this->index_; }
+
+ /// \brief Get a const pointer to the value held by the variant
+ ///
+ /// If the type given as template argument doesn't match, a null pointer is returned.
+ template <typename U, uint8_t I = index_of<U>()>
+ const U* get() const noexcept {
+ return index() == I ? reinterpret_cast<const U*>(this) : NULLPTR;
+ }
+
+ /// \brief Get a pointer to the value held by the variant
+ ///
+ /// If the type given as template argument doesn't match, a null pointer is returned.
+ template <typename U, uint8_t I = index_of<U>()>
+ U* get() noexcept {
+ return index() == I ? reinterpret_cast<U*>(this) : NULLPTR;
+ }
+
+ /// \brief Replace the value held by the variant
+ ///
+ /// The intended type must be given as a template argument.
+ /// The value is constructed in-place using the given function arguments.
+ template <typename U, typename... A, uint8_t I = index_of<U>()>
+ void emplace(A&&... args) try {
+ this->destroy();
+ new (this) U(std::forward<A>(args)...);
+ this->index_ = I;
+ } catch (...) {
+ construct_default();
+ throw;
+ }
+
+ template <typename U, typename E, typename... A, uint8_t I = index_of<U>()>
+ void emplace(std::initializer_list<E> il, A&&... args) try {
+ this->destroy();
+ new (this) U(il, std::forward<A>(args)...);
+ this->index_ = I;
+ } catch (...) {
+ construct_default();
+ throw;
+ }
+
+ /// \brief Swap with another variant's contents
+ void swap(Variant& other) noexcept { // NOLINT google-runtime-references
+ Variant tmp = std::move(other);
+ other = std::move(*this);
+ *this = std::move(tmp);
+ }
+
+ using Impl::visit_const;
+ using Impl::visit_mutable;
+
+ private:
+ void construct_default() noexcept {
+ new (this) default_type();
+ this->index_ = 0;
+ }
+
+ template <typename V>
+ friend struct detail::explicit_copy_constructor::type;
+
+ template <typename V, typename...>
+ friend struct detail::VariantImpl;
+};
+
+/// \brief Call polymorphic visitor on a const variant's value
+///
+/// The visitor will receive a const reference to the value held by the variant.
+/// It must define overloads for each possible variant type.
+/// The overloads should all return the same type (no attempt
+/// is made to find a generalized return type).
+template <typename Visitor, typename... T,
+ typename R = decltype(std::declval<Visitor&&>()(
+ std::declval<const typename Variant<T...>::default_type&>()))>
+R visit(Visitor&& visitor, const util::Variant<T...>& v) {
+ return v.template visit_const<R>(std::forward<Visitor>(visitor));
+}
+
+/// \brief Call polymorphic visitor on a non-const variant's value
+///
+/// The visitor will receive a pointer to the value held by the variant.
+/// It must define overloads for each possible variant type.
+/// The overloads should all return the same type (no attempt
+/// is made to find a generalized return type).
+template <typename Visitor, typename... T,
+ typename R = decltype(std::declval<Visitor&&>()(
+ std::declval<typename Variant<T...>::default_type*>()))>
+R visit(Visitor&& visitor, util::Variant<T...>* v) {
+ return v->template visit_mutable<R>(std::forward<Visitor>(visitor));
+}
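+
+// Visitor sketch (illustrative): the visitor must provide an overload for
+// every alternative, and all overloads must return the same type.
+//
+//   struct Lengths {
+//     size_t operator()(const int&) const { return sizeof(int); }
+//     size_t operator()(const std::string& s) const { return s.size(); }
+//   };
+//   Variant<int, std::string> v{std::string("abc")};
+//   size_t n = visit(Lengths{}, v);  // n == 3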
+
+/// \brief Get a const reference to the value held by the variant
+///
+/// If the type given as template argument doesn't match, behavior is undefined
+/// (a null pointer will be dereferenced).
+template <typename U, typename... T>
+const U& get(const Variant<T...>& v) {
+ return *v.template get<U>();
+}
+
+/// \brief Get a reference to the value held by the variant
+///
+/// If the type given as template argument doesn't match, behavior is undefined
+/// (a null pointer will be dereferenced).
+template <typename U, typename... T>
+U& get(Variant<T...>& v) {
+ return *v.template get<U>();
+}
+
+/// \brief Get a const pointer to the value held by the variant
+///
+/// If the type given as template argument doesn't match, a null pointer is returned.
+template <typename U, typename... T>
+const U* get_if(const Variant<T...>* v) {
+ return v->template get<U>();
+}
+
+/// \brief Get a pointer to the value held by the variant
+///
+/// If the type given as template argument doesn't match, a null pointer is returned.
+template <typename U, typename... T>
+U* get_if(Variant<T...>* v) {
+ return v->template get<U>();
+}
+
+namespace detail {
+
+template <typename... T>
+struct VariantsEqual {
+ template <typename U>
+ bool operator()(const U& r) const {
+ return get<U>(l_) == r;
+ }
+ const Variant<T...>& l_;
+};
+
+} // namespace detail
+
+template <typename... T, typename = typename std::enable_if<detail::all<
+ detail::is_equality_comparable<T>::value...>::value>>
+bool operator==(const Variant<T...>& l, const Variant<T...>& r) {
+ if (l.index() != r.index()) return false;
+ return visit(detail::VariantsEqual<T...>{l}, r);
+}
+
+template <typename... T>
+auto operator!=(const Variant<T...>& l, const Variant<T...>& r) -> decltype(l == r) {
+ return !(l == r);
+}
+
+/// \brief Return whether the variant holds a value of the given type
+template <typename U, typename... T>
+bool holds_alternative(const Variant<T...>& v) {
+ return v.template get<U>();
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h
new file mode 100644
index 00000000000..041bdb424a7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h
@@ -0,0 +1,172 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/util/algorithm.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace internal {
+
+template <typename T>
+std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t index) {
+ DCHECK(!values.empty());
+ DCHECK_LT(index, values.size());
+ std::vector<T> out;
+ out.reserve(values.size() - 1);
+ for (size_t i = 0; i < index; ++i) {
+ out.push_back(values[i]);
+ }
+ for (size_t i = index + 1; i < values.size(); ++i) {
+ out.push_back(values[i]);
+ }
+ return out;
+}
+
+template <typename T>
+std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index,
+ T new_element) {
+ DCHECK_LE(index, values.size());
+ std::vector<T> out;
+ out.reserve(values.size() + 1);
+ for (size_t i = 0; i < index; ++i) {
+ out.push_back(values[i]);
+ }
+ out.emplace_back(std::move(new_element));
+ for (size_t i = index; i < values.size(); ++i) {
+ out.push_back(values[i]);
+ }
+ return out;
+}
+
+template <typename T>
+std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
+ T new_element) {
+ DCHECK_LE(index, values.size());
+ std::vector<T> out;
+ out.reserve(values.size());
+ for (size_t i = 0; i < index; ++i) {
+ out.push_back(values[i]);
+ }
+ out.emplace_back(std::move(new_element));
+ for (size_t i = index + 1; i < values.size(); ++i) {
+ out.push_back(values[i]);
+ }
+ return out;
+}
+
+template <typename T, typename Predicate>
+std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
+ auto new_end =
+ std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
+ values.erase(new_end, values.end());
+ return values;
+}
+
+template <typename Fn, typename From,
+ typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
+std::vector<To> MapVector(Fn&& map, const std::vector<From>& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ std::transform(source.begin(), source.end(), std::back_inserter(out),
+ std::forward<Fn>(map));
+ return out;
+}
+
+template <typename Fn, typename From,
+ typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
+std::vector<To> MapVector(Fn&& map, std::vector<From>&& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ std::transform(std::make_move_iterator(source.begin()),
+ std::make_move_iterator(source.end()), std::back_inserter(out),
+ std::forward<Fn>(map));
+ return out;
+}
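+
+// Usage sketch (illustrative names):
+//
+//   std::vector<int> xs = {1, 2, 3};
+//   auto doubled = MapVector([](int x) { return x * 2; }, xs);  // {2, 4, 6}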
+
+/// \brief Like MapVector, but where the function can fail.
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+ typename To = typename internal::call_traits::return_type<Fn>::ValueType>
+Result<std::vector<To>> MaybeMapVector(Fn&& map, const std::vector<From>& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ ARROW_RETURN_NOT_OK(MaybeTransform(source.begin(), source.end(),
+ std::back_inserter(out), std::forward<Fn>(map)));
+ return std::move(out);
+}
+
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+ typename To = typename internal::call_traits::return_type<Fn>::ValueType>
+Result<std::vector<To>> MaybeMapVector(Fn&& map, std::vector<From>&& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ ARROW_RETURN_NOT_OK(MaybeTransform(std::make_move_iterator(source.begin()),
+ std::make_move_iterator(source.end()),
+ std::back_inserter(out), std::forward<Fn>(map)));
+ return std::move(out);
+}
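+
+// Usage sketch (illustrative names): the first failing element aborts the
+// whole mapping and its Status is returned.
+//
+//   auto checked_negate = [](int x) -> Result<int> {
+//     if (x == std::numeric_limits<int>::min()) return Status::Invalid("overflow");
+//     return -x;
+//   };
+//   ARROW_ASSIGN_OR_RAISE(auto negated, MaybeMapVector(checked_negate, values));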
+
+template <typename T>
+std::vector<T> FlattenVectors(const std::vector<std::vector<T>>& vecs) {
+ std::size_t sum = 0;
+ for (const auto& vec : vecs) {
+ sum += vec.size();
+ }
+ std::vector<T> out;
+ out.reserve(sum);
+ for (const auto& vec : vecs) {
+ out.insert(out.end(), vec.begin(), vec.end());
+ }
+ return out;
+}
+
+template <typename T>
+Result<std::vector<T>> UnwrapOrRaise(std::vector<Result<T>>&& results) {
+ std::vector<T> out;
+ out.reserve(results.size());
+ auto end = std::make_move_iterator(results.end());
+ for (auto it = std::make_move_iterator(results.begin()); it != end; it++) {
+ if (!it->ok()) {
+ return it->status();
+ }
+ out.push_back(it->MoveValueUnsafe());
+ }
+ return std::move(out);
+}
+
+template <typename T>
+Result<std::vector<T>> UnwrapOrRaise(const std::vector<Result<T>>& results) {
+ std::vector<T> out;
+ out.reserve(results.size());
+ for (const auto& result : results) {
+ if (!result.ok()) {
+ return result.status();
+ }
+ out.push_back(result.ValueUnsafe());
+ }
+ return std::move(out);
+}
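+
+// Usage sketch (illustrative names): collapse a std::vector<Result<T>> into a
+// Result<std::vector<T>>, surfacing the first error encountered.
+//
+//   std::vector<Result<int>> maybe_values = /* ... */;
+//   ARROW_ASSIGN_OR_RAISE(std::vector<int> values,
+//                         UnwrapOrRaise(std::move(maybe_values)));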
+
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/visibility.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/visibility.h
new file mode 100644
index 00000000000..dd9ac45e9bb
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/visibility.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(_MSC_VER)
+#pragma warning(disable : 4251)
+#else
+#pragma GCC diagnostic ignored "-Wattributes"
+#endif
+
+#ifdef ARROW_STATIC
+#define ARROW_EXPORT
+#elif defined(ARROW_EXPORTING)
+#define ARROW_EXPORT __declspec(dllexport)
+#else
+#define ARROW_EXPORT __declspec(dllimport)
+#endif
+
+#define ARROW_NO_EXPORT
+#define ARROW_FORCE_INLINE __forceinline
+#else // Not Windows
+#ifndef ARROW_EXPORT
+#define ARROW_EXPORT __attribute__((visibility("default")))
+#endif
+#ifndef ARROW_NO_EXPORT
+#define ARROW_NO_EXPORT __attribute__((visibility("hidden")))
+#endif
+#define ARROW_FORCE_INLINE
+#endif // Non-Windows
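+
+// Usage sketch (illustrative names): annotate public symbols with ARROW_EXPORT
+// so they are visible from the shared library, and internal ones with
+// ARROW_NO_EXPORT to keep them hidden.
+//
+//   class ARROW_EXPORT MyPublicType { /* ... */ };
+//   ARROW_NO_EXPORT void InternalHelper();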
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_compatibility.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_compatibility.h
new file mode 100644
index 00000000000..64a2772c41c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_compatibility.h
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef _WIN32
+
+// Windows defines min and max macros that mess up std::min/max
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+
+#define WIN32_LEAN_AND_MEAN
+
+// Set Windows 7 as a conservative minimum for Apache Arrow
+#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x601
+#undef _WIN32_WINNT
+#endif
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x601
+#endif
+
+#include <winsock2.h>
+#include <windows.h>
+
+#include "arrow/util/windows_fixup.h"
+
+#endif // _WIN32
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h
new file mode 100644
index 00000000000..2949ac4ab76
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This header needs to be included multiple times.
+
+#ifdef _WIN32
+
+#ifdef max
+#undef max
+#endif
+#ifdef min
+#undef min
+#endif
+
+// The Windows API defines macros from *File resolving to either
+// *FileA or *FileW. Need to undo them.
+#ifdef CopyFile
+#undef CopyFile
+#endif
+#ifdef CreateFile
+#undef CreateFile
+#endif
+#ifdef DeleteFile
+#undef DeleteFile
+#endif
+
+// Other annoying Windows macro definitions...
+#ifdef IN
+#undef IN
+#endif
+#ifdef OUT
+#undef OUT
+#endif
+
+// Note that we can't undefine OPTIONAL, because it can be used in other
+// Windows headers...
+
+#endif // _WIN32
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h
new file mode 100644
index 00000000000..0b7cfa1cb16
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h
@@ -0,0 +1,217 @@
+// Vendored from git tag v2021.02.15.00
+
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// @author Bo Hu ([email protected])
+// @author Jordan DeLong ([email protected])
+
+// This file has been modified as part of Apache Arrow to conform to
+// Apache Arrow's coding conventions
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+namespace arrow_vendored {
+namespace folly {
+
+// Vendored from folly/Portability.h
+namespace {
+#if defined(__arm__)
+#define FOLLY_ARM 1
+#else
+#define FOLLY_ARM 0
+#endif
+
+#if defined(__s390x__)
+#define FOLLY_S390X 1
+#else
+#define FOLLY_S390X 0
+#endif
+
+constexpr bool kIsArchArm = FOLLY_ARM == 1;
+constexpr bool kIsArchS390X = FOLLY_S390X == 1;
+} // namespace
+
+// Vendored from folly/lang/Align.h
+namespace {
+
+constexpr std::size_t hardware_destructive_interference_size =
+ (kIsArchArm || kIsArchS390X) ? 64 : 128;
+
+} // namespace
+
+/*
+ * ProducerConsumerQueue is a one producer and one consumer queue
+ * without locks.
+ */
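+//
+// Usage sketch (illustrative): exactly one thread may call Write() and exactly
+// one other thread may call Read()/FrontPtr()/PopFront().
+//
+//   ProducerConsumerQueue<int> queue(1024);  // 1023 usable slots
+//   // producer thread:
+//   while (!queue.Write(42)) { /* full: retry or back off */ }
+//   // consumer thread:
+//   int value;
+//   while (!queue.Read(value)) { /* empty: retry or back off */ }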
+template <class T>
+struct ProducerConsumerQueue {
+ typedef T value_type;
+
+ ProducerConsumerQueue(const ProducerConsumerQueue&) = delete;
+ ProducerConsumerQueue& operator=(const ProducerConsumerQueue&) = delete;
+
+ // size must be >= 2.
+ //
+ // Also, note that the number of usable slots in the queue at any
+ // given time is actually (size-1), so if you start with an empty queue,
+ // IsFull() will return true after size-1 insertions.
+ explicit ProducerConsumerQueue(uint32_t size)
+ : size_(size),
+ records_(static_cast<T*>(std::malloc(sizeof(T) * size))),
+ readIndex_(0),
+ writeIndex_(0) {
+ assert(size >= 2);
+ if (!records_) {
+ throw std::bad_alloc();
+ }
+ }
+
+ ~ProducerConsumerQueue() {
+ // We need to destruct anything that may still exist in our queue.
+ // (No real synchronization needed at destructor time: only one
+ // thread can be doing this.)
+ if (!std::is_trivially_destructible<T>::value) {
+ size_t readIndex = readIndex_;
+ size_t endIndex = writeIndex_;
+ while (readIndex != endIndex) {
+ records_[readIndex].~T();
+ if (++readIndex == size_) {
+ readIndex = 0;
+ }
+ }
+ }
+
+ std::free(records_);
+ }
+
+ template <class... Args>
+ bool Write(Args&&... recordArgs) {
+ auto const currentWrite = writeIndex_.load(std::memory_order_relaxed);
+ auto nextRecord = currentWrite + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
+ new (&records_[currentWrite]) T(std::forward<Args>(recordArgs)...);
+ writeIndex_.store(nextRecord, std::memory_order_release);
+ return true;
+ }
+
+ // queue is full
+ return false;
+ }
+
+ // move the value at the front of the queue to given variable
+ bool Read(T& record) {
+ auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+ if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
+ // queue is empty
+ return false;
+ }
+
+ auto nextRecord = currentRead + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ record = std::move(records_[currentRead]);
+ records_[currentRead].~T();
+ readIndex_.store(nextRecord, std::memory_order_release);
+ return true;
+ }
+
+ // pointer to the value at the front of the queue (for use in-place) or
+ // nullptr if empty.
+ T* FrontPtr() {
+ auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+ if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
+ // queue is empty
+ return nullptr;
+ }
+ return &records_[currentRead];
+ }
+
+ // queue must not be empty
+ void PopFront() {
+ auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+ assert(currentRead != writeIndex_.load(std::memory_order_acquire));
+
+ auto nextRecord = currentRead + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ records_[currentRead].~T();
+ readIndex_.store(nextRecord, std::memory_order_release);
+ }
+
+ bool IsEmpty() const {
+ return readIndex_.load(std::memory_order_acquire) ==
+ writeIndex_.load(std::memory_order_acquire);
+ }
+
+ bool IsFull() const {
+ auto nextRecord = writeIndex_.load(std::memory_order_acquire) + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
+ return false;
+ }
+ // queue is full
+ return true;
+ }
+
+ // * If called by consumer, then true size may be more (because producer may
+ // be adding items concurrently).
+ // * If called by producer, then true size may be less (because consumer may
+ // be removing items concurrently).
+ // * It is undefined to call this from any other thread.
+ size_t SizeGuess() const {
+ int ret = writeIndex_.load(std::memory_order_acquire) -
+ readIndex_.load(std::memory_order_acquire);
+ if (ret < 0) {
+ ret += size_;
+ }
+ return ret;
+ }
+
+ // maximum number of items in the queue.
+ size_t capacity() const { return size_ - 1; }
+
+ private:
+ using AtomicIndex = std::atomic<unsigned int>;
+
+ char pad0_[hardware_destructive_interference_size];
+ const uint32_t size_;
+ T* const records_;
+
+ AtomicIndex readIndex_;
+ char pad1_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
+ AtomicIndex writeIndex_;
+
+ char pad2_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
+};
+
+} // namespace folly
+} // namespace arrow_vendored
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/base64.cpp b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/base64.cpp
new file mode 100644
index 00000000000..50ece19455e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/base64.cpp
@@ -0,0 +1,128 @@
+/*
+ base64.cpp and base64.h
+
+ base64 encoding and decoding with C++.
+
+ Version: 1.01.00
+
+ Copyright (C) 2004-2017 René Nyffenegger
+
+ This source code is provided 'as-is', without any express or implied
+ warranty. In no event will the author be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this source code must not be misrepresented; you must not
+ claim that you wrote the original source code. If you use this source code
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original source code.
+
+ 3. This notice may not be removed or altered from any source distribution.
+
+ René Nyffenegger [email protected]
+
+*/
+
+#include "arrow/util/base64.h"
+#include <iostream>
+
+namespace arrow {
+namespace util {
+
+static const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+static inline bool is_base64(unsigned char c) {
+ return (isalnum(c) || (c == '+') || (c == '/'));
+}
+
+std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) {
+ std::string ret;
+ int i = 0;
+ int j = 0;
+ unsigned char char_array_3[3];
+ unsigned char char_array_4[4];
+
+ while (in_len--) {
+ char_array_3[i++] = *(bytes_to_encode++);
+ if (i == 3) {
+ char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
+ char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
+ char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
+ char_array_4[3] = char_array_3[2] & 0x3f;
+
+      for (i = 0; i < 4; i++) {
+        ret += base64_chars[char_array_4[i]];
+      }
+      i = 0;
+    }
+ }
+
+  if (i) {
+    for (j = i; j < 3; j++) {
+      char_array_3[j] = '\0';
+    }
+
+    char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
+    char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
+    char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
+
+    for (j = 0; j < i + 1; j++) {
+      ret += base64_chars[char_array_4[j]];
+    }
+
+    while (i++ < 3) {
+      ret += '=';
+    }
+  }
+
+  return ret;
+}
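+
+// Usage sketch (illustrative): round-trip a short string.
+//
+//   std::string encoded = base64_encode(
+//       reinterpret_cast<const unsigned char*>("any"), 3);  // "YW55"
+//   std::string decoded = base64_decode(encoded);           // "any"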
+
+std::string base64_decode(std::string const& encoded_string) {
+ size_t in_len = encoded_string.size();
+ int i = 0;
+ int j = 0;
+ int in_ = 0;
+ unsigned char char_array_4[4], char_array_3[3];
+ std::string ret;
+
+  while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
+    char_array_4[i++] = encoded_string[in_];
+    in_++;
+    if (i == 4) {
+      for (i = 0; i < 4; i++) {
+        char_array_4[i] = base64_chars.find(char_array_4[i]) & 0xff;
+      }
+
+      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+      for (i = 0; i < 3; i++) {
+        ret += char_array_3[i];
+      }
+      i = 0;
+    }
+  }
+
+  if (i) {
+    for (j = 0; j < i; j++) {
+      char_array_4[j] = base64_chars.find(char_array_4[j]) & 0xff;
+    }
+
+    char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+
+    for (j = 0; j < i - 1; j++) {
+      ret += char_array_3[j];
+    }
+  }
+
+ return ret;
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime.h
new file mode 100644
index 00000000000..e437cdcbc2d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime.h
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/vendored/datetime/date.h" // IWYU pragma: export
+#include "arrow/vendored/datetime/tz.h" // IWYU pragma: export
+
+// Can be defined by date.h.
+#ifdef NOEXCEPT
+#undef NOEXCEPT
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/README.md b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/README.md
new file mode 100644
index 00000000000..811b6935ff2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/README.md
@@ -0,0 +1,21 @@
+<!--
+The MIT License (MIT)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+-->
+
+# Utilities for supporting date time functions
+
+Sources for datetime are adapted from Howard Hinnant's date library
+(https://github.com/HowardHinnant/date).
+
+Sources are taken from the v3.0.0 release of the above project.
+
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/date.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/date.h
new file mode 100644
index 00000000000..6d0455a354b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/date.h
@@ -0,0 +1,7949 @@
+#ifndef DATE_H
+#define DATE_H
+
+// The MIT License (MIT)
+//
+// Copyright (c) 2015, 2016, 2017 Howard Hinnant
+// Copyright (c) 2016 Adrian Colomitchi
+// Copyright (c) 2017 Florian Dang
+// Copyright (c) 2017 Paul Thompson
+// Copyright (c) 2018, 2019 Tomasz Kamiński
+// Copyright (c) 2019 Jiangang Zhuang
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// Our apologies. When the previous paragraph was written, lowercase had not yet
+// been invented (that would involve another several millennia of evolution).
+// We did not mean to shout.
+
+#ifndef HAS_STRING_VIEW
+# if __cplusplus >= 201703 || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+# define HAS_STRING_VIEW 1
+# else
+# define HAS_STRING_VIEW 0
+# endif
+#endif // HAS_STRING_VIEW
+
+#include <cassert>
+#include <algorithm>
+#include <cctype>
+#include <chrono>
+#include <climits>
+#if !(__cplusplus >= 201402)
+# include <cmath>
+#endif
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+#include <ios>
+#include <istream>
+#include <iterator>
+#include <limits>
+#include <locale>
+#include <memory>
+#include <ostream>
+#include <ratio>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#if HAS_STRING_VIEW
+# include <string_view>
+#endif
+#include <utility>
+#include <type_traits>
+
+#ifdef __GNUC__
+# pragma GCC diagnostic push
+# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7)
+# pragma GCC diagnostic ignored "-Wpedantic"
+# endif
+# if __GNUC__ < 5
+ // GCC 4.9 Bug 61489 Wrong warning with -Wmissing-field-initializers
+# pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+# endif
+#endif
+
+#ifdef _MSC_VER
+# pragma warning(push)
+// warning C4127: conditional expression is constant
+# pragma warning(disable : 4127)
+#endif
+
+namespace arrow_vendored
+{
+namespace date
+{
+
+//---------------+
+// Configuration |
+//---------------+
+
+#ifndef ONLY_C_LOCALE
+# define ONLY_C_LOCALE 0
+#endif
+
+#if defined(_MSC_VER) && (!defined(__clang__) || (_MSC_VER < 1910))
+// MSVC
+# ifndef _SILENCE_CXX17_UNCAUGHT_EXCEPTION_DEPRECATION_WARNING
+# define _SILENCE_CXX17_UNCAUGHT_EXCEPTION_DEPRECATION_WARNING
+# endif
+# if _MSC_VER < 1910
+// before VS2017
+# define CONSTDATA const
+# define CONSTCD11
+# define CONSTCD14
+# define NOEXCEPT _NOEXCEPT
+# else
+// VS2017 and later
+# define CONSTDATA constexpr const
+# define CONSTCD11 constexpr
+# define CONSTCD14 constexpr
+# define NOEXCEPT noexcept
+# endif
+
+#elif defined(__SUNPRO_CC) && __SUNPRO_CC <= 0x5150
+// Oracle Developer Studio 12.6 and earlier
+# define CONSTDATA constexpr const
+# define CONSTCD11 constexpr
+# define CONSTCD14
+# define NOEXCEPT noexcept
+
+#elif __cplusplus >= 201402
+// C++14
+# define CONSTDATA constexpr const
+# define CONSTCD11 constexpr
+# define CONSTCD14 constexpr
+# define NOEXCEPT noexcept
+#else
+// C++11
+# define CONSTDATA constexpr const
+# define CONSTCD11 constexpr
+# define CONSTCD14
+# define NOEXCEPT noexcept
+#endif
+
+#ifndef HAS_UNCAUGHT_EXCEPTIONS
+# if __cplusplus > 201703 || (defined(_MSVC_LANG) && _MSVC_LANG > 201703L)
+# define HAS_UNCAUGHT_EXCEPTIONS 1
+# else
+# define HAS_UNCAUGHT_EXCEPTIONS 0
+# endif
+#endif // HAS_UNCAUGHT_EXCEPTIONS
+
+#ifndef HAS_VOID_T
+# if __cplusplus >= 201703 || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+# define HAS_VOID_T 1
+# else
+# define HAS_VOID_T 0
+# endif
+#endif // HAS_VOID_T
+
+// Protect from Oracle sun macro
+#ifdef sun
+# undef sun
+#endif
+
+// Work around for a NVCC compiler bug which causes it to fail
+// to compile std::ratio_{multiply,divide} when used directly
+// in the std::chrono::duration template instantiations below
+namespace detail {
+template <typename R1, typename R2>
+using ratio_multiply = decltype(std::ratio_multiply<R1, R2>{});
+
+template <typename R1, typename R2>
+using ratio_divide = decltype(std::ratio_divide<R1, R2>{});
+} // namespace detail
+
+//-----------+
+// Interface |
+//-----------+
+
+// durations
+
+using days = std::chrono::duration
+ <int, detail::ratio_multiply<std::ratio<24>, std::chrono::hours::period>>;
+
+using weeks = std::chrono::duration
+ <int, detail::ratio_multiply<std::ratio<7>, days::period>>;
+
+using years = std::chrono::duration
+ <int, detail::ratio_multiply<std::ratio<146097, 400>, days::period>>;
+
+using months = std::chrono::duration
+ <int, detail::ratio_divide<years::period, std::ratio<12>>>;
+
+// time_point
+
+template <class Duration>
+ using sys_time = std::chrono::time_point<std::chrono::system_clock, Duration>;
+
+using sys_days = sys_time<days>;
+using sys_seconds = sys_time<std::chrono::seconds>;
+
+struct local_t {};
+
+template <class Duration>
+ using local_time = std::chrono::time_point<local_t, Duration>;
+
+using local_seconds = local_time<std::chrono::seconds>;
+using local_days = local_time<days>;
+
+// types
+
+struct last_spec
+{
+ explicit last_spec() = default;
+};
+
+class day;
+class month;
+class year;
+
+class weekday;
+class weekday_indexed;
+class weekday_last;
+
+class month_day;
+class month_day_last;
+class month_weekday;
+class month_weekday_last;
+
+class year_month;
+
+class year_month_day;
+class year_month_day_last;
+class year_month_weekday;
+class year_month_weekday_last;
+
+// date composition operators
+
+CONSTCD11 year_month operator/(const year& y, const month& m) NOEXCEPT;
+CONSTCD11 year_month operator/(const year& y, int m) NOEXCEPT;
+
+CONSTCD11 month_day operator/(const day& d, const month& m) NOEXCEPT;
+CONSTCD11 month_day operator/(const day& d, int m) NOEXCEPT;
+CONSTCD11 month_day operator/(const month& m, const day& d) NOEXCEPT;
+CONSTCD11 month_day operator/(const month& m, int d) NOEXCEPT;
+CONSTCD11 month_day operator/(int m, const day& d) NOEXCEPT;
+
+CONSTCD11 month_day_last operator/(const month& m, last_spec) NOEXCEPT;
+CONSTCD11 month_day_last operator/(int m, last_spec) NOEXCEPT;
+CONSTCD11 month_day_last operator/(last_spec, const month& m) NOEXCEPT;
+CONSTCD11 month_day_last operator/(last_spec, int m) NOEXCEPT;
+
+CONSTCD11 month_weekday operator/(const month& m, const weekday_indexed& wdi) NOEXCEPT;
+CONSTCD11 month_weekday operator/(int m, const weekday_indexed& wdi) NOEXCEPT;
+CONSTCD11 month_weekday operator/(const weekday_indexed& wdi, const month& m) NOEXCEPT;
+CONSTCD11 month_weekday operator/(const weekday_indexed& wdi, int m) NOEXCEPT;
+
+CONSTCD11 month_weekday_last operator/(const month& m, const weekday_last& wdl) NOEXCEPT;
+CONSTCD11 month_weekday_last operator/(int m, const weekday_last& wdl) NOEXCEPT;
+CONSTCD11 month_weekday_last operator/(const weekday_last& wdl, const month& m) NOEXCEPT;
+CONSTCD11 month_weekday_last operator/(const weekday_last& wdl, int m) NOEXCEPT;
+
+CONSTCD11 year_month_day operator/(const year_month& ym, const day& d) NOEXCEPT;
+CONSTCD11 year_month_day operator/(const year_month& ym, int d) NOEXCEPT;
+CONSTCD11 year_month_day operator/(const year& y, const month_day& md) NOEXCEPT;
+CONSTCD11 year_month_day operator/(int y, const month_day& md) NOEXCEPT;
+CONSTCD11 year_month_day operator/(const month_day& md, const year& y) NOEXCEPT;
+CONSTCD11 year_month_day operator/(const month_day& md, int y) NOEXCEPT;
+
+CONSTCD11
+ year_month_day_last operator/(const year_month& ym, last_spec) NOEXCEPT;
+CONSTCD11
+ year_month_day_last operator/(const year& y, const month_day_last& mdl) NOEXCEPT;
+CONSTCD11
+ year_month_day_last operator/(int y, const month_day_last& mdl) NOEXCEPT;
+CONSTCD11
+ year_month_day_last operator/(const month_day_last& mdl, const year& y) NOEXCEPT;
+CONSTCD11
+ year_month_day_last operator/(const month_day_last& mdl, int y) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator/(const year_month& ym, const weekday_indexed& wdi) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator/(const year& y, const month_weekday& mwd) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator/(int y, const month_weekday& mwd) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator/(const month_weekday& mwd, const year& y) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator/(const month_weekday& mwd, int y) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator/(const year_month& ym, const weekday_last& wdl) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator/(const year& y, const month_weekday_last& mwdl) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator/(int y, const month_weekday_last& mwdl) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator/(const month_weekday_last& mwdl, const year& y) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator/(const month_weekday_last& mwdl, int y) NOEXCEPT;
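+
+// Composition sketch (illustrative): with these operators a calendar date can
+// be spelled fluently; `last` is the last_spec constant defined further down
+// in this header.
+//
+//   constexpr auto ymd = year{2021}/3/4;     // year_month_day
+//   constexpr auto eom = year{2021}/2/last;  // year_month_day_last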
+
+// Detailed interface
+
+// day
+
+class day
+{
+ unsigned char d_;
+
+public:
+ day() = default;
+ explicit CONSTCD11 day(unsigned d) NOEXCEPT;
+
+ CONSTCD14 day& operator++() NOEXCEPT;
+ CONSTCD14 day operator++(int) NOEXCEPT;
+ CONSTCD14 day& operator--() NOEXCEPT;
+ CONSTCD14 day operator--(int) NOEXCEPT;
+
+ CONSTCD14 day& operator+=(const days& d) NOEXCEPT;
+ CONSTCD14 day& operator-=(const days& d) NOEXCEPT;
+
+ CONSTCD11 explicit operator unsigned() const NOEXCEPT;
+ CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const day& x, const day& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const day& x, const day& y) NOEXCEPT;
+CONSTCD11 bool operator< (const day& x, const day& y) NOEXCEPT;
+CONSTCD11 bool operator> (const day& x, const day& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const day& x, const day& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const day& x, const day& y) NOEXCEPT;
+
+CONSTCD11 day operator+(const day& x, const days& y) NOEXCEPT;
+CONSTCD11 day operator+(const days& x, const day& y) NOEXCEPT;
+CONSTCD11 day operator-(const day& x, const days& y) NOEXCEPT;
+CONSTCD11 days operator-(const day& x, const day& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const day& d);
+
+// month
+
+class month
+{
+ unsigned char m_;
+
+public:
+ month() = default;
+ explicit CONSTCD11 month(unsigned m) NOEXCEPT;
+
+ CONSTCD14 month& operator++() NOEXCEPT;
+ CONSTCD14 month operator++(int) NOEXCEPT;
+ CONSTCD14 month& operator--() NOEXCEPT;
+ CONSTCD14 month operator--(int) NOEXCEPT;
+
+ CONSTCD14 month& operator+=(const months& m) NOEXCEPT;
+ CONSTCD14 month& operator-=(const months& m) NOEXCEPT;
+
+ CONSTCD11 explicit operator unsigned() const NOEXCEPT;
+ CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const month& x, const month& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const month& x, const month& y) NOEXCEPT;
+CONSTCD11 bool operator< (const month& x, const month& y) NOEXCEPT;
+CONSTCD11 bool operator> (const month& x, const month& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const month& x, const month& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const month& x, const month& y) NOEXCEPT;
+
+CONSTCD14 month operator+(const month& x, const months& y) NOEXCEPT;
+CONSTCD14 month operator+(const months& x, const month& y) NOEXCEPT;
+CONSTCD14 month operator-(const month& x, const months& y) NOEXCEPT;
+CONSTCD14 months operator-(const month& x, const month& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month& m);
+
+// year
+
+class year
+{
+ short y_;
+
+public:
+ year() = default;
+ explicit CONSTCD11 year(int y) NOEXCEPT;
+
+ CONSTCD14 year& operator++() NOEXCEPT;
+ CONSTCD14 year operator++(int) NOEXCEPT;
+ CONSTCD14 year& operator--() NOEXCEPT;
+ CONSTCD14 year operator--(int) NOEXCEPT;
+
+ CONSTCD14 year& operator+=(const years& y) NOEXCEPT;
+ CONSTCD14 year& operator-=(const years& y) NOEXCEPT;
+
+ CONSTCD11 year operator-() const NOEXCEPT;
+ CONSTCD11 year operator+() const NOEXCEPT;
+
+ CONSTCD11 bool is_leap() const NOEXCEPT;
+
+ CONSTCD11 explicit operator int() const NOEXCEPT;
+ CONSTCD11 bool ok() const NOEXCEPT;
+
+ static CONSTCD11 year min() NOEXCEPT { return year{-32767}; }
+ static CONSTCD11 year max() NOEXCEPT { return year{32767}; }
+};
+
+CONSTCD11 bool operator==(const year& x, const year& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const year& x, const year& y) NOEXCEPT;
+CONSTCD11 bool operator< (const year& x, const year& y) NOEXCEPT;
+CONSTCD11 bool operator> (const year& x, const year& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const year& x, const year& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const year& x, const year& y) NOEXCEPT;
+
+CONSTCD11 year operator+(const year& x, const years& y) NOEXCEPT;
+CONSTCD11 year operator+(const years& x, const year& y) NOEXCEPT;
+CONSTCD11 year operator-(const year& x, const years& y) NOEXCEPT;
+CONSTCD11 years operator-(const year& x, const year& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year& y);
+
+// weekday
+
+class weekday
+{
+ unsigned char wd_;
+public:
+ weekday() = default;
+ explicit CONSTCD11 weekday(unsigned wd) NOEXCEPT;
+ CONSTCD14 weekday(const sys_days& dp) NOEXCEPT;
+ CONSTCD14 explicit weekday(const local_days& dp) NOEXCEPT;
+
+ CONSTCD14 weekday& operator++() NOEXCEPT;
+ CONSTCD14 weekday operator++(int) NOEXCEPT;
+ CONSTCD14 weekday& operator--() NOEXCEPT;
+ CONSTCD14 weekday operator--(int) NOEXCEPT;
+
+ CONSTCD14 weekday& operator+=(const days& d) NOEXCEPT;
+ CONSTCD14 weekday& operator-=(const days& d) NOEXCEPT;
+
+ CONSTCD11 bool ok() const NOEXCEPT;
+
+ CONSTCD11 unsigned c_encoding() const NOEXCEPT;
+ CONSTCD11 unsigned iso_encoding() const NOEXCEPT;
+
+ CONSTCD11 weekday_indexed operator[](unsigned index) const NOEXCEPT;
+ CONSTCD11 weekday_last operator[](last_spec) const NOEXCEPT;
+
+private:
+ static CONSTCD14 unsigned char weekday_from_days(int z) NOEXCEPT;
+
+ friend CONSTCD11 bool operator==(const weekday& x, const weekday& y) NOEXCEPT;
+ friend CONSTCD14 days operator-(const weekday& x, const weekday& y) NOEXCEPT;
+ friend CONSTCD14 weekday operator+(const weekday& x, const days& y) NOEXCEPT;
+ template<class CharT, class Traits>
+ friend std::basic_ostream<CharT, Traits>&
+ operator<<(std::basic_ostream<CharT, Traits>& os, const weekday& wd);
+ friend class weekday_indexed;
+};
+
+CONSTCD11 bool operator==(const weekday& x, const weekday& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const weekday& x, const weekday& y) NOEXCEPT;
+
+CONSTCD14 weekday operator+(const weekday& x, const days& y) NOEXCEPT;
+CONSTCD14 weekday operator+(const days& x, const weekday& y) NOEXCEPT;
+CONSTCD14 weekday operator-(const weekday& x, const days& y) NOEXCEPT;
+CONSTCD14 days operator-(const weekday& x, const weekday& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday& wd);
+
+// weekday_indexed
+
+class weekday_indexed
+{
+ unsigned char wd_ : 4;
+ unsigned char index_ : 4;
+
+public:
+ weekday_indexed() = default;
+ CONSTCD11 weekday_indexed(const date::weekday& wd, unsigned index) NOEXCEPT;
+
+ CONSTCD11 date::weekday weekday() const NOEXCEPT;
+ CONSTCD11 unsigned index() const NOEXCEPT;
+ CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday_indexed& wdi);
+
+// weekday_last
+
+class weekday_last
+{
+ date::weekday wd_;
+
+public:
+ explicit CONSTCD11 weekday_last(const date::weekday& wd) NOEXCEPT;
+
+ CONSTCD11 date::weekday weekday() const NOEXCEPT;
+ CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const weekday_last& x, const weekday_last& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const weekday_last& x, const weekday_last& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday_last& wdl);
+
+namespace detail
+{
+
+struct unspecified_month_disambiguator {};
+
+} // namespace detail
+
+// year_month
+
+class year_month
+{
+ date::year y_;
+ date::month m_;
+
+public:
+ year_month() = default;
+ CONSTCD11 year_month(const date::year& y, const date::month& m) NOEXCEPT;
+
+ CONSTCD11 date::year year() const NOEXCEPT;
+ CONSTCD11 date::month month() const NOEXCEPT;
+
+ template<class = detail::unspecified_month_disambiguator>
+ CONSTCD14 year_month& operator+=(const months& dm) NOEXCEPT;
+ template<class = detail::unspecified_month_disambiguator>
+ CONSTCD14 year_month& operator-=(const months& dm) NOEXCEPT;
+ CONSTCD14 year_month& operator+=(const years& dy) NOEXCEPT;
+ CONSTCD14 year_month& operator-=(const years& dy) NOEXCEPT;
+
+ CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 bool operator< (const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 bool operator> (const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const year_month& x, const year_month& y) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month operator+(const year_month& ym, const months& dm) NOEXCEPT;
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month operator+(const months& dm, const year_month& ym) NOEXCEPT;
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month operator-(const year_month& ym, const months& dm) NOEXCEPT;
+
+CONSTCD11 months operator-(const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 year_month operator+(const year_month& ym, const years& dy) NOEXCEPT;
+CONSTCD11 year_month operator+(const years& dy, const year_month& ym) NOEXCEPT;
+CONSTCD11 year_month operator-(const year_month& ym, const years& dy) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month& ym);
+
+// month_day
+
+class month_day
+{
+ date::month m_;
+ date::day d_;
+
+public:
+ month_day() = default;
+ CONSTCD11 month_day(const date::month& m, const date::day& d) NOEXCEPT;
+
+ CONSTCD11 date::month month() const NOEXCEPT;
+ CONSTCD11 date::day day() const NOEXCEPT;
+
+ CONSTCD14 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const month_day& x, const month_day& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const month_day& x, const month_day& y) NOEXCEPT;
+CONSTCD11 bool operator< (const month_day& x, const month_day& y) NOEXCEPT;
+CONSTCD11 bool operator> (const month_day& x, const month_day& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const month_day& x, const month_day& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const month_day& x, const month_day& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_day& md);
+
+// month_day_last
+
+class month_day_last
+{
+ date::month m_;
+
+public:
+ CONSTCD11 explicit month_day_last(const date::month& m) NOEXCEPT;
+
+ CONSTCD11 date::month month() const NOEXCEPT;
+ CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const month_day_last& x, const month_day_last& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const month_day_last& x, const month_day_last& y) NOEXCEPT;
+CONSTCD11 bool operator< (const month_day_last& x, const month_day_last& y) NOEXCEPT;
+CONSTCD11 bool operator> (const month_day_last& x, const month_day_last& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const month_day_last& x, const month_day_last& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const month_day_last& x, const month_day_last& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_day_last& mdl);
+
+// month_weekday
+
+class month_weekday
+{
+ date::month m_;
+ date::weekday_indexed wdi_;
+public:
+ CONSTCD11 month_weekday(const date::month& m,
+ const date::weekday_indexed& wdi) NOEXCEPT;
+
+ CONSTCD11 date::month month() const NOEXCEPT;
+ CONSTCD11 date::weekday_indexed weekday_indexed() const NOEXCEPT;
+
+ CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const month_weekday& x, const month_weekday& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const month_weekday& x, const month_weekday& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_weekday& mwd);
+
+// month_weekday_last
+
+class month_weekday_last
+{
+ date::month m_;
+ date::weekday_last wdl_;
+
+public:
+ CONSTCD11 month_weekday_last(const date::month& m,
+ const date::weekday_last& wd) NOEXCEPT;
+
+ CONSTCD11 date::month month() const NOEXCEPT;
+ CONSTCD11 date::weekday_last weekday_last() const NOEXCEPT;
+
+ CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11
+ bool operator==(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT;
+CONSTCD11
+ bool operator!=(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_weekday_last& mwdl);
+
+// class year_month_day
+
+class year_month_day
+{
+ date::year y_;
+ date::month m_;
+ date::day d_;
+
+public:
+ year_month_day() = default;
+ CONSTCD11 year_month_day(const date::year& y, const date::month& m,
+ const date::day& d) NOEXCEPT;
+ CONSTCD14 year_month_day(const year_month_day_last& ymdl) NOEXCEPT;
+
+ CONSTCD14 year_month_day(sys_days dp) NOEXCEPT;
+ CONSTCD14 explicit year_month_day(local_days dp) NOEXCEPT;
+
+ template<class = detail::unspecified_month_disambiguator>
+ CONSTCD14 year_month_day& operator+=(const months& m) NOEXCEPT;
+ template<class = detail::unspecified_month_disambiguator>
+ CONSTCD14 year_month_day& operator-=(const months& m) NOEXCEPT;
+ CONSTCD14 year_month_day& operator+=(const years& y) NOEXCEPT;
+ CONSTCD14 year_month_day& operator-=(const years& y) NOEXCEPT;
+
+ CONSTCD11 date::year year() const NOEXCEPT;
+ CONSTCD11 date::month month() const NOEXCEPT;
+ CONSTCD11 date::day day() const NOEXCEPT;
+
+ CONSTCD14 operator sys_days() const NOEXCEPT;
+ CONSTCD14 explicit operator local_days() const NOEXCEPT;
+ CONSTCD14 bool ok() const NOEXCEPT;
+
+private:
+ static CONSTCD14 year_month_day from_days(days dp) NOEXCEPT;
+ CONSTCD14 days to_days() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const year_month_day& x, const year_month_day& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const year_month_day& x, const year_month_day& y) NOEXCEPT;
+CONSTCD11 bool operator< (const year_month_day& x, const year_month_day& y) NOEXCEPT;
+CONSTCD11 bool operator> (const year_month_day& x, const year_month_day& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const year_month_day& x, const year_month_day& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const year_month_day& x, const year_month_day& y) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month_day operator+(const year_month_day& ymd, const months& dm) NOEXCEPT;
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month_day operator+(const months& dm, const year_month_day& ymd) NOEXCEPT;
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month_day operator-(const year_month_day& ymd, const months& dm) NOEXCEPT;
+CONSTCD11 year_month_day operator+(const year_month_day& ymd, const years& dy) NOEXCEPT;
+CONSTCD11 year_month_day operator+(const years& dy, const year_month_day& ymd) NOEXCEPT;
+CONSTCD11 year_month_day operator-(const year_month_day& ymd, const years& dy) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_day& ymd);
+
+// year_month_day_last
+
+class year_month_day_last
+{
+ date::year y_;
+ date::month_day_last mdl_;
+
+public:
+ CONSTCD11 year_month_day_last(const date::year& y,
+ const date::month_day_last& mdl) NOEXCEPT;
+
+ template<class = detail::unspecified_month_disambiguator>
+ CONSTCD14 year_month_day_last& operator+=(const months& m) NOEXCEPT;
+ template<class = detail::unspecified_month_disambiguator>
+ CONSTCD14 year_month_day_last& operator-=(const months& m) NOEXCEPT;
+ CONSTCD14 year_month_day_last& operator+=(const years& y) NOEXCEPT;
+ CONSTCD14 year_month_day_last& operator-=(const years& y) NOEXCEPT;
+
+ CONSTCD11 date::year year() const NOEXCEPT;
+ CONSTCD11 date::month month() const NOEXCEPT;
+ CONSTCD11 date::month_day_last month_day_last() const NOEXCEPT;
+ CONSTCD14 date::day day() const NOEXCEPT;
+
+ CONSTCD14 operator sys_days() const NOEXCEPT;
+ CONSTCD14 explicit operator local_days() const NOEXCEPT;
+ CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11
+ bool operator==(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+CONSTCD11
+ bool operator!=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+CONSTCD11
+ bool operator< (const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+CONSTCD11
+ bool operator> (const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+CONSTCD11
+ bool operator<=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+CONSTCD11
+ bool operator>=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_day_last
+operator+(const year_month_day_last& ymdl, const months& dm) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_day_last
+operator+(const months& dm, const year_month_day_last& ymdl) NOEXCEPT;
+
+CONSTCD11
+year_month_day_last
+operator+(const year_month_day_last& ymdl, const years& dy) NOEXCEPT;
+
+CONSTCD11
+year_month_day_last
+operator+(const years& dy, const year_month_day_last& ymdl) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_day_last
+operator-(const year_month_day_last& ymdl, const months& dm) NOEXCEPT;
+
+CONSTCD11
+year_month_day_last
+operator-(const year_month_day_last& ymdl, const years& dy) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_day_last& ymdl);
+
+// year_month_weekday
+
+class year_month_weekday
+{
+ date::year y_;
+ date::month m_;
+ date::weekday_indexed wdi_;
+
+public:
+ year_month_weekday() = default;
+ CONSTCD11 year_month_weekday(const date::year& y, const date::month& m,
+ const date::weekday_indexed& wdi) NOEXCEPT;
+ CONSTCD14 year_month_weekday(const sys_days& dp) NOEXCEPT;
+ CONSTCD14 explicit year_month_weekday(const local_days& dp) NOEXCEPT;
+
+ template<class = detail::unspecified_month_disambiguator>
+ CONSTCD14 year_month_weekday& operator+=(const months& m) NOEXCEPT;
+ template<class = detail::unspecified_month_disambiguator>
+ CONSTCD14 year_month_weekday& operator-=(const months& m) NOEXCEPT;
+ CONSTCD14 year_month_weekday& operator+=(const years& y) NOEXCEPT;
+ CONSTCD14 year_month_weekday& operator-=(const years& y) NOEXCEPT;
+
+ CONSTCD11 date::year year() const NOEXCEPT;
+ CONSTCD11 date::month month() const NOEXCEPT;
+ CONSTCD11 date::weekday weekday() const NOEXCEPT;
+ CONSTCD11 unsigned index() const NOEXCEPT;
+ CONSTCD11 date::weekday_indexed weekday_indexed() const NOEXCEPT;
+
+ CONSTCD14 operator sys_days() const NOEXCEPT;
+ CONSTCD14 explicit operator local_days() const NOEXCEPT;
+ CONSTCD14 bool ok() const NOEXCEPT;
+
+private:
+ static CONSTCD14 year_month_weekday from_days(days dp) NOEXCEPT;
+ CONSTCD14 days to_days() const NOEXCEPT;
+};
+
+CONSTCD11
+ bool operator==(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT;
+CONSTCD11
+ bool operator!=(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday
+operator+(const year_month_weekday& ymwd, const months& dm) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday
+operator+(const months& dm, const year_month_weekday& ymwd) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator+(const year_month_weekday& ymwd, const years& dy) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator+(const years& dy, const year_month_weekday& ymwd) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday
+operator-(const year_month_weekday& ymwd, const months& dm) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator-(const year_month_weekday& ymwd, const years& dy) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_weekday& ymwdi);
+
+// year_month_weekday_last
+
+class year_month_weekday_last
+{
+ date::year y_;
+ date::month m_;
+ date::weekday_last wdl_;
+
+public:
+ CONSTCD11 year_month_weekday_last(const date::year& y, const date::month& m,
+ const date::weekday_last& wdl) NOEXCEPT;
+
+ template<class = detail::unspecified_month_disambiguator>
+ CONSTCD14 year_month_weekday_last& operator+=(const months& m) NOEXCEPT;
+ template<class = detail::unspecified_month_disambiguator>
+ CONSTCD14 year_month_weekday_last& operator-=(const months& m) NOEXCEPT;
+ CONSTCD14 year_month_weekday_last& operator+=(const years& y) NOEXCEPT;
+ CONSTCD14 year_month_weekday_last& operator-=(const years& y) NOEXCEPT;
+
+ CONSTCD11 date::year year() const NOEXCEPT;
+ CONSTCD11 date::month month() const NOEXCEPT;
+ CONSTCD11 date::weekday weekday() const NOEXCEPT;
+ CONSTCD11 date::weekday_last weekday_last() const NOEXCEPT;
+
+ CONSTCD14 operator sys_days() const NOEXCEPT;
+ CONSTCD14 explicit operator local_days() const NOEXCEPT;
+ CONSTCD11 bool ok() const NOEXCEPT;
+
+private:
+ CONSTCD14 days to_days() const NOEXCEPT;
+};
+
+CONSTCD11
+bool
+operator==(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT;
+
+CONSTCD11
+bool
+operator!=(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday_last
+operator+(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday_last
+operator+(const months& dm, const year_month_weekday_last& ymwdl) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator+(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator+(const years& dy, const year_month_weekday_last& ymwdl) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday_last
+operator-(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator-(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_weekday_last& ymwdl);
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+inline namespace literals
+{
+
+CONSTCD11 date::day operator "" _d(unsigned long long d) NOEXCEPT;
+CONSTCD11 date::year operator "" _y(unsigned long long y) NOEXCEPT;
+
+} // inline namespace literals
+#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900)
+
+// CONSTDATA date::month January{1};
+// CONSTDATA date::month February{2};
+// CONSTDATA date::month March{3};
+// CONSTDATA date::month April{4};
+// CONSTDATA date::month May{5};
+// CONSTDATA date::month June{6};
+// CONSTDATA date::month July{7};
+// CONSTDATA date::month August{8};
+// CONSTDATA date::month September{9};
+// CONSTDATA date::month October{10};
+// CONSTDATA date::month November{11};
+// CONSTDATA date::month December{12};
+//
+// CONSTDATA date::weekday Sunday{0u};
+// CONSTDATA date::weekday Monday{1u};
+// CONSTDATA date::weekday Tuesday{2u};
+// CONSTDATA date::weekday Wednesday{3u};
+// CONSTDATA date::weekday Thursday{4u};
+// CONSTDATA date::weekday Friday{5u};
+// CONSTDATA date::weekday Saturday{6u};
+
+#if HAS_VOID_T
+
+template <class T, class = std::void_t<>>
+struct is_clock
+ : std::false_type
+{};
+
+template <class T>
+struct is_clock<T, std::void_t<decltype(T::now()), typename T::rep, typename T::period,
+ typename T::duration, typename T::time_point,
+ decltype(T::is_steady)>>
+ : std::true_type
+{};
+
+#endif // HAS_VOID_T
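+
+// Note: is_clock is a void_t-based detection idiom; it holds only for
+// types exposing the full Clock interface.  For example (when HAS_VOID_T
+// is enabled):
+//
+//   static_assert(is_clock<std::chrono::system_clock>::value, "");
+//   static_assert(!is_clock<int>::value, "");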
+
+//----------------+
+// Implementation |
+//----------------+
+
+// utilities
+namespace detail {
+
+template<class CharT, class Traits = std::char_traits<CharT>>
+class save_istream
+{
+protected:
+ std::basic_ios<CharT, Traits>& is_;
+ CharT fill_;
+ std::ios::fmtflags flags_;
+ std::streamsize width_;
+ std::basic_ostream<CharT, Traits>* tie_;
+ std::locale loc_;
+
+public:
+ ~save_istream()
+ {
+ is_.fill(fill_);
+ is_.flags(flags_);
+ is_.width(width_);
+ is_.imbue(loc_);
+ is_.tie(tie_);
+ }
+
+ save_istream(const save_istream&) = delete;
+ save_istream& operator=(const save_istream&) = delete;
+
+ explicit save_istream(std::basic_ios<CharT, Traits>& is)
+ : is_(is)
+ , fill_(is.fill())
+ , flags_(is.flags())
+ , width_(is.width(0))
+ , tie_(is.tie(nullptr))
+ , loc_(is.getloc())
+ {
+ if (tie_ != nullptr)
+ tie_->flush();
+ }
+};
+
+template<class CharT, class Traits = std::char_traits<CharT>>
+class save_ostream
+ : private save_istream<CharT, Traits>
+{
+public:
+ ~save_ostream()
+ {
+ if ((this->flags_ & std::ios::unitbuf) &&
+#if HAS_UNCAUGHT_EXCEPTIONS
+ std::uncaught_exceptions() == 0 &&
+#else
+ !std::uncaught_exception() &&
+#endif
+ this->is_.good())
+ this->is_.rdbuf()->pubsync();
+ }
+
+ save_ostream(const save_ostream&) = delete;
+ save_ostream& operator=(const save_ostream&) = delete;
+
+ explicit save_ostream(std::basic_ios<CharT, Traits>& os)
+ : save_istream<CharT, Traits>(os)
+ {
+ }
+};
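+
+// Note: save_istream/save_ostream are RAII guards.  The constructor
+// records the stream's fill, flags, width, tie and locale (resetting
+// width to 0 and tie to nullptr as a side effect), and the destructor
+// restores them.  A minimal usage sketch, mirroring the operator<<
+// overloads further down:
+//
+//   template<class CharT, class Traits>
+//   void print_padded(std::basic_ostream<CharT, Traits>& os, unsigned v)
+//   {
+//       detail::save_ostream<CharT, Traits> guard(os);  // restored at scope exit
+//       os.fill('0');
+//       os.width(2);
+//       os << v;
+//   }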
+
+template <class T>
+struct choose_trunc_type
+{
+ static const int digits = std::numeric_limits<T>::digits;
+ using type = typename std::conditional
+ <
+ digits < 32,
+ std::int32_t,
+ typename std::conditional
+ <
+ digits < 64,
+ std::int64_t,
+#ifdef __SIZEOF_INT128__
+ __int128
+#else
+ std::int64_t
+#endif
+ >::type
+ >::type;
+};
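+
+// Note: choose_trunc_type selects a signed integer type wide enough to
+// hold every integer exactly representable in T's mantissa
+// (std::numeric_limits<T>::digits): int32_t below 32 digits, int64_t
+// below 64, and __int128 beyond that where the compiler provides it.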
+
+template <class T>
+CONSTCD11
+inline
+typename std::enable_if
+<
+ !std::chrono::treat_as_floating_point<T>::value,
+ T
+>::type
+trunc(T t) NOEXCEPT
+{
+ return t;
+}
+
+template <class T>
+CONSTCD14
+inline
+typename std::enable_if
+<
+ std::chrono::treat_as_floating_point<T>::value,
+ T
+>::type
+trunc(T t) NOEXCEPT
+{
+ using std::numeric_limits;
+ using I = typename choose_trunc_type<T>::type;
+ CONSTDATA auto digits = numeric_limits<T>::digits;
+ static_assert(digits < numeric_limits<I>::digits, "");
+ CONSTDATA auto max = I{1} << (digits-1);
+ CONSTDATA auto min = -max;
+ const auto negative = t < T{0};
+ if (min <= t && t <= max && t != 0 && t == t)
+ {
+ t = static_cast<T>(static_cast<I>(t));
+ if (t == 0 && negative)
+ t = -t;
+ }
+ return t;
+}
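+
+// Worked example: the floating-point trunc above rounds toward zero while
+// passing through NaN (filtered by t == t), zero, and values outside the
+// exactly representable integer range:
+//   trunc(2.9)  ->  2.0
+//   trunc(-2.9) -> -2.0   (a result that truncates to zero keeps its sign)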
+
+template <std::intmax_t Xp, std::intmax_t Yp>
+struct static_gcd
+{
+ static const std::intmax_t value = static_gcd<Yp, Xp % Yp>::value;
+};
+
+template <std::intmax_t Xp>
+struct static_gcd<Xp, 0>
+{
+ static const std::intmax_t value = Xp;
+};
+
+template <>
+struct static_gcd<0, 0>
+{
+ static const std::intmax_t value = 1;
+};
+
+template <class R1, class R2>
+struct no_overflow
+{
+private:
+ static const std::intmax_t gcd_n1_n2 = static_gcd<R1::num, R2::num>::value;
+ static const std::intmax_t gcd_d1_d2 = static_gcd<R1::den, R2::den>::value;
+ static const std::intmax_t n1 = R1::num / gcd_n1_n2;
+ static const std::intmax_t d1 = R1::den / gcd_d1_d2;
+ static const std::intmax_t n2 = R2::num / gcd_n1_n2;
+ static const std::intmax_t d2 = R2::den / gcd_d1_d2;
+ static const std::intmax_t max = std::numeric_limits<std::intmax_t>::max();
+
+ template <std::intmax_t Xp, std::intmax_t Yp, bool overflow>
+ struct mul // overflow == false
+ {
+ static const std::intmax_t value = Xp * Yp;
+ };
+
+ template <std::intmax_t Xp, std::intmax_t Yp>
+ struct mul<Xp, Yp, true>
+ {
+ static const std::intmax_t value = 1;
+ };
+
+public:
+ static const bool value = (n1 <= max / d2) && (n2 <= max / d1);
+ typedef std::ratio<mul<n1, d2, !value>::value,
+ mul<n2, d1, !value>::value> type;
+};
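+
+// Note: no_overflow<R1, R2> checks at compile time that the conversion
+// factor R1 / R2 fits in intmax_t: both ratios are reduced by the gcds of
+// their numerators and denominators, then the cross products n1*d2 and
+// n2*d1 are tested against intmax_t's maximum.  E.g.
+// no_overflow<std::ratio<1>, std::nano>::value is true, so seconds can be
+// trunc'd to nanoseconds without the extra cast in the overload below.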
+
+} // detail
+
+// trunc towards zero
+template <class To, class Rep, class Period>
+CONSTCD11
+inline
+typename std::enable_if
+<
+ detail::no_overflow<Period, typename To::period>::value,
+ To
+>::type
+trunc(const std::chrono::duration<Rep, Period>& d)
+{
+ return To{detail::trunc(std::chrono::duration_cast<To>(d).count())};
+}
+
+template <class To, class Rep, class Period>
+CONSTCD11
+inline
+typename std::enable_if
+<
+ !detail::no_overflow<Period, typename To::period>::value,
+ To
+>::type
+trunc(const std::chrono::duration<Rep, Period>& d)
+{
+ using std::chrono::duration_cast;
+ using std::chrono::duration;
+ using rep = typename std::common_type<Rep, typename To::rep>::type;
+ return To{detail::trunc(duration_cast<To>(duration_cast<duration<rep>>(d)).count())};
+}
+
+#ifndef HAS_CHRONO_ROUNDING
+# if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190023918 || (_MSC_FULL_VER >= 190000000 && defined (__clang__)))
+# define HAS_CHRONO_ROUNDING 1
+# elif defined(__cpp_lib_chrono) && __cplusplus > 201402 && __cpp_lib_chrono >= 201510
+# define HAS_CHRONO_ROUNDING 1
+# elif defined(_LIBCPP_VERSION) && __cplusplus > 201402 && _LIBCPP_VERSION >= 3800
+# define HAS_CHRONO_ROUNDING 1
+# else
+# define HAS_CHRONO_ROUNDING 0
+# endif
+#endif // HAS_CHRONO_ROUNDING
+
+#if HAS_CHRONO_ROUNDING == 0
+
+// round down
+template <class To, class Rep, class Period>
+CONSTCD14
+inline
+typename std::enable_if
+<
+ detail::no_overflow<Period, typename To::period>::value,
+ To
+>::type
+floor(const std::chrono::duration<Rep, Period>& d)
+{
+ auto t = trunc<To>(d);
+ if (t > d)
+ return t - To{1};
+ return t;
+}
+
+template <class To, class Rep, class Period>
+CONSTCD14
+inline
+typename std::enable_if
+<
+ !detail::no_overflow<Period, typename To::period>::value,
+ To
+>::type
+floor(const std::chrono::duration<Rep, Period>& d)
+{
+ using rep = typename std::common_type<Rep, typename To::rep>::type;
+ return floor<To>(floor<std::chrono::duration<rep>>(d));
+}
+
+// round to nearest, to even on tie
+template <class To, class Rep, class Period>
+CONSTCD14
+inline
+To
+round(const std::chrono::duration<Rep, Period>& d)
+{
+ auto t0 = floor<To>(d);
+ auto t1 = t0 + To{1};
+ if (t1 == To{0} && t0 < To{0})
+ t1 = -t1;
+ auto diff0 = d - t0;
+ auto diff1 = t1 - d;
+ if (diff0 == diff1)
+ {
+ if (t0 - trunc<To>(t0/2)*2 == To{0})
+ return t0;
+ return t1;
+ }
+ if (diff0 < diff1)
+ return t0;
+ return t1;
+}
+
+// round up
+template <class To, class Rep, class Period>
+CONSTCD14
+inline
+To
+ceil(const std::chrono::duration<Rep, Period>& d)
+{
+ auto t = trunc<To>(d);
+ if (t < d)
+ return t + To{1};
+ return t;
+}
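+
+// Worked examples for the three duration rounders above, with
+// To = std::chrono::seconds:
+//   floor<seconds>(milliseconds{-1500}) == seconds{-2}   // toward -infinity
+//   ceil <seconds>(milliseconds{-1500}) == seconds{-1}   // toward +infinity
+//   round<seconds>(milliseconds{1500})  == seconds{2}    // tie -> even
+//   round<seconds>(milliseconds{2500})  == seconds{2}    // tie -> even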
+
+template <class Rep, class Period,
+ class = typename std::enable_if
+ <
+ std::numeric_limits<Rep>::is_signed
+ >::type>
+CONSTCD11
+std::chrono::duration<Rep, Period>
+abs(std::chrono::duration<Rep, Period> d)
+{
+ return d >= d.zero() ? d : -d;
+}
+
+// round down
+template <class To, class Clock, class FromDuration>
+CONSTCD11
+inline
+std::chrono::time_point<Clock, To>
+floor(const std::chrono::time_point<Clock, FromDuration>& tp)
+{
+ using std::chrono::time_point;
+ return time_point<Clock, To>{date::floor<To>(tp.time_since_epoch())};
+}
+
+// round to nearest, to even on tie
+template <class To, class Clock, class FromDuration>
+CONSTCD11
+inline
+std::chrono::time_point<Clock, To>
+round(const std::chrono::time_point<Clock, FromDuration>& tp)
+{
+ using std::chrono::time_point;
+ return time_point<Clock, To>{round<To>(tp.time_since_epoch())};
+}
+
+// round up
+template <class To, class Clock, class FromDuration>
+CONSTCD11
+inline
+std::chrono::time_point<Clock, To>
+ceil(const std::chrono::time_point<Clock, FromDuration>& tp)
+{
+ using std::chrono::time_point;
+ return time_point<Clock, To>{ceil<To>(tp.time_since_epoch())};
+}
+
+#else // HAS_CHRONO_ROUNDING == 1
+
+using std::chrono::floor;
+using std::chrono::ceil;
+using std::chrono::round;
+using std::chrono::abs;
+
+#endif // HAS_CHRONO_ROUNDING
+
+// trunc towards zero
+template <class To, class Clock, class FromDuration>
+CONSTCD11
+inline
+std::chrono::time_point<Clock, To>
+trunc(const std::chrono::time_point<Clock, FromDuration>& tp)
+{
+ using std::chrono::time_point;
+ return time_point<Clock, To>{trunc<To>(tp.time_since_epoch())};
+}
+
+// day
+
+CONSTCD11 inline day::day(unsigned d) NOEXCEPT : d_(static_cast<decltype(d_)>(d)) {}
+CONSTCD14 inline day& day::operator++() NOEXCEPT {++d_; return *this;}
+CONSTCD14 inline day day::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;}
+CONSTCD14 inline day& day::operator--() NOEXCEPT {--d_; return *this;}
+CONSTCD14 inline day day::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;}
+CONSTCD14 inline day& day::operator+=(const days& d) NOEXCEPT {*this = *this + d; return *this;}
+CONSTCD14 inline day& day::operator-=(const days& d) NOEXCEPT {*this = *this - d; return *this;}
+CONSTCD11 inline day::operator unsigned() const NOEXCEPT {return d_;}
+CONSTCD11 inline bool day::ok() const NOEXCEPT {return 1 <= d_ && d_ <= 31;}
+
+CONSTCD11
+inline
+bool
+operator==(const day& x, const day& y) NOEXCEPT
+{
+ return static_cast<unsigned>(x) == static_cast<unsigned>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const day& x, const day& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const day& x, const day& y) NOEXCEPT
+{
+ return static_cast<unsigned>(x) < static_cast<unsigned>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator>(const day& x, const day& y) NOEXCEPT
+{
+ return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const day& x, const day& y) NOEXCEPT
+{
+ return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const day& x, const day& y) NOEXCEPT
+{
+ return !(x < y);
+}
+
+CONSTCD11
+inline
+days
+operator-(const day& x, const day& y) NOEXCEPT
+{
+ return days{static_cast<days::rep>(static_cast<unsigned>(x)
+ - static_cast<unsigned>(y))};
+}
+
+CONSTCD11
+inline
+day
+operator+(const day& x, const days& y) NOEXCEPT
+{
+ return day{static_cast<unsigned>(x) + static_cast<unsigned>(y.count())};
+}
+
+CONSTCD11
+inline
+day
+operator+(const days& x, const day& y) NOEXCEPT
+{
+ return y + x;
+}
+
+CONSTCD11
+inline
+day
+operator-(const day& x, const days& y) NOEXCEPT
+{
+ return x + -y;
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const day& d)
+{
+ detail::save_ostream<CharT, Traits> _(os);
+ os.fill('0');
+ os.flags(std::ios::dec | std::ios::right);
+ os.width(2);
+ os << static_cast<unsigned>(d);
+ if (!d.ok())
+ os << " is not a valid day";
+ return os;
+}
+
+// month
+
+CONSTCD11 inline month::month(unsigned m) NOEXCEPT : m_(static_cast<decltype(m_)>(m)) {}
+CONSTCD14 inline month& month::operator++() NOEXCEPT {*this += months{1}; return *this;}
+CONSTCD14 inline month month::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;}
+CONSTCD14 inline month& month::operator--() NOEXCEPT {*this -= months{1}; return *this;}
+CONSTCD14 inline month month::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;}
+
+CONSTCD14
+inline
+month&
+month::operator+=(const months& m) NOEXCEPT
+{
+ *this = *this + m;
+ return *this;
+}
+
+CONSTCD14
+inline
+month&
+month::operator-=(const months& m) NOEXCEPT
+{
+ *this = *this - m;
+ return *this;
+}
+
+CONSTCD11 inline month::operator unsigned() const NOEXCEPT {return m_;}
+CONSTCD11 inline bool month::ok() const NOEXCEPT {return 1 <= m_ && m_ <= 12;}
+
+CONSTCD11
+inline
+bool
+operator==(const month& x, const month& y) NOEXCEPT
+{
+ return static_cast<unsigned>(x) == static_cast<unsigned>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const month& x, const month& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const month& x, const month& y) NOEXCEPT
+{
+ return static_cast<unsigned>(x) < static_cast<unsigned>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator>(const month& x, const month& y) NOEXCEPT
+{
+ return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const month& x, const month& y) NOEXCEPT
+{
+ return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const month& x, const month& y) NOEXCEPT
+{
+ return !(x < y);
+}
+
+CONSTCD14
+inline
+months
+operator-(const month& x, const month& y) NOEXCEPT
+{
+ auto const d = static_cast<unsigned>(x) - static_cast<unsigned>(y);
+ return months(d <= 11 ? d : d + 12);
+}
+
+CONSTCD14
+inline
+month
+operator+(const month& x, const months& y) NOEXCEPT
+{
+ auto const mu = static_cast<long long>(static_cast<unsigned>(x)) + y.count() - 1;
+ auto const yr = (mu >= 0 ? mu : mu-11) / 12;
+ return month{static_cast<unsigned>(mu - yr * 12 + 1)};
+}
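+
+// Worked example: the arithmetic above wraps any month count onto
+// [1, 12], discarding whole-year carries (contrast year_month below,
+// which keeps them):
+//   month{12} + months{2}  ->  month{2}   (mu = 13, yr = 1, 13 - 12 + 1 = 2)
+//   month{1}  - months{1}  ->  month{12}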
+
+CONSTCD14
+inline
+month
+operator+(const months& x, const month& y) NOEXCEPT
+{
+ return y + x;
+}
+
+CONSTCD14
+inline
+month
+operator-(const month& x, const months& y) NOEXCEPT
+{
+ return x + -y;
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month& m)
+{
+ if (m.ok())
+ {
+ CharT fmt[] = {'%', 'b', 0};
+ os << format(os.getloc(), fmt, m);
+ }
+ else
+ os << static_cast<unsigned>(m) << " is not a valid month";
+ return os;
+}
+
+// year
+
+CONSTCD11 inline year::year(int y) NOEXCEPT : y_(static_cast<decltype(y_)>(y)) {}
+CONSTCD14 inline year& year::operator++() NOEXCEPT {++y_; return *this;}
+CONSTCD14 inline year year::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;}
+CONSTCD14 inline year& year::operator--() NOEXCEPT {--y_; return *this;}
+CONSTCD14 inline year year::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;}
+CONSTCD14 inline year& year::operator+=(const years& y) NOEXCEPT {*this = *this + y; return *this;}
+CONSTCD14 inline year& year::operator-=(const years& y) NOEXCEPT {*this = *this - y; return *this;}
+CONSTCD11 inline year year::operator-() const NOEXCEPT {return year{-y_};}
+CONSTCD11 inline year year::operator+() const NOEXCEPT {return *this;}
+
+CONSTCD11
+inline
+bool
+year::is_leap() const NOEXCEPT
+{
+ return y_ % 4 == 0 && (y_ % 100 != 0 || y_ % 400 == 0);
+}
+
+CONSTCD11 inline year::operator int() const NOEXCEPT {return y_;}
+
+CONSTCD11
+inline
+bool
+year::ok() const NOEXCEPT
+{
+ return y_ != std::numeric_limits<short>::min();
+}
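+
+// Note: this assumes year's storage is a short (as declared earlier in
+// this header), reserving its single most-negative value as "invalid".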
+
+CONSTCD11
+inline
+bool
+operator==(const year& x, const year& y) NOEXCEPT
+{
+ return static_cast<int>(x) == static_cast<int>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year& x, const year& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const year& x, const year& y) NOEXCEPT
+{
+ return static_cast<int>(x) < static_cast<int>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator>(const year& x, const year& y) NOEXCEPT
+{
+ return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const year& x, const year& y) NOEXCEPT
+{
+ return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const year& x, const year& y) NOEXCEPT
+{
+ return !(x < y);
+}
+
+CONSTCD11
+inline
+years
+operator-(const year& x, const year& y) NOEXCEPT
+{
+ return years{static_cast<int>(x) - static_cast<int>(y)};
+}
+
+CONSTCD11
+inline
+year
+operator+(const year& x, const years& y) NOEXCEPT
+{
+ return year{static_cast<int>(x) + y.count()};
+}
+
+CONSTCD11
+inline
+year
+operator+(const years& x, const year& y) NOEXCEPT
+{
+ return y + x;
+}
+
+CONSTCD11
+inline
+year
+operator-(const year& x, const years& y) NOEXCEPT
+{
+ return year{static_cast<int>(x) - y.count()};
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year& y)
+{
+ detail::save_ostream<CharT, Traits> _(os);
+ os.fill('0');
+ os.flags(std::ios::dec | std::ios::internal);
+ os.width(4 + (y < year{0}));
+ os.imbue(std::locale::classic());
+ os << static_cast<int>(y);
+ if (!y.ok())
+ os << " is not a valid year";
+ return os;
+}
+
+// weekday
+
+CONSTCD14
+inline
+unsigned char
+weekday::weekday_from_days(int z) NOEXCEPT
+{
+ auto u = static_cast<unsigned>(z);
+ return static_cast<unsigned char>(z >= -4 ? (u+4) % 7 : u % 7);
+}
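+
+// Note: day 0 of the sys_days epoch (1970-01-01) is a Thursday, so for
+// z >= -4 the C encoding (Sunday == 0) is simply (z + 4) % 7, e.g.
+// weekday_from_days(0) == 4.  For smaller z the computation relies on
+// unsigned wrap-around: assuming a 32-bit unsigned, 2^32 mod 7 == 4, so
+// u % 7 still equals (z + 4) mod 7.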
+
+CONSTCD11
+inline
+weekday::weekday(unsigned wd) NOEXCEPT
+ : wd_(static_cast<decltype(wd_)>(wd != 7 ? wd : 0))
+ {}
+
+CONSTCD14
+inline
+weekday::weekday(const sys_days& dp) NOEXCEPT
+ : wd_(weekday_from_days(dp.time_since_epoch().count()))
+ {}
+
+CONSTCD14
+inline
+weekday::weekday(const local_days& dp) NOEXCEPT
+ : wd_(weekday_from_days(dp.time_since_epoch().count()))
+ {}
+
+CONSTCD14 inline weekday& weekday::operator++() NOEXCEPT {*this += days{1}; return *this;}
+CONSTCD14 inline weekday weekday::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;}
+CONSTCD14 inline weekday& weekday::operator--() NOEXCEPT {*this -= days{1}; return *this;}
+CONSTCD14 inline weekday weekday::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;}
+
+CONSTCD14
+inline
+weekday&
+weekday::operator+=(const days& d) NOEXCEPT
+{
+ *this = *this + d;
+ return *this;
+}
+
+CONSTCD14
+inline
+weekday&
+weekday::operator-=(const days& d) NOEXCEPT
+{
+ *this = *this - d;
+ return *this;
+}
+
+CONSTCD11 inline bool weekday::ok() const NOEXCEPT {return wd_ <= 6;}
+
+CONSTCD11
+inline
+unsigned weekday::c_encoding() const NOEXCEPT
+{
+ return unsigned{wd_};
+}
+
+CONSTCD11
+inline
+unsigned weekday::iso_encoding() const NOEXCEPT
+{
+ return unsigned{((wd_ == 0u) ? 7u : wd_)};
+}
+
+CONSTCD11
+inline
+bool
+operator==(const weekday& x, const weekday& y) NOEXCEPT
+{
+ return x.wd_ == y.wd_;
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const weekday& x, const weekday& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+CONSTCD14
+inline
+days
+operator-(const weekday& x, const weekday& y) NOEXCEPT
+{
+ auto const wdu = x.wd_ - y.wd_;
+ auto const wk = (wdu >= 0 ? wdu : wdu-6) / 7;
+ return days{wdu - wk * 7};
+}
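+
+// Note: the weekday difference is normalized into days in [0, 6], e.g.
+// (sat - sun).count() == 6 and (sun - sat).count() == 1.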
+
+CONSTCD14
+inline
+weekday
+operator+(const weekday& x, const days& y) NOEXCEPT
+{
+ auto const wdu = static_cast<long long>(static_cast<unsigned>(x.wd_)) + y.count();
+ auto const wk = (wdu >= 0 ? wdu : wdu-6) / 7;
+ return weekday{static_cast<unsigned>(wdu - wk * 7)};
+}
+
+CONSTCD14
+inline
+weekday
+operator+(const days& x, const weekday& y) NOEXCEPT
+{
+ return y + x;
+}
+
+CONSTCD14
+inline
+weekday
+operator-(const weekday& x, const days& y) NOEXCEPT
+{
+ return x + -y;
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday& wd)
+{
+ if (wd.ok())
+ {
+ CharT fmt[] = {'%', 'a', 0};
+ os << format(fmt, wd);
+ }
+ else
+ os << static_cast<unsigned>(wd.wd_) << " is not a valid weekday";
+ return os;
+}
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+inline namespace literals
+{
+
+CONSTCD11
+inline
+date::day
+operator "" _d(unsigned long long d) NOEXCEPT
+{
+ return date::day{static_cast<unsigned>(d)};
+}
+
+CONSTCD11
+inline
+date::year
+operator "" _y(unsigned long long y) NOEXCEPT
+{
+ return date::year(static_cast<int>(y));
+}
+#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900)
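+
+// Note: the #endif above closes only the #if that opened the literals
+// namespace; the namespace itself remains open so that the constants
+// below are exported as part of it, and is closed by the matching
+// guarded brace after sat.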
+
+CONSTDATA date::last_spec last{};
+
+CONSTDATA date::month jan{1};
+CONSTDATA date::month feb{2};
+CONSTDATA date::month mar{3};
+CONSTDATA date::month apr{4};
+CONSTDATA date::month may{5};
+CONSTDATA date::month jun{6};
+CONSTDATA date::month jul{7};
+CONSTDATA date::month aug{8};
+CONSTDATA date::month sep{9};
+CONSTDATA date::month oct{10};
+CONSTDATA date::month nov{11};
+CONSTDATA date::month dec{12};
+
+CONSTDATA date::weekday sun{0u};
+CONSTDATA date::weekday mon{1u};
+CONSTDATA date::weekday tue{2u};
+CONSTDATA date::weekday wed{3u};
+CONSTDATA date::weekday thu{4u};
+CONSTDATA date::weekday fri{5u};
+CONSTDATA date::weekday sat{6u};
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+} // inline namespace literals
+#endif
+
+CONSTDATA date::month January{1};
+CONSTDATA date::month February{2};
+CONSTDATA date::month March{3};
+CONSTDATA date::month April{4};
+CONSTDATA date::month May{5};
+CONSTDATA date::month June{6};
+CONSTDATA date::month July{7};
+CONSTDATA date::month August{8};
+CONSTDATA date::month September{9};
+CONSTDATA date::month October{10};
+CONSTDATA date::month November{11};
+CONSTDATA date::month December{12};
+
+CONSTDATA date::weekday Monday{1};
+CONSTDATA date::weekday Tuesday{2};
+CONSTDATA date::weekday Wednesday{3};
+CONSTDATA date::weekday Thursday{4};
+CONSTDATA date::weekday Friday{5};
+CONSTDATA date::weekday Saturday{6};
+CONSTDATA date::weekday Sunday{7};
+
+// weekday_indexed
+
+CONSTCD11
+inline
+weekday
+weekday_indexed::weekday() const NOEXCEPT
+{
+ return date::weekday{static_cast<unsigned>(wd_)};
+}
+
+CONSTCD11 inline unsigned weekday_indexed::index() const NOEXCEPT {return index_;}
+
+CONSTCD11
+inline
+bool
+weekday_indexed::ok() const NOEXCEPT
+{
+ return weekday().ok() && 1 <= index_ && index_ <= 5;
+}
+
+#ifdef __GNUC__
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion"
+#endif // __GNUC__
+
+CONSTCD11
+inline
+weekday_indexed::weekday_indexed(const date::weekday& wd, unsigned index) NOEXCEPT
+ : wd_(static_cast<decltype(wd_)>(static_cast<unsigned>(wd.wd_)))
+ , index_(static_cast<decltype(index_)>(index))
+ {}
+
+#ifdef __GNUC__
+# pragma GCC diagnostic pop
+#endif // __GNUC__
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday_indexed& wdi)
+{
+ os << wdi.weekday() << '[' << wdi.index();
+ if (!(1 <= wdi.index() && wdi.index() <= 5))
+ os << " is not a valid index";
+ os << ']';
+ return os;
+}
+
+CONSTCD11
+inline
+weekday_indexed
+weekday::operator[](unsigned index) const NOEXCEPT
+{
+ return {*this, index};
+}
+
+CONSTCD11
+inline
+bool
+operator==(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT
+{
+ return x.weekday() == y.weekday() && x.index() == y.index();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+// weekday_last
+
+CONSTCD11 inline date::weekday weekday_last::weekday() const NOEXCEPT {return wd_;}
+CONSTCD11 inline bool weekday_last::ok() const NOEXCEPT {return wd_.ok();}
+CONSTCD11 inline weekday_last::weekday_last(const date::weekday& wd) NOEXCEPT : wd_(wd) {}
+
+CONSTCD11
+inline
+bool
+operator==(const weekday_last& x, const weekday_last& y) NOEXCEPT
+{
+ return x.weekday() == y.weekday();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const weekday_last& x, const weekday_last& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday_last& wdl)
+{
+ return os << wdl.weekday() << "[last]";
+}
+
+CONSTCD11
+inline
+weekday_last
+weekday::operator[](last_spec) const NOEXCEPT
+{
+ return weekday_last{*this};
+}
+
+// year_month
+
+CONSTCD11
+inline
+year_month::year_month(const date::year& y, const date::month& m) NOEXCEPT
+ : y_(y)
+ , m_(m)
+ {}
+
+CONSTCD11 inline year year_month::year() const NOEXCEPT {return y_;}
+CONSTCD11 inline month year_month::month() const NOEXCEPT {return m_;}
+CONSTCD11 inline bool year_month::ok() const NOEXCEPT {return y_.ok() && m_.ok();}
+
+template<class>
+CONSTCD14
+inline
+year_month&
+year_month::operator+=(const months& dm) NOEXCEPT
+{
+ *this = *this + dm;
+ return *this;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month&
+year_month::operator-=(const months& dm) NOEXCEPT
+{
+ *this = *this - dm;
+ return *this;
+}
+
+CONSTCD14
+inline
+year_month&
+year_month::operator+=(const years& dy) NOEXCEPT
+{
+ *this = *this + dy;
+ return *this;
+}
+
+CONSTCD14
+inline
+year_month&
+year_month::operator-=(const years& dy) NOEXCEPT
+{
+ *this = *this - dy;
+ return *this;
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year_month& x, const year_month& y) NOEXCEPT
+{
+ return x.year() == y.year() && x.month() == y.month();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year_month& x, const year_month& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const year_month& x, const year_month& y) NOEXCEPT
+{
+ return x.year() < y.year() ? true
+ : (x.year() > y.year() ? false
+ : (x.month() < y.month()));
+}
+
+CONSTCD11
+inline
+bool
+operator>(const year_month& x, const year_month& y) NOEXCEPT
+{
+ return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const year_month& x, const year_month& y) NOEXCEPT
+{
+ return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const year_month& x, const year_month& y) NOEXCEPT
+{
+ return !(x < y);
+}
+
+template<class>
+CONSTCD14
+inline
+year_month
+operator+(const year_month& ym, const months& dm) NOEXCEPT
+{
+ auto dmi = static_cast<int>(static_cast<unsigned>(ym.month())) - 1 + dm.count();
+ auto dy = (dmi >= 0 ? dmi : dmi-11) / 12;
+ dmi = dmi - dy * 12 + 1;
+ return (ym.year() + years(dy)) / month(static_cast<unsigned>(dmi));
+}
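+
+// Worked example: the same wrapping arithmetic as month + months, except
+// that the whole-year carry dy is folded into the year:
+//   2017_y/dec + months{2}  ->  2018_y/feb   (dmi = 13, dy = 1, month 2)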
+
+template<class>
+CONSTCD14
+inline
+year_month
+operator+(const months& dm, const year_month& ym) NOEXCEPT
+{
+ return ym + dm;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month
+operator-(const year_month& ym, const months& dm) NOEXCEPT
+{
+ return ym + -dm;
+}
+
+CONSTCD11
+inline
+months
+operator-(const year_month& x, const year_month& y) NOEXCEPT
+{
+ return (x.year() - y.year()) +
+ months(static_cast<unsigned>(x.month()) - static_cast<unsigned>(y.month()));
+}
+
+CONSTCD11
+inline
+year_month
+operator+(const year_month& ym, const years& dy) NOEXCEPT
+{
+ return (ym.year() + dy) / ym.month();
+}
+
+CONSTCD11
+inline
+year_month
+operator+(const years& dy, const year_month& ym) NOEXCEPT
+{
+ return ym + dy;
+}
+
+CONSTCD11
+inline
+year_month
+operator-(const year_month& ym, const years& dy) NOEXCEPT
+{
+ return ym + -dy;
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month& ym)
+{
+ return os << ym.year() << '/' << ym.month();
+}
+
+// month_day
+
+CONSTCD11
+inline
+month_day::month_day(const date::month& m, const date::day& d) NOEXCEPT
+ : m_(m)
+ , d_(d)
+ {}
+
+CONSTCD11 inline date::month month_day::month() const NOEXCEPT {return m_;}
+CONSTCD11 inline date::day month_day::day() const NOEXCEPT {return d_;}
+
+CONSTCD14
+inline
+bool
+month_day::ok() const NOEXCEPT
+{
+ CONSTDATA date::day d[] =
+ {
+ date::day(31), date::day(29), date::day(31),
+ date::day(30), date::day(31), date::day(30),
+ date::day(31), date::day(31), date::day(30),
+ date::day(31), date::day(30), date::day(31)
+ };
+ return m_.ok() && date::day{1} <= d_ && d_ <= d[static_cast<unsigned>(m_)-1];
+}
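+
+// Note: the table above admits February 29 because a month_day carries no
+// year; whether day 29 actually exists is only decidable once a year is
+// attached (see year_month_day::ok()).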
+
+CONSTCD11
+inline
+bool
+operator==(const month_day& x, const month_day& y) NOEXCEPT
+{
+ return x.month() == y.month() && x.day() == y.day();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const month_day& x, const month_day& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const month_day& x, const month_day& y) NOEXCEPT
+{
+ return x.month() < y.month() ? true
+ : (x.month() > y.month() ? false
+ : (x.day() < y.day()));
+}
+
+CONSTCD11
+inline
+bool
+operator>(const month_day& x, const month_day& y) NOEXCEPT
+{
+ return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const month_day& x, const month_day& y) NOEXCEPT
+{
+ return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const month_day& x, const month_day& y) NOEXCEPT
+{
+ return !(x < y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_day& md)
+{
+ return os << md.month() << '/' << md.day();
+}
+
+// month_day_last
+
+CONSTCD11 inline month month_day_last::month() const NOEXCEPT {return m_;}
+CONSTCD11 inline bool month_day_last::ok() const NOEXCEPT {return m_.ok();}
+CONSTCD11 inline month_day_last::month_day_last(const date::month& m) NOEXCEPT : m_(m) {}
+
+CONSTCD11
+inline
+bool
+operator==(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+ return x.month() == y.month();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+ return x.month() < y.month();
+}
+
+CONSTCD11
+inline
+bool
+operator>(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+ return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+ return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+ return !(x < y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_day_last& mdl)
+{
+ return os << mdl.month() << "/last";
+}
+
+// month_weekday
+
+CONSTCD11
+inline
+month_weekday::month_weekday(const date::month& m,
+ const date::weekday_indexed& wdi) NOEXCEPT
+ : m_(m)
+ , wdi_(wdi)
+ {}
+
+CONSTCD11 inline month month_weekday::month() const NOEXCEPT {return m_;}
+
+CONSTCD11
+inline
+weekday_indexed
+month_weekday::weekday_indexed() const NOEXCEPT
+{
+ return wdi_;
+}
+
+CONSTCD11
+inline
+bool
+month_weekday::ok() const NOEXCEPT
+{
+ return m_.ok() && wdi_.ok();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const month_weekday& x, const month_weekday& y) NOEXCEPT
+{
+ return x.month() == y.month() && x.weekday_indexed() == y.weekday_indexed();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const month_weekday& x, const month_weekday& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_weekday& mwd)
+{
+ return os << mwd.month() << '/' << mwd.weekday_indexed();
+}
+
+// month_weekday_last
+
+CONSTCD11
+inline
+month_weekday_last::month_weekday_last(const date::month& m,
+ const date::weekday_last& wdl) NOEXCEPT
+ : m_(m)
+ , wdl_(wdl)
+ {}
+
+CONSTCD11 inline month month_weekday_last::month() const NOEXCEPT {return m_;}
+
+CONSTCD11
+inline
+weekday_last
+month_weekday_last::weekday_last() const NOEXCEPT
+{
+ return wdl_;
+}
+
+CONSTCD11
+inline
+bool
+month_weekday_last::ok() const NOEXCEPT
+{
+ return m_.ok() && wdl_.ok();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT
+{
+ return x.month() == y.month() && x.weekday_last() == y.weekday_last();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_weekday_last& mwdl)
+{
+ return os << mwdl.month() << '/' << mwdl.weekday_last();
+}
+
+// year_month_day_last
+
+CONSTCD11
+inline
+year_month_day_last::year_month_day_last(const date::year& y,
+ const date::month_day_last& mdl) NOEXCEPT
+ : y_(y)
+ , mdl_(mdl)
+ {}
+
+template<class>
+CONSTCD14
+inline
+year_month_day_last&
+year_month_day_last::operator+=(const months& m) NOEXCEPT
+{
+ *this = *this + m;
+ return *this;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day_last&
+year_month_day_last::operator-=(const months& m) NOEXCEPT
+{
+ *this = *this - m;
+ return *this;
+}
+
+CONSTCD14
+inline
+year_month_day_last&
+year_month_day_last::operator+=(const years& y) NOEXCEPT
+{
+ *this = *this + y;
+ return *this;
+}
+
+CONSTCD14
+inline
+year_month_day_last&
+year_month_day_last::operator-=(const years& y) NOEXCEPT
+{
+ *this = *this - y;
+ return *this;
+}
+
+CONSTCD11 inline year year_month_day_last::year() const NOEXCEPT {return y_;}
+CONSTCD11 inline month year_month_day_last::month() const NOEXCEPT {return mdl_.month();}
+
+CONSTCD11
+inline
+month_day_last
+year_month_day_last::month_day_last() const NOEXCEPT
+{
+ return mdl_;
+}
+
+CONSTCD14
+inline
+day
+year_month_day_last::day() const NOEXCEPT
+{
+ CONSTDATA date::day d[] =
+ {
+ date::day(31), date::day(28), date::day(31),
+ date::day(30), date::day(31), date::day(30),
+ date::day(31), date::day(31), date::day(30),
+ date::day(31), date::day(30), date::day(31)
+ };
+ return (month() != February || !y_.is_leap()) && mdl_.ok() ?
+ d[static_cast<unsigned>(month()) - 1] : date::day{29};
+}
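+
+// Note: the table holds each month's last day in a common year; the
+// conditional yields day{29} exactly for February of a leap year (and,
+// degenerately, when mdl_ is invalid).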
+
+CONSTCD14
+inline
+year_month_day_last::operator sys_days() const NOEXCEPT
+{
+ return sys_days(year()/month()/day());
+}
+
+CONSTCD14
+inline
+year_month_day_last::operator local_days() const NOEXCEPT
+{
+ return local_days(year()/month()/day());
+}
+
+CONSTCD11
+inline
+bool
+year_month_day_last::ok() const NOEXCEPT
+{
+ return y_.ok() && mdl_.ok();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+ return x.year() == y.year() && x.month_day_last() == y.month_day_last();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+ return x.year() < y.year() ? true
+ : (x.year() > y.year() ? false
+ : (x.month_day_last() < y.month_day_last()));
+}
+
+CONSTCD11
+inline
+bool
+operator>(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+ return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+ return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+ return !(x < y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_day_last& ymdl)
+{
+ return os << ymdl.year() << '/' << ymdl.month_day_last();
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day_last
+operator+(const year_month_day_last& ymdl, const months& dm) NOEXCEPT
+{
+ return (ymdl.year() / ymdl.month() + dm) / last;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day_last
+operator+(const months& dm, const year_month_day_last& ymdl) NOEXCEPT
+{
+ return ymdl + dm;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day_last
+operator-(const year_month_day_last& ymdl, const months& dm) NOEXCEPT
+{
+ return ymdl + (-dm);
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator+(const year_month_day_last& ymdl, const years& dy) NOEXCEPT
+{
+ return {ymdl.year()+dy, ymdl.month_day_last()};
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator+(const years& dy, const year_month_day_last& ymdl) NOEXCEPT
+{
+ return ymdl + dy;
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator-(const year_month_day_last& ymdl, const years& dy) NOEXCEPT
+{
+ return ymdl + (-dy);
+}
+
+// year_month_day
+
+CONSTCD11
+inline
+year_month_day::year_month_day(const date::year& y, const date::month& m,
+ const date::day& d) NOEXCEPT
+ : y_(y)
+ , m_(m)
+ , d_(d)
+ {}
+
+CONSTCD14
+inline
+year_month_day::year_month_day(const year_month_day_last& ymdl) NOEXCEPT
+ : y_(ymdl.year())
+ , m_(ymdl.month())
+ , d_(ymdl.day())
+ {}
+
+CONSTCD14
+inline
+year_month_day::year_month_day(sys_days dp) NOEXCEPT
+ : year_month_day(from_days(dp.time_since_epoch()))
+ {}
+
+CONSTCD14
+inline
+year_month_day::year_month_day(local_days dp) NOEXCEPT
+ : year_month_day(from_days(dp.time_since_epoch()))
+ {}
+
+CONSTCD11 inline year year_month_day::year() const NOEXCEPT {return y_;}
+CONSTCD11 inline month year_month_day::month() const NOEXCEPT {return m_;}
+CONSTCD11 inline day year_month_day::day() const NOEXCEPT {return d_;}
+
+template<class>
+CONSTCD14
+inline
+year_month_day&
+year_month_day::operator+=(const months& m) NOEXCEPT
+{
+ *this = *this + m;
+ return *this;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day&
+year_month_day::operator-=(const months& m) NOEXCEPT
+{
+ *this = *this - m;
+ return *this;
+}
+
+CONSTCD14
+inline
+year_month_day&
+year_month_day::operator+=(const years& y) NOEXCEPT
+{
+ *this = *this + y;
+ return *this;
+}
+
+CONSTCD14
+inline
+year_month_day&
+year_month_day::operator-=(const years& y) NOEXCEPT
+{
+ *this = *this - y;
+ return *this;
+}
+
+CONSTCD14
+inline
+days
+year_month_day::to_days() const NOEXCEPT
+{
+ static_assert(std::numeric_limits<unsigned>::digits >= 18,
+ "This algorithm has not been ported to a 16 bit unsigned integer");
+ static_assert(std::numeric_limits<int>::digits >= 20,
+ "This algorithm has not been ported to a 16 bit signed integer");
+ auto const y = static_cast<int>(y_) - (m_ <= February);
+ auto const m = static_cast<unsigned>(m_);
+ auto const d = static_cast<unsigned>(d_);
+ auto const era = (y >= 0 ? y : y-399) / 400;
+ auto const yoe = static_cast<unsigned>(y - era * 400); // [0, 399]
+ auto const doy = (153*(m > 2 ? m-3 : m+9) + 2)/5 + d-1; // [0, 365]
+ auto const doe = yoe * 365 + yoe/4 - yoe/100 + doy; // [0, 146096]
+ return days{era * 146097 + static_cast<int>(doe) - 719468};
+}
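+
+// Note: this is the days_from_civil algorithm from Howard Hinnant's
+// "chrono-Compatible Low-Level Date Algorithms".  Years are shifted to
+// begin on March 1 (so the leap day is the last day of the shifted year),
+// an era is one 400-year Gregorian cycle of 146097 days, and 719468 is
+// the number of days from 0000-03-01 to the 1970-01-01 epoch, e.g.
+//   sys_days(1970_y/jan/1_d).time_since_epoch() == days{0}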
+
+CONSTCD14
+inline
+year_month_day::operator sys_days() const NOEXCEPT
+{
+ return sys_days{to_days()};
+}
+
+CONSTCD14
+inline
+year_month_day::operator local_days() const NOEXCEPT
+{
+ return local_days{to_days()};
+}
+
+CONSTCD14
+inline
+bool
+year_month_day::ok() const NOEXCEPT
+{
+ if (!(y_.ok() && m_.ok()))
+ return false;
+ return date::day{1} <= d_ && d_ <= (y_ / m_ / last).day();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+ return x.year() == y.year() && x.month() == y.month() && x.day() == y.day();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+ return x.year() < y.year() ? true
+ : (x.year() > y.year() ? false
+ : (x.month() < y.month() ? true
+ : (x.month() > y.month() ? false
+ : (x.day() < y.day()))));
+}
+
+CONSTCD11
+inline
+bool
+operator>(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+ return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+ return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+ return !(x < y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_day& ymd)
+{
+ detail::save_ostream<CharT, Traits> _(os);
+ os.fill('0');
+ os.flags(std::ios::dec | std::ios::right);
+ os.imbue(std::locale::classic());
+ os << ymd.year() << '-';
+ os.width(2);
+ os << static_cast<unsigned>(ymd.month()) << '-';
+ os << ymd.day();
+ if (!ymd.ok())
+ os << " is not a valid date";
+ return os;
+}
+
+CONSTCD14
+inline
+year_month_day
+year_month_day::from_days(days dp) NOEXCEPT
+{
+ static_assert(std::numeric_limits<unsigned>::digits >= 18,
+ "This algorithm has not been ported to a 16 bit unsigned integer");
+ static_assert(std::numeric_limits<int>::digits >= 20,
+ "This algorithm has not been ported to a 16 bit signed integer");
+ auto const z = dp.count() + 719468;
+ auto const era = (z >= 0 ? z : z - 146096) / 146097;
+ auto const doe = static_cast<unsigned>(z - era * 146097); // [0, 146096]
+ auto const yoe = (doe - doe/1460 + doe/36524 - doe/146096) / 365; // [0, 399]
+ auto const y = static_cast<days::rep>(yoe) + era * 400;
+ auto const doy = doe - (365*yoe + yoe/4 - yoe/100); // [0, 365]
+ auto const mp = (5*doy + 2)/153; // [0, 11]
+ auto const d = doy - (153*mp+2)/5 + 1; // [1, 31]
+ auto const m = mp < 10 ? mp+3 : mp-9; // [1, 12]
+ return year_month_day{date::year{y + (m <= 2)}, date::month(m), date::day(d)};
+}
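+
+// Note: the inverse (civil_from_days): era, year-of-era, day-of-year and
+// the March-based month are recovered from the day count and shifted
+// back, e.g. year_month_day{sys_days{days{0}}} == 1970_y/jan/1_d.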
+
+template<class>
+CONSTCD14
+inline
+year_month_day
+operator+(const year_month_day& ymd, const months& dm) NOEXCEPT
+{
+ return (ymd.year() / ymd.month() + dm) / ymd.day();
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day
+operator+(const months& dm, const year_month_day& ymd) NOEXCEPT
+{
+ return ymd + dm;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day
+operator-(const year_month_day& ymd, const months& dm) NOEXCEPT
+{
+ return ymd + (-dm);
+}
+
+CONSTCD11
+inline
+year_month_day
+operator+(const year_month_day& ymd, const years& dy) NOEXCEPT
+{
+ return (ymd.year() + dy) / ymd.month() / ymd.day();
+}
+
+CONSTCD11
+inline
+year_month_day
+operator+(const years& dy, const year_month_day& ymd) NOEXCEPT
+{
+ return ymd + dy;
+}
+
+CONSTCD11
+inline
+year_month_day
+operator-(const year_month_day& ymd, const years& dy) NOEXCEPT
+{
+ return ymd + (-dy);
+}
+
+// year_month_weekday
+
+CONSTCD11
+inline
+year_month_weekday::year_month_weekday(const date::year& y, const date::month& m,
+ const date::weekday_indexed& wdi)
+ NOEXCEPT
+ : y_(y)
+ , m_(m)
+ , wdi_(wdi)
+ {}
+
+CONSTCD14
+inline
+year_month_weekday::year_month_weekday(const sys_days& dp) NOEXCEPT
+ : year_month_weekday(from_days(dp.time_since_epoch()))
+ {}
+
+CONSTCD14
+inline
+year_month_weekday::year_month_weekday(const local_days& dp) NOEXCEPT
+ : year_month_weekday(from_days(dp.time_since_epoch()))
+ {}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday&
+year_month_weekday::operator+=(const months& m) NOEXCEPT
+{
+ *this = *this + m;
+ return *this;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday&
+year_month_weekday::operator-=(const months& m) NOEXCEPT
+{
+ *this = *this - m;
+ return *this;
+}
+
+CONSTCD14
+inline
+year_month_weekday&
+year_month_weekday::operator+=(const years& y) NOEXCEPT
+{
+ *this = *this + y;
+ return *this;
+}
+
+CONSTCD14
+inline
+year_month_weekday&
+year_month_weekday::operator-=(const years& y) NOEXCEPT
+{
+ *this = *this - y;
+ return *this;
+}
+
+CONSTCD11 inline year year_month_weekday::year() const NOEXCEPT {return y_;}
+CONSTCD11 inline month year_month_weekday::month() const NOEXCEPT {return m_;}
+
+CONSTCD11
+inline
+weekday
+year_month_weekday::weekday() const NOEXCEPT
+{
+ return wdi_.weekday();
+}
+
+CONSTCD11
+inline
+unsigned
+year_month_weekday::index() const NOEXCEPT
+{
+ return wdi_.index();
+}
+
+CONSTCD11
+inline
+weekday_indexed
+year_month_weekday::weekday_indexed() const NOEXCEPT
+{
+ return wdi_;
+}
+
+CONSTCD14
+inline
+year_month_weekday::operator sys_days() const NOEXCEPT
+{
+ return sys_days{to_days()};
+}
+
+CONSTCD14
+inline
+year_month_weekday::operator local_days() const NOEXCEPT
+{
+ return local_days{to_days()};
+}
+
+CONSTCD14
+inline
+bool
+year_month_weekday::ok() const NOEXCEPT
+{
+ if (!y_.ok() || !m_.ok() || !wdi_.weekday().ok() || wdi_.index() < 1)
+ return false;
+ if (wdi_.index() <= 4)
+ return true;
+ auto d2 = wdi_.weekday() - date::weekday(static_cast<sys_days>(y_/m_/1)) +
+ days((wdi_.index()-1)*7 + 1);
+ return static_cast<unsigned>(d2.count()) <= static_cast<unsigned>((y_/m_/last).day());
+}
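+
+// Note on the index == 5 branch above: every month contains at least four of
+// each weekday, so indexes 1-4 are always valid. For a fifth occurrence, d2
+// is the day-of-month on which it would fall (offset of the first occurrence
+// from the 1st, plus four whole weeks, plus one), and the date is valid only
+// if that day does not exceed the month's last day.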
+
+CONSTCD14
+inline
+year_month_weekday
+year_month_weekday::from_days(days d) NOEXCEPT
+{
+ sys_days dp{d};
+ auto const wd = date::weekday(dp);
+ auto const ymd = year_month_day(dp);
+ return {ymd.year(), ymd.month(), wd[(static_cast<unsigned>(ymd.day())-1)/7+1]};
+}
+
+CONSTCD14
+inline
+days
+year_month_weekday::to_days() const NOEXCEPT
+{
+ auto d = sys_days(y_/m_/1);
+ return (d + (wdi_.weekday() - date::weekday(d) + days{(wdi_.index()-1)*7})
+ ).time_since_epoch();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT
+{
+ return x.year() == y.year() && x.month() == y.month() &&
+ x.weekday_indexed() == y.weekday_indexed();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_weekday& ymwdi)
+{
+ return os << ymwdi.year() << '/' << ymwdi.month()
+ << '/' << ymwdi.weekday_indexed();
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday
+operator+(const year_month_weekday& ymwd, const months& dm) NOEXCEPT
+{
+ return (ymwd.year() / ymwd.month() + dm) / ymwd.weekday_indexed();
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday
+operator+(const months& dm, const year_month_weekday& ymwd) NOEXCEPT
+{
+ return ymwd + dm;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday
+operator-(const year_month_weekday& ymwd, const months& dm) NOEXCEPT
+{
+ return ymwd + (-dm);
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator+(const year_month_weekday& ymwd, const years& dy) NOEXCEPT
+{
+ return {ymwd.year()+dy, ymwd.month(), ymwd.weekday_indexed()};
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator+(const years& dy, const year_month_weekday& ymwd) NOEXCEPT
+{
+ return ymwd + dy;
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator-(const year_month_weekday& ymwd, const years& dy) NOEXCEPT
+{
+ return ymwd + (-dy);
+}
+
+// year_month_weekday_last
+
+CONSTCD11
+inline
+year_month_weekday_last::year_month_weekday_last(const date::year& y,
+ const date::month& m,
+ const date::weekday_last& wdl) NOEXCEPT
+ : y_(y)
+ , m_(m)
+ , wdl_(wdl)
+ {}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday_last&
+year_month_weekday_last::operator+=(const months& m) NOEXCEPT
+{
+ *this = *this + m;
+ return *this;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday_last&
+year_month_weekday_last::operator-=(const months& m) NOEXCEPT
+{
+ *this = *this - m;
+ return *this;
+}
+
+CONSTCD14
+inline
+year_month_weekday_last&
+year_month_weekday_last::operator+=(const years& y) NOEXCEPT
+{
+ *this = *this + y;
+ return *this;
+}
+
+CONSTCD14
+inline
+year_month_weekday_last&
+year_month_weekday_last::operator-=(const years& y) NOEXCEPT
+{
+ *this = *this - y;
+ return *this;
+}
+
+CONSTCD11 inline year year_month_weekday_last::year() const NOEXCEPT {return y_;}
+CONSTCD11 inline month year_month_weekday_last::month() const NOEXCEPT {return m_;}
+
+CONSTCD11
+inline
+weekday
+year_month_weekday_last::weekday() const NOEXCEPT
+{
+ return wdl_.weekday();
+}
+
+CONSTCD11
+inline
+weekday_last
+year_month_weekday_last::weekday_last() const NOEXCEPT
+{
+ return wdl_;
+}
+
+CONSTCD14
+inline
+year_month_weekday_last::operator sys_days() const NOEXCEPT
+{
+ return sys_days{to_days()};
+}
+
+CONSTCD14
+inline
+year_month_weekday_last::operator local_days() const NOEXCEPT
+{
+ return local_days{to_days()};
+}
+
+CONSTCD11
+inline
+bool
+year_month_weekday_last::ok() const NOEXCEPT
+{
+ return y_.ok() && m_.ok() && wdl_.ok();
+}
+
+CONSTCD14
+inline
+days
+year_month_weekday_last::to_days() const NOEXCEPT
+{
+ auto const d = sys_days(y_/m_/last);
+ return (d - (date::weekday{d} - wdl_.weekday())).time_since_epoch();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT
+{
+ return x.year() == y.year() && x.month() == y.month() &&
+ x.weekday_last() == y.weekday_last();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT
+{
+ return !(x == y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_weekday_last& ymwdl)
+{
+ return os << ymwdl.year() << '/' << ymwdl.month() << '/' << ymwdl.weekday_last();
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday_last
+operator+(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT
+{
+ return (ymwdl.year() / ymwdl.month() + dm) / ymwdl.weekday_last();
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday_last
+operator+(const months& dm, const year_month_weekday_last& ymwdl) NOEXCEPT
+{
+ return ymwdl + dm;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday_last
+operator-(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT
+{
+ return ymwdl + (-dm);
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator+(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT
+{
+ return {ymwdl.year()+dy, ymwdl.month(), ymwdl.weekday_last()};
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator+(const years& dy, const year_month_weekday_last& ymwdl) NOEXCEPT
+{
+ return ymwdl + dy;
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator-(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT
+{
+ return ymwdl + (-dy);
+}
+
+// year_month from operator/()
+
+CONSTCD11
+inline
+year_month
+operator/(const year& y, const month& m) NOEXCEPT
+{
+ return {y, m};
+}
+
+CONSTCD11
+inline
+year_month
+operator/(const year& y, int m) NOEXCEPT
+{
+ return y / month(static_cast<unsigned>(m));
+}
+
+// month_day from operator/()
+
+CONSTCD11
+inline
+month_day
+operator/(const month& m, const day& d) NOEXCEPT
+{
+ return {m, d};
+}
+
+CONSTCD11
+inline
+month_day
+operator/(const day& d, const month& m) NOEXCEPT
+{
+ return m / d;
+}
+
+CONSTCD11
+inline
+month_day
+operator/(const month& m, int d) NOEXCEPT
+{
+ return m / day(static_cast<unsigned>(d));
+}
+
+CONSTCD11
+inline
+month_day
+operator/(int m, const day& d) NOEXCEPT
+{
+ return month(static_cast<unsigned>(m)) / d;
+}
+
+CONSTCD11 inline month_day operator/(const day& d, int m) NOEXCEPT {return m / d;}
+
+// month_day_last from operator/()
+
+CONSTCD11
+inline
+month_day_last
+operator/(const month& m, last_spec) NOEXCEPT
+{
+ return month_day_last{m};
+}
+
+CONSTCD11
+inline
+month_day_last
+operator/(last_spec, const month& m) NOEXCEPT
+{
+ return m/last;
+}
+
+CONSTCD11
+inline
+month_day_last
+operator/(int m, last_spec) NOEXCEPT
+{
+ return month(static_cast<unsigned>(m))/last;
+}
+
+CONSTCD11
+inline
+month_day_last
+operator/(last_spec, int m) NOEXCEPT
+{
+ return m/last;
+}
+
+// month_weekday from operator/()
+
+CONSTCD11
+inline
+month_weekday
+operator/(const month& m, const weekday_indexed& wdi) NOEXCEPT
+{
+ return {m, wdi};
+}
+
+CONSTCD11
+inline
+month_weekday
+operator/(const weekday_indexed& wdi, const month& m) NOEXCEPT
+{
+ return m / wdi;
+}
+
+CONSTCD11
+inline
+month_weekday
+operator/(int m, const weekday_indexed& wdi) NOEXCEPT
+{
+ return month(static_cast<unsigned>(m)) / wdi;
+}
+
+CONSTCD11
+inline
+month_weekday
+operator/(const weekday_indexed& wdi, int m) NOEXCEPT
+{
+ return m / wdi;
+}
+
+// month_weekday_last from operator/()
+
+CONSTCD11
+inline
+month_weekday_last
+operator/(const month& m, const weekday_last& wdl) NOEXCEPT
+{
+ return {m, wdl};
+}
+
+CONSTCD11
+inline
+month_weekday_last
+operator/(const weekday_last& wdl, const month& m) NOEXCEPT
+{
+ return m / wdl;
+}
+
+CONSTCD11
+inline
+month_weekday_last
+operator/(int m, const weekday_last& wdl) NOEXCEPT
+{
+ return month(static_cast<unsigned>(m)) / wdl;
+}
+
+CONSTCD11
+inline
+month_weekday_last
+operator/(const weekday_last& wdl, int m) NOEXCEPT
+{
+ return m / wdl;
+}
+
+// year_month_day from operator/()
+
+CONSTCD11
+inline
+year_month_day
+operator/(const year_month& ym, const day& d) NOEXCEPT
+{
+ return {ym.year(), ym.month(), d};
+}
+
+CONSTCD11
+inline
+year_month_day
+operator/(const year_month& ym, int d) NOEXCEPT
+{
+ return ym / day(static_cast<unsigned>(d));
+}
+
+CONSTCD11
+inline
+year_month_day
+operator/(const year& y, const month_day& md) NOEXCEPT
+{
+ return y / md.month() / md.day();
+}
+
+CONSTCD11
+inline
+year_month_day
+operator/(int y, const month_day& md) NOEXCEPT
+{
+ return year(y) / md;
+}
+
+CONSTCD11
+inline
+year_month_day
+operator/(const month_day& md, const year& y) NOEXCEPT
+{
+ return y / md;
+}
+
+CONSTCD11
+inline
+year_month_day
+operator/(const month_day& md, int y) NOEXCEPT
+{
+ return year(y) / md;
+}
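+
+// A minimal usage sketch of the compositions above: each operator/ binds one
+// more field, so all of the following denote the same year_month_day:
+//   year{2015} / month{3} / day{22}
+//   year{2015} / 3 / 22        // ints are converted to month and day
+//   month{3} / day{22} / 2015  // month_day / int -> year_month_day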
+
+// year_month_day_last from operator/()
+
+CONSTCD11
+inline
+year_month_day_last
+operator/(const year_month& ym, last_spec) NOEXCEPT
+{
+ return {ym.year(), month_day_last{ym.month()}};
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator/(const year& y, const month_day_last& mdl) NOEXCEPT
+{
+ return {y, mdl};
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator/(int y, const month_day_last& mdl) NOEXCEPT
+{
+ return year(y) / mdl;
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator/(const month_day_last& mdl, const year& y) NOEXCEPT
+{
+ return y / mdl;
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator/(const month_day_last& mdl, int y) NOEXCEPT
+{
+ return year(y) / mdl;
+}
+
+// year_month_weekday from operator/()
+
+CONSTCD11
+inline
+year_month_weekday
+operator/(const year_month& ym, const weekday_indexed& wdi) NOEXCEPT
+{
+ return {ym.year(), ym.month(), wdi};
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator/(const year& y, const month_weekday& mwd) NOEXCEPT
+{
+ return {y, mwd.month(), mwd.weekday_indexed()};
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator/(int y, const month_weekday& mwd) NOEXCEPT
+{
+ return year(y) / mwd;
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator/(const month_weekday& mwd, const year& y) NOEXCEPT
+{
+ return y / mwd;
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator/(const month_weekday& mwd, int y) NOEXCEPT
+{
+ return year(y) / mwd;
+}
+
+// year_month_weekday_last from operator/()
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator/(const year_month& ym, const weekday_last& wdl) NOEXCEPT
+{
+ return {ym.year(), ym.month(), wdl};
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator/(const year& y, const month_weekday_last& mwdl) NOEXCEPT
+{
+ return {y, mwdl.month(), mwdl.weekday_last()};
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator/(int y, const month_weekday_last& mwdl) NOEXCEPT
+{
+ return year(y) / mwdl;
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator/(const month_weekday_last& mwdl, const year& y) NOEXCEPT
+{
+ return y / mwdl;
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator/(const month_weekday_last& mwdl, int y) NOEXCEPT
+{
+ return year(y) / mwdl;
+}
+
+template <class Duration>
+struct fields;
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const fields<Duration>& fds, const std::string* abbrev = nullptr,
+ const std::chrono::seconds* offset_sec = nullptr);
+
+template <class CharT, class Traits, class Duration, class Alloc>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+ fields<Duration>& fds, std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr);
+
+// hh_mm_ss
+
+namespace detail
+{
+
+struct undocumented {explicit undocumented() = default;};
+
+// width<n>::value is the number of fractional decimal digits in 1/n
+// width<0>::value and width<1>::value are defined to be 0
+// If 1/n takes more than 18 fractional decimal digits,
+// the result is truncated to 19.
+// Example: width<2>::value == 1
+// Example: width<3>::value == 19
+// Example: width<4>::value == 2
+// Example: width<10>::value == 1
+// Example: width<1000>::value == 3
+template <std::uint64_t n, std::uint64_t d = 10, unsigned w = 0,
+ bool should_continue = !(n < 2) && d != 0 && (w < 19)>
+struct width
+{
+ static CONSTDATA unsigned value = 1 + width<n, d%n*10, w+1>::value;
+};
+
+template <std::uint64_t n, std::uint64_t d, unsigned w>
+struct width<n, d, w, false>
+{
+ static CONSTDATA unsigned value = 0;
+};
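+
+// A worked trace of width above: width<4> performs the long division of 1/4
+// one digit at a time: d = 10 -> 10%4*10 = 20 -> 20%4*10 = 0, stopping when
+// the remainder reaches zero, so width<4>::value == 2 (1/4 == 0.25).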
+
+template <unsigned exp>
+struct static_pow10
+{
+private:
+ static CONSTDATA std::uint64_t h = static_pow10<exp/2>::value;
+public:
+ static CONSTDATA std::uint64_t value = h * h * (exp % 2 ? 10 : 1);
+};
+
+template <>
+struct static_pow10<0>
+{
+ static CONSTDATA std::uint64_t value = 1;
+};
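+
+// static_pow10 halves the exponent at each step, e.g.
+// static_pow10<5>::value == 100 * 100 * 10 == 100000.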
+
+template <class Duration>
+class decimal_format_seconds
+{
+ using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+ using rep = typename CT::rep;
+public:
+ static unsigned constexpr width = detail::width<CT::period::den>::value < 19 ?
+ detail::width<CT::period::den>::value : 6u;
+ using precision = std::chrono::duration<rep,
+ std::ratio<1, static_pow10<width>::value>>;
+
+private:
+ std::chrono::seconds s_;
+ precision sub_s_;
+
+public:
+ CONSTCD11 decimal_format_seconds()
+ : s_()
+ , sub_s_()
+ {}
+
+ CONSTCD11 explicit decimal_format_seconds(const Duration& d) NOEXCEPT
+ : s_(std::chrono::duration_cast<std::chrono::seconds>(d))
+ , sub_s_(std::chrono::duration_cast<precision>(d - s_))
+ {}
+
+ CONSTCD14 std::chrono::seconds& seconds() NOEXCEPT {return s_;}
+ CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_;}
+ CONSTCD11 precision subseconds() const NOEXCEPT {return sub_s_;}
+
+ CONSTCD14 precision to_duration() const NOEXCEPT
+ {
+ return s_ + sub_s_;
+ }
+
+ CONSTCD11 bool in_conventional_range() const NOEXCEPT
+ {
+ return sub_s_ < std::chrono::seconds{1} && s_ < std::chrono::minutes{1};
+ }
+
+ template <class CharT, class Traits>
+ friend
+ std::basic_ostream<CharT, Traits>&
+ operator<<(std::basic_ostream<CharT, Traits>& os, const decimal_format_seconds& x)
+ {
+ return x.print(os, std::chrono::treat_as_floating_point<rep>{});
+ }
+
+ template <class CharT, class Traits>
+ std::basic_ostream<CharT, Traits>&
+ print(std::basic_ostream<CharT, Traits>& os, std::true_type) const
+ {
+ date::detail::save_ostream<CharT, Traits> _(os);
+ std::chrono::duration<rep> d = s_ + sub_s_;
+ if (d < std::chrono::seconds{10})
+ os << '0';
+ os << std::fixed << d.count();
+ return os;
+ }
+
+ template <class CharT, class Traits>
+ std::basic_ostream<CharT, Traits>&
+ print(std::basic_ostream<CharT, Traits>& os, std::false_type) const
+ {
+ date::detail::save_ostream<CharT, Traits> _(os);
+ os.fill('0');
+ os.flags(std::ios::dec | std::ios::right);
+ os.width(2);
+ os << s_.count();
+ if (width > 0)
+ {
+#if !ONLY_C_LOCALE
+ os << std::use_facet<std::numpunct<CharT>>(os.getloc()).decimal_point();
+#else
+ os << '.';
+#endif
+ date::detail::save_ostream<CharT, Traits> _s(os);
+ os.imbue(std::locale::classic());
+ os.width(width);
+ os << sub_s_.count();
+ }
+ return os;
+ }
+};
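+
+// A sketch of how decimal_format_seconds behaves for a concrete Duration:
+// with Duration = std::chrono::milliseconds, CT::period::den == 1000, so
+// width == 3 and precision is a millisecond-resolution duration. Constructing
+// it from 1234ms splits the value into s_ == 1s and sub_s_ == 234ms, and the
+// integral print path above emits "01.234" (using the locale's decimal point).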
+
+template <class Rep, class Period>
+inline
+CONSTCD11
+typename std::enable_if
+ <
+ std::numeric_limits<Rep>::is_signed,
+ std::chrono::duration<Rep, Period>
+ >::type
+abs(std::chrono::duration<Rep, Period> d)
+{
+ return d >= d.zero() ? +d : -d;
+}
+
+template <class Rep, class Period>
+inline
+CONSTCD11
+typename std::enable_if
+ <
+ !std::numeric_limits<Rep>::is_signed,
+ std::chrono::duration<Rep, Period>
+ >::type
+abs(std::chrono::duration<Rep, Period> d)
+{
+ return d;
+}
+
+} // namespace detail
+
+template <class Duration>
+class hh_mm_ss
+{
+ using dfs = detail::decimal_format_seconds<typename std::common_type<Duration,
+ std::chrono::seconds>::type>;
+
+ std::chrono::hours h_;
+ std::chrono::minutes m_;
+ dfs s_;
+ bool neg_;
+
+public:
+ static unsigned CONSTDATA fractional_width = dfs::width;
+ using precision = typename dfs::precision;
+
+ CONSTCD11 hh_mm_ss() NOEXCEPT
+ : hh_mm_ss(Duration::zero())
+ {}
+
+ CONSTCD11 explicit hh_mm_ss(Duration d) NOEXCEPT
+ : h_(std::chrono::duration_cast<std::chrono::hours>(detail::abs(d)))
+ , m_(std::chrono::duration_cast<std::chrono::minutes>(detail::abs(d)) - h_)
+ , s_(detail::abs(d) - h_ - m_)
+ , neg_(d < Duration::zero())
+ {}
+
+ CONSTCD11 std::chrono::hours hours() const NOEXCEPT {return h_;}
+ CONSTCD11 std::chrono::minutes minutes() const NOEXCEPT {return m_;}
+ CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_.seconds();}
+ CONSTCD14 std::chrono::seconds&
+ seconds(detail::undocumented) NOEXCEPT {return s_.seconds();}
+ CONSTCD11 precision subseconds() const NOEXCEPT {return s_.subseconds();}
+ CONSTCD11 bool is_negative() const NOEXCEPT {return neg_;}
+
+ CONSTCD11 explicit operator precision() const NOEXCEPT {return to_duration();}
+ CONSTCD11 precision to_duration() const NOEXCEPT
+ {return (s_.to_duration() + m_ + h_) * (1-2*neg_);}
+
+ CONSTCD11 bool in_conventional_range() const NOEXCEPT
+ {
+ return !neg_ && h_ < days{1} && m_ < std::chrono::hours{1} &&
+ s_.in_conventional_range();
+ }
+
+private:
+
+ template <class charT, class traits>
+ friend
+ std::basic_ostream<charT, traits>&
+ operator<<(std::basic_ostream<charT, traits>& os, hh_mm_ss const& tod)
+ {
+ if (tod.is_negative())
+ os << '-';
+ if (tod.h_ < std::chrono::hours{10})
+ os << '0';
+ os << tod.h_.count() << ':';
+ if (tod.m_ < std::chrono::minutes{10})
+ os << '0';
+ os << tod.m_.count() << ':' << tod.s_;
+ return os;
+ }
+
+ template <class CharT, class Traits, class Duration2>
+ friend
+ std::basic_ostream<CharT, Traits>&
+ date::to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const fields<Duration2>& fds, const std::string* abbrev,
+ const std::chrono::seconds* offset_sec);
+
+ template <class CharT, class Traits, class Duration2, class Alloc>
+ friend
+ std::basic_istream<CharT, Traits>&
+ date::from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+ fields<Duration2>& fds,
+ std::basic_string<CharT, Traits, Alloc>* abbrev, std::chrono::minutes* offset);
+};
+
+inline
+CONSTCD14
+bool
+is_am(std::chrono::hours const& h) NOEXCEPT
+{
+ using std::chrono::hours;
+ return hours{0} <= h && h < hours{12};
+}
+
+inline
+CONSTCD14
+bool
+is_pm(std::chrono::hours const& h) NOEXCEPT
+{
+ using std::chrono::hours;
+ return hours{12} <= h && h < hours{24};
+}
+
+inline
+CONSTCD14
+std::chrono::hours
+make12(std::chrono::hours h) NOEXCEPT
+{
+ using std::chrono::hours;
+ if (h < hours{12})
+ {
+ if (h == hours{0})
+ h = hours{12};
+ }
+ else
+ {
+ if (h != hours{12})
+ h = h - hours{12};
+ }
+ return h;
+}
+
+inline
+CONSTCD14
+std::chrono::hours
+make24(std::chrono::hours h, bool is_pm) NOEXCEPT
+{
+ using std::chrono::hours;
+ if (is_pm)
+ {
+ if (h != hours{12})
+ h = h + hours{12};
+ }
+ else if (h == hours{12})
+ h = hours{0};
+ return h;
+}
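+
+// A few spot checks of the 12/24-hour conversions above:
+//   make12(hours{0})  == hours{12}    make12(hours{15}) == hours{3}
+//   make24(hours{12}, false) == hours{0}
+//   make24(hours{12}, true)  == hours{12}
+//   make24(hours{3},  true)  == hours{15}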
+
+template <class Duration>
+using time_of_day = hh_mm_ss<Duration>;
+
+template <class Rep, class Period,
+ class = typename std::enable_if
+ <!std::chrono::treat_as_floating_point<Rep>::value>::type>
+CONSTCD11
+inline
+hh_mm_ss<std::chrono::duration<Rep, Period>>
+make_time(const std::chrono::duration<Rep, Period>& d)
+{
+ return hh_mm_ss<std::chrono::duration<Rep, Period>>(d);
+}
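+
+// A minimal usage sketch of make_time:
+//   auto tod = date::make_time(std::chrono::milliseconds{5415123});
+//   // tod.hours() == 1h, tod.minutes() == 30min, tod.seconds() == 15s,
+//   // tod.subseconds() == 123ms; streaming tod yields "01:30:15.123".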
+
+template <class CharT, class Traits, class Duration>
+inline
+typename std::enable_if
+<
+ !std::chrono::treat_as_floating_point<typename Duration::rep>::value &&
+ std::ratio_less<typename Duration::period, days::period>::value
+ , std::basic_ostream<CharT, Traits>&
+>::type
+operator<<(std::basic_ostream<CharT, Traits>& os, const sys_time<Duration>& tp)
+{
+ auto const dp = date::floor<days>(tp);
+ return os << year_month_day(dp) << ' ' << make_time(tp-dp);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const sys_days& dp)
+{
+ return os << year_month_day(dp);
+}
+
+template <class CharT, class Traits, class Duration>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const local_time<Duration>& ut)
+{
+ return (os << sys_time<Duration>{ut.time_since_epoch()});
+}
+
+namespace detail
+{
+
+template <class CharT, std::size_t N>
+class string_literal;
+
+template <class CharT1, class CharT2, std::size_t N1, std::size_t N2>
+inline
+CONSTCD14
+string_literal<typename std::conditional<sizeof(CharT2) <= sizeof(CharT1), CharT1, CharT2>::type,
+ N1 + N2 - 1>
+operator+(const string_literal<CharT1, N1>& x, const string_literal<CharT2, N2>& y) NOEXCEPT;
+
+template <class CharT, std::size_t N>
+class string_literal
+{
+ CharT p_[N];
+
+ CONSTCD11 string_literal() NOEXCEPT
+ : p_{}
+ {}
+
+public:
+ using const_iterator = const CharT*;
+
+ string_literal(string_literal const&) = default;
+ string_literal& operator=(string_literal const&) = delete;
+
+ template <std::size_t N1 = 2,
+ class = typename std::enable_if<N1 == N>::type>
+ CONSTCD11 string_literal(CharT c) NOEXCEPT
+ : p_{c}
+ {
+ }
+
+ template <std::size_t N1 = 3,
+ class = typename std::enable_if<N1 == N>::type>
+ CONSTCD11 string_literal(CharT c1, CharT c2) NOEXCEPT
+ : p_{c1, c2}
+ {
+ }
+
+ template <std::size_t N1 = 4,
+ class = typename std::enable_if<N1 == N>::type>
+ CONSTCD11 string_literal(CharT c1, CharT c2, CharT c3) NOEXCEPT
+ : p_{c1, c2, c3}
+ {
+ }
+
+ CONSTCD14 string_literal(const CharT(&a)[N]) NOEXCEPT
+ : p_{}
+ {
+ for (std::size_t i = 0; i < N; ++i)
+ p_[i] = a[i];
+ }
+
+ template <class U = CharT,
+ class = typename std::enable_if<(1 < sizeof(U))>::type>
+ CONSTCD14 string_literal(const char(&a)[N]) NOEXCEPT
+ : p_{}
+ {
+ for (std::size_t i = 0; i < N; ++i)
+ p_[i] = a[i];
+ }
+
+ template <class CharT2,
+ class = typename std::enable_if<!std::is_same<CharT2, CharT>::value>::type>
+ CONSTCD14 string_literal(string_literal<CharT2, N> const& a) NOEXCEPT
+ : p_{}
+ {
+ for (std::size_t i = 0; i < N; ++i)
+ p_[i] = a[i];
+ }
+
+ CONSTCD11 const CharT* data() const NOEXCEPT {return p_;}
+ CONSTCD11 std::size_t size() const NOEXCEPT {return N-1;}
+
+ CONSTCD11 const_iterator begin() const NOEXCEPT {return p_;}
+ CONSTCD11 const_iterator end() const NOEXCEPT {return p_ + N-1;}
+
+ CONSTCD11 CharT const& operator[](std::size_t n) const NOEXCEPT
+ {
+ return p_[n];
+ }
+
+ template <class Traits>
+ friend
+ std::basic_ostream<CharT, Traits>&
+ operator<<(std::basic_ostream<CharT, Traits>& os, const string_literal& s)
+ {
+ return os << s.p_;
+ }
+
+ template <class CharT1, class CharT2, std::size_t N1, std::size_t N2>
+ friend
+ CONSTCD14
+ string_literal<typename std::conditional<sizeof(CharT2) <= sizeof(CharT1), CharT1, CharT2>::type,
+ N1 + N2 - 1>
+ operator+(const string_literal<CharT1, N1>& x, const string_literal<CharT2, N2>& y) NOEXCEPT;
+};
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 3>
+operator+(const string_literal<CharT, 2>& x, const string_literal<CharT, 2>& y) NOEXCEPT
+{
+ return string_literal<CharT, 3>(x[0], y[0]);
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 4>
+operator+(const string_literal<CharT, 3>& x, const string_literal<CharT, 2>& y) NOEXCEPT
+{
+ return string_literal<CharT, 4>(x[0], x[1], y[0]);
+}
+
+template <class CharT1, class CharT2, std::size_t N1, std::size_t N2>
+CONSTCD14
+inline
+string_literal<typename std::conditional<sizeof(CharT2) <= sizeof(CharT1), CharT1, CharT2>::type,
+ N1 + N2 - 1>
+operator+(const string_literal<CharT1, N1>& x, const string_literal<CharT2, N2>& y) NOEXCEPT
+{
+ using CT = typename std::conditional<sizeof(CharT2) <= sizeof(CharT1), CharT1, CharT2>::type;
+
+ string_literal<CT, N1 + N2 - 1> r;
+ std::size_t i = 0;
+ for (; i < N1-1; ++i)
+ r.p_[i] = CT(x.p_[i]);
+ for (std::size_t j = 0; j < N2; ++j, ++i)
+ r.p_[i] = CT(y.p_[j]);
+
+ return r;
+}
+
+
+template <class CharT, class Traits, class Alloc, std::size_t N>
+inline
+std::basic_string<CharT, Traits, Alloc>
+operator+(std::basic_string<CharT, Traits, Alloc> x, const string_literal<CharT, N>& y)
+{
+ x.append(y.data(), y.size());
+ return x;
+}
+
+#if __cplusplus >= 201402 && (!defined(__EDG_VERSION__) || __EDG_VERSION__ > 411) \
+ && (!defined(__SUNPRO_CC) || __SUNPRO_CC > 0x5150)
+
+template <class CharT,
+ class = std::enable_if_t<std::is_same<CharT, char>::value ||
+ std::is_same<CharT, wchar_t>::value ||
+ std::is_same<CharT, char16_t>::value ||
+ std::is_same<CharT, char32_t>::value>>
+CONSTCD14
+inline
+string_literal<CharT, 2>
+msl(CharT c) NOEXCEPT
+{
+ return string_literal<CharT, 2>{c};
+}
+
+CONSTCD14
+inline
+std::size_t
+to_string_len(std::intmax_t i)
+{
+ std::size_t r = 0;
+ do
+ {
+ i /= 10;
+ ++r;
+ } while (i > 0);
+ return r;
+}
+
+template <std::intmax_t N>
+CONSTCD14
+inline
+std::enable_if_t
+<
+ N < 10,
+ string_literal<char, to_string_len(N)+1>
+>
+msl() NOEXCEPT
+{
+ return msl(char(N % 10 + '0'));
+}
+
+template <std::intmax_t N>
+CONSTCD14
+inline
+std::enable_if_t
+<
+ 10 <= N,
+ string_literal<char, to_string_len(N)+1>
+>
+msl() NOEXCEPT
+{
+ return msl<N/10>() + msl(char(N % 10 + '0'));
+}
+
+template <class CharT, std::intmax_t N, std::intmax_t D>
+CONSTCD14
+inline
+std::enable_if_t
+<
+ std::ratio<N, D>::type::den != 1,
+ string_literal<CharT, to_string_len(std::ratio<N, D>::type::num) +
+ to_string_len(std::ratio<N, D>::type::den) + 4>
+>
+msl(std::ratio<N, D>) NOEXCEPT
+{
+ using R = typename std::ratio<N, D>::type;
+ return msl(CharT{'['}) + msl<R::num>() + msl(CharT{'/'}) +
+ msl<R::den>() + msl(CharT{']'});
+}
+
+template <class CharT, std::intmax_t N, std::intmax_t D>
+CONSTCD14
+inline
+std::enable_if_t
+<
+ std::ratio<N, D>::type::den == 1,
+ string_literal<CharT, to_string_len(std::ratio<N, D>::type::num) + 3>
+>
+msl(std::ratio<N, D>) NOEXCEPT
+{
+ using R = typename std::ratio<N, D>::type;
+ return msl(CharT{'['}) + msl<R::num>() + msl(CharT{']'});
+}
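+
+// A couple of spot checks of the ratio-to-text helpers above (the ratio is
+// reduced via std::ratio<N, D>::type first):
+//   msl<char>(std::ratio<1, 30>{}) == "[1/30]"
+//   msl<char>(std::ratio<60, 2>{}) == "[30]"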
+
+
+#else // __cplusplus < 201402 || (defined(__EDG_VERSION__) && __EDG_VERSION__ <= 411)
+
+inline
+std::string
+to_string(std::uint64_t x)
+{
+ return std::to_string(x);
+}
+
+template <class CharT>
+inline
+std::basic_string<CharT>
+to_string(std::uint64_t x)
+{
+ auto y = std::to_string(x);
+ return std::basic_string<CharT>(y.begin(), y.end());
+}
+
+template <class CharT, std::intmax_t N, std::intmax_t D>
+inline
+typename std::enable_if
+<
+ std::ratio<N, D>::type::den != 1,
+ std::basic_string<CharT>
+>::type
+msl(std::ratio<N, D>)
+{
+ using R = typename std::ratio<N, D>::type;
+ return std::basic_string<CharT>(1, '[') + to_string<CharT>(R::num) + CharT{'/'} +
+ to_string<CharT>(R::den) + CharT{']'};
+}
+
+template <class CharT, std::intmax_t N, std::intmax_t D>
+inline
+typename std::enable_if
+<
+ std::ratio<N, D>::type::den == 1,
+ std::basic_string<CharT>
+>::type
+msl(std::ratio<N, D>)
+{
+ using R = typename std::ratio<N, D>::type;
+ return std::basic_string<CharT>(1, '[') + to_string<CharT>(R::num) + CharT{']'};
+}
+
+#endif // __cplusplus < 201402 || (defined(__EDG_VERSION__) && __EDG_VERSION__ <= 411)
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::atto) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'a'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::femto) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'f'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::pico) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'p'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::nano) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'n'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+typename std::enable_if
+<
+ std::is_same<CharT, char>::value,
+ string_literal<char, 3>
+>::type
+msl(std::micro) NOEXCEPT
+{
+ return string_literal<char, 3>{'\xC2', '\xB5'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+typename std::enable_if
+<
+ !std::is_same<CharT, char>::value,
+ string_literal<CharT, 2>
+>::type
+msl(std::micro) NOEXCEPT
+{
+ return string_literal<CharT, 2>{CharT{static_cast<unsigned char>('\xB5')}};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::milli) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'m'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::centi) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'c'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 3>
+msl(std::deca) NOEXCEPT
+{
+ return string_literal<CharT, 3>{'d', 'a'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::deci) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'d'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::hecto) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'h'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::kilo) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'k'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::mega) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'M'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::giga) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'G'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::tera) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'T'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::peta) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'P'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::exa) NOEXCEPT
+{
+ return string_literal<CharT, 2>{'E'};
+}
+
+template <class CharT, class Period>
+CONSTCD11
+inline
+auto
+get_units(Period p)
+ -> decltype(msl<CharT>(p) + string_literal<CharT, 2>{'s'})
+{
+ return msl<CharT>(p) + string_literal<CharT, 2>{'s'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+get_units(std::ratio<1>)
+{
+ return string_literal<CharT, 2>{'s'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+get_units(std::ratio<3600>)
+{
+ return string_literal<CharT, 2>{'h'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 4>
+get_units(std::ratio<60>)
+{
+ return string_literal<CharT, 4>{'m', 'i', 'n'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+get_units(std::ratio<86400>)
+{
+ return string_literal<CharT, 2>{'d'};
+}
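+
+// Spot checks of get_units above: SI-prefixed periods append 's' to the
+// prefix, while the special-cased ratios use conventional names:
+//   get_units<char>(std::milli{})       == "ms"
+//   get_units<char>(std::ratio<1>{})    == "s"
+//   get_units<char>(std::ratio<60>{})   == "min"
+//   get_units<char>(std::ratio<3600>{}) == "h"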
+
+template <class CharT, class Traits = std::char_traits<CharT>>
+struct make_string;
+
+template <>
+struct make_string<char>
+{
+ template <class Rep>
+ static
+ std::string
+ from(Rep n)
+ {
+ return std::to_string(n);
+ }
+};
+
+template <class Traits>
+struct make_string<char, Traits>
+{
+ template <class Rep>
+ static
+ std::basic_string<char, Traits>
+ from(Rep n)
+ {
+ auto s = std::to_string(n);
+ return std::basic_string<char, Traits>(s.begin(), s.end());
+ }
+};
+
+template <>
+struct make_string<wchar_t>
+{
+ template <class Rep>
+ static
+ std::wstring
+ from(Rep n)
+ {
+ return std::to_wstring(n);
+ }
+};
+
+template <class Traits>
+struct make_string<wchar_t, Traits>
+{
+ template <class Rep>
+ static
+ std::basic_string<wchar_t, Traits>
+ from(Rep n)
+ {
+ auto s = std::to_wstring(n);
+ return std::basic_string<wchar_t, Traits>(s.begin(), s.end());
+ }
+};
+
+} // namespace detail
+
+// to_stream
+
+CONSTDATA year nanyear{-32768};
+
+template <class Duration>
+struct fields
+{
+ year_month_day ymd{nanyear/0/0};
+ weekday wd{8u};
+ hh_mm_ss<Duration> tod{};
+ bool has_tod = false;
+
+ fields() = default;
+
+ fields(year_month_day ymd_) : ymd(ymd_) {}
+ fields(weekday wd_) : wd(wd_) {}
+ fields(hh_mm_ss<Duration> tod_) : tod(tod_), has_tod(true) {}
+
+ fields(year_month_day ymd_, weekday wd_) : ymd(ymd_), wd(wd_) {}
+ fields(year_month_day ymd_, hh_mm_ss<Duration> tod_) : ymd(ymd_), tod(tod_),
+ has_tod(true) {}
+
+ fields(weekday wd_, hh_mm_ss<Duration> tod_) : wd(wd_), tod(tod_), has_tod(true) {}
+
+ fields(year_month_day ymd_, weekday wd_, hh_mm_ss<Duration> tod_)
+ : ymd(ymd_)
+ , wd(wd_)
+ , tod(tod_)
+ , has_tod(true)
+ {}
+};
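+
+// fields bundles whichever of a calendar date, a weekday, and a time of day a
+// formatting or parsing operation has available; to_stream below checks
+// ymd.ok(), wd.ok() and has_tod per conversion specifier and sets failbit
+// when a specifier needs a field that is absent or invalid.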
+
+namespace detail
+{
+
+template <class CharT, class Traits, class Duration>
+unsigned
+extract_weekday(std::basic_ostream<CharT, Traits>& os, const fields<Duration>& fds)
+{
+ if (!fds.ymd.ok() && !fds.wd.ok())
+ {
+ // fds does not contain a valid weekday
+ os.setstate(std::ios::failbit);
+ return 8;
+ }
+ weekday wd;
+ if (fds.ymd.ok())
+ {
+ wd = weekday{sys_days(fds.ymd)};
+ if (fds.wd.ok() && wd != fds.wd)
+ {
+ // fds.ymd and fds.wd are inconsistent
+ os.setstate(std::ios::failbit);
+ return 8;
+ }
+ }
+ else
+ wd = fds.wd;
+ return static_cast<unsigned>((wd - Sunday).count());
+}
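+
+// extract_weekday returns the C encoding used by std::tm (Sunday == 0 ...
+// Saturday == 6), or 8 after setting failbit when fds holds no weekday or an
+// inconsistent date/weekday pair.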
+
+template <class CharT, class Traits, class Duration>
+unsigned
+extract_month(std::basic_ostream<CharT, Traits>& os, const fields<Duration>& fds)
+{
+ if (!fds.ymd.month().ok())
+ {
+ // fds does not contain a valid month
+ os.setstate(std::ios::failbit);
+ return 0;
+ }
+ return static_cast<unsigned>(fds.ymd.month());
+}
+
+} // namespace detail
+
+#if ONLY_C_LOCALE
+
+namespace detail
+{
+
+inline
+std::pair<const std::string*, const std::string*>
+weekday_names()
+{
+ static const std::string nm[] =
+ {
+ "Sunday",
+ "Monday",
+ "Tuesday",
+ "Wednesday",
+ "Thursday",
+ "Friday",
+ "Saturday",
+ "Sun",
+ "Mon",
+ "Tue",
+ "Wed",
+ "Thu",
+ "Fri",
+ "Sat"
+ };
+ return std::make_pair(nm, nm+sizeof(nm)/sizeof(nm[0]));
+}
+
+inline
+std::pair<const std::string*, const std::string*>
+month_names()
+{
+ static const std::string nm[] =
+ {
+ "January",
+ "February",
+ "March",
+ "April",
+ "May",
+ "June",
+ "July",
+ "August",
+ "September",
+ "October",
+ "November",
+ "December",
+ "Jan",
+ "Feb",
+ "Mar",
+ "Apr",
+ "May",
+ "Jun",
+ "Jul",
+ "Aug",
+ "Sep",
+ "Oct",
+ "Nov",
+ "Dec"
+ };
+ return std::make_pair(nm, nm+sizeof(nm)/sizeof(nm[0]));
+}
+
+inline
+std::pair<const std::string*, const std::string*>
+ampm_names()
+{
+ static const std::string nm[] =
+ {
+ "AM",
+ "PM"
+ };
+ return std::make_pair(nm, nm+sizeof(nm)/sizeof(nm[0]));
+}
+
+template <class CharT, class Traits, class FwdIter>
+FwdIter
+scan_keyword(std::basic_istream<CharT, Traits>& is, FwdIter kb, FwdIter ke)
+{
+ size_t nkw = static_cast<size_t>(std::distance(kb, ke));
+ const unsigned char doesnt_match = '\0';
+ const unsigned char might_match = '\1';
+ const unsigned char does_match = '\2';
+ unsigned char statbuf[100];
+ unsigned char* status = statbuf;
+    std::unique_ptr<unsigned char, void(*)(void*)> stat_hold(nullptr, free);
+ if (nkw > sizeof(statbuf))
+ {
+ status = (unsigned char*)std::malloc(nkw);
+ if (status == nullptr)
+ throw std::bad_alloc();
+ stat_hold.reset(status);
+ }
+ size_t n_might_match = nkw; // At this point, any keyword might match
+ size_t n_does_match = 0; // but none of them definitely do
+    // Initialize all statuses to might_match, except empty keywords, which start as does_match
+ unsigned char* st = status;
+ for (auto ky = kb; ky != ke; ++ky, ++st)
+ {
+ if (!ky->empty())
+ *st = might_match;
+ else
+ {
+ *st = does_match;
+ --n_might_match;
+ ++n_does_match;
+ }
+ }
+ // While there might be a match, test keywords against the next CharT
+ for (size_t indx = 0; is && n_might_match > 0; ++indx)
+ {
+ // Peek at the next CharT but don't consume it
+ auto ic = is.peek();
+ if (ic == EOF)
+ {
+ is.setstate(std::ios::eofbit);
+ break;
+ }
+ auto c = static_cast<char>(toupper(ic));
+ bool consume = false;
+        // For each keyword which might match, see if the indx character is c
+        // If a match is found, consume c
+        // If a match is found, and that is the last character in the keyword,
+        // then that keyword matches.
+        // If the keyword doesn't match this character, mark the keyword
+        // as doesnt_match
+ st = status;
+ for (auto ky = kb; ky != ke; ++ky, ++st)
+ {
+ if (*st == might_match)
+ {
+ if (c == static_cast<char>(toupper((*ky)[indx])))
+ {
+ consume = true;
+ if (ky->size() == indx+1)
+ {
+ *st = does_match;
+ --n_might_match;
+ ++n_does_match;
+ }
+ }
+ else
+ {
+ *st = doesnt_match;
+ --n_might_match;
+ }
+ }
+ }
+ // consume if we matched a character
+ if (consume)
+ {
+ (void)is.get();
+            // If we consumed a character, any keyword that was fully matched on an
+            // earlier iteration (and is therefore shorter than the current position)
+            // no longer matches, so demote it to doesnt_match.
+ if (n_might_match + n_does_match > 1)
+ {
+ st = status;
+ for (auto ky = kb; ky != ke; ++ky, ++st)
+ {
+ if (*st == does_match && ky->size() != indx+1)
+ {
+ *st = doesnt_match;
+ --n_does_match;
+ }
+ }
+ }
+ }
+ }
+ // We've exited the loop because we hit eof and/or we have no more "might matches".
+ // Return the first matching result
+ for (st = status; kb != ke; ++kb, ++st)
+ if (*st == does_match)
+ break;
+ if (kb == ke)
+ is.setstate(std::ios::failbit);
+ return kb;
+}
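+
+// A sketch of how scan_keyword can be used: scanning the month table returns
+// an iterator into the full-name-then-abbreviation array, so the month number
+// is the index modulo 12:
+//   auto names = month_names();
+//   auto i = static_cast<unsigned>(scan_keyword(is, names.first, names.second)
+//                                  - names.first);
+//   unsigned month = i % 12 + 1;   // only meaningful if is has not failed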
+
+} // namespace detail
+
+#endif // ONLY_C_LOCALE
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const fields<Duration>& fds, const std::string* abbrev,
+ const std::chrono::seconds* offset_sec)
+{
+#if ONLY_C_LOCALE
+ using detail::weekday_names;
+ using detail::month_names;
+ using detail::ampm_names;
+#endif
+ using detail::save_ostream;
+ using detail::get_units;
+ using detail::extract_weekday;
+ using detail::extract_month;
+ using std::ios;
+ using std::chrono::duration_cast;
+ using std::chrono::seconds;
+ using std::chrono::minutes;
+ using std::chrono::hours;
+ date::detail::save_ostream<CharT, Traits> ss(os);
+ os.fill(' ');
+ os.flags(std::ios::skipws | std::ios::dec);
+ os.width(0);
+ tm tm{};
+ bool insert_negative = fds.has_tod && fds.tod.to_duration() < Duration::zero();
+#if !ONLY_C_LOCALE
+ auto& facet = std::use_facet<std::time_put<CharT>>(os.getloc());
+#endif
+ const CharT* command = nullptr;
+ CharT modified = CharT{};
+ for (; *fmt; ++fmt)
+ {
+ switch (*fmt)
+ {
+ case 'a':
+ case 'A':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ tm.tm_wday = static_cast<int>(extract_weekday(os, fds));
+ if (os.fail())
+ return os;
+#if !ONLY_C_LOCALE
+ const CharT f[] = {'%', *fmt};
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+#else // ONLY_C_LOCALE
+ os << weekday_names().first[tm.tm_wday+7*(*fmt == 'a')];
+#endif // ONLY_C_LOCALE
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'b':
+ case 'B':
+ case 'h':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ tm.tm_mon = static_cast<int>(extract_month(os, fds)) - 1;
+#if !ONLY_C_LOCALE
+ const CharT f[] = {'%', *fmt};
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+#else // ONLY_C_LOCALE
+ os << month_names().first[tm.tm_mon+12*(*fmt != 'B')];
+#endif // ONLY_C_LOCALE
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'c':
+ case 'x':
+ if (command)
+ {
+ if (modified == CharT{'O'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ if (!fds.ymd.ok())
+ os.setstate(std::ios::failbit);
+ if (*fmt == 'c' && !fds.has_tod)
+ os.setstate(std::ios::failbit);
+#if !ONLY_C_LOCALE
+ tm = std::tm{};
+ auto const& ymd = fds.ymd;
+ auto ld = local_days(ymd);
+ if (*fmt == 'c')
+ {
+ tm.tm_sec = static_cast<int>(fds.tod.seconds().count());
+ tm.tm_min = static_cast<int>(fds.tod.minutes().count());
+ tm.tm_hour = static_cast<int>(fds.tod.hours().count());
+ }
+ tm.tm_mday = static_cast<int>(static_cast<unsigned>(ymd.day()));
+ tm.tm_mon = static_cast<int>(extract_month(os, fds) - 1);
+ tm.tm_year = static_cast<int>(ymd.year()) - 1900;
+ tm.tm_wday = static_cast<int>(extract_weekday(os, fds));
+ if (os.fail())
+ return os;
+ tm.tm_yday = static_cast<int>((ld - local_days(ymd.year()/1/1)).count());
+ CharT f[3] = {'%'};
+ auto fe = std::begin(f) + 1;
+ if (modified == CharT{'E'})
+ *fe++ = modified;
+ *fe++ = *fmt;
+ facet.put(os, os, os.fill(), &tm, std::begin(f), fe);
+#else // ONLY_C_LOCALE
+ if (*fmt == 'c')
+ {
+ auto wd = static_cast<int>(extract_weekday(os, fds));
+ os << weekday_names().first[static_cast<unsigned>(wd)+7]
+ << ' ';
+ os << month_names().first[extract_month(os, fds)-1+12] << ' ';
+ auto d = static_cast<int>(static_cast<unsigned>(fds.ymd.day()));
+ if (d < 10)
+ os << ' ';
+ os << d << ' '
+ << make_time(duration_cast<seconds>(fds.tod.to_duration()))
+ << ' ' << fds.ymd.year();
+
+ }
+ else // *fmt == 'x'
+ {
+ auto const& ymd = fds.ymd;
+ save_ostream<CharT, Traits> _(os);
+ os.fill('0');
+ os.flags(std::ios::dec | std::ios::right);
+ os.width(2);
+ os << static_cast<unsigned>(ymd.month()) << CharT{'/'};
+ os.width(2);
+ os << static_cast<unsigned>(ymd.day()) << CharT{'/'};
+ os.width(2);
+ os << static_cast<int>(ymd.year()) % 100;
+ }
+#endif // ONLY_C_LOCALE
+ }
+ command = nullptr;
+ modified = CharT{};
+ }
+ else
+ os << *fmt;
+ break;
+ case 'C':
+ if (command)
+ {
+ if (modified == CharT{'O'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ if (!fds.ymd.year().ok())
+ os.setstate(std::ios::failbit);
+ auto y = static_cast<int>(fds.ymd.year());
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ save_ostream<CharT, Traits> _(os);
+ os.fill('0');
+ os.flags(std::ios::dec | std::ios::right);
+ if (y >= 0)
+ {
+ os.width(2);
+ os << y/100;
+ }
+ else
+ {
+ os << CharT{'-'};
+ os.width(2);
+ os << -(y-99)/100;
+ }
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'E'})
+ {
+ tm.tm_year = y - 1900;
+ CharT f[3] = {'%', 'E', 'C'};
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ command = nullptr;
+ modified = CharT{};
+ }
+ else
+ os << *fmt;
+ break;
+ case 'd':
+ case 'e':
+ if (command)
+ {
+ if (modified == CharT{'E'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ if (!fds.ymd.day().ok())
+ os.setstate(std::ios::failbit);
+ auto d = static_cast<int>(static_cast<unsigned>(fds.ymd.day()));
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ save_ostream<CharT, Traits> _(os);
+ if (*fmt == CharT{'d'})
+ os.fill('0');
+ else
+ os.fill(' ');
+ os.flags(std::ios::dec | std::ios::right);
+ os.width(2);
+ os << d;
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ tm.tm_mday = d;
+ CharT f[3] = {'%', 'O', *fmt};
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ command = nullptr;
+ modified = CharT{};
+ }
+ else
+ os << *fmt;
+ break;
+ case 'D':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ if (!fds.ymd.ok())
+ os.setstate(std::ios::failbit);
+ auto const& ymd = fds.ymd;
+ save_ostream<CharT, Traits> _(os);
+ os.fill('0');
+ os.flags(std::ios::dec | std::ios::right);
+ os.width(2);
+ os << static_cast<unsigned>(ymd.month()) << CharT{'/'};
+ os.width(2);
+ os << static_cast<unsigned>(ymd.day()) << CharT{'/'};
+ os.width(2);
+ os << static_cast<int>(ymd.year()) % 100;
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'F':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ if (!fds.ymd.ok())
+ os.setstate(std::ios::failbit);
+ auto const& ymd = fds.ymd;
+ save_ostream<CharT, Traits> _(os);
+ os.imbue(std::locale::classic());
+ os.fill('0');
+ os.flags(std::ios::dec | std::ios::right);
+ os.width(4);
+ os << static_cast<int>(ymd.year()) << CharT{'-'};
+ os.width(2);
+ os << static_cast<unsigned>(ymd.month()) << CharT{'-'};
+ os.width(2);
+ os << static_cast<unsigned>(ymd.day());
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'g':
+ case 'G':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ if (!fds.ymd.ok())
+ os.setstate(std::ios::failbit);
+ auto ld = local_days(fds.ymd);
+ auto y = year_month_day{ld + days{3}}.year();
+ auto start = local_days((y-years{1})/December/Thursday[last]) +
+ (Monday-Thursday);
+ if (ld < start)
+ --y;
+ if (*fmt == CharT{'G'})
+ os << y;
+ else
+ {
+ save_ostream<CharT, Traits> _(os);
+ os.fill('0');
+ os.flags(std::ios::dec | std::ios::right);
+ os.width(2);
+ os << std::abs(static_cast<int>(y)) % 100;
+ }
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'H':
+ case 'I':
+ if (command)
+ {
+ if (modified == CharT{'E'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ if (!fds.has_tod)
+ os.setstate(std::ios::failbit);
+ if (insert_negative)
+ {
+ os << '-';
+ insert_negative = false;
+ }
+ auto hms = fds.tod;
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ auto h = *fmt == CharT{'I'} ? date::make12(hms.hours()) : hms.hours();
+ if (h < hours{10})
+ os << CharT{'0'};
+ os << h.count();
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ tm.tm_hour = static_cast<int>(hms.hours().count());
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'j':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ if (fds.ymd.ok() || fds.has_tod)
+ {
+ days doy;
+ if (fds.ymd.ok())
+ {
+ auto ld = local_days(fds.ymd);
+ auto y = fds.ymd.year();
+ doy = ld - local_days(y/January/1) + days{1};
+ }
+ else
+ {
+ doy = duration_cast<days>(fds.tod.to_duration());
+ }
+ save_ostream<CharT, Traits> _(os);
+ os.fill('0');
+ os.flags(std::ios::dec | std::ios::right);
+ os.width(3);
+ os << doy.count();
+ }
+ else
+ {
+ os.setstate(std::ios::failbit);
+ }
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'm':
+ if (command)
+ {
+ if (modified == CharT{'E'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ if (!fds.ymd.month().ok())
+ os.setstate(std::ios::failbit);
+ auto m = static_cast<unsigned>(fds.ymd.month());
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ if (m < 10)
+ os << CharT{'0'};
+ os << m;
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ tm.tm_mon = static_cast<int>(m-1);
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'M':
+ if (command)
+ {
+ if (modified == CharT{'E'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ if (!fds.has_tod)
+ os.setstate(std::ios::failbit);
+ if (insert_negative)
+ {
+ os << '-';
+ insert_negative = false;
+ }
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ if (fds.tod.minutes() < minutes{10})
+ os << CharT{'0'};
+ os << fds.tod.minutes().count();
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ tm.tm_min = static_cast<int>(fds.tod.minutes().count());
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'n':
+ if (command)
+ {
+ if (modified == CharT{})
+ os << CharT{'\n'};
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'p':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ if (!fds.has_tod)
+ os.setstate(std::ios::failbit);
+#if !ONLY_C_LOCALE
+ const CharT f[] = {'%', *fmt};
+ tm.tm_hour = static_cast<int>(fds.tod.hours().count());
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+#else
+ if (date::is_am(fds.tod.hours()))
+ os << ampm_names().first[0];
+ else
+ os << ampm_names().first[1];
+#endif
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'Q':
+ case 'q':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ if (!fds.has_tod)
+ os.setstate(std::ios::failbit);
+ auto d = fds.tod.to_duration();
+ if (*fmt == 'q')
+ os << get_units<CharT>(typename decltype(d)::period::type{});
+ else
+ os << d.count();
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'r':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ if (!fds.has_tod)
+ os.setstate(std::ios::failbit);
+#if !ONLY_C_LOCALE
+ const CharT f[] = {'%', *fmt};
+ tm.tm_hour = static_cast<int>(fds.tod.hours().count());
+ tm.tm_min = static_cast<int>(fds.tod.minutes().count());
+ tm.tm_sec = static_cast<int>(fds.tod.seconds().count());
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+#else
+ hh_mm_ss<seconds> tod(duration_cast<seconds>(fds.tod.to_duration()));
+ save_ostream<CharT, Traits> _(os);
+ os.fill('0');
+ os.width(2);
+ os << date::make12(tod.hours()).count() << CharT{':'};
+ os.width(2);
+ os << tod.minutes().count() << CharT{':'};
+ os.width(2);
+ os << tod.seconds().count() << CharT{' '};
+ if (date::is_am(tod.hours()))
+ os << ampm_names().first[0];
+ else
+ os << ampm_names().first[1];
+#endif
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'R':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ if (!fds.has_tod)
+ os.setstate(std::ios::failbit);
+ if (fds.tod.hours() < hours{10})
+ os << CharT{'0'};
+ os << fds.tod.hours().count() << CharT{':'};
+ if (fds.tod.minutes() < minutes{10})
+ os << CharT{'0'};
+ os << fds.tod.minutes().count();
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'S':
+ if (command)
+ {
+ if (modified == CharT{'E'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ if (!fds.has_tod)
+ os.setstate(std::ios::failbit);
+ if (insert_negative)
+ {
+ os << '-';
+ insert_negative = false;
+ }
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ os << fds.tod.s_;
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ tm.tm_sec = static_cast<int>(fds.tod.s_.seconds().count());
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 't':
+ if (command)
+ {
+ if (modified == CharT{})
+ os << CharT{'\t'};
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'T':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ if (!fds.has_tod)
+ os.setstate(std::ios::failbit);
+ os << fds.tod;
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'u':
+ if (command)
+ {
+ if (modified == CharT{'E'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ auto wd = extract_weekday(os, fds);
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ os << (wd != 0 ? wd : 7u);
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ tm.tm_wday = static_cast<int>(wd);
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'U':
+ if (command)
+ {
+ if (modified == CharT{'E'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ auto const& ymd = fds.ymd;
+ if (!ymd.ok())
+ os.setstate(std::ios::failbit);
+ auto ld = local_days(ymd);
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ auto st = local_days(Sunday[1]/January/ymd.year());
+ if (ld < st)
+ os << CharT{'0'} << CharT{'0'};
+ else
+ {
+ auto wn = duration_cast<weeks>(ld - st).count() + 1;
+ if (wn < 10)
+ os << CharT{'0'};
+ os << wn;
+ }
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ tm.tm_year = static_cast<int>(ymd.year()) - 1900;
+ tm.tm_wday = static_cast<int>(extract_weekday(os, fds));
+ if (os.fail())
+ return os;
+ tm.tm_yday = static_cast<int>((ld - local_days(ymd.year()/1/1)).count());
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'V':
+ if (command)
+ {
+ if (modified == CharT{'E'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ if (!fds.ymd.ok())
+ os.setstate(std::ios::failbit);
+ auto ld = local_days(fds.ymd);
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ auto y = year_month_day{ld + days{3}}.year();
+ auto st = local_days((y-years{1})/12/Thursday[last]) +
+ (Monday-Thursday);
+ if (ld < st)
+ {
+ --y;
+ st = local_days((y - years{1})/12/Thursday[last]) +
+ (Monday-Thursday);
+ }
+ auto wn = duration_cast<weeks>(ld - st).count() + 1;
+ if (wn < 10)
+ os << CharT{'0'};
+ os << wn;
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ auto const& ymd = fds.ymd;
+ tm.tm_year = static_cast<int>(ymd.year()) - 1900;
+ tm.tm_wday = static_cast<int>(extract_weekday(os, fds));
+ if (os.fail())
+ return os;
+ tm.tm_yday = static_cast<int>((ld - local_days(ymd.year()/1/1)).count());
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'w':
+ if (command)
+ {
+ auto wd = extract_weekday(os, fds);
+ if (os.fail())
+ return os;
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#else
+ if (modified != CharT{'E'})
+#endif
+ {
+ os << wd;
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ tm.tm_wday = static_cast<int>(wd);
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'W':
+ if (command)
+ {
+ if (modified == CharT{'E'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ auto const& ymd = fds.ymd;
+ if (!ymd.ok())
+ os.setstate(std::ios::failbit);
+ auto ld = local_days(ymd);
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ auto st = local_days(Monday[1]/January/ymd.year());
+ if (ld < st)
+ os << CharT{'0'} << CharT{'0'};
+ else
+ {
+ auto wn = duration_cast<weeks>(ld - st).count() + 1;
+ if (wn < 10)
+ os << CharT{'0'};
+ os << wn;
+ }
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ tm.tm_year = static_cast<int>(ymd.year()) - 1900;
+ tm.tm_wday = static_cast<int>(extract_weekday(os, fds));
+ if (os.fail())
+ return os;
+ tm.tm_yday = static_cast<int>((ld - local_days(ymd.year()/1/1)).count());
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'X':
+ if (command)
+ {
+ if (modified == CharT{'O'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ if (!fds.has_tod)
+ os.setstate(std::ios::failbit);
+#if !ONLY_C_LOCALE
+ tm = std::tm{};
+ tm.tm_sec = static_cast<int>(fds.tod.seconds().count());
+ tm.tm_min = static_cast<int>(fds.tod.minutes().count());
+ tm.tm_hour = static_cast<int>(fds.tod.hours().count());
+ CharT f[3] = {'%'};
+ auto fe = std::begin(f) + 1;
+ if (modified == CharT{'E'})
+ *fe++ = modified;
+ *fe++ = *fmt;
+ facet.put(os, os, os.fill(), &tm, std::begin(f), fe);
+#else
+ os << fds.tod;
+#endif
+ }
+ command = nullptr;
+ modified = CharT{};
+ }
+ else
+ os << *fmt;
+ break;
+ case 'y':
+ if (command)
+ {
+ if (!fds.ymd.year().ok())
+ os.setstate(std::ios::failbit);
+ auto y = static_cast<int>(fds.ymd.year());
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+ {
+#endif
+ y = std::abs(y) % 100;
+ if (y < 10)
+ os << CharT{'0'};
+ os << y;
+#if !ONLY_C_LOCALE
+ }
+ else
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ tm.tm_year = y - 1900;
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'Y':
+ if (command)
+ {
+ if (modified == CharT{'O'})
+ os << CharT{'%'} << modified << *fmt;
+ else
+ {
+ if (!fds.ymd.year().ok())
+ os.setstate(std::ios::failbit);
+ auto y = fds.ymd.year();
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ save_ostream<CharT, Traits> _(os);
+ os.imbue(std::locale::classic());
+ os << y;
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'E'})
+ {
+ const CharT f[] = {'%', modified, *fmt};
+ tm.tm_year = static_cast<int>(y) - 1900;
+ facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+ }
+#endif
+ }
+ modified = CharT{};
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'z':
+ if (command)
+ {
+ if (offset_sec == nullptr)
+ {
+ // Cannot format %z with an unknown offset
+ os.setstate(ios::failbit);
+ return os;
+ }
+ auto m = duration_cast<minutes>(*offset_sec);
+ auto neg = m < minutes{0};
+ m = date::abs(m);
+ auto h = duration_cast<hours>(m);
+ m -= h;
+ if (neg)
+ os << CharT{'-'};
+ else
+ os << CharT{'+'};
+ if (h < hours{10})
+ os << CharT{'0'};
+ os << h.count();
+ if (modified != CharT{})
+ os << CharT{':'};
+ if (m < minutes{10})
+ os << CharT{'0'};
+ os << m.count();
+ command = nullptr;
+ modified = CharT{};
+ }
+ else
+ os << *fmt;
+ break;
+ case 'Z':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ if (abbrev == nullptr)
+ {
+ // Cannot format %Z with an unknown time_zone
+ os.setstate(ios::failbit);
+ return os;
+ }
+ for (auto c : *abbrev)
+ os << CharT(c);
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ modified = CharT{};
+ }
+ command = nullptr;
+ }
+ else
+ os << *fmt;
+ break;
+ case 'E':
+ case 'O':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ modified = *fmt;
+ }
+ else
+ {
+ os << CharT{'%'} << modified << *fmt;
+ command = nullptr;
+ modified = CharT{};
+ }
+ }
+ else
+ os << *fmt;
+ break;
+ case '%':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ os << CharT{'%'};
+ command = nullptr;
+ }
+ else
+ {
+ os << CharT{'%'} << modified << CharT{'%'};
+ command = nullptr;
+ modified = CharT{};
+ }
+ }
+ else
+ command = fmt;
+ break;
+ default:
+ if (command)
+ {
+ os << CharT{'%'};
+ command = nullptr;
+ }
+ if (modified != CharT{})
+ {
+ os << modified;
+ modified = CharT{};
+ }
+ os << *fmt;
+ break;
+ }
+ }
+ if (command)
+ os << CharT{'%'};
+ if (modified != CharT{})
+ os << modified;
+ return os;
+}
+
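+// The to_stream() overloads below format a single calendar component (or a
+// partial combination such as year_month) by wrapping it in a fields<seconds>
+// whose unspecified slots hold not-a-value placeholders, then delegating to
+// the master to_stream() above.
+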
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const year& y)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{y/0/0};
+ return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const month& m)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{m/0/nanyear};
+ return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const day& d)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{d/0/nanyear};
+ return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const weekday& wd)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{wd};
+ return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const year_month& ym)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{ym/0};
+ return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const month_day& md)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{md/nanyear};
+ return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const year_month_day& ymd)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{ymd};
+ return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits, class Rep, class Period>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const std::chrono::duration<Rep, Period>& d)
+{
+ using Duration = std::chrono::duration<Rep, Period>;
+ using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+ fields<CT> fds{hh_mm_ss<CT>{d}};
+ return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const local_time<Duration>& tp, const std::string* abbrev = nullptr,
+ const std::chrono::seconds* offset_sec = nullptr)
+{
+ using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+ auto ld = floor<days>(tp);
+ fields<CT> fds{year_month_day{ld}, hh_mm_ss<CT>{tp-local_seconds{ld}}};
+ return to_stream(os, fmt, fds, abbrev, offset_sec);
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const sys_time<Duration>& tp)
+{
+ using std::chrono::seconds;
+ using CT = typename std::common_type<Duration, seconds>::type;
+ const std::string abbrev("UTC");
+ CONSTDATA seconds offset{0};
+ auto sd = floor<days>(tp);
+ fields<CT> fds{year_month_day{sd}, hh_mm_ss<CT>{tp-sys_seconds{sd}}};
+ return to_stream(os, fmt, fds, &abbrev, &offset);
+}
+
+// format
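+//
+// The format() overloads below render any Streamable type (calendar types,
+// durations, sys_time, local_time) to a string by writing it through
+// to_stream() into a basic_ostringstream with failbit/badbit exceptions
+// enabled.  A minimal usage sketch (in this vendored copy the enclosing
+// namespace is arrow_vendored::date):
+//
+//   auto s = format("%F %T", std::chrono::system_clock::now());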
+
+template <class CharT, class Streamable>
+auto
+format(const std::locale& loc, const CharT* fmt, const Streamable& tp)
+ -> decltype(to_stream(std::declval<std::basic_ostream<CharT>&>(), fmt, tp),
+ std::basic_string<CharT>{})
+{
+ std::basic_ostringstream<CharT> os;
+ os.exceptions(std::ios::failbit | std::ios::badbit);
+ os.imbue(loc);
+ to_stream(os, fmt, tp);
+ return os.str();
+}
+
+template <class CharT, class Streamable>
+auto
+format(const CharT* fmt, const Streamable& tp)
+ -> decltype(to_stream(std::declval<std::basic_ostream<CharT>&>(), fmt, tp),
+ std::basic_string<CharT>{})
+{
+ std::basic_ostringstream<CharT> os;
+ os.exceptions(std::ios::failbit | std::ios::badbit);
+ to_stream(os, fmt, tp);
+ return os.str();
+}
+
+template <class CharT, class Traits, class Alloc, class Streamable>
+auto
+format(const std::locale& loc, const std::basic_string<CharT, Traits, Alloc>& fmt,
+ const Streamable& tp)
+ -> decltype(to_stream(std::declval<std::basic_ostream<CharT, Traits>&>(), fmt.c_str(), tp),
+ std::basic_string<CharT, Traits, Alloc>{})
+{
+ std::basic_ostringstream<CharT, Traits, Alloc> os;
+ os.exceptions(std::ios::failbit | std::ios::badbit);
+ os.imbue(loc);
+ to_stream(os, fmt.c_str(), tp);
+ return os.str();
+}
+
+template <class CharT, class Traits, class Alloc, class Streamable>
+auto
+format(const std::basic_string<CharT, Traits, Alloc>& fmt, const Streamable& tp)
+ -> decltype(to_stream(std::declval<std::basic_ostream<CharT, Traits>&>(), fmt.c_str(), tp),
+ std::basic_string<CharT, Traits, Alloc>{})
+{
+ std::basic_ostringstream<CharT, Traits, Alloc> os;
+ os.exceptions(std::ios::failbit | std::ios::badbit);
+ to_stream(os, fmt.c_str(), tp);
+ return os.str();
+}
+
+// parse
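+//
+// The parse section below supplies the detail numeric readers, the master
+// from_stream(), per-type from_stream() overloads, and the parse()
+// manipulators, so parsing composes with operator>>.  A minimal usage sketch:
+//
+//   year_month_day ymd{};
+//   std::istringstream in{"2021-06-01"};
+//   in >> parse("%F", ymd);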
+
+namespace detail
+{
+
+template <class CharT, class Traits>
+bool
+read_char(std::basic_istream<CharT, Traits>& is, CharT fmt, std::ios::iostate& err)
+{
+ auto ic = is.get();
+ if (Traits::eq_int_type(ic, Traits::eof()) ||
+ !Traits::eq(Traits::to_char_type(ic), fmt))
+ {
+ err |= std::ios::failbit;
+ is.setstate(std::ios::failbit);
+ return false;
+ }
+ return true;
+}
+
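+// read_unsigned(), read_signed() and read_long_double() consume between m and
+// M characters of the corresponding numeric form, setting failbit when fewer
+// than m matching characters are available.
+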
+template <class CharT, class Traits>
+unsigned
+read_unsigned(std::basic_istream<CharT, Traits>& is, unsigned m = 1, unsigned M = 10)
+{
+ unsigned x = 0;
+ unsigned count = 0;
+ while (true)
+ {
+ auto ic = is.peek();
+ if (Traits::eq_int_type(ic, Traits::eof()))
+ break;
+ auto c = static_cast<char>(Traits::to_char_type(ic));
+ if (!('0' <= c && c <= '9'))
+ break;
+ (void)is.get();
+ ++count;
+ x = 10*x + static_cast<unsigned>(c - '0');
+ if (count == M)
+ break;
+ }
+ if (count < m)
+ is.setstate(std::ios::failbit);
+ return x;
+}
+
+template <class CharT, class Traits>
+int
+read_signed(std::basic_istream<CharT, Traits>& is, unsigned m = 1, unsigned M = 10)
+{
+ auto ic = is.peek();
+ if (!Traits::eq_int_type(ic, Traits::eof()))
+ {
+ auto c = static_cast<char>(Traits::to_char_type(ic));
+ if (('0' <= c && c <= '9') || c == '-' || c == '+')
+ {
+ if (c == '-' || c == '+')
+ (void)is.get();
+ auto x = static_cast<int>(read_unsigned(is, std::max(m, 1u), M));
+ if (!is.fail())
+ {
+ if (c == '-')
+ x = -x;
+ return x;
+ }
+ }
+ }
+ if (m > 0)
+ is.setstate(std::ios::failbit);
+ return 0;
+}
+
+template <class CharT, class Traits>
+long double
+read_long_double(std::basic_istream<CharT, Traits>& is, unsigned m = 1, unsigned M = 10)
+{
+ unsigned count = 0;
+ auto decimal_point = Traits::to_int_type(
+ std::use_facet<std::numpunct<CharT>>(is.getloc()).decimal_point());
+ std::string buf;
+ while (true)
+ {
+ auto ic = is.peek();
+ if (Traits::eq_int_type(ic, Traits::eof()))
+ break;
+ if (Traits::eq_int_type(ic, decimal_point))
+ {
+ buf += '.';
+ decimal_point = Traits::eof();
+ is.get();
+ }
+ else
+ {
+ auto c = static_cast<char>(Traits::to_char_type(ic));
+ if (!('0' <= c && c <= '9'))
+ break;
+ buf += c;
+ (void)is.get();
+ }
+ if (++count == M)
+ break;
+ }
+ if (count < m)
+ {
+ is.setstate(std::ios::failbit);
+ return 0;
+ }
+ return std::stold(buf);
+}
+
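+// rs, ru and rld are small parse descriptors binding a destination reference
+// to the minimum (m) and maximum (M) field widths the variadic read() below
+// may consume for it: signed int, unsigned-into-int, and long double
+// respectively.
+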
+struct rs
+{
+ int& i;
+ unsigned m;
+ unsigned M;
+};
+
+struct ru
+{
+ int& i;
+ unsigned m;
+ unsigned M;
+};
+
+struct rld
+{
+ long double& i;
+ unsigned m;
+ unsigned M;
+};
+
+template <class CharT, class Traits>
+void
+read(std::basic_istream<CharT, Traits>&)
+{
+}
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, CharT a0, Args&& ...args);
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, rs a0, Args&& ...args);
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, ru a0, Args&& ...args);
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, int a0, Args&& ...args);
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, rld a0, Args&& ...args);
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, CharT a0, Args&& ...args)
+{
+ // No-op if a0 == CharT{}
+ if (a0 != CharT{})
+ {
+ auto ic = is.peek();
+ if (Traits::eq_int_type(ic, Traits::eof()))
+ {
+ is.setstate(std::ios::failbit | std::ios::eofbit);
+ return;
+ }
+ if (!Traits::eq(Traits::to_char_type(ic), a0))
+ {
+ is.setstate(std::ios::failbit);
+ return;
+ }
+ (void)is.get();
+ }
+ read(is, std::forward<Args>(args)...);
+}
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, rs a0, Args&& ...args)
+{
+ auto x = read_signed(is, a0.m, a0.M);
+ if (is.fail())
+ return;
+ a0.i = x;
+ read(is, std::forward<Args>(args)...);
+}
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, ru a0, Args&& ...args)
+{
+ auto x = read_unsigned(is, a0.m, a0.M);
+ if (is.fail())
+ return;
+ a0.i = static_cast<int>(x);
+ read(is, std::forward<Args>(args)...);
+}
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, int a0, Args&& ...args)
+{
+ if (a0 != -1)
+ {
+ auto u = static_cast<unsigned>(a0);
+ CharT buf[std::numeric_limits<unsigned>::digits10+2u] = {};
+ auto e = buf;
+ do
+ {
+ *e++ = static_cast<CharT>(CharT(u % 10) + CharT{'0'});
+ u /= 10;
+ } while (u > 0);
+ std::reverse(buf, e);
+ for (auto p = buf; p != e && is.rdstate() == std::ios::goodbit; ++p)
+ read(is, *p);
+ }
+ if (is.rdstate() == std::ios::goodbit)
+ read(is, std::forward<Args>(args)...);
+}
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, rld a0, Args&& ...args)
+{
+ auto x = read_long_double(is, a0.m, a0.M);
+ if (is.fail())
+ return;
+ a0.i = x;
+ read(is, std::forward<Args>(args)...);
+}
+
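+// checked_set() commits a freshly parsed value only once: if `value` still
+// holds its not-a-value sentinel it takes `from`; if an earlier flag already
+// set it to something different, the fields are inconsistent and the stream
+// is failed.
+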
+template <class T, class CharT, class Traits>
+inline
+void
+checked_set(T& value, T from, T not_a_value, std::basic_ios<CharT, Traits>& is)
+{
+ if (!is.fail())
+ {
+ if (value == not_a_value)
+ value = std::move(from);
+ else if (value != from)
+ is.setstate(std::ios::failbit);
+ }
+}
+
+} // namespace detail
+
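+// The master from_stream() works in two phases: the format loop scans every
+// flag into a local variable initialized to a not-a-value sentinel, and the
+// epilogue reconciles what was read (2-digit year plus century, ISO week
+// date, week-of-year plus weekday, day-of-year, 12-hour clock plus AM/PM),
+// failing the stream on any contradiction before committing into fds.
+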
+template <class CharT, class Traits, class Duration, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+ fields<Duration>& fds, std::basic_string<CharT, Traits, Alloc>* abbrev,
+ std::chrono::minutes* offset)
+{
+ using std::numeric_limits;
+ using std::ios;
+ using std::chrono::duration;
+ using std::chrono::duration_cast;
+ using std::chrono::seconds;
+ using std::chrono::minutes;
+ using std::chrono::hours;
+ typename std::basic_istream<CharT, Traits>::sentry ok{is, true};
+ if (ok)
+ {
+ date::detail::save_istream<CharT, Traits> ss(is);
+ is.fill(' ');
+ is.flags(std::ios::skipws | std::ios::dec);
+ is.width(0);
+#if !ONLY_C_LOCALE
+ auto& f = std::use_facet<std::time_get<CharT>>(is.getloc());
+ std::tm tm{};
+#endif
+ const CharT* command = nullptr;
+ auto modified = CharT{};
+ auto width = -1;
+
+ CONSTDATA int not_a_year = numeric_limits<int>::min();
+ CONSTDATA int not_a_2digit_year = 100;
+ CONSTDATA int not_a_century = not_a_year / 100;
+ CONSTDATA int not_a_month = 0;
+ CONSTDATA int not_a_day = 0;
+ CONSTDATA int not_a_hour = numeric_limits<int>::min();
+ CONSTDATA int not_a_hour_12_value = 0;
+ CONSTDATA int not_a_minute = not_a_hour;
+ CONSTDATA Duration not_a_second = Duration::min();
+ CONSTDATA int not_a_doy = -1;
+ CONSTDATA int not_a_weekday = 8;
+ CONSTDATA int not_a_week_num = 100;
+ CONSTDATA int not_a_ampm = -1;
+ CONSTDATA minutes not_a_offset = minutes::min();
+
+ int Y = not_a_year; // c, F, Y *
+ int y = not_a_2digit_year; // D, x, y *
+ int g = not_a_2digit_year; // g *
+ int G = not_a_year; // G *
+ int C = not_a_century; // C *
+ int m = not_a_month; // b, B, h, m, c, D, F, x *
+ int d = not_a_day; // c, d, D, e, F, x *
+ int j = not_a_doy; // j *
+ int wd = not_a_weekday; // a, A, u, w *
+ int H = not_a_hour; // c, H, R, T, X *
+ int I = not_a_hour_12_value; // I, r *
+ int p = not_a_ampm; // p, r *
+ int M = not_a_minute; // c, M, r, R, T, X *
+ Duration s = not_a_second; // c, r, S, T, X *
+ int U = not_a_week_num; // U *
+ int V = not_a_week_num; // V *
+ int W = not_a_week_num; // W *
+ std::basic_string<CharT, Traits, Alloc> temp_abbrev; // Z *
+ minutes temp_offset = not_a_offset; // z *
+
+ using detail::read;
+ using detail::rs;
+ using detail::ru;
+ using detail::rld;
+ using detail::checked_set;
+ for (; *fmt != CharT{} && !is.fail(); ++fmt)
+ {
+ switch (*fmt)
+ {
+ case 'a':
+ case 'A':
+ case 'u':
+ case 'w': // wd: a, A, u, w
+ if (command)
+ {
+ int trial_wd = not_a_weekday;
+ if (*fmt == 'a' || *fmt == 'A')
+ {
+ if (modified == CharT{})
+ {
+#if !ONLY_C_LOCALE
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ is.setstate(err);
+ if (!is.fail())
+ trial_wd = tm.tm_wday;
+#else
+ auto nm = detail::weekday_names();
+ auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+ if (!is.fail())
+ trial_wd = i % 7;
+#endif
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ }
+ else // *fmt == 'u' || *fmt == 'w'
+ {
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#else
+ if (modified != CharT{'E'})
+#endif
+ {
+ read(is, ru{trial_wd, 1, width == -1 ?
+ 1u : static_cast<unsigned>(width)});
+ if (!is.fail())
+ {
+ if (*fmt == 'u')
+ {
+ if (!(1 <= trial_wd && trial_wd <= 7))
+ {
+ trial_wd = not_a_weekday;
+ is.setstate(ios::failbit);
+ }
+ else if (trial_wd == 7)
+ trial_wd = 0;
+ }
+ else // *fmt == 'w'
+ {
+ if (!(0 <= trial_wd && trial_wd <= 6))
+ {
+ trial_wd = not_a_weekday;
+ is.setstate(ios::failbit);
+ }
+ }
+ }
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ is.setstate(err);
+ if (!is.fail())
+ trial_wd = tm.tm_wday;
+ }
+#endif
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ }
+ if (trial_wd != not_a_weekday)
+ checked_set(wd, trial_wd, not_a_weekday, is);
+ }
+ else // !command
+ read(is, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ break;
+ case 'b':
+ case 'B':
+ case 'h':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int ttm = not_a_month;
+#if !ONLY_C_LOCALE
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ ttm = tm.tm_mon + 1;
+ is.setstate(err);
+#else
+ auto nm = detail::month_names();
+ auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+ if (!is.fail())
+ ttm = i % 12 + 1;
+#endif
+ checked_set(m, ttm, not_a_month, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'c':
+ if (command)
+ {
+ if (modified != CharT{'O'})
+ {
+#if !ONLY_C_LOCALE
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ {
+ checked_set(Y, tm.tm_year + 1900, not_a_year, is);
+ checked_set(m, tm.tm_mon + 1, not_a_month, is);
+ checked_set(d, tm.tm_mday, not_a_day, is);
+ checked_set(H, tm.tm_hour, not_a_hour, is);
+ checked_set(M, tm.tm_min, not_a_minute, is);
+ checked_set(s, duration_cast<Duration>(seconds{tm.tm_sec}),
+ not_a_second, is);
+ }
+ is.setstate(err);
+#else
+ // "%a %b %e %T %Y"
+ auto nm = detail::weekday_names();
+ auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+ checked_set(wd, static_cast<int>(i % 7), not_a_weekday, is);
+ ws(is);
+ nm = detail::month_names();
+ i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+ checked_set(m, static_cast<int>(i % 12 + 1), not_a_month, is);
+ ws(is);
+ int td = not_a_day;
+ read(is, rs{td, 1, 2});
+ checked_set(d, td, not_a_day, is);
+ ws(is);
+ using dfs = detail::decimal_format_seconds<Duration>;
+ CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width;
+ int tH;
+ int tM;
+ long double S;
+ read(is, ru{tH, 1, 2}, CharT{':'}, ru{tM, 1, 2},
+ CharT{':'}, rld{S, 1, w});
+ checked_set(H, tH, not_a_hour, is);
+ checked_set(M, tM, not_a_minute, is);
+ checked_set(s, round<Duration>(duration<long double>{S}),
+ not_a_second, is);
+ ws(is);
+ int tY = not_a_year;
+ read(is, rs{tY, 1, 4u});
+ checked_set(Y, tY, not_a_year, is);
+#endif
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'x':
+ if (command)
+ {
+ if (modified != CharT{'O'})
+ {
+#if !ONLY_C_LOCALE
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ {
+ checked_set(Y, tm.tm_year + 1900, not_a_year, is);
+ checked_set(m, tm.tm_mon + 1, not_a_month, is);
+ checked_set(d, tm.tm_mday, not_a_day, is);
+ }
+ is.setstate(err);
+#else
+ // "%m/%d/%y"
+ int ty = not_a_2digit_year;
+ int tm = not_a_month;
+ int td = not_a_day;
+ read(is, ru{tm, 1, 2}, CharT{'/'}, ru{td, 1, 2}, CharT{'/'},
+ rs{ty, 1, 2});
+ checked_set(y, ty, not_a_2digit_year, is);
+ checked_set(m, tm, not_a_month, is);
+ checked_set(d, td, not_a_day, is);
+#endif
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'X':
+ if (command)
+ {
+ if (modified != CharT{'O'})
+ {
+#if !ONLY_C_LOCALE
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ {
+ checked_set(H, tm.tm_hour, not_a_hour, is);
+ checked_set(M, tm.tm_min, not_a_minute, is);
+ checked_set(s, duration_cast<Duration>(seconds{tm.tm_sec}),
+ not_a_second, is);
+ }
+ is.setstate(err);
+#else
+ // "%T"
+ using dfs = detail::decimal_format_seconds<Duration>;
+ CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width;
+ int tH = not_a_hour;
+ int tM = not_a_minute;
+ long double S;
+ read(is, ru{tH, 1, 2}, CharT{':'}, ru{tM, 1, 2},
+ CharT{':'}, rld{S, 1, w});
+ checked_set(H, tH, not_a_hour, is);
+ checked_set(M, tM, not_a_minute, is);
+ checked_set(s, round<Duration>(duration<long double>{S}),
+ not_a_second, is);
+#endif
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'C':
+ if (command)
+ {
+ int tC = not_a_century;
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+ {
+#endif
+ read(is, rs{tC, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+#if !ONLY_C_LOCALE
+ }
+ else
+ {
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ {
+ auto tY = tm.tm_year + 1900;
+ tC = (tY >= 0 ? tY : tY-99) / 100;
+ }
+ is.setstate(err);
+ }
+#endif
+ checked_set(C, tC, not_a_century, is);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'D':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tn = not_a_month;
+ int td = not_a_day;
+ int ty = not_a_2digit_year;
+ read(is, ru{tn, 1, 2}, CharT{'\0'}, CharT{'/'}, CharT{'\0'},
+ ru{td, 1, 2}, CharT{'\0'}, CharT{'/'}, CharT{'\0'},
+ rs{ty, 1, 2});
+ checked_set(y, ty, not_a_2digit_year, is);
+ checked_set(m, tn, not_a_month, is);
+ checked_set(d, td, not_a_day, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'F':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tY = not_a_year;
+ int tn = not_a_month;
+ int td = not_a_day;
+ read(is, rs{tY, 1, width == -1 ? 4u : static_cast<unsigned>(width)},
+ CharT{'-'}, ru{tn, 1, 2}, CharT{'-'}, ru{td, 1, 2});
+ checked_set(Y, tY, not_a_year, is);
+ checked_set(m, tn, not_a_month, is);
+ checked_set(d, td, not_a_day, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'd':
+ case 'e':
+ if (command)
+ {
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#else
+ if (modified != CharT{'E'})
+#endif
+ {
+ int td = not_a_day;
+ read(is, rs{td, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+ checked_set(d, td, not_a_day, is);
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ if ((err & ios::failbit) == 0)
+ checked_set(d, tm.tm_mday, not_a_day, is);
+ is.setstate(err);
+ }
+#endif
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'H':
+ if (command)
+ {
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#else
+ if (modified != CharT{'E'})
+#endif
+ {
+ int tH = not_a_hour;
+ read(is, ru{tH, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+ checked_set(H, tH, not_a_hour, is);
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ checked_set(H, tm.tm_hour, not_a_hour, is);
+ is.setstate(err);
+ }
+#endif
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'I':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tI = not_a_hour_12_value;
+ // reads an hour into I; it must be in [1, 12]
+ read(is, rs{tI, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+ if (!(1 <= tI && tI <= 12))
+ is.setstate(ios::failbit);
+ checked_set(I, tI, not_a_hour_12_value, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'j':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tj = not_a_doy;
+ read(is, ru{tj, 1, width == -1 ? 3u : static_cast<unsigned>(width)});
+ checked_set(j, tj, not_a_doy, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'M':
+ if (command)
+ {
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#else
+ if (modified != CharT{'E'})
+#endif
+ {
+ int tM = not_a_minute;
+ read(is, ru{tM, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+ checked_set(M, tM, not_a_minute, is);
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ checked_set(M, tm.tm_min, not_a_minute, is);
+ is.setstate(err);
+ }
+#endif
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'm':
+ if (command)
+ {
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#else
+ if (modified != CharT{'E'})
+#endif
+ {
+ int tn = not_a_month;
+ read(is, rs{tn, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+ checked_set(m, tn, not_a_month, is);
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ checked_set(m, tm.tm_mon + 1, not_a_month, is);
+ is.setstate(err);
+ }
+#endif
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'n':
+ case 't':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ // %n matches a single white space character
+ // %t matches 0 or 1 white space characters
+ auto ic = is.peek();
+ if (Traits::eq_int_type(ic, Traits::eof()))
+ {
+ ios::iostate err = ios::eofbit;
+ if (*fmt == 'n')
+ err |= ios::failbit;
+ is.setstate(err);
+ break;
+ }
+ if (isspace(ic))
+ {
+ (void)is.get();
+ }
+ else if (*fmt == 'n')
+ is.setstate(ios::failbit);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'p':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tp = not_a_ampm;
+#if !ONLY_C_LOCALE
+ tm = std::tm{};
+ tm.tm_hour = 1;
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ is.setstate(err);
+ if (tm.tm_hour == 1)
+ tp = 0;
+ else if (tm.tm_hour == 13)
+ tp = 1;
+ else
+ is.setstate(err);
+#else
+ auto nm = detail::ampm_names();
+ auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+ tp = i;
+#endif
+ checked_set(p, tp, not_a_ampm, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'r':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+#if !ONLY_C_LOCALE
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ {
+ checked_set(H, tm.tm_hour, not_a_hour, is);
+ checked_set(M, tm.tm_min, not_a_hour, is);
+ checked_set(s, duration_cast<Duration>(seconds{tm.tm_sec}),
+ not_a_second, is);
+ }
+ is.setstate(err);
+#else
+ // "%I:%M:%S %p"
+ using dfs = detail::decimal_format_seconds<Duration>;
+ CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width;
+ long double S;
+ int tI = not_a_hour_12_value;
+ int tM = not_a_minute;
+ read(is, ru{tI, 1, 2}, CharT{':'}, ru{tM, 1, 2},
+ CharT{':'}, rld{S, 1, w});
+ checked_set(I, tI, not_a_hour_12_value, is);
+ checked_set(M, tM, not_a_minute, is);
+ checked_set(s, round<Duration>(duration<long double>{S}),
+ not_a_second, is);
+ ws(is);
+ auto nm = detail::ampm_names();
+ auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+ checked_set(p, static_cast<int>(i), not_a_ampm, is);
+#endif
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'R':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tH = not_a_hour;
+ int tM = not_a_minute;
+ read(is, ru{tH, 1, 2}, CharT{'\0'}, CharT{':'}, CharT{'\0'},
+ ru{tM, 1, 2}, CharT{'\0'});
+ checked_set(H, tH, not_a_hour, is);
+ checked_set(M, tM, not_a_minute, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'S':
+ if (command)
+ {
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#else
+ if (modified != CharT{'E'})
+#endif
+ {
+ using dfs = detail::decimal_format_seconds<Duration>;
+ CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width;
+ long double S;
+ read(is, rld{S, 1, width == -1 ? w : static_cast<unsigned>(width)});
+ checked_set(s, round<Duration>(duration<long double>{S}),
+ not_a_second, is);
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'O'})
+ {
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ checked_set(s, duration_cast<Duration>(seconds{tm.tm_sec}),
+ not_a_second, is);
+ is.setstate(err);
+ }
+#endif
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'T':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ using dfs = detail::decimal_format_seconds<Duration>;
+ CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width;
+ int tH = not_a_hour;
+ int tM = not_a_minute;
+ long double S;
+ read(is, ru{tH, 1, 2}, CharT{':'}, ru{tM, 1, 2},
+ CharT{':'}, rld{S, 1, w});
+ checked_set(H, tH, not_a_hour, is);
+ checked_set(M, tM, not_a_minute, is);
+ checked_set(s, round<Duration>(duration<long double>{S}),
+ not_a_second, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'Y':
+ if (command)
+ {
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#else
+ if (modified != CharT{'O'})
+#endif
+ {
+ int tY = not_a_year;
+ read(is, rs{tY, 1, width == -1 ? 4u : static_cast<unsigned>(width)});
+ checked_set(Y, tY, not_a_year, is);
+ }
+#if !ONLY_C_LOCALE
+ else if (modified == CharT{'E'})
+ {
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ checked_set(Y, tm.tm_year + 1900, not_a_year, is);
+ is.setstate(err);
+ }
+#endif
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'y':
+ if (command)
+ {
+#if !ONLY_C_LOCALE
+ if (modified == CharT{})
+#endif
+ {
+ int ty = not_a_2digit_year;
+ read(is, ru{ty, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+ checked_set(y, ty, not_a_2digit_year, is);
+ }
+#if !ONLY_C_LOCALE
+ else
+ {
+ ios::iostate err = ios::goodbit;
+ f.get(is, nullptr, is, err, &tm, command, fmt+1);
+ if ((err & ios::failbit) == 0)
+ checked_set(Y, tm.tm_year + 1900, not_a_year, is);
+ is.setstate(err);
+ }
+#endif
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'g':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tg = not_a_2digit_year;
+ read(is, ru{tg, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+ checked_set(g, tg, not_a_2digit_year, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'G':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tG = not_a_year;
+ read(is, rs{tG, 1, width == -1 ? 4u : static_cast<unsigned>(width)});
+ checked_set(G, tG, not_a_year, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'U':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tU = not_a_week_num;
+ read(is, ru{tU, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+ checked_set(U, tU, not_a_week_num, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'V':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tV = not_a_week_num;
+ read(is, ru{tV, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+ checked_set(V, tV, not_a_week_num, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'W':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ int tW = not_a_week_num;
+ read(is, ru{tW, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+ checked_set(W, tW, not_a_week_num, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'E':
+ case 'O':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ modified = *fmt;
+ }
+ else
+ {
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ }
+ else
+ read(is, *fmt);
+ break;
+ case '%':
+ if (command)
+ {
+ if (modified == CharT{})
+ read(is, *fmt);
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ command = fmt;
+ break;
+ case 'z':
+ if (command)
+ {
+ int tH, tM;
+ minutes toff = not_a_offset;
+ bool neg = false;
+ auto ic = is.peek();
+ if (!Traits::eq_int_type(ic, Traits::eof()))
+ {
+ auto c = static_cast<char>(Traits::to_char_type(ic));
+ if (c == '-')
+ neg = true;
+ }
+ if (modified == CharT{})
+ {
+ read(is, rs{tH, 2, 2});
+ if (!is.fail())
+ toff = hours{std::abs(tH)};
+ if (is.good())
+ {
+ ic = is.peek();
+ if (!Traits::eq_int_type(ic, Traits::eof()))
+ {
+ auto c = static_cast<char>(Traits::to_char_type(ic));
+ if ('0' <= c && c <= '9')
+ {
+ read(is, ru{tM, 2, 2});
+ if (!is.fail())
+ toff += minutes{tM};
+ }
+ }
+ }
+ }
+ else
+ {
+ read(is, rs{tH, 1, 2});
+ if (!is.fail())
+ toff = hours{std::abs(tH)};
+ if (is.good())
+ {
+ ic = is.peek();
+ if (!Traits::eq_int_type(ic, Traits::eof()))
+ {
+ auto c = static_cast<char>(Traits::to_char_type(ic));
+ if (c == ':')
+ {
+ (void)is.get();
+ read(is, ru{tM, 2, 2});
+ if (!is.fail())
+ toff += minutes{tM};
+ }
+ }
+ }
+ }
+ if (neg)
+ toff = -toff;
+ checked_set(temp_offset, toff, not_a_offset, is);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ case 'Z':
+ if (command)
+ {
+ if (modified == CharT{})
+ {
+ std::basic_string<CharT, Traits, Alloc> buf;
+ while (is.rdstate() == std::ios::goodbit)
+ {
+ auto i = is.rdbuf()->sgetc();
+ if (Traits::eq_int_type(i, Traits::eof()))
+ {
+ is.setstate(ios::eofbit);
+ break;
+ }
+ auto wc = Traits::to_char_type(i);
+ auto c = static_cast<char>(wc);
+ // is c a valid time zone name or abbreviation character?
+ if (!(CharT{1} < wc && wc < CharT{127}) || !(isalnum(c) ||
+ c == '_' || c == '/' || c == '-' || c == '+'))
+ break;
+ buf.push_back(c);
+ is.rdbuf()->sbumpc();
+ }
+ if (buf.empty())
+ is.setstate(ios::failbit);
+ checked_set(temp_abbrev, buf, {}, is);
+ }
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ else
+ read(is, *fmt);
+ break;
+ default:
+ if (command)
+ {
+ if (width == -1 && modified == CharT{} && '0' <= *fmt && *fmt <= '9')
+ {
+ width = static_cast<char>(*fmt) - '0';
+ while ('0' <= fmt[1] && fmt[1] <= '9')
+ width = 10*width + static_cast<char>(*++fmt) - '0';
+ }
+ else
+ {
+ if (modified == CharT{})
+ read(is, CharT{'%'}, width, *fmt);
+ else
+ read(is, CharT{'%'}, width, modified, *fmt);
+ command = nullptr;
+ width = -1;
+ modified = CharT{};
+ }
+ }
+ else // !command
+ {
+ if (isspace(static_cast<unsigned char>(*fmt)))
+ {
+ // space matches 0 or more white space characters
+ if (is.good())
+ ws(is);
+ }
+ else
+ read(is, *fmt);
+ }
+ break;
+ }
+ }
+ // is.fail() || *fmt == CharT{}
+ if (is.rdstate() == ios::goodbit && command)
+ {
+ if (modified == CharT{})
+ read(is, CharT{'%'}, width);
+ else
+ read(is, CharT{'%'}, width, modified);
+ }
+ if (!is.fail())
+ {
+ if (y != not_a_2digit_year)
+ {
+ // Convert y and an optional C to Y
+ if (!(0 <= y && y <= 99))
+ goto broken;
+ if (C == not_a_century)
+ {
+ if (Y == not_a_year)
+ {
+ if (y >= 69)
+ C = 19;
+ else
+ C = 20;
+ }
+ else
+ {
+ C = (Y >= 0 ? Y : Y-100) / 100;
+ }
+ }
+ int tY;
+ if (C >= 0)
+ tY = 100*C + y;
+ else
+ tY = 100*(C+1) - (y == 0 ? 100 : y);
+ if (Y != not_a_year && Y != tY)
+ goto broken;
+ Y = tY;
+ }
+ if (g != not_a_2digit_year)
+ {
+ // Convert g and an optional C to G
+ if (!(0 <= g && g <= 99))
+ goto broken;
+ if (C == not_a_century)
+ {
+ if (G == not_a_year)
+ {
+ if (g >= 69)
+ C = 19;
+ else
+ C = 20;
+ }
+ else
+ {
+ C = (G >= 0 ? G : G-100) / 100;
+ }
+ }
+ int tG;
+ if (C >= 0)
+ tG = 100*C + g;
+ else
+ tG = 100*(C+1) - (g == 0 ? 100 : g);
+ if (G != not_a_year && G != tG)
+ goto broken;
+ G = tG;
+ }
+ if (Y < static_cast<int>(year::min()) || Y > static_cast<int>(year::max()))
+ Y = not_a_year;
+ bool computed = false;
+ if (G != not_a_year && V != not_a_week_num && wd != not_a_weekday)
+ {
+ year_month_day ymd_trial = sys_days(year{G-1}/December/Thursday[last]) +
+ (Monday-Thursday) + weeks{V-1} +
+ (weekday{static_cast<unsigned>(wd)}-Monday);
+ if (Y == not_a_year)
+ Y = static_cast<int>(ymd_trial.year());
+ else if (year{Y} != ymd_trial.year())
+ goto broken;
+ if (m == not_a_month)
+ m = static_cast<int>(static_cast<unsigned>(ymd_trial.month()));
+ else if (month(static_cast<unsigned>(m)) != ymd_trial.month())
+ goto broken;
+ if (d == not_a_day)
+ d = static_cast<int>(static_cast<unsigned>(ymd_trial.day()));
+ else if (day(static_cast<unsigned>(d)) != ymd_trial.day())
+ goto broken;
+ computed = true;
+ }
+ if (Y != not_a_year && U != not_a_week_num && wd != not_a_weekday)
+ {
+ year_month_day ymd_trial = sys_days(year{Y}/January/Sunday[1]) +
+ weeks{U-1} +
+ (weekday{static_cast<unsigned>(wd)} - Sunday);
+ if (Y == not_a_year)
+ Y = static_cast<int>(ymd_trial.year());
+ else if (year{Y} != ymd_trial.year())
+ goto broken;
+ if (m == not_a_month)
+ m = static_cast<int>(static_cast<unsigned>(ymd_trial.month()));
+ else if (month(static_cast<unsigned>(m)) != ymd_trial.month())
+ goto broken;
+ if (d == not_a_day)
+ d = static_cast<int>(static_cast<unsigned>(ymd_trial.day()));
+ else if (day(static_cast<unsigned>(d)) != ymd_trial.day())
+ goto broken;
+ computed = true;
+ }
+ if (Y != not_a_year && W != not_a_week_num && wd != not_a_weekday)
+ {
+ year_month_day ymd_trial = sys_days(year{Y}/January/Monday[1]) +
+ weeks{W-1} +
+ (weekday{static_cast<unsigned>(wd)} - Monday);
+ if (Y == not_a_year)
+ Y = static_cast<int>(ymd_trial.year());
+ else if (year{Y} != ymd_trial.year())
+ goto broken;
+ if (m == not_a_month)
+ m = static_cast<int>(static_cast<unsigned>(ymd_trial.month()));
+ else if (month(static_cast<unsigned>(m)) != ymd_trial.month())
+ goto broken;
+ if (d == not_a_day)
+ d = static_cast<int>(static_cast<unsigned>(ymd_trial.day()));
+ else if (day(static_cast<unsigned>(d)) != ymd_trial.day())
+ goto broken;
+ computed = true;
+ }
+ if (j != not_a_doy && Y != not_a_year)
+ {
+ auto ymd_trial = year_month_day{local_days(year{Y}/1/1) + days{j-1}};
+ if (m == 0)
+ m = static_cast<int>(static_cast<unsigned>(ymd_trial.month()));
+ else if (month(static_cast<unsigned>(m)) != ymd_trial.month())
+ goto broken;
+ if (d == 0)
+ d = static_cast<int>(static_cast<unsigned>(ymd_trial.day()));
+ else if (day(static_cast<unsigned>(d)) != ymd_trial.day())
+ goto broken;
+ j = not_a_doy;
+ }
+ auto ymd = year{Y}/m/d;
+ if (ymd.ok())
+ {
+ if (wd == not_a_weekday)
+ wd = static_cast<int>((weekday(sys_days(ymd)) - Sunday).count());
+ else if (wd != static_cast<int>((weekday(sys_days(ymd)) - Sunday).count()))
+ goto broken;
+ if (!computed)
+ {
+ if (G != not_a_year || V != not_a_week_num)
+ {
+ sys_days sd = ymd;
+ auto G_trial = year_month_day{sd + days{3}}.year();
+ auto start = sys_days((G_trial - years{1})/December/Thursday[last]) +
+ (Monday - Thursday);
+ if (sd < start)
+ {
+ --G_trial;
+ if (V != not_a_week_num)
+ start = sys_days((G_trial - years{1})/December/Thursday[last])
+ + (Monday - Thursday);
+ }
+ if (G != not_a_year && G != static_cast<int>(G_trial))
+ goto broken;
+ if (V != not_a_week_num)
+ {
+ auto V_trial = duration_cast<weeks>(sd - start).count() + 1;
+ if (V != V_trial)
+ goto broken;
+ }
+ }
+ if (U != not_a_week_num)
+ {
+ auto start = sys_days(Sunday[1]/January/ymd.year());
+ auto U_trial = floor<weeks>(sys_days(ymd) - start).count() + 1;
+ if (U != U_trial)
+ goto broken;
+ }
+ if (W != not_a_week_num)
+ {
+ auto start = sys_days(Monday[1]/January/ymd.year());
+ auto W_trial = floor<weeks>(sys_days(ymd) - start).count() + 1;
+ if (W != W_trial)
+ goto broken;
+ }
+ }
+ }
+ fds.ymd = ymd;
+ if (I != not_a_hour_12_value)
+ {
+ if (!(1 <= I && I <= 12))
+ goto broken;
+ if (p != not_a_ampm)
+ {
+ // p is in [0, 1] == [AM, PM]
+ // Store trial H in I
+ if (I == 12)
+ --p;
+ I += p*12;
+ // Either set H from I or make sure H and I are consistent
+ if (H == not_a_hour)
+ H = I;
+ else if (I != H)
+ goto broken;
+ }
+ else // p == not_a_ampm
+ {
+ // if H is set, make sure H and I could be consistent
+ if (H != not_a_hour)
+ {
+ if (I == 12)
+ {
+ if (H != 0 && H != 12)
+ goto broken;
+ }
+ else if (!(I == H || I == H+12))
+ {
+ goto broken;
+ }
+ }
+ }
+ }
+ if (H != not_a_hour)
+ {
+ fds.has_tod = true;
+ fds.tod = hh_mm_ss<Duration>{hours{H}};
+ }
+ if (M != not_a_minute)
+ {
+ fds.has_tod = true;
+ fds.tod.m_ = minutes{M};
+ }
+ if (s != not_a_second)
+ {
+ fds.has_tod = true;
+ fds.tod.s_ = detail::decimal_format_seconds<Duration>{s};
+ }
+ if (j != not_a_doy)
+ {
+ fds.has_tod = true;
+ fds.tod.h_ += hours{days{j}};
+ }
+ if (wd != not_a_weekday)
+ fds.wd = weekday{static_cast<unsigned>(wd)};
+ if (abbrev != nullptr)
+ *abbrev = std::move(temp_abbrev);
+ if (offset != nullptr && temp_offset != not_a_offset)
+ *offset = temp_offset;
+ }
+ return is;
+ }
+broken:
+ is.setstate(ios::failbit);
+ return is;
+}
+
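+// Each overload below runs the master from_stream() into a scratch fields<>
+// and then extracts just the requested component, failing the stream when
+// that component did not parse into a valid value.
+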
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, year& y,
+ std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{};
+ from_stream(is, fmt, fds, abbrev, offset);
+ if (!fds.ymd.year().ok())
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ y = fds.ymd.year();
+ return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, month& m,
+ std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{};
+ from_stream(is, fmt, fds, abbrev, offset);
+ if (!fds.ymd.month().ok())
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ m = fds.ymd.month();
+ return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, day& d,
+ std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{};
+ from_stream(is, fmt, fds, abbrev, offset);
+ if (!fds.ymd.day().ok())
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ d = fds.ymd.day();
+ return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, weekday& wd,
+ std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{};
+ from_stream(is, fmt, fds, abbrev, offset);
+ if (!fds.wd.ok())
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ wd = fds.wd;
+ return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, year_month& ym,
+ std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{};
+ from_stream(is, fmt, fds, abbrev, offset);
+ if (!fds.ymd.month().ok())
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ ym = fds.ymd.year()/fds.ymd.month();
+ return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, month_day& md,
+ std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{};
+ from_stream(is, fmt, fds, abbrev, offset);
+ if (!fds.ymd.month().ok() || !fds.ymd.day().ok())
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ md = fds.ymd.month()/fds.ymd.day();
+ return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+ year_month_day& ymd, std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using CT = std::chrono::seconds;
+ fields<CT> fds{};
+ from_stream(is, fmt, fds, abbrev, offset);
+ if (!fds.ymd.ok())
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ ymd = fds.ymd;
+ return is;
+}
+
+template <class Duration, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+ sys_time<Duration>& tp, std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+ std::chrono::minutes offset_local{};
+ auto offptr = offset ? offset : &offset_local;
+ fields<CT> fds{};
+ fds.has_tod = true;
+ from_stream(is, fmt, fds, abbrev, offptr);
+ if (!fds.ymd.ok() || !fds.tod.in_conventional_range())
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ tp = round<Duration>(sys_days(fds.ymd) - *offptr + fds.tod.to_duration());
+ return is;
+}
+
+template <class Duration, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+ local_time<Duration>& tp, std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+ fields<CT> fds{};
+ fds.has_tod = true;
+ from_stream(is, fmt, fds, abbrev, offset);
+ if (!fds.ymd.ok() || !fds.tod.in_conventional_range())
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ tp = round<Duration>(local_seconds{local_days(fds.ymd)} + fds.tod.to_duration());
+ return is;
+}
+
+template <class Rep, class Period, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+ std::chrono::duration<Rep, Period>& d,
+ std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using Duration = std::chrono::duration<Rep, Period>;
+ using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+ fields<CT> fds{};
+ from_stream(is, fmt, fds, abbrev, offset);
+ if (!fds.has_tod)
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ d = std::chrono::duration_cast<Duration>(fds.tod.to_duration());
+ return is;
+}
+
+template <class Parsable, class CharT, class Traits = std::char_traits<CharT>,
+ class Alloc = std::allocator<CharT>>
+struct parse_manip
+{
+ const std::basic_string<CharT, Traits, Alloc> format_;
+ Parsable& tp_;
+ std::basic_string<CharT, Traits, Alloc>* abbrev_;
+ std::chrono::minutes* offset_;
+
+public:
+ parse_manip(std::basic_string<CharT, Traits, Alloc> format, Parsable& tp,
+ std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+ : format_(std::move(format))
+ , tp_(tp)
+ , abbrev_(abbrev)
+ , offset_(offset)
+ {}
+
+};
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+std::basic_istream<CharT, Traits>&
+operator>>(std::basic_istream<CharT, Traits>& is,
+ const parse_manip<Parsable, CharT, Traits, Alloc>& x)
+{
+ return from_stream(is, x.format_.c_str(), x.tp_, x.abbrev_, x.offset_);
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const std::basic_string<CharT, Traits, Alloc>& format, Parsable& tp)
+ -> decltype(from_stream(std::declval<std::basic_istream<CharT, Traits>&>(),
+ format.c_str(), tp),
+ parse_manip<Parsable, CharT, Traits, Alloc>{format, tp})
+{
+ return {format, tp};
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const std::basic_string<CharT, Traits, Alloc>& format, Parsable& tp,
+ std::basic_string<CharT, Traits, Alloc>& abbrev)
+ -> decltype(from_stream(std::declval<std::basic_istream<CharT, Traits>&>(),
+ format.c_str(), tp, &abbrev),
+ parse_manip<Parsable, CharT, Traits, Alloc>{format, tp, &abbrev})
+{
+ return {format, tp, &abbrev};
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const std::basic_string<CharT, Traits, Alloc>& format, Parsable& tp,
+ std::chrono::minutes& offset)
+ -> decltype(from_stream(std::declval<std::basic_istream<CharT, Traits>&>(),
+ format.c_str(), tp,
+ std::declval<std::basic_string<CharT, Traits, Alloc>*>(),
+ &offset),
+ parse_manip<Parsable, CharT, Traits, Alloc>{format, tp, nullptr, &offset})
+{
+ return {format, tp, nullptr, &offset};
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const std::basic_string<CharT, Traits, Alloc>& format, Parsable& tp,
+ std::basic_string<CharT, Traits, Alloc>& abbrev, std::chrono::minutes& offset)
+ -> decltype(from_stream(std::declval<std::basic_istream<CharT, Traits>&>(),
+ format.c_str(), tp, &abbrev, &offset),
+ parse_manip<Parsable, CharT, Traits, Alloc>{format, tp, &abbrev, &offset})
+{
+ return {format, tp, &abbrev, &offset};
+}
+
+// const CharT* formats
+
+template <class Parsable, class CharT>
+inline
+auto
+parse(const CharT* format, Parsable& tp)
+ -> decltype(from_stream(std::declval<std::basic_istream<CharT>&>(), format, tp),
+ parse_manip<Parsable, CharT>{format, tp})
+{
+ return {format, tp};
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const CharT* format, Parsable& tp, std::basic_string<CharT, Traits, Alloc>& abbrev)
+ -> decltype(from_stream(std::declval<std::basic_istream<CharT, Traits>&>(), format,
+ tp, &abbrev),
+ parse_manip<Parsable, CharT, Traits, Alloc>{format, tp, &abbrev})
+{
+ return {format, tp, &abbrev};
+}
+
+template <class Parsable, class CharT>
+inline
+auto
+parse(const CharT* format, Parsable& tp, std::chrono::minutes& offset)
+ -> decltype(from_stream(std::declval<std::basic_istream<CharT>&>(), format,
+ tp, std::declval<std::basic_string<CharT>*>(), &offset),
+ parse_manip<Parsable, CharT>{format, tp, nullptr, &offset})
+{
+ return {format, tp, nullptr, &offset};
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const CharT* format, Parsable& tp,
+ std::basic_string<CharT, Traits, Alloc>& abbrev, std::chrono::minutes& offset)
+ -> decltype(from_stream(std::declval<std::basic_istream<CharT, Traits>&>(), format,
+ tp, &abbrev, &offset),
+ parse_manip<Parsable, CharT, Traits, Alloc>{format, tp, &abbrev, &offset})
+{
+ return {format, tp, &abbrev, &offset};
+}
+
+// duration streaming
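+//
+// Streams a duration as its tick count followed by a unit suffix derived
+// from Period (e.g. "250ms"); get_units() supplies the abbreviation.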
+
+template <class CharT, class Traits, class Rep, class Period>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os,
+ const std::chrono::duration<Rep, Period>& d)
+{
+ return os << detail::make_string<CharT, Traits>::from(d.count()) +
+ detail::get_units<CharT>(typename Period::type{});
+}
+
+} // namespace date
+} // namespace arrow_vendored
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+#ifdef __GNUC__
+# pragma GCC diagnostic pop
+#endif
+
+#endif // DATE_H
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/ios.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/ios.h
new file mode 100644
index 00000000000..46567d69b18
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/ios.h
@@ -0,0 +1,53 @@
+//
+// ios.h
+// DateTimeLib
+//
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Alexander Kormanovsky
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef ios_hpp
+#define ios_hpp
+
+#if __APPLE__
+# include <TargetConditionals.h>
+# if TARGET_OS_IPHONE
+# include <string>
+
+ namespace arrow_vendored
+ {
+ namespace date
+ {
+ namespace iOSUtils
+ {
+
+ std::string get_tzdata_path();
+ std::string get_current_timezone();
+
+ } // namespace iOSUtils
+ } // namespace date
+ } // namespace arrow_vendored
+
+# endif // TARGET_OS_IPHONE
+#else // !__APPLE__
+# define TARGET_OS_IPHONE 0
+#endif // !__APPLE__
+#endif // ios_hpp
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/ios.mm b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/ios.mm
new file mode 100644
index 00000000000..18c521201d3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/ios.mm
@@ -0,0 +1,340 @@
+//
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Alexander Kormanovsky
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include "ios.h"
+
+#if TARGET_OS_IPHONE
+
+#include <Foundation/Foundation.h>
+
+#include <fstream>
+#include <zlib.h>
+#include <sys/stat.h>
+
+#ifndef TAR_DEBUG
+# define TAR_DEBUG 0
+#endif
+
+#define INTERNAL_DIR "Library"
+#define TZDATA_DIR "tzdata"
+#define TARGZ_EXTENSION "tar.gz"
+
+#define TAR_BLOCK_SIZE 512
+#define TAR_TYPE_POSITION 156
+#define TAR_NAME_POSITION 0
+#define TAR_NAME_SIZE 100
+#define TAR_SIZE_POSITION 124
+#define TAR_SIZE_SIZE 12
+
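+// Minimal tar background for the reader (illustrative, POSIX/ustar layout):
+// an archive is a sequence of 512-byte blocks. Each object starts with a
+// header block holding the name at offset 0 (100 bytes), the content size at
+// offset 124 (12 bytes of zero-padded octal ASCII) and the type flag at
+// offset 156. A size field of "00000013052" is octal 13052 = 5674 bytes,
+// stored in ceil(5674/512) = 12 content blocks after the header.
+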
+namespace arrow_vendored
+{
+namespace date
+{
+ namespace iOSUtils
+ {
+
+ struct TarInfo
+ {
+ char objType;
+ std::string objName;
+ size_t realContentSize; // actual content size, without zero padding
+ size_t blocksContentSize; // content size rounded up to whole 512-byte blocks
+ bool success;
+ };
+
+ std::string convertCFStringRefPathToCStringPath(CFStringRef ref);
+ bool extractTzdata(CFURLRef homeUrl, CFURLRef archiveUrl, std::string destPath);
+ TarInfo getTarObjectInfo(std::ifstream &readStream);
+ std::string getTarObject(std::ifstream &readStream, int64_t size);
+ bool writeFile(const std::string &tzdataPath, const std::string &fileName,
+ const std::string &data, size_t realContentSize);
+
+ std::string
+ get_current_timezone()
+ {
+ CFTimeZoneRef tzRef = CFTimeZoneCopySystem();
+ CFStringRef tzNameRef = CFTimeZoneGetName(tzRef);
+ CFIndex bufferSize = CFStringGetLength(tzNameRef) + 1;
+ char buffer[bufferSize];
+
+ if (CFStringGetCString(tzNameRef, buffer, bufferSize, kCFStringEncodingUTF8))
+ {
+ CFRelease(tzRef);
+ return std::string(buffer);
+ }
+
+ CFRelease(tzRef);
+
+ return "";
+ }
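+
+ // Ownership note (Core Foundation naming rules): CFTimeZoneCopySystem
+ // follows the Create/Copy rule, so tzRef above must be CFRelease'd,
+ // while CFTimeZoneGetName follows the Get rule and returns a reference
+ // we do not own, so tzNameRef is deliberately not released here.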
+
+ std::string
+ get_tzdata_path()
+ {
+ CFURLRef homeUrlRef = CFCopyHomeDirectoryURL();
+ CFStringRef homePath = CFURLCopyPath(homeUrlRef);
+ std::string path(std::string(convertCFStringRefPathToCStringPath(homePath)) +
+ INTERNAL_DIR + "/" + TZDATA_DIR);
+ std::string result_path(std::string(convertCFStringRefPathToCStringPath(homePath)) +
+ INTERNAL_DIR);
+
+ if (access(path.c_str(), F_OK) == 0)
+ {
+#if TAR_DEBUG
+ printf("tzdata dir exists\n");
+#endif
+ CFRelease(homeUrlRef);
+ CFRelease(homePath);
+
+ return result_path;
+ }
+
+ CFBundleRef mainBundle = CFBundleGetMainBundle();
+ CFArrayRef paths = CFBundleCopyResourceURLsOfType(mainBundle, CFSTR(TARGZ_EXTENSION),
+ NULL);
+
+ if (CFArrayGetCount(paths) != 0)
+ {
+ // get archive path, assume there is no other tar.gz in bundle
+ CFURLRef archiveUrl = static_cast<CFURLRef>(CFArrayGetValueAtIndex(paths, 0));
+ CFStringRef archiveName = CFURLCopyPath(archiveUrl);
+ archiveUrl = CFBundleCopyResourceURL(mainBundle, archiveName, NULL, NULL);
+
+ extractTzdata(homeUrlRef, archiveUrl, path);
+
+ CFRelease(archiveUrl);
+ CFRelease(archiveName);
+ }
+
+ CFRelease(homeUrlRef);
+ CFRelease(homePath);
+ CFRelease(paths);
+
+ return result_path;
+ }
+
+ std::string
+ convertCFStringRefPathToCStringPath(CFStringRef ref)
+ {
+ CFIndex bufferSize = CFStringGetMaximumSizeOfFileSystemRepresentation(ref);
+ char *buffer = new char[bufferSize];
+ CFStringGetFileSystemRepresentation(ref, buffer, bufferSize);
+ auto result = std::string(buffer);
+ delete[] buffer;
+ return result;
+ }
+
+ bool
+ extractTzdata(CFURLRef homeUrl, CFURLRef archiveUrl, std::string destPath)
+ {
+ std::string TAR_TMP_PATH = "/tmp.tar";
+
+ CFStringRef homeStringRef = CFURLCopyPath(homeUrl);
+ auto homePath = convertCFStringRefPathToCStringPath(homeStringRef);
+ CFRelease(homeStringRef);
+
+ CFStringRef archiveStringRef = CFURLCopyPath(archiveUrl);
+ auto archivePath = convertCFStringRefPathToCStringPath(archiveStringRef);
+ CFRelease(archiveStringRef);
+
+ // create Library path
+ auto libraryPath = homePath + INTERNAL_DIR;
+
+ // create tzdata path
+ auto tzdataPath = libraryPath + "/" + TZDATA_DIR;
+
+ // -- replace %20 with " "
+ const std::string search = "%20";
+ const std::string replacement = " ";
+ size_t pos = 0;
+
+ while ((pos = archivePath.find(search, pos)) != std::string::npos) {
+ archivePath.replace(pos, search.length(), replacement);
+ pos += replacement.length();
+ }
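+
+ // e.g. an archive at ".../My%20App/tzdata.tar.gz" becomes
+ // ".../My App/tzdata.tar.gz" before being handed to gzopen below.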
+
+ gzFile tarFile = gzopen(archivePath.c_str(), "rb");
+
+ // create tar unpacking path
+ auto tarPath = libraryPath + TAR_TMP_PATH;
+
+ // create tzdata directory
+ mkdir(destPath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+
+ // ======= extract tar ========
+
+ std::ofstream os(tarPath.c_str(), std::ofstream::out | std::ofstream::binary | std::ofstream::trunc);
+ unsigned int bufferLength = 1024 * 256; // 256Kb
+ unsigned char *buffer = (unsigned char *)malloc(bufferLength);
+ bool success = true;
+
+ while (true)
+ {
+ int readBytes = gzread(tarFile, buffer, bufferLength);
+
+ if (readBytes > 0)
+ {
+ os.write((char *) &buffer[0], readBytes);
+ }
+ else
+ if (readBytes == 0)
+ {
+ break;
+ }
+ else
+ if (readBytes == -1)
+ {
+ printf("decompression failed\n");
+ success = false;
+ break;
+ }
+ else
+ {
+ printf("unexpected zlib state\n");
+ success = false;
+ break;
+ }
+ }
+
+ os.close();
+ free(buffer);
+ gzclose(tarFile);
+
+ if (!success)
+ {
+ remove(tarPath.c_str());
+ return false;
+ }
+
+ // ======== extract files =========
+
+ uint64_t location = 0; // Position in the file
+
+ // get file size
+ struct stat stat_buf;
+ int res = stat(tarPath.c_str(), &stat_buf);
+ if (res != 0)
+ {
+ printf("error file size\n");
+ remove(tarPath.c_str());
+ return false;
+ }
+ int64_t tarSize = stat_buf.st_size;
+
+ // create read stream
+ std::ifstream is(tarPath.c_str(), std::ifstream::in | std::ifstream::binary);
+
+ // process files
+#if TAR_DEBUG
+ size_t size = 0;
+ int count = 0;
+#endif
+ while (location < static_cast<uint64_t>(tarSize))
+ {
+ TarInfo info = getTarObjectInfo(is);
+
+ if (!info.success || info.realContentSize == 0)
+ {
+ break; // something wrong or all files are read
+ }
+
+ switch (info.objType)
+ {
+ case '0': // file
+ case '\0': //
+ {
+ std::string obj = getTarObject(is, info.blocksContentSize);
+#if TAR_DEBUG
+ size += info.realContentSize;
+ printf("#%i %s file size %zu written total %zu from %lld\n", ++count,
+ info.objName.c_str(), info.realContentSize, size,
+ (long long)tarSize);
+#endif
+ writeFile(tzdataPath, info.objName, obj, info.realContentSize);
+ location += info.blocksContentSize;
+
+ break;
+ }
+ }
+ }
+
+ remove(tarPath.c_str());
+
+ return true;
+ }
+
+ TarInfo
+ getTarObjectInfo(std::ifstream &readStream)
+ {
+ int64_t length = TAR_BLOCK_SIZE;
+ char buffer[length];
+ char type;
+ char name[TAR_NAME_SIZE + 1];
+ char sizeBuf[TAR_SIZE_SIZE + 1];
+
+ readStream.read(buffer, length);
+
+ memcpy(&type, &buffer[TAR_TYPE_POSITION], 1);
+
+ memset(&name, '\0', TAR_NAME_SIZE + 1);
+ memcpy(&name, &buffer[TAR_NAME_POSITION], TAR_NAME_SIZE);
+
+ memset(&sizeBuf, '\0', TAR_SIZE_SIZE + 1);
+ memcpy(&sizeBuf, &buffer[TAR_SIZE_POSITION], TAR_SIZE_SIZE);
+ size_t realSize = strtol(sizeBuf, NULL, 8);
+ // round up to whole 512-byte blocks; an exact multiple needs no padding block
+ size_t blocksSize = (realSize + TAR_BLOCK_SIZE - 1) / TAR_BLOCK_SIZE * TAR_BLOCK_SIZE;
+
+ return {type, std::string(name), realSize, blocksSize, true};
+ }
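+
+ // Worked example (illustrative): a size field reading "00000001750"
+ // is octal 1750 = 1000 bytes, so realContentSize == 1000 and
+ // blocksContentSize == 1024 (two 512-byte blocks); a 1024-byte file
+ // occupies exactly two blocks, with no extra padding block.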
+
+ std::string
+ getTarObject(std::ifstream &readStream, int64_t size)
+ {
+ char buffer[size];
+ readStream.read(buffer, size);
+ // construct with an explicit length: tar content may contain NUL bytes
+ return std::string(buffer, static_cast<size_t>(size));
+ }
+
+ bool
+ writeFile(const std::string &tzdataPath, const std::string &fileName, const std::string &data,
+ size_t realContentSize)
+ {
+ std::ofstream os(tzdataPath + "/" + fileName, std::ofstream::out | std::ofstream::binary);
+
+ if (!os) {
+ return false;
+ }
+
+ // write only the real content, dropping the tar block padding
+ os.write(data.data(), static_cast<std::streamsize>(realContentSize));
+ os.close();
+
+ return true;
+ }
+
+ } // namespace iOSUtils
+} // namespace date
+} // namespace arrow_vendored
+
+#endif // TARGET_OS_IPHONE
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz.cpp b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz.cpp
new file mode 100644
index 00000000000..e80e392bd73
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz.cpp
@@ -0,0 +1,3877 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2015, 2016, 2017 Howard Hinnant
+// Copyright (c) 2015 Ville Voutilainen
+// Copyright (c) 2016 Alexander Kormanovsky
+// Copyright (c) 2016, 2017 Jiangang Zhuang
+// Copyright (c) 2017 Nicolas Veloz Savino
+// Copyright (c) 2017 Florian Dang
+// Copyright (c) 2017 Aaron Bishop
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// Our apologies. When the previous paragraph was written, lowercase had not yet
+// been invented (that would involve another several millennia of evolution).
+// We did not mean to shout.
+
+// NOTE(ARROW): This is required so that symbols are properly exported from the DLL
+#include "visibility.h"
+
+#ifdef _WIN32
+ // windows.h will be included directly and indirectly (e.g. by curl).
+ // We need to define these macros to prevent windows.h bringing in
+ // more than we need and do it early so windows.h doesn't get included
+ // without these macros having been defined.
+ // min/max macros interfere with the C++ versions.
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+ // We don't need all that Windows has to offer.
+# ifndef WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN
+# endif
+
+ // for wcstombs
+# ifndef _CRT_SECURE_NO_WARNINGS
+# define _CRT_SECURE_NO_WARNINGS
+# endif
+
+ // None of this happens with the MS SDK (at least VS14 which I tested), but:
+ // Compiling with mingw, we get "error: 'KF_FLAG_DEFAULT' was not declared
+ // in this scope" and "error: 'SHGetKnownFolderPath' was not declared in
+ // this scope".
+ // It seems when using mingw NTDDI_VERSION is undefined and that
+ // causes KNOWN_FOLDER_FLAG and the KF_ flags to not get defined.
+ // So we must define NTDDI_VERSION to get those flags on mingw.
+ // The docs say though here:
+ // https://msdn.microsoft.com/en-nz/library/windows/desktop/aa383745(v=vs.85).aspx
+ // that "If you define NTDDI_VERSION, you must also define _WIN32_WINNT."
+ // So we declare we require Vista or greater.
+# ifdef __MINGW32__
+
+# ifndef NTDDI_VERSION
+# define NTDDI_VERSION 0x06000000
+# define _WIN32_WINNT _WIN32_WINNT_VISTA
+# elif NTDDI_VERSION < 0x06000000
+# warning "If this fails to compile NTDDI_VERSION may be to low. See comments above."
+# endif
+ // But once we define the values above we then get this linker error:
+ // "tz.cpp:(.rdata$.refptr.FOLDERID_Downloads[.refptr.FOLDERID_Downloads]+0x0): "
+ // "undefined reference to `FOLDERID_Downloads'"
+ // which #include <initguid.h> cures see:
+ // https://support.microsoft.com/en-us/kb/130869
+# include <initguid.h>
+ // But with <initguid.h> included, the error moves on to:
+ // error: 'FOLDERID_Downloads' was not declared in this scope
+ // Which #include <knownfolders.h> cures.
+# error #include <knownfolders.h>
+
+# endif // __MINGW32__
+
+# include <windows.h>
+#endif // _WIN32
+
+#include "tz_private.h"
+
+#ifdef __APPLE__
+# include "ios.h"
+#else
+# define TARGET_OS_IPHONE 0
+# define TARGET_OS_SIMULATOR 0
+#endif
+
+#if USE_OS_TZDB
+# include <dirent.h>
+#endif
+#include <algorithm>
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <cwchar>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#if USE_OS_TZDB
+# include <queue>
+#endif
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <vector>
+#include <sys/stat.h>
+
+// unistd.h is used on some platforms as part of the means to get
+// the current time zone. On Win32, windows.h provides a means to do it.
+// gcc/mingw supports unistd.h on Win32 but MSVC does not.
+
+#ifdef _WIN32
+# ifdef WINAPI_FAMILY
+# include <winapifamily.h>
+# if WINAPI_FAMILY != WINAPI_FAMILY_DESKTOP_APP
+# define WINRT
+# define INSTALL .
+# endif
+# endif
+
+# include <io.h> // _unlink etc.
+
+# if defined(__clang__)
+ struct IUnknown; // fix for issue with static_cast<> in objbase.h
+ // (see https://github.com/philsquared/Catch/issues/690)
+# endif
+
+# include <shlobj.h> // CoTaskFree, ShGetKnownFolderPath etc.
+# if HAS_REMOTE_API
+# include <direct.h> // _mkdir
+# include <shellapi.h> // ShFileOperation etc.
+# endif // HAS_REMOTE_API
+#else // !_WIN32
+# include <unistd.h>
+# if !USE_OS_TZDB
+# include <wordexp.h>
+# endif
+# include <limits.h>
+# include <string.h>
+# if !USE_SHELL_API
+# include <sys/stat.h>
+# include <sys/fcntl.h>
+# include <dirent.h>
+# include <cstring>
+# include <sys/wait.h>
+# include <sys/types.h>
+# endif //!USE_SHELL_API
+#endif // !_WIN32
+
+
+#if HAS_REMOTE_API
+ // Note curl includes windows.h so we must include curl AFTER definitions of things
+ // that affect windows.h such as NOMINMAX.
+#if defined(_MSC_VER) && defined(SHORTENED_CURL_INCLUDE)
+ // For rmt_curl nuget package
+# error #include <curl.h>
+#else
+# error #include <curl/curl.h>
+#endif
+#endif
+
+#ifdef _WIN32
+static CONSTDATA char folder_delimiter = '\\';
+#else // !_WIN32
+static CONSTDATA char folder_delimiter = '/';
+#endif // !_WIN32
+
+#if defined(__GNUC__) && __GNUC__ < 5
+ // GCC 4.9 Bug 61489 Wrong warning with -Wmissing-field-initializers
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#endif // defined(__GNUC__) && __GNUC__ < 5
+
+#if !USE_OS_TZDB
+
+# ifdef _WIN32
+# ifndef WINRT
+
+namespace
+{
+ struct task_mem_deleter
+ {
+ void operator()(wchar_t buf[])
+ {
+ if (buf != nullptr)
+ CoTaskMemFree(buf);
+ }
+ };
+ using co_task_mem_ptr = std::unique_ptr<wchar_t[], task_mem_deleter>;
+}
+
+// We might need to know certain locations even if not using the remote API,
+// so keep these routines out of that block for now.
+static
+std::string
+get_known_folder(const GUID& folderid)
+{
+ std::string folder;
+ PWSTR pfolder = nullptr;
+ HRESULT hr = SHGetKnownFolderPath(folderid, KF_FLAG_DEFAULT, nullptr, &pfolder);
+ if (SUCCEEDED(hr))
+ {
+ co_task_mem_ptr folder_ptr(pfolder);
+ const wchar_t* fptr = folder_ptr.get();
+ auto state = std::mbstate_t();
+ const auto required = std::wcsrtombs(nullptr, &fptr, 0, &state);
+ if (required != 0 && required != std::size_t(-1))
+ {
+ folder.resize(required);
+ std::wcsrtombs(&folder[0], &fptr, folder.size(), &state);
+ }
+ }
+ return folder;
+}
+
+# ifndef INSTALL
+
+// Usually something like "c:\Users\username\Downloads".
+static
+std::string
+get_download_folder()
+{
+ return get_known_folder(FOLDERID_Downloads);
+}
+
+# endif // !INSTALL
+
+# endif // WINRT
+# else // !_WIN32
+
+# if !defined(INSTALL)
+
+static
+std::string
+expand_path(std::string path)
+{
+# if TARGET_OS_IPHONE
+ return date::iOSUtils::get_tzdata_path();
+# else // !TARGET_OS_IPHONE
+ ::wordexp_t w{};
+ std::unique_ptr<::wordexp_t, void(*)(::wordexp_t*)> hold{&w, ::wordfree};
+ ::wordexp(path.c_str(), &w, 0);
+ if (w.we_wordc != 1)
+ throw std::runtime_error("Cannot expand path: " + path);
+ path = w.we_wordv[0];
+ return path;
+# endif // !TARGET_OS_IPHONE
+}
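+
+// e.g. expand_path("~/Downloads") uses wordexp(3) for shell-style tilde
+// expansion and yields something like "/home/user/Downloads" on Linux;
+// the exact result depends on the environment (illustrative only).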
+
+static
+std::string
+get_download_folder()
+{
+ return expand_path("~/Downloads");
+}
+
+# endif // !defined(INSTALL)
+
+# endif // !_WIN32
+
+#endif // !USE_OS_TZDB
+
+namespace arrow_vendored
+{
+namespace date
+{
+// +---------------------+
+// | Begin Configuration |
+// +---------------------+
+
+using namespace detail;
+
+#if !USE_OS_TZDB
+
+static
+std::string&
+access_install()
+{
+ static std::string install
+#ifndef INSTALL
+
+ = get_download_folder() + folder_delimiter + "tzdata";
+
+#else // !INSTALL
+
+# define STRINGIZEIMP(x) #x
+# define STRINGIZE(x) STRINGIZEIMP(x)
+
+ = STRINGIZE(INSTALL) + std::string(1, folder_delimiter) + "tzdata";
+
+ #undef STRINGIZEIMP
+ #undef STRINGIZE
+#endif // !INSTALL
+
+ return install;
+}
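+
+// The two-level STRINGIZE idiom above is required so that INSTALL is macro-
+// expanded before being stringized: building with -DINSTALL=/usr/share/tzdata
+// (illustrative) makes STRINGIZE(INSTALL) yield "/usr/share/tzdata", whereas
+// a single-level #INSTALL would yield the literal string "INSTALL".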
+
+void
+set_install(const std::string& s)
+{
+ access_install() = s;
+}
+
+static
+const std::string&
+get_install()
+{
+ static const std::string& ref = access_install();
+ return ref;
+}
+
+#if HAS_REMOTE_API
+static
+std::string
+get_download_gz_file(const std::string& version)
+{
+ auto file = get_install() + version + ".tar.gz";
+ return file;
+}
+#endif // HAS_REMOTE_API
+
+#endif // !USE_OS_TZDB
+
+// These can be used to reduce the range of the database to save memory
+CONSTDATA auto min_year = date::year::min();
+CONSTDATA auto max_year = date::year::max();
+
+CONSTDATA auto min_day = date::January/1;
+CONSTDATA auto max_day = date::December/31;
+
+#if USE_OS_TZDB
+
+CONSTCD14 const sys_seconds min_seconds = sys_days(min_year/min_day);
+
+#endif // USE_OS_TZDB
+
+#ifndef _WIN32
+
+static
+std::string
+discover_tz_dir()
+{
+ struct stat sb;
+ using namespace std;
+# ifndef __APPLE__
+ CONSTDATA auto tz_dir_default = "/usr/share/zoneinfo";
+ CONSTDATA auto tz_dir_buildroot = "/usr/share/zoneinfo/uclibc";
+
+ // Check special path which is valid for buildroot with uclibc builds
+ if(stat(tz_dir_buildroot, &sb) == 0 && S_ISDIR(sb.st_mode))
+ return tz_dir_buildroot;
+ else if(stat(tz_dir_default, &sb) == 0 && S_ISDIR(sb.st_mode))
+ return tz_dir_default;
+ else
+ throw runtime_error("discover_tz_dir failed to find zoneinfo\n");
+# else // __APPLE__
+# if TARGET_OS_IPHONE
+# if TARGET_OS_SIMULATOR
+ return "/usr/share/zoneinfo";
+# else
+ return "/var/db/timezone/zoneinfo";
+# endif
+# else
+ CONSTDATA auto timezone = "/etc/localtime";
+ if (!(lstat(timezone, &sb) == 0 && S_ISLNK(sb.st_mode) && sb.st_size > 0))
+ throw runtime_error("discover_tz_dir failed\n");
+ string result;
+ char rp[PATH_MAX+1] = {};
+ if (readlink(timezone, rp, sizeof(rp)-1) > 0)
+ result = string(rp);
+ else
+ throw system_error(errno, system_category(), "readlink() failed");
+ auto i = result.find("zoneinfo");
+ if (i == string::npos)
+ throw runtime_error("discover_tz_dir failed to find zoneinfo\n");
+ i = result.find('/', i);
+ if (i == string::npos)
+ throw runtime_error("discover_tz_dir failed to find '/'\n");
+ return result.substr(0, i);
+# endif
+# endif // __APPLE__
+}
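+
+// Worked example (illustrative) for the desktop macOS branch above: if
+// /etc/localtime is a symlink to /var/db/timezone/zoneinfo/Europe/Paris,
+// readlink() returns that target, "zoneinfo" is found within it, and the
+// substring up to the following '/' gives "/var/db/timezone/zoneinfo".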
+
+static
+const std::string&
+get_tz_dir()
+{
+ static const std::string tz_dir = discover_tz_dir();
+ return tz_dir;
+}
+
+#endif
+
+// +-------------------+
+// | End Configuration |
+// +-------------------+
+
+#ifndef _MSC_VER
+static_assert(min_year <= max_year, "Configuration error");
+#endif
+
+static std::unique_ptr<tzdb> init_tzdb();
+
+tzdb_list::~tzdb_list()
+{
+ const tzdb* ptr = head_;
+ head_ = nullptr;
+ while (ptr != nullptr)
+ {
+ auto next = ptr->next;
+ delete ptr;
+ ptr = next;
+ }
+}
+
+tzdb_list::tzdb_list(tzdb_list&& x) noexcept
+ : head_{x.head_.exchange(nullptr)}
+{
+}
+
+void
+tzdb_list::push_front(tzdb* tzdb) noexcept
+{
+ tzdb->next = head_;
+ head_ = tzdb;
+}
+
+tzdb_list::const_iterator
+tzdb_list::erase_after(const_iterator p) noexcept
+{
+ auto t = p.p_->next;
+ p.p_->next = p.p_->next->next;
+ delete t;
+ return ++p;
+}
+
+struct tzdb_list::undocumented_helper
+{
+ static void push_front(tzdb_list& db_list, tzdb* tzdb) noexcept
+ {
+ db_list.push_front(tzdb);
+ }
+};
+
+static
+tzdb_list
+create_tzdb()
+{
+ tzdb_list tz_db;
+ tzdb_list::undocumented_helper::push_front(tz_db, init_tzdb().release());
+ return tz_db;
+}
+
+tzdb_list&
+get_tzdb_list()
+{
+ static tzdb_list tz_db = create_tzdb();
+ return tz_db;
+}
+
+#if !USE_OS_TZDB
+
+#ifdef _WIN32
+
+static
+void
+sort_zone_mappings(std::vector<date::detail::timezone_mapping>& mappings)
+{
+ std::sort(mappings.begin(), mappings.end(),
+ [](const date::detail::timezone_mapping& lhs,
+ const date::detail::timezone_mapping& rhs)->bool
+ {
+ auto other_result = lhs.other.compare(rhs.other);
+ if (other_result < 0)
+ return true;
+ else if (other_result == 0)
+ {
+ auto territory_result = lhs.territory.compare(rhs.territory);
+ if (territory_result < 0)
+ return true;
+ else if (territory_result == 0)
+ {
+ if (lhs.type < rhs.type)
+ return true;
+ }
+ }
+ return false;
+ });
+}
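+
+// The comparator above is a hand-rolled lexicographic order on the triple
+// (other, territory, type); an equivalent sketch:
+// return std::tie(lhs.other, lhs.territory, lhs.type) <
+//        std::tie(rhs.other, rhs.territory, rhs.type);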
+
+static
+bool
+native_to_standard_timezone_name(const std::string& native_tz_name,
+ std::string& standard_tz_name)
+{
+ // TODO! Does this need to be a case-insensitive compare?
+ if (native_tz_name == "UTC")
+ {
+ standard_tz_name = "Etc/UTC";
+ return true;
+ }
+ standard_tz_name.clear();
+ // TODO: we could improve on this linear search.
+ const auto& mappings = date::get_tzdb().mappings;
+ for (const auto& tzm : mappings)
+ {
+ if (tzm.other == native_tz_name)
+ {
+ standard_tz_name = tzm.type;
+ return true;
+ }
+ }
+ return false;
+}
+
+// Parse this XML file:
+// https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml
+// The parsing method is designed to be simple and quick. It is not overly
+// forgiving of change but it should diagnose basic format issues.
+// See timezone_mapping structure for more info.
+static
+std::vector<detail::timezone_mapping>
+load_timezone_mappings_from_xml_file(const std::string& input_path)
+{
+ std::size_t line_num = 0;
+ std::vector<detail::timezone_mapping> mappings;
+ std::string line;
+
+ std::ifstream is(input_path);
+ if (!is.is_open())
+ {
+ // We don't emit file exceptions because that's an implementation detail.
+ std::string msg = "Error opening time zone mapping file \"";
+ msg += input_path;
+ msg += "\".";
+ throw std::runtime_error(msg);
+ }
+
+ auto error = [&input_path, &line_num](const char* info)
+ {
+ std::string msg = "Error loading time zone mapping file \"";
+ msg += input_path;
+ msg += "\" at line ";
+ msg += std::to_string(line_num);
+ msg += ": ";
+ msg += info;
+ throw std::runtime_error(msg);
+ };
+ // [optional space]a="b"
+ auto read_attribute = [&line, &error]
+ (const char* name, std::string& value, std::size_t startPos)
+ ->std::size_t
+ {
+ value.clear();
+ // Skip leading space before attribute name.
+ std::size_t spos = line.find_first_not_of(' ', startPos);
+ if (spos == std::string::npos)
+ spos = startPos;
+ // Assume everything up to next = is the attribute name
+ // and that an = will always delimit that.
+ std::size_t epos = line.find('=', spos);
+ if (epos == std::string::npos)
+ error("Expected \'=\' right after attribute name.");
+ std::size_t name_len = epos - spos;
+ // Expect the name we find matches the name we expect.
+ if (line.compare(spos, name_len, name) != 0)
+ {
+ std::string msg;
+ msg = "Expected attribute name \'";
+ msg += name;
+ msg += "\' around position ";
+ msg += std::to_string(spos);
+ msg += " but found something else.";
+ error(msg.c_str());
+ }
+ ++epos; // Skip the '=' that is after the attribute name.
+ spos = epos;
+ if (spos < line.length() && line[spos] == '\"')
+ ++spos; // Skip the quote that is before the attribute value.
+ else
+ {
+ std::string msg = "Expected '\"' to begin value of attribute \'";
+ msg += name;
+ msg += "\'.";
+ error(msg.c_str());
+ }
+ epos = line.find('\"', spos);
+ if (epos == std::string::npos)
+ {
+ std::string msg = "Expected '\"' to end value of attribute \'";
+ msg += name;
+ msg += "\'.";
+ error(msg.c_str());
+ }
+ // Extract everything in between the quotes. Note no escaping is done.
+ std::size_t value_len = epos - spos;
+ value.assign(line, spos, value_len);
+ ++epos; // Skip the quote that is after the attribute value;
+ return epos;
+ };
+
+ // Quick but not overly forgiving XML mapping file processing.
+ bool mapTimezonesOpenTagFound = false;
+ bool mapTimezonesCloseTagFound = false;
+ std::size_t mapZonePos = std::string::npos;
+ std::size_t mapTimezonesPos = std::string::npos;
+ CONSTDATA char mapTimeZonesOpeningTag[] = { "<mapTimezones " };
+ CONSTDATA char mapZoneOpeningTag[] = { "<mapZone " };
+ CONSTDATA std::size_t mapZoneOpeningTagLen = sizeof(mapZoneOpeningTag) /
+ sizeof(mapZoneOpeningTag[0]) - 1;
+ while (!mapTimezonesOpenTagFound)
+ {
+ std::getline(is, line);
+ ++line_num;
+ if (is.eof())
+ {
+ // If there is no mapTimezones tag, is that an error? Perhaps if there
+ // are no mapZone mappings it would be acceptable for the parent
+ // mapTimezones element to be missing as well. We treat it as an error,
+ // though, on the assumption that even an empty mapping set should still
+ // arrive wrapped in a mapTimezones element with no mapZone children.
+ // Requiring the tag should catch drastic formatting changes sooner than
+ // silently assuming nothing was found.
+ error("Expected a mapTimezones opening tag.");
+ }
+ mapTimezonesPos = line.find(mapTimeZonesOpeningTag);
+ mapTimezonesOpenTagFound = (mapTimezonesPos != std::string::npos);
+ }
+
+ // NOTE: We could extract the version info that follows the opening
+ // mapTimezones tag and compare it to the version of the other data we
+ // have. One would expect them to be kept in sync, but testing has shown
+ // that they typically do not match anyway, so the comparison would buy
+ // us little.
+ while (!mapTimezonesCloseTagFound)
+ {
+ std::ws(is);
+ std::getline(is, line);
+ ++line_num;
+ if (is.eof())
+ error("Expected a mapTimezones closing tag.");
+ if (line.empty())
+ continue;
+ mapZonePos = line.find(mapZoneOpeningTag);
+ if (mapZonePos != std::string::npos)
+ {
+ mapZonePos += mapZoneOpeningTagLen;
+ detail::timezone_mapping zm{};
+ std::size_t pos = read_attribute("other", zm.other, mapZonePos);
+ pos = read_attribute("territory", zm.territory, pos);
+ read_attribute("type", zm.type, pos);
+ mappings.push_back(std::move(zm));
+
+ continue;
+ }
+ mapTimezonesPos = line.find("</mapTimezones>");
+ mapTimezonesCloseTagFound = (mapTimezonesPos != std::string::npos);
+ if (!mapTimezonesCloseTagFound)
+ {
+ std::size_t commentPos = line.find("<!--");
+ if (commentPos == std::string::npos)
+ error("Unexpected mapping record found. A xml mapZone or comment "
+ "attribute or mapTimezones closing tag was expected.");
+ }
+ }
+
+ is.close();
+ return mappings;
+}
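+
+// Example input (one mapZone record from windowsZones.xml):
+// <mapZone other="Romance Standard Time" territory="FR" type="Europe/Paris"/>
+// parses into a timezone_mapping with other == "Romance Standard Time",
+// territory == "FR" and type == "Europe/Paris".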
+
+#endif // _WIN32
+
+// Parsing helpers
+
+static
+std::string
+parse3(std::istream& in)
+{
+ std::string r(3, ' ');
+ ws(in);
+ r[0] = static_cast<char>(in.get());
+ r[1] = static_cast<char>(in.get());
+ r[2] = static_cast<char>(in.get());
+ return r;
+}
+
+static
+unsigned
+parse_dow(std::istream& in)
+{
+ CONSTDATA char*const dow_names[] =
+ {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
+ auto s = parse3(in);
+ auto dow = std::find(std::begin(dow_names), std::end(dow_names), s) - dow_names;
+ if (dow >= std::end(dow_names) - std::begin(dow_names))
+ throw std::runtime_error("oops: bad dow name: " + s);
+ return static_cast<unsigned>(dow);
+}
+
+static
+unsigned
+parse_month(std::istream& in)
+{
+ CONSTDATA char*const month_names[] =
+ {"Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
+ auto s = parse3(in);
+ auto m = std::find(std::begin(month_names), std::end(month_names), s) - month_names;
+ if (m >= std::end(month_names) - std::begin(month_names))
+ throw std::runtime_error("oops: bad month name: " + s);
+ return static_cast<unsigned>(++m);
+}
+
+static
+std::chrono::seconds
+parse_unsigned_time(std::istream& in)
+{
+ using namespace std::chrono;
+ int x;
+ in >> x;
+ auto r = seconds{hours{x}};
+ if (!in.eof() && in.peek() == ':')
+ {
+ in.get();
+ in >> x;
+ r += minutes{x};
+ if (!in.eof() && in.peek() == ':')
+ {
+ in.get();
+ in >> x;
+ r += seconds{x};
+ }
+ }
+ return r;
+}
+
+static
+std::chrono::seconds
+parse_signed_time(std::istream& in)
+{
+ ws(in);
+ auto sign = 1;
+ if (in.peek() == '-')
+ {
+ sign = -1;
+ in.get();
+ }
+ else if (in.peek() == '+')
+ in.get();
+ return sign * parse_unsigned_time(in);
+}
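+
+// Illustrative examples: parse_unsigned_time("2:30:15") yields 9015s
+// (2h + 30min + 15s); parse_signed_time("-1:30") yields -5400s and
+// parse_signed_time("+4") yields 14400s.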
+
+// MonthDayTime
+
+detail::MonthDayTime::MonthDayTime(local_seconds tp, tz timezone)
+ : zone_(timezone)
+{
+ using namespace date;
+ const auto dp = date::floor<days>(tp);
+ const auto hms = make_time(tp - dp);
+ const auto ymd = year_month_day(dp);
+ u = ymd.month() / ymd.day();
+ h_ = hms.hours();
+ m_ = hms.minutes();
+ s_ = hms.seconds();
+}
+
+detail::MonthDayTime::MonthDayTime(const date::month_day& md, tz timezone)
+ : zone_(timezone)
+{
+ u = md;
+}
+
+date::day
+detail::MonthDayTime::day() const
+{
+ switch (type_)
+ {
+ case month_day:
+ return u.month_day_.day();
+ case month_last_dow:
+ return date::day{31};
+ case lteq:
+ case gteq:
+ break;
+ }
+ return u.month_day_weekday_.month_day_.day();
+}
+
+date::month
+detail::MonthDayTime::month() const
+{
+ switch (type_)
+ {
+ case month_day:
+ return u.month_day_.month();
+ case month_last_dow:
+ return u.month_weekday_last_.month();
+ case lteq:
+ case gteq:
+ break;
+ }
+ return u.month_day_weekday_.month_day_.month();
+}
+
+int
+detail::MonthDayTime::compare(date::year y, const MonthDayTime& x, date::year yx,
+ std::chrono::seconds offset, std::chrono::minutes prev_save) const
+{
+ if (zone_ != x.zone_)
+ {
+ auto dp0 = to_sys_days(y);
+ auto dp1 = x.to_sys_days(yx);
+ if (std::abs((dp0-dp1).count()) > 1)
+ return dp0 < dp1 ? -1 : 1;
+ if (zone_ == tz::local)
+ {
+ auto tp0 = to_time_point(y) - prev_save;
+ if (x.zone_ == tz::utc)
+ tp0 -= offset;
+ auto tp1 = x.to_time_point(yx);
+ return tp0 < tp1 ? -1 : tp0 == tp1 ? 0 : 1;
+ }
+ else if (zone_ == tz::standard)
+ {
+ auto tp0 = to_time_point(y);
+ auto tp1 = x.to_time_point(yx);
+ if (x.zone_ == tz::local)
+ tp1 -= prev_save;
+ else
+ tp0 -= offset;
+ return tp0 < tp1 ? -1 : tp0 == tp1 ? 0 : 1;
+ }
+ // zone_ == tz::utc
+ auto tp0 = to_time_point(y);
+ auto tp1 = x.to_time_point(yx);
+ if (x.zone_ == tz::local)
+ tp1 -= offset + prev_save;
+ else
+ tp1 -= offset;
+ return tp0 < tp1 ? -1 : tp0 == tp1 ? 0 : 1;
+ }
+ auto const t0 = to_time_point(y);
+ auto const t1 = x.to_time_point(yx);
+ return t0 < t1 ? -1 : t0 == t1 ? 0 : 1;
+}
+
+sys_seconds
+detail::MonthDayTime::to_sys(date::year y, std::chrono::seconds offset,
+ std::chrono::seconds save) const
+{
+ using namespace date;
+ using namespace std::chrono;
+ auto until_utc = to_time_point(y);
+ if (zone_ == tz::standard)
+ until_utc -= offset;
+ else if (zone_ == tz::local)
+ until_utc -= offset + save;
+ return until_utc;
+}
+
+detail::MonthDayTime::U&
+detail::MonthDayTime::U::operator=(const date::month_day& x)
+{
+ month_day_ = x;
+ return *this;
+}
+
+detail::MonthDayTime::U&
+detail::MonthDayTime::U::operator=(const date::month_weekday_last& x)
+{
+ month_weekday_last_ = x;
+ return *this;
+}
+
+detail::MonthDayTime::U&
+detail::MonthDayTime::U::operator=(const pair& x)
+{
+ month_day_weekday_ = x;
+ return *this;
+}
+
+date::sys_days
+detail::MonthDayTime::to_sys_days(date::year y) const
+{
+ using namespace std::chrono;
+ using namespace date;
+ switch (type_)
+ {
+ case month_day:
+ return sys_days(y/u.month_day_);
+ case month_last_dow:
+ return sys_days(y/u.month_weekday_last_);
+ case lteq:
+ {
+ auto const x = y/u.month_day_weekday_.month_day_;
+ auto const wd1 = weekday(static_cast<sys_days>(x));
+ auto const wd0 = u.month_day_weekday_.weekday_;
+ return sys_days(x) - (wd1-wd0);
+ }
+ case gteq:
+ break;
+ }
+ auto const x = y/u.month_day_weekday_.month_day_;
+ auto const wd1 = u.month_day_weekday_.weekday_;
+ auto const wd0 = weekday(static_cast<sys_days>(x));
+ return sys_days(x) + (wd1-wd0);
+}
+
+sys_seconds
+detail::MonthDayTime::to_time_point(date::year y) const
+{
+ // Add seconds first to promote to largest rep early to prevent overflow
+ return to_sys_days(y) + s_ + h_ + m_;
+}
+
+void
+detail::MonthDayTime::canonicalize(date::year y)
+{
+ using namespace std::chrono;
+ using namespace date;
+ switch (type_)
+ {
+ case month_day:
+ return;
+ case month_last_dow:
+ {
+ auto const ymd = year_month_day(sys_days(y/u.month_weekday_last_));
+ u.month_day_ = ymd.month()/ymd.day();
+ type_ = month_day;
+ return;
+ }
+ case lteq:
+ {
+ auto const x = y/u.month_day_weekday_.month_day_;
+ auto const wd1 = weekday(static_cast<sys_days>(x));
+ auto const wd0 = u.month_day_weekday_.weekday_;
+ auto const ymd = year_month_day(sys_days(x) - (wd1-wd0));
+ u.month_day_ = ymd.month()/ymd.day();
+ type_ = month_day;
+ return;
+ }
+ case gteq:
+ {
+ auto const x = y/u.month_day_weekday_.month_day_;
+ auto const wd1 = u.month_day_weekday_.weekday_;
+ auto const wd0 = weekday(static_cast<sys_days>(x));
+ auto const ymd = year_month_day(sys_days(x) + (wd1-wd0));
+ u.month_day_ = ymd.month()/ymd.day();
+ type_ = month_day;
+ return;
+ }
+ }
+}
+
+std::istream&
+detail::operator>>(std::istream& is, MonthDayTime& x)
+{
+ using namespace date;
+ using namespace std::chrono;
+ assert(((std::ios::failbit | std::ios::badbit) & is.exceptions()) ==
+ (std::ios::failbit | std::ios::badbit));
+ x = MonthDayTime{};
+ if (!is.eof() && ws(is) && !is.eof() && is.peek() != '#')
+ {
+ auto m = parse_month(is);
+ if (!is.eof() && ws(is) && !is.eof() && is.peek() != '#')
+ {
+ if (is.peek() == 'l')
+ {
+ for (int i = 0; i < 4; ++i)
+ is.get();
+ auto dow = parse_dow(is);
+ x.type_ = MonthDayTime::month_last_dow;
+ x.u = date::month(m)/weekday(dow)[last];
+ }
+ else if (std::isalpha(is.peek()))
+ {
+ auto dow = parse_dow(is);
+ char c{};
+ is >> c;
+ if (c == '<' || c == '>')
+ {
+ char c2{};
+ is >> c2;
+ if (c2 != '=')
+ throw std::runtime_error(std::string("bad operator: ") + c + c2);
+ int d;
+ is >> d;
+ if (d < 1 || d > 31)
+ throw std::runtime_error(std::string("bad operator: ") + c + c2
+ + std::to_string(d));
+ x.type_ = c == '<' ? MonthDayTime::lteq : MonthDayTime::gteq;
+ x.u = MonthDayTime::pair{ date::month(m) / d, date::weekday(dow) };
+ }
+ else
+ throw std::runtime_error(std::string("bad operator: ") + c);
+ }
+ else // std::isdigit(is.peek())
+ {
+ int d;
+ is >> d;
+ if (d < 1 || d > 31)
+ throw std::runtime_error(std::string("day of month: ")
+ + std::to_string(d));
+ x.type_ = MonthDayTime::month_day;
+ x.u = date::month(m)/d;
+ }
+ if (!is.eof() && ws(is) && !is.eof() && is.peek() != '#')
+ {
+ int t;
+ is >> t;
+ x.h_ = hours{t};
+ if (!is.eof() && is.peek() == ':')
+ {
+ is.get();
+ is >> t;
+ x.m_ = minutes{t};
+ if (!is.eof() && is.peek() == ':')
+ {
+ is.get();
+ is >> t;
+ x.s_ = seconds{t};
+ }
+ }
+ if (!is.eof() && std::isalpha(is.peek()))
+ {
+ char c;
+ is >> c;
+ switch (c)
+ {
+ case 's':
+ x.zone_ = tz::standard;
+ break;
+ case 'u':
+ x.zone_ = tz::utc;
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ x.u = month{m}/1;
+ }
+ }
+ return is;
+}
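+
+// Examples of the zic MONTH/DAY/TIME syntax accepted above (illustrative):
+// "Mar lastSun 2:00u" -> month_last_dow at 02:00 UTC;
+// "Apr Sun>=8 2:00" -> gteq (first Sunday on or after Apr 8) at 02:00 wall;
+// "Oct 15 1:00s" -> month_day (Oct 15) at 01:00 standard time.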
+
+std::ostream&
+detail::operator<<(std::ostream& os, const MonthDayTime& x)
+{
+ switch (x.type_)
+ {
+ case MonthDayTime::month_day:
+ os << x.u.month_day_ << " ";
+ break;
+ case MonthDayTime::month_last_dow:
+ os << x.u.month_weekday_last_ << " ";
+ break;
+ case MonthDayTime::lteq:
+ os << x.u.month_day_weekday_.weekday_ << " on or before "
+ << x.u.month_day_weekday_.month_day_ << " ";
+ break;
+ case MonthDayTime::gteq:
+ if ((static_cast<unsigned>(x.day()) - 1) % 7 == 0)
+ {
+ os << (x.u.month_day_weekday_.month_day_.month() /
+ x.u.month_day_weekday_.weekday_[
+ (static_cast<unsigned>(x.day()) - 1)/7+1]) << " ";
+ }
+ else
+ {
+ os << x.u.month_day_weekday_.weekday_ << " on or after "
+ << x.u.month_day_weekday_.month_day_ << " ";
+ }
+ break;
+ }
+ os << date::make_time(x.s_ + x.h_ + x.m_);
+ if (x.zone_ == tz::utc)
+ os << "UTC ";
+ else if (x.zone_ == tz::standard)
+ os << "STD ";
+ else
+ os << " ";
+ return os;
+}
+
+// Rule
+
+detail::Rule::Rule(const std::string& s)
+{
+ try
+ {
+ using namespace date;
+ using namespace std::chrono;
+ std::istringstream in(s);
+ in.exceptions(std::ios::failbit | std::ios::badbit);
+ std::string word;
+ in >> word >> name_;
+ int x;
+ ws(in);
+ if (std::isalpha(in.peek()))
+ {
+ in >> word;
+ if (word == "min")
+ {
+ starting_year_ = year::min();
+ }
+ else
+ throw std::runtime_error("Didn't find expected word: " + word);
+ }
+ else
+ {
+ in >> x;
+ starting_year_ = year{x};
+ }
+ std::ws(in);
+ if (std::isalpha(in.peek()))
+ {
+ in >> word;
+ if (word == "only")
+ {
+ ending_year_ = starting_year_;
+ }
+ else if (word == "max")
+ {
+ ending_year_ = year::max();
+ }
+ else
+ throw std::runtime_error("Didn't find expected word: " + word);
+ }
+ else
+ {
+ in >> x;
+ ending_year_ = year{x};
+ }
+ in >> word; // TYPE (always "-")
+ assert(word == "-");
+ in >> starting_at_;
+ save_ = duration_cast<minutes>(parse_signed_time(in));
+ in >> abbrev_;
+ if (abbrev_ == "-")
+ abbrev_.clear();
+ assert(hours{-1} <= save_ && save_ <= hours{2});
+ }
+ catch (...)
+ {
+ std::cerr << s << '\n';
+ std::cerr << *this << '\n';
+ throw;
+ }
+}
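+
+// Example input, a real line from the tzdata "northamerica" file:
+// Rule US 2007 max - Mar Sun>=8 2:00 1:00 D
+// parses as name_ == "US", years [2007, max], starting_at_ == "Mar Sun>=8
+// 2:00", save_ == 60min and abbrev_ == "D".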
+
+detail::Rule::Rule(const Rule& r, date::year starting_year, date::year ending_year)
+ : name_(r.name_)
+ , starting_year_(starting_year)
+ , ending_year_(ending_year)
+ , starting_at_(r.starting_at_)
+ , save_(r.save_)
+ , abbrev_(r.abbrev_)
+{
+}
+
+bool
+detail::operator==(const Rule& x, const Rule& y)
+{
+ if (std::tie(x.name_, x.save_, x.starting_year_, x.ending_year_) ==
+ std::tie(y.name_, y.save_, y.starting_year_, y.ending_year_))
+ return x.month() == y.month() && x.day() == y.day();
+ return false;
+}
+
+bool
+detail::operator<(const Rule& x, const Rule& y)
+{
+ using namespace std::chrono;
+ auto const xm = x.month();
+ auto const ym = y.month();
+ if (std::tie(x.name_, x.starting_year_, xm, x.ending_year_) <
+ std::tie(y.name_, y.starting_year_, ym, y.ending_year_))
+ return true;
+ if (std::tie(x.name_, x.starting_year_, xm, x.ending_year_) >
+ std::tie(y.name_, y.starting_year_, ym, y.ending_year_))
+ return false;
+ return x.day() < y.day();
+}
+
+bool
+detail::operator==(const Rule& x, const date::year& y)
+{
+ return x.starting_year_ <= y && y <= x.ending_year_;
+}
+
+bool
+detail::operator<(const Rule& x, const date::year& y)
+{
+ return x.ending_year_ < y;
+}
+
+bool
+detail::operator==(const date::year& x, const Rule& y)
+{
+ return y.starting_year_ <= x && x <= y.ending_year_;
+}
+
+bool
+detail::operator<(const date::year& x, const Rule& y)
+{
+ return x < y.starting_year_;
+}
+
+bool
+detail::operator==(const Rule& x, const std::string& y)
+{
+ return x.name() == y;
+}
+
+bool
+detail::operator<(const Rule& x, const std::string& y)
+{
+ return x.name() < y;
+}
+
+bool
+detail::operator==(const std::string& x, const Rule& y)
+{
+ return y.name() == x;
+}
+
+bool
+detail::operator<(const std::string& x, const Rule& y)
+{
+ return x < y.name();
+}
+
+std::ostream&
+detail::operator<<(std::ostream& os, const Rule& r)
+{
+ using namespace date;
+ using namespace std::chrono;
+ detail::save_ostream<char> _(os);
+ os.fill(' ');
+ os.flags(std::ios::dec | std::ios::left);
+ os.width(15);
+ os << r.name_;
+ os << r.starting_year_ << " " << r.ending_year_ << " ";
+ os << r.starting_at_;
+ if (r.save_ >= minutes{0})
+ os << ' ';
+ os << date::make_time(r.save_) << " ";
+ os << r.abbrev_;
+ return os;
+}
+
+date::day
+detail::Rule::day() const
+{
+ return starting_at_.day();
+}
+
+date::month
+detail::Rule::month() const
+{
+ return starting_at_.month();
+}
+
+struct find_rule_by_name
+{
+ bool operator()(const Rule& x, const std::string& nm) const
+ {
+ return x.name() < nm;
+ }
+
+ bool operator()(const std::string& nm, const Rule& x) const
+ {
+ return nm < x.name();
+ }
+};
+
+bool
+detail::Rule::overlaps(const Rule& x, const Rule& y)
+{
+ // assume x.starting_year_ <= y.starting_year_;
+ if (!(x.starting_year_ <= y.starting_year_))
+ {
+ std::cerr << x << '\n';
+ std::cerr << y << '\n';
+ assert(x.starting_year_ <= y.starting_year_);
+ }
+ if (y.starting_year_ > x.ending_year_)
+ return false;
+ return !(x.starting_year_ == y.starting_year_ && x.ending_year_ == y.ending_year_);
+}
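+
+// e.g. rules spanning [1987, 2006] and [2007, max] do not overlap, while
+// [1987, 2006] and [1990, 1999] do; split() below carves overlapping rules
+// into disjoint year ranges.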
+
+void
+detail::Rule::split(std::vector<Rule>& rules, std::size_t i, std::size_t k, std::size_t& e)
+{
+ using namespace date;
+ using difference_type = std::iterator_traits<std::vector<Rule>::iterator>::difference_type;
+ // rules[i].starting_year_ <= rules[k].starting_year_ &&
+ // rules[i].ending_year_ >= rules[k].starting_year_ &&
+ // (rules[i].starting_year_ != rules[k].starting_year_ ||
+ // rules[i].ending_year_ != rules[k].ending_year_)
+ assert(rules[i].starting_year_ <= rules[k].starting_year_ &&
+ rules[i].ending_year_ >= rules[k].starting_year_ &&
+ (rules[i].starting_year_ != rules[k].starting_year_ ||
+ rules[i].ending_year_ != rules[k].ending_year_));
+ if (rules[i].starting_year_ == rules[k].starting_year_)
+ {
+ if (rules[k].ending_year_ < rules[i].ending_year_)
+ {
+ rules.insert(rules.begin() + static_cast<difference_type>(k+1),
+ Rule(rules[i], rules[k].ending_year_ + years{1},
+ std::move(rules[i].ending_year_)));
+ ++e;
+ rules[i].ending_year_ = rules[k].ending_year_;
+ }
+ else // rules[k].ending_year_ > rules[i].ending_year_
+ {
+ rules.insert(rules.begin() + static_cast<difference_type>(k+1),
+ Rule(rules[k], rules[i].ending_year_ + years{1},
+ std::move(rules[k].ending_year_)));
+ ++e;
+ rules[k].ending_year_ = rules[i].ending_year_;
+ }
+ }
+ else // rules[i].starting_year_ < rules[k].starting_year_
+ {
+ if (rules[k].ending_year_ < rules[i].ending_year_)
+ {
+ rules.insert(rules.begin() + static_cast<difference_type>(k),
+ Rule(rules[i], rules[k].starting_year_, rules[k].ending_year_));
+ ++k;
+ rules.insert(rules.begin() + static_cast<difference_type>(k+1),
+ Rule(rules[i], rules[k].ending_year_ + years{1},
+ std::move(rules[i].ending_year_)));
+ rules[i].ending_year_ = rules[k].starting_year_ - years{1};
+ e += 2;
+ }
+ else if (rules[k].ending_year_ > rules[i].ending_year_)
+ {
+ rules.insert(rules.begin() + static_cast<difference_type>(k),
+ Rule(rules[i], rules[k].starting_year_, rules[i].ending_year_));
+ ++k;
+ rules.insert(rules.begin() + static_cast<difference_type>(k+1),
+ Rule(rules[k], rules[i].ending_year_ + years{1},
+ std::move(rules[k].ending_year_)));
+ e += 2;
+ rules[k].ending_year_ = std::move(rules[i].ending_year_);
+ rules[i].ending_year_ = rules[k].starting_year_ - years{1};
+ }
+ else // rules[k].ending_year_ == rules[i].ending_year_
+ {
+ rules.insert(rules.begin() + static_cast<difference_type>(k),
+ Rule(rules[i], rules[k].starting_year_,
+ std::move(rules[i].ending_year_)));
+ ++k;
+ ++e;
+ rules[i].ending_year_ = rules[k].starting_year_ - years{1};
+ }
+ }
+}
+
+void
+detail::Rule::split_overlaps(std::vector<Rule>& rules, std::size_t i, std::size_t& e)
+{
+ using difference_type = std::iterator_traits<std::vector<Rule>::iterator>::difference_type;
+ auto j = i;
+ for (; i + 1 < e; ++i)
+ {
+ for (auto k = i + 1; k < e; ++k)
+ {
+ if (overlaps(rules[i], rules[k]))
+ {
+ split(rules, i, k, e);
+ std::sort(rules.begin() + static_cast<difference_type>(i),
+ rules.begin() + static_cast<difference_type>(e));
+ }
+ }
+ }
+ for (; j < e; ++j)
+ {
+ if (rules[j].starting_year() == rules[j].ending_year())
+ rules[j].starting_at_.canonicalize(rules[j].starting_year());
+ }
+}
+
+void
+detail::Rule::split_overlaps(std::vector<Rule>& rules)
+{
+ using difference_type = std::iterator_traits<std::vector<Rule>::iterator>::difference_type;
+ for (std::size_t i = 0; i < rules.size();)
+ {
+ auto e = static_cast<std::size_t>(std::upper_bound(
+ rules.cbegin()+static_cast<difference_type>(i), rules.cend(), rules[i].name(),
+ [](const std::string& nm, const Rule& x)
+ {
+ return nm < x.name();
+ }) - rules.cbegin());
+ split_overlaps(rules, i, e);
+ auto first_rule = rules.begin() + static_cast<difference_type>(i);
+ auto last_rule = rules.begin() + static_cast<difference_type>(e);
+ auto t = std::lower_bound(first_rule, last_rule, min_year);
+ if (t > first_rule+1)
+ {
+ if (t == last_rule || t->starting_year() >= min_year)
+ --t;
+ auto d = static_cast<std::size_t>(t - first_rule);
+ rules.erase(first_rule, t);
+ e -= d;
+ }
+ first_rule = rules.begin() + static_cast<difference_type>(i);
+ last_rule = rules.begin() + static_cast<difference_type>(e);
+ t = std::upper_bound(first_rule, last_rule, max_year);
+ if (t != last_rule)
+ {
+ auto d = static_cast<std::size_t>(last_rule - t);
+ rules.erase(t, last_rule);
+ e -= d;
+ }
+ i = e;
+ }
+ rules.shrink_to_fit();
+}
+
+// Find the rule that comes chronologically before Rule r. For multi-year
+// rules, y specifies which year within r. For single-year rules, y is assumed
+// to be equal to the year specified by r.
+// Returns a pointer to the chronologically previous rule, and the year within
+// that rule. If there is no previous rule, returns nullptr and year::min().
+// Preconditions:
+// r->starting_year() <= y && y <= r->ending_year()
+static
+std::pair<const Rule*, date::year>
+find_previous_rule(const Rule* r, date::year y)
+{
+ using namespace date;
+ auto const& rules = get_tzdb().rules;
+ if (y == r->starting_year())
+ {
+ if (r == &rules.front() || r->name() != r[-1].name())
+ std::terminate(); // never called with first rule
+ --r;
+ if (y == r->starting_year())
+ return {r, y};
+ return {r, r->ending_year()};
+ }
+ if (r == &rules.front() || r->name() != r[-1].name() ||
+ r[-1].starting_year() < r->starting_year())
+ {
+ while (r < &rules.back() && r->name() == r[1].name() &&
+ r->starting_year() == r[1].starting_year())
+ ++r;
+ return {r, --y};
+ }
+ --r;
+ return {r, y};
+}
+
+// Find the rule that comes chronologically after Rule r. For multi-year
+// rules, y specifies which year within r. For single-year rules, y is assumed
+// to be equal to the year specified by r.
+// Returns a pointer to the chronologically next rule, and the year within
+// that rule. If there is no next rule, returns nullptr and year::max().
+// Preconditions:
+// first <= r && r < last && r->starting_year() <= y && y <= r->ending_year()
+// [first, last) all have the same name
+static
+std::pair<const Rule*, date::year>
+find_next_rule(const Rule* first_rule, const Rule* last_rule, const Rule* r, date::year y)
+{
+ using namespace date;
+ if (y == r->ending_year())
+ {
+ if (r == last_rule-1)
+ return {nullptr, year::max()};
+ ++r;
+ if (y == r->ending_year())
+ return {r, y};
+ return {r, r->starting_year()};
+ }
+ if (r == last_rule-1 || r->ending_year() < r[1].ending_year())
+ {
+ while (r > first_rule && r->starting_year() == r[-1].starting_year())
+ --r;
+ return {r, ++y};
+ }
+ ++r;
+ return {r, y};
+}
+
+// Find the rule that comes chronologically after Rule r. For multi-year
+// rules, y specifies which year within r. For single-year rules, y is assumed
+// to be equal to the year specified by r.
+// Returns a pointer to the chronologically next rule, and the year within
+// that rule. If there is no next rule, returns nullptr and year::max().
+// Preconditions:
+// r->starting_year() <= y && y <= r->ending_year()
+static
+std::pair<const Rule*, date::year>
+find_next_rule(const Rule* r, date::year y)
+{
+ using namespace date;
+ auto const& rules = get_tzdb().rules;
+ if (y == r->ending_year())
+ {
+ if (r == &rules.back() || r->name() != r[1].name())
+ return {nullptr, year::max()};
+ ++r;
+ if (y == r->ending_year())
+ return {r, y};
+ return {r, r->starting_year()};
+ }
+ if (r == &rules.back() || r->name() != r[1].name() ||
+ r->ending_year() < r[1].ending_year())
+ {
+ while (r > &rules.front() && r->name() == r[-1].name() &&
+ r->starting_year() == r[-1].starting_year())
+ --r;
+ return {r, ++y};
+ }
+ ++r;
+ return {r, y};
+}
+
+static
+const Rule*
+find_first_std_rule(const std::pair<const Rule*, const Rule*>& eqr)
+{
+ auto r = eqr.first;
+ auto ry = r->starting_year();
+ while (r->save() != std::chrono::minutes{0})
+ {
+ std::tie(r, ry) = find_next_rule(eqr.first, eqr.second, r, ry);
+ if (r == nullptr)
+ throw std::runtime_error("Could not find standard offset in rule "
+ + eqr.first->name());
+ }
+ return r;
+}
+
+static
+std::pair<const Rule*, date::year>
+find_rule_for_zone(const std::pair<const Rule*, const Rule*>& eqr,
+ const date::year& y, const std::chrono::seconds& offset,
+ const MonthDayTime& mdt)
+{
+ assert(eqr.first != nullptr);
+ assert(eqr.second != nullptr);
+
+ using namespace std::chrono;
+ using namespace date;
+ auto r = eqr.first;
+ auto ry = r->starting_year();
+ auto prev_save = minutes{0};
+ auto prev_year = year::min();
+ const Rule* prev_rule = nullptr;
+ while (r != nullptr)
+ {
+ if (mdt.compare(y, r->mdt(), ry, offset, prev_save) <= 0)
+ break;
+ prev_rule = r;
+ prev_year = ry;
+ prev_save = prev_rule->save();
+ std::tie(r, ry) = find_next_rule(eqr.first, eqr.second, r, ry);
+ }
+ return {prev_rule, prev_year};
+}
+
+static
+std::pair<const Rule*, date::year>
+find_rule_for_zone(const std::pair<const Rule*, const Rule*>& eqr,
+ const sys_seconds& tp_utc,
+ const local_seconds& tp_std,
+ const local_seconds& tp_loc)
+{
+ using namespace std::chrono;
+ using namespace date;
+ auto r = eqr.first;
+ auto ry = r->starting_year();
+ auto prev_save = minutes{0};
+ auto prev_year = year::min();
+ const Rule* prev_rule = nullptr;
+ while (r != nullptr)
+ {
+ bool found = false;
+ switch (r->mdt().zone())
+ {
+ case tz::utc:
+ found = tp_utc < r->mdt().to_time_point(ry);
+ break;
+ case tz::standard:
+ found = sys_seconds{tp_std.time_since_epoch()} < r->mdt().to_time_point(ry);
+ break;
+ case tz::local:
+ found = sys_seconds{tp_loc.time_since_epoch()} < r->mdt().to_time_point(ry);
+ break;
+ }
+ if (found)
+ break;
+ prev_rule = r;
+ prev_year = ry;
+ prev_save = prev_rule->save();
+ std::tie(r, ry) = find_next_rule(eqr.first, eqr.second, r, ry);
+ }
+ return {prev_rule, prev_year};
+}
+
+static
+sys_info
+find_rule(const std::pair<const Rule*, date::year>& first_rule,
+ const std::pair<const Rule*, date::year>& last_rule,
+ const date::year& y, const std::chrono::seconds& offset,
+ const MonthDayTime& mdt, const std::chrono::minutes& initial_save,
+ const std::string& initial_abbrev)
+{
+ using namespace std::chrono;
+ using namespace date;
+ auto r = first_rule.first;
+ auto ry = first_rule.second;
+ sys_info x{sys_days(year::min()/min_day), sys_days(year::max()/max_day),
+ seconds{0}, initial_save, initial_abbrev};
+ while (r != nullptr)
+ {
+ auto tr = r->mdt().to_sys(ry, offset, x.save);
+ auto tx = mdt.to_sys(y, offset, x.save);
+ // Find last rule where tx >= tr
+ if (tx <= tr || (r == last_rule.first && ry == last_rule.second))
+ {
+ if (tx < tr && r == first_rule.first && ry == first_rule.second)
+ {
+ x.end = r->mdt().to_sys(ry, offset, x.save);
+ break;
+ }
+ if (tx < tr)
+ {
+ std::tie(r, ry) = find_previous_rule(r, ry); // can't return nullptr for r
+ assert(r != nullptr);
+ }
+ // r != nullptr && tx >= tr (if tr were to be recomputed)
+ auto prev_save = initial_save;
+ if (!(r == first_rule.first && ry == first_rule.second))
+ prev_save = find_previous_rule(r, ry).first->save();
+ x.begin = r->mdt().to_sys(ry, offset, prev_save);
+ x.save = r->save();
+ x.abbrev = r->abbrev();
+ if (!(r == last_rule.first && ry == last_rule.second))
+ {
+ std::tie(r, ry) = find_next_rule(r, ry); // can't return nullptr for r
+ assert(r != nullptr);
+ x.end = r->mdt().to_sys(ry, offset, x.save);
+ }
+ else
+ x.end = sys_days(year::max()/max_day);
+ break;
+ }
+ x.save = r->save();
+ std::tie(r, ry) = find_next_rule(r, ry); // Can't return nullptr for r
+ assert(r != nullptr);
+ }
+ return x;
+}
+
+// zonelet
+
+detail::zonelet::~zonelet()
+{
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+ using minutes = std::chrono::minutes;
+ using string = std::string;
+ if (tag_ == has_save)
+ u.save_.~minutes();
+ else
+ u.rule_.~string();
+#endif
+}
+
+detail::zonelet::zonelet()
+{
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+ ::new(&u.rule_) std::string();
+#endif
+}
+
+detail::zonelet::zonelet(const zonelet& i)
+ : gmtoff_(i.gmtoff_)
+ , tag_(i.tag_)
+ , format_(i.format_)
+ , until_year_(i.until_year_)
+ , until_date_(i.until_date_)
+ , until_utc_(i.until_utc_)
+ , until_std_(i.until_std_)
+ , until_loc_(i.until_loc_)
+ , initial_save_(i.initial_save_)
+ , initial_abbrev_(i.initial_abbrev_)
+ , first_rule_(i.first_rule_)
+ , last_rule_(i.last_rule_)
+{
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+ if (tag_ == has_save)
+ ::new(&u.save_) std::chrono::minutes(i.u.save_);
+ else
+ ::new(&u.rule_) std::string(i.u.rule_);
+#else
+ if (tag_ == has_save)
+ u.save_ = i.u.save_;
+ else
+ u.rule_ = i.u.rule_;
+#endif
+}
+
+#endif // !USE_OS_TZDB
+
+// time_zone
+
+#if USE_OS_TZDB
+
+time_zone::time_zone(const std::string& s, detail::undocumented)
+ : name_(s)
+ , adjusted_(new std::once_flag{})
+{
+}
+
+enum class endian
+{
+ native = __BYTE_ORDER__,
+ little = __ORDER_LITTLE_ENDIAN__,
+ big = __ORDER_BIG_ENDIAN__
+};
+
+static
+inline
+std::uint32_t
+reverse_bytes(std::uint32_t i)
+{
+ return
+ (i & 0xff000000u) >> 24 |
+ (i & 0x00ff0000u) >> 8 |
+ (i & 0x0000ff00u) << 8 |
+ (i & 0x000000ffu) << 24;
+}
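+
+// e.g. reverse_bytes(std::uint32_t{0x11223344}) == 0x44332211; used below to
+// convert the big-endian fields of a TZif file on little-endian hosts.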
+
+static
+inline
+std::uint64_t
+reverse_bytes(std::uint64_t i)
+{
+ return
+ (i & 0xff00000000000000ull) >> 56 |
+ (i & 0x00ff000000000000ull) >> 40 |
+ (i & 0x0000ff0000000000ull) >> 24 |
+ (i & 0x000000ff00000000ull) >> 8 |
+ (i & 0x00000000ff000000ull) << 8 |
+ (i & 0x0000000000ff0000ull) << 24 |
+ (i & 0x000000000000ff00ull) << 40 |
+ (i & 0x00000000000000ffull) << 56;
+}
+
+template <class T>
+static
+inline
+void
+maybe_reverse_bytes(T&, std::false_type)
+{
+}
+
+static
+inline
+void
+maybe_reverse_bytes(std::int32_t& t, std::true_type)
+{
+ t = static_cast<std::int32_t>(reverse_bytes(static_cast<std::uint32_t>(t)));
+}
+
+static
+inline
+void
+maybe_reverse_bytes(std::int64_t& t, std::true_type)
+{
+ t = static_cast<std::int64_t>(reverse_bytes(static_cast<std::uint64_t>(t)));
+}
+
+template <class T>
+static
+inline
+void
+maybe_reverse_bytes(T& t)
+{
+ maybe_reverse_bytes(t, std::integral_constant<bool,
+ endian::native == endian::little>{});
+}
+
+static
+void
+load_header(std::istream& inf)
+{
+ // Read TZif
+ auto t = inf.get();
+ auto z = inf.get();
+ auto i = inf.get();
+ auto f = inf.get();
+#ifndef NDEBUG
+ assert(t == 'T');
+ assert(z == 'Z');
+ assert(i == 'i');
+ assert(f == 'f');
+#else
+ (void)t;
+ (void)z;
+ (void)i;
+ (void)f;
+#endif
+}
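+
+// TZif layout parsed here (see RFC 8536): 4-byte magic "TZif", a one-byte
+// version ('\0', '2' or '3'), 15 reserved bytes, then six big-endian 32-bit
+// counts (ttisgmtcnt, ttisstdcnt, leapcnt, timecnt, typecnt, charcnt),
+// followed by the transition, type, abbreviation and leap-second tables.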
+
+static
+unsigned char
+load_version(std::istream& inf)
+{
+ // Read version
+ auto v = inf.get();
+ assert(v != EOF);
+ return static_cast<unsigned char>(v);
+}
+
+static
+void
+skip_reserve(std::istream& inf)
+{
+ inf.ignore(15);
+}
+
+static
+void
+load_counts(std::istream& inf,
+ std::int32_t& tzh_ttisgmtcnt, std::int32_t& tzh_ttisstdcnt,
+ std::int32_t& tzh_leapcnt, std::int32_t& tzh_timecnt,
+ std::int32_t& tzh_typecnt, std::int32_t& tzh_charcnt)
+{
+ // Read counts;
+ inf.read(reinterpret_cast<char*>(&tzh_ttisgmtcnt), 4);
+ maybe_reverse_bytes(tzh_ttisgmtcnt);
+ inf.read(reinterpret_cast<char*>(&tzh_ttisstdcnt), 4);
+ maybe_reverse_bytes(tzh_ttisstdcnt);
+ inf.read(reinterpret_cast<char*>(&tzh_leapcnt), 4);
+ maybe_reverse_bytes(tzh_leapcnt);
+ inf.read(reinterpret_cast<char*>(&tzh_timecnt), 4);
+ maybe_reverse_bytes(tzh_timecnt);
+ inf.read(reinterpret_cast<char*>(&tzh_typecnt), 4);
+ maybe_reverse_bytes(tzh_typecnt);
+ inf.read(reinterpret_cast<char*>(&tzh_charcnt), 4);
+ maybe_reverse_bytes(tzh_charcnt);
+}
+
+template <class TimeType>
+static
+std::vector<detail::transition>
+load_transitions(std::istream& inf, std::int32_t tzh_timecnt)
+{
+ // Read transitions
+ using namespace std::chrono;
+ std::vector<detail::transition> transitions;
+ transitions.reserve(static_cast<unsigned>(tzh_timecnt));
+ for (std::int32_t i = 0; i < tzh_timecnt; ++i)
+ {
+ TimeType t;
+ inf.read(reinterpret_cast<char*>(&t), sizeof(t));
+ maybe_reverse_bytes(t);
+ transitions.emplace_back(sys_seconds{seconds{t}});
+ if (transitions.back().timepoint < min_seconds)
+ transitions.back().timepoint = min_seconds;
+ }
+ return transitions;
+}
+
+static
+std::vector<std::uint8_t>
+load_indices(std::istream& inf, std::int32_t tzh_timecnt)
+{
+ // Read indices
+ std::vector<std::uint8_t> indices;
+ indices.reserve(static_cast<unsigned>(tzh_timecnt));
+ for (std::int32_t i = 0; i < tzh_timecnt; ++i)
+ {
+ std::uint8_t t;
+ inf.read(reinterpret_cast<char*>(&t), sizeof(t));
+ indices.emplace_back(t);
+ }
+ return indices;
+}
+
+static
+std::vector<ttinfo>
+load_ttinfo(std::istream& inf, std::int32_t tzh_typecnt)
+{
+ // Read ttinfo
+ std::vector<ttinfo> ttinfos;
+ ttinfos.reserve(static_cast<unsigned>(tzh_typecnt));
+ for (std::int32_t i = 0; i < tzh_typecnt; ++i)
+ {
+ ttinfo t;
+ inf.read(reinterpret_cast<char*>(&t), 6);
+ maybe_reverse_bytes(t.tt_gmtoff);
+ ttinfos.emplace_back(t);
+ }
+ return ttinfos;
+}
+
+static
+std::string
+load_abbreviations(std::istream& inf, std::int32_t tzh_charcnt)
+{
+ // Read abbreviations
+ std::string abbrev;
+ abbrev.resize(static_cast<unsigned>(tzh_charcnt), '\0');
+ inf.read(&abbrev[0], tzh_charcnt);
+ return abbrev;
+}
+
+#if !MISSING_LEAP_SECONDS
+
+template <class TimeType>
+static
+std::vector<leap_second>
+load_leaps(std::istream& inf, std::int32_t tzh_leapcnt)
+{
+ // Read tzh_leapcnt pairs
+ using namespace std::chrono;
+ std::vector<leap_second> leap_seconds;
+ leap_seconds.reserve(static_cast<std::size_t>(tzh_leapcnt));
+ for (std::int32_t i = 0; i < tzh_leapcnt; ++i)
+ {
+ TimeType t0;
+ std::int32_t t1;
+ inf.read(reinterpret_cast<char*>(&t0), sizeof(t0));
+ inf.read(reinterpret_cast<char*>(&t1), sizeof(t1));
+ maybe_reverse_bytes(t0);
+ maybe_reverse_bytes(t1);
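+ // Each record is (occurrence time, total correction after it); subtract
+ // the corrections already in effect (t1 - 1) so the timestamp is
+ // expressed without accumulated leap seconds.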
+ leap_seconds.emplace_back(sys_seconds{seconds{t0 - (t1-1)}},
+ detail::undocumented{});
+ }
+ return leap_seconds;
+}
+
+template <class TimeType>
+static
+std::vector<leap_second>
+load_leap_data(std::istream& inf,
+ std::int32_t tzh_leapcnt, std::int32_t tzh_timecnt,
+ std::int32_t tzh_typecnt, std::int32_t tzh_charcnt)
+{
+ inf.ignore(tzh_timecnt*static_cast<std::int32_t>(sizeof(TimeType)) + tzh_timecnt +
+ tzh_typecnt*6 + tzh_charcnt);
+ return load_leaps<TimeType>(inf, tzh_leapcnt);
+}
+
+static
+std::vector<leap_second>
+load_just_leaps(std::istream& inf)
+{
+ // Read tzh_leapcnt pairs
+ using namespace std::chrono;
+ load_header(inf);
+ auto v = load_version(inf);
+ std::int32_t tzh_ttisgmtcnt, tzh_ttisstdcnt, tzh_leapcnt,
+ tzh_timecnt, tzh_typecnt, tzh_charcnt;
+ skip_reserve(inf);
+ load_counts(inf, tzh_ttisgmtcnt, tzh_ttisstdcnt, tzh_leapcnt,
+ tzh_timecnt, tzh_typecnt, tzh_charcnt);
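+ // Version 0 files carry only 32-bit data. Version '2'/'3' files append a
+ // second header and data block with 64-bit transition times, so skip the
+ // 32-bit payload: (4+1) bytes per transition (time + type index),
+ // 6 bytes per ttinfo, the abbreviation text, 8 bytes per leap record,
+ // and the std/wall and UT/local indicator bytes.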
+ if (v == 0)
+ return load_leap_data<int32_t>(inf, tzh_leapcnt, tzh_timecnt, tzh_typecnt,
+ tzh_charcnt);
+#if !defined(NDEBUG)
+ inf.ignore((4+1)*tzh_timecnt + 6*tzh_typecnt + tzh_charcnt + 8*tzh_leapcnt +
+ tzh_ttisstdcnt + tzh_ttisgmtcnt);
+ load_header(inf);
+ auto v2 = load_version(inf);
+ assert(v == v2);
+ skip_reserve(inf);
+#else // defined(NDEBUG)
+ inf.ignore((4+1)*tzh_timecnt + 6*tzh_typecnt + tzh_charcnt + 8*tzh_leapcnt +
+ tzh_ttisstdcnt + tzh_ttisgmtcnt + (4+1+15));
+#endif // defined(NDEBUG)
+ load_counts(inf, tzh_ttisgmtcnt, tzh_ttisstdcnt, tzh_leapcnt,
+ tzh_timecnt, tzh_typecnt, tzh_charcnt);
+ return load_leap_data<int64_t>(inf, tzh_leapcnt, tzh_timecnt, tzh_typecnt,
+ tzh_charcnt);
+}
+
+#endif // !MISSING_LEAP_SECONDS
+
+template <class TimeType>
+void
+time_zone::load_data(std::istream& inf,
+ std::int32_t tzh_leapcnt, std::int32_t tzh_timecnt,
+ std::int32_t tzh_typecnt, std::int32_t tzh_charcnt)
+{
+ using namespace std::chrono;
+ transitions_ = load_transitions<TimeType>(inf, tzh_timecnt);
+ auto indices = load_indices(inf, tzh_timecnt);
+ auto infos = load_ttinfo(inf, tzh_typecnt);
+ auto abbrev = load_abbreviations(inf, tzh_charcnt);
+#if !MISSING_LEAP_SECONDS
+ auto& leap_seconds = get_tzdb_list().front().leap_seconds;
+ if (leap_seconds.empty() && tzh_leapcnt > 0)
+ leap_seconds = load_leaps<TimeType>(inf, tzh_leapcnt);
+#endif
+ ttinfos_.reserve(infos.size());
+ for (auto& info : infos)
+ {
+ ttinfos_.push_back({seconds{info.tt_gmtoff},
+ abbrev.c_str() + info.tt_abbrind,
+ info.tt_isdst != 0});
+ }
+ auto i = 0u;
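+ // Ensure a transition exists at the beginning of time, preferring a
+ // non-DST info for it, then wire each remaining transition to its
+ // expanded ttinfo through the parsed index table.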
+ if (transitions_.empty() || transitions_.front().timepoint != min_seconds)
+ {
+ transitions_.emplace(transitions_.begin(), min_seconds);
+ auto tf = std::find_if(ttinfos_.begin(), ttinfos_.end(),
+ [](const expanded_ttinfo& ti)
+ {return ti.is_dst == 0;});
+ if (tf == ttinfos_.end())
+ tf = ttinfos_.begin();
+ transitions_[i].info = &*tf;
+ ++i;
+ }
+ for (auto j = 0u; i < transitions_.size(); ++i, ++j)
+ transitions_[i].info = ttinfos_.data() + indices[j];
+}
+
+void
+time_zone::init_impl()
+{
+ using namespace std;
+ using namespace std::chrono;
+ auto name = get_tz_dir() + ('/' + name_);
+ std::ifstream inf(name);
+ if (!inf.is_open())
+ throw std::runtime_error{"Unable to open " + name};
+ inf.exceptions(std::ios::failbit | std::ios::badbit);
+ load_header(inf);
+ auto v = load_version(inf);
+ std::int32_t tzh_ttisgmtcnt, tzh_ttisstdcnt, tzh_leapcnt,
+ tzh_timecnt, tzh_typecnt, tzh_charcnt;
+ skip_reserve(inf);
+ load_counts(inf, tzh_ttisgmtcnt, tzh_ttisstdcnt, tzh_leapcnt,
+ tzh_timecnt, tzh_typecnt, tzh_charcnt);
+ if (v == 0)
+ {
+ load_data<int32_t>(inf, tzh_leapcnt, tzh_timecnt, tzh_typecnt, tzh_charcnt);
+ }
+ else
+ {
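+ // Skip the legacy 32-bit block and its trailing indicators, re-read
+ // the second header, and parse the 64-bit data instead.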
+#if !defined(NDEBUG)
+ inf.ignore((4+1)*tzh_timecnt + 6*tzh_typecnt + tzh_charcnt + 8*tzh_leapcnt +
+ tzh_ttisstdcnt + tzh_ttisgmtcnt);
+ load_header(inf);
+ auto v2 = load_version(inf);
+ assert(v == v2);
+ skip_reserve(inf);
+#else // defined(NDEBUG)
+ inf.ignore((4+1)*tzh_timecnt + 6*tzh_typecnt + tzh_charcnt + 8*tzh_leapcnt +
+ tzh_ttisstdcnt + tzh_ttisgmtcnt + (4+1+15));
+#endif // defined(NDEBUG)
+ load_counts(inf, tzh_ttisgmtcnt, tzh_ttisstdcnt, tzh_leapcnt,
+ tzh_timecnt, tzh_typecnt, tzh_charcnt);
+ load_data<int64_t>(inf, tzh_leapcnt, tzh_timecnt, tzh_typecnt, tzh_charcnt);
+ }
+#if !MISSING_LEAP_SECONDS
+ if (tzh_leapcnt > 0)
+ {
+ auto& leap_seconds = get_tzdb_list().front().leap_seconds;
+ auto itr = leap_seconds.begin();
+ auto l = itr->date();
+ seconds leap_count{0};
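+ // Transitions read from leap-aware ("right/") data include accumulated
+ // leap seconds; subtract the running leap count so timepoints land on
+ // the sys_seconds (POSIX) scale.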
+ for (auto t = std::upper_bound(transitions_.begin(), transitions_.end(), l,
+ [](const sys_seconds& x, const transition& ct)
+ {
+ return x < ct.timepoint;
+ });
+ t != transitions_.end(); ++t)
+ {
+ while (t->timepoint >= l)
+ {
+ ++leap_count;
+ if (++itr == leap_seconds.end())
+ l = sys_days(max_year/max_day);
+ else
+ l = itr->date() + leap_count;
+ }
+ t->timepoint -= leap_count;
+ }
+ }
+#endif // !MISSING_LEAP_SECONDS
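+ // Drop transitions that do not change the offset, abbreviation or DST
+ // flag; they are invisible to lookups.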
+ auto b = transitions_.begin();
+ auto i = transitions_.end();
+ if (i != b)
+ {
+ for (--i; i != b; --i)
+ {
+ if (i->info->offset == i[-1].info->offset &&
+ i->info->abbrev == i[-1].info->abbrev &&
+ i->info->is_dst == i[-1].info->is_dst)
+ i = transitions_.erase(i);
+ }
+ }
+}
+
+void
+time_zone::init() const
+{
+ std::call_once(*adjusted_, [this]() {const_cast<time_zone*>(this)->init_impl();});
+}
+
+sys_info
+time_zone::load_sys_info(std::vector<detail::transition>::const_iterator i) const
+{
+ using namespace std::chrono;
+ assert(!transitions_.empty());
+ assert(i != transitions_.begin());
+ sys_info r;
+ r.begin = i[-1].timepoint;
+ r.end = i != transitions_.end() ? i->timepoint :
+ sys_seconds(sys_days(year::max()/max_day));
+ r.offset = i[-1].info->offset;
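+ // TZif records only an is-DST flag, not the actual save amount, so a
+ // nonzero placeholder is used to mark DST intervals.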
+ r.save = i[-1].info->is_dst ? minutes{1} : minutes{0};
+ r.abbrev = i[-1].info->abbrev;
+ return r;
+}
+
+sys_info
+time_zone::get_info_impl(sys_seconds tp) const
+{
+ using namespace std;
+ init();
+ return load_sys_info(upper_bound(transitions_.begin(), transitions_.end(), tp,
+ [](const sys_seconds& x, const transition& t)
+ {
+ return x < t.timepoint;
+ }));
+}
+
+local_info
+time_zone::get_info_impl(local_seconds tp) const
+{
+ using namespace std::chrono;
+ init();
+ local_info i;
+ i.result = local_info::unique;
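+ // Resolve tp against the nearest transition first, then probe the
+ // neighbouring interval on either side to classify the result as
+ // unique, ambiguous (fold) or nonexistent (gap).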
+ auto tr = upper_bound(transitions_.begin(), transitions_.end(), tp,
+ [](const local_seconds& x, const transition& t)
+ {
+ return sys_seconds{x.time_since_epoch()} -
+ t.info->offset < t.timepoint;
+ });
+ i.first = load_sys_info(tr);
+ auto tps = sys_seconds{(tp - i.first.offset).time_since_epoch()};
+ if (tps < i.first.begin + days{1} && tr != transitions_.begin())
+ {
+ i.second = load_sys_info(--tr);
+ tps = sys_seconds{(tp - i.second.offset).time_since_epoch()};
+ if (tps < i.second.end)
+ {
+ i.result = local_info::ambiguous;
+ std::swap(i.first, i.second);
+ }
+ else
+ {
+ i.second = {};
+ }
+ }
+ else if (tps >= i.first.end && tr != transitions_.end())
+ {
+ i.second = load_sys_info(++tr);
+ tps = sys_seconds{(tp - i.second.offset).time_since_epoch()};
+ if (tps < i.second.begin)
+ i.result = local_info::nonexistent;
+ else
+ i.second = {};
+ }
+ return i;
+}
+
+std::ostream&
+operator<<(std::ostream& os, const time_zone& z)
+{
+ using namespace std::chrono;
+ z.init();
+ os << z.name_ << '\n';
+ os << "Initially: ";
+ auto const& t = z.transitions_.front();
+ if (t.info->offset >= seconds{0})
+ os << '+';
+ os << make_time(t.info->offset);
+ if (t.info->is_dst > 0)
+ os << " daylight ";
+ else
+ os << " standard ";
+ os << t.info->abbrev << '\n';
+ for (auto i = std::next(z.transitions_.cbegin()); i < z.transitions_.cend(); ++i)
+ os << *i << '\n';
+ return os;
+}
+
+#if !MISSING_LEAP_SECONDS
+
+leap_second::leap_second(const sys_seconds& s, detail::undocumented)
+ : date_(s)
+{
+}
+
+#endif // !MISSING_LEAP_SECONDS
+
+#else // !USE_OS_TZDB
+
+time_zone::time_zone(const std::string& s, detail::undocumented)
+ : adjusted_(new std::once_flag{})
+{
+ try
+ {
+ using namespace date;
+ std::istringstream in(s);
+ in.exceptions(std::ios::failbit | std::ios::badbit);
+ std::string word;
+ in >> word >> name_;
+ parse_info(in);
+ }
+ catch (...)
+ {
+ std::cerr << s << '\n';
+ std::cerr << *this << '\n';
+ zonelets_.pop_back();
+ throw;
+ }
+}
+
+sys_info
+time_zone::get_info_impl(sys_seconds tp) const
+{
+ return get_info_impl(tp, static_cast<int>(tz::utc));
+}
+
+local_info
+time_zone::get_info_impl(local_seconds tp) const
+{
+ using namespace std::chrono;
+ local_info i{};
+ i.first = get_info_impl(sys_seconds{tp.time_since_epoch()}, static_cast<int>(tz::local));
+ auto tps = sys_seconds{(tp - i.first.offset).time_since_epoch()};
+ if (tps < i.first.begin)
+ {
+ i.second = std::move(i.first);
+ i.first = get_info_impl(i.second.begin - seconds{1}, static_cast<int>(tz::utc));
+ i.result = local_info::nonexistent;
+ }
+ else if (i.first.end - tps <= days{1})
+ {
+ i.second = get_info_impl(i.first.end, static_cast<int>(tz::utc));
+ tps = sys_seconds{(tp - i.second.offset).time_since_epoch()};
+ if (tps >= i.second.begin)
+ i.result = local_info::ambiguous;
+ else
+ i.second = {};
+ }
+ return i;
+}
+
+void
+time_zone::add(const std::string& s)
+{
+ try
+ {
+ std::istringstream in(s);
+ in.exceptions(std::ios::failbit | std::ios::badbit);
+ ws(in);
+ if (!in.eof() && in.peek() != '#')
+ parse_info(in);
+ }
+ catch (...)
+ {
+ std::cerr << s << '\n';
+ std::cerr << *this << '\n';
+ zonelets_.pop_back();
+ throw;
+ }
+}
+
+void
+time_zone::parse_info(std::istream& in)
+{
+ using namespace date;
+ using namespace std::chrono;
+ zonelets_.emplace_back();
+ auto& zonelet = zonelets_.back();
+ zonelet.gmtoff_ = parse_signed_time(in);
+ in >> zonelet.u.rule_;
+ if (zonelet.u.rule_ == "-")
+ zonelet.u.rule_.clear();
+ in >> zonelet.format_;
+ if (!in.eof())
+ ws(in);
+ if (in.eof() || in.peek() == '#')
+ {
+ zonelet.until_year_ = year::max();
+ zonelet.until_date_ = MonthDayTime(max_day, tz::utc);
+ }
+ else
+ {
+ int y;
+ in >> y;
+ zonelet.until_year_ = year{y};
+ in >> zonelet.until_date_;
+ zonelet.until_date_.canonicalize(zonelet.until_year_);
+ }
+ if ((zonelet.until_year_ < min_year) ||
+ (zonelets_.size() > 1 && zonelets_.end()[-2].until_year_ > max_year))
+ zonelets_.pop_back();
+}
+
+void
+time_zone::adjust_infos(const std::vector<Rule>& rules)
+{
+ using namespace std::chrono;
+ using namespace date;
+ const zonelet* prev_zonelet = nullptr;
+ for (auto& z : zonelets_)
+ {
+ std::pair<const Rule*, const Rule*> eqr{};
+ std::istringstream in;
+ in.exceptions(std::ios::failbit | std::ios::badbit);
+ // Classify info as rule-based, has save, or neither
+ if (!z.u.rule_.empty())
+ {
+ // Find out if this zonelet has a rule or a save
+ eqr = std::equal_range(rules.data(), rules.data() + rules.size(), z.u.rule_);
+ if (eqr.first == eqr.second)
+ {
+ // The rule doesn't exist. Assume this is a save
+ try
+ {
+ using namespace std::chrono;
+ using string = std::string;
+ in.str(z.u.rule_);
+ auto tmp = duration_cast<minutes>(parse_signed_time(in));
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+ z.u.rule_.~string();
+ z.tag_ = zonelet::has_save;
+ ::new(&z.u.save_) minutes(tmp);
+#else
+ z.u.rule_.clear();
+ z.tag_ = zonelet::has_save;
+ z.u.save_ = tmp;
+#endif
+ }
+ catch (...)
+ {
+ std::cerr << name_ << " : " << z.u.rule_ << '\n';
+ throw;
+ }
+ }
+ }
+ else
+ {
+ // This zone::zonelet has no rule and no save
+ z.tag_ = zonelet::is_empty;
+ }
+
+ minutes final_save{0};
+ if (z.tag_ == zonelet::has_save)
+ {
+ final_save = z.u.save_;
+ }
+ else if (z.tag_ == zonelet::has_rule)
+ {
+ z.last_rule_ = find_rule_for_zone(eqr, z.until_year_, z.gmtoff_,
+ z.until_date_);
+ if (z.last_rule_.first != nullptr)
+ final_save = z.last_rule_.first->save();
+ }
+ z.until_utc_ = z.until_date_.to_sys(z.until_year_, z.gmtoff_, final_save);
+ z.until_std_ = local_seconds{z.until_utc_.time_since_epoch()} + z.gmtoff_;
+ z.until_loc_ = z.until_std_ + final_save;
+
+ if (z.tag_ == zonelet::has_rule)
+ {
+ if (prev_zonelet != nullptr)
+ {
+ z.first_rule_ = find_rule_for_zone(eqr, prev_zonelet->until_utc_,
+ prev_zonelet->until_std_,
+ prev_zonelet->until_loc_);
+ if (z.first_rule_.first != nullptr)
+ {
+ z.initial_save_ = z.first_rule_.first->save();
+ z.initial_abbrev_ = z.first_rule_.first->abbrev();
+ if (z.first_rule_ != z.last_rule_)
+ {
+ z.first_rule_ = find_next_rule(eqr.first, eqr.second,
+ z.first_rule_.first,
+ z.first_rule_.second);
+ }
+ else
+ {
+ z.first_rule_ = std::make_pair(nullptr, year::min());
+ z.last_rule_ = std::make_pair(nullptr, year::max());
+ }
+ }
+ }
+ if (z.first_rule_.first == nullptr && z.last_rule_.first != nullptr)
+ {
+ z.first_rule_ = std::make_pair(eqr.first, eqr.first->starting_year());
+ z.initial_abbrev_ = find_first_std_rule(eqr)->abbrev();
+ }
+ }
+
+#ifndef NDEBUG
+ if (z.first_rule_.first == nullptr)
+ {
+ assert(z.first_rule_.second == year::min());
+ assert(z.last_rule_.first == nullptr);
+ assert(z.last_rule_.second == year::max());
+ }
+ else
+ {
+ assert(z.last_rule_.first != nullptr);
+ }
+#endif
+ prev_zonelet = &z;
+ }
+}
+
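+// Expand a zic FORMAT field into the final abbreviation: "%s" is replaced
+// with the rule's letters, a "STD/DST" pair is split on '/' according to
+// whether DST is in effect, and "%z" becomes the numeric offset such as
+// +05 or +0530.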
+static
+std::string
+format_abbrev(std::string format, const std::string& variable, std::chrono::seconds off,
+ std::chrono::minutes save)
+{
+ using namespace std::chrono;
+ auto k = format.find("%s");
+ if (k != std::string::npos)
+ {
+ format.replace(k, 2, variable);
+ }
+ else
+ {
+ k = format.find('/');
+ if (k != std::string::npos)
+ {
+ if (save == minutes{0})
+ format.erase(k);
+ else
+ format.erase(0, k+1);
+ }
+ else
+ {
+ k = format.find("%z");
+ if (k != std::string::npos)
+ {
+ std::string temp;
+ if (off < seconds{0})
+ {
+ temp = '-';
+ off = -off;
+ }
+ else
+ temp = '+';
+ auto h = date::floor<hours>(off);
+ off -= h;
+ if (h < hours{10})
+ temp += '0';
+ temp += std::to_string(h.count());
+ if (off > seconds{0})
+ {
+ auto m = date::floor<minutes>(off);
+ off -= m;
+ if (m < minutes{10})
+ temp += '0';
+ temp += std::to_string(m.count());
+ if (off > seconds{0})
+ {
+ if (off < seconds{10})
+ temp += '0';
+ temp += std::to_string(off.count());
+ }
+ }
+ format.replace(k, 2, temp);
+ }
+ }
+ }
+ return format;
+}
+
+sys_info
+time_zone::get_info_impl(sys_seconds tp, int tz_int) const
+{
+ using namespace std::chrono;
+ using namespace date;
+ tz timezone = static_cast<tz>(tz_int);
+ assert(timezone != tz::standard);
+ auto y = year_month_day(floor<days>(tp)).year();
+ if (y < min_year || y > max_year)
+ throw std::runtime_error("The year " + std::to_string(static_cast<int>(y)) +
+ " is out of range:[" + std::to_string(static_cast<int>(min_year)) + ", "
+ + std::to_string(static_cast<int>(max_year)) + "]");
+ std::call_once(*adjusted_,
+ [this]()
+ {
+ const_cast<time_zone*>(this)->adjust_infos(get_tzdb().rules);
+ });
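+ // Find the first zonelet whose "until" time lies after tp, comparing in
+ // UTC or local time according to the caller's frame of reference.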
+ auto i = std::upper_bound(zonelets_.begin(), zonelets_.end(), tp,
+ [timezone](sys_seconds t, const zonelet& zl)
+ {
+ return timezone == tz::utc ? t < zl.until_utc_ :
+ t < sys_seconds{zl.until_loc_.time_since_epoch()};
+ });
+
+ sys_info r{};
+ if (i != zonelets_.end())
+ {
+ if (i->tag_ == zonelet::has_save)
+ {
+ if (i != zonelets_.begin())
+ r.begin = i[-1].until_utc_;
+ else
+ r.begin = sys_days(year::min()/min_day);
+ r.end = i->until_utc_;
+ r.offset = i->gmtoff_ + i->u.save_;
+ r.save = i->u.save_;
+ }
+ else if (i->u.rule_.empty())
+ {
+ if (i != zonelets_.begin())
+ r.begin = i[-1].until_utc_;
+ else
+ r.begin = sys_days(year::min()/min_day);
+ r.end = i->until_utc_;
+ r.offset = i->gmtoff_;
+ }
+ else
+ {
+ r = find_rule(i->first_rule_, i->last_rule_, y, i->gmtoff_,
+ MonthDayTime(local_seconds{tp.time_since_epoch()}, timezone),
+ i->initial_save_, i->initial_abbrev_);
+ r.offset = i->gmtoff_ + r.save;
+ if (i != zonelets_.begin() && r.begin < i[-1].until_utc_)
+ r.begin = i[-1].until_utc_;
+ if (r.end > i->until_utc_)
+ r.end = i->until_utc_;
+ }
+ r.abbrev = format_abbrev(i->format_, r.abbrev, r.offset, r.save);
+ assert(r.begin < r.end);
+ }
+ return r;
+}
+
+std::ostream&
+operator<<(std::ostream& os, const time_zone& z)
+{
+ using namespace date;
+ using namespace std::chrono;
+ detail::save_ostream<char> _(os);
+ os.fill(' ');
+ os.flags(std::ios::dec | std::ios::left);
+ std::call_once(*z.adjusted_,
+ [&z]()
+ {
+ const_cast<time_zone&>(z).adjust_infos(get_tzdb().rules);
+ });
+ os.width(35);
+ os << z.name_;
+ std::string indent;
+ for (auto const& s : z.zonelets_)
+ {
+ os << indent;
+ if (s.gmtoff_ >= seconds{0})
+ os << ' ';
+ os << make_time(s.gmtoff_) << " ";
+ os.width(15);
+ if (s.tag_ != zonelet::has_save)
+ os << s.u.rule_;
+ else
+ {
+ std::ostringstream tmp;
+ tmp << make_time(s.u.save_);
+ os << tmp.str();
+ }
+ os.width(8);
+ os << s.format_ << " ";
+ os << s.until_year_ << ' ' << s.until_date_;
+ os << " " << s.until_utc_ << " UTC";
+ os << " " << s.until_std_ << " STD";
+ os << " " << s.until_loc_;
+ os << " " << make_time(s.initial_save_);
+ os << " " << s.initial_abbrev_;
+ if (s.first_rule_.first != nullptr)
+ os << " {" << *s.first_rule_.first << ", " << s.first_rule_.second << '}';
+ else
+ os << " {" << "nullptr" << ", " << s.first_rule_.second << '}';
+ if (s.last_rule_.first != nullptr)
+ os << " {" << *s.last_rule_.first << ", " << s.last_rule_.second << '}';
+ else
+ os << " {" << "nullptr" << ", " << s.last_rule_.second << '}';
+ os << '\n';
+ if (indent.empty())
+ indent = std::string(35, ' ');
+ }
+ return os;
+}
+
+#endif // !USE_OS_TZDB
+
+#if !MISSING_LEAP_SECONDS
+
+std::ostream&
+operator<<(std::ostream& os, const leap_second& x)
+{
+ using namespace date;
+ return os << x.date_ << " +";
+}
+
+#endif // !MISSING_LEAP_SECONDS
+
+#if USE_OS_TZDB
+
+# ifdef __APPLE__
+static
+std::string
+get_version()
+{
+ using namespace std;
+ auto path = get_tz_dir() + string("/+VERSION");
+ ifstream in{path};
+ string version;
+ in >> version;
+ if (in.fail())
+ throw std::runtime_error("Unable to get Timezone database version from " + path);
+ return version;
+}
+# endif
+
+static
+std::unique_ptr<tzdb>
+init_tzdb()
+{
+ std::unique_ptr<tzdb> db(new tzdb);
+
+ // Iterate through folders
+ std::queue<std::string> subfolders;
+ subfolders.emplace(get_tz_dir());
+ struct dirent* d;
+ struct stat s;
+ while (!subfolders.empty())
+ {
+ auto dirname = std::move(subfolders.front());
+ subfolders.pop();
+ auto dir = opendir(dirname.c_str());
+ if (!dir)
+ continue;
+ while ((d = readdir(dir)) != nullptr)
+ {
+ // Ignore these files:
+ if (d->d_name[0] == '.' || // curdir, prevdir, hidden
+ memcmp(d->d_name, "posix", 5) == 0 || // starts with posix
+ strcmp(d->d_name, "Factory") == 0 ||
+ strcmp(d->d_name, "iso3166.tab") == 0 ||
+ strcmp(d->d_name, "right") == 0 ||
+ strcmp(d->d_name, "+VERSION") == 0 ||
+ strcmp(d->d_name, "zone.tab") == 0 ||
+ strcmp(d->d_name, "zone1970.tab") == 0 ||
+ strcmp(d->d_name, "tzdata.zi") == 0 ||
+ strcmp(d->d_name, "leapseconds") == 0 ||
+ strcmp(d->d_name, "leap-seconds.list") == 0 )
+ continue;
+ auto subname = dirname + folder_delimiter + d->d_name;
+ if(stat(subname.c_str(), &s) == 0)
+ {
+ if(S_ISDIR(s.st_mode))
+ {
+ if(!S_ISLNK(s.st_mode))
+ {
+ subfolders.push(subname);
+ }
+ }
+ else
+ {
+ db->zones.emplace_back(subname.substr(get_tz_dir().size()+1),
+ detail::undocumented{});
+ }
+ }
+ }
+ closedir(dir);
+ }
+ db->zones.shrink_to_fit();
+ std::sort(db->zones.begin(), db->zones.end());
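+ // Leap-second records live in the leap-aware "right/UTC" file when the
+ // OS ships it; otherwise fall back to plain "UTC", which may contain no
+ // leap records at all.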
+# if !MISSING_LEAP_SECONDS
+ std::ifstream in(get_tz_dir() + std::string(1, folder_delimiter) + "right/UTC",
+ std::ios_base::binary);
+ if (in)
+ {
+ in.exceptions(std::ios::failbit | std::ios::badbit);
+ db->leap_seconds = load_just_leaps(in);
+ }
+ else
+ {
+ in.clear();
+ in.open(get_tz_dir() + std::string(1, folder_delimiter) +
+ "UTC", std::ios_base::binary);
+ if (!in)
+ throw std::runtime_error("Unable to extract leap second information");
+ in.exceptions(std::ios::failbit | std::ios::badbit);
+ db->leap_seconds = load_just_leaps(in);
+ }
+# endif // !MISSING_LEAP_SECONDS
+# ifdef __APPLE__
+ db->version = get_version();
+# endif
+ return db;
+}
+
+#else // !USE_OS_TZDB
+
+// time_zone_link
+
+time_zone_link::time_zone_link(const std::string& s)
+{
+ using namespace date;
+ std::istringstream in(s);
+ in.exceptions(std::ios::failbit | std::ios::badbit);
+ std::string word;
+ in >> word >> target_ >> name_;
+}
+
+std::ostream&
+operator<<(std::ostream& os, const time_zone_link& x)
+{
+ using namespace date;
+ detail::save_ostream<char> _(os);
+ os.fill(' ');
+ os.flags(std::ios::dec | std::ios::left);
+ os.width(35);
+ return os << x.name_ << " --> " << x.target_;
+}
+
+// leap_second
+
+leap_second::leap_second(const std::string& s, detail::undocumented)
+{
+ using namespace date;
+ std::istringstream in(s);
+ in.exceptions(std::ios::failbit | std::ios::badbit);
+ std::string word;
+ int y;
+ MonthDayTime date;
+ in >> word >> y >> date;
+ date_ = date.to_time_point(year(y));
+}
+
+static
+bool
+file_exists(const std::string& filename)
+{
+#ifdef _WIN32
+ return ::_access(filename.c_str(), 0) == 0;
+#else
+ return ::access(filename.c_str(), F_OK) == 0;
+#endif
+}
+
+#if HAS_REMOTE_API
+
+// CURL tools
+
+static
+int
+curl_global()
+{
+ if (::curl_global_init(CURL_GLOBAL_DEFAULT) != 0)
+ throw std::runtime_error("CURL global initialization failed");
+ return 0;
+}
+
+namespace
+{
+
+struct curl_deleter
+{
+ void operator()(CURL* p) const
+ {
+ ::curl_easy_cleanup(p);
+ }
+};
+
+} // unnamed namespace
+
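+// curl_global_init() must run exactly once per process; the function-local
+// static below runs it on the first curl_init() call, thread-safely under
+// C++11 magic statics.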
+static
+std::unique_ptr<CURL, curl_deleter>
+curl_init()
+{
+ static const auto curl_is_now_initialized = curl_global();
+ (void)curl_is_now_initialized;
+ return std::unique_ptr<CURL, curl_deleter>{::curl_easy_init()};
+}
+
+static
+bool
+download_to_string(const std::string& url, std::string& str)
+{
+ str.clear();
+ auto curl = curl_init();
+ if (!curl)
+ return false;
+ std::string version;
+ curl_easy_setopt(curl.get(), CURLOPT_USERAGENT, "curl");
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+ curl_write_callback write_cb = [](char* contents, std::size_t size, std::size_t nmemb,
+ void* userp) -> std::size_t
+ {
+ auto& userstr = *static_cast<std::string*>(userp);
+ auto realsize = size * nmemb;
+ userstr.append(contents, realsize);
+ return realsize;
+ };
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, write_cb);
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &str);
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_VERIFYPEER, false);
+ auto res = curl_easy_perform(curl.get());
+ return (res == CURLE_OK);
+}
+
+namespace
+{
+ enum class download_file_options { binary, text };
+}
+
+static
+bool
+download_to_file(const std::string& url, const std::string& local_filename,
+ download_file_options opts, char* error_buffer)
+{
+ auto curl = curl_init();
+ if (!curl)
+ return false;
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_VERIFYPEER, false);
+ if (error_buffer)
+ curl_easy_setopt(curl.get(), CURLOPT_ERRORBUFFER, error_buffer);
+ curl_write_callback write_cb = [](char* contents, std::size_t size, std::size_t nmemb,
+ void* userp) -> std::size_t
+ {
+ auto& of = *static_cast<std::ofstream*>(userp);
+ auto realsize = size * nmemb;
+ of.write(contents, static_cast<std::streamsize>(realsize));
+ return realsize;
+ };
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, write_cb);
+ decltype(curl_easy_perform(curl.get())) res;
+ {
+ std::ofstream of(local_filename,
+ opts == download_file_options::binary ?
+ std::ofstream::out | std::ofstream::binary :
+ std::ofstream::out);
+ of.exceptions(std::ios::badbit);
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &of);
+ res = curl_easy_perform(curl.get());
+ }
+ return res == CURLE_OK;
+}
+
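+// Determine the latest IANA release by scraping the time-zones landing
+// page: the first "/time-zones/releases/tzdata" link is followed by a
+// five-character version string such as "2016d".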
+std::string
+remote_version()
+{
+ std::string version;
+ std::string str;
+ if (download_to_string("https://www.iana.org/time-zones", str))
+ {
+ CONSTDATA char db[] = "/time-zones/releases/tzdata";
+ CONSTDATA auto db_size = sizeof(db) - 1;
+ auto p = str.find(db, 0, db_size);
+ const int ver_str_len = 5;
+ if (p != std::string::npos && p + (db_size + ver_str_len) <= str.size())
+ version = str.substr(p + db_size, ver_str_len);
+ }
+ return version;
+}
+
+
+// TODO! Using system() creates a process and a console window.
+// This is useful for seeing what errors may occur, but it is slow and distracting.
+// Consider implementing this functionality more directly, such as
+// using _mkdir and CreateProcess etc.
+// For now the shell-based approach matches the Unix implementation and
+// suffices for the proof-of-concept / testing phase.
+// TODO! Use <filesystem> eventually.
+static
+bool
+remove_folder_and_subfolders(const std::string& folder)
+{
+# ifdef _WIN32
+# if USE_SHELL_API
+ // Delete the folder contents by deleting the folder.
+ std::string cmd = "rd /s /q \"";
+ cmd += folder;
+ cmd += '\"';
+ return std::system(cmd.c_str()) == EXIT_SUCCESS;
+# else // !USE_SHELL_API
+ // Create a buffer containing the path to delete. It must be terminated
+ // by two nuls, as SHFileOperation requires. Who designs these APIs...
+ std::vector<char> from;
+ from.assign(folder.begin(), folder.end());
+ from.push_back('\0');
+ from.push_back('\0');
+ SHFILEOPSTRUCT fo{}; // Zero initialize.
+ fo.wFunc = FO_DELETE;
+ fo.pFrom = from.data();
+ fo.fFlags = FOF_NO_UI;
+ int ret = SHFileOperation(&fo);
+ if (ret == 0 && !fo.fAnyOperationsAborted)
+ return true;
+ return false;
+# endif // !USE_SHELL_API
+# else // !_WIN32
+# if USE_SHELL_API
+ return std::system(("rm -R " + folder).c_str()) == EXIT_SUCCESS;
+# else // !USE_SHELL_API
+ struct dir_deleter {
+ dir_deleter() {}
+ void operator()(DIR* d) const
+ {
+ if (d != nullptr)
+ {
+ int result = closedir(d);
+ assert(result == 0);
+ }
+ }
+ };
+ using closedir_ptr = std::unique_ptr<DIR, dir_deleter>;
+
+ std::string filename;
+ struct stat statbuf;
+ std::size_t folder_len = folder.length();
+ struct dirent* p = nullptr;
+
+ closedir_ptr d(opendir(folder.c_str()));
+ bool r = d.get() != nullptr;
+ while (r && (p=readdir(d.get())) != nullptr)
+ {
+ if (strcmp(p->d_name, ".") == 0 || strcmp(p->d_name, "..") == 0)
+ continue;
+
+ // + 2 for path delimiter and nul terminator.
+ std::size_t buf_len = folder_len + strlen(p->d_name) + 2;
+ filename.resize(buf_len);
+ std::size_t path_len = static_cast<std::size_t>(
+ snprintf(&filename[0], buf_len, "%s/%s", folder.c_str(), p->d_name));
+ assert(path_len == buf_len - 1);
+ filename.resize(path_len);
+
+ if (stat(filename.c_str(), &statbuf) == 0)
+ r = S_ISDIR(statbuf.st_mode)
+ ? remove_folder_and_subfolders(filename)
+ : unlink(filename.c_str()) == 0;
+ }
+ d.reset();
+
+ if (r)
+ r = rmdir(folder.c_str()) == 0;
+
+ return r;
+# endif // !USE_SHELL_API
+# endif // !_WIN32
+}
+
+static
+bool
+make_directory(const std::string& folder)
+{
+# ifdef _WIN32
+# if USE_SHELL_API
+ // Re-create the folder.
+ std::string cmd = "mkdir \"";
+ cmd += folder;
+ cmd += '\"';
+ return std::system(cmd.c_str()) == EXIT_SUCCESS;
+# else // !USE_SHELL_API
+ return _mkdir(folder.c_str()) == 0;
+# endif // !USE_SHELL_API
+# else // !_WIN32
+# if USE_SHELL_API
+ return std::system(("mkdir -p " + folder).c_str()) == EXIT_SUCCESS;
+# else // !USE_SHELL_API
+ return mkdir(folder.c_str(), 0777) == 0;
+# endif // !USE_SHELL_API
+# endif // !_WIN32
+}
+
+static
+bool
+delete_file(const std::string& file)
+{
+# ifdef _WIN32
+# if USE_SHELL_API
+ std::string cmd = "del \"";
+ cmd += file;
+ cmd += '\"';
+ return std::system(cmd.c_str()) == 0;
+# else // !USE_SHELL_API
+ return _unlink(file.c_str()) == 0;
+# endif // !USE_SHELL_API
+# else // !_WIN32
+# if USE_SHELL_API
+ return std::system(("rm " + file).c_str()) == EXIT_SUCCESS;
+# else // !USE_SHELL_API
+ return unlink(file.c_str()) == 0;
+# endif // !USE_SHELL_API
+# endif // !_WIN32
+}
+
+# ifdef _WIN32
+
+static
+bool
+move_file(const std::string& from, const std::string& to)
+{
+# if USE_SHELL_API
+ std::string cmd = "move \"";
+ cmd += from;
+ cmd += "\" \"";
+ cmd += to;
+ cmd += '\"';
+ return std::system(cmd.c_str()) == EXIT_SUCCESS;
+# else // !USE_SHELL_API
+ return !!::MoveFile(from.c_str(), to.c_str());
+# endif // !USE_SHELL_API
+}
+
+// Usually something like "c:\Program Files".
+static
+std::string
+get_program_folder()
+{
+ return get_known_folder(FOLDERID_ProgramFiles);
+}
+
+// Note: the folder can, and usually does, contain spaces.
+static
+std::string
+get_unzip_program()
+{
+ std::string path;
+
+ // 7-Zip appears to record its install location in the registry.
+ // If that lookup fails, fall through and take a guess, though it will likely be wrong.
+ HKEY hKey = nullptr;
+ if (RegOpenKeyExA(HKEY_LOCAL_MACHINE, "SOFTWARE\\7-Zip", 0, KEY_READ, &hKey) == ERROR_SUCCESS)
+ {
+ char value_buffer[MAX_PATH + 1]; // fyi 260 at time of writing.
+ // In/out parameter. Documentation says the size is a count of bytes, not chars.
+ DWORD size = sizeof(value_buffer) - sizeof(value_buffer[0]);
+ DWORD tzi_type = REG_SZ;
+ // Testing shows the Path key value is "C:\Program Files\7-Zip\", i.e. always with a trailing \.
+ bool got_value = (RegQueryValueExA(hKey, "Path", nullptr, &tzi_type,
+ reinterpret_cast<LPBYTE>(value_buffer), &size) == ERROR_SUCCESS);
+ RegCloseKey(hKey); // Close now in case of a throw later.
+ if (got_value)
+ {
+ // RegQueryValueExA does not guarantee null termination.
+ value_buffer[size / sizeof(value_buffer[0])] = '\0';
+ path = value_buffer;
+ if (!path.empty())
+ {
+ path += "7z.exe";
+ return path;
+ }
+ }
+ }
+ path += get_program_folder();
+ path += folder_delimiter;
+ path += "7-Zip\\7z.exe";
+ return path;
+}
+
+# if !USE_SHELL_API
+static
+int
+run_program(const std::string& command)
+{
+ STARTUPINFO si{};
+ si.cb = sizeof(si);
+ PROCESS_INFORMATION pi{};
+
+ // CreateProcess may modify the command-line buffer, so pass a mutable copy. Ugh.
+ std::string mutable_command(command);
+ if (CreateProcess(nullptr, &mutable_command[0],
+ nullptr, nullptr, FALSE, CREATE_NO_WINDOW, nullptr, nullptr, &si, &pi))
+ {
+ WaitForSingleObject(pi.hProcess, INFINITE);
+ DWORD exit_code;
+ bool got_exit_code = !!GetExitCodeProcess(pi.hProcess, &exit_code);
+ CloseHandle(pi.hProcess);
+ CloseHandle(pi.hThread);
+ // Guard against STILL_ACTIVE: WaitForSingleObject should have waited
+ // for the process to exit, but if it ever returned early the exit code
+ // would read as STILL_ACTIVE rather than a real status, so treat that
+ // as a failure.
+ if (got_exit_code && exit_code != STILL_ACTIVE)
+ return static_cast<int>(exit_code);
+ }
+ return EXIT_FAILURE;
+}
+# endif // !USE_SHELL_API
+
+static
+std::string
+get_download_tar_file(const std::string& version)
+{
+ auto file = get_install();
+ file += folder_delimiter;
+ file += "tzdata";
+ file += version;
+ file += ".tar";
+ return file;
+}
+
+static
+bool
+extract_gz_file(const std::string& version, const std::string& gz_file,
+ const std::string& dest_folder)
+{
+ auto unzip_prog = get_unzip_program();
+ bool unzip_result = false;
+ // Use the unzip program to extract the tar file from the archive.
+
+ // Aim to create a string like:
+ // "C:\Program Files\7-Zip\7z.exe" x "C:\Users\SomeUser\Downloads\tzdata2016d.tar.gz"
+ // -o"C:\Users\SomeUser\Downloads\tzdata"
+ std::string cmd;
+ cmd = '\"';
+ cmd += unzip_prog;
+ cmd += "\" x \"";
+ cmd += gz_file;
+ cmd += "\" -o\"";
+ cmd += dest_folder;
+ cmd += '\"';
+
+# if USE_SHELL_API
+ // When shelling out with std::system(), extra quotes are required around the
+ // whole command. It's weird, but it seems necessary; see:
+ // http://stackoverflow.com/q/27975969/576911
+
+ cmd = "\"" + cmd + "\"";
+ if (std::system(cmd.c_str()) == EXIT_SUCCESS)
+ unzip_result = true;
+# else // !USE_SHELL_API
+ if (run_program(cmd) == EXIT_SUCCESS)
+ unzip_result = true;
+# endif // !USE_SHELL_API
+ if (unzip_result)
+ delete_file(gz_file);
+
+ // Use the unzip program to extract the data from the tar file that was
+ // just extracted from the archive.
+ auto tar_file = get_download_tar_file(version);
+ cmd = '\"';
+ cmd += unzip_prog;
+ cmd += "\" x \"";
+ cmd += tar_file;
+ cmd += "\" -o\"";
+ cmd += get_install();
+ cmd += '\"';
+# if USE_SHELL_API
+ cmd = "\"" + cmd + "\"";
+ if (std::system(cmd.c_str()) == EXIT_SUCCESS)
+ unzip_result = true;
+# else // !USE_SHELL_API
+ if (run_program(cmd) == EXIT_SUCCESS)
+ unzip_result = true;
+# endif // !USE_SHELL_API
+
+ if (unzip_result)
+ delete_file(tar_file);
+
+ return unzip_result;
+}
+
+static
+std::string
+get_download_mapping_file(const std::string& version)
+{
+ auto file = get_install() + version + "windowsZones.xml";
+ return file;
+}
+
+# else // !_WIN32
+
+# if !USE_SHELL_API
+static
+int
+run_program(const char* prog, const char*const args[])
+{
+ pid_t pid = fork();
+ if (pid == -1) // fork failed; no child process was created.
+ return EXIT_FAILURE;
+
+ if (pid != 0)
+ {
+ // We are in the parent. Child started. Wait for it.
+ pid_t ret;
+ int status;
+ while ((ret = waitpid(pid, &status, 0)) == -1)
+ {
+ if (errno != EINTR)
+ break;
+ }
+ if (ret != -1)
+ {
+ if (WIFEXITED(status))
+ return WEXITSTATUS(status);
+ }
+ printf("Child issues!\n");
+
+ return EXIT_FAILURE; // Not sure what status of child is.
+ }
+ else // We are in the child process. Start the program the parent wants to run.
+ {
+
+ if (execv(prog, const_cast<char**>(args)) == -1) // Returns only on error.
+ {
+ perror("execv failed");
+ _Exit(127);
+ }
+ printf("unreachable\n");
+ }
+ printf("unreachable\n");
+ // Unreachable.
+ assert(false);
+ exit(EXIT_FAILURE);
+ return EXIT_FAILURE;
+}
+# endif // !USE_SHELL_API
+
+static
+bool
+extract_gz_file(const std::string&, const std::string& gz_file, const std::string&)
+{
+# if USE_SHELL_API
+ bool unzipped = std::system(("tar -xzf " + gz_file + " -C " + get_install()).c_str()) == EXIT_SUCCESS;
+# else // !USE_SHELL_API
+ const char prog[] = {"/usr/bin/tar"};
+ const char*const args[] =
+ {
+ prog, "-xzf", gz_file.c_str(), "-C", get_install().c_str(), nullptr
+ };
+ bool unzipped = (run_program(prog, args) == EXIT_SUCCESS);
+# endif // !USE_SHELL_API
+ if (unzipped)
+ {
+ delete_file(gz_file);
+ return true;
+ }
+ return false;
+}
+
+# endif // !_WIN32
+
+bool
+remote_download(const std::string& version, char* error_buffer)
+{
+ assert(!version.empty());
+
+# ifdef _WIN32
+ // The download folder should always be available on Windows.
+# else // !_WIN32
+ // Create the download folder if it does not exist on UNIX systems.
+ auto download_folder = get_install();
+ if (!file_exists(download_folder))
+ {
+ if (!make_directory(download_folder))
+ return false;
+ }
+# endif // _WIN32
+
+ auto url = "https://data.iana.org/time-zones/releases/tzdata" + version +
+ ".tar.gz";
+ bool result = download_to_file(url, get_download_gz_file(version),
+ download_file_options::binary, error_buffer);
+# ifdef _WIN32
+ if (result)
+ {
+ auto mapping_file = get_download_mapping_file(version);
+ result = download_to_file(
+ "https://raw.githubusercontent.com/unicode-org/cldr/master/"
+ "common/supplemental/windowsZones.xml",
+ mapping_file, download_file_options::text, error_buffer);
+ }
+# endif // _WIN32
+ return result;
+}
+
+bool
+remote_install(const std::string& version)
+{
+ auto success = false;
+ assert(!version.empty());
+
+ std::string install = get_install();
+ auto gz_file = get_download_gz_file(version);
+ if (file_exists(gz_file))
+ {
+ if (file_exists(install))
+ remove_folder_and_subfolders(install);
+ if (make_directory(install))
+ {
+ if (extract_gz_file(version, gz_file, install))
+ success = true;
+# ifdef _WIN32
+ auto mapping_file_source = get_download_mapping_file(version);
+ auto mapping_file_dest = get_install();
+ mapping_file_dest += folder_delimiter;
+ mapping_file_dest += "windowsZones.xml";
+ if (!move_file(mapping_file_source, mapping_file_dest))
+ success = false;
+# endif // _WIN32
+ }
+ }
+ return success;
+}
+
+#endif // HAS_REMOTE_API
+
+static
+std::string
+get_version(const std::string& path)
+{
+ std::string version;
+ std::ifstream infile(path + "version");
+ if (infile.is_open())
+ {
+ infile >> version;
+ if (!infile.fail())
+ return version;
+ }
+ else
+ {
+ infile.open(path + "NEWS");
+ while (infile)
+ {
+ infile >> version;
+ if (version == "Release")
+ {
+ infile >> version;
+ return version;
+ }
+ }
+ }
+ throw std::runtime_error("Unable to get Timezone database version from " + path);
+}
+
+static
+std::unique_ptr<tzdb>
+init_tzdb()
+{
+ using namespace date;
+ const std::string install = get_install();
+ const std::string path = install + folder_delimiter;
+ std::string line;
+ bool continue_zone = false;
+ std::unique_ptr<tzdb> db(new tzdb);
+
+#if AUTO_DOWNLOAD
+ if (!file_exists(install))
+ {
+ auto rv = remote_version();
+ if (!rv.empty() && remote_download(rv))
+ {
+ if (!remote_install(rv))
+ {
+ std::string msg = "Timezone database version \"";
+ msg += rv;
+ msg += "\" did not install correctly to \"";
+ msg += install;
+ msg += "\"";
+ throw std::runtime_error(msg);
+ }
+ }
+ if (!file_exists(install))
+ {
+ std::string msg = "Timezone database not found at \"";
+ msg += install;
+ msg += "\"";
+ throw std::runtime_error(msg);
+ }
+ db->version = get_version(path);
+ }
+ else
+ {
+ db->version = get_version(path);
+ auto rv = remote_version();
+ if (!rv.empty() && db->version != rv)
+ {
+ if (remote_download(rv))
+ {
+ remote_install(rv);
+ db->version = get_version(path);
+ }
+ }
+ }
+#else // !AUTO_DOWNLOAD
+ if (!file_exists(install))
+ {
+ std::string msg = "Timezone database not found at \"";
+ msg += install;
+ msg += "\"";
+ throw std::runtime_error(msg);
+ }
+ db->version = get_version(path);
+#endif // !AUTO_DOWNLOAD
+
+ CONSTDATA char*const files[] =
+ {
+ "africa", "antarctica", "asia", "australasia", "backward", "etcetera", "europe",
+ "pacificnew", "northamerica", "southamerica", "systemv", "leapseconds"
+ };
+
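+ // Parse each tzdata source file line by line: "Rule", "Link", "Leap"
+ // and "Zone" lines begin new records, while tab-indented lines continue
+ // the most recently opened Zone.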
+ for (const auto& filename : files)
+ {
+ std::ifstream infile(path + filename);
+ while (infile)
+ {
+ std::getline(infile, line);
+ if (!line.empty() && line[0] != '#')
+ {
+ std::istringstream in(line);
+ std::string word;
+ in >> word;
+ if (word == "Rule")
+ {
+ db->rules.push_back(Rule(line));
+ continue_zone = false;
+ }
+ else if (word == "Link")
+ {
+ db->links.push_back(time_zone_link(line));
+ continue_zone = false;
+ }
+ else if (word == "Leap")
+ {
+ db->leap_seconds.push_back(leap_second(line, detail::undocumented{}));
+ continue_zone = false;
+ }
+ else if (word == "Zone")
+ {
+ db->zones.push_back(time_zone(line, detail::undocumented{}));
+ continue_zone = true;
+ }
+ else if (line[0] == '\t' && continue_zone)
+ {
+ db->zones.back().add(line);
+ }
+ else
+ {
+ std::cerr << line << '\n';
+ }
+ }
+ }
+ }
+ std::sort(db->rules.begin(), db->rules.end());
+ Rule::split_overlaps(db->rules);
+ std::sort(db->zones.begin(), db->zones.end());
+ db->zones.shrink_to_fit();
+ std::sort(db->links.begin(), db->links.end());
+ db->links.shrink_to_fit();
+ std::sort(db->leap_seconds.begin(), db->leap_seconds.end());
+ db->leap_seconds.shrink_to_fit();
+
+#ifdef _WIN32
+ std::string mapping_file = get_install() + folder_delimiter + "windowsZones.xml";
+ db->mappings = load_timezone_mappings_from_xml_file(mapping_file);
+ sort_zone_mappings(db->mappings);
+#endif // _WIN32
+
+ return db;
+}
+
+const tzdb&
+reload_tzdb()
+{
+#if AUTO_DOWNLOAD
+ auto const& v = get_tzdb_list().front().version;
+ if (!v.empty() && v == remote_version())
+ return get_tzdb_list().front();
+#endif // AUTO_DOWNLOAD
+ tzdb_list::undocumented_helper::push_front(get_tzdb_list(), init_tzdb().release());
+ return get_tzdb_list().front();
+}
+
+#endif // !USE_OS_TZDB
+
+const tzdb&
+get_tzdb()
+{
+ return get_tzdb_list().front();
+}
+
+const time_zone*
+#if HAS_STRING_VIEW
+tzdb::locate_zone(std::string_view tz_name) const
+#else
+tzdb::locate_zone(const std::string& tz_name) const
+#endif
+{
+ auto zi = std::lower_bound(zones.begin(), zones.end(), tz_name,
+#if HAS_STRING_VIEW
+ [](const time_zone& z, const std::string_view& nm)
+#else
+ [](const time_zone& z, const std::string& nm)
+#endif
+ {
+ return z.name() < nm;
+ });
+ if (zi == zones.end() || zi->name() != tz_name)
+ {
+#if !USE_OS_TZDB
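+ // tz_name is not a canonical zone; check the links (aliases) and, if
+ // one matches, resolve it to its target zone.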
+ auto li = std::lower_bound(links.begin(), links.end(), tz_name,
+#if HAS_STRING_VIEW
+ [](const time_zone_link& z, const std::string_view& nm)
+#else
+ [](const time_zone_link& z, const std::string& nm)
+#endif
+ {
+ return z.name() < nm;
+ });
+ if (li != links.end() && li->name() == tz_name)
+ {
+ zi = std::lower_bound(zones.begin(), zones.end(), li->target(),
+ [](const time_zone& z, const std::string& nm)
+ {
+ return z.name() < nm;
+ });
+ if (zi != zones.end() && zi->name() == li->target())
+ return &*zi;
+ }
+#endif // !USE_OS_TZDB
+ throw std::runtime_error(std::string(tz_name) + " not found in timezone database");
+ }
+ return &*zi;
+}
+
+const time_zone*
+#if HAS_STRING_VIEW
+locate_zone(std::string_view tz_name)
+#else
+locate_zone(const std::string& tz_name)
+#endif
+{
+ return get_tzdb().locate_zone(tz_name);
+}
+
+#if USE_OS_TZDB
+
+std::ostream&
+operator<<(std::ostream& os, const tzdb& db)
+{
+ os << "Version: " << db.version << "\n\n";
+ for (const auto& x : db.zones)
+ os << x << '\n';
+#if !MISSING_LEAP_SECONDS
+ os << '\n';
+ for (const auto& x : db.leap_seconds)
+ os << x << '\n';
+#endif // !MISSING_LEAP_SECONDS
+ return os;
+}
+
+#else // !USE_OS_TZDB
+
+std::ostream&
+operator<<(std::ostream& os, const tzdb& db)
+{
+ os << "Version: " << db.version << '\n';
+ std::string title("--------------------------------------------"
+ "--------------------------------------------\n"
+ "Name ""Start Y ""End Y "
+ "Beginning ""Offset "
+ "Designator\n"
+ "--------------------------------------------"
+ "--------------------------------------------\n");
+ int count = 0;
+ for (const auto& x : db.rules)
+ {
+ if (count++ % 50 == 0)
+ os << title;
+ os << x << '\n';
+ }
+ os << '\n';
+ title = std::string("---------------------------------------------------------"
+ "--------------------------------------------------------\n"
+ "Name ""Offset "
+ "Rule ""Abrev ""Until\n"
+ "---------------------------------------------------------"
+ "--------------------------------------------------------\n");
+ count = 0;
+ for (const auto& x : db.zones)
+ {
+ if (count++ % 10 == 0)
+ os << title;
+ os << x << '\n';
+ }
+ os << '\n';
+ title = std::string("---------------------------------------------------------"
+ "--------------------------------------------------------\n"
+ "Alias ""To\n"
+ "---------------------------------------------------------"
+ "--------------------------------------------------------\n");
+ count = 0;
+ for (const auto& x : db.links)
+ {
+ if (count++ % 45 == 0)
+ os << title;
+ os << x << '\n';
+ }
+ os << '\n';
+ title = std::string("---------------------------------------------------------"
+ "--------------------------------------------------------\n"
+ "Leap second on\n"
+ "---------------------------------------------------------"
+ "--------------------------------------------------------\n");
+ os << title;
+ for (const auto& x : db.leap_seconds)
+ os << x << '\n';
+ return os;
+}
+
+#endif // !USE_OS_TZDB
+
+// -----------------------
+
+#ifdef _WIN32
+
+static
+std::string
+getTimeZoneKeyName()
+{
+ DYNAMIC_TIME_ZONE_INFORMATION dtzi{};
+ auto result = GetDynamicTimeZoneInformation(&dtzi);
+ if (result == TIME_ZONE_ID_INVALID)
+ throw std::runtime_error("current_zone(): GetDynamicTimeZoneInformation()"
+ " reported TIME_ZONE_ID_INVALID.");
+ auto wlen = wcslen(dtzi.TimeZoneKeyName);
+ char buf[128] = {};
+ assert(sizeof(buf) >= wlen+1);
+ wcstombs(buf, dtzi.TimeZoneKeyName, wlen);
+ if (strcmp(buf, "Coordinated Universal Time") == 0)
+ return "UTC";
+ return buf;
+}
+
+const time_zone*
+tzdb::current_zone() const
+{
+ std::string win_tzid = getTimeZoneKeyName();
+ std::string standard_tzid;
+ if (!native_to_standard_timezone_name(win_tzid, standard_tzid))
+ {
+ std::string msg;
+ msg = "current_zone() failed: A mapping from the Windows Time Zone id \"";
+ msg += win_tzid;
+ msg += "\" was not found in the time zone mapping database.";
+ throw std::runtime_error(msg);
+ }
+ return locate_zone(standard_tzid);
+}
+
+#else // !_WIN32
+
+#if HAS_STRING_VIEW
+
+static
+std::string_view
+extract_tz_name(char const* rp)
+{
+ using namespace std;
+ string_view result = rp;
+ CONSTDATA string_view zoneinfo = "zoneinfo";
+ size_t pos = result.rfind(zoneinfo);
+ if (pos == result.npos)
+ throw runtime_error(
+ "current_zone() failed to find \"zoneinfo\" in " + string(result));
+ pos = result.find('/', pos);
+ result.remove_prefix(pos + 1);
+ return result;
+}
+
+#else // !HAS_STRING_VIEW
+
+static
+std::string
+extract_tz_name(char const* rp)
+{
+ using namespace std;
+ string result = rp;
+ CONSTDATA char zoneinfo[] = "zoneinfo";
+ size_t pos = result.rfind(zoneinfo);
+ if (pos == result.npos)
+ throw runtime_error(
+ "current_zone() failed to find \"zoneinfo\" in " + result);
+ pos = result.find('/', pos);
+ result.erase(0, pos + 1);
+ return result;
+}
+
+#endif // HAS_STRING_VIEW
+
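+// On some systems realpath("/etc/localtime") resolves to the generic
+// "posixrules" file instead of a named zone; probe once and fall back to a
+// single readlink() in that case.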
+static
+bool
+sniff_realpath(const char* timezone)
+{
+ using namespace std;
+ char rp[PATH_MAX+1] = {};
+ if (realpath(timezone, rp) == nullptr)
+ throw system_error(errno, system_category(), "realpath() failed");
+ auto result = extract_tz_name(rp);
+ return result != "posixrules";
+}
+
+const time_zone*
+tzdb::current_zone() const
+{
+ // On some OSes a file called /etc/localtime may
+ // exist, and it may be either a real file
+ // containing time zone details or a symlink to such a file.
+ // On macOS and BSD Unix, if this file is a symlink it
+ // might resolve to a path like this:
+ // "/usr/share/zoneinfo/America/Los_Angeles"
+ // If it does, we try to determine the current
+ // timezone from the remainder of the path by removing the prefix
+ // and hoping the rest resolves to a valid timezone.
+ // It may not always work, and if it doesn't, an
+ // exception will be thrown by locate_zone.
+ // The path may also take a relative form:
+ // "../usr/share/zoneinfo/America/Los_Angeles".
+ {
+ struct stat sb;
+ CONSTDATA auto timezone = "/etc/localtime";
+ if (lstat(timezone, &sb) == 0 && S_ISLNK(sb.st_mode) && sb.st_size > 0)
+ {
+ using namespace std;
+ static const bool use_realpath = sniff_realpath(timezone);
+ char rp[PATH_MAX+1] = {};
+ if (use_realpath)
+ {
+ if (realpath(timezone, rp) == nullptr)
+ throw system_error(errno, system_category(), "realpath() failed");
+ }
+ else
+ {
+ if (readlink(timezone, rp, sizeof(rp)-1) <= 0)
+ throw system_error(errno, system_category(), "readlink() failed");
+ }
+ return locate_zone(extract_tz_name(rp));
+ }
+ }
+ // On embedded systems, e.g. buildroot with uclibc, the timezone is linked
+ // into /etc/TZ, which is a symlink to a path like this:
+ // "/usr/share/zoneinfo/uclibc/America/Los_Angeles"
+ // If it is, we try to determine the current
+ // timezone from the remainder of the path by removing the prefix
+ // and hoping the rest resolves to a valid timezone.
+ // It may not always work, and if it doesn't, an
+ // exception will be thrown by locate_zone.
+ // The path may also take a relative form:
+ // "../usr/share/zoneinfo/uclibc/America/Los_Angeles".
+ {
+ struct stat sb;
+ CONSTDATA auto timezone = "/etc/TZ";
+ if (lstat(timezone, &sb) == 0 && S_ISLNK(sb.st_mode) && sb.st_size > 0) {
+ using namespace std;
+ string result;
+ char rp[PATH_MAX+1] = {};
+ if (readlink(timezone, rp, sizeof(rp)-1) > 0)
+ result = string(rp);
+ else
+ throw system_error(errno, system_category(), "readlink() failed");
+
+ const size_t pos = result.find(get_tz_dir());
+ if (pos != result.npos)
+ result.erase(0, get_tz_dir().size() + 1 + pos);
+ return locate_zone(result);
+ }
+ }
+ {
+ // On some versions of some Linux distros (e.g. Ubuntu),
+ // the current timezone might be in the first line of
+ // the /etc/timezone file.
+ std::ifstream timezone_file("/etc/timezone");
+ if (timezone_file.is_open())
+ {
+ std::string result;
+ std::getline(timezone_file, result);
+ if (!result.empty())
+ return locate_zone(result);
+ }
+ // Fall through to try other means.
+ }
+ {
+ // On some versions of some BSD distros (e.g. FreeBSD),
+ // the current timezone might be in the first line of
+ // the /var/db/zoneinfo file.
+ std::ifstream timezone_file("/var/db/zoneinfo");
+ if (timezone_file.is_open())
+ {
+ std::string result;
+ std::getline(timezone_file, result);
+ if (!result.empty())
+ return locate_zone(result);
+ }
+ // Fall through to try other means.
+ }
+ {
+ // On some Apple platforms (e.g. iOS),
+ // it is not possible to use the file-based approach,
+ // so we switch to a system API, calling functions in the
+ // CoreFoundation framework.
+#if TARGET_OS_IPHONE
+ std::string result = date::iOSUtils::get_current_timezone();
+ if (!result.empty())
+ return locate_zone(result);
+#endif
+ // Fall through to try other means.
+ }
+ {
+ // On some versions of some Linux distros (e.g. Red Hat),
+ // the current timezone might be in the first line of
+ // the /etc/sysconfig/clock file as:
+ // ZONE="US/Eastern"
+ std::ifstream timezone_file("/etc/sysconfig/clock");
+ std::string result;
+ while (timezone_file)
+ {
+ std::getline(timezone_file, result);
+ auto p = result.find("ZONE=\"");
+ if (p != std::string::npos)
+ {
+ result.erase(0, p + 6); // Erase through ZONE=" (erase takes pos, count).
+ result.erase(result.rfind('"'));
+ return locate_zone(result);
+ }
+ }
+ // Fall through to try other means.
+ }
+ throw std::runtime_error("Could not get current timezone");
+}
+
+#endif // !_WIN32
+
+const time_zone*
+current_zone()
+{
+ return get_tzdb().current_zone();
+}
+
+} // namespace date
+} // namespace arrow_vendored
+
+#if defined(__GNUC__) && __GNUC__ < 5
+# pragma GCC diagnostic pop
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz.h
new file mode 100644
index 00000000000..23c6742143c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz.h
@@ -0,0 +1,2804 @@
+#ifndef TZ_H
+#define TZ_H
+
+// The MIT License (MIT)
+//
+// Copyright (c) 2015, 2016, 2017 Howard Hinnant
+// Copyright (c) 2017 Jiangang Zhuang
+// Copyright (c) 2017 Aaron Bishop
+// Copyright (c) 2017 Tomasz Kamiński
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// Our apologies. When the previous paragraph was written, lowercase had not yet
+// been invented (that would involve another several millennia of evolution).
+// We did not mean to shout.
+
+// Get more recent database at http://www.iana.org/time-zones
+
+// The notion of "current timezone" is something the operating system is expected to "just
+// know". How it knows this is system specific. It's often a value set by the user at OS
+// installation time and recorded by the OS somewhere. On Linux and Mac systems the current
+// timezone name is obtained by looking at the name or contents of a particular file on
+// disk. On Windows the current timezone name comes from the registry. In either method,
+// there is no guarantee that the "native" current timezone name obtained will match any
+// of the "Standard" names in this library's "database". On Linux, the names usually do
+// seem to match so mapping functions to map from native to "Standard" are typically not
+// required. On Windows, the names are never "Standard" so mapping is always required.
+// Technically any OS may use the mapping process but currently only Windows does use it.
+
+// NOTE(ARROW): If this is not set, then the library will attempt to
+// use libcurl to obtain a timezone database, and we do not yet have
+// curl in our build toolchain
+#ifndef _WIN32
+#define USE_OS_TZDB 1
+#endif
+
+#ifndef USE_OS_TZDB
+# define USE_OS_TZDB 0
+#endif
+
+#ifndef HAS_REMOTE_API
+# if USE_OS_TZDB == 0
+# ifdef _WIN32
+# define HAS_REMOTE_API 0
+# else
+# define HAS_REMOTE_API 1
+# endif
+# else // HAS_REMOTE_API makes no sense when using the OS timezone database
+# define HAS_REMOTE_API 0
+# endif
+#endif
+
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wconstant-logical-operand"
+#endif
+
+static_assert(!(USE_OS_TZDB && HAS_REMOTE_API),
+ "USE_OS_TZDB and HAS_REMOTE_API can not be used together");
+
+#ifdef __clang__
+# pragma clang diagnostic pop
+#endif
+
+#ifndef AUTO_DOWNLOAD
+# define AUTO_DOWNLOAD HAS_REMOTE_API
+#endif
+
+static_assert(HAS_REMOTE_API == 0 ? AUTO_DOWNLOAD == 0 : true,
+ "AUTO_DOWNLOAD can not be turned on without HAS_REMOTE_API");
+
+#ifndef USE_SHELL_API
+# define USE_SHELL_API 1
+#endif
+
+#if USE_OS_TZDB
+# ifdef _WIN32
+# error "USE_OS_TZDB can not be used on Windows"
+# endif
+# ifndef MISSING_LEAP_SECONDS
+# ifdef __APPLE__
+# define MISSING_LEAP_SECONDS 1
+# else
+# define MISSING_LEAP_SECONDS 0
+# endif
+# endif
+#else
+# define MISSING_LEAP_SECONDS 0
+#endif
+
+#ifndef HAS_DEDUCTION_GUIDES
+# if __cplusplus >= 201703
+# define HAS_DEDUCTION_GUIDES 1
+# else
+# define HAS_DEDUCTION_GUIDES 0
+# endif
+#endif // HAS_DEDUCTION_GUIDES
+
+#include "date.h"
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#include "tz_private.h"
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <istream>
+#include <locale>
+#include <memory>
+#include <mutex>
+#include <ostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#ifdef _WIN32
+# ifdef DATE_BUILD_DLL
+# define DATE_API __declspec(dllexport)
+# elif defined(DATE_USE_DLL)
+# define DATE_API __declspec(dllimport)
+# else
+# define DATE_API
+# endif
+#else
+# ifdef DATE_BUILD_DLL
+# define DATE_API __attribute__ ((visibility ("default")))
+# else
+# define DATE_API
+# endif
+#endif
+
+namespace arrow_vendored
+{
+namespace date
+{
+
+enum class choose {earliest, latest};
+
+namespace detail
+{
+ struct undocumented;
+
+ template<typename T>
+ struct nodeduct
+ {
+ using type = T;
+ };
+
+ template<typename T>
+ using nodeduct_t = typename nodeduct<T>::type;
+}
+
+struct sys_info
+{
+ sys_seconds begin;
+ sys_seconds end;
+ std::chrono::seconds offset;
+ std::chrono::minutes save;
+ std::string abbrev;
+};
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const sys_info& r)
+{
+ os << r.begin << '\n';
+ os << r.end << '\n';
+ os << make_time(r.offset) << "\n";
+ os << make_time(r.save) << "\n";
+ os << r.abbrev << '\n';
+ return os;
+}
+
+struct local_info
+{
+ enum {unique, nonexistent, ambiguous} result;
+ sys_info first;
+ sys_info second;
+};
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const local_info& r)
+{
+ if (r.result == local_info::nonexistent)
+ os << "nonexistent between\n";
+ else if (r.result == local_info::ambiguous)
+ os << "ambiguous between\n";
+ os << r.first;
+ if (r.result != local_info::unique)
+ {
+ os << "and\n";
+ os << r.second;
+ }
+ return os;
+}
+
+class nonexistent_local_time
+ : public std::runtime_error
+{
+public:
+ template <class Duration>
+ nonexistent_local_time(local_time<Duration> tp, const local_info& i);
+
+private:
+ template <class Duration>
+ static
+ std::string
+ make_msg(local_time<Duration> tp, const local_info& i);
+};
+
+template <class Duration>
+inline
+nonexistent_local_time::nonexistent_local_time(local_time<Duration> tp,
+ const local_info& i)
+ : std::runtime_error(make_msg(tp, i))
+{
+}
+
+template <class Duration>
+std::string
+nonexistent_local_time::make_msg(local_time<Duration> tp, const local_info& i)
+{
+ assert(i.result == local_info::nonexistent);
+ std::ostringstream os;
+ os << tp << " is in a gap between\n"
+ << local_seconds{i.first.end.time_since_epoch()} + i.first.offset << ' '
+ << i.first.abbrev << " and\n"
+ << local_seconds{i.second.begin.time_since_epoch()} + i.second.offset << ' '
+ << i.second.abbrev
+ << " which are both equivalent to\n"
+ << i.first.end << " UTC";
+ return os.str();
+}
+
+class ambiguous_local_time
+ : public std::runtime_error
+{
+public:
+ template <class Duration>
+ ambiguous_local_time(local_time<Duration> tp, const local_info& i);
+
+private:
+ template <class Duration>
+ static
+ std::string
+ make_msg(local_time<Duration> tp, const local_info& i);
+};
+
+template <class Duration>
+inline
+ambiguous_local_time::ambiguous_local_time(local_time<Duration> tp, const local_info& i)
+ : std::runtime_error(make_msg(tp, i))
+{
+}
+
+template <class Duration>
+std::string
+ambiguous_local_time::make_msg(local_time<Duration> tp, const local_info& i)
+{
+ assert(i.result == local_info::ambiguous);
+ std::ostringstream os;
+ os << tp << " is ambiguous. It could be\n"
+ << tp << ' ' << i.first.abbrev << " == "
+ << tp - i.first.offset << " UTC or\n"
+ << tp << ' ' << i.second.abbrev << " == "
+ << tp - i.second.offset << " UTC";
+ return os.str();
+}
+
+class time_zone;
+
+#if HAS_STRING_VIEW
+DATE_API const time_zone* locate_zone(std::string_view tz_name);
+#else
+DATE_API const time_zone* locate_zone(const std::string& tz_name);
+#endif
+
+DATE_API const time_zone* current_zone();
+
+template <class T>
+struct zoned_traits
+{
+};
+
+template <>
+struct zoned_traits<const time_zone*>
+{
+ static
+ const time_zone*
+ default_zone()
+ {
+ return date::locate_zone("Etc/UTC");
+ }
+
+#if HAS_STRING_VIEW
+
+ static
+ const time_zone*
+ locate_zone(std::string_view name)
+ {
+ return date::locate_zone(name);
+ }
+
+#else // !HAS_STRING_VIEW
+
+ static
+ const time_zone*
+ locate_zone(const std::string& name)
+ {
+ return date::locate_zone(name);
+ }
+
+ static
+ const time_zone*
+ locate_zone(const char* name)
+ {
+ return date::locate_zone(name);
+ }
+
+#endif // !HAS_STRING_VIEW
+};
+
+template <class Duration, class TimeZonePtr>
+class zoned_time;
+
+template <class Duration1, class Duration2, class TimeZonePtr>
+bool
+operator==(const zoned_time<Duration1, TimeZonePtr>& x,
+ const zoned_time<Duration2, TimeZonePtr>& y);
+
+template <class Duration, class TimeZonePtr = const time_zone*>
+class zoned_time
+{
+public:
+ using duration = typename std::common_type<Duration, std::chrono::seconds>::type;
+
+private:
+ TimeZonePtr zone_;
+ sys_time<duration> tp_;
+
+public:
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = decltype(zoned_traits<T>::default_zone())>
+#endif
+ zoned_time();
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = decltype(zoned_traits<T>::default_zone())>
+#endif
+ zoned_time(const sys_time<Duration>& st);
+ explicit zoned_time(TimeZonePtr z);
+
+#if HAS_STRING_VIEW
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string_view()))
+ >::value
+ >::type>
+ explicit zoned_time(std::string_view name);
+#else
+# if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string()))
+ >::value
+ >::type>
+# endif
+ explicit zoned_time(const std::string& name);
+#endif
+
+ template <class Duration2,
+ class = typename std::enable_if
+ <
+ std::is_convertible<sys_time<Duration2>,
+ sys_time<Duration>>::value
+ >::type>
+ zoned_time(const zoned_time<Duration2, TimeZonePtr>& zt) NOEXCEPT;
+
+ zoned_time(TimeZonePtr z, const sys_time<Duration>& st);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_convertible
+ <
+ decltype(std::declval<T&>()->to_sys(local_time<Duration>{})),
+ sys_time<duration>
+ >::value
+ >::type>
+#endif
+ zoned_time(TimeZonePtr z, const local_time<Duration>& tp);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_convertible
+ <
+ decltype(std::declval<T&>()->to_sys(local_time<Duration>{},
+ choose::earliest)),
+ sys_time<duration>
+ >::value
+ >::type>
+#endif
+ zoned_time(TimeZonePtr z, const local_time<Duration>& tp, choose c);
+
+ template <class Duration2, class TimeZonePtr2,
+ class = typename std::enable_if
+ <
+ std::is_convertible<sys_time<Duration2>,
+ sys_time<Duration>>::value
+ >::type>
+ zoned_time(TimeZonePtr z, const zoned_time<Duration2, TimeZonePtr2>& zt);
+
+ template <class Duration2, class TimeZonePtr2,
+ class = typename std::enable_if
+ <
+ std::is_convertible<sys_time<Duration2>,
+ sys_time<Duration>>::value
+ >::type>
+ zoned_time(TimeZonePtr z, const zoned_time<Duration2, TimeZonePtr2>& zt, choose);
+
+#if HAS_STRING_VIEW
+
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string_view())),
+ sys_time<Duration>
+ >::value
+ >::type>
+ zoned_time(std::string_view name, detail::nodeduct_t<const sys_time<Duration>&> st);
+
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string_view())),
+ local_time<Duration>
+ >::value
+ >::type>
+ zoned_time(std::string_view name, detail::nodeduct_t<const local_time<Duration>&> tp);
+
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string_view())),
+ local_time<Duration>,
+ choose
+ >::value
+ >::type>
+ zoned_time(std::string_view name, detail::nodeduct_t<const local_time<Duration>&> tp, choose c);
+
+ template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_convertible<sys_time<Duration2>,
+ sys_time<Duration>>::value &&
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string_view())),
+ zoned_time
+ >::value
+ >::type>
+ zoned_time(std::string_view name, const zoned_time<Duration2, TimeZonePtr2>& zt);
+
+ template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_convertible<sys_time<Duration2>,
+ sys_time<Duration>>::value &&
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string_view())),
+ zoned_time,
+ choose
+ >::value
+ >::type>
+ zoned_time(std::string_view name, const zoned_time<Duration2, TimeZonePtr2>& zt, choose);
+
+#else // !HAS_STRING_VIEW
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string())),
+ sys_time<Duration>
+ >::value
+ >::type>
+#endif
+ zoned_time(const std::string& name, const sys_time<Duration>& st);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string())),
+ sys_time<Duration>
+ >::value
+ >::type>
+#endif
+ zoned_time(const char* name, const sys_time<Duration>& st);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string())),
+ local_time<Duration>
+ >::value
+ >::type>
+#endif
+ zoned_time(const std::string& name, const local_time<Duration>& tp);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string())),
+ local_time<Duration>
+ >::value
+ >::type>
+#endif
+ zoned_time(const char* name, const local_time<Duration>& tp);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string())),
+ local_time<Duration>,
+ choose
+ >::value
+ >::type>
+#endif
+ zoned_time(const std::string& name, const local_time<Duration>& tp, choose c);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string())),
+ local_time<Duration>,
+ choose
+ >::value
+ >::type>
+#endif
+ zoned_time(const char* name, const local_time<Duration>& tp, choose c);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_convertible<sys_time<Duration2>,
+ sys_time<Duration>>::value &&
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string())),
+ zoned_time
+ >::value
+ >::type>
+#else
+ template <class Duration2, class TimeZonePtr2>
+#endif
+ zoned_time(const std::string& name, const zoned_time<Duration2, TimeZonePtr2>& zt);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_convertible<sys_time<Duration2>,
+ sys_time<Duration>>::value &&
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string())),
+ zoned_time
+ >::value
+ >::type>
+#else
+ template <class Duration2, class TimeZonePtr2>
+#endif
+ zoned_time(const char* name, const zoned_time<Duration2, TimeZonePtr2>& zt);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_convertible<sys_time<Duration2>,
+ sys_time<Duration>>::value &&
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string())),
+ zoned_time,
+ choose
+ >::value
+ >::type>
+#else
+ template <class Duration2, class TimeZonePtr2>
+#endif
+ zoned_time(const std::string& name, const zoned_time<Duration2, TimeZonePtr2>& zt,
+ choose);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+ template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+ class = typename std::enable_if
+ <
+ std::is_convertible<sys_time<Duration2>,
+ sys_time<Duration>>::value &&
+ std::is_constructible
+ <
+ zoned_time,
+ decltype(zoned_traits<T>::locate_zone(std::string())),
+ zoned_time,
+ choose
+ >::value
+ >::type>
+#else
+ template <class Duration2, class TimeZonePtr2>
+#endif
+ zoned_time(const char* name, const zoned_time<Duration2, TimeZonePtr2>& zt,
+ choose);
+
+#endif // !HAS_STRING_VIEW
+
+ zoned_time& operator=(const sys_time<Duration>& st);
+ zoned_time& operator=(const local_time<Duration>& ut);
+
+ explicit operator sys_time<duration>() const;
+ explicit operator local_time<duration>() const;
+
+ TimeZonePtr get_time_zone() const;
+ local_time<duration> get_local_time() const;
+ sys_time<duration> get_sys_time() const;
+ sys_info get_info() const;
+
+ template <class Duration1, class Duration2, class TimeZonePtr1>
+ friend
+ bool
+ operator==(const zoned_time<Duration1, TimeZonePtr1>& x,
+ const zoned_time<Duration2, TimeZonePtr1>& y);
+
+ template <class CharT, class Traits, class Duration1, class TimeZonePtr1>
+ friend
+ std::basic_ostream<CharT, Traits>&
+ operator<<(std::basic_ostream<CharT, Traits>& os,
+ const zoned_time<Duration1, TimeZonePtr1>& t);
+
+private:
+ template <class D, class T> friend class zoned_time;
+};
+
+using zoned_seconds = zoned_time<std::chrono::seconds>;
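+
+// A minimal usage sketch (assuming a tzdb with "Australia/Sydney" is available
+// at runtime; the zone name is illustrative only):
+//   zoned_time<std::chrono::seconds> zt{"Australia/Sydney",
+//       date::floor<std::chrono::seconds>(std::chrono::system_clock::now())};
+//   auto lt = zt.get_local_time();   // same instant, expressed in Sydney local time
+//   auto st = zt.get_sys_time();     // the original UTC-based sys_time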
+
+#if HAS_DEDUCTION_GUIDES
+
+namespace detail
+{
+ template<typename TimeZonePtrOrName>
+ using time_zone_representation =
+ std::conditional_t
+ <
+ std::is_convertible<TimeZonePtrOrName, std::string_view>::value,
+ time_zone const*,
+ std::remove_cv_t<std::remove_reference_t<TimeZonePtrOrName>>
+ >;
+}
+
+zoned_time()
+ -> zoned_time<std::chrono::seconds>;
+
+template <class Duration>
+zoned_time(sys_time<Duration>)
+ -> zoned_time<std::common_type_t<Duration, std::chrono::seconds>>;
+
+template <class TimeZonePtrOrName>
+zoned_time(TimeZonePtrOrName&&)
+ -> zoned_time<std::chrono::seconds, detail::time_zone_representation<TimeZonePtrOrName>>;
+
+template <class TimeZonePtrOrName, class Duration>
+zoned_time(TimeZonePtrOrName&&, sys_time<Duration>)
+ -> zoned_time<std::common_type_t<Duration, std::chrono::seconds>, detail::time_zone_representation<TimeZonePtrOrName>>;
+
+template <class TimeZonePtrOrName, class Duration>
+zoned_time(TimeZonePtrOrName&&, local_time<Duration>, choose = choose::earliest)
+ -> zoned_time<std::common_type_t<Duration, std::chrono::seconds>, detail::time_zone_representation<TimeZonePtrOrName>>;
+
+template <class Duration, class TimeZonePtrOrName, class TimeZonePtr2>
+zoned_time(TimeZonePtrOrName&&, zoned_time<Duration, TimeZonePtr2>, choose = choose::earliest)
+ -> zoned_time<std::common_type_t<Duration, std::chrono::seconds>, detail::time_zone_representation<TimeZonePtrOrName>>;
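+
+// With these deduction guides (C++17), the template arguments can be inferred,
+// e.g. (a sketch; the zone name and date are illustrative):
+//   zoned_time zt{"Etc/UTC", sys_days{2021_y/January/1}};
+//   // deduces zoned_time<std::chrono::seconds, const time_zone*>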
+
+#endif // HAS_DEDUCTION_GUIDES
+
+template <class Duration1, class Duration2, class TimeZonePtr>
+inline
+bool
+operator==(const zoned_time<Duration1, TimeZonePtr>& x,
+ const zoned_time<Duration2, TimeZonePtr>& y)
+{
+ return x.zone_ == y.zone_ && x.tp_ == y.tp_;
+}
+
+template <class Duration1, class Duration2, class TimeZonePtr>
+inline
+bool
+operator!=(const zoned_time<Duration1, TimeZonePtr>& x,
+ const zoned_time<Duration2, TimeZonePtr>& y)
+{
+ return !(x == y);
+}
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+
+namespace detail
+{
+# if USE_OS_TZDB
+ struct transition;
+ struct expanded_ttinfo;
+# else // !USE_OS_TZDB
+ struct zonelet;
+ class Rule;
+# endif // !USE_OS_TZDB
+}
+
+#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900)
+
+class time_zone
+{
+private:
+ std::string name_;
+#if USE_OS_TZDB
+ std::vector<detail::transition> transitions_;
+ std::vector<detail::expanded_ttinfo> ttinfos_;
+#else // !USE_OS_TZDB
+ std::vector<detail::zonelet> zonelets_;
+#endif // !USE_OS_TZDB
+ std::unique_ptr<std::once_flag> adjusted_;
+
+public:
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+ time_zone(time_zone&&) = default;
+ time_zone& operator=(time_zone&&) = default;
+#else // defined(_MSC_VER) && (_MSC_VER < 1900)
+ time_zone(time_zone&& src);
+ time_zone& operator=(time_zone&& src);
+#endif // defined(_MSC_VER) && (_MSC_VER < 1900)
+
+ DATE_API explicit time_zone(const std::string& s, detail::undocumented);
+
+ const std::string& name() const NOEXCEPT;
+
+ template <class Duration> sys_info get_info(sys_time<Duration> st) const;
+ template <class Duration> local_info get_info(local_time<Duration> tp) const;
+
+ template <class Duration>
+ sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+ to_sys(local_time<Duration> tp) const;
+
+ template <class Duration>
+ sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+ to_sys(local_time<Duration> tp, choose z) const;
+
+ template <class Duration>
+ local_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+ to_local(sys_time<Duration> tp) const;
+
+ friend bool operator==(const time_zone& x, const time_zone& y) NOEXCEPT;
+ friend bool operator< (const time_zone& x, const time_zone& y) NOEXCEPT;
+ friend DATE_API std::ostream& operator<<(std::ostream& os, const time_zone& z);
+
+#if !USE_OS_TZDB
+ DATE_API void add(const std::string& s);
+#endif // !USE_OS_TZDB
+
+private:
+ DATE_API sys_info get_info_impl(sys_seconds tp) const;
+ DATE_API local_info get_info_impl(local_seconds tp) const;
+
+ template <class Duration>
+ sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+ to_sys_impl(local_time<Duration> tp, choose z, std::false_type) const;
+ template <class Duration>
+ sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+ to_sys_impl(local_time<Duration> tp, choose, std::true_type) const;
+
+#if USE_OS_TZDB
+ DATE_API void init() const;
+ DATE_API void init_impl();
+ DATE_API sys_info
+ load_sys_info(std::vector<detail::transition>::const_iterator i) const;
+
+ template <class TimeType>
+ DATE_API void
+ load_data(std::istream& inf, std::int32_t tzh_leapcnt, std::int32_t tzh_timecnt,
+ std::int32_t tzh_typecnt, std::int32_t tzh_charcnt);
+#else // !USE_OS_TZDB
+ DATE_API sys_info get_info_impl(sys_seconds tp, int timezone) const;
+ DATE_API void adjust_infos(const std::vector<detail::Rule>& rules);
+ DATE_API void parse_info(std::istream& in);
+#endif // !USE_OS_TZDB
+};
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+
+inline
+time_zone::time_zone(time_zone&& src)
+ : name_(std::move(src.name_))
+ , zonelets_(std::move(src.zonelets_))
+ , adjusted_(std::move(src.adjusted_))
+ {}
+
+inline
+time_zone&
+time_zone::operator=(time_zone&& src)
+{
+ name_ = std::move(src.name_);
+ zonelets_ = std::move(src.zonelets_);
+ adjusted_ = std::move(src.adjusted_);
+ return *this;
+}
+
+#endif // defined(_MSC_VER) && (_MSC_VER < 1900)
+
+inline
+const std::string&
+time_zone::name() const NOEXCEPT
+{
+ return name_;
+}
+
+template <class Duration>
+inline
+sys_info
+time_zone::get_info(sys_time<Duration> st) const
+{
+ return get_info_impl(date::floor<std::chrono::seconds>(st));
+}
+
+template <class Duration>
+inline
+local_info
+time_zone::get_info(local_time<Duration> tp) const
+{
+ return get_info_impl(date::floor<std::chrono::seconds>(tp));
+}
+
+template <class Duration>
+inline
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+time_zone::to_sys(local_time<Duration> tp) const
+{
+ return to_sys_impl(tp, choose{}, std::true_type{});
+}
+
+template <class Duration>
+inline
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+time_zone::to_sys(local_time<Duration> tp, choose z) const
+{
+ return to_sys_impl(tp, z, std::false_type{});
+}
+
+template <class Duration>
+inline
+local_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+time_zone::to_local(sys_time<Duration> tp) const
+{
+ using LT = local_time<typename std::common_type<Duration, std::chrono::seconds>::type>;
+ auto i = get_info(tp);
+ return LT{(tp + i.offset).time_since_epoch()};
+}
+
+inline bool operator==(const time_zone& x, const time_zone& y) NOEXCEPT {return x.name_ == y.name_;}
+inline bool operator< (const time_zone& x, const time_zone& y) NOEXCEPT {return x.name_ < y.name_;}
+
+inline bool operator!=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(x == y);}
+inline bool operator> (const time_zone& x, const time_zone& y) NOEXCEPT {return y < x;}
+inline bool operator<=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(y < x);}
+inline bool operator>=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(x < y);}
+
+template <class Duration>
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+time_zone::to_sys_impl(local_time<Duration> tp, choose z, std::false_type) const
+{
+ auto i = get_info(tp);
+ if (i.result == local_info::nonexistent)
+ {
+ return i.first.end;
+ }
+ else if (i.result == local_info::ambiguous)
+ {
+ if (z == choose::latest)
+ return sys_time<Duration>{tp.time_since_epoch()} - i.second.offset;
+ }
+ return sys_time<Duration>{tp.time_since_epoch()} - i.first.offset;
+}
+
+template <class Duration>
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+time_zone::to_sys_impl(local_time<Duration> tp, choose, std::true_type) const
+{
+ auto i = get_info(tp);
+ if (i.result == local_info::nonexistent)
+ throw nonexistent_local_time(tp, i);
+ else if (i.result == local_info::ambiguous)
+ throw ambiguous_local_time(tp, i);
+ return sys_time<Duration>{tp.time_since_epoch()} - i.first.offset;
+}
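+
+// Usage sketch for the two overloads above (zone and date are illustrative):
+// converting a local time that falls in a DST gap throws nonexistent_local_time
+// unless a `choose` policy is supplied, e.g.
+//   auto tz = locate_zone("America/New_York");
+//   // 2016-03-13 02:30 does not exist there (clocks jump 02:00 -> 03:00):
+//   auto st = tz->to_sys(local_days{2016_y/March/13} + 2h + 30min,
+//                        choose::earliest);   // resolves instead of throwing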
+
+#if !USE_OS_TZDB
+
+class time_zone_link
+{
+private:
+ std::string name_;
+ std::string target_;
+public:
+ DATE_API explicit time_zone_link(const std::string& s);
+
+ const std::string& name() const {return name_;}
+ const std::string& target() const {return target_;}
+
+ friend bool operator==(const time_zone_link& x, const time_zone_link& y) {return x.name_ == y.name_;}
+ friend bool operator< (const time_zone_link& x, const time_zone_link& y) {return x.name_ < y.name_;}
+
+ friend DATE_API std::ostream& operator<<(std::ostream& os, const time_zone_link& x);
+};
+
+using link = time_zone_link;
+
+inline bool operator!=(const time_zone_link& x, const time_zone_link& y) {return !(x == y);}
+inline bool operator> (const time_zone_link& x, const time_zone_link& y) {return y < x;}
+inline bool operator<=(const time_zone_link& x, const time_zone_link& y) {return !(y < x);}
+inline bool operator>=(const time_zone_link& x, const time_zone_link& y) {return !(x < y);}
+
+#endif // !USE_OS_TZDB
+
+#if !MISSING_LEAP_SECONDS
+
+class leap_second
+{
+private:
+ sys_seconds date_;
+
+public:
+#if USE_OS_TZDB
+ DATE_API explicit leap_second(const sys_seconds& s, detail::undocumented);
+#else
+ DATE_API explicit leap_second(const std::string& s, detail::undocumented);
+#endif
+
+ sys_seconds date() const {return date_;}
+
+ friend bool operator==(const leap_second& x, const leap_second& y) {return x.date_ == y.date_;}
+ friend bool operator< (const leap_second& x, const leap_second& y) {return x.date_ < y.date_;}
+
+ template <class Duration>
+ friend
+ bool
+ operator==(const leap_second& x, const sys_time<Duration>& y)
+ {
+ return x.date_ == y;
+ }
+
+ template <class Duration>
+ friend
+ bool
+ operator< (const leap_second& x, const sys_time<Duration>& y)
+ {
+ return x.date_ < y;
+ }
+
+ template <class Duration>
+ friend
+ bool
+ operator< (const sys_time<Duration>& x, const leap_second& y)
+ {
+ return x < y.date_;
+ }
+
+ friend DATE_API std::ostream& operator<<(std::ostream& os, const leap_second& x);
+};
+
+inline bool operator!=(const leap_second& x, const leap_second& y) {return !(x == y);}
+inline bool operator> (const leap_second& x, const leap_second& y) {return y < x;}
+inline bool operator<=(const leap_second& x, const leap_second& y) {return !(y < x);}
+inline bool operator>=(const leap_second& x, const leap_second& y) {return !(x < y);}
+
+template <class Duration>
+inline
+bool
+operator==(const sys_time<Duration>& x, const leap_second& y)
+{
+ return y == x;
+}
+
+template <class Duration>
+inline
+bool
+operator!=(const leap_second& x, const sys_time<Duration>& y)
+{
+ return !(x == y);
+}
+
+template <class Duration>
+inline
+bool
+operator!=(const sys_time<Duration>& x, const leap_second& y)
+{
+ return !(x == y);
+}
+
+template <class Duration>
+inline
+bool
+operator> (const leap_second& x, const sys_time<Duration>& y)
+{
+ return y < x;
+}
+
+template <class Duration>
+inline
+bool
+operator> (const sys_time<Duration>& x, const leap_second& y)
+{
+ return y < x;
+}
+
+template <class Duration>
+inline
+bool
+operator<=(const leap_second& x, const sys_time<Duration>& y)
+{
+ return !(y < x);
+}
+
+template <class Duration>
+inline
+bool
+operator<=(const sys_time<Duration>& x, const leap_second& y)
+{
+ return !(y < x);
+}
+
+template <class Duration>
+inline
+bool
+operator>=(const leap_second& x, const sys_time<Duration>& y)
+{
+ return !(x < y);
+}
+
+template <class Duration>
+inline
+bool
+operator>=(const sys_time<Duration>& x, const leap_second& y)
+{
+ return !(x < y);
+}
+
+using leap = leap_second;
+
+#endif // !MISSING_LEAP_SECONDS
+
+#ifdef _WIN32
+
+namespace detail
+{
+
+// The time zone mapping is modelled after this data file:
+// http://unicode.org/repos/cldr/trunk/common/supplemental/windowsZones.xml
+// and the field names match the element names from the mapZone element
+// of windowsZones.xml.
+// The website displays this file here:
+// http://www.unicode.org/cldr/charts/latest/supplemental/zone_tzid.html
+// The html view is sorted before being displayed but is otherwise the same.
+// There is a mapping between the OS-centric view (in this case Windows)
+// that the html display uses and the generic view in the xml file.
+// That mapping is:
+// display column "windows" -> xml field "other".
+// display column "region"  -> xml field "territory".
+// display column "tzid"    -> xml field "type".
+// This structure uses the generic terminology because it could be
+// used to support other OS/native name conversions, not just Windows,
+// and using the same generic names helps retain the connection to the
+// origin of the data that we are using.
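+// For example (values copied from a typical windowsZones.xml mapZone element;
+// treat them as illustrative), the element
+//   <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
+// would populate timezone_mapping{"Pacific Standard Time", "001", "America/Los_Angeles"}.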
+struct timezone_mapping
+{
+ timezone_mapping(const char* other, const char* territory, const char* type)
+ : other(other), territory(territory), type(type)
+ {
+ }
+ timezone_mapping() = default;
+ std::string other;
+ std::string territory;
+ std::string type;
+};
+
+} // detail
+
+#endif // _WIN32
+
+struct tzdb
+{
+ std::string version = "unknown";
+ std::vector<time_zone> zones;
+#if !USE_OS_TZDB
+ std::vector<time_zone_link> links;
+#endif
+#if !MISSING_LEAP_SECONDS
+ std::vector<leap_second> leap_seconds;
+#endif
+#if !USE_OS_TZDB
+ std::vector<detail::Rule> rules;
+#endif
+#ifdef _WIN32
+ std::vector<detail::timezone_mapping> mappings;
+#endif
+ tzdb* next = nullptr;
+
+ tzdb() = default;
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+ tzdb(tzdb&&) = default;
+ tzdb& operator=(tzdb&&) = default;
+#else // defined(_MSC_VER) && (_MSC_VER < 1900)
+ tzdb(tzdb&& src)
+ : version(std::move(src.version))
+ , zones(std::move(src.zones))
+ , links(std::move(src.links))
+ , leap_seconds(std::move(src.leap_seconds))
+ , rules(std::move(src.rules))
+ , mappings(std::move(src.mappings))
+ {}
+
+ tzdb& operator=(tzdb&& src)
+ {
+ version = std::move(src.version);
+ zones = std::move(src.zones);
+ links = std::move(src.links);
+ leap_seconds = std::move(src.leap_seconds);
+ rules = std::move(src.rules);
+ mappings = std::move(src.mappings);
+ return *this;
+ }
+#endif // defined(_MSC_VER) && (_MSC_VER < 1900)
+
+#if HAS_STRING_VIEW
+ const time_zone* locate_zone(std::string_view tz_name) const;
+#else
+ const time_zone* locate_zone(const std::string& tz_name) const;
+#endif
+ const time_zone* current_zone() const;
+};
+
+using TZ_DB = tzdb;
+
+DATE_API std::ostream&
+operator<<(std::ostream& os, const tzdb& db);
+
+DATE_API const tzdb& get_tzdb();
+
+class tzdb_list
+{
+ std::atomic<tzdb*> head_{nullptr};
+
+public:
+ ~tzdb_list();
+ tzdb_list() = default;
+ tzdb_list(tzdb_list&& x) noexcept;
+
+ const tzdb& front() const noexcept {return *head_;}
+ tzdb& front() noexcept {return *head_;}
+
+ class const_iterator;
+
+ const_iterator begin() const noexcept;
+ const_iterator end() const noexcept;
+
+ const_iterator cbegin() const noexcept;
+ const_iterator cend() const noexcept;
+
+ const_iterator erase_after(const_iterator p) noexcept;
+
+ struct undocumented_helper;
+private:
+ void push_front(tzdb* tzdb) noexcept;
+};
+
+class tzdb_list::const_iterator
+{
+ tzdb* p_ = nullptr;
+
+ explicit const_iterator(tzdb* p) noexcept : p_{p} {}
+public:
+ const_iterator() = default;
+
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = tzdb;
+ using reference = const value_type&;
+ using pointer = const value_type*;
+ using difference_type = std::ptrdiff_t;
+
+ reference operator*() const noexcept {return *p_;}
+ pointer operator->() const noexcept {return p_;}
+
+ const_iterator& operator++() noexcept {p_ = p_->next; return *this;}
+ const_iterator operator++(int) noexcept {auto t = *this; ++(*this); return t;}
+
+ friend
+ bool
+ operator==(const const_iterator& x, const const_iterator& y) noexcept
+ {return x.p_ == y.p_;}
+
+ friend
+ bool
+ operator!=(const const_iterator& x, const const_iterator& y) noexcept
+ {return !(x == y);}
+
+ friend class tzdb_list;
+};
+
+inline
+tzdb_list::const_iterator
+tzdb_list::begin() const noexcept
+{
+ return const_iterator{head_};
+}
+
+inline
+tzdb_list::const_iterator
+tzdb_list::end() const noexcept
+{
+ return const_iterator{nullptr};
+}
+
+inline
+tzdb_list::const_iterator
+tzdb_list::cbegin() const noexcept
+{
+ return begin();
+}
+
+inline
+tzdb_list::const_iterator
+tzdb_list::cend() const noexcept
+{
+ return end();
+}
+
+DATE_API tzdb_list& get_tzdb_list();
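+
+// Iteration sketch: the list is newest-first, so front() is the most recently
+// (re)loaded database, e.g.
+//   for (auto const& db : get_tzdb_list())
+//       std::cout << db.version << '\n';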
+
+#if !USE_OS_TZDB
+
+DATE_API const tzdb& reload_tzdb();
+DATE_API void set_install(const std::string& install);
+
+#endif // !USE_OS_TZDB
+
+#if HAS_REMOTE_API
+
+DATE_API std::string remote_version();
+// if provided, error_buffer's size should be at least CURL_ERROR_SIZE
+DATE_API bool remote_download(const std::string& version, char* error_buffer = nullptr);
+DATE_API bool remote_install(const std::string& version);
+
+#endif
+
+// zoned_time
+
+namespace detail
+{
+
+template <class T>
+inline
+T*
+to_raw_pointer(T* p) noexcept
+{
+ return p;
+}
+
+template <class Pointer>
+inline
+auto
+to_raw_pointer(Pointer p) noexcept
+ -> decltype(detail::to_raw_pointer(p.operator->()))
+{
+ return detail::to_raw_pointer(p.operator->());
+}
+
+} // namespace detail
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time()
+ : zone_(zoned_traits<TimeZonePtr>::default_zone())
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const sys_time<Duration>& st)
+ : zone_(zoned_traits<TimeZonePtr>::default_zone())
+ , tp_(st)
+ {}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z)
+ : zone_(std::move(z))
+ {assert(detail::to_raw_pointer(zone_) != nullptr);}
+
+#if HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+template <class T, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name))
+ {}
+
+#else // !HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name))
+ {}
+
+#endif // !HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+template <class Duration2, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const zoned_time<Duration2, TimeZonePtr>& zt) NOEXCEPT
+ : zone_(zt.zone_)
+ , tp_(zt.tp_)
+ {}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z, const sys_time<Duration>& st)
+ : zone_(std::move(z))
+ , tp_(st)
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z, const local_time<Duration>& t)
+ : zone_(std::move(z))
+ , tp_(zone_->to_sys(t))
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z, const local_time<Duration>& t,
+ choose c)
+ : zone_(std::move(z))
+ , tp_(zone_->to_sys(t, c))
+ {}
+
+template <class Duration, class TimeZonePtr>
+template <class Duration2, class TimeZonePtr2, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z,
+ const zoned_time<Duration2, TimeZonePtr2>& zt)
+ : zone_(std::move(z))
+ , tp_(zt.tp_)
+ {}
+
+template <class Duration, class TimeZonePtr>
+template <class Duration2, class TimeZonePtr2, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z,
+ const zoned_time<Duration2, TimeZonePtr2>& zt, choose)
+ : zoned_time(std::move(z), zt)
+ {}
+
+#if HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+template <class T, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name,
+ detail::nodeduct_t<const sys_time<Duration>&> st)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), st)
+ {}
+
+template <class Duration, class TimeZonePtr>
+template <class T, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name,
+ detail::nodeduct_t<const local_time<Duration>&> t)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t)
+ {}
+
+template <class Duration, class TimeZonePtr>
+template <class T, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name,
+ detail::nodeduct_t<const local_time<Duration>&> t, choose c)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t, c)
+ {}
+
+template <class Duration, class TimeZonePtr>
+template <class Duration2, class TimeZonePtr2, class, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name,
+ const zoned_time<Duration2, TimeZonePtr2>& zt)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt)
+ {}
+
+template <class Duration, class TimeZonePtr>
+template <class Duration2, class TimeZonePtr2, class, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name,
+ const zoned_time<Duration2, TimeZonePtr2>& zt,
+ choose c)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt, c)
+ {}
+
+#else // !HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name,
+ const sys_time<Duration>& st)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), st)
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const char* name,
+ const sys_time<Duration>& st)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), st)
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name,
+ const local_time<Duration>& t)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t)
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const char* name,
+ const local_time<Duration>& t)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t)
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name,
+ const local_time<Duration>& t, choose c)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t, c)
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const char* name,
+ const local_time<Duration>& t, choose c)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t, c)
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class Duration2, class TimeZonePtr2, class, class>
+#else
+template <class Duration2, class TimeZonePtr2>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name,
+ const zoned_time<Duration2, TimeZonePtr2>& zt)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt)
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class Duration2, class TimeZonePtr2, class, class>
+#else
+template <class Duration2, class TimeZonePtr2>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const char* name,
+ const zoned_time<Duration2, TimeZonePtr2>& zt)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt)
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class Duration2, class TimeZonePtr2, class, class>
+#else
+template <class Duration2, class TimeZonePtr2>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name,
+ const zoned_time<Duration2, TimeZonePtr2>& zt,
+ choose c)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt, c)
+ {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class Duration2, class TimeZonePtr2, class, class>
+#else
+template <class Duration2, class TimeZonePtr2>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const char* name,
+ const zoned_time<Duration2, TimeZonePtr2>& zt,
+ choose c)
+ : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt, c)
+ {}
+
+#endif // HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>&
+zoned_time<Duration, TimeZonePtr>::operator=(const sys_time<Duration>& st)
+{
+ tp_ = st;
+ return *this;
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>&
+zoned_time<Duration, TimeZonePtr>::operator=(const local_time<Duration>& ut)
+{
+ tp_ = zone_->to_sys(ut);
+ return *this;
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>::operator local_time<typename zoned_time<Duration, TimeZonePtr>::duration>() const
+{
+ return get_local_time();
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>::operator sys_time<typename zoned_time<Duration, TimeZonePtr>::duration>() const
+{
+ return get_sys_time();
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+TimeZonePtr
+zoned_time<Duration, TimeZonePtr>::get_time_zone() const
+{
+ return zone_;
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+local_time<typename zoned_time<Duration, TimeZonePtr>::duration>
+zoned_time<Duration, TimeZonePtr>::get_local_time() const
+{
+ return zone_->to_local(tp_);
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+sys_time<typename zoned_time<Duration, TimeZonePtr>::duration>
+zoned_time<Duration, TimeZonePtr>::get_sys_time() const
+{
+ return tp_;
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+sys_info
+zoned_time<Duration, TimeZonePtr>::get_info() const
+{
+ return zone_->get_info(tp_);
+}
+
+// make_zoned_time
+
+inline
+zoned_time<std::chrono::seconds>
+make_zoned()
+{
+ return zoned_time<std::chrono::seconds>();
+}
+
+template <class Duration>
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+make_zoned(const sys_time<Duration>& tp)
+{
+ return zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type>(tp);
+}
+
+template <class TimeZonePtr
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+#if !defined(__INTEL_COMPILER) || (__INTEL_COMPILER > 1600)
+ , class = typename std::enable_if
+ <
+ std::is_class
+ <
+ typename std::decay
+ <
+ decltype(*detail::to_raw_pointer(std::declval<TimeZonePtr&>()))
+ >::type
+ >{}
+ >::type
+#endif
+#endif
+ >
+inline
+zoned_time<std::chrono::seconds, TimeZonePtr>
+make_zoned(TimeZonePtr z)
+{
+ return zoned_time<std::chrono::seconds, TimeZonePtr>(std::move(z));
+}
+
+inline
+zoned_seconds
+make_zoned(const std::string& name)
+{
+ return zoned_seconds(name);
+}
+
+template <class Duration, class TimeZonePtr
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+#if !defined(__INTEL_COMPILER) || (__INTEL_COMPILER > 1600)
+ , class = typename std::enable_if
+ <
+ std::is_class<typename std::decay<decltype(*std::declval<TimeZonePtr&>())>::type>{}
+ >::type
+#endif
+#endif
+ >
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type, TimeZonePtr>
+make_zoned(TimeZonePtr zone, const local_time<Duration>& tp)
+{
+ return zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type,
+ TimeZonePtr>(std::move(zone), tp);
+}
+
+template <class Duration, class TimeZonePtr
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+#if !defined(__INTEL_COMPILER) || (__INTEL_COMPILER > 1600)
+ , class = typename std::enable_if
+ <
+ std::is_class<typename std::decay<decltype(*std::declval<TimeZonePtr&>())>::type>{}
+ >::type
+#endif
+#endif
+ >
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type, TimeZonePtr>
+make_zoned(TimeZonePtr zone, const local_time<Duration>& tp, choose c)
+{
+ return zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type,
+ TimeZonePtr>(std::move(zone), tp, c);
+}
+
+template <class Duration>
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+make_zoned(const std::string& name, const local_time<Duration>& tp)
+{
+ return zoned_time<typename std::common_type<Duration,
+ std::chrono::seconds>::type>(name, tp);
+}
+
+template <class Duration>
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+make_zoned(const std::string& name, const local_time<Duration>& tp, choose c)
+{
+ return zoned_time<typename std::common_type<Duration,
+ std::chrono::seconds>::type>(name, tp, c);
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>
+make_zoned(TimeZonePtr zone, const zoned_time<Duration, TimeZonePtr>& zt)
+{
+ return zoned_time<Duration, TimeZonePtr>(std::move(zone), zt);
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>
+make_zoned(const std::string& name, const zoned_time<Duration, TimeZonePtr>& zt)
+{
+ return zoned_time<Duration, TimeZonePtr>(name, zt);
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>
+make_zoned(TimeZonePtr zone, const zoned_time<Duration, TimeZonePtr>& zt, choose c)
+{
+ return zoned_time<Duration, TimeZonePtr>(std::move(zone), zt, c);
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>
+make_zoned(const std::string& name, const zoned_time<Duration, TimeZonePtr>& zt, choose c)
+{
+ return zoned_time<Duration, TimeZonePtr>(name, zt, c);
+}
+
+template <class Duration, class TimeZonePtr
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+#if !defined(__INTEL_COMPILER) || (__INTEL_COMPILER > 1600)
+ , class = typename std::enable_if
+ <
+ std::is_class<typename std::decay<decltype(*std::declval<TimeZonePtr&>())>::type>{}
+ >::type
+#endif
+#endif
+ >
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type, TimeZonePtr>
+make_zoned(TimeZonePtr zone, const sys_time<Duration>& st)
+{
+ return zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type,
+ TimeZonePtr>(std::move(zone), st);
+}
+
+template <class Duration>
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+make_zoned(const std::string& name, const sys_time<Duration>& st)
+{
+ return zoned_time<typename std::common_type<Duration,
+ std::chrono::seconds>::type>(name, st);
+}
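+
+// Usage sketch for the make_zoned factories (zone name illustrative):
+//   auto zt = make_zoned("Europe/Paris",
+//       date::floor<std::chrono::seconds>(std::chrono::system_clock::now()));
+//   std::cout << zt << '\n';   // streamed via operator<< below as "%F %T %Z"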
+
+template <class CharT, class Traits, class Duration, class TimeZonePtr>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const zoned_time<Duration, TimeZonePtr>& tp)
+{
+ using duration = typename zoned_time<Duration, TimeZonePtr>::duration;
+ using LT = local_time<duration>;
+ auto const st = tp.get_sys_time();
+ auto const info = tp.get_time_zone()->get_info(st);
+ return to_stream(os, fmt, LT{(st+info.offset).time_since_epoch()},
+ &info.abbrev, &info.offset);
+}
+
+template <class CharT, class Traits, class Duration, class TimeZonePtr>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const zoned_time<Duration, TimeZonePtr>& t)
+{
+ const CharT fmt[] = {'%', 'F', ' ', '%', 'T', ' ', '%', 'Z', CharT{}};
+ return to_stream(os, fmt, t);
+}
+
+#if !MISSING_LEAP_SECONDS
+
+class utc_clock
+{
+public:
+ using duration = std::chrono::system_clock::duration;
+ using rep = duration::rep;
+ using period = duration::period;
+ using time_point = std::chrono::time_point<utc_clock>;
+ static CONSTDATA bool is_steady = false;
+
+ static time_point now();
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<std::chrono::system_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+ to_sys(const std::chrono::time_point<utc_clock, Duration>&);
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<utc_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+ from_sys(const std::chrono::time_point<std::chrono::system_clock, Duration>&);
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<local_t, typename std::common_type<Duration, std::chrono::seconds>::type>
+ to_local(const std::chrono::time_point<utc_clock, Duration>&);
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<utc_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+ from_local(const std::chrono::time_point<local_t, Duration>&);
+};
+
+template <class Duration>
+ using utc_time = std::chrono::time_point<utc_clock, Duration>;
+
+using utc_seconds = utc_time<std::chrono::seconds>;
+
+template <class Duration>
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+utc_clock::from_sys(const sys_time<Duration>& st)
+{
+ using std::chrono::seconds;
+ using CD = typename std::common_type<Duration, seconds>::type;
+ auto const& leaps = get_tzdb().leap_seconds;
+ auto const lt = std::upper_bound(leaps.begin(), leaps.end(), st);
+ return utc_time<CD>{st.time_since_epoch() + seconds{lt-leaps.begin()}};
+}
+
+// Return pair<is_leap_second, seconds{number_of_leap_seconds_since_1970}>.
+// first is true if ut is during a leap second insertion, otherwise false.
+// If ut is during a leap second insertion, that leap second is included in the count.
+template <class Duration>
+std::pair<bool, std::chrono::seconds>
+is_leap_second(date::utc_time<Duration> const& ut)
+{
+ using std::chrono::seconds;
+ using duration = typename std::common_type<Duration, seconds>::type;
+ auto const& leaps = get_tzdb().leap_seconds;
+ auto tp = sys_time<duration>{ut.time_since_epoch()};
+ auto const lt = std::upper_bound(leaps.begin(), leaps.end(), tp);
+ auto ds = seconds{lt-leaps.begin()};
+ tp -= ds;
+ auto ls = false;
+ if (lt > leaps.begin())
+ {
+ if (tp < lt[-1])
+ {
+ if (tp >= lt[-1].date() - seconds{1})
+ ls = true;
+ else
+ --ds;
+ }
+ }
+ return {ls, ds};
+}
+
+struct leap_second_info
+{
+ bool is_leap_second;
+ std::chrono::seconds elapsed;
+};
+
+template <class Duration>
+leap_second_info
+get_leap_second_info(date::utc_time<Duration> const& ut)
+{
+ auto p = is_leap_second(ut);
+ return {p.first, p.second};
+}
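+
+// Example (based on the published leap second history): the insertion at the
+// end of 2016-12-31 is the 27th since 1970, so for a utc_time ut taken during
+// that second, get_leap_second_info(ut) yields {true, seconds{27}}.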
+
+template <class Duration>
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+utc_clock::to_sys(const utc_time<Duration>& ut)
+{
+ using std::chrono::seconds;
+ using CD = typename std::common_type<Duration, seconds>::type;
+ auto ls = is_leap_second(ut);
+ auto tp = sys_time<CD>{ut.time_since_epoch() - ls.second};
+ if (ls.first)
+ tp = floor<seconds>(tp) + seconds{1} - CD{1};
+ return tp;
+}
+
+inline
+utc_clock::time_point
+utc_clock::now()
+{
+ return from_sys(std::chrono::system_clock::now());
+}
+
+template <class Duration>
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+utc_clock::from_local(const local_time<Duration>& st)
+{
+ return from_sys(sys_time<Duration>{st.time_since_epoch()});
+}
+
+template <class Duration>
+local_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+utc_clock::to_local(const utc_time<Duration>& ut)
+{
+ using CD = typename std::common_type<Duration, std::chrono::seconds>::type;
+ return local_time<CD>{to_sys(ut).time_since_epoch()};
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const utc_time<Duration>& t)
+{
+ using std::chrono::seconds;
+ using CT = typename std::common_type<Duration, seconds>::type;
+ const std::string abbrev("UTC");
+ CONSTDATA seconds offset{0};
+ auto ls = is_leap_second(t);
+ auto tp = sys_time<CT>{t.time_since_epoch() - ls.second};
+ auto const sd = floor<days>(tp);
+ year_month_day ymd = sd;
+ auto time = make_time(tp - sys_seconds{sd});
+ time.seconds(detail::undocumented{}) += seconds{ls.first};
+ fields<CT> fds{ymd, time};
+ return to_stream(os, fmt, fds, &abbrev, &offset);
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const utc_time<Duration>& t)
+{
+ const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}};
+ return to_stream(os, fmt, t);
+}
+
+template <class Duration, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+ utc_time<Duration>& tp, std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ using std::chrono::seconds;
+ using std::chrono::minutes;
+ using CT = typename std::common_type<Duration, seconds>::type;
+ minutes offset_local{};
+ auto offptr = offset ? offset : &offset_local;
+ fields<CT> fds{};
+ fds.has_tod = true;
+ from_stream(is, fmt, fds, abbrev, offptr);
+ if (!fds.ymd.ok())
+ is.setstate(std::ios::failbit);
+ if (!is.fail())
+ {
+ bool is_60_sec = fds.tod.seconds() == seconds{60};
+ if (is_60_sec)
+ fds.tod.seconds(detail::undocumented{}) -= seconds{1};
+ auto tmp = utc_clock::from_sys(sys_days(fds.ymd) - *offptr + fds.tod.to_duration());
+ if (is_60_sec)
+ tmp += seconds{1};
+ if (is_60_sec != is_leap_second(tmp).first || !fds.tod.in_conventional_range())
+ {
+ is.setstate(std::ios::failbit);
+ return is;
+ }
+ tp = std::chrono::time_point_cast<Duration>(tmp);
+ }
+ return is;
+}
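+
+// Parsing sketch: with fmt "%F %T", the stream "2016-12-31 23:59:60" parses
+// successfully because that instant is a real leap second; the same text with
+// a date that has no leap second sets failbit (the is_60_sec check above).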
+
+// tai_clock
+
+class tai_clock
+{
+public:
+ using duration = std::chrono::system_clock::duration;
+ using rep = duration::rep;
+ using period = duration::period;
+ using time_point = std::chrono::time_point<tai_clock>;
+ static const bool is_steady = false;
+
+ static time_point now();
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<utc_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+ to_utc(const std::chrono::time_point<tai_clock, Duration>&) NOEXCEPT;
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<tai_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+ from_utc(const std::chrono::time_point<utc_clock, Duration>&) NOEXCEPT;
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<local_t, typename std::common_type<Duration, date::days>::type>
+ to_local(const std::chrono::time_point<tai_clock, Duration>&) NOEXCEPT;
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<tai_clock, typename std::common_type<Duration, date::days>::type>
+ from_local(const std::chrono::time_point<local_t, Duration>&) NOEXCEPT;
+};
+
+template <class Duration>
+ using tai_time = std::chrono::time_point<tai_clock, Duration>;
+
+using tai_seconds = tai_time<std::chrono::seconds>;
+
+template <class Duration>
+inline
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+tai_clock::to_utc(const tai_time<Duration>& t) NOEXCEPT
+{
+ using std::chrono::seconds;
+ using CD = typename std::common_type<Duration, seconds>::type;
+ return utc_time<CD>{t.time_since_epoch()} -
+ (sys_days(year{1970}/January/1) - sys_days(year{1958}/January/1) + seconds{10});
+}
+
+template <class Duration>
+inline
+tai_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+tai_clock::from_utc(const utc_time<Duration>& t) NOEXCEPT
+{
+ using std::chrono::seconds;
+ using CD = typename std::common_type<Duration, seconds>::type;
+ return tai_time<CD>{t.time_since_epoch()} +
+ (sys_days(year{1970}/January/1) - sys_days(year{1958}/January/1) + seconds{10});
+}
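+
+// Worked offset (well-known constants, stated here for orientation): this
+// models TAI as 10 s ahead of UTC at 1970-01-01, so after the 27 leap seconds
+// through 2016, from_utc applied to 2017-01-01 00:00:00 UTC reads
+// 2017-01-01 00:00:37 TAI.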
+
+inline
+tai_clock::time_point
+tai_clock::now()
+{
+ return from_utc(utc_clock::now());
+}
+
+template <class Duration>
+inline
+local_time<typename std::common_type<Duration, date::days>::type>
+tai_clock::to_local(const tai_time<Duration>& t) NOEXCEPT
+{
+ using CD = typename std::common_type<Duration, date::days>::type;
+ return local_time<CD>{t.time_since_epoch()} -
+ (local_days(year{1970}/January/1) - local_days(year{1958}/January/1));
+}
+
+template <class Duration>
+inline
+tai_time<typename std::common_type<Duration, date::days>::type>
+tai_clock::from_local(const local_time<Duration>& t) NOEXCEPT
+{
+ using CD = typename std::common_type<Duration, date::days>::type;
+ return tai_time<CD>{t.time_since_epoch()} +
+ (local_days(year{1970}/January/1) - local_days(year{1958}/January/1));
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const tai_time<Duration>& t)
+{
+ const std::string abbrev("TAI");
+ CONSTDATA std::chrono::seconds offset{0};
+ return to_stream(os, fmt, tai_clock::to_local(t), &abbrev, &offset);
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const tai_time<Duration>& t)
+{
+ const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}};
+ return to_stream(os, fmt, t);
+}
+
+template <class Duration, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+ tai_time<Duration>& tp,
+ std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ local_time<Duration> lp;
+ from_stream(is, fmt, lp, abbrev, offset);
+ if (!is.fail())
+ tp = tai_clock::from_local(lp);
+ return is;
+}
+
+// gps_clock
+
+class gps_clock
+{
+public:
+ using duration = std::chrono::system_clock::duration;
+ using rep = duration::rep;
+ using period = duration::period;
+ using time_point = std::chrono::time_point<gps_clock>;
+ static const bool is_steady = false;
+
+ static time_point now();
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<utc_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+ to_utc(const std::chrono::time_point<gps_clock, Duration>&) NOEXCEPT;
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<gps_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+ from_utc(const std::chrono::time_point<utc_clock, Duration>&) NOEXCEPT;
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<local_t, typename std::common_type<Duration, date::days>::type>
+ to_local(const std::chrono::time_point<gps_clock, Duration>&) NOEXCEPT;
+
+ template<typename Duration>
+ static
+ std::chrono::time_point<gps_clock, typename std::common_type<Duration, date::days>::type>
+ from_local(const std::chrono::time_point<local_t, Duration>&) NOEXCEPT;
+};
+
+template <class Duration>
+ using gps_time = std::chrono::time_point<gps_clock, Duration>;
+
+using gps_seconds = gps_time<std::chrono::seconds>;
+
+template <class Duration>
+inline
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+gps_clock::to_utc(const gps_time<Duration>& t) NOEXCEPT
+{
+ using std::chrono::seconds;
+ using CD = typename std::common_type<Duration, seconds>::type;
+ return utc_time<CD>{t.time_since_epoch()} +
+ (sys_days(year{1980}/January/Sunday[1]) - sys_days(year{1970}/January/1) +
+ seconds{9});
+}
+
+template <class Duration>
+inline
+gps_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+gps_clock::from_utc(const utc_time<Duration>& t) NOEXCEPT
+{
+ using std::chrono::seconds;
+ using CD = typename std::common_type<Duration, seconds>::type;
+ return gps_time<CD>{t.time_since_epoch()} -
+ (sys_days(year{1980}/January/Sunday[1]) - sys_days(year{1970}/January/1) +
+ seconds{9});
+}
+
+inline
+gps_clock::time_point
+gps_clock::now()
+{
+ return from_utc(utc_clock::now());
+}
+
+template <class Duration>
+inline
+local_time<typename std::common_type<Duration, date::days>::type>
+gps_clock::to_local(const gps_time<Duration>& t) NOEXCEPT
+{
+ using CD = typename std::common_type<Duration, date::days>::type;
+ return local_time<CD>{t.time_since_epoch()} +
+ (local_days(year{1980}/January/Sunday[1]) - local_days(year{1970}/January/1));
+}
+
+template <class Duration>
+inline
+gps_time<typename std::common_type<Duration, date::days>::type>
+gps_clock::from_local(const local_time<Duration>& t) NOEXCEPT
+{
+ using CD = typename std::common_type<Duration, date::days>::type;
+ return gps_time<CD>{t.time_since_epoch()} -
+ (local_days(year{1980}/January/Sunday[1]) - local_days(year{1970}/January/1));
+}
+
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+ const gps_time<Duration>& t)
+{
+ const std::string abbrev("GPS");
+ CONSTDATA std::chrono::seconds offset{0};
+ return to_stream(os, fmt, gps_clock::to_local(t), &abbrev, &offset);
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const gps_time<Duration>& t)
+{
+ const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}};
+ return to_stream(os, fmt, t);
+}
+
+template <class Duration, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+ gps_time<Duration>& tp,
+ std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+ std::chrono::minutes* offset = nullptr)
+{
+ local_time<Duration> lp;
+ from_stream(is, fmt, lp, abbrev, offset);
+ if (!is.fail())
+ tp = gps_clock::from_local(lp);
+ return is;
+}
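+
+// A minimal stream round-trip sketch (editor's illustration, not part of the
+// vendored header):
+//
+//   std::ostringstream out;
+//   to_stream(out, "%F %T", gps_seconds{std::chrono::seconds{0}});  // 1980-01-06 00:00:00
+//   std::istringstream in{out.str()};
+//   gps_seconds tp;
+//   from_stream(in, "%F %T", tp);  // tp == gps_seconds{std::chrono::seconds{0}}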
+
+// clock_time_conversion
+
+template <class DstClock, class SrcClock>
+struct clock_time_conversion
+{};
+
+template <>
+struct clock_time_conversion<std::chrono::system_clock, std::chrono::system_clock>
+{
+ template <class Duration>
+ CONSTCD14
+ sys_time<Duration>
+ operator()(const sys_time<Duration>& st) const
+ {
+ return st;
+ }
+};
+
+template <>
+struct clock_time_conversion<utc_clock, utc_clock>
+{
+ template <class Duration>
+ CONSTCD14
+ utc_time<Duration>
+ operator()(const utc_time<Duration>& ut) const
+ {
+ return ut;
+ }
+};
+
+template<>
+struct clock_time_conversion<local_t, local_t>
+{
+ template <class Duration>
+ CONSTCD14
+ local_time<Duration>
+ operator()(const local_time<Duration>& lt) const
+ {
+ return lt;
+ }
+};
+
+template <>
+struct clock_time_conversion<utc_clock, std::chrono::system_clock>
+{
+ template <class Duration>
+ utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+ operator()(const sys_time<Duration>& st) const
+ {
+ return utc_clock::from_sys(st);
+ }
+};
+
+template <>
+struct clock_time_conversion<std::chrono::system_clock, utc_clock>
+{
+ template <class Duration>
+ sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+ operator()(const utc_time<Duration>& ut) const
+ {
+ return utc_clock::to_sys(ut);
+ }
+};
+
+template<>
+struct clock_time_conversion<local_t, std::chrono::system_clock>
+{
+ template <class Duration>
+ CONSTCD14
+ local_time<Duration>
+ operator()(const sys_time<Duration>& st) const
+ {
+ return local_time<Duration>{st.time_since_epoch()};
+ }
+};
+
+template<>
+struct clock_time_conversion<std::chrono::system_clock, local_t>
+{
+ template <class Duration>
+ CONSTCD14
+ sys_time<Duration>
+ operator()(const local_time<Duration>& lt) const
+ {
+ return sys_time<Duration>{lt.time_since_epoch()};
+ }
+};
+
+template<>
+struct clock_time_conversion<utc_clock, local_t>
+{
+ template <class Duration>
+ utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+ operator()(const local_time<Duration>& lt) const
+ {
+ return utc_clock::from_local(lt);
+ }
+};
+
+template<>
+struct clock_time_conversion<local_t, utc_clock>
+{
+ template <class Duration>
+ local_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+ operator()(const utc_time<Duration>& ut) const
+ {
+ return utc_clock::to_local(ut);
+ }
+};
+
+template<typename Clock>
+struct clock_time_conversion<Clock, Clock>
+{
+ template <class Duration>
+ CONSTCD14
+ std::chrono::time_point<Clock, Duration>
+ operator()(const std::chrono::time_point<Clock, Duration>& tp) const
+ {
+ return tp;
+ }
+};
+
+namespace ctc_detail
+{
+
+template <class Clock, class Duration>
+ using time_point = std::chrono::time_point<Clock, Duration>;
+
+using std::declval;
+using std::chrono::system_clock;
+
+// Check if TimePoint is a time point for the given clock;
+// if it is not, emit a hard error.
+template <class Clock, class TimePoint>
+struct return_clock_time
+{
+ using clock_time_point = time_point<Clock, typename TimePoint::duration>;
+ using type = TimePoint;
+
+ static_assert(std::is_same<TimePoint, clock_time_point>::value,
+ "time point with appropariate clock shall be returned");
+};
+
+// Check if Clock has a to_sys method that accepts a time_point with the given
+// duration by const& and returns a sys_time. If so, this has a nested type
+// member equal to the return type of to_sys.
+template <class Clock, class Duration, class = void>
+struct return_to_sys
+{};
+
+template <class Clock, class Duration>
+struct return_to_sys
+ <
+ Clock, Duration,
+ decltype(Clock::to_sys(declval<time_point<Clock, Duration> const&>()), void())
+ >
+ : return_clock_time
+ <
+ system_clock,
+ decltype(Clock::to_sys(declval<time_point<Clock, Duration> const&>()))
+ >
+{};
+
+// Similar to above
+template <class Clock, class Duration, class = void>
+struct return_from_sys
+{};
+
+template <class Clock, class Duration>
+struct return_from_sys
+ <
+ Clock, Duration,
+ decltype(Clock::from_sys(declval<time_point<system_clock, Duration> const&>()),
+ void())
+ >
+ : return_clock_time
+ <
+ Clock,
+ decltype(Clock::from_sys(declval<time_point<system_clock, Duration> const&>()))
+ >
+{};
+
+// Similar to above
+template <class Clock, class Duration, class = void>
+struct return_to_utc
+{};
+
+template <class Clock, class Duration>
+struct return_to_utc
+ <
+ Clock, Duration,
+ decltype(Clock::to_utc(declval<time_point<Clock, Duration> const&>()), void())
+ >
+ : return_clock_time
+ <
+ utc_clock,
+ decltype(Clock::to_utc(declval<time_point<Clock, Duration> const&>()))>
+{};
+
+// Similar to above
+template <class Clock, class Duration, class = void>
+struct return_from_utc
+{};
+
+template <class Clock, class Duration>
+struct return_from_utc
+ <
+ Clock, Duration,
+ decltype(Clock::from_utc(declval<time_point<utc_clock, Duration> const&>()),
+ void())
+ >
+ : return_clock_time
+ <
+ Clock,
+ decltype(Clock::from_utc(declval<time_point<utc_clock, Duration> const&>()))
+ >
+{};
+
+// Similar to above
+template<typename Clock, typename Duration, typename = void>
+struct return_to_local
+{};
+
+template<typename Clock, typename Duration>
+struct return_to_local
+ <
+ Clock, Duration,
+ decltype(Clock::to_local(declval<time_point<Clock, Duration> const&>()),
+ void())
+ >
+ : return_clock_time
+ <
+ local_t,
+ decltype(Clock::to_local(declval<time_point<Clock, Duration> const&>()))
+ >
+{};
+
+// Similar to above
+template<typename Clock, typename Duration, typename = void>
+struct return_from_local
+{};
+
+template<typename Clock, typename Duration>
+struct return_from_local
+ <
+ Clock, Duration,
+ decltype(Clock::from_local(declval<time_point<local_t, Duration> const&>()),
+ void())
+ >
+ : return_clock_time
+ <
+ Clock,
+ decltype(Clock::from_local(declval<time_point<local_t, Duration> const&>()))
+ >
+{};
+
+} // namespace ctc_detail
+
+template <class SrcClock>
+struct clock_time_conversion<std::chrono::system_clock, SrcClock>
+{
+ template <class Duration>
+ CONSTCD14
+ typename ctc_detail::return_to_sys<SrcClock, Duration>::type
+ operator()(const std::chrono::time_point<SrcClock, Duration>& tp) const
+ {
+ return SrcClock::to_sys(tp);
+ }
+};
+
+template <class DstClock>
+struct clock_time_conversion<DstClock, std::chrono::system_clock>
+{
+ template <class Duration>
+ CONSTCD14
+ typename ctc_detail::return_from_sys<DstClock, Duration>::type
+ operator()(const sys_time<Duration>& st) const
+ {
+ return DstClock::from_sys(st);
+ }
+};
+
+template <class SrcClock>
+struct clock_time_conversion<utc_clock, SrcClock>
+{
+ template <class Duration>
+ CONSTCD14
+ typename ctc_detail::return_to_utc<SrcClock, Duration>::type
+ operator()(const std::chrono::time_point<SrcClock, Duration>& tp) const
+ {
+ return SrcClock::to_utc(tp);
+ }
+};
+
+template <class DstClock>
+struct clock_time_conversion<DstClock, utc_clock>
+{
+ template <class Duration>
+ CONSTCD14
+ typename ctc_detail::return_from_utc<DstClock, Duration>::type
+ operator()(const utc_time<Duration>& ut) const
+ {
+ return DstClock::from_utc(ut);
+ }
+};
+
+template<typename SrcClock>
+struct clock_time_conversion<local_t, SrcClock>
+{
+ template <class Duration>
+ CONSTCD14
+ typename ctc_detail::return_to_local<SrcClock, Duration>::type
+ operator()(const std::chrono::time_point<SrcClock, Duration>& tp) const
+ {
+ return SrcClock::to_local(tp);
+ }
+};
+
+template<typename DstClock>
+struct clock_time_conversion<DstClock, local_t>
+{
+ template <class Duration>
+ CONSTCD14
+ typename ctc_detail::return_from_local<DstClock, Duration>::type
+ operator()(const local_time<Duration>& lt) const
+ {
+ return DstClock::from_local(lt);
+ }
+};
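+
+// For a user-defined clock it is enough to expose to_sys/from_sys (or
+// to_utc/from_utc, or to_local/from_local); the generic specializations above
+// then pick them up via the ctc_detail return type traits. A minimal sketch
+// (editor's illustration, not part of the vendored header):
+//
+//   struct my_clock
+//   {
+//       template <class Duration>
+//       static sys_time<Duration>
+//       to_sys(const std::chrono::time_point<my_clock, Duration>& tp)
+//           { return sys_time<Duration>{tp.time_since_epoch()}; }
+//
+//       template <class Duration>
+//       static std::chrono::time_point<my_clock, Duration>
+//       from_sys(const sys_time<Duration>& st)
+//           { return std::chrono::time_point<my_clock, Duration>{st.time_since_epoch()}; }
+//   };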
+
+namespace clock_cast_detail
+{
+
+template <class Clock, class Duration>
+ using time_point = std::chrono::time_point<Clock, Duration>;
+using std::chrono::system_clock;
+
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+conv_clock(const time_point<SrcClock, Duration>& t)
+ -> decltype(std::declval<clock_time_conversion<DstClock, SrcClock>>()(t))
+{
+ return clock_time_conversion<DstClock, SrcClock>{}(t);
+}
+
+// direct trait conversion, 1st candidate
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+cc_impl(const time_point<SrcClock, Duration>& t, const time_point<SrcClock, Duration>*)
+ -> decltype(conv_clock<DstClock>(t))
+{
+ return conv_clock<DstClock>(t);
+}
+
+// conversion through sys, 2nd candidate
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+cc_impl(const time_point<SrcClock, Duration>& t, const void*)
+ -> decltype(conv_clock<DstClock>(conv_clock<system_clock>(t)))
+{
+ return conv_clock<DstClock>(conv_clock<system_clock>(t));
+}
+
+// conversion through utc, 2nd candidate
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+cc_impl(const time_point<SrcClock, Duration>& t, const void*)
+ -> decltype(0, // MSVC_WORKAROUND
+ conv_clock<DstClock>(conv_clock<utc_clock>(t)))
+{
+ return conv_clock<DstClock>(conv_clock<utc_clock>(t));
+}
+
+// conversion through sys and utc, 3rd candidate
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+cc_impl(const time_point<SrcClock, Duration>& t, ...)
+ -> decltype(conv_clock<DstClock>(conv_clock<utc_clock>(conv_clock<system_clock>(t))))
+{
+ return conv_clock<DstClock>(conv_clock<utc_clock>(conv_clock<system_clock>(t)));
+}
+
+// conversion through utc and sys, 3rd candidate
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+cc_impl(const time_point<SrcClock, Duration>& t, ...)
+ -> decltype(0, // MSVC_WORKAROUND
+ conv_clock<DstClock>(conv_clock<system_clock>(conv_clock<utc_clock>(t))))
+{
+ return conv_clock<DstClock>(conv_clock<system_clock>(conv_clock<utc_clock>(t)));
+}
+
+} // namespace clock_cast_detail
+
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+clock_cast(const std::chrono::time_point<SrcClock, Duration>& tp)
+ -> decltype(clock_cast_detail::cc_impl<DstClock>(tp, &tp))
+{
+ return clock_cast_detail::cc_impl<DstClock>(tp, &tp);
+}
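+
+// Typical usage (editor's illustration): clock_cast prefers a direct
+// clock_time_conversion, then a single hop through sys or utc, then a two-hop
+// route, so e.g.
+//
+//   auto gt = clock_cast<gps_clock>(std::chrono::system_clock::now());  // sys -> utc -> gps
+//   auto tt = clock_cast<tai_clock>(gt);                                // gps -> utc -> tai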
+
+// Deprecated API
+
+template <class Duration>
+inline
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_sys_time(const utc_time<Duration>& t)
+{
+ return utc_clock::to_sys(t);
+}
+
+template <class Duration>
+inline
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_sys_time(const tai_time<Duration>& t)
+{
+ return utc_clock::to_sys(tai_clock::to_utc(t));
+}
+
+template <class Duration>
+inline
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_sys_time(const gps_time<Duration>& t)
+{
+ return utc_clock::to_sys(gps_clock::to_utc(t));
+}
+
+
+template <class Duration>
+inline
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_utc_time(const sys_time<Duration>& t)
+{
+ return utc_clock::from_sys(t);
+}
+
+template <class Duration>
+inline
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_utc_time(const tai_time<Duration>& t)
+{
+ return tai_clock::to_utc(t);
+}
+
+template <class Duration>
+inline
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_utc_time(const gps_time<Duration>& t)
+{
+ return gps_clock::to_utc(t);
+}
+
+
+template <class Duration>
+inline
+tai_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_tai_time(const sys_time<Duration>& t)
+{
+ return tai_clock::from_utc(utc_clock::from_sys(t));
+}
+
+template <class Duration>
+inline
+tai_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_tai_time(const utc_time<Duration>& t)
+{
+ return tai_clock::from_utc(t);
+}
+
+template <class Duration>
+inline
+tai_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_tai_time(const gps_time<Duration>& t)
+{
+ return tai_clock::from_utc(gps_clock::to_utc(t));
+}
+
+
+template <class Duration>
+inline
+gps_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_gps_time(const sys_time<Duration>& t)
+{
+ return gps_clock::from_utc(utc_clock::from_sys(t));
+}
+
+template <class Duration>
+inline
+gps_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_gps_time(const utc_time<Duration>& t)
+{
+ return gps_clock::from_utc(t);
+}
+
+template <class Duration>
+inline
+gps_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_gps_time(const tai_time<Duration>& t)
+{
+ return gps_clock::from_utc(tai_clock::to_utc(t));
+}
+
+#endif // !MISSING_LEAP_SECONDS
+
+} // namespace date
+} // namespace arrow_vendored
+
+#endif // TZ_H
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz_private.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz_private.h
new file mode 100644
index 00000000000..282842e7441
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/tz_private.h
@@ -0,0 +1,319 @@
+#ifndef TZ_PRIVATE_H
+#define TZ_PRIVATE_H
+
+// The MIT License (MIT)
+//
+// Copyright (c) 2015, 2016 Howard Hinnant
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// Our apologies. When the previous paragraph was written, lowercase had not yet
+// been invented (that would involve another several millennia of evolution).
+// We did not mean to shout.
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+#include "tz.h"
+#else
+#include "date.h"
+#include <vector>
+#endif
+
+namespace arrow_vendored
+{
+namespace date
+{
+
+namespace detail
+{
+
+#if !USE_OS_TZDB
+
+enum class tz {utc, local, standard};
+
+// forward declare to avoid warnings in gcc 6.2
+class MonthDayTime;
+std::istream& operator>>(std::istream& is, MonthDayTime& x);
+std::ostream& operator<<(std::ostream& os, const MonthDayTime& x);
+
+
+class MonthDayTime
+{
+private:
+ struct pair
+ {
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+ pair() : month_day_(date::jan / 1), weekday_(0U) {}
+
+ pair(const date::month_day& month_day, const date::weekday& weekday)
+ : month_day_(month_day), weekday_(weekday) {}
+#endif
+
+ date::month_day month_day_;
+ date::weekday weekday_;
+ };
+
+ enum Type {month_day, month_last_dow, lteq, gteq};
+
+ Type type_{month_day};
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+ union U
+#else
+ struct U
+#endif
+ {
+ date::month_day month_day_;
+ date::month_weekday_last month_weekday_last_;
+ pair month_day_weekday_;
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+ U() : month_day_{date::jan/1} {}
+#else
+ U() :
+ month_day_(date::jan/1),
+ month_weekday_last_(date::month(0U), date::weekday_last(date::weekday(0U)))
+ {}
+
+#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900)
+
+ U& operator=(const date::month_day& x);
+ U& operator=(const date::month_weekday_last& x);
+ U& operator=(const pair& x);
+ } u;
+
+ std::chrono::hours h_{};
+ std::chrono::minutes m_{};
+ std::chrono::seconds s_{};
+ tz zone_{tz::local};
+
+public:
+ MonthDayTime() = default;
+ MonthDayTime(local_seconds tp, tz timezone);
+ MonthDayTime(const date::month_day& md, tz timezone);
+
+ date::day day() const;
+ date::month month() const;
+ tz zone() const {return zone_;}
+
+ void canonicalize(date::year y);
+
+ sys_seconds
+ to_sys(date::year y, std::chrono::seconds offset, std::chrono::seconds save) const;
+ sys_days to_sys_days(date::year y) const;
+
+ sys_seconds to_time_point(date::year y) const;
+ int compare(date::year y, const MonthDayTime& x, date::year yx,
+ std::chrono::seconds offset, std::chrono::minutes prev_save) const;
+
+ friend std::istream& operator>>(std::istream& is, MonthDayTime& x);
+ friend std::ostream& operator<<(std::ostream& os, const MonthDayTime& x);
+};
+
+// A Rule specifies one or more sets of datetimes without using an offset.
+// Multiple dates are specified with multiple years. The years in effect
+// go from starting_year_ to ending_year_, inclusive. starting_year_ <=
+// ending_year_. save_ is in effect for times from the specified time
+// onward, including the specified time. When the specified time is
+// local, it uses the save_ from the chronologically previous Rule, or if
+// there is none, 0.
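+//
+// For example, the IANA tzdata line
+//
+//   Rule  US  1967  1973  -  Apr  lastSun  2:00  1:00  D
+//
+// is one such Rule: last Sunday of April at 02:00 local time, for every year
+// from 1967 through 1973, with a one-hour save and abbreviation letter D.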
+
+// forward declare to avoid warnings in gcc 6.2
+class Rule;
+bool operator==(const Rule& x, const Rule& y);
+bool operator<(const Rule& x, const Rule& y);
+bool operator==(const Rule& x, const date::year& y);
+bool operator<(const Rule& x, const date::year& y);
+bool operator==(const date::year& x, const Rule& y);
+bool operator<(const date::year& x, const Rule& y);
+bool operator==(const Rule& x, const std::string& y);
+bool operator<(const Rule& x, const std::string& y);
+bool operator==(const std::string& x, const Rule& y);
+bool operator<(const std::string& x, const Rule& y);
+std::ostream& operator<<(std::ostream& os, const Rule& r);
+
+class Rule
+{
+private:
+ std::string name_;
+ date::year starting_year_{0};
+ date::year ending_year_{0};
+ MonthDayTime starting_at_;
+ std::chrono::minutes save_{0};
+ std::string abbrev_;
+
+public:
+ Rule() = default;
+ explicit Rule(const std::string& s);
+ Rule(const Rule& r, date::year starting_year, date::year ending_year);
+
+ const std::string& name() const {return name_;}
+ const std::string& abbrev() const {return abbrev_;}
+
+ const MonthDayTime& mdt() const {return starting_at_;}
+ const date::year& starting_year() const {return starting_year_;}
+ const date::year& ending_year() const {return ending_year_;}
+ const std::chrono::minutes& save() const {return save_;}
+
+ static void split_overlaps(std::vector<Rule>& rules);
+
+ friend bool operator==(const Rule& x, const Rule& y);
+ friend bool operator<(const Rule& x, const Rule& y);
+ friend bool operator==(const Rule& x, const date::year& y);
+ friend bool operator<(const Rule& x, const date::year& y);
+ friend bool operator==(const date::year& x, const Rule& y);
+ friend bool operator<(const date::year& x, const Rule& y);
+ friend bool operator==(const Rule& x, const std::string& y);
+ friend bool operator<(const Rule& x, const std::string& y);
+ friend bool operator==(const std::string& x, const Rule& y);
+ friend bool operator<(const std::string& x, const Rule& y);
+
+ friend std::ostream& operator<<(std::ostream& os, const Rule& r);
+
+private:
+ date::day day() const;
+ date::month month() const;
+ static void split_overlaps(std::vector<Rule>& rules, std::size_t i, std::size_t& e);
+ static bool overlaps(const Rule& x, const Rule& y);
+ static void split(std::vector<Rule>& rules, std::size_t i, std::size_t k,
+ std::size_t& e);
+};
+
+inline bool operator!=(const Rule& x, const Rule& y) {return !(x == y);}
+inline bool operator> (const Rule& x, const Rule& y) {return y < x;}
+inline bool operator<=(const Rule& x, const Rule& y) {return !(y < x);}
+inline bool operator>=(const Rule& x, const Rule& y) {return !(x < y);}
+
+inline bool operator!=(const Rule& x, const date::year& y) {return !(x == y);}
+inline bool operator> (const Rule& x, const date::year& y) {return y < x;}
+inline bool operator<=(const Rule& x, const date::year& y) {return !(y < x);}
+inline bool operator>=(const Rule& x, const date::year& y) {return !(x < y);}
+
+inline bool operator!=(const date::year& x, const Rule& y) {return !(x == y);}
+inline bool operator> (const date::year& x, const Rule& y) {return y < x;}
+inline bool operator<=(const date::year& x, const Rule& y) {return !(y < x);}
+inline bool operator>=(const date::year& x, const Rule& y) {return !(x < y);}
+
+inline bool operator!=(const Rule& x, const std::string& y) {return !(x == y);}
+inline bool operator> (const Rule& x, const std::string& y) {return y < x;}
+inline bool operator<=(const Rule& x, const std::string& y) {return !(y < x);}
+inline bool operator>=(const Rule& x, const std::string& y) {return !(x < y);}
+
+inline bool operator!=(const std::string& x, const Rule& y) {return !(x == y);}
+inline bool operator> (const std::string& x, const Rule& y) {return y < x;}
+inline bool operator<=(const std::string& x, const Rule& y) {return !(y < x);}
+inline bool operator>=(const std::string& x, const Rule& y) {return !(x < y);}
+
+struct zonelet
+{
+ enum tag {has_rule, has_save, is_empty};
+
+ std::chrono::seconds gmtoff_;
+ tag tag_ = has_rule;
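+ // tag_ records which member of the union u below is meaningful:
+ // rule_ for has_rule, save_ for has_save, neither for is_empty.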
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+ union U
+#else
+ struct U
+#endif
+ {
+ std::string rule_;
+ std::chrono::minutes save_;
+
+ ~U() {}
+ U() {}
+ U(const U&) {}
+ U& operator=(const U&) = delete;
+ } u;
+
+ std::string format_;
+ date::year until_year_{0};
+ MonthDayTime until_date_;
+ sys_seconds until_utc_;
+ local_seconds until_std_;
+ local_seconds until_loc_;
+ std::chrono::minutes initial_save_{};
+ std::string initial_abbrev_;
+ std::pair<const Rule*, date::year> first_rule_{nullptr, date::year::min()};
+ std::pair<const Rule*, date::year> last_rule_{nullptr, date::year::max()};
+
+ ~zonelet();
+ zonelet();
+ zonelet(const zonelet& i);
+ zonelet& operator=(const zonelet&) = delete;
+};
+
+#else // USE_OS_TZDB
+
+struct ttinfo
+{
+ std::int32_t tt_gmtoff;
+ unsigned char tt_isdst;
+ unsigned char tt_abbrind;
+ unsigned char pad[2];
+};
+
+static_assert(sizeof(ttinfo) == 8, "");
+
+struct expanded_ttinfo
+{
+ std::chrono::seconds offset;
+ std::string abbrev;
+ bool is_dst;
+};
+
+struct transition
+{
+ sys_seconds timepoint;
+ const expanded_ttinfo* info;
+
+ transition(sys_seconds tp, const expanded_ttinfo* i = nullptr)
+ : timepoint(tp)
+ , info(i)
+ {}
+
+ friend
+ std::ostream&
+ operator<<(std::ostream& os, const transition& t)
+ {
+ using date::operator<<;
+ os << t.timepoint << "Z ";
+ if (t.info->offset >= std::chrono::seconds{0})
+ os << '+';
+ os << make_time(t.info->offset);
+ if (t.info->is_dst > 0)
+ os << " daylight ";
+ else
+ os << " standard ";
+ os << t.info->abbrev;
+ return os;
+ }
+};
+
+#endif // USE_OS_TZDB
+
+} // namespace detail
+
+} // namespace date
+} // namespace arrow_vendored
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#include "tz.h"
+#endif
+
+#endif // TZ_PRIVATE_H
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/visibility.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/visibility.h
new file mode 100644
index 00000000000..ae031238d85
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/datetime/visibility.h
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(ARROW_STATIC)
+// intentionally empty
+#elif defined(ARROW_EXPORTING)
+#define DATE_BUILD_DLL
+#else
+#define DATE_USE_DLL
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/musl/README.md b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/musl/README.md
new file mode 100644
index 00000000000..40962a14ca9
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/musl/README.md
@@ -0,0 +1,25 @@
+<!--
+Copyright © 2005-2020 Rich Felker, et al.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-->
+
+Assorted utility functions are adapted from the musl libc project
+(https://musl.libc.org/).
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/musl/strptime.c b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/musl/strptime.c
new file mode 100644
index 00000000000..e8111f57679
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/musl/strptime.c
@@ -0,0 +1,237 @@
+// Vendored from musl git commit 593caa456309714402ca4cb77c3770f4c24da9da
+// + adaptations
+
+#include "arrow/vendored/strptime.h"
+
+#include <ctype.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+#define strncasecmp _strnicmp
+#define strcasecmp _stricmp
+#else
+#include <strings.h>
+#endif
+
+#undef HAVE_LANGINFO
+
+#ifndef _WIN32
+#define HAVE_LANGINFO 1
+#endif
+
+#ifdef HAVE_LANGINFO
+#include <langinfo.h>
+#endif
+
+#define strptime arrow_strptime
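+
+// The define above renames the definition to arrow_strptime so that it cannot
+// clash with a platform-provided strptime. A minimal usage sketch (editor's
+// illustration):
+//
+//   struct tm tm = {0};
+//   if (arrow_strptime("2021-03-04 05:06:07", "%Y-%m-%d %H:%M:%S", &tm) != NULL) {
+//       /* tm.tm_year == 121, tm.tm_mon == 2, tm.tm_mday == 4, ... */
+//   }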
+
+char *strptime(const char *__restrict s, const char *__restrict f, struct tm *__restrict tm)
+{
+ int i, w, neg, adj, min, range, *dest, dummy;
+#ifdef HAVE_LANGINFO
+ const char *ex;
+ size_t len;
+#endif
+ int want_century = 0, century = 0, relyear = 0;
+ while (*f) {
+ if (*f != '%') {
+ if (isspace(*f)) for (; *s && isspace(*s); s++);
+ else if (*s != *f) return 0;
+ else s++;
+ f++;
+ continue;
+ }
+ f++;
+ if (*f == '+') f++;
+ if (isdigit(*f)) {
+ char *new_f;
+ w=strtoul(f, &new_f, 10);
+ f = new_f;
+ } else {
+ w=-1;
+ }
+ adj=0;
+ switch (*f++) {
+#ifdef HAVE_LANGINFO
+ case 'a': case 'A':
+ dest = &tm->tm_wday;
+ min = ABDAY_1;
+ range = 7;
+ goto symbolic_range;
+ case 'b': case 'B': case 'h':
+ dest = &tm->tm_mon;
+ min = ABMON_1;
+ range = 12;
+ goto symbolic_range;
+ case 'c':
+ s = strptime(s, nl_langinfo(D_T_FMT), tm);
+ if (!s) return 0;
+ break;
+#endif
+ case 'C':
+ dest = &century;
+ if (w<0) w=2;
+ want_century |= 2;
+ goto numeric_digits;
+ case 'd': case 'e':
+ dest = &tm->tm_mday;
+ min = 1;
+ range = 31;
+ goto numeric_range;
+ case 'D':
+ s = strptime(s, "%m/%d/%y", tm);
+ if (!s) return 0;
+ break;
+ case 'H':
+ dest = &tm->tm_hour;
+ min = 0;
+ range = 24;
+ goto numeric_range;
+ case 'I':
+ dest = &tm->tm_hour;
+ min = 1;
+ range = 12;
+ goto numeric_range;
+ case 'j':
+ dest = &tm->tm_yday;
+ min = 1;
+ range = 366;
+ adj = 1;
+ goto numeric_range;
+ case 'm':
+ dest = &tm->tm_mon;
+ min = 1;
+ range = 12;
+ adj = 1;
+ goto numeric_range;
+ case 'M':
+ dest = &tm->tm_min;
+ min = 0;
+ range = 60;
+ goto numeric_range;
+ case 'n': case 't':
+ for (; *s && isspace(*s); s++);
+ break;
+#ifdef HAVE_LANGINFO
+ case 'p':
+ ex = nl_langinfo(AM_STR);
+ len = strlen(ex);
+ if (!strncasecmp(s, ex, len)) {
+ tm->tm_hour %= 12;
+ s += len;
+ break;
+ }
+ ex = nl_langinfo(PM_STR);
+ len = strlen(ex);
+ if (!strncasecmp(s, ex, len)) {
+ tm->tm_hour %= 12;
+ tm->tm_hour += 12;
+ s += len;
+ break;
+ }
+ return 0;
+ case 'r':
+ s = strptime(s, nl_langinfo(T_FMT_AMPM), tm);
+ if (!s) return 0;
+ break;
+#endif
+ case 'R':
+ s = strptime(s, "%H:%M", tm);
+ if (!s) return 0;
+ break;
+ case 'S':
+ dest = &tm->tm_sec;
+ min = 0;
+ range = 61;
+ goto numeric_range;
+ case 'T':
+ s = strptime(s, "%H:%M:%S", tm);
+ if (!s) return 0;
+ break;
+ case 'U':
+ case 'W':
+ /* Throw away result, for now. (FIXME?) */
+ dest = &dummy;
+ min = 0;
+ range = 54;
+ goto numeric_range;
+ case 'w':
+ dest = &tm->tm_wday;
+ min = 0;
+ range = 7;
+ goto numeric_range;
+#ifdef HAVE_LANGINFO
+ case 'x':
+ s = strptime(s, nl_langinfo(D_FMT), tm);
+ if (!s) return 0;
+ break;
+ case 'X':
+ s = strptime(s, nl_langinfo(T_FMT), tm);
+ if (!s) return 0;
+ break;
+#endif
+ case 'y':
+ dest = &relyear;
+ w = 2;
+ want_century |= 1;
+ goto numeric_digits;
+ case 'Y':
+ dest = &tm->tm_year;
+ if (w<0) w=4;
+ adj = 1900;
+ want_century = 0;
+ goto numeric_digits;
+ case '%':
+ if (*s++ != '%') return 0;
+ break;
+ default:
+ return 0;
+ numeric_range:
+ if (!isdigit(*s)) return 0;
+ *dest = 0;
+ for (i=1; i<=min+range && isdigit(*s); i*=10)
+ *dest = *dest * 10 + *s++ - '0';
+ if (*dest - min >= range) return 0;
+ *dest -= adj;
+ switch((char *)dest - (char *)tm) {
+ case offsetof(struct tm, tm_yday):
+ ;
+ }
+ goto update;
+ numeric_digits:
+ neg = 0;
+ if (*s == '+') s++;
+ else if (*s == '-') neg=1, s++;
+ if (!isdigit(*s)) return 0;
+ for (*dest=i=0; i<w && isdigit(*s); i++)
+ *dest = *dest * 10 + *s++ - '0';
+ if (neg) *dest = -*dest;
+ *dest -= adj;
+ goto update;
+#ifdef HAVE_LANGINFO
+ symbolic_range:
+ for (i=2*range-1; i>=0; i--) {
+ ex = nl_langinfo(min+i);
+ len = strlen(ex);
+ if (strncasecmp(s, ex, len)) continue;
+ s += len;
+ *dest = i % range;
+ break;
+ }
+ if (i<0) return 0;
+ goto update;
+#endif
+ update:
+ //FIXME
+ ;
+ }
+ }
+ if (want_century) {
+ tm->tm_year = relyear;
+ if (want_century & 2) tm->tm_year += century * 100 - 1900;
+ else if (tm->tm_year <= 68) tm->tm_year += 100;
+ }
+ return (char *)s;
+}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/README.md b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/README.md
new file mode 100644
index 00000000000..9c67b7baa1c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/README.md
@@ -0,0 +1,10 @@
+<!---
+Each source file contains a preamble explaining the license situation
+for that file, which takes priority over this file. With the
+exception of some code pulled in from other repositories (such as
+µnit, an MIT-licensed project which is used for testing), the code is
+public domain, released using the CC0 1.0 Universal dedication.
+-->
+
+The files in this directory are vendored from portable-snippets
+git changeset f596f8b0a4b8a6ea1166c2361a5cb7e6f802c5ea.
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h
new file mode 100644
index 00000000000..7f6426ac765
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h
@@ -0,0 +1,1072 @@
+/* Overflow-safe math functions
+ * Portable Snippets - https://github.com/nemequ/portable-snippets
+ * Created by Evan Nemerson <[email protected]>
+ *
+ * To the extent possible under law, the authors have waived all
+ * copyright and related or neighboring rights to this code. For
+ * details, see the Creative Commons Zero 1.0 Universal license at
+ * https://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+#if !defined(PSNIP_SAFE_H)
+#define PSNIP_SAFE_H
+
+#if !defined(PSNIP_SAFE_FORCE_PORTABLE)
+# if defined(__has_builtin)
+# if __has_builtin(__builtin_add_overflow) && !defined(__ibmxl__)
+# define PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW
+# endif
+# elif defined(__GNUC__) && (__GNUC__ >= 5) && !defined(__INTEL_COMPILER)
+# define PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW
+# endif
+# if defined(__has_include)
+# if __has_include(<intsafe.h>)
+# define PSNIP_SAFE_HAVE_INTSAFE_H
+# endif
+# elif defined(_WIN32)
+# define PSNIP_SAFE_HAVE_INTSAFE_H
+# endif
+#endif /* !defined(PSNIP_SAFE_FORCE_PORTABLE) */
+
+#if defined(__GNUC__)
+# define PSNIP_SAFE_LIKELY(expr) __builtin_expect(!!(expr), 1)
+# define PSNIP_SAFE_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#else
+# define PSNIP_SAFE_LIKELY(expr) !!(expr)
+# define PSNIP_SAFE_UNLIKELY(expr) !!(expr)
+#endif /* defined(__GNUC__) */
+
+#if !defined(PSNIP_SAFE_STATIC_INLINE)
+# if defined(__GNUC__)
+# define PSNIP_SAFE__COMPILER_ATTRIBUTES __attribute__((__unused__))
+# else
+# define PSNIP_SAFE__COMPILER_ATTRIBUTES
+# endif
+
+# if defined(HEDLEY_INLINE)
+# define PSNIP_SAFE__INLINE HEDLEY_INLINE
+# elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+# define PSNIP_SAFE__INLINE inline
+# elif defined(__GNUC_STDC_INLINE__)
+# define PSNIP_SAFE__INLINE __inline__
+# elif defined(_MSC_VER) && _MSC_VER >= 1200
+# define PSNIP_SAFE__INLINE __inline
+# else
+# define PSNIP_SAFE__INLINE
+# endif
+
+# define PSNIP_SAFE__FUNCTION PSNIP_SAFE__COMPILER_ATTRIBUTES static PSNIP_SAFE__INLINE
+#endif
+
+// !defined(__cplusplus) added for Solaris support
+#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+# define psnip_safe_bool _Bool
+#else
+# define psnip_safe_bool int
+#endif
+
+#if !defined(PSNIP_SAFE_NO_FIXED)
+/* For maximum portability include the exact-int module from
+ portable snippets. */
+# if \
+ !defined(psnip_int64_t) || !defined(psnip_uint64_t) || \
+ !defined(psnip_int32_t) || !defined(psnip_uint32_t) || \
+ !defined(psnip_int16_t) || !defined(psnip_uint16_t) || \
+ !defined(psnip_int8_t) || !defined(psnip_uint8_t)
+# include <stdint.h>
+# if !defined(psnip_int64_t)
+# define psnip_int64_t int64_t
+# endif
+# if !defined(psnip_uint64_t)
+# define psnip_uint64_t uint64_t
+# endif
+# if !defined(psnip_int32_t)
+# define psnip_int32_t int32_t
+# endif
+# if !defined(psnip_uint32_t)
+# define psnip_uint32_t uint32_t
+# endif
+# if !defined(psnip_int16_t)
+# define psnip_int16_t int16_t
+# endif
+# if !defined(psnip_uint16_t)
+# define psnip_uint16_t uint16_t
+# endif
+# if !defined(psnip_int8_t)
+# define psnip_int8_t int8_t
+# endif
+# if !defined(psnip_uint8_t)
+# define psnip_uint8_t uint8_t
+# endif
+# endif
+#endif /* !defined(PSNIP_SAFE_NO_FIXED) */
+#include <limits.h>
+#include <stdlib.h>
+
+#if !defined(PSNIP_SAFE_SIZE_MAX)
+# if defined(__SIZE_MAX__)
+# define PSNIP_SAFE_SIZE_MAX __SIZE_MAX__
+# elif defined(PSNIP_EXACT_INT_HAVE_STDINT)
+# include <stdint.h>
+# endif
+#endif
+
+#if defined(PSNIP_SAFE_SIZE_MAX)
+# define PSNIP_SAFE__SIZE_MAX_RT PSNIP_SAFE_SIZE_MAX
+#else
+# define PSNIP_SAFE__SIZE_MAX_RT (~((size_t) 0))
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_INTSAFE_H)
+/* In VS 10, stdint.h and intsafe.h both define (U)INTN_MIN/MAX, which
+ triggers warning C4005 (level 1). */
+# if defined(_MSC_VER) && (_MSC_VER == 1600)
+# pragma warning(push)
+# pragma warning(disable:4005)
+# endif
+# include <intsafe.h>
+# if defined(_MSC_VER) && (_MSC_VER == 1600)
+# pragma warning(pop)
+# endif
+#endif /* defined(PSNIP_SAFE_HAVE_INTSAFE_H) */
+
+/* If there is a type larger than the one we're concerned with, it's
+ * likely much faster to simply promote the operands, perform the
+ * requested operation, verify that the result falls within the
+ * original type, then cast the result back to the original type. */
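+
+/* For instance (editor's illustration), a 16-bit signed addition checked
+ * through a 32-bit promotion expands to roughly:
+ *
+ *   psnip_safe_bool psnip_safe_int16_add(psnip_int16_t* res,
+ *                                        psnip_int16_t a, psnip_int16_t b) {
+ *     const psnip_int32_t r = (psnip_int32_t) a + (psnip_int32_t) b;
+ *     *res = (psnip_int16_t) r;
+ *     return (r >= -32768) && (r <= 32767);
+ *   }
+ */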
+
+#if !defined(PSNIP_SAFE_NO_PROMOTIONS)
+
+#define PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, op_name, op) \
+ PSNIP_SAFE__FUNCTION psnip_safe_##name##_larger \
+ psnip_safe_larger_##name##_##op_name (T a, T b) { \
+ return ((psnip_safe_##name##_larger) a) op ((psnip_safe_##name##_larger) b); \
+ }
+
+#define PSNIP_SAFE_DEFINE_LARGER_UNARY_OP(T, name, op_name, op) \
+ PSNIP_SAFE__FUNCTION psnip_safe_##name##_larger \
+ psnip_safe_larger_##name##_##op_name (T value) { \
+ return (op ((psnip_safe_##name##_larger) value)); \
+ }
+
+#define PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(T, name) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, add, +) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, sub, -) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, mul, *) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, div, /) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, mod, %) \
+ PSNIP_SAFE_DEFINE_LARGER_UNARY_OP (T, name, neg, -)
+
+#define PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(T, name) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, add, +) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, sub, -) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, mul, *) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, div, /) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, mod, %)
+
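+/* (DEST_MAX / ORIG_MAX) >= ORIG_MAX is an overflow-free way of testing
+   DEST_MAX >= ORIG_MAX * ORIG_MAX, i.e. that the destination type can hold
+   the product of two maximal values of the original type. */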
+#define PSNIP_SAFE_IS_LARGER(ORIG_MAX, DEST_MAX) ((DEST_MAX / ORIG_MAX) >= ORIG_MAX)
+
+#if defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) && defined(__SIZEOF_INT128__) && !defined(__ibmxl__)
+#define PSNIP_SAFE_HAVE_128
+typedef __int128 psnip_safe_int128_t;
+typedef unsigned __int128 psnip_safe_uint128_t;
+#endif /* defined(__GNUC__) */
+
+#if !defined(PSNIP_SAFE_NO_FIXED)
+#define PSNIP_SAFE_HAVE_INT8_LARGER
+#define PSNIP_SAFE_HAVE_UINT8_LARGER
+typedef psnip_int16_t psnip_safe_int8_larger;
+typedef psnip_uint16_t psnip_safe_uint8_larger;
+
+#define PSNIP_SAFE_HAVE_INT16_LARGER
+typedef psnip_int32_t psnip_safe_int16_larger;
+typedef psnip_uint32_t psnip_safe_uint16_larger;
+
+#define PSNIP_SAFE_HAVE_INT32_LARGER
+typedef psnip_int64_t psnip_safe_int32_larger;
+typedef psnip_uint64_t psnip_safe_uint32_larger;
+
+#if defined(PSNIP_SAFE_HAVE_128)
+#define PSNIP_SAFE_HAVE_INT64_LARGER
+typedef psnip_safe_int128_t psnip_safe_int64_larger;
+typedef psnip_safe_uint128_t psnip_safe_uint64_larger;
+#endif /* defined(PSNIP_SAFE_HAVE_128) */
+#endif /* !defined(PSNIP_SAFE_NO_FIXED) */
+
+#define PSNIP_SAFE_HAVE_LARGER_SCHAR
+#if PSNIP_SAFE_IS_LARGER(SCHAR_MAX, SHRT_MAX)
+typedef short psnip_safe_schar_larger;
+#elif PSNIP_SAFE_IS_LARGER(SCHAR_MAX, INT_MAX)
+typedef int psnip_safe_schar_larger;
+#elif PSNIP_SAFE_IS_LARGER(SCHAR_MAX, LONG_MAX)
+typedef long psnip_safe_schar_larger;
+#elif PSNIP_SAFE_IS_LARGER(SCHAR_MAX, LLONG_MAX)
+typedef long long psnip_safe_schar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SCHAR_MAX, 0x7fff)
+typedef psnip_int16_t psnip_safe_schar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SCHAR_MAX, 0x7fffffffLL)
+typedef psnip_int32_t psnip_safe_schar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SCHAR_MAX, 0x7fffffffffffffffLL)
+typedef psnip_int64_t psnip_safe_schar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (SCHAR_MAX <= 0x7fffffffffffffffLL)
+typedef psnip_safe_int128_t psnip_safe_schar_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_SCHAR
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_UCHAR
+#if PSNIP_SAFE_IS_LARGER(UCHAR_MAX, USHRT_MAX)
+typedef unsigned short psnip_safe_uchar_larger;
+#elif PSNIP_SAFE_IS_LARGER(UCHAR_MAX, UINT_MAX)
+typedef unsigned int psnip_safe_uchar_larger;
+#elif PSNIP_SAFE_IS_LARGER(UCHAR_MAX, ULONG_MAX)
+typedef unsigned long psnip_safe_uchar_larger;
+#elif PSNIP_SAFE_IS_LARGER(UCHAR_MAX, ULLONG_MAX)
+typedef unsigned long long psnip_safe_uchar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UCHAR_MAX, 0xffffU)
+typedef psnip_uint16_t psnip_safe_uchar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UCHAR_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_uchar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UCHAR_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_uchar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (UCHAR_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_uchar_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_UCHAR
+#endif
+
+#if CHAR_MIN == 0 && defined(PSNIP_SAFE_HAVE_LARGER_UCHAR)
+#define PSNIP_SAFE_HAVE_LARGER_CHAR
+typedef psnip_safe_uchar_larger psnip_safe_char_larger;
+#elif CHAR_MIN < 0 && defined(PSNIP_SAFE_HAVE_LARGER_SCHAR)
+#define PSNIP_SAFE_HAVE_LARGER_CHAR
+typedef psnip_safe_schar_larger psnip_safe_char_larger;
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_SHRT
+#if PSNIP_SAFE_IS_LARGER(SHRT_MAX, INT_MAX)
+typedef int psnip_safe_short_larger;
+#elif PSNIP_SAFE_IS_LARGER(SHRT_MAX, LONG_MAX)
+typedef long psnip_safe_short_larger;
+#elif PSNIP_SAFE_IS_LARGER(SHRT_MAX, LLONG_MAX)
+typedef long long psnip_safe_short_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SHRT_MAX, 0x7fff)
+typedef psnip_int16_t psnip_safe_short_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SHRT_MAX, 0x7fffffffLL)
+typedef psnip_int32_t psnip_safe_short_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SHRT_MAX, 0x7fffffffffffffffLL)
+typedef psnip_int64_t psnip_safe_short_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (SHRT_MAX <= 0x7fffffffffffffffLL)
+typedef psnip_safe_int128_t psnip_safe_short_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_SHRT
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_USHRT
+#if PSNIP_SAFE_IS_LARGER(USHRT_MAX, UINT_MAX)
+typedef unsigned int psnip_safe_ushort_larger;
+#elif PSNIP_SAFE_IS_LARGER(USHRT_MAX, ULONG_MAX)
+typedef unsigned long psnip_safe_ushort_larger;
+#elif PSNIP_SAFE_IS_LARGER(USHRT_MAX, ULLONG_MAX)
+typedef unsigned long long psnip_safe_ushort_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(USHRT_MAX, 0xffff)
+typedef psnip_uint16_t psnip_safe_ushort_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(USHRT_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_ushort_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(USHRT_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_ushort_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (USHRT_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_ushort_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_USHRT
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_INT
+#if PSNIP_SAFE_IS_LARGER(INT_MAX, LONG_MAX)
+typedef long psnip_safe_int_larger;
+#elif PSNIP_SAFE_IS_LARGER(INT_MAX, LLONG_MAX)
+typedef long long psnip_safe_int_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(INT_MAX, 0x7fff)
+typedef psnip_int16_t psnip_safe_int_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(INT_MAX, 0x7fffffffLL)
+typedef psnip_int32_t psnip_safe_int_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(INT_MAX, 0x7fffffffffffffffLL)
+typedef psnip_int64_t psnip_safe_int_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (INT_MAX <= 0x7fffffffffffffffLL)
+typedef psnip_safe_int128_t psnip_safe_int_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_INT
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_UINT
+#if PSNIP_SAFE_IS_LARGER(UINT_MAX, ULONG_MAX)
+typedef unsigned long psnip_safe_uint_larger;
+#elif PSNIP_SAFE_IS_LARGER(UINT_MAX, ULLONG_MAX)
+typedef unsigned long long psnip_safe_uint_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UINT_MAX, 0xffff)
+typedef psnip_uint16_t psnip_safe_uint_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UINT_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_uint_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UINT_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_uint_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (UINT_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_uint_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_UINT
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_LONG
+#if PSNIP_SAFE_IS_LARGER(LONG_MAX, LLONG_MAX)
+typedef long long psnip_safe_long_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LONG_MAX, 0x7fff)
+typedef psnip_int16_t psnip_safe_long_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LONG_MAX, 0x7fffffffLL)
+typedef psnip_int32_t psnip_safe_long_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LONG_MAX, 0x7fffffffffffffffLL)
+typedef psnip_int64_t psnip_safe_long_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (LONG_MAX <= 0x7fffffffffffffffLL)
+typedef psnip_safe_int128_t psnip_safe_long_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_LONG
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_ULONG
+#if PSNIP_SAFE_IS_LARGER(ULONG_MAX, ULLONG_MAX)
+typedef unsigned long long psnip_safe_ulong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULONG_MAX, 0xffff)
+typedef psnip_uint16_t psnip_safe_ulong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULONG_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_ulong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULONG_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_ulong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (ULONG_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_ulong_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_ULONG
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_LLONG
+#if !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LLONG_MAX, 0x7fff)
+typedef psnip_int16_t psnip_safe_llong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LLONG_MAX, 0x7fffffffLL)
+typedef psnip_int32_t psnip_safe_llong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LLONG_MAX, 0x7fffffffffffffffLL)
+typedef psnip_int64_t psnip_safe_llong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (LLONG_MAX <= 0x7fffffffffffffffLL)
+typedef psnip_safe_int128_t psnip_safe_llong_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_LLONG
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_ULLONG
+#if !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULLONG_MAX, 0xffff)
+typedef psnip_uint16_t psnip_safe_ullong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULLONG_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_ullong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULLONG_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_ullong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (ULLONG_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_ullong_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_ULLONG
+#endif
+
+#if defined(PSNIP_SAFE_SIZE_MAX)
+#define PSNIP_SAFE_HAVE_LARGER_SIZE
+#if PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, USHRT_MAX)
+typedef unsigned short psnip_safe_size_larger;
+#elif PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, UINT_MAX)
+typedef unsigned int psnip_safe_size_larger;
+#elif PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, ULONG_MAX)
+typedef unsigned long psnip_safe_size_larger;
+#elif PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, ULLONG_MAX)
+typedef unsigned long long psnip_safe_size_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, 0xffff)
+typedef psnip_uint16_t psnip_safe_size_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_size_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_size_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (PSNIP_SAFE_SIZE_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_size_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_SIZE
+#endif
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_SCHAR)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(signed char, schar)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_UCHAR)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(unsigned char, uchar)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_CHAR)
+#if CHAR_MIN == 0
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(char, char)
+#else
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(char, char)
+#endif
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_SHRT)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(short, short)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_USHRT)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(unsigned short, ushort)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_INT)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(int, int)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_UINT)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(unsigned int, uint)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_LONG)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(long, long)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_ULONG)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(unsigned long, ulong)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_LLONG)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(long long, llong)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_ULLONG)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(unsigned long long, ullong)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_SIZE)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(size_t, size)
+#endif
+
+#if !defined(PSNIP_SAFE_NO_FIXED)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(psnip_int8_t, int8)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(psnip_uint8_t, uint8)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(psnip_int16_t, int16)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(psnip_uint16_t, uint16)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(psnip_int32_t, int32)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(psnip_uint32_t, uint32)
+#if defined(PSNIP_SAFE_HAVE_128)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(psnip_int64_t, int64)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(psnip_uint64_t, uint64)
+#endif
+#endif
+
+#endif /* !defined(PSNIP_SAFE_NO_PROMOTIONS) */
+
+#define PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(T, name, op_name) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_##op_name(T* res, T a, T b) { \
+ return !__builtin_##op_name##_overflow(a, b, res); \
+ }
+
+#define PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(T, name, op_name, min, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_##op_name(T* res, T a, T b) { \
+ const psnip_safe_##name##_larger r = psnip_safe_larger_##name##_##op_name(a, b); \
+ *res = (T) r; \
+ return (r >= min) && (r <= max); \
+ }
+
+#define PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(T, name, op_name, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_##op_name(T* res, T a, T b) { \
+ const psnip_safe_##name##_larger r = psnip_safe_larger_##name##_##op_name(a, b); \
+ *res = (T) r; \
+ return (r <= max); \
+ }
+
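+/* The portable fallbacks below validate the operands *before* performing the
+   signed operation, so signed overflow (undefined behaviour in C) never
+   actually occurs. */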
+#define PSNIP_SAFE_DEFINE_SIGNED_ADD(T, name, min, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_add (T* res, T a, T b) { \
+ psnip_safe_bool r = !( ((b > 0) && (a > (max - b))) || \
+ ((b < 0) && (a < (min - b))) ); \
+ if(PSNIP_SAFE_LIKELY(r)) \
+ *res = a + b; \
+ return r; \
+ }
+
+#define PSNIP_SAFE_DEFINE_UNSIGNED_ADD(T, name, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_add (T* res, T a, T b) { \
+ *res = (T) (a + b); \
+ return !PSNIP_SAFE_UNLIKELY((b > 0) && (a > (max - b))); \
+ }
+
+#define PSNIP_SAFE_DEFINE_SIGNED_SUB(T, name, min, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_sub (T* res, T a, T b) { \
+ psnip_safe_bool r = !((b > 0 && a < (min + b)) || \
+ (b < 0 && a > (max + b))); \
+ if(PSNIP_SAFE_LIKELY(r)) \
+ *res = a - b; \
+ return r; \
+ }
+
+#define PSNIP_SAFE_DEFINE_UNSIGNED_SUB(T, name, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_sub (T* res, T a, T b) { \
+ *res = a - b; \
+ return !PSNIP_SAFE_UNLIKELY(b > a); \
+ }
+
+#define PSNIP_SAFE_DEFINE_SIGNED_MUL(T, name, min, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_mul (T* res, T a, T b) { \
+ psnip_safe_bool r = 1; \
+ if (a > 0) { \
+ if (b > 0) { \
+ if (a > (max / b)) { \
+ r = 0; \
+ } \
+ } else { \
+ if (b < (min / a)) { \
+ r = 0; \
+ } \
+ } \
+ } else { \
+ if (b > 0) { \
+ if (a < (min / b)) { \
+ r = 0; \
+ } \
+ } else { \
+ if ( (a != 0) && (b < (max / a))) { \
+ r = 0; \
+ } \
+ } \
+ } \
+ if(PSNIP_SAFE_LIKELY(r)) \
+ *res = a * b; \
+ return r; \
+ }
+
+#define PSNIP_SAFE_DEFINE_UNSIGNED_MUL(T, name, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_mul (T* res, T a, T b) { \
+ *res = (T) (a * b); \
+ return !PSNIP_SAFE_UNLIKELY((a > 0) && (b > 0) && (a > (max / b))); \
+ }
+
+#define PSNIP_SAFE_DEFINE_SIGNED_DIV(T, name, min, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_div (T* res, T a, T b) { \
+ if (PSNIP_SAFE_UNLIKELY(b == 0)) { \
+ *res = 0; \
+ return 0; \
+ } else if (PSNIP_SAFE_UNLIKELY(a == min && b == -1)) { \
+ *res = min; \
+ return 0; \
+ } else { \
+ *res = (T) (a / b); \
+ return 1; \
+ } \
+ }
+
+#define PSNIP_SAFE_DEFINE_UNSIGNED_DIV(T, name, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_div (T* res, T a, T b) { \
+ if (PSNIP_SAFE_UNLIKELY(b == 0)) { \
+ *res = 0; \
+ return 0; \
+ } else { \
+ *res = a / b; \
+ return 1; \
+ } \
+ }
+
+#define PSNIP_SAFE_DEFINE_SIGNED_MOD(T, name, min, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_mod (T* res, T a, T b) { \
+ if (PSNIP_SAFE_UNLIKELY(b == 0)) { \
+ *res = 0; \
+ return 0; \
+ } else if (PSNIP_SAFE_UNLIKELY(a == min && b == -1)) { \
+ *res = min; \
+ return 0; \
+ } else { \
+ *res = (T) (a % b); \
+ return 1; \
+ } \
+ }
+
+#define PSNIP_SAFE_DEFINE_UNSIGNED_MOD(T, name, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_mod (T* res, T a, T b) { \
+ if (PSNIP_SAFE_UNLIKELY(b == 0)) { \
+ *res = 0; \
+ return 0; \
+ } else { \
+ *res = a % b; \
+ return 1; \
+ } \
+ }
+
+#define PSNIP_SAFE_DEFINE_SIGNED_NEG(T, name, min, max) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_neg (T* res, T value) { \
+ psnip_safe_bool r = value != min; \
+ *res = PSNIP_SAFE_LIKELY(r) ? -value : max; \
+ return r; \
+ }
+
+#define PSNIP_SAFE_DEFINE_INTSAFE(T, name, op, isf) \
+ PSNIP_SAFE__FUNCTION psnip_safe_bool \
+ psnip_safe_##name##_##op (T* res, T a, T b) { \
+ return isf(a, b, res) == S_OK; \
+ }
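+
+/* Whichever implementation is selected, every generated function follows the
+   same pattern (editor's illustration; the int variants are defined further
+   below; handle_overflow() is a placeholder for the caller's error handling):
+
+     int sum;
+     if (!psnip_safe_int_add(&sum, a, b))
+       handle_overflow();
+
+   On overflow the functions return 0 and the stored result, if any, must not
+   be relied upon. */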
+
+#if CHAR_MIN == 0
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_CHAR)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(char, char, add, CHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(char, char, sub, CHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(char, char, mul, CHAR_MAX)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(char, char, CHAR_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(char, char, CHAR_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(char, char, CHAR_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(char, char, CHAR_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(char, char, CHAR_MAX)
+#else /* CHAR_MIN != 0 */
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_CHAR)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(char, char, add, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(char, char, sub, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(char, char, mul, CHAR_MIN, CHAR_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(char, char, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(char, char, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(char, char, CHAR_MIN, CHAR_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(char, char, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(char, char, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(char, char, CHAR_MIN, CHAR_MAX)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(signed char, schar, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(signed char, schar, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(signed char, schar, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_SCHAR)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(signed char, schar, add, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(signed char, schar, sub, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(signed char, schar, mul, SCHAR_MIN, SCHAR_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned char, uchar, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned char, uchar, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned char, uchar, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UCHAR)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned char, uchar, add, UCHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned char, uchar, sub, UCHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned char, uchar, mul, UCHAR_MAX)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(unsigned char, uchar, UCHAR_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(unsigned char, uchar, UCHAR_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(unsigned char, uchar, UCHAR_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(unsigned char, uchar, UCHAR_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(unsigned char, uchar, UCHAR_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(short, short, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(short, short, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(short, short, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_SHORT)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(short, short, add, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(short, short, sub, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(short, short, mul, SHRT_MIN, SHRT_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(short, short, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(short, short, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(short, short, SHRT_MIN, SHRT_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(short, short, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(short, short, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(short, short, SHRT_MIN, SHRT_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned short, ushort, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned short, ushort, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned short, ushort, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned short, ushort, add, UShortAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned short, ushort, sub, UShortSub)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned short, ushort, mul, UShortMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_USHORT)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned short, ushort, add, USHRT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned short, ushort, sub, USHRT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned short, ushort, mul, USHRT_MAX)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(unsigned short, ushort, USHRT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(unsigned short, ushort, USHRT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(unsigned short, ushort, USHRT_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(unsigned short, ushort, USHRT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(unsigned short, ushort, USHRT_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(int, int, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(int, int, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(int, int, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_INT)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(int, int, add, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(int, int, sub, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(int, int, mul, INT_MIN, INT_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(int, int, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(int, int, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(int, int, INT_MIN, INT_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(int, int, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(int, int, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(int, int, INT_MIN, INT_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned int, uint, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned int, uint, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned int, uint, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned int, uint, add, UIntAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned int, uint, sub, UIntSub)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned int, uint, mul, UIntMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UINT)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned int, uint, add, UINT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned int, uint, sub, UINT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned int, uint, mul, UINT_MAX)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(unsigned int, uint, UINT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(unsigned int, uint, UINT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(unsigned int, uint, UINT_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(unsigned int, uint, UINT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(unsigned int, uint, UINT_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long, long, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long, long, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long, long, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_LONG)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long, long, add, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long, long, sub, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long, long, mul, LONG_MIN, LONG_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(long, long, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(long, long, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(long, long, LONG_MIN, LONG_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(long, long, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(long, long, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(long, long, LONG_MIN, LONG_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long, ulong, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long, ulong, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long, ulong, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long, ulong, add, ULongAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long, ulong, sub, ULongSub)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long, ulong, mul, ULongMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_ULONG)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long, ulong, add, ULONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long, ulong, sub, ULONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long, ulong, mul, ULONG_MAX)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(unsigned long, ulong, ULONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(unsigned long, ulong, ULONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(unsigned long, ulong, ULONG_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(unsigned long, ulong, ULONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(unsigned long, ulong, ULONG_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long long, llong, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long long, llong, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long long, llong, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_LLONG)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long long, llong, add, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long long, llong, sub, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long long, llong, mul, LLONG_MIN, LLONG_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(long long, llong, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(long long, llong, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(long long, llong, LLONG_MIN, LLONG_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(long long, llong, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(long long, llong, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(long long, llong, LLONG_MIN, LLONG_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long long, ullong, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long long, ullong, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long long, ullong, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long long, ullong, add, ULongLongAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long long, ullong, sub, ULongLongSub)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long long, ullong, mul, ULongLongMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_ULLONG)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long long, ullong, add, ULLONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long long, ullong, sub, ULLONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long long, ullong, mul, ULLONG_MAX)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(unsigned long long, ullong, ULLONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(unsigned long long, ullong, ULLONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(unsigned long long, ullong, ULLONG_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(unsigned long long, ullong, ULLONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(unsigned long long, ullong, ULLONG_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(size_t, size, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(size_t, size, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(size_t, size, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H)
+PSNIP_SAFE_DEFINE_INTSAFE(size_t, size, add, SizeTAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(size_t, size, sub, SizeTSub)
+PSNIP_SAFE_DEFINE_INTSAFE(size_t, size, mul, SizeTMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_SIZE)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(size_t, size, add, PSNIP_SAFE__SIZE_MAX_RT)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(size_t, size, sub, PSNIP_SAFE__SIZE_MAX_RT)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(size_t, size, mul, PSNIP_SAFE__SIZE_MAX_RT)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(size_t, size, PSNIP_SAFE__SIZE_MAX_RT)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(size_t, size, PSNIP_SAFE__SIZE_MAX_RT)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(size_t, size, PSNIP_SAFE__SIZE_MAX_RT)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(size_t, size, PSNIP_SAFE__SIZE_MAX_RT)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(size_t, size, PSNIP_SAFE__SIZE_MAX_RT)
+
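+/* Sketch of a typical use of the size_t helpers above (illustrative):
+ * overflow-checked allocation sizing, where an unchecked count * size
+ * product could silently wrap before reaching malloc:
+ *
+ *   #include <stdlib.h>
+ *
+ *   void* checked_alloc(size_t nmemb, size_t size) {
+ *     size_t total;
+ *     if (!psnip_safe_size_mul(&total, nmemb, size))
+ *       return NULL;  -- product would exceed SIZE_MAX
+ *     return malloc(total);
+ *   }
+ */
+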
+#if !defined(PSNIP_SAFE_NO_FIXED)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int8_t, int8, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int8_t, int8, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int8_t, int8, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_INT8)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int8_t, int8, add, (-0x7fLL-1), 0x7f)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int8_t, int8, sub, (-0x7fLL-1), 0x7f)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int8_t, int8, mul, (-0x7fLL-1), 0x7f)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint8_t, uint8, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint8_t, uint8, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint8_t, uint8, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UINT8)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint8_t, uint8, add, 0xff)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint8_t, uint8, sub, 0xff)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint8_t, uint8, mul, 0xff)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(psnip_uint8_t, uint8, 0xff)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(psnip_uint8_t, uint8, 0xff)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(psnip_uint8_t, uint8, 0xff)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(psnip_uint8_t, uint8, 0xff)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(psnip_uint8_t, uint8, 0xff)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int16_t, int16, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int16_t, int16, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int16_t, int16, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_INT16)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int16_t, int16, add, (-32767-1), 0x7fff)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int16_t, int16, sub, (-32767-1), 0x7fff)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int16_t, int16, mul, (-32767-1), 0x7fff)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(psnip_int16_t, int16, (-32767-1), 0x7fff)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(psnip_int16_t, int16, (-32767-1), 0x7fff)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(psnip_int16_t, int16, (-32767-1), 0x7fff)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(psnip_int16_t, int16, (-32767-1), 0x7fff)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(psnip_int16_t, int16, (-32767-1), 0x7fff)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(psnip_int16_t, int16, (-32767-1), 0x7fff)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint16_t, uint16, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint16_t, uint16, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint16_t, uint16, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) && defined(_WIN32)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint16_t, uint16, add, UShortAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint16_t, uint16, sub, UShortSub)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint16_t, uint16, mul, UShortMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UINT16)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint16_t, uint16, add, 0xffff)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint16_t, uint16, sub, 0xffff)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint16_t, uint16, mul, 0xffff)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(psnip_uint16_t, uint16, 0xffff)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(psnip_uint16_t, uint16, 0xffff)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(psnip_uint16_t, uint16, 0xffff)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(psnip_uint16_t, uint16, 0xffff)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(psnip_uint16_t, uint16, 0xffff)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int32_t, int32, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int32_t, int32, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int32_t, int32, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_INT32)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int32_t, int32, add, (-0x7fffffffLL-1), 0x7fffffffLL)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int32_t, int32, sub, (-0x7fffffffLL-1), 0x7fffffffLL)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int32_t, int32, mul, (-0x7fffffffLL-1), 0x7fffffffLL)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint32_t, uint32, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint32_t, uint32, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint32_t, uint32, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) && defined(_WIN32)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint32_t, uint32, add, UIntAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint32_t, uint32, sub, UIntSub)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint32_t, uint32, mul, UIntMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UINT32)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint32_t, uint32, add, 0xffffffffUL)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint32_t, uint32, sub, 0xffffffffUL)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint32_t, uint32, mul, 0xffffffffUL)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(psnip_uint32_t, uint32, 0xffffffffUL)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(psnip_uint32_t, uint32, 0xffffffffUL)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(psnip_uint32_t, uint32, 0xffffffffUL)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(psnip_uint32_t, uint32, 0xffffffffUL)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(psnip_uint32_t, uint32, 0xffffffffUL)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int64_t, int64, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int64_t, int64, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int64_t, int64, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_INT64)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int64_t, int64, add, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int64_t, int64, sub, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int64_t, int64, mul, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint64_t, uint64, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint64_t, uint64, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint64_t, uint64, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) && defined(_WIN32)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint64_t, uint64, add, ULongLongAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint64_t, uint64, sub, ULongLongSub)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint64_t, uint64, mul, ULongLongMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UINT64)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint64_t, uint64, add, 0xffffffffffffffffULL)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint64_t, uint64, sub, 0xffffffffffffffffULL)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint64_t, uint64, mul, 0xffffffffffffffffULL)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(psnip_uint64_t, uint64, 0xffffffffffffffffULL)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(psnip_uint64_t, uint64, 0xffffffffffffffffULL)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(psnip_uint64_t, uint64, 0xffffffffffffffffULL)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(psnip_uint64_t, uint64, 0xffffffffffffffffULL)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(psnip_uint64_t, uint64, 0xffffffffffffffffULL)
+
+#endif /* !defined(PSNIP_SAFE_NO_FIXED) */
+
+#define PSNIP_SAFE_C11_GENERIC_SELECTION(res, op) \
+ _Generic((*res), \
+ char: psnip_safe_char_##op, \
+ unsigned char: psnip_safe_uchar_##op, \
+ short: psnip_safe_short_##op, \
+ unsigned short: psnip_safe_ushort_##op, \
+ int: psnip_safe_int_##op, \
+ unsigned int: psnip_safe_uint_##op, \
+ long: psnip_safe_long_##op, \
+ unsigned long: psnip_safe_ulong_##op, \
+ long long: psnip_safe_llong_##op, \
+ unsigned long long: psnip_safe_ullong_##op)
+
+#define PSNIP_SAFE_C11_GENERIC_BINARY_OP(op, res, a, b) \
+ PSNIP_SAFE_C11_GENERIC_SELECTION(res, op)(res, a, b)
+#define PSNIP_SAFE_C11_GENERIC_UNARY_OP(op, res, v) \
+ PSNIP_SAFE_C11_GENERIC_SELECTION(res, op)(res, v)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW)
+#define psnip_safe_add(res, a, b) !__builtin_add_overflow(a, b, res)
+#define psnip_safe_sub(res, a, b) !__builtin_sub_overflow(a, b, res)
+#define psnip_safe_mul(res, a, b) !__builtin_mul_overflow(a, b, res)
+/* No __builtin_div/mod_overflow builtins exist; use the C11 generic
+ * selection for these two operations. */
+#define psnip_safe_div(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(div, res, a, b)
+#define psnip_safe_mod(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(mod, res, a, b)
+#define psnip_safe_neg(res, v) PSNIP_SAFE_C11_GENERIC_UNARY_OP (neg, res, v)
+
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+/* There are no fixed-length or size selections because they cause an
+ * error about _Generic specifying two compatible types. Hopefully
+ * this doesn't cause problems on exotic platforms, but if it does
+ * please let me know and I'll try to figure something out. */
+
+#define psnip_safe_add(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(add, res, a, b)
+#define psnip_safe_sub(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(sub, res, a, b)
+#define psnip_safe_mul(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(mul, res, a, b)
+#define psnip_safe_div(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(div, res, a, b)
+#define psnip_safe_mod(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(mod, res, a, b)
+#define psnip_safe_neg(res, v) PSNIP_SAFE_C11_GENERIC_UNARY_OP (neg, res, v)
+#endif
+
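+/* Sketch of the type-generic front end above (illustrative; requires the
+ * compiler builtins or C11 _Generic). The result pointer's type selects
+ * the typed helper, so one spelling serves every standard integer width;
+ * handle_overflow() below is a hypothetical error handler:
+ *
+ *   long total = 0;
+ *   unsigned short count = 40000;
+ *   if (!psnip_safe_add(&total, total, (long) count))
+ *     handle_overflow();
+ */
+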
+#if !defined(PSNIP_SAFE_HAVE_BUILTINS) && (defined(PSNIP_SAFE_EMULATE_NATIVE) || defined(PSNIP_BUILTIN_EMULATE_NATIVE))
+# define __builtin_sadd_overflow(a, b, res) (!psnip_safe_int_add(res, a, b))
+# define __builtin_saddl_overflow(a, b, res) (!psnip_safe_long_add(res, a, b))
+# define __builtin_saddll_overflow(a, b, res) (!psnip_safe_llong_add(res, a, b))
+# define __builtin_uadd_overflow(a, b, res) (!psnip_safe_uint_add(res, a, b))
+# define __builtin_uaddl_overflow(a, b, res) (!psnip_safe_ulong_add(res, a, b))
+# define __builtin_uaddll_overflow(a, b, res) (!psnip_safe_ullong_add(res, a, b))
+
+# define __builtin_ssub_overflow(a, b, res) (!psnip_safe_int_sub(res, a, b))
+# define __builtin_ssubl_overflow(a, b, res) (!psnip_safe_long_sub(res, a, b))
+# define __builtin_ssubll_overflow(a, b, res) (!psnip_safe_llong_sub(res, a, b))
+# define __builtin_usub_overflow(a, b, res) (!psnip_safe_uint_sub(res, a, b))
+# define __builtin_usubl_overflow(a, b, res) (!psnip_safe_ulong_sub(res, a, b))
+# define __builtin_usubll_overflow(a, b, res) (!psnip_safe_ullong_sub(res, a, b))
+
+# define __builtin_smul_overflow(a, b, res) (!psnip_safe_int_mul(res, a, b))
+# define __builtin_smull_overflow(a, b, res) (!psnip_safe_long_mul(res, a, b))
+# define __builtin_smulll_overflow(a, b, res) (!psnip_safe_llong_mul(res, a, b))
+# define __builtin_umul_overflow(a, b, res) (!psnip_safe_uint_mul(res, a, b))
+# define __builtin_umull_overflow(a, b, res) (!psnip_safe_ulong_mul(res, a, b))
+# define __builtin_umulll_overflow(a, b, res) (!psnip_safe_ullong_mul(res, a, b))
+#endif
+
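+/* Sketch (illustrative): with PSNIP_SAFE_EMULATE_NATIVE defined and no real
+ * builtins, the mappings above let code written against the GCC/Clang
+ * interface compile unchanged. Note the inverted convention: the builtins
+ * return nonzero ON overflow, the psnip_safe_* helpers on success.
+ *
+ *   int a = 2000000000, b = 2000000000, r;
+ *   if (__builtin_sadd_overflow(a, b, &r))  -- expands to !psnip_safe_int_add
+ *     return -1;
+ */
+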
+#endif /* !defined(PSNIP_SAFE_H) */
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/string_view.hpp b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/string_view.hpp
new file mode 100644
index 00000000000..a2d5567854f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/string_view.hpp
@@ -0,0 +1,1531 @@
+// Vendored from git changeset v1.4.0
+
+// Copyright 2017-2020 by Martin Moene
+//
+// string-view lite, a C++17-like string_view for C++98 and later.
+// For more information see https://github.com/martinmoene/string-view-lite
+//
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#ifndef NONSTD_SV_LITE_H_INCLUDED
+#define NONSTD_SV_LITE_H_INCLUDED
+
+#define string_view_lite_MAJOR 1
+#define string_view_lite_MINOR 4
+#define string_view_lite_PATCH 0
+
+#define string_view_lite_VERSION nssv_STRINGIFY(string_view_lite_MAJOR) "." nssv_STRINGIFY(string_view_lite_MINOR) "." nssv_STRINGIFY(string_view_lite_PATCH)
+
+#define nssv_STRINGIFY( x ) nssv_STRINGIFY_( x )
+#define nssv_STRINGIFY_( x ) #x
+
+// string-view lite configuration:
+
+#define nssv_STRING_VIEW_DEFAULT 0
+#define nssv_STRING_VIEW_NONSTD 1
+#define nssv_STRING_VIEW_STD 2
+
+#if !defined( nssv_CONFIG_SELECT_STRING_VIEW )
+# define nssv_CONFIG_SELECT_STRING_VIEW ( nssv_HAVE_STD_STRING_VIEW ? nssv_STRING_VIEW_STD : nssv_STRING_VIEW_NONSTD )
+#endif
+
+#if defined( nssv_CONFIG_SELECT_STD_STRING_VIEW ) || defined( nssv_CONFIG_SELECT_NONSTD_STRING_VIEW )
+# error nssv_CONFIG_SELECT_STD_STRING_VIEW and nssv_CONFIG_SELECT_NONSTD_STRING_VIEW are deprecated and removed, please use nssv_CONFIG_SELECT_STRING_VIEW=nssv_STRING_VIEW_...
+#endif
+
+#ifndef nssv_CONFIG_STD_SV_OPERATOR
+# define nssv_CONFIG_STD_SV_OPERATOR 0
+#endif
+
+#ifndef nssv_CONFIG_USR_SV_OPERATOR
+# define nssv_CONFIG_USR_SV_OPERATOR 1
+#endif
+
+#ifdef nssv_CONFIG_CONVERSION_STD_STRING
+# define nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS nssv_CONFIG_CONVERSION_STD_STRING
+# define nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS nssv_CONFIG_CONVERSION_STD_STRING
+#endif
+
+#ifndef nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS
+# define nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS 1
+#endif
+
+#ifndef nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
+# define nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS 1
+#endif
+
+// Control presence of exception handling (try and auto discover):
+
+#ifndef nssv_CONFIG_NO_EXCEPTIONS
+# if defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)
+# define nssv_CONFIG_NO_EXCEPTIONS 0
+# else
+# define nssv_CONFIG_NO_EXCEPTIONS 1
+# endif
+#endif
+
+// C++ language version detection (C++20 is speculative):
+// Note: VC14.0/1900 (VS2015) lacks too much from C++14.
+
+#ifndef nssv_CPLUSPLUS
+# if defined(_MSVC_LANG ) && !defined(__clang__)
+# define nssv_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG )
+# else
+# define nssv_CPLUSPLUS __cplusplus
+# endif
+#endif
+
+#define nssv_CPP98_OR_GREATER ( nssv_CPLUSPLUS >= 199711L )
+#define nssv_CPP11_OR_GREATER ( nssv_CPLUSPLUS >= 201103L )
+#define nssv_CPP11_OR_GREATER_ ( nssv_CPLUSPLUS >= 201103L )
+#define nssv_CPP14_OR_GREATER ( nssv_CPLUSPLUS >= 201402L )
+#define nssv_CPP17_OR_GREATER ( nssv_CPLUSPLUS >= 201703L )
+#define nssv_CPP20_OR_GREATER ( nssv_CPLUSPLUS >= 202000L )
+
+// use C++17 std::string_view if available and requested:
+
+#if nssv_CPP17_OR_GREATER && defined(__has_include )
+# if __has_include( <string_view> )
+# define nssv_HAVE_STD_STRING_VIEW 1
+# else
+# define nssv_HAVE_STD_STRING_VIEW 0
+# endif
+#else
+# define nssv_HAVE_STD_STRING_VIEW 0
+#endif
+
+#define nssv_USES_STD_STRING_VIEW ( (nssv_CONFIG_SELECT_STRING_VIEW == nssv_STRING_VIEW_STD) || ((nssv_CONFIG_SELECT_STRING_VIEW == nssv_STRING_VIEW_DEFAULT) && nssv_HAVE_STD_STRING_VIEW) )
+
+#define nssv_HAVE_STARTS_WITH ( nssv_CPP20_OR_GREATER || !nssv_USES_STD_STRING_VIEW )
+#define nssv_HAVE_ENDS_WITH nssv_HAVE_STARTS_WITH
+
+//
+// Use C++17 std::string_view:
+//
+
+#if nssv_USES_STD_STRING_VIEW
+
+#include <string_view>
+
+// Extensions for std::string:
+
+#if nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
+
+namespace nonstd {
+
+template< class CharT, class Traits, class Allocator = std::allocator<CharT> >
+std::basic_string<CharT, Traits, Allocator>
+to_string( std::basic_string_view<CharT, Traits> v, Allocator const & a = Allocator() )
+{
+ return std::basic_string<CharT,Traits, Allocator>( v.begin(), v.end(), a );
+}
+
+template< class CharT, class Traits, class Allocator >
+std::basic_string_view<CharT, Traits>
+to_string_view( std::basic_string<CharT, Traits, Allocator> const & s )
+{
+ return std::basic_string_view<CharT, Traits>( s.data(), s.size() );
+}
+
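+// Usage sketch (illustrative): round-tripping between std::string and the
+// aliased string_view via the free functions above:
+//
+//   std::string s = "hello";
+//   nonstd::string_view v = nonstd::to_string_view( s );  // borrows s's buffer
+//   std::string copy = nonstd::to_string( v );            // owns its own buffer
+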
+// Literal operators sv and _sv:
+
+#if nssv_CONFIG_STD_SV_OPERATOR
+
+using namespace std::literals::string_view_literals;
+
+#endif
+
+#if nssv_CONFIG_USR_SV_OPERATOR
+
+inline namespace literals {
+inline namespace string_view_literals {
+
+
+constexpr std::string_view operator "" _sv( const char* str, size_t len ) noexcept // (1)
+{
+ return std::string_view{ str, len };
+}
+
+constexpr std::u16string_view operator "" _sv( const char16_t* str, size_t len ) noexcept // (2)
+{
+ return std::u16string_view{ str, len };
+}
+
+constexpr std::u32string_view operator "" _sv( const char32_t* str, size_t len ) noexcept // (3)
+{
+ return std::u32string_view{ str, len };
+}
+
+constexpr std::wstring_view operator "" _sv( const wchar_t* str, size_t len ) noexcept // (4)
+{
+ return std::wstring_view{ str, len };
+}
+
+}} // namespace literals::string_view_literals
+
+#endif // nssv_CONFIG_USR_SV_OPERATOR
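+
+// Usage sketch (illustrative): with nssv_CONFIG_USR_SV_OPERATOR enabled
+// (the default, see above), the _sv suffix builds a view over a literal
+// without allocating:
+//
+//   using namespace nonstd::literals::string_view_literals;
+//   auto v = "hello"_sv;  // std::string_view of length 5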
+
+} // namespace nonstd
+
+#endif // nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
+
+namespace nonstd {
+
+using std::string_view;
+using std::wstring_view;
+using std::u16string_view;
+using std::u32string_view;
+using std::basic_string_view;
+
+// literal "sv" and "_sv", see above
+
+using std::operator==;
+using std::operator!=;
+using std::operator<;
+using std::operator<=;
+using std::operator>;
+using std::operator>=;
+
+using std::operator<<;
+
+} // namespace nonstd
+
+#else // nssv_HAVE_STD_STRING_VIEW
+
+//
+// Before C++17: use string_view lite:
+//
+
+// Compiler versions:
+//
+// MSVC++ 6.0 _MSC_VER == 1200 nssv_COMPILER_MSVC_VERSION == 60 (Visual Studio 6.0)
+// MSVC++ 7.0 _MSC_VER == 1300 nssv_COMPILER_MSVC_VERSION == 70 (Visual Studio .NET 2002)
+// MSVC++ 7.1 _MSC_VER == 1310 nssv_COMPILER_MSVC_VERSION == 71 (Visual Studio .NET 2003)
+// MSVC++ 8.0 _MSC_VER == 1400 nssv_COMPILER_MSVC_VERSION == 80 (Visual Studio 2005)
+// MSVC++ 9.0 _MSC_VER == 1500 nssv_COMPILER_MSVC_VERSION == 90 (Visual Studio 2008)
+// MSVC++ 10.0 _MSC_VER == 1600 nssv_COMPILER_MSVC_VERSION == 100 (Visual Studio 2010)
+// MSVC++ 11.0 _MSC_VER == 1700 nssv_COMPILER_MSVC_VERSION == 110 (Visual Studio 2012)
+// MSVC++ 12.0 _MSC_VER == 1800 nssv_COMPILER_MSVC_VERSION == 120 (Visual Studio 2013)
+// MSVC++ 14.0 _MSC_VER == 1900 nssv_COMPILER_MSVC_VERSION == 140 (Visual Studio 2015)
+// MSVC++ 14.1 _MSC_VER >= 1910 nssv_COMPILER_MSVC_VERSION == 141 (Visual Studio 2017)
+// MSVC++ 14.2 _MSC_VER >= 1920 nssv_COMPILER_MSVC_VERSION == 142 (Visual Studio 2019)
+
+#if defined(_MSC_VER ) && !defined(__clang__)
+# define nssv_COMPILER_MSVC_VER (_MSC_VER )
+# define nssv_COMPILER_MSVC_VERSION (_MSC_VER / 10 - 10 * ( 5 + (_MSC_VER < 1900 ) ) )
+#else
+# define nssv_COMPILER_MSVC_VER 0
+# define nssv_COMPILER_MSVC_VERSION 0
+#endif
+
+#define nssv_COMPILER_VERSION( major, minor, patch ) ( 10 * ( 10 * (major) + (minor) ) + (patch) )
+
+#if defined(__clang__)
+# define nssv_COMPILER_CLANG_VERSION nssv_COMPILER_VERSION(__clang_major__, __clang_minor__, __clang_patchlevel__)
+#else
+# define nssv_COMPILER_CLANG_VERSION 0
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+# define nssv_COMPILER_GNUC_VERSION nssv_COMPILER_VERSION(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
+#else
+# define nssv_COMPILER_GNUC_VERSION 0
+#endif
+
+// half-open range [lo..hi):
+#define nssv_BETWEEN( v, lo, hi ) ( (lo) <= (v) && (v) < (hi) )
+
+// Presence of language and library features:
+
+#ifdef _HAS_CPP0X
+# define nssv_HAS_CPP0X _HAS_CPP0X
+#else
+# define nssv_HAS_CPP0X 0
+#endif
+
+// Unless defined otherwise below, consider VC14 as C++11 for string-view lite:
+
+#if nssv_COMPILER_MSVC_VER >= 1900
+# undef nssv_CPP11_OR_GREATER
+# define nssv_CPP11_OR_GREATER 1
+#endif
+
+#define nssv_CPP11_90 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1500)
+#define nssv_CPP11_100 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1600)
+#define nssv_CPP11_110 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1700)
+#define nssv_CPP11_120 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1800)
+#define nssv_CPP11_140 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1900)
+#define nssv_CPP11_141 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1910)
+
+#define nssv_CPP14_000 (nssv_CPP14_OR_GREATER)
+#define nssv_CPP17_000 (nssv_CPP17_OR_GREATER)
+
+// Presence of C++11 language features:
+
+#define nssv_HAVE_CONSTEXPR_11 nssv_CPP11_140
+#define nssv_HAVE_EXPLICIT_CONVERSION nssv_CPP11_140
+#define nssv_HAVE_INLINE_NAMESPACE nssv_CPP11_140
+#define nssv_HAVE_NOEXCEPT nssv_CPP11_140
+#define nssv_HAVE_NULLPTR nssv_CPP11_100
+#define nssv_HAVE_REF_QUALIFIER nssv_CPP11_140
+#define nssv_HAVE_UNICODE_LITERALS nssv_CPP11_140
+#define nssv_HAVE_USER_DEFINED_LITERALS nssv_CPP11_140
+#define nssv_HAVE_WCHAR16_T nssv_CPP11_100
+#define nssv_HAVE_WCHAR32_T nssv_CPP11_100
+
+#if ! ( ( nssv_CPP11_OR_GREATER && nssv_COMPILER_CLANG_VERSION ) || nssv_BETWEEN( nssv_COMPILER_CLANG_VERSION, 300, 400 ) )
+# define nssv_HAVE_STD_DEFINED_LITERALS nssv_CPP11_140
+#else
+# define nssv_HAVE_STD_DEFINED_LITERALS 0
+#endif
+
+// Presence of C++14 language features:
+
+#define nssv_HAVE_CONSTEXPR_14 nssv_CPP14_000
+
+// Presence of C++17 language features:
+
+#define nssv_HAVE_NODISCARD nssv_CPP17_000
+
+// Presence of C++ library features:
+
+#define nssv_HAVE_STD_HASH nssv_CPP11_120
+
+// C++ feature usage:
+
+#if nssv_HAVE_CONSTEXPR_11
+# define nssv_constexpr constexpr
+#else
+# define nssv_constexpr /*constexpr*/
+#endif
+
+#if nssv_HAVE_CONSTEXPR_14
+# define nssv_constexpr14 constexpr
+#else
+# define nssv_constexpr14 /*constexpr*/
+#endif
+
+#if nssv_HAVE_EXPLICIT_CONVERSION
+# define nssv_explicit explicit
+#else
+# define nssv_explicit /*explicit*/
+#endif
+
+#if nssv_HAVE_INLINE_NAMESPACE
+# define nssv_inline_ns inline
+#else
+# define nssv_inline_ns /*inline*/
+#endif
+
+#if nssv_HAVE_NOEXCEPT
+# define nssv_noexcept noexcept
+#else
+# define nssv_noexcept /*noexcept*/
+#endif
+
+//#if nssv_HAVE_REF_QUALIFIER
+//# define nssv_ref_qual &
+//# define nssv_refref_qual &&
+//#else
+//# define nssv_ref_qual /*&*/
+//# define nssv_refref_qual /*&&*/
+//#endif
+
+#if nssv_HAVE_NULLPTR
+# define nssv_nullptr nullptr
+#else
+# define nssv_nullptr NULL
+#endif
+
+#if nssv_HAVE_NODISCARD
+# define nssv_nodiscard [[nodiscard]]
+#else
+# define nssv_nodiscard /*[[nodiscard]]*/
+#endif
+
+// Additional includes:
+
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <ostream>
+#include <string> // std::char_traits<>
+
+#if ! nssv_CONFIG_NO_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#if nssv_CPP11_OR_GREATER
+# include <type_traits>
+#endif
+
+// Clang, GNUC, MSVC warning suppression macros:
+
+#if defined(__clang__)
+# pragma clang diagnostic ignored "-Wreserved-user-defined-literal"
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wuser-defined-literals"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wliteral-suffix"
+#endif // __clang__
+
+#if nssv_COMPILER_MSVC_VERSION >= 140
+# define nssv_SUPPRESS_MSGSL_WARNING(expr) [[gsl::suppress(expr)]]
+# define nssv_SUPPRESS_MSVC_WARNING(code, descr) __pragma(warning(suppress: code) )
+# define nssv_DISABLE_MSVC_WARNINGS(codes) __pragma(warning(push)) __pragma(warning(disable: codes))
+#else
+# define nssv_SUPPRESS_MSGSL_WARNING(expr)
+# define nssv_SUPPRESS_MSVC_WARNING(code, descr)
+# define nssv_DISABLE_MSVC_WARNINGS(codes)
+#endif
+
+#if defined(__clang__)
+# define nssv_RESTORE_WARNINGS() _Pragma("clang diagnostic pop")
+#elif defined(__GNUC__)
+# define nssv_RESTORE_WARNINGS() _Pragma("GCC diagnostic pop")
+#elif nssv_COMPILER_MSVC_VERSION >= 140
+# define nssv_RESTORE_WARNINGS() __pragma(warning(pop ))
+#else
+# define nssv_RESTORE_WARNINGS()
+#endif
+
+// Suppress the following MSVC (GSL) warnings:
+// - C4455, non-gsl : 'operator ""sv': literal suffix identifiers that do not
+// start with an underscore are reserved
+// - C26472, gsl::t.1 : don't use a static_cast for arithmetic conversions;
+//   use brace initialization, gsl::narrow_cast or gsl::narrow
+// - C26481: gsl::b.1 : don't use pointer arithmetic. Use span instead
+
+nssv_DISABLE_MSVC_WARNINGS( 4455 26481 26472 )
+//nssv_DISABLE_CLANG_WARNINGS( "-Wuser-defined-literals" )
+//nssv_DISABLE_GNUC_WARNINGS( -Wliteral-suffix )
+
+namespace nonstd { namespace sv_lite {
+
+#if nssv_CPP11_OR_GREATER
+
+namespace detail {
+
+#if nssv_CPP14_OR_GREATER
+
+template< typename CharT >
+inline constexpr std::size_t length( CharT * s, std::size_t result = 0 )
+{
+ CharT * v = s;
+ std::size_t r = result;
+ while ( *v != '\0' ) {
+ ++v;
+ ++r;
+ }
+ return r;
+}
+
+#else // nssv_CPP14_OR_GREATER
+
+// Expect tail call optimization to make length() non-recursive:
+
+template< typename CharT >
+inline constexpr std::size_t length( CharT * s, std::size_t result = 0 )
+{
+ return *s == '\0' ? result : length( s + 1, result + 1 );
+}
+
+#endif // nssv_CPP14_OR_GREATER
+
+} // namespace detail
+
+#endif // nssv_CPP11_OR_GREATER
+
+template
+<
+ class CharT,
+ class Traits = std::char_traits<CharT>
+>
+class basic_string_view;
+
+//
+// basic_string_view:
+//
+
+template
+<
+ class CharT,
+ class Traits /* = std::char_traits<CharT> */
+>
+class basic_string_view
+{
+public:
+ // Member types:
+
+ typedef Traits traits_type;
+ typedef CharT value_type;
+
+ typedef CharT * pointer;
+ typedef CharT const * const_pointer;
+ typedef CharT & reference;
+ typedef CharT const & const_reference;
+
+ typedef const_pointer iterator;
+ typedef const_pointer const_iterator;
+ typedef std::reverse_iterator< const_iterator > reverse_iterator;
+ typedef std::reverse_iterator< const_iterator > const_reverse_iterator;
+
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ // 24.4.2.1 Construction and assignment:
+
+ nssv_constexpr basic_string_view() nssv_noexcept
+ : data_( nssv_nullptr )
+ , size_( 0 )
+ {}
+
+#if nssv_CPP11_OR_GREATER
+ nssv_constexpr basic_string_view( basic_string_view const & other ) nssv_noexcept = default;
+#else
+ nssv_constexpr basic_string_view( basic_string_view const & other ) nssv_noexcept
+ : data_( other.data_)
+ , size_( other.size_)
+ {}
+#endif
+
+ nssv_constexpr basic_string_view( CharT const * s, size_type count ) nssv_noexcept // non-standard noexcept
+ : data_( s )
+ , size_( count )
+ {}
+
+ nssv_constexpr basic_string_view( CharT const * s) nssv_noexcept // non-standard noexcept
+ : data_( s )
+#if nssv_CPP17_OR_GREATER
+ , size_( Traits::length(s) )
+#elif nssv_CPP11_OR_GREATER
+ , size_( detail::length(s) )
+#else
+ , size_( Traits::length(s) )
+#endif
+ {}
+
+ // Assignment:
+
+#if nssv_CPP11_OR_GREATER
+ nssv_constexpr14 basic_string_view & operator=( basic_string_view const & other ) nssv_noexcept = default;
+#else
+ nssv_constexpr14 basic_string_view & operator=( basic_string_view const & other ) nssv_noexcept
+ {
+ data_ = other.data_;
+ size_ = other.size_;
+ return *this;
+ }
+#endif
+
+ // 24.4.2.2 Iterator support:
+
+ nssv_constexpr const_iterator begin() const nssv_noexcept { return data_; }
+ nssv_constexpr const_iterator end() const nssv_noexcept { return data_ + size_; }
+
+ nssv_constexpr const_iterator cbegin() const nssv_noexcept { return begin(); }
+ nssv_constexpr const_iterator cend() const nssv_noexcept { return end(); }
+
+ nssv_constexpr const_reverse_iterator rbegin() const nssv_noexcept { return const_reverse_iterator( end() ); }
+ nssv_constexpr const_reverse_iterator rend() const nssv_noexcept { return const_reverse_iterator( begin() ); }
+
+ nssv_constexpr const_reverse_iterator crbegin() const nssv_noexcept { return rbegin(); }
+ nssv_constexpr const_reverse_iterator crend() const nssv_noexcept { return rend(); }
+
+ // 24.4.2.3 Capacity:
+
+ nssv_constexpr size_type size() const nssv_noexcept { return size_; }
+ nssv_constexpr size_type length() const nssv_noexcept { return size_; }
+ nssv_constexpr size_type max_size() const nssv_noexcept { return (std::numeric_limits< size_type >::max)(); }
+
+ // since C++20
+ nssv_nodiscard nssv_constexpr bool empty() const nssv_noexcept
+ {
+ return 0 == size_;
+ }
+
+ // 24.4.2.4 Element access:
+
+ nssv_constexpr const_reference operator[]( size_type pos ) const
+ {
+ return data_at( pos );
+ }
+
+ nssv_constexpr14 const_reference at( size_type pos ) const
+ {
+#if nssv_CONFIG_NO_EXCEPTIONS
+ assert( pos < size() );
+#else
+ if ( pos >= size() )
+ {
+ throw std::out_of_range("nonstd::string_view::at()");
+ }
+#endif
+ return data_at( pos );
+ }
+
+ nssv_constexpr const_reference front() const { return data_at( 0 ); }
+ nssv_constexpr const_reference back() const { return data_at( size() - 1 ); }
+
+ nssv_constexpr const_pointer data() const nssv_noexcept { return data_; }
+
+ // 24.4.2.5 Modifiers:
+
+ nssv_constexpr14 void remove_prefix( size_type n )
+ {
+ assert( n <= size() );
+ data_ += n;
+ size_ -= n;
+ }
+
+ nssv_constexpr14 void remove_suffix( size_type n )
+ {
+ assert( n <= size() );
+ size_ -= n;
+ }
+
+ nssv_constexpr14 void swap( basic_string_view & other ) nssv_noexcept
+ {
+ using std::swap;
+ swap( data_, other.data_ );
+ swap( size_, other.size_ );
+ }
+
+ // 24.4.2.6 String operations:
+
+ size_type copy( CharT * dest, size_type n, size_type pos = 0 ) const
+ {
+#if nssv_CONFIG_NO_EXCEPTIONS
+ assert( pos <= size() );
+#else
+ if ( pos > size() )
+ {
+ throw std::out_of_range("nonstd::string_view::copy()");
+ }
+#endif
+ const size_type rlen = (std::min)( n, size() - pos );
+
+ (void) Traits::copy( dest, data() + pos, rlen );
+
+ return rlen;
+ }
+
+ nssv_constexpr14 basic_string_view substr( size_type pos = 0, size_type n = npos ) const
+ {
+#if nssv_CONFIG_NO_EXCEPTIONS
+ assert( pos <= size() );
+#else
+ if ( pos > size() )
+ {
+ throw std::out_of_range("nonstd::string_view::substr()");
+ }
+#endif
+ return basic_string_view( data() + pos, (std::min)( n, size() - pos ) );
+ }
+
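+    // Usage sketch (illustrative): as with std::string_view::substr, pos is
+    // range-checked but n is clamped, so an oversized count is harmless:
+    //
+    //   nonstd::string_view v( "abcdef" );
+    //   v.substr( 2 );       // "cdef"
+    //   v.substr( 2, 100 );  // also "cdef": n clamped to size() - pos
+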
+ // compare(), 6x:
+
+ nssv_constexpr14 int compare( basic_string_view other ) const nssv_noexcept // (1)
+ {
+ if ( const int result = Traits::compare( data(), other.data(), (std::min)( size(), other.size() ) ) )
+ {
+ return result;
+ }
+
+ return size() == other.size() ? 0 : size() < other.size() ? -1 : 1;
+ }
+
+ nssv_constexpr int compare( size_type pos1, size_type n1, basic_string_view other ) const // (2)
+ {
+ return substr( pos1, n1 ).compare( other );
+ }
+
+ nssv_constexpr int compare( size_type pos1, size_type n1, basic_string_view other, size_type pos2, size_type n2 ) const // (3)
+ {
+ return substr( pos1, n1 ).compare( other.substr( pos2, n2 ) );
+ }
+
+ nssv_constexpr int compare( CharT const * s ) const // (4)
+ {
+ return compare( basic_string_view( s ) );
+ }
+
+ nssv_constexpr int compare( size_type pos1, size_type n1, CharT const * s ) const // (5)
+ {
+ return substr( pos1, n1 ).compare( basic_string_view( s ) );
+ }
+
+ nssv_constexpr int compare( size_type pos1, size_type n1, CharT const * s, size_type n2 ) const // (6)
+ {
+ return substr( pos1, n1 ).compare( basic_string_view( s, n2 ) );
+ }
+
+ // 24.4.2.7 Searching:
+
+ // starts_with(), 3x, since C++20:
+
+ nssv_constexpr bool starts_with( basic_string_view v ) const nssv_noexcept // (1)
+ {
+ return size() >= v.size() && compare( 0, v.size(), v ) == 0;
+ }
+
+ nssv_constexpr bool starts_with( CharT c ) const nssv_noexcept // (2)
+ {
+ return starts_with( basic_string_view( &c, 1 ) );
+ }
+
+ nssv_constexpr bool starts_with( CharT const * s ) const // (3)
+ {
+ return starts_with( basic_string_view( s ) );
+ }
+
+ // ends_with(), 3x, since C++20:
+
+ nssv_constexpr bool ends_with( basic_string_view v ) const nssv_noexcept // (1)
+ {
+ return size() >= v.size() && compare( size() - v.size(), npos, v ) == 0;
+ }
+
+ nssv_constexpr bool ends_with( CharT c ) const nssv_noexcept // (2)
+ {
+ return ends_with( basic_string_view( &c, 1 ) );
+ }
+
+ nssv_constexpr bool ends_with( CharT const * s ) const // (3)
+ {
+ return ends_with( basic_string_view( s ) );
+ }
+
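+    // Usage sketch (illustrative): the C++20-style prefix/suffix predicates
+    // above are usable here on earlier standards as well:
+    //
+    //   nonstd::string_view path( "photo.jpeg" );
+    //   path.starts_with( "photo" );  // true
+    //   path.ends_with( ".jpeg" );    // true
+    //   path.ends_with( 'g' );        // true
+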
+ // find(), 4x:
+
+ nssv_constexpr14 size_type find( basic_string_view v, size_type pos = 0 ) const nssv_noexcept // (1)
+ {
+ return assert( v.size() == 0 || v.data() != nssv_nullptr )
+ , pos >= size()
+ ? npos
+ : to_pos( std::search( cbegin() + pos, cend(), v.cbegin(), v.cend(), Traits::eq ) );
+ }
+
+ nssv_constexpr14 size_type find( CharT c, size_type pos = 0 ) const nssv_noexcept // (2)
+ {
+ return find( basic_string_view( &c, 1 ), pos );
+ }
+
+ nssv_constexpr14 size_type find( CharT const * s, size_type pos, size_type n ) const // (3)
+ {
+ return find( basic_string_view( s, n ), pos );
+ }
+
+ nssv_constexpr14 size_type find( CharT const * s, size_type pos = 0 ) const // (4)
+ {
+ return find( basic_string_view( s ), pos );
+ }
+
+ // rfind(), 4x:
+
+ nssv_constexpr14 size_type rfind( basic_string_view v, size_type pos = npos ) const nssv_noexcept // (1)
+ {
+ if ( size() < v.size() )
+ {
+ return npos;
+ }
+
+ if ( v.empty() )
+ {
+ return (std::min)( size(), pos );
+ }
+
+ const_iterator last = cbegin() + (std::min)( size() - v.size(), pos ) + v.size();
+ const_iterator result = std::find_end( cbegin(), last, v.cbegin(), v.cend(), Traits::eq );
+
+ return result != last ? size_type( result - cbegin() ) : npos;
+ }
+
+ nssv_constexpr14 size_type rfind( CharT c, size_type pos = npos ) const nssv_noexcept // (2)
+ {
+ return rfind( basic_string_view( &c, 1 ), pos );
+ }
+
+ nssv_constexpr14 size_type rfind( CharT const * s, size_type pos, size_type n ) const // (3)
+ {
+ return rfind( basic_string_view( s, n ), pos );
+ }
+
+ nssv_constexpr14 size_type rfind( CharT const * s, size_type pos = npos ) const // (4)
+ {
+ return rfind( basic_string_view( s ), pos );
+ }
+
+ // find_first_of(), 4x:
+
+ nssv_constexpr size_type find_first_of( basic_string_view v, size_type pos = 0 ) const nssv_noexcept // (1)
+ {
+ return pos >= size()
+ ? npos
+ : to_pos( std::find_first_of( cbegin() + pos, cend(), v.cbegin(), v.cend(), Traits::eq ) );
+ }
+
+ nssv_constexpr size_type find_first_of( CharT c, size_type pos = 0 ) const nssv_noexcept // (2)
+ {
+ return find_first_of( basic_string_view( &c, 1 ), pos );
+ }
+
+ nssv_constexpr size_type find_first_of( CharT const * s, size_type pos, size_type n ) const // (3)
+ {
+ return find_first_of( basic_string_view( s, n ), pos );
+ }
+
+ nssv_constexpr size_type find_first_of( CharT const * s, size_type pos = 0 ) const // (4)
+ {
+ return find_first_of( basic_string_view( s ), pos );
+ }
+
+ // find_last_of(), 4x:
+
+ nssv_constexpr size_type find_last_of( basic_string_view v, size_type pos = npos ) const nssv_noexcept // (1)
+ {
+ return empty()
+ ? npos
+ : pos >= size()
+ ? find_last_of( v, size() - 1 )
+ : to_pos( std::find_first_of( const_reverse_iterator( cbegin() + pos + 1 ), crend(), v.cbegin(), v.cend(), Traits::eq ) );
+ }
+
+ nssv_constexpr size_type find_last_of( CharT c, size_type pos = npos ) const nssv_noexcept // (2)
+ {
+ return find_last_of( basic_string_view( &c, 1 ), pos );
+ }
+
+ nssv_constexpr size_type find_last_of( CharT const * s, size_type pos, size_type count ) const // (3)
+ {
+ return find_last_of( basic_string_view( s, count ), pos );
+ }
+
+ nssv_constexpr size_type find_last_of( CharT const * s, size_type pos = npos ) const // (4)
+ {
+ return find_last_of( basic_string_view( s ), pos );
+ }
+
+ // find_first_not_of(), 4x:
+
+ nssv_constexpr size_type find_first_not_of( basic_string_view v, size_type pos = 0 ) const nssv_noexcept // (1)
+ {
+ return pos >= size()
+ ? npos
+ : to_pos( std::find_if( cbegin() + pos, cend(), not_in_view( v ) ) );
+ }
+
+ nssv_constexpr size_type find_first_not_of( CharT c, size_type pos = 0 ) const nssv_noexcept // (2)
+ {
+ return find_first_not_of( basic_string_view( &c, 1 ), pos );
+ }
+
+ nssv_constexpr size_type find_first_not_of( CharT const * s, size_type pos, size_type count ) const // (3)
+ {
+ return find_first_not_of( basic_string_view( s, count ), pos );
+ }
+
+ nssv_constexpr size_type find_first_not_of( CharT const * s, size_type pos = 0 ) const // (4)
+ {
+ return find_first_not_of( basic_string_view( s ), pos );
+ }
+
+ // find_last_not_of(), 4x:
+
+ nssv_constexpr size_type find_last_not_of( basic_string_view v, size_type pos = npos ) const nssv_noexcept // (1)
+ {
+ return empty()
+ ? npos
+ : pos >= size()
+ ? find_last_not_of( v, size() - 1 )
+ : to_pos( std::find_if( const_reverse_iterator( cbegin() + pos + 1 ), crend(), not_in_view( v ) ) );
+ }
+
+ nssv_constexpr size_type find_last_not_of( CharT c, size_type pos = npos ) const nssv_noexcept // (2)
+ {
+ return find_last_not_of( basic_string_view( &c, 1 ), pos );
+ }
+
+ nssv_constexpr size_type find_last_not_of( CharT const * s, size_type pos, size_type count ) const // (3)
+ {
+ return find_last_not_of( basic_string_view( s, count ), pos );
+ }
+
+ nssv_constexpr size_type find_last_not_of( CharT const * s, size_type pos = npos ) const // (4)
+ {
+ return find_last_not_of( basic_string_view( s ), pos );
+ }
+
+ // Constants:
+
+#if nssv_CPP17_OR_GREATER
+ static nssv_constexpr size_type npos = size_type(-1);
+#elif nssv_CPP11_OR_GREATER
+ enum : size_type { npos = size_type(-1) };
+#else
+ enum { npos = size_type(-1) };
+#endif
+
+private:
+ struct not_in_view
+ {
+ const basic_string_view v;
+
+ nssv_constexpr explicit not_in_view( basic_string_view v_ ) : v( v_ ) {}
+
+ nssv_constexpr bool operator()( CharT c ) const
+ {
+ return npos == v.find_first_of( c );
+ }
+ };
+
+ nssv_constexpr size_type to_pos( const_iterator it ) const
+ {
+ return it == cend() ? npos : size_type( it - cbegin() );
+ }
+
+ nssv_constexpr size_type to_pos( const_reverse_iterator it ) const
+ {
+ return it == crend() ? npos : size_type( crend() - it - 1 );
+ }
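+    // Worked example (illustrative) for the reverse mapping above: for a
+    // view over "abc", crend() - crbegin() == 3, so a reverse iterator at
+    // 'c' (i.e. crbegin()) maps to forward index 3 - 1 == 2.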
+
+ nssv_constexpr const_reference data_at( size_type pos ) const
+ {
+#if nssv_BETWEEN( nssv_COMPILER_GNUC_VERSION, 1, 500 )
+ return data_[pos];
+#else
+ return assert( pos < size() ), data_[pos];
+#endif
+ }
+
+private:
+ const_pointer data_;
+ size_type size_;
+
+public:
+#if nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS
+
+ template< class Allocator >
+ basic_string_view( std::basic_string<CharT, Traits, Allocator> const & s ) nssv_noexcept
+ : data_( s.data() )
+ , size_( s.size() )
+ {}
+
+#if nssv_HAVE_EXPLICIT_CONVERSION
+
+ template< class Allocator >
+ explicit operator std::basic_string<CharT, Traits, Allocator>() const
+ {
+ return to_string( Allocator() );
+ }
+
+#endif // nssv_HAVE_EXPLICIT_CONVERSION
+
+#if nssv_CPP11_OR_GREATER
+
+ template< class Allocator = std::allocator<CharT> >
+ std::basic_string<CharT, Traits, Allocator>
+ to_string( Allocator const & a = Allocator() ) const
+ {
+ return std::basic_string<CharT, Traits, Allocator>( begin(), end(), a );
+ }
+
+#else
+
+ std::basic_string<CharT, Traits>
+ to_string() const
+ {
+ return std::basic_string<CharT, Traits>( begin(), end() );
+ }
+
+ template< class Allocator >
+ std::basic_string<CharT, Traits, Allocator>
+ to_string( Allocator const & a ) const
+ {
+ return std::basic_string<CharT, Traits, Allocator>( begin(), end(), a );
+ }
+
+#endif // nssv_CPP11_OR_GREATER
+
+#endif // nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS
+};
+
+//
+// Non-member functions:
+//
+
+// 24.4.3 Non-member comparison functions:
+// lexicographically compare two string views (function template):
+
+template< class CharT, class Traits >
+nssv_constexpr bool operator== (
+ basic_string_view <CharT, Traits> lhs,
+ basic_string_view <CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) == 0 ; }
+
+template< class CharT, class Traits >
+nssv_constexpr bool operator!= (
+ basic_string_view <CharT, Traits> lhs,
+ basic_string_view <CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) != 0 ; }
+
+template< class CharT, class Traits >
+nssv_constexpr bool operator< (
+ basic_string_view <CharT, Traits> lhs,
+ basic_string_view <CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) < 0 ; }
+
+template< class CharT, class Traits >
+nssv_constexpr bool operator<= (
+ basic_string_view <CharT, Traits> lhs,
+ basic_string_view <CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) <= 0 ; }
+
+template< class CharT, class Traits >
+nssv_constexpr bool operator> (
+ basic_string_view <CharT, Traits> lhs,
+ basic_string_view <CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) > 0 ; }
+
+template< class CharT, class Traits >
+nssv_constexpr bool operator>= (
+ basic_string_view <CharT, Traits> lhs,
+ basic_string_view <CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) >= 0 ; }
+
+// Let S be basic_string_view<CharT, Traits>, and sv be an instance of S.
+// Implementations shall provide sufficient additional overloads marked
+// constexpr and noexcept so that an object t with an implicit conversion
+// to S can be compared according to Table 67.
+
+#if ! nssv_CPP11_OR_GREATER || nssv_BETWEEN( nssv_COMPILER_MSVC_VERSION, 100, 141 )
+
+// accommodate older compilers:
+
+// ==
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator==(
+ basic_string_view<CharT, Traits> lhs,
+ CharT const * rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) == 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator==(
+ CharT const * lhs,
+ basic_string_view<CharT, Traits> rhs ) nssv_noexcept
+{ return rhs.compare( lhs ) == 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator==(
+ basic_string_view<CharT, Traits> lhs,
+ std::basic_string<CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator==(
+ std::basic_string<CharT, Traits> rhs,
+ basic_string_view<CharT, Traits> lhs ) nssv_noexcept
+{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; }
+
+// !=
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator!=(
+ basic_string_view<CharT, Traits> lhs,
+ char const * rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) != 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator!=(
+ char const * lhs,
+ basic_string_view<CharT, Traits> rhs ) nssv_noexcept
+{ return rhs.compare( lhs ) != 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator!=(
+ basic_string_view<CharT, Traits> lhs,
+ std::basic_string<CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.size() != rhs.size() || lhs.compare( rhs ) != 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator!=(
+ std::basic_string<CharT, Traits> rhs,
+ basic_string_view<CharT, Traits> lhs ) nssv_noexcept
+{ return lhs.size() != rhs.size() || rhs.compare( lhs ) != 0; }
+
+// <
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator<(
+ basic_string_view<CharT, Traits> lhs,
+ char const * rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) < 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator<(
+ char const * lhs,
+ basic_string_view<CharT, Traits> rhs ) nssv_noexcept
+{ return rhs.compare( lhs ) > 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator<(
+ basic_string_view<CharT, Traits> lhs,
+ std::basic_string<CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) < 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator<(
+ std::basic_string<CharT, Traits> rhs,
+ basic_string_view<CharT, Traits> lhs ) nssv_noexcept
+{ return rhs.compare( lhs ) > 0; }
+
+// <=
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator<=(
+ basic_string_view<CharT, Traits> lhs,
+    CharT const * rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) <= 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator<=(
+    CharT const * lhs,
+ basic_string_view<CharT, Traits> rhs ) nssv_noexcept
+{ return rhs.compare( lhs ) >= 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator<=(
+ basic_string_view<CharT, Traits> lhs,
+ std::basic_string<CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) <= 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator<=(
+ std::basic_string<CharT, Traits> rhs,
+ basic_string_view<CharT, Traits> lhs ) nssv_noexcept
+{ return rhs.compare( lhs ) >= 0; }
+
+// >
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator>(
+ basic_string_view<CharT, Traits> lhs,
+    CharT const * rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) > 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator>(
+    CharT const * lhs,
+ basic_string_view<CharT, Traits> rhs ) nssv_noexcept
+{ return rhs.compare( lhs ) < 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator>(
+ basic_string_view<CharT, Traits> lhs,
+ std::basic_string<CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) > 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator>(
+ std::basic_string<CharT, Traits> rhs,
+ basic_string_view<CharT, Traits> lhs ) nssv_noexcept
+{ return rhs.compare( lhs ) < 0; }
+
+// >=
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator>=(
+ basic_string_view<CharT, Traits> lhs,
+    CharT const * rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) >= 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator>=(
+    CharT const * lhs,
+ basic_string_view<CharT, Traits> rhs ) nssv_noexcept
+{ return rhs.compare( lhs ) <= 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator>=(
+ basic_string_view<CharT, Traits> lhs,
+ std::basic_string<CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) >= 0; }
+
+template< class CharT, class Traits>
+nssv_constexpr bool operator>=(
+ std::basic_string<CharT, Traits> rhs,
+ basic_string_view<CharT, Traits> lhs ) nssv_noexcept
+{ return rhs.compare( lhs ) <= 0; }
+
+#else // newer compilers:
+
+#define nssv_BASIC_STRING_VIEW_I(T,U) typename std::decay< basic_string_view<T,U> >::type
+
+#if nssv_BETWEEN( nssv_COMPILER_MSVC_VERSION, 140, 150 )
+# define nssv_MSVC_ORDER(x) , int=x
+#else
+# define nssv_MSVC_ORDER(x) /*, int=x*/
+#endif
+
+// ==
+
+template< class CharT, class Traits nssv_MSVC_ORDER(1) >
+nssv_constexpr bool operator==(
+ basic_string_view <CharT, Traits> lhs,
+ nssv_BASIC_STRING_VIEW_I(CharT, Traits) rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) == 0; }
+
+template< class CharT, class Traits nssv_MSVC_ORDER(2) >
+nssv_constexpr bool operator==(
+ nssv_BASIC_STRING_VIEW_I(CharT, Traits) lhs,
+ basic_string_view <CharT, Traits> rhs ) nssv_noexcept
+{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; }
+
+// !=
+
+template< class CharT, class Traits nssv_MSVC_ORDER(1) >
+nssv_constexpr bool operator!= (
+ basic_string_view < CharT, Traits > lhs,
+ nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept
+{ return lhs.size() != rhs.size() || lhs.compare( rhs ) != 0 ; }
+
+template< class CharT, class Traits nssv_MSVC_ORDER(2) >
+nssv_constexpr bool operator!= (
+ nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs,
+ basic_string_view < CharT, Traits > rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) != 0 ; }
+
+// <
+
+template< class CharT, class Traits nssv_MSVC_ORDER(1) >
+nssv_constexpr bool operator< (
+ basic_string_view < CharT, Traits > lhs,
+ nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) < 0 ; }
+
+template< class CharT, class Traits nssv_MSVC_ORDER(2) >
+nssv_constexpr bool operator< (
+ nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs,
+ basic_string_view < CharT, Traits > rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) < 0 ; }
+
+// <=
+
+template< class CharT, class Traits nssv_MSVC_ORDER(1) >
+nssv_constexpr bool operator<= (
+ basic_string_view < CharT, Traits > lhs,
+ nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) <= 0 ; }
+
+template< class CharT, class Traits nssv_MSVC_ORDER(2) >
+nssv_constexpr bool operator<= (
+ nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs,
+ basic_string_view < CharT, Traits > rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) <= 0 ; }
+
+// >
+
+template< class CharT, class Traits nssv_MSVC_ORDER(1) >
+nssv_constexpr bool operator> (
+ basic_string_view < CharT, Traits > lhs,
+ nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) > 0 ; }
+
+template< class CharT, class Traits nssv_MSVC_ORDER(2) >
+nssv_constexpr bool operator> (
+ nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs,
+ basic_string_view < CharT, Traits > rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) > 0 ; }
+
+// >=
+
+template< class CharT, class Traits nssv_MSVC_ORDER(1) >
+nssv_constexpr bool operator>= (
+ basic_string_view < CharT, Traits > lhs,
+ nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) >= 0 ; }
+
+template< class CharT, class Traits nssv_MSVC_ORDER(2) >
+nssv_constexpr bool operator>= (
+ nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs,
+ basic_string_view < CharT, Traits > rhs ) nssv_noexcept
+{ return lhs.compare( rhs ) >= 0 ; }
+
+#undef nssv_MSVC_ORDER
+#undef nssv_BASIC_STRING_VIEW_I
+
+#endif // compiler-dependent approach to comparisons
+
+// 24.4.4 Inserters and extractors:
+
+namespace detail {
+
+template< class Stream >
+void write_padding( Stream & os, std::streamsize n )
+{
+ for ( std::streamsize i = 0; i < n; ++i )
+ os.rdbuf()->sputc( os.fill() );
+}
+
+template< class Stream, class View >
+Stream & write_to_stream( Stream & os, View const & sv )
+{
+ typename Stream::sentry sentry( os );
+
+ if ( !os )
+ return os;
+
+ const std::streamsize length = static_cast<std::streamsize>( sv.length() );
+
+ // Whether, and how, to pad:
+ const bool pad = ( length < os.width() );
+ const bool left_pad = pad && ( os.flags() & std::ios_base::adjustfield ) == std::ios_base::right;
+
+ if ( left_pad )
+ write_padding( os, os.width() - length );
+
+  // Write the view's characters:
+ os.rdbuf()->sputn( sv.begin(), length );
+
+ if ( pad && !left_pad )
+ write_padding( os, os.width() - length );
+
+ // Reset output stream width:
+ os.width( 0 );
+
+ return os;
+}
+
+} // namespace detail
+
+template< class CharT, class Traits >
+std::basic_ostream<CharT, Traits> &
+operator<<(
+ std::basic_ostream<CharT, Traits>& os,
+ basic_string_view <CharT, Traits> sv )
+{
+ return detail::write_to_stream( os, sv );
+}
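+
+// Example (illustrative): the inserter honors the stream's width, fill and
+// adjustfield settings, then resets the width, as for std::basic_string:
+//
+//   nonstd::string_view sv( "abc" );
+//   std::cout << std::setw( 6 ) << sv;               // "   abc"
+//   std::cout << std::left << std::setw( 6 ) << sv;  // "abc   "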
+
+// Several typedefs for common character types are provided:
+
+typedef basic_string_view<char> string_view;
+typedef basic_string_view<wchar_t> wstring_view;
+#if nssv_HAVE_WCHAR16_T
+typedef basic_string_view<char16_t> u16string_view;
+typedef basic_string_view<char32_t> u32string_view;
+#endif
+
+}} // namespace nonstd::sv_lite
+
+//
+// 24.4.6 Suffix for basic_string_view literals:
+//
+
+#if nssv_HAVE_USER_DEFINED_LITERALS
+
+namespace nonstd {
+nssv_inline_ns namespace literals {
+nssv_inline_ns namespace string_view_literals {
+
+#if nssv_CONFIG_STD_SV_OPERATOR && nssv_HAVE_STD_DEFINED_LITERALS
+
+nssv_constexpr nonstd::sv_lite::string_view operator "" sv( const char* str, size_t len ) nssv_noexcept // (1)
+{
+ return nonstd::sv_lite::string_view{ str, len };
+}
+
+nssv_constexpr nonstd::sv_lite::u16string_view operator "" sv( const char16_t* str, size_t len ) nssv_noexcept // (2)
+{
+ return nonstd::sv_lite::u16string_view{ str, len };
+}
+
+nssv_constexpr nonstd::sv_lite::u32string_view operator "" sv( const char32_t* str, size_t len ) nssv_noexcept // (3)
+{
+ return nonstd::sv_lite::u32string_view{ str, len };
+}
+
+nssv_constexpr nonstd::sv_lite::wstring_view operator "" sv( const wchar_t* str, size_t len ) nssv_noexcept // (4)
+{
+ return nonstd::sv_lite::wstring_view{ str, len };
+}
+
+#endif // nssv_CONFIG_STD_SV_OPERATOR && nssv_HAVE_STD_DEFINED_LITERALS
+
+#if nssv_CONFIG_USR_SV_OPERATOR
+
+nssv_constexpr nonstd::sv_lite::string_view operator "" _sv( const char* str, size_t len ) nssv_noexcept // (1)
+{
+ return nonstd::sv_lite::string_view{ str, len };
+}
+
+nssv_constexpr nonstd::sv_lite::u16string_view operator "" _sv( const char16_t* str, size_t len ) nssv_noexcept // (2)
+{
+ return nonstd::sv_lite::u16string_view{ str, len };
+}
+
+nssv_constexpr nonstd::sv_lite::u32string_view operator "" _sv( const char32_t* str, size_t len ) nssv_noexcept // (3)
+{
+ return nonstd::sv_lite::u32string_view{ str, len };
+}
+
+nssv_constexpr nonstd::sv_lite::wstring_view operator "" _sv( const wchar_t* str, size_t len ) nssv_noexcept // (4)
+{
+ return nonstd::sv_lite::wstring_view{ str, len };
+}
+
+#endif // nssv_CONFIG_USR_SV_OPERATOR
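+
+// Example (illustrative, assuming the corresponding configuration macros are
+// enabled):
+//
+//   using namespace nonstd::literals::string_view_literals;
+//   auto a = "hello"sv;    // with nssv_CONFIG_STD_SV_OPERATOR
+//   auto b = "hello"_sv;   // with nssv_CONFIG_USR_SV_OPERATOR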
+
+}}} // namespace nonstd::literals::string_view_literals
+
+#endif
+
+//
+// Extensions for std::string:
+//
+
+#if nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
+
+namespace nonstd {
+namespace sv_lite {
+
+// Exclude MSVC 14 (19.00): it yields ambiguous to_string():
+
+#if nssv_CPP11_OR_GREATER && nssv_COMPILER_MSVC_VERSION != 140
+
+template< class CharT, class Traits, class Allocator = std::allocator<CharT> >
+std::basic_string<CharT, Traits, Allocator>
+to_string( basic_string_view<CharT, Traits> v, Allocator const & a = Allocator() )
+{
+ return std::basic_string<CharT,Traits, Allocator>( v.begin(), v.end(), a );
+}
+
+#else
+
+template< class CharT, class Traits >
+std::basic_string<CharT, Traits>
+to_string( basic_string_view<CharT, Traits> v )
+{
+ return std::basic_string<CharT, Traits>( v.begin(), v.end() );
+}
+
+template< class CharT, class Traits, class Allocator >
+std::basic_string<CharT, Traits, Allocator>
+to_string( basic_string_view<CharT, Traits> v, Allocator const & a )
+{
+ return std::basic_string<CharT, Traits, Allocator>( v.begin(), v.end(), a );
+}
+
+#endif // nssv_CPP11_OR_GREATER
+
+template< class CharT, class Traits, class Allocator >
+basic_string_view<CharT, Traits>
+to_string_view( std::basic_string<CharT, Traits, Allocator> const & s )
+{
+ return basic_string_view<CharT, Traits>( s.data(), s.size() );
+}
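+
+// Example (illustrative) of round-tripping between the two types:
+//
+//   nonstd::string_view sv( "abc" );
+//   std::string         s  = nonstd::sv_lite::to_string( sv );
+//   nonstd::string_view v2 = nonstd::sv_lite::to_string_view( s );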
+
+}} // namespace nonstd::sv_lite
+
+#endif // nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
+
+//
+// make types and algorithms available in namespace nonstd:
+//
+
+namespace nonstd {
+
+using sv_lite::basic_string_view;
+using sv_lite::string_view;
+using sv_lite::wstring_view;
+
+#if nssv_HAVE_WCHAR16_T
+using sv_lite::u16string_view;
+#endif
+#if nssv_HAVE_WCHAR32_T
+using sv_lite::u32string_view;
+#endif
+
+// literal "sv"
+
+using sv_lite::operator==;
+using sv_lite::operator!=;
+using sv_lite::operator<;
+using sv_lite::operator<=;
+using sv_lite::operator>;
+using sv_lite::operator>=;
+
+using sv_lite::operator<<;
+
+#if nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
+using sv_lite::to_string;
+using sv_lite::to_string_view;
+#endif
+
+} // namespace nonstd
+
+// 24.4.5 Hash support (C++11):
+
+// Note: The hash value of a string view object is equal to the hash value of
+// the corresponding string object.
+
+#if nssv_HAVE_STD_HASH
+
+#include <functional>
+
+namespace std {
+
+template<>
+struct hash< nonstd::string_view >
+{
+public:
+ std::size_t operator()( nonstd::string_view v ) const nssv_noexcept
+ {
+ return std::hash<std::string>()( std::string( v.data(), v.size() ) );
+ }
+};
+
+template<>
+struct hash< nonstd::wstring_view >
+{
+public:
+ std::size_t operator()( nonstd::wstring_view v ) const nssv_noexcept
+ {
+ return std::hash<std::wstring>()( std::wstring( v.data(), v.size() ) );
+ }
+};
+
+template<>
+struct hash< nonstd::u16string_view >
+{
+public:
+ std::size_t operator()( nonstd::u16string_view v ) const nssv_noexcept
+ {
+ return std::hash<std::u16string>()( std::u16string( v.data(), v.size() ) );
+ }
+};
+
+template<>
+struct hash< nonstd::u32string_view >
+{
+public:
+ std::size_t operator()( nonstd::u32string_view v ) const nssv_noexcept
+ {
+ return std::hash<std::u32string>()( std::u32string( v.data(), v.size() ) );
+ }
+};
+
+} // namespace std
+
+#endif // nssv_HAVE_STD_HASH
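+
+// Example (illustrative): with the hash specializations above, string views
+// can key the standard unordered containers directly:
+//
+//   std::unordered_map<nonstd::string_view, int> counts;
+//   counts[ nonstd::string_view( "abc" ) ] = 1;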
+
+nssv_RESTORE_WARNINGS()
+
+#endif // nssv_HAVE_STD_STRING_VIEW
+#endif // NONSTD_SV_LITE_H_INCLUDED
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/strptime.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/strptime.h
new file mode 100644
index 00000000000..764a4440ee4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/strptime.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <time.h>
+
+#include "arrow/util/visibility.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// A less featureful implementation of strptime() for platforms lacking
+// a standard implementation (e.g. Windows).
+ARROW_EXPORT char* arrow_strptime(const char* __restrict, const char* __restrict,
+ struct tm* __restrict);
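+
+// Example (hypothetical usage; assumes the "%Y-%m-%d" directives are among
+// those this implementation supports):
+//
+//   struct tm tm = {0};
+//   arrow_strptime("2022-02-07", "%Y-%m-%d", &tm);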
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/README.md b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/README.md
new file mode 100644
index 00000000000..c0abfd7d11d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/README.md
@@ -0,0 +1,28 @@
+<!---
+ Boost Software License - Version 1.0 - August 17th, 2003
+
+ Permission is hereby granted, free of charge, to any person or organization
+ obtaining a copy of the software and accompanying documentation covered by
+ this license (the "Software") to use, reproduce, display, distribute,
+ execute, and transmit the Software, and to prepare derivative works of the
+ Software, and to permit third-parties to whom the Software is furnished to
+ do so, all subject to the following:
+
+ The copyright notices in the Software and this entire statement, including
+ the above license grant, this restriction and the following disclaimer,
+ must be included in all copies of the Software, in whole or in part, and
+ all derivative works of the Software, unless such copies or derivative
+ works are solely in the form of machine-executable object code generated by
+ a source language processor.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+ SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+ FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+-->
+
+The files in this directory are vendored from utfcpp git tag v3.1.1
+(https://github.com/nemtrif/utfcpp).
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/checked.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/checked.h
new file mode 100644
index 00000000000..648636e4686
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/checked.h
@@ -0,0 +1,333 @@
+// Copyright 2006-2016 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "core.h"
+#include <stdexcept>
+
+namespace utf8
+{
+ // Base for the exceptions that may be thrown from the library
+ class exception : public ::std::exception {
+ };
+
+ // Exceptions that may be thrown from the library functions.
+ class invalid_code_point : public exception {
+ uint32_t cp;
+ public:
+ invalid_code_point(uint32_t codepoint) : cp(codepoint) {}
+ virtual const char* what() const NOEXCEPT OVERRIDE { return "Invalid code point"; }
+ uint32_t code_point() const {return cp;}
+ };
+
+ class invalid_utf8 : public exception {
+ uint8_t u8;
+ public:
+ invalid_utf8 (uint8_t u) : u8(u) {}
+ virtual const char* what() const NOEXCEPT OVERRIDE { return "Invalid UTF-8"; }
+ uint8_t utf8_octet() const {return u8;}
+ };
+
+ class invalid_utf16 : public exception {
+ uint16_t u16;
+ public:
+ invalid_utf16 (uint16_t u) : u16(u) {}
+ virtual const char* what() const NOEXCEPT OVERRIDE { return "Invalid UTF-16"; }
+ uint16_t utf16_word() const {return u16;}
+ };
+
+ class not_enough_room : public exception {
+ public:
+ virtual const char* what() const NOEXCEPT OVERRIDE { return "Not enough space"; }
+ };
+
+ /// The library API - functions intended to be called by the users
+
+ template <typename octet_iterator>
+ octet_iterator append(uint32_t cp, octet_iterator result)
+ {
+ if (!utf8::internal::is_code_point_valid(cp))
+ throw invalid_code_point(cp);
+
+ if (cp < 0x80) // one octet
+ *(result++) = static_cast<uint8_t>(cp);
+ else if (cp < 0x800) { // two octets
+ *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
+ *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+ }
+ else if (cp < 0x10000) { // three octets
+ *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
+ *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
+ *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+ }
+ else { // four octets
+ *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
+ *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
+ *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
+ *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+ }
+ return result;
+ }
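+
+    // Example (illustrative): encode U+1F600 into a std::string; code points
+    // at or above 0x10000 take the four-octet branch above:
+    //
+    //   std::string s;
+    //   utf8::append(0x1f600, std::back_inserter(s));  // s.size() == 4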
+
+ template <typename octet_iterator, typename output_iterator>
+ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+ {
+ while (start != end) {
+ octet_iterator sequence_start = start;
+ internal::utf_error err_code = utf8::internal::validate_next(start, end);
+ switch (err_code) {
+ case internal::UTF8_OK :
+ for (octet_iterator it = sequence_start; it != start; ++it)
+ *out++ = *it;
+ break;
+ case internal::NOT_ENOUGH_ROOM:
+ out = utf8::append (replacement, out);
+ start = end;
+ break;
+ case internal::INVALID_LEAD:
+ out = utf8::append (replacement, out);
+ ++start;
+ break;
+ case internal::INCOMPLETE_SEQUENCE:
+ case internal::OVERLONG_SEQUENCE:
+ case internal::INVALID_CODE_POINT:
+ out = utf8::append (replacement, out);
+ ++start;
+ // just one replacement mark for the sequence
+ while (start != end && utf8::internal::is_trail(*start))
+ ++start;
+ break;
+ }
+ }
+ return out;
+ }
+
+ template <typename octet_iterator, typename output_iterator>
+ inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+ {
+ static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
+ return utf8::replace_invalid(start, end, out, replacement_marker);
+ }
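+
+    // Example (illustrative): sanitize a byte sequence, substituting U+FFFD
+    // for each invalid subsequence:
+    //
+    //   std::string bad( "ab\xfa" );
+    //   std::string fixed;
+    //   utf8::replace_invalid(bad.begin(), bad.end(), std::back_inserter(fixed));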
+
+ template <typename octet_iterator>
+ uint32_t next(octet_iterator& it, octet_iterator end)
+ {
+ uint32_t cp = 0;
+ internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
+ switch (err_code) {
+ case internal::UTF8_OK :
+ break;
+ case internal::NOT_ENOUGH_ROOM :
+ throw not_enough_room();
+ case internal::INVALID_LEAD :
+ case internal::INCOMPLETE_SEQUENCE :
+ case internal::OVERLONG_SEQUENCE :
+ throw invalid_utf8(*it);
+ case internal::INVALID_CODE_POINT :
+ throw invalid_code_point(cp);
+ }
+ return cp;
+ }
+
+ template <typename octet_iterator>
+ uint32_t peek_next(octet_iterator it, octet_iterator end)
+ {
+ return utf8::next(it, end);
+ }
+
+ template <typename octet_iterator>
+ uint32_t prior(octet_iterator& it, octet_iterator start)
+ {
+ // can't do much if it == start
+ if (it == start)
+ throw not_enough_room();
+
+ octet_iterator end = it;
+ // Go back until we hit either a lead octet or start
+ while (utf8::internal::is_trail(*(--it)))
+ if (it == start)
+ throw invalid_utf8(*it); // error - no lead byte in the sequence
+ return utf8::peek_next(it, end);
+ }
+
+ template <typename octet_iterator, typename distance_type>
+ void advance (octet_iterator& it, distance_type n, octet_iterator end)
+ {
+ const distance_type zero(0);
+ if (n < zero) {
+ // backward
+ for (distance_type i = n; i < zero; ++i)
+ utf8::prior(it, end);
+ } else {
+ // forward
+ for (distance_type i = zero; i < n; ++i)
+ utf8::next(it, end);
+ }
+ }
+
+ template <typename octet_iterator>
+ typename std::iterator_traits<octet_iterator>::difference_type
+ distance (octet_iterator first, octet_iterator last)
+ {
+ typename std::iterator_traits<octet_iterator>::difference_type dist;
+ for (dist = 0; first < last; ++dist)
+ utf8::next(first, last);
+ return dist;
+ }
+
+ template <typename u16bit_iterator, typename octet_iterator>
+ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+ {
+ while (start != end) {
+ uint32_t cp = utf8::internal::mask16(*start++);
+ // Take care of surrogate pairs first
+ if (utf8::internal::is_lead_surrogate(cp)) {
+ if (start != end) {
+ uint32_t trail_surrogate = utf8::internal::mask16(*start++);
+ if (utf8::internal::is_trail_surrogate(trail_surrogate))
+ cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
+ else
+ throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
+ }
+ else
+ throw invalid_utf16(static_cast<uint16_t>(cp));
+
+ }
+ // Lone trail surrogate
+ else if (utf8::internal::is_trail_surrogate(cp))
+ throw invalid_utf16(static_cast<uint16_t>(cp));
+
+ result = utf8::append(cp, result);
+ }
+ return result;
+ }
+
+ template <typename u16bit_iterator, typename octet_iterator>
+ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
+ {
+ while (start < end) {
+ uint32_t cp = utf8::next(start, end);
+ if (cp > 0xffff) { //make a surrogate pair
+ *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
+ *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+ }
+ else
+ *result++ = static_cast<uint16_t>(cp);
+ }
+ return result;
+ }
+
+ template <typename octet_iterator, typename u32bit_iterator>
+ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
+ {
+ while (start != end)
+ result = utf8::append(*(start++), result);
+
+ return result;
+ }
+
+ template <typename octet_iterator, typename u32bit_iterator>
+ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
+ {
+ while (start < end)
+ (*result++) = utf8::next(start, end);
+
+ return result;
+ }
+
+ // The iterator class
+ template <typename octet_iterator>
+ class iterator {
+ octet_iterator it;
+ octet_iterator range_start;
+ octet_iterator range_end;
+ public:
+ typedef uint32_t value_type;
+ typedef uint32_t* pointer;
+ typedef uint32_t& reference;
+ typedef std::ptrdiff_t difference_type;
+ typedef std::bidirectional_iterator_tag iterator_category;
+ iterator () {}
+ explicit iterator (const octet_iterator& octet_it,
+ const octet_iterator& rangestart,
+ const octet_iterator& rangeend) :
+ it(octet_it), range_start(rangestart), range_end(rangeend)
+ {
+ if (it < range_start || it > range_end)
+ throw std::out_of_range("Invalid utf-8 iterator position");
+ }
+ // the default "big three" are OK
+ octet_iterator base () const { return it; }
+ uint32_t operator * () const
+ {
+ octet_iterator temp = it;
+ return utf8::next(temp, range_end);
+ }
+ bool operator == (const iterator& rhs) const
+ {
+ if (range_start != rhs.range_start || range_end != rhs.range_end)
+ throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
+ return (it == rhs.it);
+ }
+ bool operator != (const iterator& rhs) const
+ {
+ return !(operator == (rhs));
+ }
+ iterator& operator ++ ()
+ {
+ utf8::next(it, range_end);
+ return *this;
+ }
+ iterator operator ++ (int)
+ {
+ iterator temp = *this;
+ utf8::next(it, range_end);
+ return temp;
+ }
+ iterator& operator -- ()
+ {
+ utf8::prior(it, range_start);
+ return *this;
+ }
+ iterator operator -- (int)
+ {
+ iterator temp = *this;
+ utf8::prior(it, range_start);
+ return temp;
+ }
+ }; // class iterator
+
+} // namespace utf8
+
+#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
+#include "cpp11.h"
+#endif // C++ 11 or later
+
+#endif //header guard
+
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/core.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/core.h
new file mode 100644
index 00000000000..244e8923112
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/core.h
@@ -0,0 +1,338 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include <iterator>
+
+// Determine the C++ standard version.
+// If the user defines UTF_CPP_CPLUSPLUS, use that.
+// Otherwise, trust the unreliable predefined macro __cplusplus
+
+#if !defined UTF_CPP_CPLUSPLUS
+ #define UTF_CPP_CPLUSPLUS __cplusplus
+#endif
+
+#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
+ #define OVERRIDE override
+ #define NOEXCEPT noexcept
+#else // C++ 98/03
+ #define OVERRIDE
+ #define NOEXCEPT throw()
+#endif // C++ 11 or later
+
+
+namespace utf8
+{
+ // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
+ // You may need to change them to match your system.
+ // These typedefs have the same names as ones from cstdint, or boost/cstdint
+ typedef unsigned char uint8_t;
+ typedef unsigned short uint16_t;
+ typedef unsigned int uint32_t;
+
+// Helper code - not intended to be directly called by the library users. May be changed at any time
+namespace internal
+{
+ // Unicode constants
+ // Leading (high) surrogates: 0xd800 - 0xdbff
+ // Trailing (low) surrogates: 0xdc00 - 0xdfff
+ const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
+ const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
+ const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
+ const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
+ const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10)
+ const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
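+    // Worked example (illustrative): U+1F600 is the UTF-16 pair 0xd83d/0xde00;
+    // with 32-bit wraparound, (0xd83d << 10) + 0xde00 + SURROGATE_OFFSET
+    // yields 0x0001f600.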
+
+ // Maximum valid value for a Unicode code point
+ const uint32_t CODE_POINT_MAX = 0x0010ffffu;
+
+ template<typename octet_type>
+ inline uint8_t mask8(octet_type oc)
+ {
+ return static_cast<uint8_t>(0xff & oc);
+ }
+ template<typename u16_type>
+ inline uint16_t mask16(u16_type oc)
+ {
+ return static_cast<uint16_t>(0xffff & oc);
+ }
+ template<typename octet_type>
+ inline bool is_trail(octet_type oc)
+ {
+ return ((utf8::internal::mask8(oc) >> 6) == 0x2);
+ }
+
+ template <typename u16>
+ inline bool is_lead_surrogate(u16 cp)
+ {
+ return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
+ }
+
+ template <typename u16>
+ inline bool is_trail_surrogate(u16 cp)
+ {
+ return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
+ }
+
+ template <typename u16>
+ inline bool is_surrogate(u16 cp)
+ {
+ return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
+ }
+
+ template <typename u32>
+ inline bool is_code_point_valid(u32 cp)
+ {
+ return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
+ }
+
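+    // The sequence length is determined by the lead octet's high bits
+    // (illustrative): 0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3,
+    // 11110xxx -> 4; any other pattern is an invalid lead (returns 0).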
+ template <typename octet_iterator>
+ inline typename std::iterator_traits<octet_iterator>::difference_type
+ sequence_length(octet_iterator lead_it)
+ {
+ uint8_t lead = utf8::internal::mask8(*lead_it);
+ if (lead < 0x80)
+ return 1;
+ else if ((lead >> 5) == 0x6)
+ return 2;
+ else if ((lead >> 4) == 0xe)
+ return 3;
+ else if ((lead >> 3) == 0x1e)
+ return 4;
+ else
+ return 0;
+ }
+
+ template <typename octet_difference_type>
+ inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
+ {
+ if (cp < 0x80) {
+ if (length != 1)
+ return true;
+ }
+ else if (cp < 0x800) {
+ if (length != 2)
+ return true;
+ }
+ else if (cp < 0x10000) {
+ if (length != 3)
+ return true;
+ }
+
+ return false;
+ }
+
+ enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
+
+ /// Helper for get_sequence_x
+ template <typename octet_iterator>
+ utf_error increase_safely(octet_iterator& it, octet_iterator end)
+ {
+ if (++it == end)
+ return NOT_ENOUGH_ROOM;
+
+ if (!utf8::internal::is_trail(*it))
+ return INCOMPLETE_SEQUENCE;
+
+ return UTF8_OK;
+ }
+
+ #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
+
+ /// get_sequence_x functions decode utf-8 sequences of the length x
+ template <typename octet_iterator>
+ utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+ {
+ if (it == end)
+ return NOT_ENOUGH_ROOM;
+
+ code_point = utf8::internal::mask8(*it);
+
+ return UTF8_OK;
+ }
+
+ template <typename octet_iterator>
+ utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+ {
+ if (it == end)
+ return NOT_ENOUGH_ROOM;
+
+ code_point = utf8::internal::mask8(*it);
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
+
+ return UTF8_OK;
+ }
+
+ template <typename octet_iterator>
+ utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+ {
+ if (it == end)
+ return NOT_ENOUGH_ROOM;
+
+ code_point = utf8::internal::mask8(*it);
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point += (*it) & 0x3f;
+
+ return UTF8_OK;
+ }
+
+ template <typename octet_iterator>
+ utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+ {
+ if (it == end)
+ return NOT_ENOUGH_ROOM;
+
+ code_point = utf8::internal::mask8(*it);
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
+
+ UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+ code_point += (*it) & 0x3f;
+
+ return UTF8_OK;
+ }
+
+ #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
+
+ template <typename octet_iterator>
+ utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+ {
+ if (it == end)
+ return NOT_ENOUGH_ROOM;
+
+ // Save the original value of it so we can go back in case of failure
+      // Of course, it does not make much sense with e.g. stream iterators
+ octet_iterator original_it = it;
+
+ uint32_t cp = 0;
+ // Determine the sequence length based on the lead octet
+ typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
+ const octet_difference_type length = utf8::internal::sequence_length(it);
+
+ // Get trail octets and calculate the code point
+ utf_error err = UTF8_OK;
+ switch (length) {
+ case 0:
+ return INVALID_LEAD;
+ case 1:
+ err = utf8::internal::get_sequence_1(it, end, cp);
+ break;
+ case 2:
+ err = utf8::internal::get_sequence_2(it, end, cp);
+ break;
+ case 3:
+ err = utf8::internal::get_sequence_3(it, end, cp);
+ break;
+ case 4:
+ err = utf8::internal::get_sequence_4(it, end, cp);
+ break;
+ }
+
+ if (err == UTF8_OK) {
+ // Decoding succeeded. Now, security checks...
+ if (utf8::internal::is_code_point_valid(cp)) {
+ if (!utf8::internal::is_overlong_sequence(cp, length)){
+ // Passed! Return here.
+ code_point = cp;
+ ++it;
+ return UTF8_OK;
+ }
+ else
+ err = OVERLONG_SEQUENCE;
+ }
+ else
+ err = INVALID_CODE_POINT;
+ }
+
+ // Failure branch - restore the original value of the iterator
+ it = original_it;
+ return err;
+ }
+
+ template <typename octet_iterator>
+ inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
+ uint32_t ignored;
+ return utf8::internal::validate_next(it, end, ignored);
+ }
+
+} // namespace internal
+
+ /// The library API - functions intended to be called by the users
+
+ // Byte order mark
+ const uint8_t bom[] = {0xef, 0xbb, 0xbf};
+
+ template <typename octet_iterator>
+ octet_iterator find_invalid(octet_iterator start, octet_iterator end)
+ {
+ octet_iterator result = start;
+ while (result != end) {
+ utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
+ if (err_code != internal::UTF8_OK)
+ return result;
+ }
+ return result;
+ }
+
+ template <typename octet_iterator>
+ inline bool is_valid(octet_iterator start, octet_iterator end)
+ {
+ return (utf8::find_invalid(start, end) == end);
+ }
+
+ template <typename octet_iterator>
+ inline bool starts_with_bom (octet_iterator it, octet_iterator end)
+ {
+ return (
+ ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
+ ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
+ ((it != end) && (utf8::internal::mask8(*it)) == bom[2])
+ );
+ }
+} // namespace utf8
+
+#endif // header guard
+
+
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/cpp11.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/cpp11.h
new file mode 100644
index 00000000000..d93961b04f8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/utfcpp/cpp11.h
@@ -0,0 +1,103 @@
+// Copyright 2018 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
+#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
+
+#include "checked.h"
+#include <string>
+
+namespace utf8
+{
+
+ inline void append(char32_t cp, std::string& s)
+ {
+ append(uint32_t(cp), std::back_inserter(s));
+ }
+
+ inline std::string utf16to8(const std::u16string& s)
+ {
+ std::string result;
+ utf16to8(s.begin(), s.end(), std::back_inserter(result));
+ return result;
+ }
+
+ inline std::u16string utf8to16(const std::string& s)
+ {
+ std::u16string result;
+ utf8to16(s.begin(), s.end(), std::back_inserter(result));
+ return result;
+ }
+
+ inline std::string utf32to8(const std::u32string& s)
+ {
+ std::string result;
+ utf32to8(s.begin(), s.end(), std::back_inserter(result));
+ return result;
+ }
+
+ inline std::u32string utf8to32(const std::string& s)
+ {
+ std::u32string result;
+ utf8to32(s.begin(), s.end(), std::back_inserter(result));
+ return result;
+ }
+
+ inline std::size_t find_invalid(const std::string& s)
+ {
+ std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
+ return (invalid == s.end()) ? std::string::npos : (invalid - s.begin());
+ }
+
+ inline bool is_valid(const std::string& s)
+ {
+ return is_valid(s.begin(), s.end());
+ }
+
+ inline std::string replace_invalid(const std::string& s, char32_t replacement)
+ {
+ std::string result;
+ replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+ return result;
+ }
+
+ inline std::string replace_invalid(const std::string& s)
+ {
+ std::string result;
+ replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+ return result;
+ }
+
+ inline bool starts_with_bom(const std::string& s)
+ {
+ return starts_with_bom(s.begin(), s.end());
+ }
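+
+    // Example (illustrative): round-trip a UTF-8 string through UTF-32:
+    //
+    //   std::string    u8text = u8"z\u00df\u6c34";
+    //   std::u32string u32    = utf8::utf8to32(u8text);
+    //   std::string    back   = utf8::utf32to8(u32);  // back == u8text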
+
+} // namespace utf8
+
+#endif // header guard
+
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc
new file mode 100644
index 00000000000..851785081c7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/visitor.h"
+
+#include <memory>
+
+#include "arrow/array.h" // IWYU pragma: keep
+#include "arrow/extension_type.h"
+#include "arrow/scalar.h" // IWYU pragma: keep
+#include "arrow/status.h"
+#include "arrow/type.h"
+
+namespace arrow {
+
+#define ARRAY_VISITOR_DEFAULT(ARRAY_CLASS) \
+ Status ArrayVisitor::Visit(const ARRAY_CLASS& array) { \
+ return Status::NotImplemented(array.type()->ToString()); \
+ }
+
+ARRAY_VISITOR_DEFAULT(NullArray)
+ARRAY_VISITOR_DEFAULT(BooleanArray)
+ARRAY_VISITOR_DEFAULT(Int8Array)
+ARRAY_VISITOR_DEFAULT(Int16Array)
+ARRAY_VISITOR_DEFAULT(Int32Array)
+ARRAY_VISITOR_DEFAULT(Int64Array)
+ARRAY_VISITOR_DEFAULT(UInt8Array)
+ARRAY_VISITOR_DEFAULT(UInt16Array)
+ARRAY_VISITOR_DEFAULT(UInt32Array)
+ARRAY_VISITOR_DEFAULT(UInt64Array)
+ARRAY_VISITOR_DEFAULT(HalfFloatArray)
+ARRAY_VISITOR_DEFAULT(FloatArray)
+ARRAY_VISITOR_DEFAULT(DoubleArray)
+ARRAY_VISITOR_DEFAULT(BinaryArray)
+ARRAY_VISITOR_DEFAULT(StringArray)
+ARRAY_VISITOR_DEFAULT(LargeBinaryArray)
+ARRAY_VISITOR_DEFAULT(LargeStringArray)
+ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray)
+ARRAY_VISITOR_DEFAULT(Date32Array)
+ARRAY_VISITOR_DEFAULT(Date64Array)
+ARRAY_VISITOR_DEFAULT(Time32Array)
+ARRAY_VISITOR_DEFAULT(Time64Array)
+ARRAY_VISITOR_DEFAULT(TimestampArray)
+ARRAY_VISITOR_DEFAULT(DayTimeIntervalArray)
+ARRAY_VISITOR_DEFAULT(MonthIntervalArray)
+ARRAY_VISITOR_DEFAULT(DurationArray)
+ARRAY_VISITOR_DEFAULT(ListArray)
+ARRAY_VISITOR_DEFAULT(LargeListArray)
+ARRAY_VISITOR_DEFAULT(MapArray)
+ARRAY_VISITOR_DEFAULT(FixedSizeListArray)
+ARRAY_VISITOR_DEFAULT(StructArray)
+ARRAY_VISITOR_DEFAULT(SparseUnionArray)
+ARRAY_VISITOR_DEFAULT(DenseUnionArray)
+ARRAY_VISITOR_DEFAULT(DictionaryArray)
+ARRAY_VISITOR_DEFAULT(Decimal128Array)
+ARRAY_VISITOR_DEFAULT(Decimal256Array)
+ARRAY_VISITOR_DEFAULT(ExtensionArray)
+
+#undef ARRAY_VISITOR_DEFAULT
+
+// ----------------------------------------------------------------------
+// Default implementations of TypeVisitor methods
+
+#define TYPE_VISITOR_DEFAULT(TYPE_CLASS) \
+ Status TypeVisitor::Visit(const TYPE_CLASS& type) { \
+ return Status::NotImplemented(type.ToString()); \
+ }
+
+TYPE_VISITOR_DEFAULT(NullType)
+TYPE_VISITOR_DEFAULT(BooleanType)
+TYPE_VISITOR_DEFAULT(Int8Type)
+TYPE_VISITOR_DEFAULT(Int16Type)
+TYPE_VISITOR_DEFAULT(Int32Type)
+TYPE_VISITOR_DEFAULT(Int64Type)
+TYPE_VISITOR_DEFAULT(UInt8Type)
+TYPE_VISITOR_DEFAULT(UInt16Type)
+TYPE_VISITOR_DEFAULT(UInt32Type)
+TYPE_VISITOR_DEFAULT(UInt64Type)
+TYPE_VISITOR_DEFAULT(HalfFloatType)
+TYPE_VISITOR_DEFAULT(FloatType)
+TYPE_VISITOR_DEFAULT(DoubleType)
+TYPE_VISITOR_DEFAULT(StringType)
+TYPE_VISITOR_DEFAULT(BinaryType)
+TYPE_VISITOR_DEFAULT(LargeStringType)
+TYPE_VISITOR_DEFAULT(LargeBinaryType)
+TYPE_VISITOR_DEFAULT(FixedSizeBinaryType)
+TYPE_VISITOR_DEFAULT(Date64Type)
+TYPE_VISITOR_DEFAULT(Date32Type)
+TYPE_VISITOR_DEFAULT(Time32Type)
+TYPE_VISITOR_DEFAULT(Time64Type)
+TYPE_VISITOR_DEFAULT(TimestampType)
+TYPE_VISITOR_DEFAULT(DayTimeIntervalType)
+TYPE_VISITOR_DEFAULT(MonthIntervalType)
+TYPE_VISITOR_DEFAULT(DurationType)
+TYPE_VISITOR_DEFAULT(Decimal128Type)
+TYPE_VISITOR_DEFAULT(Decimal256Type)
+TYPE_VISITOR_DEFAULT(ListType)
+TYPE_VISITOR_DEFAULT(LargeListType)
+TYPE_VISITOR_DEFAULT(MapType)
+TYPE_VISITOR_DEFAULT(FixedSizeListType)
+TYPE_VISITOR_DEFAULT(StructType)
+TYPE_VISITOR_DEFAULT(SparseUnionType)
+TYPE_VISITOR_DEFAULT(DenseUnionType)
+TYPE_VISITOR_DEFAULT(DictionaryType)
+TYPE_VISITOR_DEFAULT(ExtensionType)
+
+#undef TYPE_VISITOR_DEFAULT
+
+// ----------------------------------------------------------------------
+// Default implementations of ScalarVisitor methods
+
+#define SCALAR_VISITOR_DEFAULT(SCALAR_CLASS) \
+ Status ScalarVisitor::Visit(const SCALAR_CLASS& scalar) { \
+ return Status::NotImplemented( \
+ "ScalarVisitor not implemented for " ARROW_STRINGIFY(SCALAR_CLASS)); \
+ }
+
+SCALAR_VISITOR_DEFAULT(NullScalar)
+SCALAR_VISITOR_DEFAULT(BooleanScalar)
+SCALAR_VISITOR_DEFAULT(Int8Scalar)
+SCALAR_VISITOR_DEFAULT(Int16Scalar)
+SCALAR_VISITOR_DEFAULT(Int32Scalar)
+SCALAR_VISITOR_DEFAULT(Int64Scalar)
+SCALAR_VISITOR_DEFAULT(UInt8Scalar)
+SCALAR_VISITOR_DEFAULT(UInt16Scalar)
+SCALAR_VISITOR_DEFAULT(UInt32Scalar)
+SCALAR_VISITOR_DEFAULT(UInt64Scalar)
+SCALAR_VISITOR_DEFAULT(HalfFloatScalar)
+SCALAR_VISITOR_DEFAULT(FloatScalar)
+SCALAR_VISITOR_DEFAULT(DoubleScalar)
+SCALAR_VISITOR_DEFAULT(StringScalar)
+SCALAR_VISITOR_DEFAULT(BinaryScalar)
+SCALAR_VISITOR_DEFAULT(LargeStringScalar)
+SCALAR_VISITOR_DEFAULT(LargeBinaryScalar)
+SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar)
+SCALAR_VISITOR_DEFAULT(Date64Scalar)
+SCALAR_VISITOR_DEFAULT(Date32Scalar)
+SCALAR_VISITOR_DEFAULT(Time32Scalar)
+SCALAR_VISITOR_DEFAULT(Time64Scalar)
+SCALAR_VISITOR_DEFAULT(TimestampScalar)
+SCALAR_VISITOR_DEFAULT(DayTimeIntervalScalar)
+SCALAR_VISITOR_DEFAULT(MonthIntervalScalar)
+SCALAR_VISITOR_DEFAULT(DurationScalar)
+SCALAR_VISITOR_DEFAULT(Decimal128Scalar)
+SCALAR_VISITOR_DEFAULT(Decimal256Scalar)
+SCALAR_VISITOR_DEFAULT(ListScalar)
+SCALAR_VISITOR_DEFAULT(LargeListScalar)
+SCALAR_VISITOR_DEFAULT(MapScalar)
+SCALAR_VISITOR_DEFAULT(FixedSizeListScalar)
+SCALAR_VISITOR_DEFAULT(StructScalar)
+SCALAR_VISITOR_DEFAULT(DictionaryScalar)
+
+#undef SCALAR_VISITOR_DEFAULT
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h
new file mode 100644
index 00000000000..0382e461199
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class ARROW_EXPORT ArrayVisitor {
+ public:
+ virtual ~ArrayVisitor() = default;
+
+ virtual Status Visit(const NullArray& array);
+ virtual Status Visit(const BooleanArray& array);
+ virtual Status Visit(const Int8Array& array);
+ virtual Status Visit(const Int16Array& array);
+ virtual Status Visit(const Int32Array& array);
+ virtual Status Visit(const Int64Array& array);
+ virtual Status Visit(const UInt8Array& array);
+ virtual Status Visit(const UInt16Array& array);
+ virtual Status Visit(const UInt32Array& array);
+ virtual Status Visit(const UInt64Array& array);
+ virtual Status Visit(const HalfFloatArray& array);
+ virtual Status Visit(const FloatArray& array);
+ virtual Status Visit(const DoubleArray& array);
+ virtual Status Visit(const StringArray& array);
+ virtual Status Visit(const BinaryArray& array);
+ virtual Status Visit(const LargeStringArray& array);
+ virtual Status Visit(const LargeBinaryArray& array);
+ virtual Status Visit(const FixedSizeBinaryArray& array);
+ virtual Status Visit(const Date32Array& array);
+ virtual Status Visit(const Date64Array& array);
+ virtual Status Visit(const Time32Array& array);
+ virtual Status Visit(const Time64Array& array);
+ virtual Status Visit(const TimestampArray& array);
+ virtual Status Visit(const DayTimeIntervalArray& array);
+ virtual Status Visit(const MonthIntervalArray& array);
+ virtual Status Visit(const DurationArray& array);
+ virtual Status Visit(const Decimal128Array& array);
+ virtual Status Visit(const Decimal256Array& array);
+ virtual Status Visit(const ListArray& array);
+ virtual Status Visit(const LargeListArray& array);
+ virtual Status Visit(const MapArray& array);
+ virtual Status Visit(const FixedSizeListArray& array);
+ virtual Status Visit(const StructArray& array);
+ virtual Status Visit(const SparseUnionArray& array);
+ virtual Status Visit(const DenseUnionArray& array);
+ virtual Status Visit(const DictionaryArray& array);
+ virtual Status Visit(const ExtensionArray& array);
+};
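+
+// Example (illustrative): a concrete visitor overrides only the overloads it
+// handles; unhandled ones fall back to the Status::NotImplemented defaults.
+//
+//   class MyVisitor : public ArrayVisitor {
+//    public:
+//     Status Visit(const Int32Array& array) override { /*...*/ return Status::OK(); }
+//   };
+//
+//   MyVisitor visitor;
+//   ARROW_RETURN_NOT_OK(array.Accept(&visitor));  // assumes Array::Accept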
+
+class ARROW_EXPORT TypeVisitor {
+ public:
+ virtual ~TypeVisitor() = default;
+
+ virtual Status Visit(const NullType& type);
+ virtual Status Visit(const BooleanType& type);
+ virtual Status Visit(const Int8Type& type);
+ virtual Status Visit(const Int16Type& type);
+ virtual Status Visit(const Int32Type& type);
+ virtual Status Visit(const Int64Type& type);
+ virtual Status Visit(const UInt8Type& type);
+ virtual Status Visit(const UInt16Type& type);
+ virtual Status Visit(const UInt32Type& type);
+ virtual Status Visit(const UInt64Type& type);
+ virtual Status Visit(const HalfFloatType& type);
+ virtual Status Visit(const FloatType& type);
+ virtual Status Visit(const DoubleType& type);
+ virtual Status Visit(const StringType& type);
+ virtual Status Visit(const BinaryType& type);
+ virtual Status Visit(const LargeStringType& type);
+ virtual Status Visit(const LargeBinaryType& type);
+ virtual Status Visit(const FixedSizeBinaryType& type);
+ virtual Status Visit(const Date64Type& type);
+ virtual Status Visit(const Date32Type& type);
+ virtual Status Visit(const Time32Type& type);
+ virtual Status Visit(const Time64Type& type);
+ virtual Status Visit(const TimestampType& type);
+ virtual Status Visit(const MonthIntervalType& type);
+ virtual Status Visit(const DayTimeIntervalType& type);
+ virtual Status Visit(const DurationType& type);
+ virtual Status Visit(const Decimal128Type& type);
+ virtual Status Visit(const Decimal256Type& type);
+ virtual Status Visit(const ListType& type);
+ virtual Status Visit(const LargeListType& type);
+ virtual Status Visit(const MapType& type);
+ virtual Status Visit(const FixedSizeListType& type);
+ virtual Status Visit(const StructType& type);
+ virtual Status Visit(const SparseUnionType& type);
+ virtual Status Visit(const DenseUnionType& type);
+ virtual Status Visit(const DictionaryType& type);
+ virtual Status Visit(const ExtensionType& type);
+};
+
+class ARROW_EXPORT ScalarVisitor {
+ public:
+ virtual ~ScalarVisitor() = default;
+
+ virtual Status Visit(const NullScalar& scalar);
+ virtual Status Visit(const BooleanScalar& scalar);
+ virtual Status Visit(const Int8Scalar& scalar);
+ virtual Status Visit(const Int16Scalar& scalar);
+ virtual Status Visit(const Int32Scalar& scalar);
+ virtual Status Visit(const Int64Scalar& scalar);
+ virtual Status Visit(const UInt8Scalar& scalar);
+ virtual Status Visit(const UInt16Scalar& scalar);
+ virtual Status Visit(const UInt32Scalar& scalar);
+ virtual Status Visit(const UInt64Scalar& scalar);
+ virtual Status Visit(const HalfFloatScalar& scalar);
+ virtual Status Visit(const FloatScalar& scalar);
+ virtual Status Visit(const DoubleScalar& scalar);
+ virtual Status Visit(const StringScalar& scalar);
+ virtual Status Visit(const BinaryScalar& scalar);
+ virtual Status Visit(const LargeStringScalar& scalar);
+ virtual Status Visit(const LargeBinaryScalar& scalar);
+ virtual Status Visit(const FixedSizeBinaryScalar& scalar);
+ virtual Status Visit(const Date64Scalar& scalar);
+ virtual Status Visit(const Date32Scalar& scalar);
+ virtual Status Visit(const Time32Scalar& scalar);
+ virtual Status Visit(const Time64Scalar& scalar);
+ virtual Status Visit(const TimestampScalar& scalar);
+ virtual Status Visit(const DayTimeIntervalScalar& scalar);
+ virtual Status Visit(const MonthIntervalScalar& scalar);
+ virtual Status Visit(const DurationScalar& scalar);
+ virtual Status Visit(const Decimal128Scalar& scalar);
+ virtual Status Visit(const Decimal256Scalar& scalar);
+ virtual Status Visit(const ListScalar& scalar);
+ virtual Status Visit(const LargeListScalar& scalar);
+ virtual Status Visit(const MapScalar& scalar);
+ virtual Status Visit(const FixedSizeListScalar& scalar);
+ virtual Status Visit(const StructScalar& scalar);
+ virtual Status Visit(const DictionaryScalar& scalar);
+};
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h b/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h
new file mode 100644
index 00000000000..132c35aeaa1
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h
@@ -0,0 +1,449 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Private header, not to be exported
+
+#pragma once
+
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/extension_type.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/string_view.h"
+
+namespace arrow {
+
+#define ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(ACTION) \
+ ACTION(Int8); \
+ ACTION(UInt8); \
+ ACTION(Int16); \
+ ACTION(UInt16); \
+ ACTION(Int32); \
+ ACTION(UInt32); \
+ ACTION(Int64); \
+ ACTION(UInt64)
+
+#define ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION) \
+ ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(ACTION); \
+ ACTION(HalfFloat); \
+ ACTION(Float); \
+ ACTION(Double)
+
+#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \
+ ACTION(Null); \
+ ACTION(Boolean); \
+ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \
+ ACTION(String); \
+ ACTION(Binary); \
+ ACTION(LargeString); \
+ ACTION(LargeBinary); \
+ ACTION(FixedSizeBinary); \
+ ACTION(Duration); \
+ ACTION(Date32); \
+ ACTION(Date64); \
+ ACTION(Timestamp); \
+ ACTION(Time32); \
+ ACTION(Time64); \
+ ACTION(MonthInterval); \
+ ACTION(DayTimeInterval); \
+ ACTION(Decimal128); \
+ ACTION(Decimal256); \
+ ACTION(List); \
+ ACTION(LargeList); \
+ ACTION(Map); \
+ ACTION(FixedSizeList); \
+ ACTION(Struct); \
+ ACTION(SparseUnion); \
+ ACTION(DenseUnion); \
+ ACTION(Dictionary); \
+ ACTION(Extension)
+
+#define TYPE_VISIT_INLINE(TYPE_CLASS) \
+ case TYPE_CLASS##Type::type_id: \
+ return visitor->Visit(internal::checked_cast<const TYPE_CLASS##Type&>(type));
+
+template <typename VISITOR>
+inline Status VisitTypeInline(const DataType& type, VISITOR* visitor) {
+ switch (type.id()) {
+ ARROW_GENERATE_FOR_ALL_TYPES(TYPE_VISIT_INLINE);
+ default:
+ break;
+ }
+ return Status::NotImplemented("Type not implemented");
+}
+
+#undef TYPE_VISIT_INLINE
+
+#define TYPE_ID_VISIT_INLINE(TYPE_CLASS) \
+ case TYPE_CLASS##Type::type_id: { \
+ const TYPE_CLASS##Type* concrete_ptr = nullptr; \
+ return visitor->Visit(concrete_ptr); \
+ }
+
+// Calls `visitor` with a nullptr of the corresponding concrete type class
+template <typename VISITOR>
+inline Status VisitTypeIdInline(Type::type id, VISITOR* visitor) {
+ switch (id) {
+ ARROW_GENERATE_FOR_ALL_TYPES(TYPE_ID_VISIT_INLINE);
+ default:
+ break;
+ }
+ return Status::NotImplemented("Type not implemented");
+}
+
+#undef TYPE_ID_VISIT_INLINE
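+
+// Usage sketch: dispatching on a runtime Type::type id with no type instance
+// in hand. The null pointer carries only its static type; `IdProbe` is an
+// illustrative visitor, not part of the Arrow API.
+//
+//   struct IdProbe {
+//     bool is_string = false;
+//     Status Visit(const StringType*) { is_string = true; return Status::OK(); }
+//     template <typename T>
+//     Status Visit(const T*) { return Status::OK(); }  // all other type ids
+//   };
+//
+//   IdProbe probe;
+//   ARROW_RETURN_NOT_OK(VisitTypeIdInline(Type::STRING, &probe));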
+
+#define ARRAY_VISIT_INLINE(TYPE_CLASS) \
+ case TYPE_CLASS##Type::type_id: \
+ return visitor->Visit( \
+ internal::checked_cast<const typename TypeTraits<TYPE_CLASS##Type>::ArrayType&>( \
+ array));
+
+template <typename VISITOR>
+inline Status VisitArrayInline(const Array& array, VISITOR* visitor) {
+ switch (array.type_id()) {
+ ARROW_GENERATE_FOR_ALL_TYPES(ARRAY_VISIT_INLINE);
+ default:
+ break;
+ }
+ return Status::NotImplemented("Type not implemented");
+}
+
+namespace internal {
+
+template <typename T, typename Enable = void>
+struct ArrayDataInlineVisitor {};
+
+// Numeric and primitive C-compatible types
+template <typename T>
+struct ArrayDataInlineVisitor<T, enable_if_has_c_type<T>> {
+ using c_type = typename T::c_type;
+
+ template <typename ValidFunc, typename NullFunc>
+ static Status VisitStatus(const ArrayData& arr, ValidFunc&& valid_func,
+ NullFunc&& null_func) {
+ const c_type* data = arr.GetValues<c_type>(1);
+ auto visit_valid = [&](int64_t i) { return valid_func(data[i]); };
+ return VisitBitBlocks(arr.buffers[0], arr.offset, arr.length, std::move(visit_valid),
+ std::forward<NullFunc>(null_func));
+ }
+
+ template <typename ValidFunc, typename NullFunc>
+ static void VisitVoid(const ArrayData& arr, ValidFunc&& valid_func,
+ NullFunc&& null_func) {
+ const c_type* data = arr.GetValues<c_type>(1);
+ auto visit_valid = [&](int64_t i) { valid_func(data[i]); };
+ VisitBitBlocksVoid(arr.buffers[0], arr.offset, arr.length, std::move(visit_valid),
+ std::forward<NullFunc>(null_func));
+ }
+};
+
+// Boolean
+template <>
+struct ArrayDataInlineVisitor<BooleanType> {
+ using c_type = bool;
+
+ template <typename ValidFunc, typename NullFunc>
+ static Status VisitStatus(const ArrayData& arr, ValidFunc&& valid_func,
+ NullFunc&& null_func) {
+ int64_t offset = arr.offset;
+ const uint8_t* data = arr.buffers[1]->data();
+ return VisitBitBlocks(
+ arr.buffers[0], offset, arr.length,
+ [&](int64_t i) { return valid_func(BitUtil::GetBit(data, offset + i)); },
+ std::forward<NullFunc>(null_func));
+ }
+
+ template <typename ValidFunc, typename NullFunc>
+ static void VisitVoid(const ArrayData& arr, ValidFunc&& valid_func,
+ NullFunc&& null_func) {
+ int64_t offset = arr.offset;
+ const uint8_t* data = arr.buffers[1]->data();
+ VisitBitBlocksVoid(
+ arr.buffers[0], offset, arr.length,
+ [&](int64_t i) { valid_func(BitUtil::GetBit(data, offset + i)); },
+ std::forward<NullFunc>(null_func));
+ }
+};
+
+// Binary, String...
+template <typename T>
+struct ArrayDataInlineVisitor<T, enable_if_base_binary<T>> {
+ using c_type = util::string_view;
+
+ template <typename ValidFunc, typename NullFunc>
+ static Status VisitStatus(const ArrayData& arr, ValidFunc&& valid_func,
+ NullFunc&& null_func) {
+ using offset_type = typename T::offset_type;
+ constexpr char empty_value = 0;
+
+ if (arr.length == 0) {
+ return Status::OK();
+ }
+ const offset_type* offsets = arr.GetValues<offset_type>(1);
+ const char* data;
+ if (!arr.buffers[2]) {
+ data = &empty_value;
+ } else {
+ // Do not apply the array offset to the values array; the value_offsets
+ // index the non-sliced values array.
+ data = arr.GetValues<char>(2, /*absolute_offset=*/0);
+ }
+ offset_type cur_offset = *offsets++;
+ return VisitBitBlocks(
+ arr.buffers[0], arr.offset, arr.length,
+ [&](int64_t i) {
+ ARROW_UNUSED(i);
+ auto value = util::string_view(data + cur_offset, *offsets - cur_offset);
+ cur_offset = *offsets++;
+ return valid_func(value);
+ },
+ [&]() {
+ cur_offset = *offsets++;
+ return null_func();
+ });
+ }
+
+ template <typename ValidFunc, typename NullFunc>
+ static void VisitVoid(const ArrayData& arr, ValidFunc&& valid_func,
+ NullFunc&& null_func) {
+ using offset_type = typename T::offset_type;
+ constexpr uint8_t empty_value = 0;
+
+ if (arr.length == 0) {
+ return;
+ }
+ const offset_type* offsets = arr.GetValues<offset_type>(1);
+ const uint8_t* data;
+ if (!arr.buffers[2]) {
+ data = &empty_value;
+ } else {
+ // Do not apply the array offset to the values array; the value_offsets
+ // index the non-sliced values array.
+ data = arr.GetValues<uint8_t>(2, /*absolute_offset=*/0);
+ }
+
+ VisitBitBlocksVoid(
+ arr.buffers[0], arr.offset, arr.length,
+ [&](int64_t i) {
+ auto value = util::string_view(reinterpret_cast<const char*>(data + offsets[i]),
+ offsets[i + 1] - offsets[i]);
+ valid_func(value);
+ },
+ std::forward<NullFunc>(null_func));
+ }
+};
+
+// FixedSizeBinary, Decimal128, Decimal256
+template <typename T>
+struct ArrayDataInlineVisitor<T, enable_if_fixed_size_binary<T>> {
+ using c_type = util::string_view;
+
+ template <typename ValidFunc, typename NullFunc>
+ static Status VisitStatus(const ArrayData& arr, ValidFunc&& valid_func,
+ NullFunc&& null_func) {
+ const auto& fw_type = internal::checked_cast<const FixedSizeBinaryType&>(*arr.type);
+
+ const int32_t byte_width = fw_type.byte_width();
+ const char* data = arr.GetValues<char>(1,
+ /*absolute_offset=*/arr.offset * byte_width);
+
+ return VisitBitBlocks(
+ arr.buffers[0], arr.offset, arr.length,
+ [&](int64_t i) {
+ auto value = util::string_view(data, byte_width);
+ data += byte_width;
+ return valid_func(value);
+ },
+ [&]() {
+ data += byte_width;
+ return null_func();
+ });
+ }
+
+ template <typename ValidFunc, typename NullFunc>
+ static void VisitVoid(const ArrayData& arr, ValidFunc&& valid_func,
+ NullFunc&& null_func) {
+ const auto& fw_type = internal::checked_cast<const FixedSizeBinaryType&>(*arr.type);
+
+ const int32_t byte_width = fw_type.byte_width();
+ const char* data = arr.GetValues<char>(1,
+ /*absolute_offset=*/arr.offset * byte_width);
+
+ VisitBitBlocksVoid(
+ arr.buffers[0], arr.offset, arr.length,
+ [&](int64_t i) {
+ valid_func(util::string_view(data, byte_width));
+ data += byte_width;
+ },
+ [&]() {
+ data += byte_width;
+ null_func();
+ });
+ }
+};
+
+} // namespace internal
+
+// Visit an array's data values, in order, without overhead.
+//
+// The given `ValidFunc` should be a callable with either of these signatures:
+// - void(scalar_type)
+// - Status(scalar_type)
+//
+// The `NullFunc` should have the same return type as `ValidFunc`.
+//
+// ... where `scalar_type` depends on the array data type:
+// - the type's `c_type`, if any
+// - for boolean arrays, a `bool`
+// - for binary, string and fixed-size binary arrays, a `util::string_view`
+
+template <typename T, typename ValidFunc, typename NullFunc>
+typename internal::call_traits::enable_if_return<ValidFunc, Status>::type
+VisitArrayDataInline(const ArrayData& arr, ValidFunc&& valid_func, NullFunc&& null_func) {
+ return internal::ArrayDataInlineVisitor<T>::VisitStatus(
+ arr, std::forward<ValidFunc>(valid_func), std::forward<NullFunc>(null_func));
+}
+
+template <typename T, typename ValidFunc, typename NullFunc>
+typename internal::call_traits::enable_if_return<ValidFunc, void>::type
+VisitArrayDataInline(const ArrayData& arr, ValidFunc&& valid_func, NullFunc&& null_func) {
+ return internal::ArrayDataInlineVisitor<T>::VisitVoid(
+ arr, std::forward<ValidFunc>(valid_func), std::forward<NullFunc>(null_func));
+}
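+
+// Usage sketch: summing a possibly-null Int32 array. Both functors return
+// void, so the void overload above is selected; `data` is assumed to be the
+// ArrayData of an Int32 array.
+//
+//   int64_t sum = 0;
+//   int64_t null_count = 0;
+//   VisitArrayDataInline<Int32Type>(
+//       data,
+//       [&](int32_t value) { sum += value; },  // one call per valid slot
+//       [&]() { ++null_count; });              // one call per null slot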
+
+// Visit an array's data values, in order, without overhead.
+//
+// The Visit method's `visitor` argument should be an object with two public methods:
+// - Status VisitNull()
+// - Status VisitValue(<scalar>)
+//
+// The scalar value's type depends on the array data type:
+// - the type's `c_type`, if any
+// - for boolean arrays, a `bool`
+// - for binary, string and fixed-size binary arrays, a `util::string_view`
+
+template <typename T>
+struct ArrayDataVisitor {
+ using InlineVisitorType = internal::ArrayDataInlineVisitor<T>;
+ using c_type = typename InlineVisitorType::c_type;
+
+ template <typename Visitor>
+ static Status Visit(const ArrayData& arr, Visitor* visitor) {
+ return InlineVisitorType::VisitStatus(
+ arr, [visitor](c_type v) { return visitor->VisitValue(v); },
+ [visitor]() { return visitor->VisitNull(); });
+ }
+};
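+
+// Usage sketch: a visitor object counting valid and null slots of a String
+// array; `data` is assumed to be that array's ArrayData.
+//
+//   struct SlotCounter {
+//     int64_t valid = 0;
+//     int64_t nulls = 0;
+//     Status VisitNull() { ++nulls; return Status::OK(); }
+//     Status VisitValue(util::string_view) { ++valid; return Status::OK(); }
+//   };
+//
+//   SlotCounter counter;
+//   ARROW_RETURN_NOT_OK(ArrayDataVisitor<StringType>::Visit(data, &counter));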
+
+#define SCALAR_VISIT_INLINE(TYPE_CLASS) \
+ case TYPE_CLASS##Type::type_id: \
+ return visitor->Visit(internal::checked_cast<const TYPE_CLASS##Scalar&>(scalar));
+
+template <typename VISITOR>
+inline Status VisitScalarInline(const Scalar& scalar, VISITOR* visitor) {
+ switch (scalar.type->id()) {
+ ARROW_GENERATE_FOR_ALL_TYPES(SCALAR_VISIT_INLINE);
+ default:
+ break;
+ }
+ return Status::NotImplemented("Scalar visitor for type not implemented ",
+ scalar.type->ToString());
+}
+
+#undef SCALAR_VISIT_INLINE
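+
+// Usage sketch: extracting a value from a type-erased Scalar. The fallback
+// overload taking `const Scalar&` catches every other concrete scalar class.
+//
+//   struct Int32Extractor {
+//     int32_t value = 0;
+//     Status Visit(const Int32Scalar& s) { value = s.value; return Status::OK(); }
+//     Status Visit(const Scalar&) {
+//       return Status::NotImplemented("unhandled scalar type");
+//     }
+//   };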
+
+// Visit a null bitmap, in order, without overhead.
+//
+// The given `ValidFunc` should be a callable with either of these signatures:
+// - void()
+// - Status()
+//
+// The `NullFunc` should have the same return type as `ValidFunc`.
+
+template <typename ValidFunc, typename NullFunc>
+typename internal::call_traits::enable_if_return<ValidFunc, Status>::type
+VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset,
+ int64_t num_values, int64_t null_count, ValidFunc&& valid_func,
+ NullFunc&& null_func) {
+ ARROW_UNUSED(null_count);
+ internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset,
+ num_values);
+ int64_t position = 0;
+ int64_t offset_position = valid_bits_offset;
+ while (position < num_values) {
+ internal::BitBlockCount block = bit_counter.NextBlock();
+ if (block.AllSet()) {
+ for (int64_t i = 0; i < block.length; ++i) {
+ ARROW_RETURN_NOT_OK(valid_func());
+ }
+ } else if (block.NoneSet()) {
+ for (int64_t i = 0; i < block.length; ++i) {
+ ARROW_RETURN_NOT_OK(null_func());
+ }
+ } else {
+ for (int64_t i = 0; i < block.length; ++i) {
+ ARROW_RETURN_NOT_OK(BitUtil::GetBit(valid_bits, offset_position + i)
+ ? valid_func()
+ : null_func());
+ }
+ }
+ position += block.length;
+ offset_position += block.length;
+ }
+ return Status::OK();
+}
+
+template <typename ValidFunc, typename NullFunc>
+typename internal::call_traits::enable_if_return<ValidFunc, void>::type
+VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset,
+ int64_t num_values, int64_t null_count, ValidFunc&& valid_func,
+ NullFunc&& null_func) {
+ ARROW_UNUSED(null_count);
+ internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset,
+ num_values);
+ int64_t position = 0;
+ int64_t offset_position = valid_bits_offset;
+ while (position < num_values) {
+ internal::BitBlockCount block = bit_counter.NextBlock();
+ if (block.AllSet()) {
+ for (int64_t i = 0; i < block.length; ++i) {
+ valid_func();
+ }
+ } else if (block.NoneSet()) {
+ for (int64_t i = 0; i < block.length; ++i) {
+ null_func();
+ }
+ } else {
+ for (int64_t i = 0; i < block.length; ++i) {
+ BitUtil::GetBit(valid_bits, offset_position + i) ? valid_func() : null_func();
+ }
+ }
+ position += block.length;
+ offset_position += block.length;
+ }
+}
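+
+// Usage sketch: counting valid and null slots of a validity bitmap covering
+// `n` values; `bitmap` may be null, in which case every slot is treated as
+// valid. Both functors return void, so the void overload is selected.
+//
+//   int64_t valid = 0;
+//   int64_t nulls = 0;
+//   VisitNullBitmapInline(bitmap, /*valid_bits_offset=*/0, n,
+//                         /*null_count=*/-1,
+//                         [&]() { ++valid; },
+//                         [&]() { ++nulls; });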
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/File_generated.h b/contrib/libs/apache/arrow/cpp/src/generated/File_generated.h
new file mode 100644
index 00000000000..06953c4a040
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/generated/File_generated.h
@@ -0,0 +1,200 @@
+// automatically generated by the FlatBuffers compiler, do not modify
+
+
+#ifndef FLATBUFFERS_GENERATED_FILE_ORG_APACHE_ARROW_FLATBUF_H_
+#define FLATBUFFERS_GENERATED_FILE_ORG_APACHE_ARROW_FLATBUF_H_
+
+#include "flatbuffers/flatbuffers.h"
+
+#include "Schema_generated.h"
+
+namespace org {
+namespace apache {
+namespace arrow {
+namespace flatbuf {
+
+struct Footer;
+struct FooterBuilder;
+
+struct Block;
+
+FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Block FLATBUFFERS_FINAL_CLASS {
+ private:
+ int64_t offset_;
+ int32_t metaDataLength_;
+ int32_t padding0__;
+ int64_t bodyLength_;
+
+ public:
+ Block() {
+ memset(static_cast<void *>(this), 0, sizeof(Block));
+ }
+ Block(int64_t _offset, int32_t _metaDataLength, int64_t _bodyLength)
+ : offset_(flatbuffers::EndianScalar(_offset)),
+ metaDataLength_(flatbuffers::EndianScalar(_metaDataLength)),
+ padding0__(0),
+ bodyLength_(flatbuffers::EndianScalar(_bodyLength)) {
+ (void)padding0__;
+ }
+  /// Index to the start of the record batch block (note this is past the Message header)
+ int64_t offset() const {
+ return flatbuffers::EndianScalar(offset_);
+ }
+ /// Length of the metadata
+ int32_t metaDataLength() const {
+ return flatbuffers::EndianScalar(metaDataLength_);
+ }
+ /// Length of the data (this is aligned so there can be a gap between this and
+ /// the metadata).
+ int64_t bodyLength() const {
+ return flatbuffers::EndianScalar(bodyLength_);
+ }
+};
+FLATBUFFERS_STRUCT_END(Block, 24);
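+
+// Usage sketch: locating the byte ranges a Block describes within an Arrow
+// file. The numeric values are illustrative only.
+//
+//   Block b(/*offset=*/392, /*metaDataLength=*/256, /*bodyLength=*/1024);
+//   int64_t metadata_start = b.offset();                   // 392
+//   int64_t body_start = b.offset() + b.metaDataLength();  // 648
+//   int64_t body_end = body_start + b.bodyLength();        // 1672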
+
+/// ----------------------------------------------------------------------
+/// Arrow File metadata
+///
+struct Footer FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef FooterBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_VERSION = 4,
+ VT_SCHEMA = 6,
+ VT_DICTIONARIES = 8,
+ VT_RECORDBATCHES = 10,
+ VT_CUSTOM_METADATA = 12
+ };
+ org::apache::arrow::flatbuf::MetadataVersion version() const {
+ return static_cast<org::apache::arrow::flatbuf::MetadataVersion>(GetField<int16_t>(VT_VERSION, 0));
+ }
+ const org::apache::arrow::flatbuf::Schema *schema() const {
+ return GetPointer<const org::apache::arrow::flatbuf::Schema *>(VT_SCHEMA);
+ }
+ const flatbuffers::Vector<const org::apache::arrow::flatbuf::Block *> *dictionaries() const {
+ return GetPointer<const flatbuffers::Vector<const org::apache::arrow::flatbuf::Block *> *>(VT_DICTIONARIES);
+ }
+ const flatbuffers::Vector<const org::apache::arrow::flatbuf::Block *> *recordBatches() const {
+ return GetPointer<const flatbuffers::Vector<const org::apache::arrow::flatbuf::Block *> *>(VT_RECORDBATCHES);
+ }
+ /// User-defined metadata
+ const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *custom_metadata() const {
+ return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *>(VT_CUSTOM_METADATA);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_VERSION) &&
+ VerifyOffset(verifier, VT_SCHEMA) &&
+ verifier.VerifyTable(schema()) &&
+ VerifyOffset(verifier, VT_DICTIONARIES) &&
+ verifier.VerifyVector(dictionaries()) &&
+ VerifyOffset(verifier, VT_RECORDBATCHES) &&
+ verifier.VerifyVector(recordBatches()) &&
+ VerifyOffset(verifier, VT_CUSTOM_METADATA) &&
+ verifier.VerifyVector(custom_metadata()) &&
+ verifier.VerifyVectorOfTables(custom_metadata()) &&
+ verifier.EndTable();
+ }
+};
+
+struct FooterBuilder {
+ typedef Footer Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_version(org::apache::arrow::flatbuf::MetadataVersion version) {
+ fbb_.AddElement<int16_t>(Footer::VT_VERSION, static_cast<int16_t>(version), 0);
+ }
+ void add_schema(flatbuffers::Offset<org::apache::arrow::flatbuf::Schema> schema) {
+ fbb_.AddOffset(Footer::VT_SCHEMA, schema);
+ }
+ void add_dictionaries(flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::Block *>> dictionaries) {
+ fbb_.AddOffset(Footer::VT_DICTIONARIES, dictionaries);
+ }
+ void add_recordBatches(flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::Block *>> recordBatches) {
+ fbb_.AddOffset(Footer::VT_RECORDBATCHES, recordBatches);
+ }
+ void add_custom_metadata(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>> custom_metadata) {
+ fbb_.AddOffset(Footer::VT_CUSTOM_METADATA, custom_metadata);
+ }
+ explicit FooterBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ FooterBuilder &operator=(const FooterBuilder &);
+ flatbuffers::Offset<Footer> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Footer>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Footer> CreateFooter(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::MetadataVersion version = org::apache::arrow::flatbuf::MetadataVersion::V1,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Schema> schema = 0,
+ flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::Block *>> dictionaries = 0,
+ flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::Block *>> recordBatches = 0,
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>> custom_metadata = 0) {
+ FooterBuilder builder_(_fbb);
+ builder_.add_custom_metadata(custom_metadata);
+ builder_.add_recordBatches(recordBatches);
+ builder_.add_dictionaries(dictionaries);
+ builder_.add_schema(schema);
+ builder_.add_version(version);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Footer> CreateFooterDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::MetadataVersion version = org::apache::arrow::flatbuf::MetadataVersion::V1,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Schema> schema = 0,
+ const std::vector<org::apache::arrow::flatbuf::Block> *dictionaries = nullptr,
+ const std::vector<org::apache::arrow::flatbuf::Block> *recordBatches = nullptr,
+ const std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *custom_metadata = nullptr) {
+ auto dictionaries__ = dictionaries ? _fbb.CreateVectorOfStructs<org::apache::arrow::flatbuf::Block>(*dictionaries) : 0;
+ auto recordBatches__ = recordBatches ? _fbb.CreateVectorOfStructs<org::apache::arrow::flatbuf::Block>(*recordBatches) : 0;
+ auto custom_metadata__ = custom_metadata ? _fbb.CreateVector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>(*custom_metadata) : 0;
+ return org::apache::arrow::flatbuf::CreateFooter(
+ _fbb,
+ version,
+ schema,
+ dictionaries__,
+ recordBatches__,
+ custom_metadata__);
+}
+
+inline const org::apache::arrow::flatbuf::Footer *GetFooter(const void *buf) {
+ return flatbuffers::GetRoot<org::apache::arrow::flatbuf::Footer>(buf);
+}
+
+inline const org::apache::arrow::flatbuf::Footer *GetSizePrefixedFooter(const void *buf) {
+ return flatbuffers::GetSizePrefixedRoot<org::apache::arrow::flatbuf::Footer>(buf);
+}
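+
+// Usage sketch: given `footer_buf`, a buffer holding the Footer flatbuffer
+// (located via the fixed-size trailer at the end of an Arrow file),
+// enumerate the record batch blocks. `DoSomethingWith` is hypothetical.
+//
+//   const Footer* footer = GetFooter(footer_buf);
+//   if (footer->recordBatches() != nullptr) {
+//     for (auto block : *footer->recordBatches()) {
+//       DoSomethingWith(block->offset(), block->metaDataLength(),
+//                       block->bodyLength());
+//     }
+//   }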
+
+inline bool VerifyFooterBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifyBuffer<org::apache::arrow::flatbuf::Footer>(nullptr);
+}
+
+inline bool VerifySizePrefixedFooterBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifySizePrefixedBuffer<org::apache::arrow::flatbuf::Footer>(nullptr);
+}
+
+inline void FinishFooterBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Footer> root) {
+ fbb.Finish(root);
+}
+
+inline void FinishSizePrefixedFooterBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Footer> root) {
+ fbb.FinishSizePrefixed(root);
+}
+
+} // namespace flatbuf
+} // namespace arrow
+} // namespace apache
+} // namespace org
+
+#endif // FLATBUFFERS_GENERATED_FILE_ORG_APACHE_ARROW_FLATBUF_H_
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/Message_generated.h b/contrib/libs/apache/arrow/cpp/src/generated/Message_generated.h
new file mode 100644
index 00000000000..822bec9952b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/generated/Message_generated.h
@@ -0,0 +1,659 @@
+// automatically generated by the FlatBuffers compiler, do not modify
+
+
+#ifndef FLATBUFFERS_GENERATED_MESSAGE_ORG_APACHE_ARROW_FLATBUF_H_
+#define FLATBUFFERS_GENERATED_MESSAGE_ORG_APACHE_ARROW_FLATBUF_H_
+
+#include "flatbuffers/flatbuffers.h"
+
+#include "Schema_generated.h"
+#include "SparseTensor_generated.h"
+#include "Tensor_generated.h"
+
+namespace org {
+namespace apache {
+namespace arrow {
+namespace flatbuf {
+
+struct FieldNode;
+
+struct BodyCompression;
+struct BodyCompressionBuilder;
+
+struct RecordBatch;
+struct RecordBatchBuilder;
+
+struct DictionaryBatch;
+struct DictionaryBatchBuilder;
+
+struct Message;
+struct MessageBuilder;
+
+enum class CompressionType : int8_t {
+ LZ4_FRAME = 0,
+ ZSTD = 1,
+ MIN = LZ4_FRAME,
+ MAX = ZSTD
+};
+
+inline const CompressionType (&EnumValuesCompressionType())[2] {
+ static const CompressionType values[] = {
+ CompressionType::LZ4_FRAME,
+ CompressionType::ZSTD
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesCompressionType() {
+ static const char * const names[3] = {
+ "LZ4_FRAME",
+ "ZSTD",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameCompressionType(CompressionType e) {
+ if (flatbuffers::IsOutRange(e, CompressionType::LZ4_FRAME, CompressionType::ZSTD)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesCompressionType()[index];
+}
+
+/// Provided for forward compatibility in case we need to support different
+/// strategies for compressing the IPC message body (like whole-body
+/// compression rather than buffer-level) in the future
+enum class BodyCompressionMethod : int8_t {
+ /// Each constituent buffer is first compressed with the indicated
+ /// compressor, and then written with the uncompressed length in the first 8
+ /// bytes as a 64-bit little-endian signed integer followed by the compressed
+ /// buffer bytes (and then padding as required by the protocol). The
+ /// uncompressed length may be set to -1 to indicate that the data that
+ /// follows is not compressed, which can be useful for cases where
+ /// compression does not yield appreciable savings.
+ BUFFER = 0,
+ MIN = BUFFER,
+ MAX = BUFFER
+};
+
+inline const BodyCompressionMethod (&EnumValuesBodyCompressionMethod())[1] {
+ static const BodyCompressionMethod values[] = {
+ BodyCompressionMethod::BUFFER
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesBodyCompressionMethod() {
+ static const char * const names[2] = {
+ "BUFFER",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameBodyCompressionMethod(BodyCompressionMethod e) {
+ if (flatbuffers::IsOutRange(e, BodyCompressionMethod::BUFFER, BodyCompressionMethod::BUFFER)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesBodyCompressionMethod()[index];
+}
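+
+// Decode sketch for one BUFFER-compressed body buffer as described above:
+// read the 8-byte little-endian length prefix, then decompress (or copy) the
+// payload. Assumes a little-endian host; `Decompress`, `CopyRaw`, `buf` and
+// `payload_size` are hypothetical.
+//
+//   int64_t uncompressed_length;
+//   memcpy(&uncompressed_length, buf, sizeof(int64_t));
+//   const uint8_t* payload = buf + sizeof(int64_t);
+//   if (uncompressed_length == -1) {
+//     CopyRaw(payload, payload_size);  // stored uncompressed
+//   } else {
+//     Decompress(payload, payload_size, uncompressed_length);
+//   }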
+
+/// ----------------------------------------------------------------------
+/// The root Message type
+/// This union enables us to easily send different message types without
+/// redundant storage, and in the future we can easily add new message types.
+///
+/// Arrow implementations do not need to implement all of the message types,
+/// which may include experimental metadata types. For maximum compatibility,
+/// it is best to send data using RecordBatch
+enum class MessageHeader : uint8_t {
+ NONE = 0,
+ Schema = 1,
+ DictionaryBatch = 2,
+ RecordBatch = 3,
+ Tensor = 4,
+ SparseTensor = 5,
+ MIN = NONE,
+ MAX = SparseTensor
+};
+
+inline const MessageHeader (&EnumValuesMessageHeader())[6] {
+ static const MessageHeader values[] = {
+ MessageHeader::NONE,
+ MessageHeader::Schema,
+ MessageHeader::DictionaryBatch,
+ MessageHeader::RecordBatch,
+ MessageHeader::Tensor,
+ MessageHeader::SparseTensor
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesMessageHeader() {
+ static const char * const names[7] = {
+ "NONE",
+ "Schema",
+ "DictionaryBatch",
+ "RecordBatch",
+ "Tensor",
+ "SparseTensor",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameMessageHeader(MessageHeader e) {
+ if (flatbuffers::IsOutRange(e, MessageHeader::NONE, MessageHeader::SparseTensor)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesMessageHeader()[index];
+}
+
+template<typename T> struct MessageHeaderTraits {
+ static const MessageHeader enum_value = MessageHeader::NONE;
+};
+
+template<> struct MessageHeaderTraits<org::apache::arrow::flatbuf::Schema> {
+ static const MessageHeader enum_value = MessageHeader::Schema;
+};
+
+template<> struct MessageHeaderTraits<org::apache::arrow::flatbuf::DictionaryBatch> {
+ static const MessageHeader enum_value = MessageHeader::DictionaryBatch;
+};
+
+template<> struct MessageHeaderTraits<org::apache::arrow::flatbuf::RecordBatch> {
+ static const MessageHeader enum_value = MessageHeader::RecordBatch;
+};
+
+template<> struct MessageHeaderTraits<org::apache::arrow::flatbuf::Tensor> {
+ static const MessageHeader enum_value = MessageHeader::Tensor;
+};
+
+template<> struct MessageHeaderTraits<org::apache::arrow::flatbuf::SparseTensor> {
+ static const MessageHeader enum_value = MessageHeader::SparseTensor;
+};
+
+bool VerifyMessageHeader(flatbuffers::Verifier &verifier, const void *obj, MessageHeader type);
+bool VerifyMessageHeaderVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types);
+
+/// ----------------------------------------------------------------------
+/// Data structures for describing a table row batch (a collection of
+/// equal-length Arrow arrays)
+/// Metadata about a field at some level of a nested type tree (but not
+/// its children).
+///
+/// For example, a List<Int16> with values [[1, 2, 3], null, [4], [5, 6], null]
+/// would have {length: 5, null_count: 2} for its List node, and {length: 6,
+/// null_count: 0} for its Int16 node, as separate FieldNode structs
+FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) FieldNode FLATBUFFERS_FINAL_CLASS {
+ private:
+ int64_t length_;
+ int64_t null_count_;
+
+ public:
+ FieldNode() {
+ memset(static_cast<void *>(this), 0, sizeof(FieldNode));
+ }
+ FieldNode(int64_t _length, int64_t _null_count)
+ : length_(flatbuffers::EndianScalar(_length)),
+ null_count_(flatbuffers::EndianScalar(_null_count)) {
+ }
+ /// The number of value slots in the Arrow array at this level of a nested
+ /// tree
+ int64_t length() const {
+ return flatbuffers::EndianScalar(length_);
+ }
+ /// The number of observed nulls. Fields with null_count == 0 may choose not
+ /// to write their physical validity bitmap out as a materialized buffer,
+ /// instead setting the length of the bitmap buffer to 0.
+ int64_t null_count() const {
+ return flatbuffers::EndianScalar(null_count_);
+ }
+};
+FLATBUFFERS_STRUCT_END(FieldNode, 16);
+
+/// Optional compression for the memory buffers constituting IPC message
+/// bodies. Intended for use with RecordBatch but could be used for other
+/// message types
+struct BodyCompression FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef BodyCompressionBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_CODEC = 4,
+ VT_METHOD = 6
+ };
+ /// Compressor library
+ org::apache::arrow::flatbuf::CompressionType codec() const {
+ return static_cast<org::apache::arrow::flatbuf::CompressionType>(GetField<int8_t>(VT_CODEC, 0));
+ }
+ /// Indicates the way the record batch body was compressed
+ org::apache::arrow::flatbuf::BodyCompressionMethod method() const {
+ return static_cast<org::apache::arrow::flatbuf::BodyCompressionMethod>(GetField<int8_t>(VT_METHOD, 0));
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int8_t>(verifier, VT_CODEC) &&
+ VerifyField<int8_t>(verifier, VT_METHOD) &&
+ verifier.EndTable();
+ }
+};
+
+struct BodyCompressionBuilder {
+ typedef BodyCompression Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_codec(org::apache::arrow::flatbuf::CompressionType codec) {
+ fbb_.AddElement<int8_t>(BodyCompression::VT_CODEC, static_cast<int8_t>(codec), 0);
+ }
+ void add_method(org::apache::arrow::flatbuf::BodyCompressionMethod method) {
+ fbb_.AddElement<int8_t>(BodyCompression::VT_METHOD, static_cast<int8_t>(method), 0);
+ }
+ explicit BodyCompressionBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ BodyCompressionBuilder &operator=(const BodyCompressionBuilder &);
+ flatbuffers::Offset<BodyCompression> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<BodyCompression>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<BodyCompression> CreateBodyCompression(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::CompressionType codec = org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME,
+ org::apache::arrow::flatbuf::BodyCompressionMethod method = org::apache::arrow::flatbuf::BodyCompressionMethod::BUFFER) {
+ BodyCompressionBuilder builder_(_fbb);
+ builder_.add_method(method);
+ builder_.add_codec(codec);
+ return builder_.Finish();
+}
+
+/// A data header describing the shared memory layout of a "record" or "row"
+/// batch. Some systems call this a "row batch" internally and others a "record
+/// batch".
+struct RecordBatch FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef RecordBatchBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_LENGTH = 4,
+ VT_NODES = 6,
+ VT_BUFFERS = 8,
+ VT_COMPRESSION = 10
+ };
+ /// number of records / rows. The arrays in the batch should all have this
+ /// length
+ int64_t length() const {
+ return GetField<int64_t>(VT_LENGTH, 0);
+ }
+ /// Nodes correspond to the pre-ordered flattened logical schema
+ const flatbuffers::Vector<const org::apache::arrow::flatbuf::FieldNode *> *nodes() const {
+ return GetPointer<const flatbuffers::Vector<const org::apache::arrow::flatbuf::FieldNode *> *>(VT_NODES);
+ }
+ /// Buffers correspond to the pre-ordered flattened buffer tree
+ ///
+ /// The number of buffers appended to this list depends on the schema. For
+ /// example, most primitive arrays will have 2 buffers, 1 for the validity
+ /// bitmap and 1 for the values. For struct arrays, there will only be a
+ /// single buffer for the validity (nulls) bitmap
+ const flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *> *buffers() const {
+ return GetPointer<const flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *> *>(VT_BUFFERS);
+ }
+ /// Optional compression of the message body
+ const org::apache::arrow::flatbuf::BodyCompression *compression() const {
+ return GetPointer<const org::apache::arrow::flatbuf::BodyCompression *>(VT_COMPRESSION);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int64_t>(verifier, VT_LENGTH) &&
+ VerifyOffset(verifier, VT_NODES) &&
+ verifier.VerifyVector(nodes()) &&
+ VerifyOffset(verifier, VT_BUFFERS) &&
+ verifier.VerifyVector(buffers()) &&
+ VerifyOffset(verifier, VT_COMPRESSION) &&
+ verifier.VerifyTable(compression()) &&
+ verifier.EndTable();
+ }
+};
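+
+// Usage sketch: walking the flattened node and buffer lists of a verified
+// RecordBatch header `batch` (null checks omitted for brevity).
+//
+//   for (auto node : *batch->nodes()) {
+//     // node->length() and node->null_count() describe one array level
+//   }
+//   for (auto buffer : *batch->buffers()) {
+//     // buffer->offset() and buffer->length() locate bytes in the body
+//   }
+//   if (batch->compression() != nullptr) {
+//     // body buffers are framed per batch->compression()->codec()
+//   }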
+
+struct RecordBatchBuilder {
+ typedef RecordBatch Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_length(int64_t length) {
+ fbb_.AddElement<int64_t>(RecordBatch::VT_LENGTH, length, 0);
+ }
+ void add_nodes(flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::FieldNode *>> nodes) {
+ fbb_.AddOffset(RecordBatch::VT_NODES, nodes);
+ }
+ void add_buffers(flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *>> buffers) {
+ fbb_.AddOffset(RecordBatch::VT_BUFFERS, buffers);
+ }
+ void add_compression(flatbuffers::Offset<org::apache::arrow::flatbuf::BodyCompression> compression) {
+ fbb_.AddOffset(RecordBatch::VT_COMPRESSION, compression);
+ }
+ explicit RecordBatchBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ RecordBatchBuilder &operator=(const RecordBatchBuilder &);
+ flatbuffers::Offset<RecordBatch> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<RecordBatch>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<RecordBatch> CreateRecordBatch(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int64_t length = 0,
+ flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::FieldNode *>> nodes = 0,
+ flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *>> buffers = 0,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::BodyCompression> compression = 0) {
+ RecordBatchBuilder builder_(_fbb);
+ builder_.add_length(length);
+ builder_.add_compression(compression);
+ builder_.add_buffers(buffers);
+ builder_.add_nodes(nodes);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<RecordBatch> CreateRecordBatchDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int64_t length = 0,
+ const std::vector<org::apache::arrow::flatbuf::FieldNode> *nodes = nullptr,
+ const std::vector<org::apache::arrow::flatbuf::Buffer> *buffers = nullptr,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::BodyCompression> compression = 0) {
+ auto nodes__ = nodes ? _fbb.CreateVectorOfStructs<org::apache::arrow::flatbuf::FieldNode>(*nodes) : 0;
+ auto buffers__ = buffers ? _fbb.CreateVectorOfStructs<org::apache::arrow::flatbuf::Buffer>(*buffers) : 0;
+ return org::apache::arrow::flatbuf::CreateRecordBatch(
+ _fbb,
+ length,
+ nodes__,
+ buffers__,
+ compression);
+}
+
+/// For sending dictionary encoding information. Any Field can be
+/// dictionary-encoded, but in this case none of its children may be
+/// dictionary-encoded.
+/// There is one vector / column per dictionary, but that vector / column
+/// may be spread across multiple dictionary batches by using the isDelta
+/// flag
+struct DictionaryBatch FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef DictionaryBatchBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_ID = 4,
+ VT_DATA = 6,
+ VT_ISDELTA = 8
+ };
+ int64_t id() const {
+ return GetField<int64_t>(VT_ID, 0);
+ }
+ const org::apache::arrow::flatbuf::RecordBatch *data() const {
+ return GetPointer<const org::apache::arrow::flatbuf::RecordBatch *>(VT_DATA);
+ }
+ /// If isDelta is true the values in the dictionary are to be appended to a
+ /// dictionary with the indicated id. If isDelta is false this dictionary
+ /// should replace the existing dictionary.
+ bool isDelta() const {
+ return GetField<uint8_t>(VT_ISDELTA, 0) != 0;
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int64_t>(verifier, VT_ID) &&
+ VerifyOffset(verifier, VT_DATA) &&
+ verifier.VerifyTable(data()) &&
+ VerifyField<uint8_t>(verifier, VT_ISDELTA) &&
+ verifier.EndTable();
+ }
+};
+
+struct DictionaryBatchBuilder {
+ typedef DictionaryBatch Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_id(int64_t id) {
+ fbb_.AddElement<int64_t>(DictionaryBatch::VT_ID, id, 0);
+ }
+ void add_data(flatbuffers::Offset<org::apache::arrow::flatbuf::RecordBatch> data) {
+ fbb_.AddOffset(DictionaryBatch::VT_DATA, data);
+ }
+ void add_isDelta(bool isDelta) {
+ fbb_.AddElement<uint8_t>(DictionaryBatch::VT_ISDELTA, static_cast<uint8_t>(isDelta), 0);
+ }
+ explicit DictionaryBatchBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ DictionaryBatchBuilder &operator=(const DictionaryBatchBuilder &);
+ flatbuffers::Offset<DictionaryBatch> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<DictionaryBatch>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<DictionaryBatch> CreateDictionaryBatch(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int64_t id = 0,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::RecordBatch> data = 0,
+ bool isDelta = false) {
+ DictionaryBatchBuilder builder_(_fbb);
+ builder_.add_id(id);
+ builder_.add_data(data);
+ builder_.add_isDelta(isDelta);
+ return builder_.Finish();
+}
+
+struct Message FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef MessageBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_VERSION = 4,
+ VT_HEADER_TYPE = 6,
+ VT_HEADER = 8,
+ VT_BODYLENGTH = 10,
+ VT_CUSTOM_METADATA = 12
+ };
+ org::apache::arrow::flatbuf::MetadataVersion version() const {
+ return static_cast<org::apache::arrow::flatbuf::MetadataVersion>(GetField<int16_t>(VT_VERSION, 0));
+ }
+ org::apache::arrow::flatbuf::MessageHeader header_type() const {
+ return static_cast<org::apache::arrow::flatbuf::MessageHeader>(GetField<uint8_t>(VT_HEADER_TYPE, 0));
+ }
+ const void *header() const {
+ return GetPointer<const void *>(VT_HEADER);
+ }
+ template<typename T> const T *header_as() const;
+ const org::apache::arrow::flatbuf::Schema *header_as_Schema() const {
+ return header_type() == org::apache::arrow::flatbuf::MessageHeader::Schema ? static_cast<const org::apache::arrow::flatbuf::Schema *>(header()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::DictionaryBatch *header_as_DictionaryBatch() const {
+ return header_type() == org::apache::arrow::flatbuf::MessageHeader::DictionaryBatch ? static_cast<const org::apache::arrow::flatbuf::DictionaryBatch *>(header()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::RecordBatch *header_as_RecordBatch() const {
+ return header_type() == org::apache::arrow::flatbuf::MessageHeader::RecordBatch ? static_cast<const org::apache::arrow::flatbuf::RecordBatch *>(header()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Tensor *header_as_Tensor() const {
+ return header_type() == org::apache::arrow::flatbuf::MessageHeader::Tensor ? static_cast<const org::apache::arrow::flatbuf::Tensor *>(header()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::SparseTensor *header_as_SparseTensor() const {
+ return header_type() == org::apache::arrow::flatbuf::MessageHeader::SparseTensor ? static_cast<const org::apache::arrow::flatbuf::SparseTensor *>(header()) : nullptr;
+ }
+ int64_t bodyLength() const {
+ return GetField<int64_t>(VT_BODYLENGTH, 0);
+ }
+ const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *custom_metadata() const {
+ return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *>(VT_CUSTOM_METADATA);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_VERSION) &&
+ VerifyField<uint8_t>(verifier, VT_HEADER_TYPE) &&
+ VerifyOffset(verifier, VT_HEADER) &&
+ VerifyMessageHeader(verifier, header(), header_type()) &&
+ VerifyField<int64_t>(verifier, VT_BODYLENGTH) &&
+ VerifyOffset(verifier, VT_CUSTOM_METADATA) &&
+ verifier.VerifyVector(custom_metadata()) &&
+ verifier.VerifyVectorOfTables(custom_metadata()) &&
+ verifier.EndTable();
+ }
+};
+
+template<> inline const org::apache::arrow::flatbuf::Schema *Message::header_as<org::apache::arrow::flatbuf::Schema>() const {
+ return header_as_Schema();
+}
+
+template<> inline const org::apache::arrow::flatbuf::DictionaryBatch *Message::header_as<org::apache::arrow::flatbuf::DictionaryBatch>() const {
+ return header_as_DictionaryBatch();
+}
+
+template<> inline const org::apache::arrow::flatbuf::RecordBatch *Message::header_as<org::apache::arrow::flatbuf::RecordBatch>() const {
+ return header_as_RecordBatch();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Tensor *Message::header_as<org::apache::arrow::flatbuf::Tensor>() const {
+ return header_as_Tensor();
+}
+
+template<> inline const org::apache::arrow::flatbuf::SparseTensor *Message::header_as<org::apache::arrow::flatbuf::SparseTensor>() const {
+ return header_as_SparseTensor();
+}
+
+struct MessageBuilder {
+ typedef Message Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_version(org::apache::arrow::flatbuf::MetadataVersion version) {
+ fbb_.AddElement<int16_t>(Message::VT_VERSION, static_cast<int16_t>(version), 0);
+ }
+ void add_header_type(org::apache::arrow::flatbuf::MessageHeader header_type) {
+ fbb_.AddElement<uint8_t>(Message::VT_HEADER_TYPE, static_cast<uint8_t>(header_type), 0);
+ }
+ void add_header(flatbuffers::Offset<void> header) {
+ fbb_.AddOffset(Message::VT_HEADER, header);
+ }
+ void add_bodyLength(int64_t bodyLength) {
+ fbb_.AddElement<int64_t>(Message::VT_BODYLENGTH, bodyLength, 0);
+ }
+ void add_custom_metadata(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>> custom_metadata) {
+ fbb_.AddOffset(Message::VT_CUSTOM_METADATA, custom_metadata);
+ }
+ explicit MessageBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ MessageBuilder &operator=(const MessageBuilder &);
+ flatbuffers::Offset<Message> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Message>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Message> CreateMessage(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::MetadataVersion version = org::apache::arrow::flatbuf::MetadataVersion::V1,
+ org::apache::arrow::flatbuf::MessageHeader header_type = org::apache::arrow::flatbuf::MessageHeader::NONE,
+ flatbuffers::Offset<void> header = 0,
+ int64_t bodyLength = 0,
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>> custom_metadata = 0) {
+ MessageBuilder builder_(_fbb);
+ builder_.add_bodyLength(bodyLength);
+ builder_.add_custom_metadata(custom_metadata);
+ builder_.add_header(header);
+ builder_.add_version(version);
+ builder_.add_header_type(header_type);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Message> CreateMessageDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::MetadataVersion version = org::apache::arrow::flatbuf::MetadataVersion::V1,
+ org::apache::arrow::flatbuf::MessageHeader header_type = org::apache::arrow::flatbuf::MessageHeader::NONE,
+ flatbuffers::Offset<void> header = 0,
+ int64_t bodyLength = 0,
+ const std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *custom_metadata = nullptr) {
+ auto custom_metadata__ = custom_metadata ? _fbb.CreateVector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>(*custom_metadata) : 0;
+ return org::apache::arrow::flatbuf::CreateMessage(
+ _fbb,
+ version,
+ header_type,
+ header,
+ bodyLength,
+ custom_metadata__);
+}
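+
+// Usage sketch: wrapping a RecordBatch header in a Message envelope. `fbb` is
+// assumed to be a flatbuffers::FlatBufferBuilder; the lengths are illustrative.
+//
+//   auto batch = CreateRecordBatch(fbb, /*length=*/1024);
+//   auto message = CreateMessage(fbb, MetadataVersion::V5,
+//                                MessageHeader::RecordBatch, batch.Union(),
+//                                /*bodyLength=*/4096);
+//   FinishMessageBuffer(fbb, message);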
+
+inline bool VerifyMessageHeader(flatbuffers::Verifier &verifier, const void *obj, MessageHeader type) {
+ switch (type) {
+ case MessageHeader::NONE: {
+ return true;
+ }
+ case MessageHeader::Schema: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Schema *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case MessageHeader::DictionaryBatch: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::DictionaryBatch *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case MessageHeader::RecordBatch: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::RecordBatch *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case MessageHeader::Tensor: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Tensor *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case MessageHeader::SparseTensor: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::SparseTensor *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ default: return true;
+ }
+}
+
+inline bool VerifyMessageHeaderVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types) {
+ if (!values || !types) return !values && !types;
+ if (values->size() != types->size()) return false;
+ for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
+ if (!VerifyMessageHeader(
+ verifier, values->Get(i), types->GetEnum<MessageHeader>(i))) {
+ return false;
+ }
+ }
+ return true;
+}
+
+inline const org::apache::arrow::flatbuf::Message *GetMessage(const void *buf) {
+ return flatbuffers::GetRoot<org::apache::arrow::flatbuf::Message>(buf);
+}
+
+inline const org::apache::arrow::flatbuf::Message *GetSizePrefixedMessage(const void *buf) {
+ return flatbuffers::GetSizePrefixedRoot<org::apache::arrow::flatbuf::Message>(buf);
+}
+
+inline bool VerifyMessageBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifyBuffer<org::apache::arrow::flatbuf::Message>(nullptr);
+}
+
+inline bool VerifySizePrefixedMessageBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifySizePrefixedBuffer<org::apache::arrow::flatbuf::Message>(nullptr);
+}
+
+inline void FinishMessageBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Message> root) {
+ fbb.Finish(root);
+}
+
+inline void FinishSizePrefixedMessageBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Message> root) {
+ fbb.FinishSizePrefixed(root);
+}
+
+} // namespace flatbuf
+} // namespace arrow
+} // namespace apache
+} // namespace org
+
+#endif // FLATBUFFERS_GENERATED_MESSAGE_ORG_APACHE_ARROW_FLATBUF_H_
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/Schema_generated.h b/contrib/libs/apache/arrow/cpp/src/generated/Schema_generated.h
new file mode 100644
index 00000000000..91e01d33758
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/generated/Schema_generated.h
@@ -0,0 +1,2265 @@
+// automatically generated by the FlatBuffers compiler, do not modify
+
+
+#ifndef FLATBUFFERS_GENERATED_SCHEMA_ORG_APACHE_ARROW_FLATBUF_H_
+#define FLATBUFFERS_GENERATED_SCHEMA_ORG_APACHE_ARROW_FLATBUF_H_
+
+#include "flatbuffers/flatbuffers.h"
+
+namespace org {
+namespace apache {
+namespace arrow {
+namespace flatbuf {
+
+struct Null;
+struct NullBuilder;
+
+struct Struct_;
+struct Struct_Builder;
+
+struct List;
+struct ListBuilder;
+
+struct LargeList;
+struct LargeListBuilder;
+
+struct FixedSizeList;
+struct FixedSizeListBuilder;
+
+struct Map;
+struct MapBuilder;
+
+struct Union;
+struct UnionBuilder;
+
+struct Int;
+struct IntBuilder;
+
+struct FloatingPoint;
+struct FloatingPointBuilder;
+
+struct Utf8;
+struct Utf8Builder;
+
+struct Binary;
+struct BinaryBuilder;
+
+struct LargeUtf8;
+struct LargeUtf8Builder;
+
+struct LargeBinary;
+struct LargeBinaryBuilder;
+
+struct FixedSizeBinary;
+struct FixedSizeBinaryBuilder;
+
+struct Bool;
+struct BoolBuilder;
+
+struct Decimal;
+struct DecimalBuilder;
+
+struct Date;
+struct DateBuilder;
+
+struct Time;
+struct TimeBuilder;
+
+struct Timestamp;
+struct TimestampBuilder;
+
+struct Interval;
+struct IntervalBuilder;
+
+struct Duration;
+struct DurationBuilder;
+
+struct KeyValue;
+struct KeyValueBuilder;
+
+struct DictionaryEncoding;
+struct DictionaryEncodingBuilder;
+
+struct Field;
+struct FieldBuilder;
+
+struct Buffer;
+
+struct Schema;
+struct SchemaBuilder;
+
+enum class MetadataVersion : int16_t {
+ /// 0.1.0 (October 2016).
+ V1 = 0,
+ /// 0.2.0 (February 2017). Non-backwards compatible with V1.
+ V2 = 1,
+ /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
+ V3 = 2,
+ /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
+ V4 = 3,
+  /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
+ /// metadata and IPC messages). Implementations are recommended to provide a
+ /// V4 compatibility mode with V5 format changes disabled.
+ ///
+ /// Incompatible changes between V4 and V5:
+ /// - Union buffer layout has changed. In V5, Unions don't have a validity
+ /// bitmap buffer.
+ V5 = 4,
+ MIN = V1,
+ MAX = V5
+};
+
+inline const MetadataVersion (&EnumValuesMetadataVersion())[5] {
+ static const MetadataVersion values[] = {
+ MetadataVersion::V1,
+ MetadataVersion::V2,
+ MetadataVersion::V3,
+ MetadataVersion::V4,
+ MetadataVersion::V5
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesMetadataVersion() {
+ static const char * const names[6] = {
+ "V1",
+ "V2",
+ "V3",
+ "V4",
+ "V5",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameMetadataVersion(MetadataVersion e) {
+ if (flatbuffers::IsOutRange(e, MetadataVersion::V1, MetadataVersion::V5)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesMetadataVersion()[index];
+}
+
+/// Represents Arrow Features that might not have full support
+/// within implementations. This is intended to be used in
+/// two scenarios:
+/// 1. A mechanism for readers of Arrow Streams
+/// and files to understand that the stream or file makes
+/// use of a feature that isn't supported by, or is unknown to,
+/// the implementation (and can therefore meet the Arrow
+/// forward compatibility guarantees).
+/// 2. A means of negotiating between a client and server
+/// what features a stream is allowed to use. The enum
+/// values here are intended to represent higher-level
+/// features; additional details may be negotiated
+/// with key-value pairs specific to the protocol.
+///
+/// Enums added to this list should be assigned power-of-two values
+/// to facilitate exchanging and comparing bitmaps for supported
+/// features.
+enum class Feature : int64_t {
+ /// Needed to make flatbuffers happy.
+ UNUSED = 0,
+ /// The stream makes use of multiple full dictionaries with the
+ /// same ID and assumes clients implement dictionary replacement
+ /// correctly.
+ DICTIONARY_REPLACEMENT = 1LL,
+ /// The stream makes use of compressed bodies as described
+ /// in Message.fbs.
+ COMPRESSED_BODY = 2LL,
+ MIN = UNUSED,
+ MAX = COMPRESSED_BODY
+};
+
+inline const Feature (&EnumValuesFeature())[3] {
+ static const Feature values[] = {
+ Feature::UNUSED,
+ Feature::DICTIONARY_REPLACEMENT,
+ Feature::COMPRESSED_BODY
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesFeature() {
+ static const char * const names[4] = {
+ "UNUSED",
+ "DICTIONARY_REPLACEMENT",
+ "COMPRESSED_BODY",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameFeature(Feature e) {
+ if (flatbuffers::IsOutRange(e, Feature::UNUSED, Feature::COMPRESSED_BODY)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesFeature()[index];
+}
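+
+// Usage sketch: because Feature values are powers of two, a set of supported
+// features can be exchanged and tested as a single bitmap.
+//
+//   int64_t supported = static_cast<int64_t>(Feature::DICTIONARY_REPLACEMENT) |
+//                       static_cast<int64_t>(Feature::COMPRESSED_BODY);
+//   bool has_compressed_body =
+//       (supported & static_cast<int64_t>(Feature::COMPRESSED_BODY)) != 0;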
+
+enum class UnionMode : int16_t {
+ Sparse = 0,
+ Dense = 1,
+ MIN = Sparse,
+ MAX = Dense
+};
+
+inline const UnionMode (&EnumValuesUnionMode())[2] {
+ static const UnionMode values[] = {
+ UnionMode::Sparse,
+ UnionMode::Dense
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesUnionMode() {
+ static const char * const names[3] = {
+ "Sparse",
+ "Dense",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameUnionMode(UnionMode e) {
+ if (flatbuffers::IsOutRange(e, UnionMode::Sparse, UnionMode::Dense)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesUnionMode()[index];
+}
+
+enum class Precision : int16_t {
+ HALF = 0,
+ SINGLE = 1,
+ DOUBLE = 2,
+ MIN = HALF,
+ MAX = DOUBLE
+};
+
+inline const Precision (&EnumValuesPrecision())[3] {
+ static const Precision values[] = {
+ Precision::HALF,
+ Precision::SINGLE,
+ Precision::DOUBLE
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesPrecision() {
+ static const char * const names[4] = {
+ "HALF",
+ "SINGLE",
+ "DOUBLE",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNamePrecision(Precision e) {
+ if (flatbuffers::IsOutRange(e, Precision::HALF, Precision::DOUBLE)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesPrecision()[index];
+}
+
+enum class DateUnit : int16_t {
+ DAY = 0,
+ MILLISECOND = 1,
+ MIN = DAY,
+ MAX = MILLISECOND
+};
+
+inline const DateUnit (&EnumValuesDateUnit())[2] {
+ static const DateUnit values[] = {
+ DateUnit::DAY,
+ DateUnit::MILLISECOND
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesDateUnit() {
+ static const char * const names[3] = {
+ "DAY",
+ "MILLISECOND",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameDateUnit(DateUnit e) {
+ if (flatbuffers::IsOutRange(e, DateUnit::DAY, DateUnit::MILLISECOND)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesDateUnit()[index];
+}
+
+enum class TimeUnit : int16_t {
+ SECOND = 0,
+ MILLISECOND = 1,
+ MICROSECOND = 2,
+ NANOSECOND = 3,
+ MIN = SECOND,
+ MAX = NANOSECOND
+};
+
+inline const TimeUnit (&EnumValuesTimeUnit())[4] {
+ static const TimeUnit values[] = {
+ TimeUnit::SECOND,
+ TimeUnit::MILLISECOND,
+ TimeUnit::MICROSECOND,
+ TimeUnit::NANOSECOND
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesTimeUnit() {
+ static const char * const names[5] = {
+ "SECOND",
+ "MILLISECOND",
+ "MICROSECOND",
+ "NANOSECOND",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameTimeUnit(TimeUnit e) {
+ if (flatbuffers::IsOutRange(e, TimeUnit::SECOND, TimeUnit::NANOSECOND)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesTimeUnit()[index];
+}
+
+enum class IntervalUnit : int16_t {
+ YEAR_MONTH = 0,
+ DAY_TIME = 1,
+ MIN = YEAR_MONTH,
+ MAX = DAY_TIME
+};
+
+inline const IntervalUnit (&EnumValuesIntervalUnit())[2] {
+ static const IntervalUnit values[] = {
+ IntervalUnit::YEAR_MONTH,
+ IntervalUnit::DAY_TIME
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesIntervalUnit() {
+ static const char * const names[3] = {
+ "YEAR_MONTH",
+ "DAY_TIME",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameIntervalUnit(IntervalUnit e) {
+ if (flatbuffers::IsOutRange(e, IntervalUnit::YEAR_MONTH, IntervalUnit::DAY_TIME)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesIntervalUnit()[index];
+}
+
+/// ----------------------------------------------------------------------
+/// Top-level Type value, enabling extensible type-specific metadata. We can
+/// add new logical types to Type without breaking backwards compatibility
+enum class Type : uint8_t {
+ NONE = 0,
+ Null = 1,
+ Int = 2,
+ FloatingPoint = 3,
+ Binary = 4,
+ Utf8 = 5,
+ Bool = 6,
+ Decimal = 7,
+ Date = 8,
+ Time = 9,
+ Timestamp = 10,
+ Interval = 11,
+ List = 12,
+ Struct_ = 13,
+ Union = 14,
+ FixedSizeBinary = 15,
+ FixedSizeList = 16,
+ Map = 17,
+ Duration = 18,
+ LargeBinary = 19,
+ LargeUtf8 = 20,
+ LargeList = 21,
+ MIN = NONE,
+ MAX = LargeList
+};
+
+inline const Type (&EnumValuesType())[22] {
+ static const Type values[] = {
+ Type::NONE,
+ Type::Null,
+ Type::Int,
+ Type::FloatingPoint,
+ Type::Binary,
+ Type::Utf8,
+ Type::Bool,
+ Type::Decimal,
+ Type::Date,
+ Type::Time,
+ Type::Timestamp,
+ Type::Interval,
+ Type::List,
+ Type::Struct_,
+ Type::Union,
+ Type::FixedSizeBinary,
+ Type::FixedSizeList,
+ Type::Map,
+ Type::Duration,
+ Type::LargeBinary,
+ Type::LargeUtf8,
+ Type::LargeList
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesType() {
+ static const char * const names[23] = {
+ "NONE",
+ "Null",
+ "Int",
+ "FloatingPoint",
+ "Binary",
+ "Utf8",
+ "Bool",
+ "Decimal",
+ "Date",
+ "Time",
+ "Timestamp",
+ "Interval",
+ "List",
+ "Struct_",
+ "Union",
+ "FixedSizeBinary",
+ "FixedSizeList",
+ "Map",
+ "Duration",
+ "LargeBinary",
+ "LargeUtf8",
+ "LargeList",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameType(Type e) {
+ if (flatbuffers::IsOutRange(e, Type::NONE, Type::LargeList)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesType()[index];
+}
+
+template<typename T> struct TypeTraits {
+ static const Type enum_value = Type::NONE;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Null> {
+ static const Type enum_value = Type::Null;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Int> {
+ static const Type enum_value = Type::Int;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::FloatingPoint> {
+ static const Type enum_value = Type::FloatingPoint;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Binary> {
+ static const Type enum_value = Type::Binary;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Utf8> {
+ static const Type enum_value = Type::Utf8;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Bool> {
+ static const Type enum_value = Type::Bool;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Decimal> {
+ static const Type enum_value = Type::Decimal;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Date> {
+ static const Type enum_value = Type::Date;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Time> {
+ static const Type enum_value = Type::Time;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Timestamp> {
+ static const Type enum_value = Type::Timestamp;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Interval> {
+ static const Type enum_value = Type::Interval;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::List> {
+ static const Type enum_value = Type::List;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Struct_> {
+ static const Type enum_value = Type::Struct_;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Union> {
+ static const Type enum_value = Type::Union;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::FixedSizeBinary> {
+ static const Type enum_value = Type::FixedSizeBinary;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::FixedSizeList> {
+ static const Type enum_value = Type::FixedSizeList;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Map> {
+ static const Type enum_value = Type::Map;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::Duration> {
+ static const Type enum_value = Type::Duration;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::LargeBinary> {
+ static const Type enum_value = Type::LargeBinary;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::LargeUtf8> {
+ static const Type enum_value = Type::LargeUtf8;
+};
+
+template<> struct TypeTraits<org::apache::arrow::flatbuf::LargeList> {
+ static const Type enum_value = Type::LargeList;
+};
+
+bool VerifyType(flatbuffers::Verifier &verifier, const void *obj, Type type);
+bool VerifyTypeVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types);
+
+/// ----------------------------------------------------------------------
+/// Dictionary encoding metadata
+/// Maintained for forwards compatibility; in the future, dictionaries might
+/// be explicit maps between integers and values, allowing for non-contiguous
+/// index values.
+enum class DictionaryKind : int16_t {
+ DenseArray = 0,
+ MIN = DenseArray,
+ MAX = DenseArray
+};
+
+inline const DictionaryKind (&EnumValuesDictionaryKind())[1] {
+ static const DictionaryKind values[] = {
+ DictionaryKind::DenseArray
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesDictionaryKind() {
+ static const char * const names[2] = {
+ "DenseArray",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameDictionaryKind(DictionaryKind e) {
+ if (flatbuffers::IsOutRange(e, DictionaryKind::DenseArray, DictionaryKind::DenseArray)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesDictionaryKind()[index];
+}
+
+/// ----------------------------------------------------------------------
+/// Endianness of the platform producing the data
+enum class Endianness : int16_t {
+ Little = 0,
+ Big = 1,
+ MIN = Little,
+ MAX = Big
+};
+
+inline const Endianness (&EnumValuesEndianness())[2] {
+ static const Endianness values[] = {
+ Endianness::Little,
+ Endianness::Big
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesEndianness() {
+ static const char * const names[3] = {
+ "Little",
+ "Big",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameEndianness(Endianness e) {
+ if (flatbuffers::IsOutRange(e, Endianness::Little, Endianness::Big)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesEndianness()[index];
+}
+
+/// ----------------------------------------------------------------------
+/// A Buffer represents a single contiguous memory segment
+FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Buffer FLATBUFFERS_FINAL_CLASS {
+ private:
+ int64_t offset_;
+ int64_t length_;
+
+ public:
+ Buffer() {
+ memset(static_cast<void *>(this), 0, sizeof(Buffer));
+ }
+ Buffer(int64_t _offset, int64_t _length)
+ : offset_(flatbuffers::EndianScalar(_offset)),
+ length_(flatbuffers::EndianScalar(_length)) {
+ }
+ /// The relative offset into the shared memory page where the bytes for this
+ /// buffer start
+ int64_t offset() const {
+ return flatbuffers::EndianScalar(offset_);
+ }
+ /// The absolute length (in bytes) of the memory buffer. The memory is found
+ /// from offset (inclusive) to offset + length (non-inclusive). When building
+ /// messages using the encapsulated IPC message, padding bytes may be written
+ /// after a buffer, but such padding bytes do not need to be accounted for in
+ /// the size here.
+ int64_t length() const {
+ return flatbuffers::EndianScalar(length_);
+ }
+};
+FLATBUFFERS_STRUCT_END(Buffer, 16);
+
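+/// Illustrative sketch (assuming the flatbuffers headers already pulled in by
+/// this file): Buffer is a fixed-size struct, so it is constructed directly
+/// rather than through a builder.
+///
+///   Buffer buf(/*offset=*/0, /*length=*/64);
+///   int64_t off = buf.offset();  // 0, decoded via EndianScalar
+///   int64_t len = buf.length();  // 64
+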
+/// These are stored in the flatbuffer in the Type union
+struct Null FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef NullBuilder Builder;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+};
+
+struct NullBuilder {
+ typedef Null Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit NullBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ NullBuilder &operator=(const NullBuilder &);
+ flatbuffers::Offset<Null> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Null>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Null> CreateNull(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ NullBuilder builder_(_fbb);
+ return builder_.Finish();
+}
+
+/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
+/// (according to the physical memory layout). We used Struct_ here as
+/// Struct is a reserved word in Flatbuffers
+struct Struct_ FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef Struct_Builder Builder;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+};
+
+struct Struct_Builder {
+ typedef Struct_ Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit Struct_Builder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ Struct_Builder &operator=(const Struct_Builder &);
+ flatbuffers::Offset<Struct_> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Struct_>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Struct_> CreateStruct_(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ Struct_Builder builder_(_fbb);
+ return builder_.Finish();
+}
+
+struct List FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef ListBuilder Builder;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+};
+
+struct ListBuilder {
+ typedef List Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit ListBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ ListBuilder &operator=(const ListBuilder &);
+ flatbuffers::Offset<List> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<List>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<List> CreateList(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ ListBuilder builder_(_fbb);
+ return builder_.Finish();
+}
+
+/// Same as List, but with 64-bit offsets, allowing representation of
+/// extremely large data values.
+struct LargeList FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef LargeListBuilder Builder;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+};
+
+struct LargeListBuilder {
+ typedef LargeList Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit LargeListBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ LargeListBuilder &operator=(const LargeListBuilder &);
+ flatbuffers::Offset<LargeList> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<LargeList>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<LargeList> CreateLargeList(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ LargeListBuilder builder_(_fbb);
+ return builder_.Finish();
+}
+
+struct FixedSizeList FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef FixedSizeListBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_LISTSIZE = 4
+ };
+ /// Number of list items per value
+ int32_t listSize() const {
+ return GetField<int32_t>(VT_LISTSIZE, 0);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int32_t>(verifier, VT_LISTSIZE) &&
+ verifier.EndTable();
+ }
+};
+
+struct FixedSizeListBuilder {
+ typedef FixedSizeList Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_listSize(int32_t listSize) {
+ fbb_.AddElement<int32_t>(FixedSizeList::VT_LISTSIZE, listSize, 0);
+ }
+ explicit FixedSizeListBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ FixedSizeListBuilder &operator=(const FixedSizeListBuilder &);
+ flatbuffers::Offset<FixedSizeList> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<FixedSizeList>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<FixedSizeList> CreateFixedSizeList(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int32_t listSize = 0) {
+ FixedSizeListBuilder builder_(_fbb);
+ builder_.add_listSize(listSize);
+ return builder_.Finish();
+}
+
+/// A Map is a logical nested type that is represented as
+///
+/// List<entries: Struct<key: K, value: V>>
+///
+/// In this layout, the keys and values are each respectively contiguous. We do
+/// not constrain the key and value types, so the application is responsible
+/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
+/// may be set in the metadata for this field.
+///
+/// In a field with Map type, the field has a child Struct field, which then
+/// has two children: the first the key type and the second the value type.
+/// The names of the child fields may be respectively "entries", "key", and
+/// "value", but this is not enforced.
+///
+/// Map
+/// - child[0] entries: Struct
+/// - child[0] key: K
+/// - child[1] value: V
+///
+/// Neither the "entries" field nor the "key" field may be nullable.
+///
+/// The metadata is structured so that Arrow systems without special handling
+/// for Map can make Map an alias for List. The "layout" attribute for the Map
+/// field must have the same contents as a List.
+struct Map FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef MapBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_KEYSSORTED = 4
+ };
+ /// Set to true if the keys within each value are sorted
+ bool keysSorted() const {
+ return GetField<uint8_t>(VT_KEYSSORTED, 0) != 0;
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<uint8_t>(verifier, VT_KEYSSORTED) &&
+ verifier.EndTable();
+ }
+};
+
+struct MapBuilder {
+ typedef Map Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_keysSorted(bool keysSorted) {
+ fbb_.AddElement<uint8_t>(Map::VT_KEYSSORTED, static_cast<uint8_t>(keysSorted), 0);
+ }
+ explicit MapBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ MapBuilder &operator=(const MapBuilder &);
+ flatbuffers::Offset<Map> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Map>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Map> CreateMap(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ bool keysSorted = false) {
+ MapBuilder builder_(_fbb);
+ builder_.add_keysSorted(keysSorted);
+ return builder_.Finish();
+}
+
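+/// Illustrative sketch: the Map table itself carries only the keysSorted
+/// flag; the entries/key/value layout lives in the Field children, as
+/// described above.
+///
+///   flatbuffers::FlatBufferBuilder fbb;
+///   auto m = CreateMap(fbb, /*keysSorted=*/true);
+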
+/// A union is a complex type with children in Field.
+/// By default, ids in the type vector refer to the offsets in the children;
+/// optionally, typeIds provides an indirection between the child offset and
+/// the type id: for each child, typeIds[offset] is the id used in the type vector.
+struct Union FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef UnionBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_MODE = 4,
+ VT_TYPEIDS = 6
+ };
+ org::apache::arrow::flatbuf::UnionMode mode() const {
+ return static_cast<org::apache::arrow::flatbuf::UnionMode>(GetField<int16_t>(VT_MODE, 0));
+ }
+ const flatbuffers::Vector<int32_t> *typeIds() const {
+ return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_TYPEIDS);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_MODE) &&
+ VerifyOffset(verifier, VT_TYPEIDS) &&
+ verifier.VerifyVector(typeIds()) &&
+ verifier.EndTable();
+ }
+};
+
+struct UnionBuilder {
+ typedef Union Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_mode(org::apache::arrow::flatbuf::UnionMode mode) {
+ fbb_.AddElement<int16_t>(Union::VT_MODE, static_cast<int16_t>(mode), 0);
+ }
+ void add_typeIds(flatbuffers::Offset<flatbuffers::Vector<int32_t>> typeIds) {
+ fbb_.AddOffset(Union::VT_TYPEIDS, typeIds);
+ }
+ explicit UnionBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ UnionBuilder &operator=(const UnionBuilder &);
+ flatbuffers::Offset<Union> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Union>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Union> CreateUnion(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::UnionMode mode = org::apache::arrow::flatbuf::UnionMode::Sparse,
+ flatbuffers::Offset<flatbuffers::Vector<int32_t>> typeIds = 0) {
+ UnionBuilder builder_(_fbb);
+ builder_.add_typeIds(typeIds);
+ builder_.add_mode(mode);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Union> CreateUnionDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::UnionMode mode = org::apache::arrow::flatbuf::UnionMode::Sparse,
+ const std::vector<int32_t> *typeIds = nullptr) {
+ auto typeIds__ = typeIds ? _fbb.CreateVector<int32_t>(*typeIds) : 0;
+ return org::apache::arrow::flatbuf::CreateUnion(
+ _fbb,
+ mode,
+ typeIds__);
+}
+
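+/// Illustrative sketch: encoding a sparse Union whose two children carry the
+/// hypothetical type ids 5 and 10 (ids chosen only for the example).
+///
+///   flatbuffers::FlatBufferBuilder fbb;
+///   std::vector<int32_t> ids = {5, 10};
+///   auto u = CreateUnionDirect(fbb, UnionMode::Sparse, &ids);
+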
+struct Int FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef IntBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_BITWIDTH = 4,
+ VT_IS_SIGNED = 6
+ };
+ int32_t bitWidth() const {
+ return GetField<int32_t>(VT_BITWIDTH, 0);
+ }
+ bool is_signed() const {
+ return GetField<uint8_t>(VT_IS_SIGNED, 0) != 0;
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int32_t>(verifier, VT_BITWIDTH) &&
+ VerifyField<uint8_t>(verifier, VT_IS_SIGNED) &&
+ verifier.EndTable();
+ }
+};
+
+struct IntBuilder {
+ typedef Int Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_bitWidth(int32_t bitWidth) {
+ fbb_.AddElement<int32_t>(Int::VT_BITWIDTH, bitWidth, 0);
+ }
+ void add_is_signed(bool is_signed) {
+ fbb_.AddElement<uint8_t>(Int::VT_IS_SIGNED, static_cast<uint8_t>(is_signed), 0);
+ }
+ explicit IntBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ IntBuilder &operator=(const IntBuilder &);
+ flatbuffers::Offset<Int> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Int>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Int> CreateInt(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int32_t bitWidth = 0,
+ bool is_signed = false) {
+ IntBuilder builder_(_fbb);
+ builder_.add_bitWidth(bitWidth);
+ builder_.add_is_signed(is_signed);
+ return builder_.Finish();
+}
+
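+/// Illustrative sketch: describing a signed 32-bit integer column type.
+///
+///   flatbuffers::FlatBufferBuilder fbb;
+///   auto i32 = CreateInt(fbb, /*bitWidth=*/32, /*is_signed=*/true);
+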
+struct FloatingPoint FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef FloatingPointBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_PRECISION = 4
+ };
+ org::apache::arrow::flatbuf::Precision precision() const {
+ return static_cast<org::apache::arrow::flatbuf::Precision>(GetField<int16_t>(VT_PRECISION, 0));
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_PRECISION) &&
+ verifier.EndTable();
+ }
+};
+
+struct FloatingPointBuilder {
+ typedef FloatingPoint Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_precision(org::apache::arrow::flatbuf::Precision precision) {
+ fbb_.AddElement<int16_t>(FloatingPoint::VT_PRECISION, static_cast<int16_t>(precision), 0);
+ }
+ explicit FloatingPointBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ FloatingPointBuilder &operator=(const FloatingPointBuilder &);
+ flatbuffers::Offset<FloatingPoint> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<FloatingPoint>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<FloatingPoint> CreateFloatingPoint(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::Precision precision = org::apache::arrow::flatbuf::Precision::HALF) {
+ FloatingPointBuilder builder_(_fbb);
+ builder_.add_precision(precision);
+ return builder_.Finish();
+}
+
+/// Unicode with UTF-8 encoding
+struct Utf8 FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef Utf8Builder Builder;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+};
+
+struct Utf8Builder {
+ typedef Utf8 Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit Utf8Builder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ Utf8Builder &operator=(const Utf8Builder &);
+ flatbuffers::Offset<Utf8> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Utf8>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Utf8> CreateUtf8(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ Utf8Builder builder_(_fbb);
+ return builder_.Finish();
+}
+
+/// Opaque binary data
+struct Binary FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef BinaryBuilder Builder;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+};
+
+struct BinaryBuilder {
+ typedef Binary Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit BinaryBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ BinaryBuilder &operator=(const BinaryBuilder &);
+ flatbuffers::Offset<Binary> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Binary>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Binary> CreateBinary(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ BinaryBuilder builder_(_fbb);
+ return builder_.Finish();
+}
+
+/// Same as Utf8, but with 64-bit offsets, allowing representation of
+/// extremely large data values.
+struct LargeUtf8 FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef LargeUtf8Builder Builder;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+};
+
+struct LargeUtf8Builder {
+ typedef LargeUtf8 Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit LargeUtf8Builder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ LargeUtf8Builder &operator=(const LargeUtf8Builder &);
+ flatbuffers::Offset<LargeUtf8> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<LargeUtf8>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<LargeUtf8> CreateLargeUtf8(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ LargeUtf8Builder builder_(_fbb);
+ return builder_.Finish();
+}
+
+/// Same as Binary, but with 64-bit offsets, allowing representation of
+/// extremely large data values.
+struct LargeBinary FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef LargeBinaryBuilder Builder;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+};
+
+struct LargeBinaryBuilder {
+ typedef LargeBinary Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit LargeBinaryBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ LargeBinaryBuilder &operator=(const LargeBinaryBuilder &);
+ flatbuffers::Offset<LargeBinary> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<LargeBinary>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<LargeBinary> CreateLargeBinary(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ LargeBinaryBuilder builder_(_fbb);
+ return builder_.Finish();
+}
+
+struct FixedSizeBinary FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef FixedSizeBinaryBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_BYTEWIDTH = 4
+ };
+ /// Number of bytes per value
+ int32_t byteWidth() const {
+ return GetField<int32_t>(VT_BYTEWIDTH, 0);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int32_t>(verifier, VT_BYTEWIDTH) &&
+ verifier.EndTable();
+ }
+};
+
+struct FixedSizeBinaryBuilder {
+ typedef FixedSizeBinary Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_byteWidth(int32_t byteWidth) {
+ fbb_.AddElement<int32_t>(FixedSizeBinary::VT_BYTEWIDTH, byteWidth, 0);
+ }
+ explicit FixedSizeBinaryBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ FixedSizeBinaryBuilder &operator=(const FixedSizeBinaryBuilder &);
+ flatbuffers::Offset<FixedSizeBinary> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<FixedSizeBinary>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<FixedSizeBinary> CreateFixedSizeBinary(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int32_t byteWidth = 0) {
+ FixedSizeBinaryBuilder builder_(_fbb);
+ builder_.add_byteWidth(byteWidth);
+ return builder_.Finish();
+}
+
+struct Bool FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef BoolBuilder Builder;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+};
+
+struct BoolBuilder {
+ typedef Bool Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit BoolBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ BoolBuilder &operator=(const BoolBuilder &);
+ flatbuffers::Offset<Bool> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Bool>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Bool> CreateBool(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ BoolBuilder builder_(_fbb);
+ return builder_.Finish();
+}
+
+/// Exact decimal value represented as an integer value in two's
+/// complement. Currently only 128-bit (16-byte) integers are used but this may
+/// be expanded in the future. The representation uses the endianness indicated
+/// in the Schema.
+struct Decimal FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef DecimalBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_PRECISION = 4,
+ VT_SCALE = 6,
+ VT_BITWIDTH = 8
+ };
+ /// Total number of decimal digits
+ int32_t precision() const {
+ return GetField<int32_t>(VT_PRECISION, 0);
+ }
+ /// Number of digits after the decimal point "."
+ int32_t scale() const {
+ return GetField<int32_t>(VT_SCALE, 0);
+ }
+ /// Number of bits per value. The only accepted width right now is 128 but
+ /// this field exists for forward compatibility so that other bit widths may
+ /// be supported in future format versions. We use bitWidth for consistency
+ /// with Int::bitWidth.
+ int32_t bitWidth() const {
+ return GetField<int32_t>(VT_BITWIDTH, 128);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int32_t>(verifier, VT_PRECISION) &&
+ VerifyField<int32_t>(verifier, VT_SCALE) &&
+ VerifyField<int32_t>(verifier, VT_BITWIDTH) &&
+ verifier.EndTable();
+ }
+};
+
+struct DecimalBuilder {
+ typedef Decimal Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_precision(int32_t precision) {
+ fbb_.AddElement<int32_t>(Decimal::VT_PRECISION, precision, 0);
+ }
+ void add_scale(int32_t scale) {
+ fbb_.AddElement<int32_t>(Decimal::VT_SCALE, scale, 0);
+ }
+ void add_bitWidth(int32_t bitWidth) {
+ fbb_.AddElement<int32_t>(Decimal::VT_BITWIDTH, bitWidth, 128);
+ }
+ explicit DecimalBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ DecimalBuilder &operator=(const DecimalBuilder &);
+ flatbuffers::Offset<Decimal> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Decimal>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Decimal> CreateDecimal(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int32_t precision = 0,
+ int32_t scale = 0,
+ int32_t bitWidth = 128) {
+ DecimalBuilder builder_(_fbb);
+ builder_.add_bitWidth(bitWidth);
+ builder_.add_scale(scale);
+ builder_.add_precision(precision);
+ return builder_.Finish();
+}
+
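+/// Illustrative sketch: a decimal(10, 2) type; bitWidth defaults to 128, the
+/// only width currently accepted.
+///
+///   flatbuffers::FlatBufferBuilder fbb;
+///   auto dec = CreateDecimal(fbb, /*precision=*/10, /*scale=*/2);
+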
+/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX
+/// epoch (1970-01-01), stored in either of two units:
+///
+/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
+/// leap seconds), where the values are evenly divisible by 86400000
+/// * Days (32 bits) since the UNIX epoch
+struct Date FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef DateBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_UNIT = 4
+ };
+ org::apache::arrow::flatbuf::DateUnit unit() const {
+ return static_cast<org::apache::arrow::flatbuf::DateUnit>(GetField<int16_t>(VT_UNIT, 1));
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_UNIT) &&
+ verifier.EndTable();
+ }
+};
+
+struct DateBuilder {
+ typedef Date Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_unit(org::apache::arrow::flatbuf::DateUnit unit) {
+ fbb_.AddElement<int16_t>(Date::VT_UNIT, static_cast<int16_t>(unit), 1);
+ }
+ explicit DateBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ DateBuilder &operator=(const DateBuilder &);
+ flatbuffers::Offset<Date> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Date>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Date> CreateDate(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::DateUnit unit = org::apache::arrow::flatbuf::DateUnit::MILLISECOND) {
+ DateBuilder builder_(_fbb);
+ builder_.add_unit(unit);
+ return builder_.Finish();
+}
+
+/// Time type. The physical storage type depends on the unit
+/// - SECOND and MILLISECOND: 32 bits
+/// - MICROSECOND and NANOSECOND: 64 bits
+struct Time FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef TimeBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_UNIT = 4,
+ VT_BITWIDTH = 6
+ };
+ org::apache::arrow::flatbuf::TimeUnit unit() const {
+ return static_cast<org::apache::arrow::flatbuf::TimeUnit>(GetField<int16_t>(VT_UNIT, 1));
+ }
+ int32_t bitWidth() const {
+ return GetField<int32_t>(VT_BITWIDTH, 32);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_UNIT) &&
+ VerifyField<int32_t>(verifier, VT_BITWIDTH) &&
+ verifier.EndTable();
+ }
+};
+
+struct TimeBuilder {
+ typedef Time Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_unit(org::apache::arrow::flatbuf::TimeUnit unit) {
+ fbb_.AddElement<int16_t>(Time::VT_UNIT, static_cast<int16_t>(unit), 1);
+ }
+ void add_bitWidth(int32_t bitWidth) {
+ fbb_.AddElement<int32_t>(Time::VT_BITWIDTH, bitWidth, 32);
+ }
+ explicit TimeBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ TimeBuilder &operator=(const TimeBuilder &);
+ flatbuffers::Offset<Time> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Time>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Time> CreateTime(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::TimeUnit unit = org::apache::arrow::flatbuf::TimeUnit::MILLISECOND,
+ int32_t bitWidth = 32) {
+ TimeBuilder builder_(_fbb);
+ builder_.add_bitWidth(bitWidth);
+ builder_.add_unit(unit);
+ return builder_.Finish();
+}
+
+/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970,
+/// excluding leap seconds, as a 64-bit integer.
+///
+/// The Timestamp metadata supports both "time zone naive" and "time zone
+/// aware" timestamps. See the timezone attribute below for more detail.
+struct Timestamp FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef TimestampBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_UNIT = 4,
+ VT_TIMEZONE = 6
+ };
+ org::apache::arrow::flatbuf::TimeUnit unit() const {
+ return static_cast<org::apache::arrow::flatbuf::TimeUnit>(GetField<int16_t>(VT_UNIT, 0));
+ }
+ /// The time zone is a string indicating the name of a time zone, one of:
+ ///
+ /// * As used in the Olson time zone database (the "tz database" or
+ /// "tzdata"), such as "America/New_York"
+ /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+ ///
+ /// Whether a timezone string is present indicates different semantics about
+ /// the data:
+ ///
+ /// * If the time zone is null or equal to an empty string, the data is "time
+ /// zone naive" and shall be displayed *as is* to the user, not localized
+ /// to the locale of the user. This data can be thought of as UTC but
+ /// without having "UTC" as the time zone; it is not considered to be
+ /// localized to any time zone
+ ///
+ /// * If the time zone is set to a valid value, values can be displayed as
+ /// "localized" to that time zone, even though the underlying 64-bit
+ /// integers are identical to the same data stored in UTC. Converting
+ /// between time zones is a metadata-only operation and does not change the
+ /// underlying values
+ const flatbuffers::String *timezone() const {
+ return GetPointer<const flatbuffers::String *>(VT_TIMEZONE);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_UNIT) &&
+ VerifyOffset(verifier, VT_TIMEZONE) &&
+ verifier.VerifyString(timezone()) &&
+ verifier.EndTable();
+ }
+};
+
+struct TimestampBuilder {
+ typedef Timestamp Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_unit(org::apache::arrow::flatbuf::TimeUnit unit) {
+ fbb_.AddElement<int16_t>(Timestamp::VT_UNIT, static_cast<int16_t>(unit), 0);
+ }
+ void add_timezone(flatbuffers::Offset<flatbuffers::String> timezone) {
+ fbb_.AddOffset(Timestamp::VT_TIMEZONE, timezone);
+ }
+ explicit TimestampBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ TimestampBuilder &operator=(const TimestampBuilder &);
+ flatbuffers::Offset<Timestamp> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Timestamp>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Timestamp> CreateTimestamp(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::TimeUnit unit = org::apache::arrow::flatbuf::TimeUnit::SECOND,
+ flatbuffers::Offset<flatbuffers::String> timezone = 0) {
+ TimestampBuilder builder_(_fbb);
+ builder_.add_timezone(timezone);
+ builder_.add_unit(unit);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Timestamp> CreateTimestampDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::TimeUnit unit = org::apache::arrow::flatbuf::TimeUnit::SECOND,
+ const char *timezone = nullptr) {
+ auto timezone__ = timezone ? _fbb.CreateString(timezone) : 0;
+ return org::apache::arrow::flatbuf::CreateTimestamp(
+ _fbb,
+ unit,
+ timezone__);
+}
+
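+/// Illustrative sketch: a "time zone aware" millisecond timestamp versus a
+/// "time zone naive" one (no timezone string supplied).
+///
+///   flatbuffers::FlatBufferBuilder fbb;
+///   auto aware = CreateTimestampDirect(fbb, TimeUnit::MILLISECOND,
+///                                      "America/New_York");
+///   auto naive = CreateTimestampDirect(fbb, TimeUnit::MILLISECOND);
+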
+struct Interval FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef IntervalBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_UNIT = 4
+ };
+ org::apache::arrow::flatbuf::IntervalUnit unit() const {
+ return static_cast<org::apache::arrow::flatbuf::IntervalUnit>(GetField<int16_t>(VT_UNIT, 0));
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_UNIT) &&
+ verifier.EndTable();
+ }
+};
+
+struct IntervalBuilder {
+ typedef Interval Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_unit(org::apache::arrow::flatbuf::IntervalUnit unit) {
+ fbb_.AddElement<int16_t>(Interval::VT_UNIT, static_cast<int16_t>(unit), 0);
+ }
+ explicit IntervalBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ IntervalBuilder &operator=(const IntervalBuilder &);
+ flatbuffers::Offset<Interval> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Interval>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Interval> CreateInterval(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::IntervalUnit unit = org::apache::arrow::flatbuf::IntervalUnit::YEAR_MONTH) {
+ IntervalBuilder builder_(_fbb);
+ builder_.add_unit(unit);
+ return builder_.Finish();
+}
+
+struct Duration FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef DurationBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_UNIT = 4
+ };
+ org::apache::arrow::flatbuf::TimeUnit unit() const {
+ return static_cast<org::apache::arrow::flatbuf::TimeUnit>(GetField<int16_t>(VT_UNIT, 1));
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_UNIT) &&
+ verifier.EndTable();
+ }
+};
+
+struct DurationBuilder {
+ typedef Duration Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_unit(org::apache::arrow::flatbuf::TimeUnit unit) {
+ fbb_.AddElement<int16_t>(Duration::VT_UNIT, static_cast<int16_t>(unit), 1);
+ }
+ explicit DurationBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ DurationBuilder &operator=(const DurationBuilder &);
+ flatbuffers::Offset<Duration> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Duration>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Duration> CreateDuration(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::TimeUnit unit = org::apache::arrow::flatbuf::TimeUnit::MILLISECOND) {
+ DurationBuilder builder_(_fbb);
+ builder_.add_unit(unit);
+ return builder_.Finish();
+}
+
+/// ----------------------------------------------------------------------
+/// User-defined key/value pairs to add custom metadata to Arrow.
+/// Key namespacing is the responsibility of the user.
+struct KeyValue FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef KeyValueBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_KEY = 4,
+ VT_VALUE = 6
+ };
+ const flatbuffers::String *key() const {
+ return GetPointer<const flatbuffers::String *>(VT_KEY);
+ }
+ const flatbuffers::String *value() const {
+ return GetPointer<const flatbuffers::String *>(VT_VALUE);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyOffset(verifier, VT_KEY) &&
+ verifier.VerifyString(key()) &&
+ VerifyOffset(verifier, VT_VALUE) &&
+ verifier.VerifyString(value()) &&
+ verifier.EndTable();
+ }
+};
+
+struct KeyValueBuilder {
+ typedef KeyValue Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_key(flatbuffers::Offset<flatbuffers::String> key) {
+ fbb_.AddOffset(KeyValue::VT_KEY, key);
+ }
+ void add_value(flatbuffers::Offset<flatbuffers::String> value) {
+ fbb_.AddOffset(KeyValue::VT_VALUE, value);
+ }
+ explicit KeyValueBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ KeyValueBuilder &operator=(const KeyValueBuilder &);
+ flatbuffers::Offset<KeyValue> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<KeyValue>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<KeyValue> CreateKeyValue(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ flatbuffers::Offset<flatbuffers::String> key = 0,
+ flatbuffers::Offset<flatbuffers::String> value = 0) {
+ KeyValueBuilder builder_(_fbb);
+ builder_.add_value(value);
+ builder_.add_key(key);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<KeyValue> CreateKeyValueDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ const char *key = nullptr,
+ const char *value = nullptr) {
+ auto key__ = key ? _fbb.CreateString(key) : 0;
+ auto value__ = value ? _fbb.CreateString(value) : 0;
+ return org::apache::arrow::flatbuf::CreateKeyValue(
+ _fbb,
+ key__,
+ value__);
+}
+
+struct DictionaryEncoding FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef DictionaryEncodingBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_ID = 4,
+ VT_INDEXTYPE = 6,
+ VT_ISORDERED = 8,
+ VT_DICTIONARYKIND = 10
+ };
+ /// The known dictionary id in the application where this data is used. In
+ /// the file or streaming formats, the dictionary ids are found in the
+ /// DictionaryBatch messages
+ int64_t id() const {
+ return GetField<int64_t>(VT_ID, 0);
+ }
+ /// The dictionary indices are constrained to be non-negative integers. If
+ /// this field is null, the indices must be signed int32. To maximize
+ /// cross-language compatibility and performance, implementations are
+ /// recommended to prefer signed integer types over unsigned integer types
+ /// and to avoid uint64 indices unless they are required by an application.
+ const org::apache::arrow::flatbuf::Int *indexType() const {
+ return GetPointer<const org::apache::arrow::flatbuf::Int *>(VT_INDEXTYPE);
+ }
+ /// By default, dictionaries are not ordered, or the order does not have
+ /// semantic meaning. In some statistical applications, dictionary-encoding
+ /// is used to represent ordered categorical data, and we provide a way to
+ /// preserve that metadata here
+ bool isOrdered() const {
+ return GetField<uint8_t>(VT_ISORDERED, 0) != 0;
+ }
+ org::apache::arrow::flatbuf::DictionaryKind dictionaryKind() const {
+ return static_cast<org::apache::arrow::flatbuf::DictionaryKind>(GetField<int16_t>(VT_DICTIONARYKIND, 0));
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int64_t>(verifier, VT_ID) &&
+ VerifyOffset(verifier, VT_INDEXTYPE) &&
+ verifier.VerifyTable(indexType()) &&
+ VerifyField<uint8_t>(verifier, VT_ISORDERED) &&
+ VerifyField<int16_t>(verifier, VT_DICTIONARYKIND) &&
+ verifier.EndTable();
+ }
+};
+
+struct DictionaryEncodingBuilder {
+ typedef DictionaryEncoding Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_id(int64_t id) {
+ fbb_.AddElement<int64_t>(DictionaryEncoding::VT_ID, id, 0);
+ }
+ void add_indexType(flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indexType) {
+ fbb_.AddOffset(DictionaryEncoding::VT_INDEXTYPE, indexType);
+ }
+ void add_isOrdered(bool isOrdered) {
+ fbb_.AddElement<uint8_t>(DictionaryEncoding::VT_ISORDERED, static_cast<uint8_t>(isOrdered), 0);
+ }
+ void add_dictionaryKind(org::apache::arrow::flatbuf::DictionaryKind dictionaryKind) {
+ fbb_.AddElement<int16_t>(DictionaryEncoding::VT_DICTIONARYKIND, static_cast<int16_t>(dictionaryKind), 0);
+ }
+ explicit DictionaryEncodingBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ DictionaryEncodingBuilder &operator=(const DictionaryEncodingBuilder &);
+ flatbuffers::Offset<DictionaryEncoding> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<DictionaryEncoding>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<DictionaryEncoding> CreateDictionaryEncoding(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int64_t id = 0,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indexType = 0,
+ bool isOrdered = false,
+ org::apache::arrow::flatbuf::DictionaryKind dictionaryKind = org::apache::arrow::flatbuf::DictionaryKind::DenseArray) {
+ DictionaryEncodingBuilder builder_(_fbb);
+ builder_.add_id(id);
+ builder_.add_indexType(indexType);
+ builder_.add_dictionaryKind(dictionaryKind);
+ builder_.add_isOrdered(isOrdered);
+ return builder_.Finish();
+}
+
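+/// Illustrative sketch: dictionary encoding with signed int32 indices, as the
+/// indexType comment above recommends.
+///
+///   flatbuffers::FlatBufferBuilder fbb;
+///   auto idx = CreateInt(fbb, /*bitWidth=*/32, /*is_signed=*/true);
+///   auto enc = CreateDictionaryEncoding(fbb, /*id=*/0, idx,
+///                                       /*isOrdered=*/false);
+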
+/// ----------------------------------------------------------------------
+/// A field represents a named column in a record / row batch or child of a
+/// nested type.
+struct Field FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef FieldBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_NAME = 4,
+ VT_NULLABLE = 6,
+ VT_TYPE_TYPE = 8,
+ VT_TYPE = 10,
+ VT_DICTIONARY = 12,
+ VT_CHILDREN = 14,
+ VT_CUSTOM_METADATA = 16
+ };
+ /// Name is not required, e.g. in a List
+ const flatbuffers::String *name() const {
+ return GetPointer<const flatbuffers::String *>(VT_NAME);
+ }
+ /// Whether or not this field can contain nulls. Should be true in general.
+ bool nullable() const {
+ return GetField<uint8_t>(VT_NULLABLE, 0) != 0;
+ }
+ org::apache::arrow::flatbuf::Type type_type() const {
+ return static_cast<org::apache::arrow::flatbuf::Type>(GetField<uint8_t>(VT_TYPE_TYPE, 0));
+ }
+ /// This is the type of the decoded value if the field is dictionary encoded.
+ const void *type() const {
+ return GetPointer<const void *>(VT_TYPE);
+ }
+ template<typename T> const T *type_as() const;
+ const org::apache::arrow::flatbuf::Null *type_as_Null() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Null ? static_cast<const org::apache::arrow::flatbuf::Null *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Int *type_as_Int() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Int ? static_cast<const org::apache::arrow::flatbuf::Int *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::FloatingPoint *type_as_FloatingPoint() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::FloatingPoint ? static_cast<const org::apache::arrow::flatbuf::FloatingPoint *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Binary *type_as_Binary() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Binary ? static_cast<const org::apache::arrow::flatbuf::Binary *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Utf8 *type_as_Utf8() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Utf8 ? static_cast<const org::apache::arrow::flatbuf::Utf8 *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Bool *type_as_Bool() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Bool ? static_cast<const org::apache::arrow::flatbuf::Bool *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Decimal *type_as_Decimal() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Decimal ? static_cast<const org::apache::arrow::flatbuf::Decimal *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Date *type_as_Date() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Date ? static_cast<const org::apache::arrow::flatbuf::Date *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Time *type_as_Time() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Time ? static_cast<const org::apache::arrow::flatbuf::Time *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Timestamp *type_as_Timestamp() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Timestamp ? static_cast<const org::apache::arrow::flatbuf::Timestamp *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Interval *type_as_Interval() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Interval ? static_cast<const org::apache::arrow::flatbuf::Interval *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::List *type_as_List() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::List ? static_cast<const org::apache::arrow::flatbuf::List *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Struct_ *type_as_Struct_() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Struct_ ? static_cast<const org::apache::arrow::flatbuf::Struct_ *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Union *type_as_Union() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Union ? static_cast<const org::apache::arrow::flatbuf::Union *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::FixedSizeBinary *type_as_FixedSizeBinary() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::FixedSizeBinary ? static_cast<const org::apache::arrow::flatbuf::FixedSizeBinary *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::FixedSizeList *type_as_FixedSizeList() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::FixedSizeList ? static_cast<const org::apache::arrow::flatbuf::FixedSizeList *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Map *type_as_Map() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Map ? static_cast<const org::apache::arrow::flatbuf::Map *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Duration *type_as_Duration() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Duration ? static_cast<const org::apache::arrow::flatbuf::Duration *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::LargeBinary *type_as_LargeBinary() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::LargeBinary ? static_cast<const org::apache::arrow::flatbuf::LargeBinary *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::LargeUtf8 *type_as_LargeUtf8() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::LargeUtf8 ? static_cast<const org::apache::arrow::flatbuf::LargeUtf8 *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::LargeList *type_as_LargeList() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::LargeList ? static_cast<const org::apache::arrow::flatbuf::LargeList *>(type()) : nullptr;
+ }
+ /// Present only if the field is dictionary encoded.
+ const org::apache::arrow::flatbuf::DictionaryEncoding *dictionary() const {
+ return GetPointer<const org::apache::arrow::flatbuf::DictionaryEncoding *>(VT_DICTIONARY);
+ }
+ /// children apply only to nested data types like Struct, List and Union. For
+ /// primitive types children will have length 0.
+ const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>> *children() const {
+ return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>> *>(VT_CHILDREN);
+ }
+ /// User-defined metadata
+ const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *custom_metadata() const {
+ return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *>(VT_CUSTOM_METADATA);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyOffset(verifier, VT_NAME) &&
+ verifier.VerifyString(name()) &&
+ VerifyField<uint8_t>(verifier, VT_NULLABLE) &&
+ VerifyField<uint8_t>(verifier, VT_TYPE_TYPE) &&
+ VerifyOffset(verifier, VT_TYPE) &&
+ VerifyType(verifier, type(), type_type()) &&
+ VerifyOffset(verifier, VT_DICTIONARY) &&
+ verifier.VerifyTable(dictionary()) &&
+ VerifyOffset(verifier, VT_CHILDREN) &&
+ verifier.VerifyVector(children()) &&
+ verifier.VerifyVectorOfTables(children()) &&
+ VerifyOffset(verifier, VT_CUSTOM_METADATA) &&
+ verifier.VerifyVector(custom_metadata()) &&
+ verifier.VerifyVectorOfTables(custom_metadata()) &&
+ verifier.EndTable();
+ }
+};
+
+template<> inline const org::apache::arrow::flatbuf::Null *Field::type_as<org::apache::arrow::flatbuf::Null>() const {
+ return type_as_Null();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Int *Field::type_as<org::apache::arrow::flatbuf::Int>() const {
+ return type_as_Int();
+}
+
+template<> inline const org::apache::arrow::flatbuf::FloatingPoint *Field::type_as<org::apache::arrow::flatbuf::FloatingPoint>() const {
+ return type_as_FloatingPoint();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Binary *Field::type_as<org::apache::arrow::flatbuf::Binary>() const {
+ return type_as_Binary();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Utf8 *Field::type_as<org::apache::arrow::flatbuf::Utf8>() const {
+ return type_as_Utf8();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Bool *Field::type_as<org::apache::arrow::flatbuf::Bool>() const {
+ return type_as_Bool();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Decimal *Field::type_as<org::apache::arrow::flatbuf::Decimal>() const {
+ return type_as_Decimal();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Date *Field::type_as<org::apache::arrow::flatbuf::Date>() const {
+ return type_as_Date();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Time *Field::type_as<org::apache::arrow::flatbuf::Time>() const {
+ return type_as_Time();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Timestamp *Field::type_as<org::apache::arrow::flatbuf::Timestamp>() const {
+ return type_as_Timestamp();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Interval *Field::type_as<org::apache::arrow::flatbuf::Interval>() const {
+ return type_as_Interval();
+}
+
+template<> inline const org::apache::arrow::flatbuf::List *Field::type_as<org::apache::arrow::flatbuf::List>() const {
+ return type_as_List();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Struct_ *Field::type_as<org::apache::arrow::flatbuf::Struct_>() const {
+ return type_as_Struct_();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Union *Field::type_as<org::apache::arrow::flatbuf::Union>() const {
+ return type_as_Union();
+}
+
+template<> inline const org::apache::arrow::flatbuf::FixedSizeBinary *Field::type_as<org::apache::arrow::flatbuf::FixedSizeBinary>() const {
+ return type_as_FixedSizeBinary();
+}
+
+template<> inline const org::apache::arrow::flatbuf::FixedSizeList *Field::type_as<org::apache::arrow::flatbuf::FixedSizeList>() const {
+ return type_as_FixedSizeList();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Map *Field::type_as<org::apache::arrow::flatbuf::Map>() const {
+ return type_as_Map();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Duration *Field::type_as<org::apache::arrow::flatbuf::Duration>() const {
+ return type_as_Duration();
+}
+
+template<> inline const org::apache::arrow::flatbuf::LargeBinary *Field::type_as<org::apache::arrow::flatbuf::LargeBinary>() const {
+ return type_as_LargeBinary();
+}
+
+template<> inline const org::apache::arrow::flatbuf::LargeUtf8 *Field::type_as<org::apache::arrow::flatbuf::LargeUtf8>() const {
+ return type_as_LargeUtf8();
+}
+
+template<> inline const org::apache::arrow::flatbuf::LargeList *Field::type_as<org::apache::arrow::flatbuf::LargeList>() const {
+ return type_as_LargeList();
+}
+
+struct FieldBuilder {
+ typedef Field Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_name(flatbuffers::Offset<flatbuffers::String> name) {
+ fbb_.AddOffset(Field::VT_NAME, name);
+ }
+ void add_nullable(bool nullable) {
+ fbb_.AddElement<uint8_t>(Field::VT_NULLABLE, static_cast<uint8_t>(nullable), 0);
+ }
+ void add_type_type(org::apache::arrow::flatbuf::Type type_type) {
+ fbb_.AddElement<uint8_t>(Field::VT_TYPE_TYPE, static_cast<uint8_t>(type_type), 0);
+ }
+ void add_type(flatbuffers::Offset<void> type) {
+ fbb_.AddOffset(Field::VT_TYPE, type);
+ }
+ void add_dictionary(flatbuffers::Offset<org::apache::arrow::flatbuf::DictionaryEncoding> dictionary) {
+ fbb_.AddOffset(Field::VT_DICTIONARY, dictionary);
+ }
+ void add_children(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>>> children) {
+ fbb_.AddOffset(Field::VT_CHILDREN, children);
+ }
+ void add_custom_metadata(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>> custom_metadata) {
+ fbb_.AddOffset(Field::VT_CUSTOM_METADATA, custom_metadata);
+ }
+ explicit FieldBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ FieldBuilder &operator=(const FieldBuilder &);
+ flatbuffers::Offset<Field> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Field>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Field> CreateField(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ flatbuffers::Offset<flatbuffers::String> name = 0,
+ bool nullable = false,
+ org::apache::arrow::flatbuf::Type type_type = org::apache::arrow::flatbuf::Type::NONE,
+ flatbuffers::Offset<void> type = 0,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::DictionaryEncoding> dictionary = 0,
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>>> children = 0,
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>> custom_metadata = 0) {
+ FieldBuilder builder_(_fbb);
+ builder_.add_custom_metadata(custom_metadata);
+ builder_.add_children(children);
+ builder_.add_dictionary(dictionary);
+ builder_.add_type(type);
+ builder_.add_name(name);
+ builder_.add_type_type(type_type);
+ builder_.add_nullable(nullable);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Field> CreateFieldDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ const char *name = nullptr,
+ bool nullable = false,
+ org::apache::arrow::flatbuf::Type type_type = org::apache::arrow::flatbuf::Type::NONE,
+ flatbuffers::Offset<void> type = 0,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::DictionaryEncoding> dictionary = 0,
+ const std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>> *children = nullptr,
+ const std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *custom_metadata = nullptr) {
+ auto name__ = name ? _fbb.CreateString(name) : 0;
+ auto children__ = children ? _fbb.CreateVector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>>(*children) : 0;
+ auto custom_metadata__ = custom_metadata ? _fbb.CreateVector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>(*custom_metadata) : 0;
+ return org::apache::arrow::flatbuf::CreateField(
+ _fbb,
+ name__,
+ nullable,
+ type_type,
+ type,
+ dictionary,
+ children__,
+ custom_metadata__);
+}
+
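+/// Illustrative sketch: a nullable int32 field named "id". The type is stored
+/// as a (type_type, type) union pair, so the typed Offset is passed through
+/// its Union() accessor.
+///
+///   flatbuffers::FlatBufferBuilder fbb;
+///   auto i32 = CreateInt(fbb, /*bitWidth=*/32, /*is_signed=*/true);
+///   auto field = CreateFieldDirect(fbb, "id", /*nullable=*/true,
+///                                  Type::Int, i32.Union());
+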
+/// ----------------------------------------------------------------------
+/// A Schema describes the columns in a row batch
+struct Schema FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef SchemaBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_ENDIANNESS = 4,
+ VT_FIELDS = 6,
+ VT_CUSTOM_METADATA = 8,
+ VT_FEATURES = 10
+ };
+ /// Endianness of the buffer. Little Endian by default; if the endianness
+ /// does not match the underlying system, the vectors need to be converted.
+ org::apache::arrow::flatbuf::Endianness endianness() const {
+ return static_cast<org::apache::arrow::flatbuf::Endianness>(GetField<int16_t>(VT_ENDIANNESS, 0));
+ }
+ const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>> *fields() const {
+ return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>> *>(VT_FIELDS);
+ }
+ const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *custom_metadata() const {
+ return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *>(VT_CUSTOM_METADATA);
+ }
+ /// Features used in the stream/file.
+ const flatbuffers::Vector<int64_t> *features() const {
+ return GetPointer<const flatbuffers::Vector<int64_t> *>(VT_FEATURES);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_ENDIANNESS) &&
+ VerifyOffset(verifier, VT_FIELDS) &&
+ verifier.VerifyVector(fields()) &&
+ verifier.VerifyVectorOfTables(fields()) &&
+ VerifyOffset(verifier, VT_CUSTOM_METADATA) &&
+ verifier.VerifyVector(custom_metadata()) &&
+ verifier.VerifyVectorOfTables(custom_metadata()) &&
+ VerifyOffset(verifier, VT_FEATURES) &&
+ verifier.VerifyVector(features()) &&
+ verifier.EndTable();
+ }
+};
+
+struct SchemaBuilder {
+ typedef Schema Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_endianness(org::apache::arrow::flatbuf::Endianness endianness) {
+ fbb_.AddElement<int16_t>(Schema::VT_ENDIANNESS, static_cast<int16_t>(endianness), 0);
+ }
+ void add_fields(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>>> fields) {
+ fbb_.AddOffset(Schema::VT_FIELDS, fields);
+ }
+ void add_custom_metadata(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>> custom_metadata) {
+ fbb_.AddOffset(Schema::VT_CUSTOM_METADATA, custom_metadata);
+ }
+ void add_features(flatbuffers::Offset<flatbuffers::Vector<int64_t>> features) {
+ fbb_.AddOffset(Schema::VT_FEATURES, features);
+ }
+ explicit SchemaBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ SchemaBuilder &operator=(const SchemaBuilder &);
+ flatbuffers::Offset<Schema> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Schema>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Schema> CreateSchema(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::Endianness endianness = org::apache::arrow::flatbuf::Endianness::Little,
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>>> fields = 0,
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>> custom_metadata = 0,
+ flatbuffers::Offset<flatbuffers::Vector<int64_t>> features = 0) {
+ SchemaBuilder builder_(_fbb);
+ builder_.add_features(features);
+ builder_.add_custom_metadata(custom_metadata);
+ builder_.add_fields(fields);
+ builder_.add_endianness(endianness);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Schema> CreateSchemaDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::Endianness endianness = org::apache::arrow::flatbuf::Endianness::Little,
+ const std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>> *fields = nullptr,
+ const std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>> *custom_metadata = nullptr,
+ const std::vector<int64_t> *features = nullptr) {
+ auto fields__ = fields ? _fbb.CreateVector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>>(*fields) : 0;
+ auto custom_metadata__ = custom_metadata ? _fbb.CreateVector<flatbuffers::Offset<org::apache::arrow::flatbuf::KeyValue>>(*custom_metadata) : 0;
+ auto features__ = features ? _fbb.CreateVector<int64_t>(*features) : 0;
+ return org::apache::arrow::flatbuf::CreateSchema(
+ _fbb,
+ endianness,
+ fields__,
+ custom_metadata__,
+ features__);
+}
+
+inline bool VerifyType(flatbuffers::Verifier &verifier, const void *obj, Type type) {
+ switch (type) {
+ case Type::NONE: {
+ return true;
+ }
+ case Type::Null: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Null *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Int: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Int *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::FloatingPoint: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::FloatingPoint *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Binary: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Binary *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Utf8: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Utf8 *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Bool: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Bool *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Decimal: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Decimal *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Date: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Date *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Time: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Time *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Timestamp: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Timestamp *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Interval: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Interval *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::List: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::List *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Struct_: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Struct_ *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Union: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Union *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::FixedSizeBinary: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::FixedSizeBinary *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::FixedSizeList: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::FixedSizeList *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Map: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Map *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::Duration: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::Duration *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::LargeBinary: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::LargeBinary *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::LargeUtf8: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::LargeUtf8 *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case Type::LargeList: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::LargeList *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ default: return true;
+ }
+}
+
+inline bool VerifyTypeVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types) {
+ if (!values || !types) return !values && !types;
+ if (values->size() != types->size()) return false;
+ for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
+ if (!VerifyType(
+ verifier, values->Get(i), types->GetEnum<Type>(i))) {
+ return false;
+ }
+ }
+ return true;
+}
+
+inline const org::apache::arrow::flatbuf::Schema *GetSchema(const void *buf) {
+ return flatbuffers::GetRoot<org::apache::arrow::flatbuf::Schema>(buf);
+}
+
+inline const org::apache::arrow::flatbuf::Schema *GetSizePrefixedSchema(const void *buf) {
+ return flatbuffers::GetSizePrefixedRoot<org::apache::arrow::flatbuf::Schema>(buf);
+}
+
+inline bool VerifySchemaBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifyBuffer<org::apache::arrow::flatbuf::Schema>(nullptr);
+}
+
+inline bool VerifySizePrefixedSchemaBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifySizePrefixedBuffer<org::apache::arrow::flatbuf::Schema>(nullptr);
+}
+
+inline void FinishSchemaBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Schema> root) {
+ fbb.Finish(root);
+}
+
+inline void FinishSizePrefixedSchemaBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Schema> root) {
+ fbb.FinishSizePrefixed(root);
+}
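+
+// Editor's note: a hedged end-to-end sketch using only helpers defined
+// above: assemble a one-column schema, finish the buffer, verify it, and
+// read it back. The field name "x" and the int64 type are illustrative
+// choices, not prescribed by the schema.
+inline bool ExampleBuildAndVerifySchema() {
+  flatbuffers::FlatBufferBuilder fbb;
+  auto int64_type = CreateInt(fbb, /*bitWidth=*/64, /*is_signed=*/true);
+  std::vector<flatbuffers::Offset<Field>> fields;
+  fields.push_back(CreateFieldDirect(fbb, "x", /*nullable=*/true, Type::Int,
+                                     int64_type.Union()));
+  FinishSchemaBuffer(fbb,
+                     CreateSchemaDirect(fbb, Endianness::Little, &fields));
+  flatbuffers::Verifier verifier(fbb.GetBufferPointer(), fbb.GetSize());
+  if (!VerifySchemaBuffer(verifier)) return false;
+  return GetSchema(fbb.GetBufferPointer())->fields()->size() == 1;
+}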
+
+} // namespace flatbuf
+} // namespace arrow
+} // namespace apache
+} // namespace org
+
+#endif // FLATBUFFERS_GENERATED_SCHEMA_ORG_APACHE_ARROW_FLATBUF_H_
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/SparseTensor_generated.h b/contrib/libs/apache/arrow/cpp/src/generated/SparseTensor_generated.h
new file mode 100644
index 00000000000..ec4d414d4fe
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/generated/SparseTensor_generated.h
@@ -0,0 +1,913 @@
+// automatically generated by the FlatBuffers compiler, do not modify
+
+
+#ifndef FLATBUFFERS_GENERATED_SPARSETENSOR_ORG_APACHE_ARROW_FLATBUF_H_
+#define FLATBUFFERS_GENERATED_SPARSETENSOR_ORG_APACHE_ARROW_FLATBUF_H_
+
+#include "flatbuffers/flatbuffers.h"
+
+#include "Schema_generated.h"
+#include "Tensor_generated.h"
+
+namespace org {
+namespace apache {
+namespace arrow {
+namespace flatbuf {
+
+struct SparseTensorIndexCOO;
+struct SparseTensorIndexCOOBuilder;
+
+struct SparseMatrixIndexCSX;
+struct SparseMatrixIndexCSXBuilder;
+
+struct SparseTensorIndexCSF;
+struct SparseTensorIndexCSFBuilder;
+
+struct SparseTensor;
+struct SparseTensorBuilder;
+
+enum class SparseMatrixCompressedAxis : int16_t {
+ Row = 0,
+ Column = 1,
+ MIN = Row,
+ MAX = Column
+};
+
+inline const SparseMatrixCompressedAxis (&EnumValuesSparseMatrixCompressedAxis())[2] {
+ static const SparseMatrixCompressedAxis values[] = {
+ SparseMatrixCompressedAxis::Row,
+ SparseMatrixCompressedAxis::Column
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesSparseMatrixCompressedAxis() {
+ static const char * const names[3] = {
+ "Row",
+ "Column",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameSparseMatrixCompressedAxis(SparseMatrixCompressedAxis e) {
+ if (flatbuffers::IsOutRange(e, SparseMatrixCompressedAxis::Row, SparseMatrixCompressedAxis::Column)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesSparseMatrixCompressedAxis()[index];
+}
+
+enum class SparseTensorIndex : uint8_t {
+ NONE = 0,
+ SparseTensorIndexCOO = 1,
+ SparseMatrixIndexCSX = 2,
+ SparseTensorIndexCSF = 3,
+ MIN = NONE,
+ MAX = SparseTensorIndexCSF
+};
+
+inline const SparseTensorIndex (&EnumValuesSparseTensorIndex())[4] {
+ static const SparseTensorIndex values[] = {
+ SparseTensorIndex::NONE,
+ SparseTensorIndex::SparseTensorIndexCOO,
+ SparseTensorIndex::SparseMatrixIndexCSX,
+ SparseTensorIndex::SparseTensorIndexCSF
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesSparseTensorIndex() {
+ static const char * const names[5] = {
+ "NONE",
+ "SparseTensorIndexCOO",
+ "SparseMatrixIndexCSX",
+ "SparseTensorIndexCSF",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameSparseTensorIndex(SparseTensorIndex e) {
+ if (flatbuffers::IsOutRange(e, SparseTensorIndex::NONE, SparseTensorIndex::SparseTensorIndexCSF)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesSparseTensorIndex()[index];
+}
+
+template<typename T> struct SparseTensorIndexTraits {
+ static const SparseTensorIndex enum_value = SparseTensorIndex::NONE;
+};
+
+template<> struct SparseTensorIndexTraits<org::apache::arrow::flatbuf::SparseTensorIndexCOO> {
+ static const SparseTensorIndex enum_value = SparseTensorIndex::SparseTensorIndexCOO;
+};
+
+template<> struct SparseTensorIndexTraits<org::apache::arrow::flatbuf::SparseMatrixIndexCSX> {
+ static const SparseTensorIndex enum_value = SparseTensorIndex::SparseMatrixIndexCSX;
+};
+
+template<> struct SparseTensorIndexTraits<org::apache::arrow::flatbuf::SparseTensorIndexCSF> {
+ static const SparseTensorIndex enum_value = SparseTensorIndex::SparseTensorIndexCSF;
+};
+
+bool VerifySparseTensorIndex(flatbuffers::Verifier &verifier, const void *obj, SparseTensorIndex type);
+bool VerifySparseTensorIndexVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types);
+
+/// ----------------------------------------------------------------------
+/// EXPERIMENTAL: Data structures for sparse tensors
+/// Coordinate (COO) format of sparse tensor index.
+///
+/// COO's index list is represented as an NxM matrix,
+/// where N is the number of non-zero values,
+/// and M is the number of dimensions of the sparse tensor.
+///
+/// indicesBuffer stores the location and size of the data of this indices
+/// matrix. The value type and the stride of the indices matrix are
+/// specified in the indicesType and indicesStrides fields.
+///
+/// For example, let X be a 2x3x4x5 tensor with the following
+/// 6 non-zero values:
+///
+/// X[0, 1, 2, 0] := 1
+/// X[1, 1, 2, 3] := 2
+/// X[0, 2, 1, 0] := 3
+/// X[0, 1, 3, 0] := 4
+/// X[0, 1, 2, 1] := 5
+/// X[1, 2, 0, 4] := 6
+///
+/// In COO format, the index matrix of X is the following 4x6 matrix:
+///
+/// [[0, 0, 0, 0, 1, 1],
+/// [1, 1, 1, 2, 1, 2],
+/// [2, 2, 3, 1, 2, 0],
+/// [0, 1, 0, 0, 3, 4]]
+///
+/// Note that the indices are sorted in lexicographical order.
+struct SparseTensorIndexCOO FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef SparseTensorIndexCOOBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_INDICESTYPE = 4,
+ VT_INDICESSTRIDES = 6,
+ VT_INDICESBUFFER = 8,
+ VT_ISCANONICAL = 10
+ };
+ /// The type of values in indicesBuffer
+ const org::apache::arrow::flatbuf::Int *indicesType() const {
+ return GetPointer<const org::apache::arrow::flatbuf::Int *>(VT_INDICESTYPE);
+ }
+ /// Non-negative byte offsets to advance one value cell along each dimension
+ /// If omitted, default to row-major order (C-like).
+ const flatbuffers::Vector<int64_t> *indicesStrides() const {
+ return GetPointer<const flatbuffers::Vector<int64_t> *>(VT_INDICESSTRIDES);
+ }
+ /// The location and size of the indices matrix's data
+ const org::apache::arrow::flatbuf::Buffer *indicesBuffer() const {
+ return GetStruct<const org::apache::arrow::flatbuf::Buffer *>(VT_INDICESBUFFER);
+ }
+  /// The canonicality flag: true if the indices are sorted in row-major
+  /// (lexicographical) order with no duplicate entries
+ bool isCanonical() const {
+ return GetField<uint8_t>(VT_ISCANONICAL, 0) != 0;
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyOffsetRequired(verifier, VT_INDICESTYPE) &&
+ verifier.VerifyTable(indicesType()) &&
+ VerifyOffset(verifier, VT_INDICESSTRIDES) &&
+ verifier.VerifyVector(indicesStrides()) &&
+ VerifyFieldRequired<org::apache::arrow::flatbuf::Buffer>(verifier, VT_INDICESBUFFER) &&
+ VerifyField<uint8_t>(verifier, VT_ISCANONICAL) &&
+ verifier.EndTable();
+ }
+};
+
+struct SparseTensorIndexCOOBuilder {
+ typedef SparseTensorIndexCOO Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_indicesType(flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indicesType) {
+ fbb_.AddOffset(SparseTensorIndexCOO::VT_INDICESTYPE, indicesType);
+ }
+ void add_indicesStrides(flatbuffers::Offset<flatbuffers::Vector<int64_t>> indicesStrides) {
+ fbb_.AddOffset(SparseTensorIndexCOO::VT_INDICESSTRIDES, indicesStrides);
+ }
+ void add_indicesBuffer(const org::apache::arrow::flatbuf::Buffer *indicesBuffer) {
+ fbb_.AddStruct(SparseTensorIndexCOO::VT_INDICESBUFFER, indicesBuffer);
+ }
+ void add_isCanonical(bool isCanonical) {
+ fbb_.AddElement<uint8_t>(SparseTensorIndexCOO::VT_ISCANONICAL, static_cast<uint8_t>(isCanonical), 0);
+ }
+ explicit SparseTensorIndexCOOBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ SparseTensorIndexCOOBuilder &operator=(const SparseTensorIndexCOOBuilder &);
+ flatbuffers::Offset<SparseTensorIndexCOO> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<SparseTensorIndexCOO>(end);
+ fbb_.Required(o, SparseTensorIndexCOO::VT_INDICESTYPE);
+ fbb_.Required(o, SparseTensorIndexCOO::VT_INDICESBUFFER);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<SparseTensorIndexCOO> CreateSparseTensorIndexCOO(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indicesType = 0,
+ flatbuffers::Offset<flatbuffers::Vector<int64_t>> indicesStrides = 0,
+ const org::apache::arrow::flatbuf::Buffer *indicesBuffer = 0,
+ bool isCanonical = false) {
+ SparseTensorIndexCOOBuilder builder_(_fbb);
+ builder_.add_indicesBuffer(indicesBuffer);
+ builder_.add_indicesStrides(indicesStrides);
+ builder_.add_indicesType(indicesType);
+ builder_.add_isCanonical(isCanonical);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<SparseTensorIndexCOO> CreateSparseTensorIndexCOODirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indicesType = 0,
+ const std::vector<int64_t> *indicesStrides = nullptr,
+ const org::apache::arrow::flatbuf::Buffer *indicesBuffer = 0,
+ bool isCanonical = false) {
+ auto indicesStrides__ = indicesStrides ? _fbb.CreateVector<int64_t>(*indicesStrides) : 0;
+ return org::apache::arrow::flatbuf::CreateSparseTensorIndexCOO(
+ _fbb,
+ indicesType,
+ indicesStrides__,
+ indicesBuffer,
+ isCanonical);
+}
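+
+// Editor's note: a hedged usage sketch for the COO example in the comments
+// above (6 non-zero values, 4 dimensions, int64 indices). The Buffer
+// location (offset 0, length 6 * 4 * sizeof(int64_t)) is a placeholder for
+// wherever the indices matrix actually lives in the enclosing message body.
+inline flatbuffers::Offset<SparseTensorIndexCOO> ExampleCooIndex(
+    flatbuffers::FlatBufferBuilder &fbb) {
+  auto index_type = CreateInt(fbb, /*bitWidth=*/64, /*is_signed=*/true);
+  Buffer indices_buffer(/*offset=*/0, /*length=*/6 * 4 * sizeof(int64_t));
+  return CreateSparseTensorIndexCOO(fbb, index_type, /*indicesStrides=*/0,
+                                    &indices_buffer, /*isCanonical=*/true);
+}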
+
+/// Compressed Sparse format, which is matrix-specific.
+struct SparseMatrixIndexCSX FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef SparseMatrixIndexCSXBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_COMPRESSEDAXIS = 4,
+ VT_INDPTRTYPE = 6,
+ VT_INDPTRBUFFER = 8,
+ VT_INDICESTYPE = 10,
+ VT_INDICESBUFFER = 12
+ };
+ /// Which axis, row or column, is compressed
+ org::apache::arrow::flatbuf::SparseMatrixCompressedAxis compressedAxis() const {
+ return static_cast<org::apache::arrow::flatbuf::SparseMatrixCompressedAxis>(GetField<int16_t>(VT_COMPRESSEDAXIS, 0));
+ }
+ /// The type of values in indptrBuffer
+ const org::apache::arrow::flatbuf::Int *indptrType() const {
+ return GetPointer<const org::apache::arrow::flatbuf::Int *>(VT_INDPTRTYPE);
+ }
+  /// indptrBuffer stores the location and size of the indptr array, which
+  /// represents the ranges of the rows.
+  /// The i-th row spans from indptr[i] to indptr[i+1] in the data.
+  /// The length of this array is 1 + (the number of rows), and the type
+  /// of the index values is long.
+ ///
+ /// For example, let X be the following 6x4 matrix:
+ ///
+ /// X := [[0, 1, 2, 0],
+ /// [0, 0, 3, 0],
+ /// [0, 4, 0, 5],
+ /// [0, 0, 0, 0],
+ /// [6, 0, 7, 8],
+ /// [0, 9, 0, 0]].
+ ///
+ /// The array of non-zero values in X is:
+ ///
+ /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9].
+ ///
+ /// And the indptr of X is:
+ ///
+  ///   indptr(X) = [0, 2, 3, 5, 5, 8, 9].
+ const org::apache::arrow::flatbuf::Buffer *indptrBuffer() const {
+ return GetStruct<const org::apache::arrow::flatbuf::Buffer *>(VT_INDPTRBUFFER);
+ }
+ /// The type of values in indicesBuffer
+ const org::apache::arrow::flatbuf::Int *indicesType() const {
+ return GetPointer<const org::apache::arrow::flatbuf::Int *>(VT_INDICESTYPE);
+ }
+ /// indicesBuffer stores the location and size of the array that
+ /// contains the column indices of the corresponding non-zero values.
+ /// The type of index value is long.
+ ///
+ /// For example, the indices of the above X is:
+ ///
+ /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1].
+ ///
+ /// Note that the indices are sorted in lexicographical order for each row.
+ const org::apache::arrow::flatbuf::Buffer *indicesBuffer() const {
+ return GetStruct<const org::apache::arrow::flatbuf::Buffer *>(VT_INDICESBUFFER);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int16_t>(verifier, VT_COMPRESSEDAXIS) &&
+ VerifyOffsetRequired(verifier, VT_INDPTRTYPE) &&
+ verifier.VerifyTable(indptrType()) &&
+ VerifyFieldRequired<org::apache::arrow::flatbuf::Buffer>(verifier, VT_INDPTRBUFFER) &&
+ VerifyOffsetRequired(verifier, VT_INDICESTYPE) &&
+ verifier.VerifyTable(indicesType()) &&
+ VerifyFieldRequired<org::apache::arrow::flatbuf::Buffer>(verifier, VT_INDICESBUFFER) &&
+ verifier.EndTable();
+ }
+};
+
+struct SparseMatrixIndexCSXBuilder {
+ typedef SparseMatrixIndexCSX Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_compressedAxis(org::apache::arrow::flatbuf::SparseMatrixCompressedAxis compressedAxis) {
+ fbb_.AddElement<int16_t>(SparseMatrixIndexCSX::VT_COMPRESSEDAXIS, static_cast<int16_t>(compressedAxis), 0);
+ }
+ void add_indptrType(flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indptrType) {
+ fbb_.AddOffset(SparseMatrixIndexCSX::VT_INDPTRTYPE, indptrType);
+ }
+ void add_indptrBuffer(const org::apache::arrow::flatbuf::Buffer *indptrBuffer) {
+ fbb_.AddStruct(SparseMatrixIndexCSX::VT_INDPTRBUFFER, indptrBuffer);
+ }
+ void add_indicesType(flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indicesType) {
+ fbb_.AddOffset(SparseMatrixIndexCSX::VT_INDICESTYPE, indicesType);
+ }
+ void add_indicesBuffer(const org::apache::arrow::flatbuf::Buffer *indicesBuffer) {
+ fbb_.AddStruct(SparseMatrixIndexCSX::VT_INDICESBUFFER, indicesBuffer);
+ }
+ explicit SparseMatrixIndexCSXBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ SparseMatrixIndexCSXBuilder &operator=(const SparseMatrixIndexCSXBuilder &);
+ flatbuffers::Offset<SparseMatrixIndexCSX> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<SparseMatrixIndexCSX>(end);
+ fbb_.Required(o, SparseMatrixIndexCSX::VT_INDPTRTYPE);
+ fbb_.Required(o, SparseMatrixIndexCSX::VT_INDPTRBUFFER);
+ fbb_.Required(o, SparseMatrixIndexCSX::VT_INDICESTYPE);
+ fbb_.Required(o, SparseMatrixIndexCSX::VT_INDICESBUFFER);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<SparseMatrixIndexCSX> CreateSparseMatrixIndexCSX(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::SparseMatrixCompressedAxis compressedAxis = org::apache::arrow::flatbuf::SparseMatrixCompressedAxis::Row,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indptrType = 0,
+ const org::apache::arrow::flatbuf::Buffer *indptrBuffer = 0,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indicesType = 0,
+ const org::apache::arrow::flatbuf::Buffer *indicesBuffer = 0) {
+ SparseMatrixIndexCSXBuilder builder_(_fbb);
+ builder_.add_indicesBuffer(indicesBuffer);
+ builder_.add_indicesType(indicesType);
+ builder_.add_indptrBuffer(indptrBuffer);
+ builder_.add_indptrType(indptrType);
+ builder_.add_compressedAxis(compressedAxis);
+ return builder_.Finish();
+}
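+
+// Editor's note: a hedged sketch of a CSR-style (row-compressed) index for
+// the 6x4 matrix X documented above: 7 indptr entries and 9 column indices,
+// both int64. Buffer offsets and lengths are placeholders.
+inline flatbuffers::Offset<SparseMatrixIndexCSX> ExampleCsrIndex(
+    flatbuffers::FlatBufferBuilder &fbb) {
+  auto long_type = CreateInt(fbb, /*bitWidth=*/64, /*is_signed=*/true);
+  Buffer indptr_buffer(/*offset=*/0, /*length=*/7 * sizeof(int64_t));
+  Buffer indices_buffer(/*offset=*/64, /*length=*/9 * sizeof(int64_t));
+  return CreateSparseMatrixIndexCSX(fbb, SparseMatrixCompressedAxis::Row,
+                                    long_type, &indptr_buffer, long_type,
+                                    &indices_buffer);
+}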
+
+/// Compressed Sparse Fiber (CSF) sparse tensor index.
+struct SparseTensorIndexCSF FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef SparseTensorIndexCSFBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_INDPTRTYPE = 4,
+ VT_INDPTRBUFFERS = 6,
+ VT_INDICESTYPE = 8,
+ VT_INDICESBUFFERS = 10,
+ VT_AXISORDER = 12
+ };
+  /// CSF is a generalization of the compressed sparse row (CSR) index.
+  /// See [smith2017knl]: http://shaden.io/pub-files/smith2017knl.pdf
+  ///
+  /// A CSF index recursively compresses each dimension of a tensor into a
+  /// set of prefix trees. Each path from a root to a leaf forms one tensor
+  /// non-zero index. CSF is implemented with two arrays of buffers and one
+  /// array of integers.
+ ///
+ /// For example, let X be a 2x3x4x5 tensor and let it have the following
+ /// 8 non-zero values:
+ ///
+ /// X[0, 0, 0, 1] := 1
+ /// X[0, 0, 0, 2] := 2
+ /// X[0, 1, 0, 0] := 3
+ /// X[0, 1, 0, 2] := 4
+ /// X[0, 1, 1, 0] := 5
+ /// X[1, 1, 1, 0] := 6
+ /// X[1, 1, 1, 1] := 7
+ /// X[1, 1, 1, 2] := 8
+ ///
+ /// As a prefix tree this would be represented as:
+ ///
+ /// 0 1
+ /// / \ |
+ /// 0 1 1
+ /// / / \ |
+ /// 0 0 1 1
+ /// /| /| | /| |
+ /// 1 2 0 2 0 0 1 2
+ /// The type of values in indptrBuffers
+ const org::apache::arrow::flatbuf::Int *indptrType() const {
+ return GetPointer<const org::apache::arrow::flatbuf::Int *>(VT_INDPTRTYPE);
+ }
+  /// indptrBuffers stores the sparsity structure.
+  /// Each two consecutive dimensions in a tensor correspond to a buffer in
+  /// indptrBuffers. A pair of consecutive values at indptrBuffers[dim][i]
+  /// and indptrBuffers[dim][i + 1] signifies a range of nodes in
+  /// indicesBuffers[dim + 1] that are children of node indicesBuffers[dim][i].
+ ///
+ /// For example, the indptrBuffers for the above X is:
+ ///
+ /// indptrBuffer(X) = [
+ /// [0, 2, 3],
+ /// [0, 1, 3, 4],
+ /// [0, 2, 4, 5, 8]
+ /// ].
+ ///
+ const flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *> *indptrBuffers() const {
+ return GetPointer<const flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *> *>(VT_INDPTRBUFFERS);
+ }
+ /// The type of values in indicesBuffers
+ const org::apache::arrow::flatbuf::Int *indicesType() const {
+ return GetPointer<const org::apache::arrow::flatbuf::Int *>(VT_INDICESTYPE);
+ }
+  /// indicesBuffers stores the values of the nodes.
+ /// Each tensor dimension corresponds to a buffer in indicesBuffers.
+ /// For example, the indicesBuffers for the above X is:
+ ///
+ /// indicesBuffer(X) = [
+ /// [0, 1],
+ /// [0, 1, 1],
+ /// [0, 0, 1, 1],
+ /// [1, 2, 0, 2, 0, 0, 1, 2]
+ /// ].
+ ///
+ const flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *> *indicesBuffers() const {
+ return GetPointer<const flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *> *>(VT_INDICESBUFFERS);
+ }
+ /// axisOrder stores the sequence in which dimensions were traversed to
+ /// produce the prefix tree.
+ /// For example, the axisOrder for the above X is:
+ ///
+ /// axisOrder(X) = [0, 1, 2, 3].
+ ///
+ const flatbuffers::Vector<int32_t> *axisOrder() const {
+ return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_AXISORDER);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyOffsetRequired(verifier, VT_INDPTRTYPE) &&
+ verifier.VerifyTable(indptrType()) &&
+ VerifyOffsetRequired(verifier, VT_INDPTRBUFFERS) &&
+ verifier.VerifyVector(indptrBuffers()) &&
+ VerifyOffsetRequired(verifier, VT_INDICESTYPE) &&
+ verifier.VerifyTable(indicesType()) &&
+ VerifyOffsetRequired(verifier, VT_INDICESBUFFERS) &&
+ verifier.VerifyVector(indicesBuffers()) &&
+ VerifyOffsetRequired(verifier, VT_AXISORDER) &&
+ verifier.VerifyVector(axisOrder()) &&
+ verifier.EndTable();
+ }
+};
+
+struct SparseTensorIndexCSFBuilder {
+ typedef SparseTensorIndexCSF Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_indptrType(flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indptrType) {
+ fbb_.AddOffset(SparseTensorIndexCSF::VT_INDPTRTYPE, indptrType);
+ }
+ void add_indptrBuffers(flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *>> indptrBuffers) {
+ fbb_.AddOffset(SparseTensorIndexCSF::VT_INDPTRBUFFERS, indptrBuffers);
+ }
+ void add_indicesType(flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indicesType) {
+ fbb_.AddOffset(SparseTensorIndexCSF::VT_INDICESTYPE, indicesType);
+ }
+ void add_indicesBuffers(flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *>> indicesBuffers) {
+ fbb_.AddOffset(SparseTensorIndexCSF::VT_INDICESBUFFERS, indicesBuffers);
+ }
+ void add_axisOrder(flatbuffers::Offset<flatbuffers::Vector<int32_t>> axisOrder) {
+ fbb_.AddOffset(SparseTensorIndexCSF::VT_AXISORDER, axisOrder);
+ }
+ explicit SparseTensorIndexCSFBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ SparseTensorIndexCSFBuilder &operator=(const SparseTensorIndexCSFBuilder &);
+ flatbuffers::Offset<SparseTensorIndexCSF> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<SparseTensorIndexCSF>(end);
+ fbb_.Required(o, SparseTensorIndexCSF::VT_INDPTRTYPE);
+ fbb_.Required(o, SparseTensorIndexCSF::VT_INDPTRBUFFERS);
+ fbb_.Required(o, SparseTensorIndexCSF::VT_INDICESTYPE);
+ fbb_.Required(o, SparseTensorIndexCSF::VT_INDICESBUFFERS);
+ fbb_.Required(o, SparseTensorIndexCSF::VT_AXISORDER);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<SparseTensorIndexCSF> CreateSparseTensorIndexCSF(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indptrType = 0,
+ flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *>> indptrBuffers = 0,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indicesType = 0,
+ flatbuffers::Offset<flatbuffers::Vector<const org::apache::arrow::flatbuf::Buffer *>> indicesBuffers = 0,
+ flatbuffers::Offset<flatbuffers::Vector<int32_t>> axisOrder = 0) {
+ SparseTensorIndexCSFBuilder builder_(_fbb);
+ builder_.add_axisOrder(axisOrder);
+ builder_.add_indicesBuffers(indicesBuffers);
+ builder_.add_indicesType(indicesType);
+ builder_.add_indptrBuffers(indptrBuffers);
+ builder_.add_indptrType(indptrType);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<SparseTensorIndexCSF> CreateSparseTensorIndexCSFDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indptrType = 0,
+ const std::vector<org::apache::arrow::flatbuf::Buffer> *indptrBuffers = nullptr,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Int> indicesType = 0,
+ const std::vector<org::apache::arrow::flatbuf::Buffer> *indicesBuffers = nullptr,
+ const std::vector<int32_t> *axisOrder = nullptr) {
+ auto indptrBuffers__ = indptrBuffers ? _fbb.CreateVectorOfStructs<org::apache::arrow::flatbuf::Buffer>(*indptrBuffers) : 0;
+ auto indicesBuffers__ = indicesBuffers ? _fbb.CreateVectorOfStructs<org::apache::arrow::flatbuf::Buffer>(*indicesBuffers) : 0;
+ auto axisOrder__ = axisOrder ? _fbb.CreateVector<int32_t>(*axisOrder) : 0;
+ return org::apache::arrow::flatbuf::CreateSparseTensorIndexCSF(
+ _fbb,
+ indptrType,
+ indptrBuffers__,
+ indicesType,
+ indicesBuffers__,
+ axisOrder__);
+}
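+
+// Editor's note: a hedged sketch of the CSF index for the 2x3x4x5 tensor X
+// documented above: three indptr buffers, four indices buffers, and the
+// natural axis order. All buffer offsets and lengths are placeholders.
+inline flatbuffers::Offset<SparseTensorIndexCSF> ExampleCsfIndex(
+    flatbuffers::FlatBufferBuilder &fbb) {
+  auto long_type = CreateInt(fbb, /*bitWidth=*/64, /*is_signed=*/true);
+  std::vector<Buffer> indptr_buffers = {
+      Buffer(0, 24), Buffer(24, 32), Buffer(56, 40)};
+  std::vector<Buffer> indices_buffers = {
+      Buffer(96, 16), Buffer(112, 24), Buffer(136, 32), Buffer(168, 64)};
+  std::vector<int32_t> axis_order = {0, 1, 2, 3};
+  return CreateSparseTensorIndexCSFDirect(fbb, long_type, &indptr_buffers,
+                                          long_type, &indices_buffers,
+                                          &axis_order);
+}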
+
+struct SparseTensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef SparseTensorBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_TYPE_TYPE = 4,
+ VT_TYPE = 6,
+ VT_SHAPE = 8,
+ VT_NON_ZERO_LENGTH = 10,
+ VT_SPARSEINDEX_TYPE = 12,
+ VT_SPARSEINDEX = 14,
+ VT_DATA = 16
+ };
+ org::apache::arrow::flatbuf::Type type_type() const {
+ return static_cast<org::apache::arrow::flatbuf::Type>(GetField<uint8_t>(VT_TYPE_TYPE, 0));
+ }
+ /// The type of data contained in a value cell.
+ /// Currently only fixed-width value types are supported,
+ /// no strings or nested types.
+ const void *type() const {
+ return GetPointer<const void *>(VT_TYPE);
+ }
+ template<typename T> const T *type_as() const;
+ const org::apache::arrow::flatbuf::Null *type_as_Null() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Null ? static_cast<const org::apache::arrow::flatbuf::Null *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Int *type_as_Int() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Int ? static_cast<const org::apache::arrow::flatbuf::Int *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::FloatingPoint *type_as_FloatingPoint() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::FloatingPoint ? static_cast<const org::apache::arrow::flatbuf::FloatingPoint *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Binary *type_as_Binary() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Binary ? static_cast<const org::apache::arrow::flatbuf::Binary *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Utf8 *type_as_Utf8() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Utf8 ? static_cast<const org::apache::arrow::flatbuf::Utf8 *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Bool *type_as_Bool() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Bool ? static_cast<const org::apache::arrow::flatbuf::Bool *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Decimal *type_as_Decimal() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Decimal ? static_cast<const org::apache::arrow::flatbuf::Decimal *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Date *type_as_Date() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Date ? static_cast<const org::apache::arrow::flatbuf::Date *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Time *type_as_Time() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Time ? static_cast<const org::apache::arrow::flatbuf::Time *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Timestamp *type_as_Timestamp() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Timestamp ? static_cast<const org::apache::arrow::flatbuf::Timestamp *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Interval *type_as_Interval() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Interval ? static_cast<const org::apache::arrow::flatbuf::Interval *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::List *type_as_List() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::List ? static_cast<const org::apache::arrow::flatbuf::List *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Struct_ *type_as_Struct_() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Struct_ ? static_cast<const org::apache::arrow::flatbuf::Struct_ *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Union *type_as_Union() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Union ? static_cast<const org::apache::arrow::flatbuf::Union *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::FixedSizeBinary *type_as_FixedSizeBinary() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::FixedSizeBinary ? static_cast<const org::apache::arrow::flatbuf::FixedSizeBinary *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::FixedSizeList *type_as_FixedSizeList() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::FixedSizeList ? static_cast<const org::apache::arrow::flatbuf::FixedSizeList *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Map *type_as_Map() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Map ? static_cast<const org::apache::arrow::flatbuf::Map *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Duration *type_as_Duration() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Duration ? static_cast<const org::apache::arrow::flatbuf::Duration *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::LargeBinary *type_as_LargeBinary() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::LargeBinary ? static_cast<const org::apache::arrow::flatbuf::LargeBinary *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::LargeUtf8 *type_as_LargeUtf8() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::LargeUtf8 ? static_cast<const org::apache::arrow::flatbuf::LargeUtf8 *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::LargeList *type_as_LargeList() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::LargeList ? static_cast<const org::apache::arrow::flatbuf::LargeList *>(type()) : nullptr;
+ }
+ /// The dimensions of the tensor, optionally named.
+ const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>> *shape() const {
+ return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>> *>(VT_SHAPE);
+ }
+ /// The number of non-zero values in a sparse tensor.
+ int64_t non_zero_length() const {
+ return GetField<int64_t>(VT_NON_ZERO_LENGTH, 0);
+ }
+ org::apache::arrow::flatbuf::SparseTensorIndex sparseIndex_type() const {
+ return static_cast<org::apache::arrow::flatbuf::SparseTensorIndex>(GetField<uint8_t>(VT_SPARSEINDEX_TYPE, 0));
+ }
+ /// Sparse tensor index
+ const void *sparseIndex() const {
+ return GetPointer<const void *>(VT_SPARSEINDEX);
+ }
+ template<typename T> const T *sparseIndex_as() const;
+ const org::apache::arrow::flatbuf::SparseTensorIndexCOO *sparseIndex_as_SparseTensorIndexCOO() const {
+ return sparseIndex_type() == org::apache::arrow::flatbuf::SparseTensorIndex::SparseTensorIndexCOO ? static_cast<const org::apache::arrow::flatbuf::SparseTensorIndexCOO *>(sparseIndex()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::SparseMatrixIndexCSX *sparseIndex_as_SparseMatrixIndexCSX() const {
+ return sparseIndex_type() == org::apache::arrow::flatbuf::SparseTensorIndex::SparseMatrixIndexCSX ? static_cast<const org::apache::arrow::flatbuf::SparseMatrixIndexCSX *>(sparseIndex()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::SparseTensorIndexCSF *sparseIndex_as_SparseTensorIndexCSF() const {
+ return sparseIndex_type() == org::apache::arrow::flatbuf::SparseTensorIndex::SparseTensorIndexCSF ? static_cast<const org::apache::arrow::flatbuf::SparseTensorIndexCSF *>(sparseIndex()) : nullptr;
+ }
+ /// The location and size of the tensor's data
+ const org::apache::arrow::flatbuf::Buffer *data() const {
+ return GetStruct<const org::apache::arrow::flatbuf::Buffer *>(VT_DATA);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<uint8_t>(verifier, VT_TYPE_TYPE) &&
+ VerifyOffsetRequired(verifier, VT_TYPE) &&
+ VerifyType(verifier, type(), type_type()) &&
+ VerifyOffsetRequired(verifier, VT_SHAPE) &&
+ verifier.VerifyVector(shape()) &&
+ verifier.VerifyVectorOfTables(shape()) &&
+ VerifyField<int64_t>(verifier, VT_NON_ZERO_LENGTH) &&
+ VerifyField<uint8_t>(verifier, VT_SPARSEINDEX_TYPE) &&
+ VerifyOffsetRequired(verifier, VT_SPARSEINDEX) &&
+ VerifySparseTensorIndex(verifier, sparseIndex(), sparseIndex_type()) &&
+ VerifyFieldRequired<org::apache::arrow::flatbuf::Buffer>(verifier, VT_DATA) &&
+ verifier.EndTable();
+ }
+};
+
+template<> inline const org::apache::arrow::flatbuf::Null *SparseTensor::type_as<org::apache::arrow::flatbuf::Null>() const {
+ return type_as_Null();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Int *SparseTensor::type_as<org::apache::arrow::flatbuf::Int>() const {
+ return type_as_Int();
+}
+
+template<> inline const org::apache::arrow::flatbuf::FloatingPoint *SparseTensor::type_as<org::apache::arrow::flatbuf::FloatingPoint>() const {
+ return type_as_FloatingPoint();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Binary *SparseTensor::type_as<org::apache::arrow::flatbuf::Binary>() const {
+ return type_as_Binary();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Utf8 *SparseTensor::type_as<org::apache::arrow::flatbuf::Utf8>() const {
+ return type_as_Utf8();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Bool *SparseTensor::type_as<org::apache::arrow::flatbuf::Bool>() const {
+ return type_as_Bool();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Decimal *SparseTensor::type_as<org::apache::arrow::flatbuf::Decimal>() const {
+ return type_as_Decimal();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Date *SparseTensor::type_as<org::apache::arrow::flatbuf::Date>() const {
+ return type_as_Date();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Time *SparseTensor::type_as<org::apache::arrow::flatbuf::Time>() const {
+ return type_as_Time();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Timestamp *SparseTensor::type_as<org::apache::arrow::flatbuf::Timestamp>() const {
+ return type_as_Timestamp();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Interval *SparseTensor::type_as<org::apache::arrow::flatbuf::Interval>() const {
+ return type_as_Interval();
+}
+
+template<> inline const org::apache::arrow::flatbuf::List *SparseTensor::type_as<org::apache::arrow::flatbuf::List>() const {
+ return type_as_List();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Struct_ *SparseTensor::type_as<org::apache::arrow::flatbuf::Struct_>() const {
+ return type_as_Struct_();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Union *SparseTensor::type_as<org::apache::arrow::flatbuf::Union>() const {
+ return type_as_Union();
+}
+
+template<> inline const org::apache::arrow::flatbuf::FixedSizeBinary *SparseTensor::type_as<org::apache::arrow::flatbuf::FixedSizeBinary>() const {
+ return type_as_FixedSizeBinary();
+}
+
+template<> inline const org::apache::arrow::flatbuf::FixedSizeList *SparseTensor::type_as<org::apache::arrow::flatbuf::FixedSizeList>() const {
+ return type_as_FixedSizeList();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Map *SparseTensor::type_as<org::apache::arrow::flatbuf::Map>() const {
+ return type_as_Map();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Duration *SparseTensor::type_as<org::apache::arrow::flatbuf::Duration>() const {
+ return type_as_Duration();
+}
+
+template<> inline const org::apache::arrow::flatbuf::LargeBinary *SparseTensor::type_as<org::apache::arrow::flatbuf::LargeBinary>() const {
+ return type_as_LargeBinary();
+}
+
+template<> inline const org::apache::arrow::flatbuf::LargeUtf8 *SparseTensor::type_as<org::apache::arrow::flatbuf::LargeUtf8>() const {
+ return type_as_LargeUtf8();
+}
+
+template<> inline const org::apache::arrow::flatbuf::LargeList *SparseTensor::type_as<org::apache::arrow::flatbuf::LargeList>() const {
+ return type_as_LargeList();
+}
+
+template<> inline const org::apache::arrow::flatbuf::SparseTensorIndexCOO *SparseTensor::sparseIndex_as<org::apache::arrow::flatbuf::SparseTensorIndexCOO>() const {
+ return sparseIndex_as_SparseTensorIndexCOO();
+}
+
+template<> inline const org::apache::arrow::flatbuf::SparseMatrixIndexCSX *SparseTensor::sparseIndex_as<org::apache::arrow::flatbuf::SparseMatrixIndexCSX>() const {
+ return sparseIndex_as_SparseMatrixIndexCSX();
+}
+
+template<> inline const org::apache::arrow::flatbuf::SparseTensorIndexCSF *SparseTensor::sparseIndex_as<org::apache::arrow::flatbuf::SparseTensorIndexCSF>() const {
+ return sparseIndex_as_SparseTensorIndexCSF();
+}
+
+struct SparseTensorBuilder {
+ typedef SparseTensor Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_type_type(org::apache::arrow::flatbuf::Type type_type) {
+ fbb_.AddElement<uint8_t>(SparseTensor::VT_TYPE_TYPE, static_cast<uint8_t>(type_type), 0);
+ }
+ void add_type(flatbuffers::Offset<void> type) {
+ fbb_.AddOffset(SparseTensor::VT_TYPE, type);
+ }
+ void add_shape(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>>> shape) {
+ fbb_.AddOffset(SparseTensor::VT_SHAPE, shape);
+ }
+ void add_non_zero_length(int64_t non_zero_length) {
+ fbb_.AddElement<int64_t>(SparseTensor::VT_NON_ZERO_LENGTH, non_zero_length, 0);
+ }
+ void add_sparseIndex_type(org::apache::arrow::flatbuf::SparseTensorIndex sparseIndex_type) {
+ fbb_.AddElement<uint8_t>(SparseTensor::VT_SPARSEINDEX_TYPE, static_cast<uint8_t>(sparseIndex_type), 0);
+ }
+ void add_sparseIndex(flatbuffers::Offset<void> sparseIndex) {
+ fbb_.AddOffset(SparseTensor::VT_SPARSEINDEX, sparseIndex);
+ }
+ void add_data(const org::apache::arrow::flatbuf::Buffer *data) {
+ fbb_.AddStruct(SparseTensor::VT_DATA, data);
+ }
+ explicit SparseTensorBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ SparseTensorBuilder &operator=(const SparseTensorBuilder &);
+ flatbuffers::Offset<SparseTensor> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<SparseTensor>(end);
+ fbb_.Required(o, SparseTensor::VT_TYPE);
+ fbb_.Required(o, SparseTensor::VT_SHAPE);
+ fbb_.Required(o, SparseTensor::VT_SPARSEINDEX);
+ fbb_.Required(o, SparseTensor::VT_DATA);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<SparseTensor> CreateSparseTensor(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::Type type_type = org::apache::arrow::flatbuf::Type::NONE,
+ flatbuffers::Offset<void> type = 0,
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>>> shape = 0,
+ int64_t non_zero_length = 0,
+ org::apache::arrow::flatbuf::SparseTensorIndex sparseIndex_type = org::apache::arrow::flatbuf::SparseTensorIndex::NONE,
+ flatbuffers::Offset<void> sparseIndex = 0,
+ const org::apache::arrow::flatbuf::Buffer *data = 0) {
+ SparseTensorBuilder builder_(_fbb);
+ builder_.add_non_zero_length(non_zero_length);
+ builder_.add_data(data);
+ builder_.add_sparseIndex(sparseIndex);
+ builder_.add_shape(shape);
+ builder_.add_type(type);
+ builder_.add_sparseIndex_type(sparseIndex_type);
+ builder_.add_type_type(type_type);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<SparseTensor> CreateSparseTensorDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::Type type_type = org::apache::arrow::flatbuf::Type::NONE,
+ flatbuffers::Offset<void> type = 0,
+ const std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>> *shape = nullptr,
+ int64_t non_zero_length = 0,
+ org::apache::arrow::flatbuf::SparseTensorIndex sparseIndex_type = org::apache::arrow::flatbuf::SparseTensorIndex::NONE,
+ flatbuffers::Offset<void> sparseIndex = 0,
+ const org::apache::arrow::flatbuf::Buffer *data = 0) {
+ auto shape__ = shape ? _fbb.CreateVector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>>(*shape) : 0;
+ return org::apache::arrow::flatbuf::CreateSparseTensor(
+ _fbb,
+ type_type,
+ type,
+ shape__,
+ non_zero_length,
+ sparseIndex_type,
+ sparseIndex,
+ data);
+}
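+
+// Editor's note: a hedged sketch assembling a complete 2x3x4x5 int64
+// SparseTensor around the COO index from ExampleCooIndex above. TensorDim
+// and CreateTensorDim come from the included Tensor_generated.h; the data
+// Buffer location is a placeholder.
+inline flatbuffers::Offset<SparseTensor> ExampleSparseTensor(
+    flatbuffers::FlatBufferBuilder &fbb) {
+  auto value_type = CreateInt(fbb, /*bitWidth=*/64, /*is_signed=*/true);
+  std::vector<flatbuffers::Offset<TensorDim>> shape;
+  for (int64_t dim : {2, 3, 4, 5}) {
+    shape.push_back(CreateTensorDim(fbb, dim));
+  }
+  auto coo_index = ExampleCooIndex(fbb);
+  Buffer data_buffer(/*offset=*/256, /*length=*/6 * sizeof(int64_t));
+  return CreateSparseTensorDirect(
+      fbb, Type::Int, value_type.Union(), &shape, /*non_zero_length=*/6,
+      SparseTensorIndex::SparseTensorIndexCOO, coo_index.Union(),
+      &data_buffer);
+}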
+
+inline bool VerifySparseTensorIndex(flatbuffers::Verifier &verifier, const void *obj, SparseTensorIndex type) {
+ switch (type) {
+ case SparseTensorIndex::NONE: {
+ return true;
+ }
+ case SparseTensorIndex::SparseTensorIndexCOO: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::SparseTensorIndexCOO *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case SparseTensorIndex::SparseMatrixIndexCSX: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::SparseMatrixIndexCSX *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case SparseTensorIndex::SparseTensorIndexCSF: {
+ auto ptr = reinterpret_cast<const org::apache::arrow::flatbuf::SparseTensorIndexCSF *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ default: return true;
+ }
+}
+
+inline bool VerifySparseTensorIndexVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types) {
+ if (!values || !types) return !values && !types;
+ if (values->size() != types->size()) return false;
+ for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
+ if (!VerifySparseTensorIndex(
+ verifier, values->Get(i), types->GetEnum<SparseTensorIndex>(i))) {
+ return false;
+ }
+ }
+ return true;
+}
+
+inline const org::apache::arrow::flatbuf::SparseTensor *GetSparseTensor(const void *buf) {
+ return flatbuffers::GetRoot<org::apache::arrow::flatbuf::SparseTensor>(buf);
+}
+
+inline const org::apache::arrow::flatbuf::SparseTensor *GetSizePrefixedSparseTensor(const void *buf) {
+ return flatbuffers::GetSizePrefixedRoot<org::apache::arrow::flatbuf::SparseTensor>(buf);
+}
+
+inline bool VerifySparseTensorBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifyBuffer<org::apache::arrow::flatbuf::SparseTensor>(nullptr);
+}
+
+inline bool VerifySizePrefixedSparseTensorBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifySizePrefixedBuffer<org::apache::arrow::flatbuf::SparseTensor>(nullptr);
+}
+
+inline void FinishSparseTensorBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::SparseTensor> root) {
+ fbb.Finish(root);
+}
+
+inline void FinishSizePrefixedSparseTensorBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::SparseTensor> root) {
+ fbb.FinishSizePrefixed(root);
+}
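+
+// Editor's note: a hedged sketch tying the helpers above together: finish
+// the example tensor into a standalone buffer, then verify it with the
+// root-buffer verifier.
+inline bool ExampleFinishAndVerifySparseTensor() {
+  flatbuffers::FlatBufferBuilder fbb;
+  FinishSparseTensorBuffer(fbb, ExampleSparseTensor(fbb));
+  flatbuffers::Verifier verifier(fbb.GetBufferPointer(), fbb.GetSize());
+  return VerifySparseTensorBuffer(verifier);
+}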
+
+} // namespace flatbuf
+} // namespace arrow
+} // namespace apache
+} // namespace org
+
+#endif // FLATBUFFERS_GENERATED_SPARSETENSOR_ORG_APACHE_ARROW_FLATBUF_H_
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/Tensor_generated.h b/contrib/libs/apache/arrow/cpp/src/generated/Tensor_generated.h
new file mode 100644
index 00000000000..062a3b91aaa
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/generated/Tensor_generated.h
@@ -0,0 +1,387 @@
+// automatically generated by the FlatBuffers compiler, do not modify
+
+
+#ifndef FLATBUFFERS_GENERATED_TENSOR_ORG_APACHE_ARROW_FLATBUF_H_
+#define FLATBUFFERS_GENERATED_TENSOR_ORG_APACHE_ARROW_FLATBUF_H_
+
+#include "flatbuffers/flatbuffers.h"
+
+#include "Schema_generated.h"
+
+namespace org {
+namespace apache {
+namespace arrow {
+namespace flatbuf {
+
+struct TensorDim;
+struct TensorDimBuilder;
+
+struct Tensor;
+struct TensorBuilder;
+
+/// ----------------------------------------------------------------------
+/// Data structures for dense tensors
+/// Shape data for a single axis in a tensor
+struct TensorDim FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef TensorDimBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_SIZE = 4,
+ VT_NAME = 6
+ };
+ /// Length of dimension
+ int64_t size() const {
+ return GetField<int64_t>(VT_SIZE, 0);
+ }
+ /// Name of the dimension, optional
+ const flatbuffers::String *name() const {
+ return GetPointer<const flatbuffers::String *>(VT_NAME);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int64_t>(verifier, VT_SIZE) &&
+ VerifyOffset(verifier, VT_NAME) &&
+ verifier.VerifyString(name()) &&
+ verifier.EndTable();
+ }
+};
+
+struct TensorDimBuilder {
+ typedef TensorDim Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_size(int64_t size) {
+ fbb_.AddElement<int64_t>(TensorDim::VT_SIZE, size, 0);
+ }
+ void add_name(flatbuffers::Offset<flatbuffers::String> name) {
+ fbb_.AddOffset(TensorDim::VT_NAME, name);
+ }
+ explicit TensorDimBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ TensorDimBuilder &operator=(const TensorDimBuilder &);
+ flatbuffers::Offset<TensorDim> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<TensorDim>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<TensorDim> CreateTensorDim(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int64_t size = 0,
+ flatbuffers::Offset<flatbuffers::String> name = 0) {
+ TensorDimBuilder builder_(_fbb);
+ builder_.add_size(size);
+ builder_.add_name(name);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<TensorDim> CreateTensorDimDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int64_t size = 0,
+ const char *name = nullptr) {
+ auto name__ = name ? _fbb.CreateString(name) : 0;
+ return org::apache::arrow::flatbuf::CreateTensorDim(
+ _fbb,
+ size,
+ name__);
+}
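+
+// Editor's note: a minimal hedged sketch of the Direct helper; the length
+// 10 and the dimension name "batch" are illustrative values.
+inline flatbuffers::Offset<TensorDim> ExampleNamedDim(
+    flatbuffers::FlatBufferBuilder &fbb) {
+  return CreateTensorDimDirect(fbb, /*size=*/10, /*name=*/"batch");
+}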
+
+struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef TensorBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_TYPE_TYPE = 4,
+ VT_TYPE = 6,
+ VT_SHAPE = 8,
+ VT_STRIDES = 10,
+ VT_DATA = 12
+ };
+ org::apache::arrow::flatbuf::Type type_type() const {
+ return static_cast<org::apache::arrow::flatbuf::Type>(GetField<uint8_t>(VT_TYPE_TYPE, 0));
+ }
+ /// The type of data contained in a value cell. Currently only fixed-width
+ /// value types are supported, no strings or nested types
+ const void *type() const {
+ return GetPointer<const void *>(VT_TYPE);
+ }
+ template<typename T> const T *type_as() const;
+ const org::apache::arrow::flatbuf::Null *type_as_Null() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Null ? static_cast<const org::apache::arrow::flatbuf::Null *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Int *type_as_Int() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Int ? static_cast<const org::apache::arrow::flatbuf::Int *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::FloatingPoint *type_as_FloatingPoint() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::FloatingPoint ? static_cast<const org::apache::arrow::flatbuf::FloatingPoint *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Binary *type_as_Binary() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Binary ? static_cast<const org::apache::arrow::flatbuf::Binary *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Utf8 *type_as_Utf8() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Utf8 ? static_cast<const org::apache::arrow::flatbuf::Utf8 *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Bool *type_as_Bool() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Bool ? static_cast<const org::apache::arrow::flatbuf::Bool *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Decimal *type_as_Decimal() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Decimal ? static_cast<const org::apache::arrow::flatbuf::Decimal *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Date *type_as_Date() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Date ? static_cast<const org::apache::arrow::flatbuf::Date *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Time *type_as_Time() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Time ? static_cast<const org::apache::arrow::flatbuf::Time *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Timestamp *type_as_Timestamp() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Timestamp ? static_cast<const org::apache::arrow::flatbuf::Timestamp *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Interval *type_as_Interval() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Interval ? static_cast<const org::apache::arrow::flatbuf::Interval *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::List *type_as_List() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::List ? static_cast<const org::apache::arrow::flatbuf::List *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Struct_ *type_as_Struct_() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Struct_ ? static_cast<const org::apache::arrow::flatbuf::Struct_ *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Union *type_as_Union() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Union ? static_cast<const org::apache::arrow::flatbuf::Union *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::FixedSizeBinary *type_as_FixedSizeBinary() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::FixedSizeBinary ? static_cast<const org::apache::arrow::flatbuf::FixedSizeBinary *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::FixedSizeList *type_as_FixedSizeList() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::FixedSizeList ? static_cast<const org::apache::arrow::flatbuf::FixedSizeList *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Map *type_as_Map() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Map ? static_cast<const org::apache::arrow::flatbuf::Map *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::Duration *type_as_Duration() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::Duration ? static_cast<const org::apache::arrow::flatbuf::Duration *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::LargeBinary *type_as_LargeBinary() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::LargeBinary ? static_cast<const org::apache::arrow::flatbuf::LargeBinary *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::LargeUtf8 *type_as_LargeUtf8() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::LargeUtf8 ? static_cast<const org::apache::arrow::flatbuf::LargeUtf8 *>(type()) : nullptr;
+ }
+ const org::apache::arrow::flatbuf::LargeList *type_as_LargeList() const {
+ return type_type() == org::apache::arrow::flatbuf::Type::LargeList ? static_cast<const org::apache::arrow::flatbuf::LargeList *>(type()) : nullptr;
+ }
+ /// The dimensions of the tensor, optionally named
+ const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>> *shape() const {
+ return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>> *>(VT_SHAPE);
+ }
+ /// Non-negative byte offsets to advance one value cell along each dimension
+ /// If omitted, default to row-major order (C-like).
+ const flatbuffers::Vector<int64_t> *strides() const {
+ return GetPointer<const flatbuffers::Vector<int64_t> *>(VT_STRIDES);
+ }
+ /// The location and size of the tensor's data
+ const org::apache::arrow::flatbuf::Buffer *data() const {
+ return GetStruct<const org::apache::arrow::flatbuf::Buffer *>(VT_DATA);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<uint8_t>(verifier, VT_TYPE_TYPE) &&
+ VerifyOffsetRequired(verifier, VT_TYPE) &&
+ VerifyType(verifier, type(), type_type()) &&
+ VerifyOffsetRequired(verifier, VT_SHAPE) &&
+ verifier.VerifyVector(shape()) &&
+ verifier.VerifyVectorOfTables(shape()) &&
+ VerifyOffset(verifier, VT_STRIDES) &&
+ verifier.VerifyVector(strides()) &&
+ VerifyFieldRequired<org::apache::arrow::flatbuf::Buffer>(verifier, VT_DATA) &&
+ verifier.EndTable();
+ }
+};
+
+template<> inline const org::apache::arrow::flatbuf::Null *Tensor::type_as<org::apache::arrow::flatbuf::Null>() const {
+ return type_as_Null();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Int *Tensor::type_as<org::apache::arrow::flatbuf::Int>() const {
+ return type_as_Int();
+}
+
+template<> inline const org::apache::arrow::flatbuf::FloatingPoint *Tensor::type_as<org::apache::arrow::flatbuf::FloatingPoint>() const {
+ return type_as_FloatingPoint();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Binary *Tensor::type_as<org::apache::arrow::flatbuf::Binary>() const {
+ return type_as_Binary();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Utf8 *Tensor::type_as<org::apache::arrow::flatbuf::Utf8>() const {
+ return type_as_Utf8();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Bool *Tensor::type_as<org::apache::arrow::flatbuf::Bool>() const {
+ return type_as_Bool();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Decimal *Tensor::type_as<org::apache::arrow::flatbuf::Decimal>() const {
+ return type_as_Decimal();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Date *Tensor::type_as<org::apache::arrow::flatbuf::Date>() const {
+ return type_as_Date();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Time *Tensor::type_as<org::apache::arrow::flatbuf::Time>() const {
+ return type_as_Time();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Timestamp *Tensor::type_as<org::apache::arrow::flatbuf::Timestamp>() const {
+ return type_as_Timestamp();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Interval *Tensor::type_as<org::apache::arrow::flatbuf::Interval>() const {
+ return type_as_Interval();
+}
+
+template<> inline const org::apache::arrow::flatbuf::List *Tensor::type_as<org::apache::arrow::flatbuf::List>() const {
+ return type_as_List();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Struct_ *Tensor::type_as<org::apache::arrow::flatbuf::Struct_>() const {
+ return type_as_Struct_();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Union *Tensor::type_as<org::apache::arrow::flatbuf::Union>() const {
+ return type_as_Union();
+}
+
+template<> inline const org::apache::arrow::flatbuf::FixedSizeBinary *Tensor::type_as<org::apache::arrow::flatbuf::FixedSizeBinary>() const {
+ return type_as_FixedSizeBinary();
+}
+
+template<> inline const org::apache::arrow::flatbuf::FixedSizeList *Tensor::type_as<org::apache::arrow::flatbuf::FixedSizeList>() const {
+ return type_as_FixedSizeList();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Map *Tensor::type_as<org::apache::arrow::flatbuf::Map>() const {
+ return type_as_Map();
+}
+
+template<> inline const org::apache::arrow::flatbuf::Duration *Tensor::type_as<org::apache::arrow::flatbuf::Duration>() const {
+ return type_as_Duration();
+}
+
+template<> inline const org::apache::arrow::flatbuf::LargeBinary *Tensor::type_as<org::apache::arrow::flatbuf::LargeBinary>() const {
+ return type_as_LargeBinary();
+}
+
+template<> inline const org::apache::arrow::flatbuf::LargeUtf8 *Tensor::type_as<org::apache::arrow::flatbuf::LargeUtf8>() const {
+ return type_as_LargeUtf8();
+}
+
+template<> inline const org::apache::arrow::flatbuf::LargeList *Tensor::type_as<org::apache::arrow::flatbuf::LargeList>() const {
+ return type_as_LargeList();
+}
+
+struct TensorBuilder {
+ typedef Tensor Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_type_type(org::apache::arrow::flatbuf::Type type_type) {
+ fbb_.AddElement<uint8_t>(Tensor::VT_TYPE_TYPE, static_cast<uint8_t>(type_type), 0);
+ }
+ void add_type(flatbuffers::Offset<void> type) {
+ fbb_.AddOffset(Tensor::VT_TYPE, type);
+ }
+ void add_shape(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>>> shape) {
+ fbb_.AddOffset(Tensor::VT_SHAPE, shape);
+ }
+ void add_strides(flatbuffers::Offset<flatbuffers::Vector<int64_t>> strides) {
+ fbb_.AddOffset(Tensor::VT_STRIDES, strides);
+ }
+ void add_data(const org::apache::arrow::flatbuf::Buffer *data) {
+ fbb_.AddStruct(Tensor::VT_DATA, data);
+ }
+ explicit TensorBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ TensorBuilder &operator=(const TensorBuilder &);
+ flatbuffers::Offset<Tensor> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Tensor>(end);
+ fbb_.Required(o, Tensor::VT_TYPE);
+ fbb_.Required(o, Tensor::VT_SHAPE);
+ fbb_.Required(o, Tensor::VT_DATA);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Tensor> CreateTensor(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::Type type_type = org::apache::arrow::flatbuf::Type::NONE,
+ flatbuffers::Offset<void> type = 0,
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>>> shape = 0,
+ flatbuffers::Offset<flatbuffers::Vector<int64_t>> strides = 0,
+ const org::apache::arrow::flatbuf::Buffer *data = 0) {
+ TensorBuilder builder_(_fbb);
+ builder_.add_data(data);
+ builder_.add_strides(strides);
+ builder_.add_shape(shape);
+ builder_.add_type(type);
+ builder_.add_type_type(type_type);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Tensor> CreateTensorDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ org::apache::arrow::flatbuf::Type type_type = org::apache::arrow::flatbuf::Type::NONE,
+ flatbuffers::Offset<void> type = 0,
+ const std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>> *shape = nullptr,
+ const std::vector<int64_t> *strides = nullptr,
+ const org::apache::arrow::flatbuf::Buffer *data = 0) {
+ auto shape__ = shape ? _fbb.CreateVector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>>(*shape) : 0;
+ auto strides__ = strides ? _fbb.CreateVector<int64_t>(*strides) : 0;
+ return org::apache::arrow::flatbuf::CreateTensor(
+ _fbb,
+ type_type,
+ type,
+ shape__,
+ strides__,
+ data);
+}
+
+inline const org::apache::arrow::flatbuf::Tensor *GetTensor(const void *buf) {
+ return flatbuffers::GetRoot<org::apache::arrow::flatbuf::Tensor>(buf);
+}
+
+inline const org::apache::arrow::flatbuf::Tensor *GetSizePrefixedTensor(const void *buf) {
+ return flatbuffers::GetSizePrefixedRoot<org::apache::arrow::flatbuf::Tensor>(buf);
+}
+
+inline bool VerifyTensorBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifyBuffer<org::apache::arrow::flatbuf::Tensor>(nullptr);
+}
+
+inline bool VerifySizePrefixedTensorBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifySizePrefixedBuffer<org::apache::arrow::flatbuf::Tensor>(nullptr);
+}
+
+inline void FinishTensorBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Tensor> root) {
+ fbb.Finish(root);
+}
+
+inline void FinishSizePrefixedTensorBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<org::apache::arrow::flatbuf::Tensor> root) {
+ fbb.FinishSizePrefixed(root);
+}
+
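+// Editorial note (not compiler-generated): a minimal usage sketch for this
+// header. It assumes the companion Schema_generated.h types (Int, TensorDim,
+// Buffer) are available, as they are in the full Arrow source tree; names
+// such as `fbb`, the 2x3 shape, and the buffer location are illustrative only.
+//
+//   flatbuffers::FlatBufferBuilder fbb;
+//   // Element type: signed 32-bit integers (a Schema.fbs table).
+//   auto elem_type = org::apache::arrow::flatbuf::CreateInt(
+//       fbb, /*bitWidth=*/32, /*is_signed=*/true);
+//   std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::TensorDim>>
+//       dims = {CreateTensorDim(fbb, /*size=*/2),
+//               CreateTensorDim(fbb, /*size=*/3)};
+//   // Location of the tensor body elsewhere in the IPC payload: 6 int32s.
+//   org::apache::arrow::flatbuf::Buffer body(/*offset=*/0, /*length=*/24);
+//   auto tensor = CreateTensorDirect(fbb, Type::Int, elem_type.Union(), &dims,
+//                                    /*strides=*/nullptr, &body);
+//   FinishTensorBuffer(fbb, tensor);
+//
+//   // Reading back: type_type()/type_as_*() implement the usual FlatBuffers
+//   // union downcast and return nullptr on a tag mismatch.
+//   auto* root = GetTensor(fbb.GetBufferPointer());
+//   if (const auto* as_int = root->type_as_Int()) { /* use as_int */ }
+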
+} // namespace flatbuf
+} // namespace arrow
+} // namespace apache
+} // namespace org
+
+#endif // FLATBUFFERS_GENERATED_TENSOR_ORG_APACHE_ARROW_FLATBUF_H_
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/feather_generated.h b/contrib/libs/apache/arrow/cpp/src/generated/feather_generated.h
new file mode 100644
index 00000000000..b925eb2bc6a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/generated/feather_generated.h
@@ -0,0 +1,863 @@
+// automatically generated by the FlatBuffers compiler, do not modify
+
+
+#ifndef FLATBUFFERS_GENERATED_FEATHER_ARROW_IPC_FEATHER_FBS_H_
+#define FLATBUFFERS_GENERATED_FEATHER_ARROW_IPC_FEATHER_FBS_H_
+
+#include "flatbuffers/flatbuffers.h"
+
+namespace arrow {
+namespace ipc {
+namespace feather {
+namespace fbs {
+
+struct PrimitiveArray;
+struct PrimitiveArrayBuilder;
+
+struct CategoryMetadata;
+struct CategoryMetadataBuilder;
+
+struct TimestampMetadata;
+struct TimestampMetadataBuilder;
+
+struct DateMetadata;
+struct DateMetadataBuilder;
+
+struct TimeMetadata;
+struct TimeMetadataBuilder;
+
+struct Column;
+struct ColumnBuilder;
+
+struct CTable;
+struct CTableBuilder;
+
+/// Feather is an experimental serialization format implemented using
+/// techniques from Apache Arrow. It was created as a proof-of-concept of an
+/// interoperable file format for storing data frames originating in Python or
+/// R. It enabled the developers to sidestep some of the open design questions
+/// in Arrow from early 2016 and instead create something simple and useful for
+/// the intended use cases.
+enum class Type : int8_t {
+ BOOL = 0,
+ INT8 = 1,
+ INT16 = 2,
+ INT32 = 3,
+ INT64 = 4,
+ UINT8 = 5,
+ UINT16 = 6,
+ UINT32 = 7,
+ UINT64 = 8,
+ FLOAT = 9,
+ DOUBLE = 10,
+ UTF8 = 11,
+ BINARY = 12,
+ CATEGORY = 13,
+ TIMESTAMP = 14,
+ DATE = 15,
+ TIME = 16,
+ LARGE_UTF8 = 17,
+ LARGE_BINARY = 18,
+ MIN = BOOL,
+ MAX = LARGE_BINARY
+};
+
+inline const Type (&EnumValuesType())[19] {
+ static const Type values[] = {
+ Type::BOOL,
+ Type::INT8,
+ Type::INT16,
+ Type::INT32,
+ Type::INT64,
+ Type::UINT8,
+ Type::UINT16,
+ Type::UINT32,
+ Type::UINT64,
+ Type::FLOAT,
+ Type::DOUBLE,
+ Type::UTF8,
+ Type::BINARY,
+ Type::CATEGORY,
+ Type::TIMESTAMP,
+ Type::DATE,
+ Type::TIME,
+ Type::LARGE_UTF8,
+ Type::LARGE_BINARY
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesType() {
+ static const char * const names[20] = {
+ "BOOL",
+ "INT8",
+ "INT16",
+ "INT32",
+ "INT64",
+ "UINT8",
+ "UINT16",
+ "UINT32",
+ "UINT64",
+ "FLOAT",
+ "DOUBLE",
+ "UTF8",
+ "BINARY",
+ "CATEGORY",
+ "TIMESTAMP",
+ "DATE",
+ "TIME",
+ "LARGE_UTF8",
+ "LARGE_BINARY",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameType(Type e) {
+ if (flatbuffers::IsOutRange(e, Type::BOOL, Type::LARGE_BINARY)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesType()[index];
+}
+
+enum class Encoding : int8_t {
+ PLAIN = 0,
+ /// Data is stored dictionary-encoded
+ /// dictionary size: <INT32 Dictionary size>
+ /// dictionary data: <TYPE primitive array>
+ /// dictionary index: <INT32 primitive array>
+ ///
+ /// TODO: do we care about storing the index values in a smaller typeclass
+ DICTIONARY = 1,
+ MIN = PLAIN,
+ MAX = DICTIONARY
+};
+
+inline const Encoding (&EnumValuesEncoding())[2] {
+ static const Encoding values[] = {
+ Encoding::PLAIN,
+ Encoding::DICTIONARY
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesEncoding() {
+ static const char * const names[3] = {
+ "PLAIN",
+ "DICTIONARY",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameEncoding(Encoding e) {
+ if (flatbuffers::IsOutRange(e, Encoding::PLAIN, Encoding::DICTIONARY)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesEncoding()[index];
+}
+
+enum class TimeUnit : int8_t {
+ SECOND = 0,
+ MILLISECOND = 1,
+ MICROSECOND = 2,
+ NANOSECOND = 3,
+ MIN = SECOND,
+ MAX = NANOSECOND
+};
+
+inline const TimeUnit (&EnumValuesTimeUnit())[4] {
+ static const TimeUnit values[] = {
+ TimeUnit::SECOND,
+ TimeUnit::MILLISECOND,
+ TimeUnit::MICROSECOND,
+ TimeUnit::NANOSECOND
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesTimeUnit() {
+ static const char * const names[5] = {
+ "SECOND",
+ "MILLISECOND",
+ "MICROSECOND",
+ "NANOSECOND",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameTimeUnit(TimeUnit e) {
+ if (flatbuffers::IsOutRange(e, TimeUnit::SECOND, TimeUnit::NANOSECOND)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesTimeUnit()[index];
+}
+
+enum class TypeMetadata : uint8_t {
+ NONE = 0,
+ CategoryMetadata = 1,
+ TimestampMetadata = 2,
+ DateMetadata = 3,
+ TimeMetadata = 4,
+ MIN = NONE,
+ MAX = TimeMetadata
+};
+
+inline const TypeMetadata (&EnumValuesTypeMetadata())[5] {
+ static const TypeMetadata values[] = {
+ TypeMetadata::NONE,
+ TypeMetadata::CategoryMetadata,
+ TypeMetadata::TimestampMetadata,
+ TypeMetadata::DateMetadata,
+ TypeMetadata::TimeMetadata
+ };
+ return values;
+}
+
+inline const char * const *EnumNamesTypeMetadata() {
+ static const char * const names[6] = {
+ "NONE",
+ "CategoryMetadata",
+ "TimestampMetadata",
+ "DateMetadata",
+ "TimeMetadata",
+ nullptr
+ };
+ return names;
+}
+
+inline const char *EnumNameTypeMetadata(TypeMetadata e) {
+ if (flatbuffers::IsOutRange(e, TypeMetadata::NONE, TypeMetadata::TimeMetadata)) return "";
+ const size_t index = static_cast<size_t>(e);
+ return EnumNamesTypeMetadata()[index];
+}
+
+template<typename T> struct TypeMetadataTraits {
+ static const TypeMetadata enum_value = TypeMetadata::NONE;
+};
+
+template<> struct TypeMetadataTraits<arrow::ipc::feather::fbs::CategoryMetadata> {
+ static const TypeMetadata enum_value = TypeMetadata::CategoryMetadata;
+};
+
+template<> struct TypeMetadataTraits<arrow::ipc::feather::fbs::TimestampMetadata> {
+ static const TypeMetadata enum_value = TypeMetadata::TimestampMetadata;
+};
+
+template<> struct TypeMetadataTraits<arrow::ipc::feather::fbs::DateMetadata> {
+ static const TypeMetadata enum_value = TypeMetadata::DateMetadata;
+};
+
+template<> struct TypeMetadataTraits<arrow::ipc::feather::fbs::TimeMetadata> {
+ static const TypeMetadata enum_value = TypeMetadata::TimeMetadata;
+};
+
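+// Editorial note (not compiler-generated): TypeMetadataTraits maps a metadata
+// table type to its union discriminant at compile time, e.g.
+//
+//   static_assert(TypeMetadataTraits<TimestampMetadata>::enum_value ==
+//                     TypeMetadata::TimestampMetadata,
+//                 "traits map table type -> union tag");
+//
+// Generic code can use it to keep metadata_type and metadata consistent.
+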
+bool VerifyTypeMetadata(flatbuffers::Verifier &verifier, const void *obj, TypeMetadata type);
+bool VerifyTypeMetadataVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types);
+
+struct PrimitiveArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef PrimitiveArrayBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_TYPE = 4,
+ VT_ENCODING = 6,
+ VT_OFFSET = 8,
+ VT_LENGTH = 10,
+ VT_NULL_COUNT = 12,
+ VT_TOTAL_BYTES = 14
+ };
+ arrow::ipc::feather::fbs::Type type() const {
+ return static_cast<arrow::ipc::feather::fbs::Type>(GetField<int8_t>(VT_TYPE, 0));
+ }
+ arrow::ipc::feather::fbs::Encoding encoding() const {
+ return static_cast<arrow::ipc::feather::fbs::Encoding>(GetField<int8_t>(VT_ENCODING, 0));
+ }
+ /// Relative memory offset of the start of the array data excluding the size
+ /// of the metadata
+ int64_t offset() const {
+ return GetField<int64_t>(VT_OFFSET, 0);
+ }
+ /// The number of logical values in the array
+ int64_t length() const {
+ return GetField<int64_t>(VT_LENGTH, 0);
+ }
+ /// The number of observed nulls
+ int64_t null_count() const {
+ return GetField<int64_t>(VT_NULL_COUNT, 0);
+ }
+ /// The total size of the actual data in the file
+ int64_t total_bytes() const {
+ return GetField<int64_t>(VT_TOTAL_BYTES, 0);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int8_t>(verifier, VT_TYPE) &&
+ VerifyField<int8_t>(verifier, VT_ENCODING) &&
+ VerifyField<int64_t>(verifier, VT_OFFSET) &&
+ VerifyField<int64_t>(verifier, VT_LENGTH) &&
+ VerifyField<int64_t>(verifier, VT_NULL_COUNT) &&
+ VerifyField<int64_t>(verifier, VT_TOTAL_BYTES) &&
+ verifier.EndTable();
+ }
+};
+
+struct PrimitiveArrayBuilder {
+ typedef PrimitiveArray Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_type(arrow::ipc::feather::fbs::Type type) {
+ fbb_.AddElement<int8_t>(PrimitiveArray::VT_TYPE, static_cast<int8_t>(type), 0);
+ }
+ void add_encoding(arrow::ipc::feather::fbs::Encoding encoding) {
+ fbb_.AddElement<int8_t>(PrimitiveArray::VT_ENCODING, static_cast<int8_t>(encoding), 0);
+ }
+ void add_offset(int64_t offset) {
+ fbb_.AddElement<int64_t>(PrimitiveArray::VT_OFFSET, offset, 0);
+ }
+ void add_length(int64_t length) {
+ fbb_.AddElement<int64_t>(PrimitiveArray::VT_LENGTH, length, 0);
+ }
+ void add_null_count(int64_t null_count) {
+ fbb_.AddElement<int64_t>(PrimitiveArray::VT_NULL_COUNT, null_count, 0);
+ }
+ void add_total_bytes(int64_t total_bytes) {
+ fbb_.AddElement<int64_t>(PrimitiveArray::VT_TOTAL_BYTES, total_bytes, 0);
+ }
+ explicit PrimitiveArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ PrimitiveArrayBuilder &operator=(const PrimitiveArrayBuilder &);
+ flatbuffers::Offset<PrimitiveArray> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<PrimitiveArray>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<PrimitiveArray> CreatePrimitiveArray(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ arrow::ipc::feather::fbs::Type type = arrow::ipc::feather::fbs::Type::BOOL,
+ arrow::ipc::feather::fbs::Encoding encoding = arrow::ipc::feather::fbs::Encoding::PLAIN,
+ int64_t offset = 0,
+ int64_t length = 0,
+ int64_t null_count = 0,
+ int64_t total_bytes = 0) {
+ PrimitiveArrayBuilder builder_(_fbb);
+ builder_.add_total_bytes(total_bytes);
+ builder_.add_null_count(null_count);
+ builder_.add_length(length);
+ builder_.add_offset(offset);
+ builder_.add_encoding(encoding);
+ builder_.add_type(type);
+ return builder_.Finish();
+}
+
+struct CategoryMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef CategoryMetadataBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_LEVELS = 4,
+ VT_ORDERED = 6
+ };
+ /// The category codes are presumed to be integers that are valid indexes into
+ /// the levels array
+ const arrow::ipc::feather::fbs::PrimitiveArray *levels() const {
+ return GetPointer<const arrow::ipc::feather::fbs::PrimitiveArray *>(VT_LEVELS);
+ }
+ bool ordered() const {
+ return GetField<uint8_t>(VT_ORDERED, 0) != 0;
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyOffset(verifier, VT_LEVELS) &&
+ verifier.VerifyTable(levels()) &&
+ VerifyField<uint8_t>(verifier, VT_ORDERED) &&
+ verifier.EndTable();
+ }
+};
+
+struct CategoryMetadataBuilder {
+ typedef CategoryMetadata Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_levels(flatbuffers::Offset<arrow::ipc::feather::fbs::PrimitiveArray> levels) {
+ fbb_.AddOffset(CategoryMetadata::VT_LEVELS, levels);
+ }
+ void add_ordered(bool ordered) {
+ fbb_.AddElement<uint8_t>(CategoryMetadata::VT_ORDERED, static_cast<uint8_t>(ordered), 0);
+ }
+ explicit CategoryMetadataBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ CategoryMetadataBuilder &operator=(const CategoryMetadataBuilder &);
+ flatbuffers::Offset<CategoryMetadata> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<CategoryMetadata>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<CategoryMetadata> CreateCategoryMetadata(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ flatbuffers::Offset<arrow::ipc::feather::fbs::PrimitiveArray> levels = 0,
+ bool ordered = false) {
+ CategoryMetadataBuilder builder_(_fbb);
+ builder_.add_levels(levels);
+ builder_.add_ordered(ordered);
+ return builder_.Finish();
+}
+
+struct TimestampMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef TimestampMetadataBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_UNIT = 4,
+ VT_TIMEZONE = 6
+ };
+ arrow::ipc::feather::fbs::TimeUnit unit() const {
+ return static_cast<arrow::ipc::feather::fbs::TimeUnit>(GetField<int8_t>(VT_UNIT, 0));
+ }
+ /// Timestamp data is assumed to be UTC, but the time zone is stored here so
+ /// it can be presented in localized form.
+ const flatbuffers::String *timezone() const {
+ return GetPointer<const flatbuffers::String *>(VT_TIMEZONE);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int8_t>(verifier, VT_UNIT) &&
+ VerifyOffset(verifier, VT_TIMEZONE) &&
+ verifier.VerifyString(timezone()) &&
+ verifier.EndTable();
+ }
+};
+
+struct TimestampMetadataBuilder {
+ typedef TimestampMetadata Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_unit(arrow::ipc::feather::fbs::TimeUnit unit) {
+ fbb_.AddElement<int8_t>(TimestampMetadata::VT_UNIT, static_cast<int8_t>(unit), 0);
+ }
+ void add_timezone(flatbuffers::Offset<flatbuffers::String> timezone) {
+ fbb_.AddOffset(TimestampMetadata::VT_TIMEZONE, timezone);
+ }
+ explicit TimestampMetadataBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ TimestampMetadataBuilder &operator=(const TimestampMetadataBuilder &);
+ flatbuffers::Offset<TimestampMetadata> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<TimestampMetadata>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<TimestampMetadata> CreateTimestampMetadata(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ arrow::ipc::feather::fbs::TimeUnit unit = arrow::ipc::feather::fbs::TimeUnit::SECOND,
+ flatbuffers::Offset<flatbuffers::String> timezone = 0) {
+ TimestampMetadataBuilder builder_(_fbb);
+ builder_.add_timezone(timezone);
+ builder_.add_unit(unit);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<TimestampMetadata> CreateTimestampMetadataDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ arrow::ipc::feather::fbs::TimeUnit unit = arrow::ipc::feather::fbs::TimeUnit::SECOND,
+ const char *timezone = nullptr) {
+ auto timezone__ = timezone ? _fbb.CreateString(timezone) : 0;
+ return arrow::ipc::feather::fbs::CreateTimestampMetadata(
+ _fbb,
+ unit,
+ timezone__);
+}
+
+struct DateMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef DateMetadataBuilder Builder;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+};
+
+struct DateMetadataBuilder {
+ typedef DateMetadata Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit DateMetadataBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ DateMetadataBuilder &operator=(const DateMetadataBuilder &);
+ flatbuffers::Offset<DateMetadata> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<DateMetadata>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<DateMetadata> CreateDateMetadata(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ DateMetadataBuilder builder_(_fbb);
+ return builder_.Finish();
+}
+
+struct TimeMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef TimeMetadataBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_UNIT = 4
+ };
+ arrow::ipc::feather::fbs::TimeUnit unit() const {
+ return static_cast<arrow::ipc::feather::fbs::TimeUnit>(GetField<int8_t>(VT_UNIT, 0));
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int8_t>(verifier, VT_UNIT) &&
+ verifier.EndTable();
+ }
+};
+
+struct TimeMetadataBuilder {
+ typedef TimeMetadata Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_unit(arrow::ipc::feather::fbs::TimeUnit unit) {
+ fbb_.AddElement<int8_t>(TimeMetadata::VT_UNIT, static_cast<int8_t>(unit), 0);
+ }
+ explicit TimeMetadataBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ TimeMetadataBuilder &operator=(const TimeMetadataBuilder &);
+ flatbuffers::Offset<TimeMetadata> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<TimeMetadata>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<TimeMetadata> CreateTimeMetadata(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ arrow::ipc::feather::fbs::TimeUnit unit = arrow::ipc::feather::fbs::TimeUnit::SECOND) {
+ TimeMetadataBuilder builder_(_fbb);
+ builder_.add_unit(unit);
+ return builder_.Finish();
+}
+
+struct Column FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef ColumnBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_NAME = 4,
+ VT_VALUES = 6,
+ VT_METADATA_TYPE = 8,
+ VT_METADATA = 10,
+ VT_USER_METADATA = 12
+ };
+ const flatbuffers::String *name() const {
+ return GetPointer<const flatbuffers::String *>(VT_NAME);
+ }
+ const arrow::ipc::feather::fbs::PrimitiveArray *values() const {
+ return GetPointer<const arrow::ipc::feather::fbs::PrimitiveArray *>(VT_VALUES);
+ }
+ arrow::ipc::feather::fbs::TypeMetadata metadata_type() const {
+ return static_cast<arrow::ipc::feather::fbs::TypeMetadata>(GetField<uint8_t>(VT_METADATA_TYPE, 0));
+ }
+ const void *metadata() const {
+ return GetPointer<const void *>(VT_METADATA);
+ }
+ template<typename T> const T *metadata_as() const;
+ const arrow::ipc::feather::fbs::CategoryMetadata *metadata_as_CategoryMetadata() const {
+ return metadata_type() == arrow::ipc::feather::fbs::TypeMetadata::CategoryMetadata ? static_cast<const arrow::ipc::feather::fbs::CategoryMetadata *>(metadata()) : nullptr;
+ }
+ const arrow::ipc::feather::fbs::TimestampMetadata *metadata_as_TimestampMetadata() const {
+ return metadata_type() == arrow::ipc::feather::fbs::TypeMetadata::TimestampMetadata ? static_cast<const arrow::ipc::feather::fbs::TimestampMetadata *>(metadata()) : nullptr;
+ }
+ const arrow::ipc::feather::fbs::DateMetadata *metadata_as_DateMetadata() const {
+ return metadata_type() == arrow::ipc::feather::fbs::TypeMetadata::DateMetadata ? static_cast<const arrow::ipc::feather::fbs::DateMetadata *>(metadata()) : nullptr;
+ }
+ const arrow::ipc::feather::fbs::TimeMetadata *metadata_as_TimeMetadata() const {
+ return metadata_type() == arrow::ipc::feather::fbs::TypeMetadata::TimeMetadata ? static_cast<const arrow::ipc::feather::fbs::TimeMetadata *>(metadata()) : nullptr;
+ }
+ /// This should (probably) be JSON
+ const flatbuffers::String *user_metadata() const {
+ return GetPointer<const flatbuffers::String *>(VT_USER_METADATA);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyOffset(verifier, VT_NAME) &&
+ verifier.VerifyString(name()) &&
+ VerifyOffset(verifier, VT_VALUES) &&
+ verifier.VerifyTable(values()) &&
+ VerifyField<uint8_t>(verifier, VT_METADATA_TYPE) &&
+ VerifyOffset(verifier, VT_METADATA) &&
+ VerifyTypeMetadata(verifier, metadata(), metadata_type()) &&
+ VerifyOffset(verifier, VT_USER_METADATA) &&
+ verifier.VerifyString(user_metadata()) &&
+ verifier.EndTable();
+ }
+};
+
+template<> inline const arrow::ipc::feather::fbs::CategoryMetadata *Column::metadata_as<arrow::ipc::feather::fbs::CategoryMetadata>() const {
+ return metadata_as_CategoryMetadata();
+}
+
+template<> inline const arrow::ipc::feather::fbs::TimestampMetadata *Column::metadata_as<arrow::ipc::feather::fbs::TimestampMetadata>() const {
+ return metadata_as_TimestampMetadata();
+}
+
+template<> inline const arrow::ipc::feather::fbs::DateMetadata *Column::metadata_as<arrow::ipc::feather::fbs::DateMetadata>() const {
+ return metadata_as_DateMetadata();
+}
+
+template<> inline const arrow::ipc::feather::fbs::TimeMetadata *Column::metadata_as<arrow::ipc::feather::fbs::TimeMetadata>() const {
+ return metadata_as_TimeMetadata();
+}
+
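+// Editorial note (not compiler-generated): a sketch of type-safe access to
+// the metadata union above; `col` is an illustrative `const Column*`.
+//
+//   switch (col->metadata_type()) {
+//     case TypeMetadata::TimestampMetadata: {
+//       const auto* ts = col->metadata_as_TimestampMetadata();  // non-null here
+//       /* ts->unit(), ts->timezone() */
+//       break;
+//     }
+//     case TypeMetadata::NONE:
+//     default:
+//       break;  // no extra metadata for this column
+//   }
+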
+struct ColumnBuilder {
+ typedef Column Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_name(flatbuffers::Offset<flatbuffers::String> name) {
+ fbb_.AddOffset(Column::VT_NAME, name);
+ }
+ void add_values(flatbuffers::Offset<arrow::ipc::feather::fbs::PrimitiveArray> values) {
+ fbb_.AddOffset(Column::VT_VALUES, values);
+ }
+ void add_metadata_type(arrow::ipc::feather::fbs::TypeMetadata metadata_type) {
+ fbb_.AddElement<uint8_t>(Column::VT_METADATA_TYPE, static_cast<uint8_t>(metadata_type), 0);
+ }
+ void add_metadata(flatbuffers::Offset<void> metadata) {
+ fbb_.AddOffset(Column::VT_METADATA, metadata);
+ }
+ void add_user_metadata(flatbuffers::Offset<flatbuffers::String> user_metadata) {
+ fbb_.AddOffset(Column::VT_USER_METADATA, user_metadata);
+ }
+ explicit ColumnBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ ColumnBuilder &operator=(const ColumnBuilder &);
+ flatbuffers::Offset<Column> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<Column>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<Column> CreateColumn(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ flatbuffers::Offset<flatbuffers::String> name = 0,
+ flatbuffers::Offset<arrow::ipc::feather::fbs::PrimitiveArray> values = 0,
+ arrow::ipc::feather::fbs::TypeMetadata metadata_type = arrow::ipc::feather::fbs::TypeMetadata::NONE,
+ flatbuffers::Offset<void> metadata = 0,
+ flatbuffers::Offset<flatbuffers::String> user_metadata = 0) {
+ ColumnBuilder builder_(_fbb);
+ builder_.add_user_metadata(user_metadata);
+ builder_.add_metadata(metadata);
+ builder_.add_values(values);
+ builder_.add_name(name);
+ builder_.add_metadata_type(metadata_type);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Column> CreateColumnDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ const char *name = nullptr,
+ flatbuffers::Offset<arrow::ipc::feather::fbs::PrimitiveArray> values = 0,
+ arrow::ipc::feather::fbs::TypeMetadata metadata_type = arrow::ipc::feather::fbs::TypeMetadata::NONE,
+ flatbuffers::Offset<void> metadata = 0,
+ const char *user_metadata = nullptr) {
+ auto name__ = name ? _fbb.CreateString(name) : 0;
+ auto user_metadata__ = user_metadata ? _fbb.CreateString(user_metadata) : 0;
+ return arrow::ipc::feather::fbs::CreateColumn(
+ _fbb,
+ name__,
+ values,
+ metadata_type,
+ metadata,
+ user_metadata__);
+}
+
+struct CTable FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef CTableBuilder Builder;
+ enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+ VT_DESCRIPTION = 4,
+ VT_NUM_ROWS = 6,
+ VT_COLUMNS = 8,
+ VT_VERSION = 10,
+ VT_METADATA = 12
+ };
+ /// Optional text (or a name) metadata describing what the file is
+ const flatbuffers::String *description() const {
+ return GetPointer<const flatbuffers::String *>(VT_DESCRIPTION);
+ }
+ int64_t num_rows() const {
+ return GetField<int64_t>(VT_NUM_ROWS, 0);
+ }
+ const flatbuffers::Vector<flatbuffers::Offset<arrow::ipc::feather::fbs::Column>> *columns() const {
+ return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<arrow::ipc::feather::fbs::Column>> *>(VT_COLUMNS);
+ }
+ /// Version number of the Feather format
+ ///
+ /// Internal versions 0, 1, and 2: Implemented in Apache Arrow <= 0.16.0 and
+ /// wesm/feather. Uses "custom" metadata defined in this file.
+ int32_t version() const {
+ return GetField<int32_t>(VT_VERSION, 0);
+ }
+ /// Table metadata (likely JSON), not yet used
+ const flatbuffers::String *metadata() const {
+ return GetPointer<const flatbuffers::String *>(VT_METADATA);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyOffset(verifier, VT_DESCRIPTION) &&
+ verifier.VerifyString(description()) &&
+ VerifyField<int64_t>(verifier, VT_NUM_ROWS) &&
+ VerifyOffset(verifier, VT_COLUMNS) &&
+ verifier.VerifyVector(columns()) &&
+ verifier.VerifyVectorOfTables(columns()) &&
+ VerifyField<int32_t>(verifier, VT_VERSION) &&
+ VerifyOffset(verifier, VT_METADATA) &&
+ verifier.VerifyString(metadata()) &&
+ verifier.EndTable();
+ }
+};
+
+struct CTableBuilder {
+ typedef CTable Table;
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_description(flatbuffers::Offset<flatbuffers::String> description) {
+ fbb_.AddOffset(CTable::VT_DESCRIPTION, description);
+ }
+ void add_num_rows(int64_t num_rows) {
+ fbb_.AddElement<int64_t>(CTable::VT_NUM_ROWS, num_rows, 0);
+ }
+ void add_columns(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<arrow::ipc::feather::fbs::Column>>> columns) {
+ fbb_.AddOffset(CTable::VT_COLUMNS, columns);
+ }
+ void add_version(int32_t version) {
+ fbb_.AddElement<int32_t>(CTable::VT_VERSION, version, 0);
+ }
+ void add_metadata(flatbuffers::Offset<flatbuffers::String> metadata) {
+ fbb_.AddOffset(CTable::VT_METADATA, metadata);
+ }
+ explicit CTableBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ CTableBuilder &operator=(const CTableBuilder &);
+ flatbuffers::Offset<CTable> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<CTable>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<CTable> CreateCTable(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ flatbuffers::Offset<flatbuffers::String> description = 0,
+ int64_t num_rows = 0,
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<arrow::ipc::feather::fbs::Column>>> columns = 0,
+ int32_t version = 0,
+ flatbuffers::Offset<flatbuffers::String> metadata = 0) {
+ CTableBuilder builder_(_fbb);
+ builder_.add_num_rows(num_rows);
+ builder_.add_metadata(metadata);
+ builder_.add_version(version);
+ builder_.add_columns(columns);
+ builder_.add_description(description);
+ return builder_.Finish();
+}
+
+inline flatbuffers::Offset<CTable> CreateCTableDirect(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ const char *description = nullptr,
+ int64_t num_rows = 0,
+ const std::vector<flatbuffers::Offset<arrow::ipc::feather::fbs::Column>> *columns = nullptr,
+ int32_t version = 0,
+ const char *metadata = nullptr) {
+ auto description__ = description ? _fbb.CreateString(description) : 0;
+ auto columns__ = columns ? _fbb.CreateVector<flatbuffers::Offset<arrow::ipc::feather::fbs::Column>>(*columns) : 0;
+ auto metadata__ = metadata ? _fbb.CreateString(metadata) : 0;
+ return arrow::ipc::feather::fbs::CreateCTable(
+ _fbb,
+ description__,
+ num_rows,
+ columns__,
+ version,
+ metadata__);
+}
+
+inline bool VerifyTypeMetadata(flatbuffers::Verifier &verifier, const void *obj, TypeMetadata type) {
+ switch (type) {
+ case TypeMetadata::NONE: {
+ return true;
+ }
+ case TypeMetadata::CategoryMetadata: {
+ auto ptr = reinterpret_cast<const arrow::ipc::feather::fbs::CategoryMetadata *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case TypeMetadata::TimestampMetadata: {
+ auto ptr = reinterpret_cast<const arrow::ipc::feather::fbs::TimestampMetadata *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case TypeMetadata::DateMetadata: {
+ auto ptr = reinterpret_cast<const arrow::ipc::feather::fbs::DateMetadata *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case TypeMetadata::TimeMetadata: {
+ auto ptr = reinterpret_cast<const arrow::ipc::feather::fbs::TimeMetadata *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ default: return true;
+ }
+}
+
+inline bool VerifyTypeMetadataVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types) {
+ if (!values || !types) return !values && !types;
+ if (values->size() != types->size()) return false;
+ for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
+ if (!VerifyTypeMetadata(
+ verifier, values->Get(i), types->GetEnum<TypeMetadata>(i))) {
+ return false;
+ }
+ }
+ return true;
+}
+
+inline const arrow::ipc::feather::fbs::CTable *GetCTable(const void *buf) {
+ return flatbuffers::GetRoot<arrow::ipc::feather::fbs::CTable>(buf);
+}
+
+inline const arrow::ipc::feather::fbs::CTable *GetSizePrefixedCTable(const void *buf) {
+ return flatbuffers::GetSizePrefixedRoot<arrow::ipc::feather::fbs::CTable>(buf);
+}
+
+inline bool VerifyCTableBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifyBuffer<arrow::ipc::feather::fbs::CTable>(nullptr);
+}
+
+inline bool VerifySizePrefixedCTableBuffer(
+ flatbuffers::Verifier &verifier) {
+ return verifier.VerifySizePrefixedBuffer<arrow::ipc::feather::fbs::CTable>(nullptr);
+}
+
+inline void FinishCTableBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<arrow::ipc::feather::fbs::CTable> root) {
+ fbb.Finish(root);
+}
+
+inline void FinishSizePrefixedCTableBuffer(
+ flatbuffers::FlatBufferBuilder &fbb,
+ flatbuffers::Offset<arrow::ipc::feather::fbs::CTable> root) {
+ fbb.FinishSizePrefixed(root);
+}
+
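+// Editorial note (not compiler-generated): a minimal end-to-end sketch using
+// only this header; the sizes, offsets, and column name are illustrative, not
+// a real Feather file layout.
+//
+//   flatbuffers::FlatBufferBuilder fbb;
+//   auto values = CreatePrimitiveArray(fbb, Type::INT64, Encoding::PLAIN,
+//                                      /*offset=*/0, /*length=*/100,
+//                                      /*null_count=*/0, /*total_bytes=*/800);
+//   auto col = CreateColumnDirect(fbb, "f0", values);
+//   std::vector<flatbuffers::Offset<Column>> cols = {col};
+//   auto ctable = CreateCTableDirect(fbb, "example", /*num_rows=*/100, &cols,
+//                                    /*version=*/2);
+//   FinishCTableBuffer(fbb, ctable);
+//
+//   // Read back; verify first when the buffer comes from untrusted input.
+//   flatbuffers::Verifier verifier(fbb.GetBufferPointer(), fbb.GetSize());
+//   if (VerifyCTableBuffer(verifier)) {
+//     const CTable* table = GetCTable(fbb.GetBufferPointer());
+//     /* table->num_rows(), table->columns()->Get(0)->name() */
+//   }
+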
+} // namespace fbs
+} // namespace feather
+} // namespace ipc
+} // namespace arrow
+
+#endif // FLATBUFFERS_GENERATED_FEATHER_ARROW_IPC_FEATHER_FBS_H_
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp
new file mode 100644
index 00000000000..b1b4ce62673
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp
@@ -0,0 +1,17 @@
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#include "parquet_constants.h"
+
+namespace parquet { namespace format {
+
+const parquetConstants g_parquet_constants;
+
+parquetConstants::parquetConstants() {
+}
+
+}} // namespace
+
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h
new file mode 100644
index 00000000000..1e288c7cd1f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h
@@ -0,0 +1,24 @@
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#ifndef parquet_CONSTANTS_H
+#define parquet_CONSTANTS_H
+
+#include "parquet_types.h"
+
+namespace parquet { namespace format {
+
+class parquetConstants {
+ public:
+ parquetConstants();
+
+};
+
+extern const parquetConstants g_parquet_constants;
+
+}} // namespace
+
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp
new file mode 100644
index 00000000000..7c7289658ee
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp
@@ -0,0 +1,7415 @@
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#include "parquet_types.h"
+
+#include <algorithm>
+#include <ostream>
+
+#include <thrift/TToString.h>
+
+namespace parquet { namespace format {
+
+int _kTypeValues[] = {
+ Type::BOOLEAN,
+ Type::INT32,
+ Type::INT64,
+ Type::INT96,
+ Type::FLOAT,
+ Type::DOUBLE,
+ Type::BYTE_ARRAY,
+ Type::FIXED_LEN_BYTE_ARRAY
+};
+const char* _kTypeNames[] = {
+ "BOOLEAN",
+ "INT32",
+ "INT64",
+ "INT96",
+ "FLOAT",
+ "DOUBLE",
+ "BYTE_ARRAY",
+ "FIXED_LEN_BYTE_ARRAY"
+};
+const std::map<int, const char*> _Type_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(8, _kTypeValues, _kTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const Type::type& val) {
+ std::map<int, const char*>::const_iterator it = _Type_VALUES_TO_NAMES.find(val);
+ if (it != _Type_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const Type::type& val) {
+ std::map<int, const char*>::const_iterator it = _Type_VALUES_TO_NAMES.find(val);
+ if (it != _Type_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
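+// Editorial note (not compiler-generated): each Thrift enum in this file gets
+// the same value->name map plus operator<< and to_string helpers, e.g.
+//
+//   parquet::format::to_string(parquet::format::Type::INT96);  // "INT96"
+//   // Unknown values fall back to the numeric representation:
+//   parquet::format::to_string(
+//       static_cast<parquet::format::Type::type>(42));  // "42"
+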
+int _kConvertedTypeValues[] = {
+ ConvertedType::UTF8,
+ ConvertedType::MAP,
+ ConvertedType::MAP_KEY_VALUE,
+ ConvertedType::LIST,
+ ConvertedType::ENUM,
+ ConvertedType::DECIMAL,
+ ConvertedType::DATE,
+ ConvertedType::TIME_MILLIS,
+ ConvertedType::TIME_MICROS,
+ ConvertedType::TIMESTAMP_MILLIS,
+ ConvertedType::TIMESTAMP_MICROS,
+ ConvertedType::UINT_8,
+ ConvertedType::UINT_16,
+ ConvertedType::UINT_32,
+ ConvertedType::UINT_64,
+ ConvertedType::INT_8,
+ ConvertedType::INT_16,
+ ConvertedType::INT_32,
+ ConvertedType::INT_64,
+ ConvertedType::JSON,
+ ConvertedType::BSON,
+ ConvertedType::INTERVAL
+};
+const char* _kConvertedTypeNames[] = {
+ "UTF8",
+ "MAP",
+ "MAP_KEY_VALUE",
+ "LIST",
+ "ENUM",
+ "DECIMAL",
+ "DATE",
+ "TIME_MILLIS",
+ "TIME_MICROS",
+ "TIMESTAMP_MILLIS",
+ "TIMESTAMP_MICROS",
+ "UINT_8",
+ "UINT_16",
+ "UINT_32",
+ "UINT_64",
+ "INT_8",
+ "INT_16",
+ "INT_32",
+ "INT_64",
+ "JSON",
+ "BSON",
+ "INTERVAL"
+};
+const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(22, _kConvertedTypeValues, _kConvertedTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val) {
+ std::map<int, const char*>::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val);
+ if (it != _ConvertedType_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const ConvertedType::type& val) {
+ std::map<int, const char*>::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val);
+ if (it != _ConvertedType_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kFieldRepetitionTypeValues[] = {
+ FieldRepetitionType::REQUIRED,
+ FieldRepetitionType::OPTIONAL,
+ FieldRepetitionType::REPEATED
+};
+const char* _kFieldRepetitionTypeNames[] = {
+ "REQUIRED",
+ "OPTIONAL",
+ "REPEATED"
+};
+const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kFieldRepetitionTypeValues, _kFieldRepetitionTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val) {
+ std::map<int, const char*>::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val);
+ if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const FieldRepetitionType::type& val) {
+ std::map<int, const char*>::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val);
+ if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kEncodingValues[] = {
+ Encoding::PLAIN,
+ Encoding::PLAIN_DICTIONARY,
+ Encoding::RLE,
+ Encoding::BIT_PACKED,
+ Encoding::DELTA_BINARY_PACKED,
+ Encoding::DELTA_LENGTH_BYTE_ARRAY,
+ Encoding::DELTA_BYTE_ARRAY,
+ Encoding::RLE_DICTIONARY,
+ Encoding::BYTE_STREAM_SPLIT
+};
+const char* _kEncodingNames[] = {
+ "PLAIN",
+ "PLAIN_DICTIONARY",
+ "RLE",
+ "BIT_PACKED",
+ "DELTA_BINARY_PACKED",
+ "DELTA_LENGTH_BYTE_ARRAY",
+ "DELTA_BYTE_ARRAY",
+ "RLE_DICTIONARY",
+ "BYTE_STREAM_SPLIT"
+};
+const std::map<int, const char*> _Encoding_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(9, _kEncodingValues, _kEncodingNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const Encoding::type& val) {
+ std::map<int, const char*>::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val);
+ if (it != _Encoding_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const Encoding::type& val) {
+ std::map<int, const char*>::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val);
+ if (it != _Encoding_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kCompressionCodecValues[] = {
+ CompressionCodec::UNCOMPRESSED,
+ CompressionCodec::SNAPPY,
+ CompressionCodec::GZIP,
+ CompressionCodec::LZO,
+ CompressionCodec::BROTLI,
+ CompressionCodec::LZ4,
+ CompressionCodec::ZSTD,
+ CompressionCodec::LZ4_RAW
+};
+const char* _kCompressionCodecNames[] = {
+ "UNCOMPRESSED",
+ "SNAPPY",
+ "GZIP",
+ "LZO",
+ "BROTLI",
+ "LZ4",
+ "ZSTD",
+ "LZ4_RAW"
+};
+const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(8, _kCompressionCodecValues, _kCompressionCodecNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val) {
+ std::map<int, const char*>::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val);
+ if (it != _CompressionCodec_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const CompressionCodec::type& val) {
+ std::map<int, const char*>::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val);
+ if (it != _CompressionCodec_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kPageTypeValues[] = {
+ PageType::DATA_PAGE,
+ PageType::INDEX_PAGE,
+ PageType::DICTIONARY_PAGE,
+ PageType::DATA_PAGE_V2
+};
+const char* _kPageTypeNames[] = {
+ "DATA_PAGE",
+ "INDEX_PAGE",
+ "DICTIONARY_PAGE",
+ "DATA_PAGE_V2"
+};
+const std::map<int, const char*> _PageType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(4, _kPageTypeValues, _kPageTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const PageType::type& val) {
+ std::map<int, const char*>::const_iterator it = _PageType_VALUES_TO_NAMES.find(val);
+ if (it != _PageType_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const PageType::type& val) {
+ std::map<int, const char*>::const_iterator it = _PageType_VALUES_TO_NAMES.find(val);
+ if (it != _PageType_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kBoundaryOrderValues[] = {
+ BoundaryOrder::UNORDERED,
+ BoundaryOrder::ASCENDING,
+ BoundaryOrder::DESCENDING
+};
+const char* _kBoundaryOrderNames[] = {
+ "UNORDERED",
+ "ASCENDING",
+ "DESCENDING"
+};
+const std::map<int, const char*> _BoundaryOrder_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kBoundaryOrderValues, _kBoundaryOrderNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val) {
+ std::map<int, const char*>::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val);
+ if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const BoundaryOrder::type& val) {
+ std::map<int, const char*>::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val);
+ if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+
+Statistics::~Statistics() noexcept {
+}
+
+
+void Statistics::__set_max(const std::string& val) {
+ this->max = val;
+__isset.max = true;
+}
+
+void Statistics::__set_min(const std::string& val) {
+ this->min = val;
+__isset.min = true;
+}
+
+void Statistics::__set_null_count(const int64_t val) {
+ this->null_count = val;
+__isset.null_count = true;
+}
+
+void Statistics::__set_distinct_count(const int64_t val) {
+ this->distinct_count = val;
+__isset.distinct_count = true;
+}
+
+void Statistics::__set_max_value(const std::string& val) {
+ this->max_value = val;
+__isset.max_value = true;
+}
+
+void Statistics::__set_min_value(const std::string& val) {
+ this->min_value = val;
+__isset.min_value = true;
+}
+std::ostream& operator<<(std::ostream& out, const Statistics& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t Statistics::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->max);
+ this->__isset.max = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->min);
+ this->__isset.min = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->null_count);
+ this->__isset.null_count = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->distinct_count);
+ this->__isset.distinct_count = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->max_value);
+ this->__isset.max_value = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->min_value);
+ this->__isset.min_value = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t Statistics::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("Statistics");
+
+ if (this->__isset.max) {
+ xfer += oprot->writeFieldBegin("max", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeBinary(this->max);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.min) {
+ xfer += oprot->writeFieldBegin("min", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->min);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.null_count) {
+ xfer += oprot->writeFieldBegin("null_count", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->null_count);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.distinct_count) {
+ xfer += oprot->writeFieldBegin("distinct_count", ::apache::thrift::protocol::T_I64, 4);
+ xfer += oprot->writeI64(this->distinct_count);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.max_value) {
+ xfer += oprot->writeFieldBegin("max_value", ::apache::thrift::protocol::T_STRING, 5);
+ xfer += oprot->writeBinary(this->max_value);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.min_value) {
+ xfer += oprot->writeFieldBegin("min_value", ::apache::thrift::protocol::T_STRING, 6);
+ xfer += oprot->writeBinary(this->min_value);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(Statistics &a, Statistics &b) {
+ using ::std::swap;
+ swap(a.max, b.max);
+ swap(a.min, b.min);
+ swap(a.null_count, b.null_count);
+ swap(a.distinct_count, b.distinct_count);
+ swap(a.max_value, b.max_value);
+ swap(a.min_value, b.min_value);
+ swap(a.__isset, b.__isset);
+}
+
+Statistics::Statistics(const Statistics& other0) {
+ max = other0.max;
+ min = other0.min;
+ null_count = other0.null_count;
+ distinct_count = other0.distinct_count;
+ max_value = other0.max_value;
+ min_value = other0.min_value;
+ __isset = other0.__isset;
+}
+Statistics& Statistics::operator=(const Statistics& other1) {
+ max = other1.max;
+ min = other1.min;
+ null_count = other1.null_count;
+ distinct_count = other1.distinct_count;
+ max_value = other1.max_value;
+ min_value = other1.min_value;
+ __isset = other1.__isset;
+ return *this;
+}
+void Statistics::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "Statistics(";
+ out << "max="; (__isset.max ? (out << to_string(max)) : (out << "<null>"));
+ out << ", " << "min="; (__isset.min ? (out << to_string(min)) : (out << "<null>"));
+ out << ", " << "null_count="; (__isset.null_count ? (out << to_string(null_count)) : (out << "<null>"));
+ out << ", " << "distinct_count="; (__isset.distinct_count ? (out << to_string(distinct_count)) : (out << "<null>"));
+ out << ", " << "max_value="; (__isset.max_value ? (out << to_string(max_value)) : (out << "<null>"));
+ out << ", " << "min_value="; (__isset.min_value ? (out << to_string(min_value)) : (out << "<null>"));
+ out << ")";
+}
+
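+// Editorial note (not compiler-generated): a sketch of round-tripping the
+// struct above through an in-memory transport with the compact protocol that
+// Parquet uses on the wire; assumes the standard Thrift runtime headers
+// (<thrift/protocol/TCompactProtocol.h>, <thrift/transport/TBufferTransports.h>).
+//
+//   auto mem = std::make_shared<apache::thrift::transport::TMemoryBuffer>();
+//   apache::thrift::protocol::TCompactProtocol proto(mem);
+//   parquet::format::Statistics stats;
+//   stats.__set_null_count(0);
+//   stats.write(&proto);   // encode into the memory buffer
+//   parquet::format::Statistics decoded;
+//   decoded.read(&proto);  // decode from the same buffer
+//   assert(decoded.__isset.null_count && decoded.null_count == 0);
+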
+
+StringType::~StringType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const StringType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t StringType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t StringType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("StringType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(StringType &a, StringType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+StringType::StringType(const StringType& other2) {
+ (void) other2;
+}
+StringType& StringType::operator=(const StringType& other3) {
+ (void) other3;
+ return *this;
+}
+void StringType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "StringType(";
+ out << ")";
+}
+
+
+UUIDType::~UUIDType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const UUIDType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t UUIDType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t UUIDType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("UUIDType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(UUIDType &a, UUIDType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+UUIDType::UUIDType(const UUIDType& other4) {
+ (void) other4;
+}
+UUIDType& UUIDType::operator=(const UUIDType& other5) {
+ (void) other5;
+ return *this;
+}
+void UUIDType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "UUIDType(";
+ out << ")";
+}
+
+
+MapType::~MapType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const MapType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t MapType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t MapType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("MapType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(MapType &a, MapType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+MapType::MapType(const MapType& other6) {
+ (void) other6;
+}
+MapType& MapType::operator=(const MapType& other7) {
+ (void) other7;
+ return *this;
+}
+void MapType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "MapType(";
+ out << ")";
+}
+
+
+ListType::~ListType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const ListType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ListType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t ListType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ListType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ListType &a, ListType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+ListType::ListType(const ListType& other8) {
+ (void) other8;
+}
+ListType& ListType::operator=(const ListType& other9) {
+ (void) other9;
+ return *this;
+}
+void ListType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ListType(";
+ out << ")";
+}
+
+
+EnumType::~EnumType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const EnumType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t EnumType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t EnumType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EnumType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EnumType &a, EnumType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+EnumType::EnumType(const EnumType& other10) {
+ (void) other10;
+}
+EnumType& EnumType::operator=(const EnumType& other11) {
+ (void) other11;
+ return *this;
+}
+void EnumType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EnumType(";
+ out << ")";
+}
+
+
+DateType::~DateType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const DateType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DateType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t DateType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DateType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DateType &a, DateType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+DateType::DateType(const DateType& other12) {
+ (void) other12;
+}
+DateType& DateType::operator=(const DateType& other13) {
+ (void) other13;
+ return *this;
+}
+void DateType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DateType(";
+ out << ")";
+}
+
+
+NullType::~NullType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const NullType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t NullType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t NullType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("NullType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(NullType &a, NullType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+NullType::NullType(const NullType& other14) {
+ (void) other14;
+}
+NullType& NullType::operator=(const NullType& other15) {
+ (void) other15;
+ return *this;
+}
+void NullType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "NullType(";
+ out << ")";
+}
+
+
+DecimalType::~DecimalType() noexcept {
+}
+
+
+void DecimalType::__set_scale(const int32_t val) {
+ this->scale = val;
+}
+
+void DecimalType::__set_precision(const int32_t val) {
+ this->precision = val;
+}
+std::ostream& operator<<(std::ostream& out, const DecimalType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DecimalType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_scale = false;
+ bool isset_precision = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->scale);
+ isset_scale = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->precision);
+ isset_precision = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_scale)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_precision)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DecimalType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DecimalType");
+
+ xfer += oprot->writeFieldBegin("scale", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->scale);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("precision", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->precision);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DecimalType &a, DecimalType &b) {
+ using ::std::swap;
+ swap(a.scale, b.scale);
+ swap(a.precision, b.precision);
+}
+
+DecimalType::DecimalType(const DecimalType& other16) {
+ scale = other16.scale;
+ precision = other16.precision;
+}
+DecimalType& DecimalType::operator=(const DecimalType& other17) {
+ scale = other17.scale;
+ precision = other17.precision;
+ return *this;
+}
+void DecimalType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DecimalType(";
+ out << "scale=" << to_string(scale);
+ out << ", " << "precision=" << to_string(precision);
+ out << ")";
+}
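+
+// A minimal usage sketch (assuming Thrift's TMemoryBuffer and
+// TCompactProtocol, the protocol Parquet metadata is normally written
+// with): serialize a DECIMAL(10, 2) annotation.
+//
+//   DecimalType dec;
+//   dec.__set_scale(2);
+//   dec.__set_precision(10);
+//   auto buf = std::make_shared<apache::thrift::transport::TMemoryBuffer>();
+//   apache::thrift::protocol::TCompactProtocol proto(buf);
+//   dec.write(&proto);  // fields 1 (scale) and 2 (precision), then stop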
+
+
+MilliSeconds::~MilliSeconds() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const MilliSeconds& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t MilliSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t MilliSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("MilliSeconds");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(MilliSeconds &a, MilliSeconds &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+MilliSeconds::MilliSeconds(const MilliSeconds& other18) {
+ (void) other18;
+}
+MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other19) {
+ (void) other19;
+ return *this;
+}
+void MilliSeconds::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "MilliSeconds(";
+ out << ")";
+}
+
+
+MicroSeconds::~MicroSeconds() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const MicroSeconds& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t MicroSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t MicroSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("MicroSeconds");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(MicroSeconds &a, MicroSeconds &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+MicroSeconds::MicroSeconds(const MicroSeconds& other20) {
+ (void) other20;
+}
+MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other21) {
+ (void) other21;
+ return *this;
+}
+void MicroSeconds::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "MicroSeconds(";
+ out << ")";
+}
+
+
+NanoSeconds::~NanoSeconds() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const NanoSeconds& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t NanoSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t NanoSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("NanoSeconds");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(NanoSeconds &a, NanoSeconds &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+NanoSeconds::NanoSeconds(const NanoSeconds& other22) {
+ (void) other22;
+}
+NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other23) {
+ (void) other23;
+ return *this;
+}
+void NanoSeconds::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "NanoSeconds(";
+ out << ")";
+}
+
+
+TimeUnit::~TimeUnit() noexcept {
+}
+
+
+void TimeUnit::__set_MILLIS(const MilliSeconds& val) {
+ this->MILLIS = val;
+ __isset.MILLIS = true;

+}
+
+void TimeUnit::__set_MICROS(const MicroSeconds& val) {
+ this->MICROS = val;
+ __isset.MICROS = true;
+}
+
+void TimeUnit::__set_NANOS(const NanoSeconds& val) {
+ this->NANOS = val;
+ __isset.NANOS = true;
+}
+std::ostream& operator<<(std::ostream& out, const TimeUnit& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TimeUnit::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->MILLIS.read(iprot);
+ this->__isset.MILLIS = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->MICROS.read(iprot);
+ this->__isset.MICROS = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->NANOS.read(iprot);
+ this->__isset.NANOS = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t TimeUnit::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TimeUnit");
+
+ if (this->__isset.MILLIS) {
+ xfer += oprot->writeFieldBegin("MILLIS", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->MILLIS.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.MICROS) {
+ xfer += oprot->writeFieldBegin("MICROS", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->MICROS.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.NANOS) {
+ xfer += oprot->writeFieldBegin("NANOS", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->NANOS.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TimeUnit &a, TimeUnit &b) {
+ using ::std::swap;
+ swap(a.MILLIS, b.MILLIS);
+ swap(a.MICROS, b.MICROS);
+ swap(a.NANOS, b.NANOS);
+ swap(a.__isset, b.__isset);
+}
+
+TimeUnit::TimeUnit(const TimeUnit& other24) {
+ MILLIS = other24.MILLIS;
+ MICROS = other24.MICROS;
+ NANOS = other24.NANOS;
+ __isset = other24.__isset;
+}
+TimeUnit& TimeUnit::operator=(const TimeUnit& other25) {
+ MILLIS = other25.MILLIS;
+ MICROS = other25.MICROS;
+ NANOS = other25.NANOS;
+ __isset = other25.__isset;
+ return *this;
+}
+void TimeUnit::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TimeUnit(";
+ out << "MILLIS="; (__isset.MILLIS ? (out << to_string(MILLIS)) : (out << "<null>"));
+ out << ", " << "MICROS="; (__isset.MICROS ? (out << to_string(MICROS)) : (out << "<null>"));
+ out << ", " << "NANOS="; (__isset.NANOS ? (out << to_string(NANOS)) : (out << "<null>"));
+ out << ")";
+}
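+
+// TimeUnit is declared as a union in parquet.thrift: exactly one of MILLIS,
+// MICROS, or NANOS is expected to be set, and write() emits whichever
+// members have their __isset flag raised.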
+
+
+TimestampType::~TimestampType() noexcept {
+}
+
+
+void TimestampType::__set_isAdjustedToUTC(const bool val) {
+ this->isAdjustedToUTC = val;
+}
+
+void TimestampType::__set_unit(const TimeUnit& val) {
+ this->unit = val;
+}
+std::ostream& operator<<(std::ostream& out, const TimestampType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TimestampType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_isAdjustedToUTC = false;
+ bool isset_unit = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->isAdjustedToUTC);
+ isset_isAdjustedToUTC = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->unit.read(iprot);
+ isset_unit = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_isAdjustedToUTC)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_unit)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t TimestampType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TimestampType");
+
+ xfer += oprot->writeFieldBegin("isAdjustedToUTC", ::apache::thrift::protocol::T_BOOL, 1);
+ xfer += oprot->writeBool(this->isAdjustedToUTC);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("unit", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->unit.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TimestampType &a, TimestampType &b) {
+ using ::std::swap;
+ swap(a.isAdjustedToUTC, b.isAdjustedToUTC);
+ swap(a.unit, b.unit);
+}
+
+TimestampType::TimestampType(const TimestampType& other26) {
+ isAdjustedToUTC = other26.isAdjustedToUTC;
+ unit = other26.unit;
+}
+TimestampType& TimestampType::operator=(const TimestampType& other27) {
+ isAdjustedToUTC = other27.isAdjustedToUTC;
+ unit = other27.unit;
+ return *this;
+}
+void TimestampType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TimestampType(";
+ out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC);
+ out << ", " << "unit=" << to_string(unit);
+ out << ")";
+}
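+
+// A minimal construction sketch using only the setters generated above:
+// a TIMESTAMP(isAdjustedToUTC=true, unit=MICROS) annotation.
+//
+//   TimeUnit unit;
+//   unit.__set_MICROS(MicroSeconds());
+//   TimestampType ts;
+//   ts.__set_isAdjustedToUTC(true);
+//   ts.__set_unit(unit);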
+
+
+TimeType::~TimeType() noexcept {
+}
+
+
+void TimeType::__set_isAdjustedToUTC(const bool val) {
+ this->isAdjustedToUTC = val;
+}
+
+void TimeType::__set_unit(const TimeUnit& val) {
+ this->unit = val;
+}
+std::ostream& operator<<(std::ostream& out, const TimeType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TimeType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_isAdjustedToUTC = false;
+ bool isset_unit = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->isAdjustedToUTC);
+ isset_isAdjustedToUTC = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->unit.read(iprot);
+ isset_unit = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_isAdjustedToUTC)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_unit)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t TimeType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TimeType");
+
+ xfer += oprot->writeFieldBegin("isAdjustedToUTC", ::apache::thrift::protocol::T_BOOL, 1);
+ xfer += oprot->writeBool(this->isAdjustedToUTC);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("unit", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->unit.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TimeType &a, TimeType &b) {
+ using ::std::swap;
+ swap(a.isAdjustedToUTC, b.isAdjustedToUTC);
+ swap(a.unit, b.unit);
+}
+
+TimeType::TimeType(const TimeType& other28) {
+ isAdjustedToUTC = other28.isAdjustedToUTC;
+ unit = other28.unit;
+}
+TimeType& TimeType::operator=(const TimeType& other29) {
+ isAdjustedToUTC = other29.isAdjustedToUTC;
+ unit = other29.unit;
+ return *this;
+}
+void TimeType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TimeType(";
+ out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC);
+ out << ", " << "unit=" << to_string(unit);
+ out << ")";
+}
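+
+// TimeType mirrors TimestampType field-for-field: the same required
+// isAdjustedToUTC flag and TimeUnit, differing only in the struct name
+// written to the wire.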
+
+
+IntType::~IntType() noexcept {
+}
+
+
+void IntType::__set_bitWidth(const int8_t val) {
+ this->bitWidth = val;
+}
+
+void IntType::__set_isSigned(const bool val) {
+ this->isSigned = val;
+}
+std::ostream& operator<<(std::ostream& out, const IntType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t IntType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_bitWidth = false;
+ bool isset_isSigned = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_BYTE) {
+ xfer += iprot->readByte(this->bitWidth);
+ isset_bitWidth = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->isSigned);
+ isset_isSigned = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_bitWidth)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_isSigned)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t IntType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("IntType");
+
+ xfer += oprot->writeFieldBegin("bitWidth", ::apache::thrift::protocol::T_BYTE, 1);
+ xfer += oprot->writeByte(this->bitWidth);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("isSigned", ::apache::thrift::protocol::T_BOOL, 2);
+ xfer += oprot->writeBool(this->isSigned);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(IntType &a, IntType &b) {
+ using ::std::swap;
+ swap(a.bitWidth, b.bitWidth);
+ swap(a.isSigned, b.isSigned);
+}
+
+IntType::IntType(const IntType& other30) {
+ bitWidth = other30.bitWidth;
+ isSigned = other30.isSigned;
+}
+IntType& IntType::operator=(const IntType& other31) {
+ bitWidth = other31.bitWidth;
+ isSigned = other31.isSigned;
+ return *this;
+}
+void IntType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "IntType(";
+ out << "bitWidth=" << to_string(bitWidth);
+ out << ", " << "isSigned=" << to_string(isSigned);
+ out << ")";
+}
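+
+// bitWidth travels as a single Thrift byte (T_BYTE into an int8_t), which
+// is wide enough for the bit widths Parquet defines (8, 16, 32, and 64).
+// A minimal sketch: IntType it; it.__set_bitWidth(64); it.__set_isSigned(true);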
+
+
+JsonType::~JsonType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const JsonType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t JsonType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t JsonType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("JsonType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(JsonType &a, JsonType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+JsonType::JsonType(const JsonType& other32) {
+ (void) other32;
+}
+JsonType& JsonType::operator=(const JsonType& other33) {
+ (void) other33;
+ return *this;
+}
+void JsonType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "JsonType(";
+ out << ")";
+}
+
+
+BsonType::~BsonType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const BsonType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BsonType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BsonType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BsonType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BsonType &a, BsonType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+BsonType::BsonType(const BsonType& other34) {
+ (void) other34;
+}
+BsonType& BsonType::operator=(const BsonType& other35) {
+ (void) other35;
+ return *this;
+}
+void BsonType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BsonType(";
+ out << ")";
+}
+
+
+LogicalType::~LogicalType() noexcept {
+}
+
+
+void LogicalType::__set_STRING(const StringType& val) {
+ this->STRING = val;
+ __isset.STRING = true;
+}
+
+void LogicalType::__set_MAP(const MapType& val) {
+ this->MAP = val;
+ __isset.MAP = true;
+}
+
+void LogicalType::__set_LIST(const ListType& val) {
+ this->LIST = val;
+ __isset.LIST = true;
+}
+
+void LogicalType::__set_ENUM(const EnumType& val) {
+ this->ENUM = val;
+ __isset.ENUM = true;
+}
+
+void LogicalType::__set_DECIMAL(const DecimalType& val) {
+ this->DECIMAL = val;
+ __isset.DECIMAL = true;
+}
+
+void LogicalType::__set_DATE(const DateType& val) {
+ this->DATE = val;
+ __isset.DATE = true;
+}
+
+void LogicalType::__set_TIME(const TimeType& val) {
+ this->TIME = val;
+ __isset.TIME = true;
+}
+
+void LogicalType::__set_TIMESTAMP(const TimestampType& val) {
+ this->TIMESTAMP = val;
+ __isset.TIMESTAMP = true;
+}
+
+void LogicalType::__set_INTEGER(const IntType& val) {
+ this->INTEGER = val;
+ __isset.INTEGER = true;
+}
+
+void LogicalType::__set_UNKNOWN(const NullType& val) {
+ this->UNKNOWN = val;
+ __isset.UNKNOWN = true;
+}
+
+void LogicalType::__set_JSON(const JsonType& val) {
+ this->JSON = val;
+ __isset.JSON = true;
+}
+
+void LogicalType::__set_BSON(const BsonType& val) {
+ this->BSON = val;
+ __isset.BSON = true;
+}
+
+void LogicalType::__set_UUID(const UUIDType& val) {
+ this->UUID = val;
+ __isset.UUID = true;
+}
+std::ostream& operator<<(std::ostream& out, const LogicalType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t LogicalType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->STRING.read(iprot);
+ this->__isset.STRING = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->MAP.read(iprot);
+ this->__isset.MAP = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->LIST.read(iprot);
+ this->__isset.LIST = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->ENUM.read(iprot);
+ this->__isset.ENUM = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->DECIMAL.read(iprot);
+ this->__isset.DECIMAL = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->DATE.read(iprot);
+ this->__isset.DATE = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->TIME.read(iprot);
+ this->__isset.TIME = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->TIMESTAMP.read(iprot);
+ this->__isset.TIMESTAMP = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 10:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->INTEGER.read(iprot);
+ this->__isset.INTEGER = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 11:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->UNKNOWN.read(iprot);
+ this->__isset.UNKNOWN = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 12:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->JSON.read(iprot);
+ this->__isset.JSON = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 13:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->BSON.read(iprot);
+ this->__isset.BSON = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 14:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->UUID.read(iprot);
+ this->__isset.UUID = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t LogicalType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("LogicalType");
+
+ if (this->__isset.STRING) {
+ xfer += oprot->writeFieldBegin("STRING", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->STRING.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.MAP) {
+ xfer += oprot->writeFieldBegin("MAP", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->MAP.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.LIST) {
+ xfer += oprot->writeFieldBegin("LIST", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->LIST.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.ENUM) {
+ xfer += oprot->writeFieldBegin("ENUM", ::apache::thrift::protocol::T_STRUCT, 4);
+ xfer += this->ENUM.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.DECIMAL) {
+ xfer += oprot->writeFieldBegin("DECIMAL", ::apache::thrift::protocol::T_STRUCT, 5);
+ xfer += this->DECIMAL.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.DATE) {
+ xfer += oprot->writeFieldBegin("DATE", ::apache::thrift::protocol::T_STRUCT, 6);
+ xfer += this->DATE.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.TIME) {
+ xfer += oprot->writeFieldBegin("TIME", ::apache::thrift::protocol::T_STRUCT, 7);
+ xfer += this->TIME.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.TIMESTAMP) {
+ xfer += oprot->writeFieldBegin("TIMESTAMP", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->TIMESTAMP.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.INTEGER) {
+ xfer += oprot->writeFieldBegin("INTEGER", ::apache::thrift::protocol::T_STRUCT, 10);
+ xfer += this->INTEGER.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.UNKNOWN) {
+ xfer += oprot->writeFieldBegin("UNKNOWN", ::apache::thrift::protocol::T_STRUCT, 11);
+ xfer += this->UNKNOWN.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.JSON) {
+ xfer += oprot->writeFieldBegin("JSON", ::apache::thrift::protocol::T_STRUCT, 12);
+ xfer += this->JSON.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.BSON) {
+ xfer += oprot->writeFieldBegin("BSON", ::apache::thrift::protocol::T_STRUCT, 13);
+ xfer += this->BSON.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.UUID) {
+ xfer += oprot->writeFieldBegin("UUID", ::apache::thrift::protocol::T_STRUCT, 14);
+ xfer += this->UUID.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(LogicalType &a, LogicalType &b) {
+ using ::std::swap;
+ swap(a.STRING, b.STRING);
+ swap(a.MAP, b.MAP);
+ swap(a.LIST, b.LIST);
+ swap(a.ENUM, b.ENUM);
+ swap(a.DECIMAL, b.DECIMAL);
+ swap(a.DATE, b.DATE);
+ swap(a.TIME, b.TIME);
+ swap(a.TIMESTAMP, b.TIMESTAMP);
+ swap(a.INTEGER, b.INTEGER);
+ swap(a.UNKNOWN, b.UNKNOWN);
+ swap(a.JSON, b.JSON);
+ swap(a.BSON, b.BSON);
+ swap(a.UUID, b.UUID);
+ swap(a.__isset, b.__isset);
+}
+
+LogicalType::LogicalType(const LogicalType& other36) {
+ STRING = other36.STRING;
+ MAP = other36.MAP;
+ LIST = other36.LIST;
+ ENUM = other36.ENUM;
+ DECIMAL = other36.DECIMAL;
+ DATE = other36.DATE;
+ TIME = other36.TIME;
+ TIMESTAMP = other36.TIMESTAMP;
+ INTEGER = other36.INTEGER;
+ UNKNOWN = other36.UNKNOWN;
+ JSON = other36.JSON;
+ BSON = other36.BSON;
+ UUID = other36.UUID;
+ __isset = other36.__isset;
+}
+LogicalType& LogicalType::operator=(const LogicalType& other37) {
+ STRING = other37.STRING;
+ MAP = other37.MAP;
+ LIST = other37.LIST;
+ ENUM = other37.ENUM;
+ DECIMAL = other37.DECIMAL;
+ DATE = other37.DATE;
+ TIME = other37.TIME;
+ TIMESTAMP = other37.TIMESTAMP;
+ INTEGER = other37.INTEGER;
+ UNKNOWN = other37.UNKNOWN;
+ JSON = other37.JSON;
+ BSON = other37.BSON;
+ UUID = other37.UUID;
+ __isset = other37.__isset;
+ return *this;
+}
+void LogicalType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "LogicalType(";
+ out << "STRING="; (__isset.STRING ? (out << to_string(STRING)) : (out << "<null>"));
+ out << ", " << "MAP="; (__isset.MAP ? (out << to_string(MAP)) : (out << "<null>"));
+ out << ", " << "LIST="; (__isset.LIST ? (out << to_string(LIST)) : (out << "<null>"));
+ out << ", " << "ENUM="; (__isset.ENUM ? (out << to_string(ENUM)) : (out << "<null>"));
+ out << ", " << "DECIMAL="; (__isset.DECIMAL ? (out << to_string(DECIMAL)) : (out << "<null>"));
+ out << ", " << "DATE="; (__isset.DATE ? (out << to_string(DATE)) : (out << "<null>"));
+ out << ", " << "TIME="; (__isset.TIME ? (out << to_string(TIME)) : (out << "<null>"));
+ out << ", " << "TIMESTAMP="; (__isset.TIMESTAMP ? (out << to_string(TIMESTAMP)) : (out << "<null>"));
+ out << ", " << "INTEGER="; (__isset.INTEGER ? (out << to_string(INTEGER)) : (out << "<null>"));
+ out << ", " << "UNKNOWN="; (__isset.UNKNOWN ? (out << to_string(UNKNOWN)) : (out << "<null>"));
+ out << ", " << "JSON="; (__isset.JSON ? (out << to_string(JSON)) : (out << "<null>"));
+ out << ", " << "BSON="; (__isset.BSON ? (out << to_string(BSON)) : (out << "<null>"));
+ out << ", " << "UUID="; (__isset.UUID ? (out << to_string(UUID)) : (out << "<null>"));
+ out << ")";
+}
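+
+// LogicalType is likewise a union: exactly one member should be set. Note
+// the gap at field id 9, which parquet.thrift keeps reserved, so both the
+// read switch and the write sequence jump from TIMESTAMP (8) to INTEGER (10).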
+
+
+SchemaElement::~SchemaElement() noexcept {
+}
+
+
+void SchemaElement::__set_type(const Type::type val) {
+ this->type = val;
+ __isset.type = true;
+}
+
+void SchemaElement::__set_type_length(const int32_t val) {
+ this->type_length = val;
+ __isset.type_length = true;
+}
+
+void SchemaElement::__set_repetition_type(const FieldRepetitionType::type val) {
+ this->repetition_type = val;
+ __isset.repetition_type = true;
+}
+
+void SchemaElement::__set_name(const std::string& val) {
+ this->name = val;
+}
+
+void SchemaElement::__set_num_children(const int32_t val) {
+ this->num_children = val;
+ __isset.num_children = true;
+}
+
+void SchemaElement::__set_converted_type(const ConvertedType::type val) {
+ this->converted_type = val;
+ __isset.converted_type = true;
+}
+
+void SchemaElement::__set_scale(const int32_t val) {
+ this->scale = val;
+ __isset.scale = true;
+}
+
+void SchemaElement::__set_precision(const int32_t val) {
+ this->precision = val;
+ __isset.precision = true;
+}
+
+void SchemaElement::__set_field_id(const int32_t val) {
+ this->field_id = val;
+ __isset.field_id = true;
+}
+
+void SchemaElement::__set_logicalType(const LogicalType& val) {
+ this->logicalType = val;
+ __isset.logicalType = true;
+}
+std::ostream& operator<<(std::ostream& out, const SchemaElement& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_name = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast38;
+ xfer += iprot->readI32(ecast38);
+ this->type = (Type::type)ecast38;
+ this->__isset.type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->type_length);
+ this->__isset.type_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast39;
+ xfer += iprot->readI32(ecast39);
+ this->repetition_type = (FieldRepetitionType::type)ecast39;
+ this->__isset.repetition_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->name);
+ isset_name = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_children);
+ this->__isset.num_children = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast40;
+ xfer += iprot->readI32(ecast40);
+ this->converted_type = (ConvertedType::type)ecast40;
+ this->__isset.converted_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->scale);
+ this->__isset.scale = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->precision);
+ this->__isset.precision = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->field_id);
+ this->__isset.field_id = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 10:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->logicalType.read(iprot);
+ this->__isset.logicalType = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_name)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t SchemaElement::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("SchemaElement");
+
+ if (this->__isset.type) {
+ xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->type);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.type_length) {
+ xfer += oprot->writeFieldBegin("type_length", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->type_length);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.repetition_type) {
+ xfer += oprot->writeFieldBegin("repetition_type", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32((int32_t)this->repetition_type);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldBegin("name", ::apache::thrift::protocol::T_STRING, 4);
+ xfer += oprot->writeString(this->name);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.num_children) {
+ xfer += oprot->writeFieldBegin("num_children", ::apache::thrift::protocol::T_I32, 5);
+ xfer += oprot->writeI32(this->num_children);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.converted_type) {
+ xfer += oprot->writeFieldBegin("converted_type", ::apache::thrift::protocol::T_I32, 6);
+ xfer += oprot->writeI32((int32_t)this->converted_type);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.scale) {
+ xfer += oprot->writeFieldBegin("scale", ::apache::thrift::protocol::T_I32, 7);
+ xfer += oprot->writeI32(this->scale);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.precision) {
+ xfer += oprot->writeFieldBegin("precision", ::apache::thrift::protocol::T_I32, 8);
+ xfer += oprot->writeI32(this->precision);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.field_id) {
+ xfer += oprot->writeFieldBegin("field_id", ::apache::thrift::protocol::T_I32, 9);
+ xfer += oprot->writeI32(this->field_id);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.logicalType) {
+ xfer += oprot->writeFieldBegin("logicalType", ::apache::thrift::protocol::T_STRUCT, 10);
+ xfer += this->logicalType.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(SchemaElement &a, SchemaElement &b) {
+ using ::std::swap;
+ swap(a.type, b.type);
+ swap(a.type_length, b.type_length);
+ swap(a.repetition_type, b.repetition_type);
+ swap(a.name, b.name);
+ swap(a.num_children, b.num_children);
+ swap(a.converted_type, b.converted_type);
+ swap(a.scale, b.scale);
+ swap(a.precision, b.precision);
+ swap(a.field_id, b.field_id);
+ swap(a.logicalType, b.logicalType);
+ swap(a.__isset, b.__isset);
+}
+
+SchemaElement::SchemaElement(const SchemaElement& other41) {
+ type = other41.type;
+ type_length = other41.type_length;
+ repetition_type = other41.repetition_type;
+ name = other41.name;
+ num_children = other41.num_children;
+ converted_type = other41.converted_type;
+ scale = other41.scale;
+ precision = other41.precision;
+ field_id = other41.field_id;
+ logicalType = other41.logicalType;
+ __isset = other41.__isset;
+}
+SchemaElement& SchemaElement::operator=(const SchemaElement& other42) {
+ type = other42.type;
+ type_length = other42.type_length;
+ repetition_type = other42.repetition_type;
+ name = other42.name;
+ num_children = other42.num_children;
+ converted_type = other42.converted_type;
+ scale = other42.scale;
+ precision = other42.precision;
+ field_id = other42.field_id;
+ logicalType = other42.logicalType;
+ __isset = other42.__isset;
+ return *this;
+}
+void SchemaElement::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "SchemaElement(";
+ out << "type="; (__isset.type ? (out << to_string(type)) : (out << "<null>"));
+ out << ", " << "type_length="; (__isset.type_length ? (out << to_string(type_length)) : (out << "<null>"));
+ out << ", " << "repetition_type="; (__isset.repetition_type ? (out << to_string(repetition_type)) : (out << "<null>"));
+ out << ", " << "name=" << to_string(name);
+ out << ", " << "num_children="; (__isset.num_children ? (out << to_string(num_children)) : (out << "<null>"));
+ out << ", " << "converted_type="; (__isset.converted_type ? (out << to_string(converted_type)) : (out << "<null>"));
+ out << ", " << "scale="; (__isset.scale ? (out << to_string(scale)) : (out << "<null>"));
+ out << ", " << "precision="; (__isset.precision ? (out << to_string(precision)) : (out << "<null>"));
+ out << ", " << "field_id="; (__isset.field_id ? (out << to_string(field_id)) : (out << "<null>"));
+ out << ", " << "logicalType="; (__isset.logicalType ? (out << to_string(logicalType)) : (out << "<null>"));
+ out << ")";
+}
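+
+// A minimal construction sketch (Type and FieldRepetitionType are the
+// Parquet enums referenced above): a required INT32 leaf column. Only
+// `name` is mandatory; every other field is guarded by an __isset flag.
+//
+//   SchemaElement col;
+//   col.__set_name("id");
+//   col.__set_type(Type::INT32);
+//   col.__set_repetition_type(FieldRepetitionType::REQUIRED);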
+
+
+DataPageHeader::~DataPageHeader() noexcept {
+}
+
+
+void DataPageHeader::__set_num_values(const int32_t val) {
+ this->num_values = val;
+}
+
+void DataPageHeader::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void DataPageHeader::__set_definition_level_encoding(const Encoding::type val) {
+ this->definition_level_encoding = val;
+}
+
+void DataPageHeader::__set_repetition_level_encoding(const Encoding::type val) {
+ this->repetition_level_encoding = val;
+}
+
+void DataPageHeader::__set_statistics(const Statistics& val) {
+ this->statistics = val;
+ __isset.statistics = true;
+}
+std::ostream& operator<<(std::ostream& out, const DataPageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_num_values = false;
+ bool isset_encoding = false;
+ bool isset_definition_level_encoding = false;
+ bool isset_repetition_level_encoding = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast43;
+ xfer += iprot->readI32(ecast43);
+ this->encoding = (Encoding::type)ecast43;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast44;
+ xfer += iprot->readI32(ecast44);
+ this->definition_level_encoding = (Encoding::type)ecast44;
+ isset_definition_level_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast45;
+ xfer += iprot->readI32(ecast45);
+ this->repetition_level_encoding = (Encoding::type)ecast45;
+ isset_repetition_level_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->statistics.read(iprot);
+ this->__isset.statistics = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_definition_level_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_repetition_level_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DataPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DataPageHeader");
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("definition_level_encoding", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32((int32_t)this->definition_level_encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("repetition_level_encoding", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->repetition_level_encoding);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.statistics) {
+ xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 5);
+ xfer += this->statistics.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DataPageHeader &a, DataPageHeader &b) {
+ using ::std::swap;
+ swap(a.num_values, b.num_values);
+ swap(a.encoding, b.encoding);
+ swap(a.definition_level_encoding, b.definition_level_encoding);
+ swap(a.repetition_level_encoding, b.repetition_level_encoding);
+ swap(a.statistics, b.statistics);
+ swap(a.__isset, b.__isset);
+}
+
+DataPageHeader::DataPageHeader(const DataPageHeader& other46) {
+ num_values = other46.num_values;
+ encoding = other46.encoding;
+ definition_level_encoding = other46.definition_level_encoding;
+ repetition_level_encoding = other46.repetition_level_encoding;
+ statistics = other46.statistics;
+ __isset = other46.__isset;
+}
+DataPageHeader& DataPageHeader::operator=(const DataPageHeader& other47) {
+ num_values = other47.num_values;
+ encoding = other47.encoding;
+ definition_level_encoding = other47.definition_level_encoding;
+ repetition_level_encoding = other47.repetition_level_encoding;
+ statistics = other47.statistics;
+ __isset = other47.__isset;
+ return *this;
+}
+void DataPageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DataPageHeader(";
+ out << "num_values=" << to_string(num_values);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "definition_level_encoding=" << to_string(definition_level_encoding);
+ out << ", " << "repetition_level_encoding=" << to_string(repetition_level_encoding);
+ out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
+ out << ")";
+}
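+
+// num_values, encoding, and both level encodings are required: read()
+// tracks each with a local isset_* flag and throws INVALID_DATA if any is
+// missing after T_STOP; only statistics is optional.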
+
+
+IndexPageHeader::~IndexPageHeader() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const IndexPageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t IndexPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t IndexPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("IndexPageHeader");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(IndexPageHeader &a, IndexPageHeader &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+IndexPageHeader::IndexPageHeader(const IndexPageHeader& other48) {
+ (void) other48;
+}
+IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other49) {
+ (void) other49;
+ return *this;
+}
+void IndexPageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "IndexPageHeader(";
+ out << ")";
+}
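+
+// IndexPageHeader currently carries no fields; it is read and written as an
+// empty envelope, like the logical-type markers earlier in the file.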
+
+
+DictionaryPageHeader::~DictionaryPageHeader() noexcept {
+}
+
+
+void DictionaryPageHeader::__set_num_values(const int32_t val) {
+ this->num_values = val;
+}
+
+void DictionaryPageHeader::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void DictionaryPageHeader::__set_is_sorted(const bool val) {
+ this->is_sorted = val;
+ __isset.is_sorted = true;
+}
+std::ostream& operator<<(std::ostream& out, const DictionaryPageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DictionaryPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_num_values = false;
+ bool isset_encoding = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast50;
+ xfer += iprot->readI32(ecast50);
+ this->encoding = (Encoding::type)ecast50;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->is_sorted);
+ this->__isset.is_sorted = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DictionaryPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DictionaryPageHeader");
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.is_sorted) {
+ xfer += oprot->writeFieldBegin("is_sorted", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->is_sorted);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DictionaryPageHeader &a, DictionaryPageHeader &b) {
+ using ::std::swap;
+ swap(a.num_values, b.num_values);
+ swap(a.encoding, b.encoding);
+ swap(a.is_sorted, b.is_sorted);
+ swap(a.__isset, b.__isset);
+}
+
+DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other51) {
+ num_values = other51.num_values;
+ encoding = other51.encoding;
+ is_sorted = other51.is_sorted;
+ __isset = other51.__isset;
+}
+DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other52) {
+ num_values = other52.num_values;
+ encoding = other52.encoding;
+ is_sorted = other52.is_sorted;
+ __isset = other52.__isset;
+ return *this;
+}
+void DictionaryPageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DictionaryPageHeader(";
+ out << "num_values=" << to_string(num_values);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "is_sorted="; (__isset.is_sorted ? (out << to_string(is_sorted)) : (out << "<null>"));
+ out << ")";
+}
+
+
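+// DataPageHeaderV2: header of the v2 data page format. Repetition and
+// definition levels are stored uncompressed at the front of the page, and
+// their byte lengths are recorded here so a reader can locate the values
+// section without decompressing it; `is_compressed` says whether the values
+// section is compressed with the column chunk's codec.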
+DataPageHeaderV2::~DataPageHeaderV2() noexcept {
+}
+
+
+void DataPageHeaderV2::__set_num_values(const int32_t val) {
+ this->num_values = val;
+}
+
+void DataPageHeaderV2::__set_num_nulls(const int32_t val) {
+ this->num_nulls = val;
+}
+
+void DataPageHeaderV2::__set_num_rows(const int32_t val) {
+ this->num_rows = val;
+}
+
+void DataPageHeaderV2::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void DataPageHeaderV2::__set_definition_levels_byte_length(const int32_t val) {
+ this->definition_levels_byte_length = val;
+}
+
+void DataPageHeaderV2::__set_repetition_levels_byte_length(const int32_t val) {
+ this->repetition_levels_byte_length = val;
+}
+
+void DataPageHeaderV2::__set_is_compressed(const bool val) {
+ this->is_compressed = val;
+ __isset.is_compressed = true;
+}
+
+void DataPageHeaderV2::__set_statistics(const Statistics& val) {
+ this->statistics = val;
+ __isset.statistics = true;
+}
+std::ostream& operator<<(std::ostream& out, const DataPageHeaderV2& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DataPageHeaderV2::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_num_values = false;
+ bool isset_num_nulls = false;
+ bool isset_num_rows = false;
+ bool isset_encoding = false;
+ bool isset_definition_levels_byte_length = false;
+ bool isset_repetition_levels_byte_length = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_nulls);
+ isset_num_nulls = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_rows);
+ isset_num_rows = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast53;
+ xfer += iprot->readI32(ecast53);
+ this->encoding = (Encoding::type)ecast53;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->definition_levels_byte_length);
+ isset_definition_levels_byte_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->repetition_levels_byte_length);
+ isset_repetition_levels_byte_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->is_compressed);
+ this->__isset.is_compressed = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->statistics.read(iprot);
+ this->__isset.statistics = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_nulls)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_rows)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_definition_levels_byte_length)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_repetition_levels_byte_length)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DataPageHeaderV2::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DataPageHeaderV2");
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_nulls", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->num_nulls);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32(this->num_rows);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("definition_levels_byte_length", ::apache::thrift::protocol::T_I32, 5);
+ xfer += oprot->writeI32(this->definition_levels_byte_length);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("repetition_levels_byte_length", ::apache::thrift::protocol::T_I32, 6);
+ xfer += oprot->writeI32(this->repetition_levels_byte_length);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.is_compressed) {
+ xfer += oprot->writeFieldBegin("is_compressed", ::apache::thrift::protocol::T_BOOL, 7);
+ xfer += oprot->writeBool(this->is_compressed);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.statistics) {
+ xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->statistics.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b) {
+ using ::std::swap;
+ swap(a.num_values, b.num_values);
+ swap(a.num_nulls, b.num_nulls);
+ swap(a.num_rows, b.num_rows);
+ swap(a.encoding, b.encoding);
+ swap(a.definition_levels_byte_length, b.definition_levels_byte_length);
+ swap(a.repetition_levels_byte_length, b.repetition_levels_byte_length);
+ swap(a.is_compressed, b.is_compressed);
+ swap(a.statistics, b.statistics);
+ swap(a.__isset, b.__isset);
+}
+
+DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other54) {
+ num_values = other54.num_values;
+ num_nulls = other54.num_nulls;
+ num_rows = other54.num_rows;
+ encoding = other54.encoding;
+ definition_levels_byte_length = other54.definition_levels_byte_length;
+ repetition_levels_byte_length = other54.repetition_levels_byte_length;
+ is_compressed = other54.is_compressed;
+ statistics = other54.statistics;
+ __isset = other54.__isset;
+}
+DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other55) {
+ num_values = other55.num_values;
+ num_nulls = other55.num_nulls;
+ num_rows = other55.num_rows;
+ encoding = other55.encoding;
+ definition_levels_byte_length = other55.definition_levels_byte_length;
+ repetition_levels_byte_length = other55.repetition_levels_byte_length;
+ is_compressed = other55.is_compressed;
+ statistics = other55.statistics;
+ __isset = other55.__isset;
+ return *this;
+}
+void DataPageHeaderV2::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DataPageHeaderV2(";
+ out << "num_values=" << to_string(num_values);
+ out << ", " << "num_nulls=" << to_string(num_nulls);
+ out << ", " << "num_rows=" << to_string(num_rows);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "definition_levels_byte_length=" << to_string(definition_levels_byte_length);
+ out << ", " << "repetition_levels_byte_length=" << to_string(repetition_levels_byte_length);
+ out << ", " << "is_compressed="; (__isset.is_compressed ? (out << to_string(is_compressed)) : (out << "<null>"));
+ out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
+ out << ")";
+}
+
+
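+// SplitBlockAlgorithm: empty marker struct selecting the split block Bloom
+// filter (SBBF) algorithm.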
+SplitBlockAlgorithm::~SplitBlockAlgorithm() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const SplitBlockAlgorithm& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t SplitBlockAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t SplitBlockAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("SplitBlockAlgorithm");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other56) {
+ (void) other56;
+}
+SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other57) {
+ (void) other57;
+ return *this;
+}
+void SplitBlockAlgorithm::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "SplitBlockAlgorithm(";
+ out << ")";
+}
+
+
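+// BloomFilterAlgorithm: Thrift union of the supported Bloom filter
+// algorithms; BLOCK (split block) is the only member defined here.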
+BloomFilterAlgorithm::~BloomFilterAlgorithm() noexcept {
+}
+
+
+void BloomFilterAlgorithm::__set_BLOCK(const SplitBlockAlgorithm& val) {
+ this->BLOCK = val;
+ __isset.BLOCK = true;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterAlgorithm& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->BLOCK.read(iprot);
+ this->__isset.BLOCK = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BloomFilterAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterAlgorithm");
+
+ if (this->__isset.BLOCK) {
+ xfer += oprot->writeFieldBegin("BLOCK", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->BLOCK.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b) {
+ using ::std::swap;
+ swap(a.BLOCK, b.BLOCK);
+ swap(a.__isset, b.__isset);
+}
+
+BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other58) {
+ BLOCK = other58.BLOCK;
+ __isset = other58.__isset;
+}
+BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other59) {
+ BLOCK = other59.BLOCK;
+ __isset = other59.__isset;
+ return *this;
+}
+void BloomFilterAlgorithm::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterAlgorithm(";
+ out << "BLOCK="; (__isset.BLOCK ? (out << to_string(BLOCK)) : (out << "<null>"));
+ out << ")";
+}
+
+
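+// XxHash: empty marker struct selecting the xxHash function (64-bit variant,
+// per the Parquet Bloom filter spec) for Bloom filter hashing.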
+XxHash::~XxHash() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const XxHash& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t XxHash::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t XxHash::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("XxHash");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(XxHash &a, XxHash &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+XxHash::XxHash(const XxHash& other60) {
+ (void) other60;
+}
+XxHash& XxHash::operator=(const XxHash& other61) {
+ (void) other61;
+ return *this;
+}
+void XxHash::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "XxHash(";
+ out << ")";
+}
+
+
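+// BloomFilterHash: Thrift union of the supported Bloom filter hash
+// functions; XXHASH is the only member defined here.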
+BloomFilterHash::~BloomFilterHash() noexcept {
+}
+
+
+void BloomFilterHash::__set_XXHASH(const XxHash& val) {
+ this->XXHASH = val;
+ __isset.XXHASH = true;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterHash& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterHash::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->XXHASH.read(iprot);
+ this->__isset.XXHASH = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BloomFilterHash::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterHash");
+
+ if (this->__isset.XXHASH) {
+ xfer += oprot->writeFieldBegin("XXHASH", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->XXHASH.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterHash &a, BloomFilterHash &b) {
+ using ::std::swap;
+ swap(a.XXHASH, b.XXHASH);
+ swap(a.__isset, b.__isset);
+}
+
+BloomFilterHash::BloomFilterHash(const BloomFilterHash& other62) {
+ XXHASH = other62.XXHASH;
+ __isset = other62.__isset;
+}
+BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other63) {
+ XXHASH = other63.XXHASH;
+ __isset = other63.__isset;
+ return *this;
+}
+void BloomFilterHash::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterHash(";
+ out << "XXHASH="; (__isset.XXHASH ? (out << to_string(XXHASH)) : (out << "<null>"));
+ out << ")";
+}
+
+
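+// Uncompressed: empty marker struct indicating that no compression is
+// applied.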
+Uncompressed::~Uncompressed() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const Uncompressed& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t Uncompressed::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t Uncompressed::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("Uncompressed");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(Uncompressed &a, Uncompressed &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+Uncompressed::Uncompressed(const Uncompressed& other64) {
+ (void) other64;
+}
+Uncompressed& Uncompressed::operator=(const Uncompressed& other65) {
+ (void) other65;
+ return *this;
+}
+void Uncompressed::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "Uncompressed(";
+ out << ")";
+}
+
+
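+// BloomFilterCompression: Thrift union of the supported Bloom filter
+// compressions; UNCOMPRESSED is the only member defined here.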
+BloomFilterCompression::~BloomFilterCompression() noexcept {
+}
+
+
+void BloomFilterCompression::__set_UNCOMPRESSED(const Uncompressed& val) {
+ this->UNCOMPRESSED = val;
+ __isset.UNCOMPRESSED = true;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterCompression& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterCompression::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->UNCOMPRESSED.read(iprot);
+ this->__isset.UNCOMPRESSED = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BloomFilterCompression::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterCompression");
+
+ if (this->__isset.UNCOMPRESSED) {
+ xfer += oprot->writeFieldBegin("UNCOMPRESSED", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->UNCOMPRESSED.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterCompression &a, BloomFilterCompression &b) {
+ using ::std::swap;
+ swap(a.UNCOMPRESSED, b.UNCOMPRESSED);
+ swap(a.__isset, b.__isset);
+}
+
+BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other66) {
+ UNCOMPRESSED = other66.UNCOMPRESSED;
+ __isset = other66.__isset;
+}
+BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other67) {
+ UNCOMPRESSED = other67.UNCOMPRESSED;
+ __isset = other67.__isset;
+ return *this;
+}
+void BloomFilterCompression::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterCompression(";
+ out << "UNCOMPRESSED="; (__isset.UNCOMPRESSED ? (out << to_string(UNCOMPRESSED)) : (out << "<null>"));
+ out << ")";
+}
+
+
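+// BloomFilterHeader: written immediately before the Bloom filter bitset.
+// `numBytes` is the length of the bitset in bytes; the algorithm, hash, and
+// compression fields describe how the bitset was built, so all four are
+// required on read.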
+BloomFilterHeader::~BloomFilterHeader() noexcept {
+}
+
+
+void BloomFilterHeader::__set_numBytes(const int32_t val) {
+ this->numBytes = val;
+}
+
+void BloomFilterHeader::__set_algorithm(const BloomFilterAlgorithm& val) {
+ this->algorithm = val;
+}
+
+void BloomFilterHeader::__set_hash(const BloomFilterHash& val) {
+ this->hash = val;
+}
+
+void BloomFilterHeader::__set_compression(const BloomFilterCompression& val) {
+ this->compression = val;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_numBytes = false;
+ bool isset_algorithm = false;
+ bool isset_hash = false;
+ bool isset_compression = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->numBytes);
+ isset_numBytes = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->algorithm.read(iprot);
+ isset_algorithm = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->hash.read(iprot);
+ isset_hash = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->compression.read(iprot);
+ isset_compression = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_numBytes)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_algorithm)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_hash)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_compression)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t BloomFilterHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterHeader");
+
+ xfer += oprot->writeFieldBegin("numBytes", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->numBytes);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("algorithm", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->algorithm.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("hash", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->hash.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("compression", ::apache::thrift::protocol::T_STRUCT, 4);
+ xfer += this->compression.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterHeader &a, BloomFilterHeader &b) {
+ using ::std::swap;
+ swap(a.numBytes, b.numBytes);
+ swap(a.algorithm, b.algorithm);
+ swap(a.hash, b.hash);
+ swap(a.compression, b.compression);
+}
+
+BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other68) {
+ numBytes = other68.numBytes;
+ algorithm = other68.algorithm;
+ hash = other68.hash;
+ compression = other68.compression;
+}
+BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other69) {
+ numBytes = other69.numBytes;
+ algorithm = other69.algorithm;
+ hash = other69.hash;
+ compression = other69.compression;
+ return *this;
+}
+void BloomFilterHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterHeader(";
+ out << "numBytes=" << to_string(numBytes);
+ out << ", " << "algorithm=" << to_string(algorithm);
+ out << ", " << "hash=" << to_string(hash);
+ out << ", " << "compression=" << to_string(compression);
+ out << ")";
+}
+
+
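+// PageHeader: precedes every page in a column chunk. `type` selects which of
+// the optional per-type headers (data page, index page, dictionary page,
+// data page v2) applies; the optional `crc` carries a CRC-32 checksum of the
+// page contents for integrity checking.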
+PageHeader::~PageHeader() noexcept {
+}
+
+
+void PageHeader::__set_type(const PageType::type val) {
+ this->type = val;
+}
+
+void PageHeader::__set_uncompressed_page_size(const int32_t val) {
+ this->uncompressed_page_size = val;
+}
+
+void PageHeader::__set_compressed_page_size(const int32_t val) {
+ this->compressed_page_size = val;
+}
+
+void PageHeader::__set_crc(const int32_t val) {
+ this->crc = val;
+ __isset.crc = true;
+}
+
+void PageHeader::__set_data_page_header(const DataPageHeader& val) {
+ this->data_page_header = val;
+ __isset.data_page_header = true;
+}
+
+void PageHeader::__set_index_page_header(const IndexPageHeader& val) {
+ this->index_page_header = val;
+ __isset.index_page_header = true;
+}
+
+void PageHeader::__set_dictionary_page_header(const DictionaryPageHeader& val) {
+ this->dictionary_page_header = val;
+ __isset.dictionary_page_header = true;
+}
+
+void PageHeader::__set_data_page_header_v2(const DataPageHeaderV2& val) {
+ this->data_page_header_v2 = val;
+ __isset.data_page_header_v2 = true;
+}
+std::ostream& operator<<(std::ostream& out, const PageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t PageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_type = false;
+ bool isset_uncompressed_page_size = false;
+ bool isset_compressed_page_size = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast70;
+ xfer += iprot->readI32(ecast70);
+ this->type = (PageType::type)ecast70;
+ isset_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->uncompressed_page_size);
+ isset_uncompressed_page_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->compressed_page_size);
+ isset_compressed_page_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->crc);
+ this->__isset.crc = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->data_page_header.read(iprot);
+ this->__isset.data_page_header = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->index_page_header.read(iprot);
+ this->__isset.index_page_header = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->dictionary_page_header.read(iprot);
+ this->__isset.dictionary_page_header = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->data_page_header_v2.read(iprot);
+ this->__isset.data_page_header_v2 = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_type)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_uncompressed_page_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_compressed_page_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t PageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("PageHeader");
+
+ xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->type);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("uncompressed_page_size", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->uncompressed_page_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("compressed_page_size", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32(this->compressed_page_size);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.crc) {
+ xfer += oprot->writeFieldBegin("crc", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32(this->crc);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.data_page_header) {
+ xfer += oprot->writeFieldBegin("data_page_header", ::apache::thrift::protocol::T_STRUCT, 5);
+ xfer += this->data_page_header.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.index_page_header) {
+ xfer += oprot->writeFieldBegin("index_page_header", ::apache::thrift::protocol::T_STRUCT, 6);
+ xfer += this->index_page_header.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.dictionary_page_header) {
+ xfer += oprot->writeFieldBegin("dictionary_page_header", ::apache::thrift::protocol::T_STRUCT, 7);
+ xfer += this->dictionary_page_header.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.data_page_header_v2) {
+ xfer += oprot->writeFieldBegin("data_page_header_v2", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->data_page_header_v2.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(PageHeader &a, PageHeader &b) {
+ using ::std::swap;
+ swap(a.type, b.type);
+ swap(a.uncompressed_page_size, b.uncompressed_page_size);
+ swap(a.compressed_page_size, b.compressed_page_size);
+ swap(a.crc, b.crc);
+ swap(a.data_page_header, b.data_page_header);
+ swap(a.index_page_header, b.index_page_header);
+ swap(a.dictionary_page_header, b.dictionary_page_header);
+ swap(a.data_page_header_v2, b.data_page_header_v2);
+ swap(a.__isset, b.__isset);
+}
+
+PageHeader::PageHeader(const PageHeader& other71) {
+ type = other71.type;
+ uncompressed_page_size = other71.uncompressed_page_size;
+ compressed_page_size = other71.compressed_page_size;
+ crc = other71.crc;
+ data_page_header = other71.data_page_header;
+ index_page_header = other71.index_page_header;
+ dictionary_page_header = other71.dictionary_page_header;
+ data_page_header_v2 = other71.data_page_header_v2;
+ __isset = other71.__isset;
+}
+PageHeader& PageHeader::operator=(const PageHeader& other72) {
+ type = other72.type;
+ uncompressed_page_size = other72.uncompressed_page_size;
+ compressed_page_size = other72.compressed_page_size;
+ crc = other72.crc;
+ data_page_header = other72.data_page_header;
+ index_page_header = other72.index_page_header;
+ dictionary_page_header = other72.dictionary_page_header;
+ data_page_header_v2 = other72.data_page_header_v2;
+ __isset = other72.__isset;
+ return *this;
+}
+void PageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "PageHeader(";
+ out << "type=" << to_string(type);
+ out << ", " << "uncompressed_page_size=" << to_string(uncompressed_page_size);
+ out << ", " << "compressed_page_size=" << to_string(compressed_page_size);
+ out << ", " << "crc="; (__isset.crc ? (out << to_string(crc)) : (out << "<null>"));
+ out << ", " << "data_page_header="; (__isset.data_page_header ? (out << to_string(data_page_header)) : (out << "<null>"));
+ out << ", " << "index_page_header="; (__isset.index_page_header ? (out << to_string(index_page_header)) : (out << "<null>"));
+ out << ", " << "dictionary_page_header="; (__isset.dictionary_page_header ? (out << to_string(dictionary_page_header)) : (out << "<null>"));
+ out << ", " << "data_page_header_v2="; (__isset.data_page_header_v2 ? (out << to_string(data_page_header_v2)) : (out << "<null>"));
+ out << ")";
+}
+
+
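+// KeyValue: an arbitrary key/value pair used for application-defined
+// metadata at the file and column-chunk level; only the key is required.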
+KeyValue::~KeyValue() noexcept {
+}
+
+
+void KeyValue::__set_key(const std::string& val) {
+ this->key = val;
+}
+
+void KeyValue::__set_value(const std::string& val) {
+ this->value = val;
+ __isset.value = true;
+}
+std::ostream& operator<<(std::ostream& out, const KeyValue& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t KeyValue::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_key = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->key);
+ isset_key = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->value);
+ this->__isset.value = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_key)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t KeyValue::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("KeyValue");
+
+ xfer += oprot->writeFieldBegin("key", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeString(this->key);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.value) {
+ xfer += oprot->writeFieldBegin("value", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeString(this->value);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(KeyValue &a, KeyValue &b) {
+ using ::std::swap;
+ swap(a.key, b.key);
+ swap(a.value, b.value);
+ swap(a.__isset, b.__isset);
+}
+
+KeyValue::KeyValue(const KeyValue& other73) {
+ key = other73.key;
+ value = other73.value;
+ __isset = other73.__isset;
+}
+KeyValue& KeyValue::operator=(const KeyValue& other74) {
+ key = other74.key;
+ value = other74.value;
+ __isset = other74.__isset;
+ return *this;
+}
+void KeyValue::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "KeyValue(";
+ out << "key=" << to_string(key);
+ out << ", " << "value="; (__isset.value ? (out << to_string(value)) : (out << "<null>"));
+ out << ")";
+}
+
+
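+// SortingColumn: describes how a row group is sorted by one column: the
+// column's index in the schema, the sort direction, and whether nulls sort
+// before non-null values.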
+SortingColumn::~SortingColumn() noexcept {
+}
+
+
+void SortingColumn::__set_column_idx(const int32_t val) {
+ this->column_idx = val;
+}
+
+void SortingColumn::__set_descending(const bool val) {
+ this->descending = val;
+}
+
+void SortingColumn::__set_nulls_first(const bool val) {
+ this->nulls_first = val;
+}
+std::ostream& operator<<(std::ostream& out, const SortingColumn& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t SortingColumn::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_column_idx = false;
+ bool isset_descending = false;
+ bool isset_nulls_first = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->column_idx);
+ isset_column_idx = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->descending);
+ isset_descending = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->nulls_first);
+ isset_nulls_first = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_column_idx)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_descending)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_nulls_first)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t SortingColumn::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("SortingColumn");
+
+ xfer += oprot->writeFieldBegin("column_idx", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->column_idx);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("descending", ::apache::thrift::protocol::T_BOOL, 2);
+ xfer += oprot->writeBool(this->descending);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("nulls_first", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->nulls_first);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(SortingColumn &a, SortingColumn &b) {
+ using ::std::swap;
+ swap(a.column_idx, b.column_idx);
+ swap(a.descending, b.descending);
+ swap(a.nulls_first, b.nulls_first);
+}
+
+SortingColumn::SortingColumn(const SortingColumn& other75) {
+ column_idx = other75.column_idx;
+ descending = other75.descending;
+ nulls_first = other75.nulls_first;
+}
+SortingColumn& SortingColumn::operator=(const SortingColumn& other76) {
+ column_idx = other76.column_idx;
+ descending = other76.descending;
+ nulls_first = other76.nulls_first;
+ return *this;
+}
+void SortingColumn::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "SortingColumn(";
+ out << "column_idx=" << to_string(column_idx);
+ out << ", " << "descending=" << to_string(descending);
+ out << ", " << "nulls_first=" << to_string(nulls_first);
+ out << ")";
+}
+
+
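+// PageEncodingStats: counts how many pages of a given page type use a given
+// encoding within a column chunk, letting a reader tell, for example,
+// whether every data page is dictionary encoded.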
+PageEncodingStats::~PageEncodingStats() noexcept {
+}
+
+
+void PageEncodingStats::__set_page_type(const PageType::type val) {
+ this->page_type = val;
+}
+
+void PageEncodingStats::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void PageEncodingStats::__set_count(const int32_t val) {
+ this->count = val;
+}
+std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_page_type = false;
+ bool isset_encoding = false;
+ bool isset_count = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast77;
+ xfer += iprot->readI32(ecast77);
+ this->page_type = (PageType::type)ecast77;
+ isset_page_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast78;
+ xfer += iprot->readI32(ecast78);
+ this->encoding = (Encoding::type)ecast78;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->count);
+ isset_count = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_page_type)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_count)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t PageEncodingStats::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("PageEncodingStats");
+
+ xfer += oprot->writeFieldBegin("page_type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->page_type);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("count", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32(this->count);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(PageEncodingStats &a, PageEncodingStats &b) {
+ using ::std::swap;
+ swap(a.page_type, b.page_type);
+ swap(a.encoding, b.encoding);
+ swap(a.count, b.count);
+}
+
+PageEncodingStats::PageEncodingStats(const PageEncodingStats& other79) {
+ page_type = other79.page_type;
+ encoding = other79.encoding;
+ count = other79.count;
+}
+PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other80) {
+ page_type = other80.page_type;
+ encoding = other80.encoding;
+ count = other80.count;
+ return *this;
+}
+void PageEncodingStats::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "PageEncodingStats(";
+ out << "page_type=" << to_string(page_type);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "count=" << to_string(count);
+ out << ")";
+}
+
+
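+// ColumnMetaData: per-column-chunk metadata: physical type, encodings used,
+// path in the schema, compression codec, value count, compressed and
+// uncompressed sizes, the offset of the first data page, and the optional
+// dictionary page, index page, statistics, encoding stats, and Bloom filter
+// offset.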
+ColumnMetaData::~ColumnMetaData() noexcept {
+}
+
+
+void ColumnMetaData::__set_type(const Type::type val) {
+ this->type = val;
+}
+
+void ColumnMetaData::__set_encodings(const std::vector<Encoding::type> & val) {
+ this->encodings = val;
+}
+
+void ColumnMetaData::__set_path_in_schema(const std::vector<std::string> & val) {
+ this->path_in_schema = val;
+}
+
+void ColumnMetaData::__set_codec(const CompressionCodec::type val) {
+ this->codec = val;
+}
+
+void ColumnMetaData::__set_num_values(const int64_t val) {
+ this->num_values = val;
+}
+
+void ColumnMetaData::__set_total_uncompressed_size(const int64_t val) {
+ this->total_uncompressed_size = val;
+}
+
+void ColumnMetaData::__set_total_compressed_size(const int64_t val) {
+ this->total_compressed_size = val;
+}
+
+void ColumnMetaData::__set_key_value_metadata(const std::vector<KeyValue> & val) {
+ this->key_value_metadata = val;
+ __isset.key_value_metadata = true;
+}
+
+void ColumnMetaData::__set_data_page_offset(const int64_t val) {
+ this->data_page_offset = val;
+}
+
+void ColumnMetaData::__set_index_page_offset(const int64_t val) {
+ this->index_page_offset = val;
+ __isset.index_page_offset = true;
+}
+
+void ColumnMetaData::__set_dictionary_page_offset(const int64_t val) {
+ this->dictionary_page_offset = val;
+ __isset.dictionary_page_offset = true;
+}
+
+void ColumnMetaData::__set_statistics(const Statistics& val) {
+ this->statistics = val;
+ __isset.statistics = true;
+}
+
+void ColumnMetaData::__set_encoding_stats(const std::vector<PageEncodingStats> & val) {
+ this->encoding_stats = val;
+ __isset.encoding_stats = true;
+}
+
+void ColumnMetaData::__set_bloom_filter_offset(const int64_t val) {
+ this->bloom_filter_offset = val;
+ __isset.bloom_filter_offset = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_type = false;
+ bool isset_encodings = false;
+ bool isset_path_in_schema = false;
+ bool isset_codec = false;
+ bool isset_num_values = false;
+ bool isset_total_uncompressed_size = false;
+ bool isset_total_compressed_size = false;
+ bool isset_data_page_offset = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast81;
+ xfer += iprot->readI32(ecast81);
+ this->type = (Type::type)ecast81;
+ isset_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->encodings.clear();
+ uint32_t _size82;
+ ::apache::thrift::protocol::TType _etype85;
+ xfer += iprot->readListBegin(_etype85, _size82);
+ this->encodings.resize(_size82);
+ uint32_t _i86;
+ for (_i86 = 0; _i86 < _size82; ++_i86)
+ {
+ int32_t ecast87;
+ xfer += iprot->readI32(ecast87);
+ this->encodings[_i86] = (Encoding::type)ecast87;
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_encodings = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->path_in_schema.clear();
+ uint32_t _size88;
+ ::apache::thrift::protocol::TType _etype91;
+ xfer += iprot->readListBegin(_etype91, _size88);
+ this->path_in_schema.resize(_size88);
+ uint32_t _i92;
+ for (_i92 = 0; _i92 < _size88; ++_i92)
+ {
+ xfer += iprot->readString(this->path_in_schema[_i92]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_path_in_schema = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast93;
+ xfer += iprot->readI32(ecast93);
+ this->codec = (CompressionCodec::type)ecast93;
+ isset_codec = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_uncompressed_size);
+ isset_total_uncompressed_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_compressed_size);
+ isset_total_compressed_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->key_value_metadata.clear();
+ uint32_t _size94;
+ ::apache::thrift::protocol::TType _etype97;
+ xfer += iprot->readListBegin(_etype97, _size94);
+ this->key_value_metadata.resize(_size94);
+ uint32_t _i98;
+ for (_i98 = 0; _i98 < _size94; ++_i98)
+ {
+ xfer += this->key_value_metadata[_i98].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.key_value_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->data_page_offset);
+ isset_data_page_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 10:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->index_page_offset);
+ this->__isset.index_page_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 11:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->dictionary_page_offset);
+ this->__isset.dictionary_page_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 12:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->statistics.read(iprot);
+ this->__isset.statistics = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 13:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->encoding_stats.clear();
+ uint32_t _size99;
+ ::apache::thrift::protocol::TType _etype102;
+ xfer += iprot->readListBegin(_etype102, _size99);
+ this->encoding_stats.resize(_size99);
+ uint32_t _i103;
+ for (_i103 = 0; _i103 < _size99; ++_i103)
+ {
+ xfer += this->encoding_stats[_i103].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.encoding_stats = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 14:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->bloom_filter_offset);
+ this->__isset.bloom_filter_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_type)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encodings)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_path_in_schema)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_codec)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_total_uncompressed_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_total_compressed_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_data_page_offset)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnMetaData");
+
+ xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->type);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encodings", ::apache::thrift::protocol::T_LIST, 2);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I32, static_cast<uint32_t>(this->encodings.size()));
+ std::vector<Encoding::type>::const_iterator _iter104;
+ for (_iter104 = this->encodings.begin(); _iter104 != this->encodings.end(); ++_iter104)
+ {
+ xfer += oprot->writeI32((int32_t)(*_iter104));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 3);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->path_in_schema.size()));
+ std::vector<std::string>::const_iterator _iter105;
+ for (_iter105 = this->path_in_schema.begin(); _iter105 != this->path_in_schema.end(); ++_iter105)
+ {
+ xfer += oprot->writeString((*_iter105));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("codec", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->codec);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I64, 5);
+ xfer += oprot->writeI64(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("total_uncompressed_size", ::apache::thrift::protocol::T_I64, 6);
+ xfer += oprot->writeI64(this->total_uncompressed_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("total_compressed_size", ::apache::thrift::protocol::T_I64, 7);
+ xfer += oprot->writeI64(this->total_compressed_size);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_value_metadata) {
+ xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 8);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
+ std::vector<KeyValue>::const_iterator _iter106;
+ for (_iter106 = this->key_value_metadata.begin(); _iter106 != this->key_value_metadata.end(); ++_iter106)
+ {
+ xfer += (*_iter106).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldBegin("data_page_offset", ::apache::thrift::protocol::T_I64, 9);
+ xfer += oprot->writeI64(this->data_page_offset);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.index_page_offset) {
+ xfer += oprot->writeFieldBegin("index_page_offset", ::apache::thrift::protocol::T_I64, 10);
+ xfer += oprot->writeI64(this->index_page_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.dictionary_page_offset) {
+ xfer += oprot->writeFieldBegin("dictionary_page_offset", ::apache::thrift::protocol::T_I64, 11);
+ xfer += oprot->writeI64(this->dictionary_page_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.statistics) {
+ xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 12);
+ xfer += this->statistics.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.encoding_stats) {
+ xfer += oprot->writeFieldBegin("encoding_stats", ::apache::thrift::protocol::T_LIST, 13);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->encoding_stats.size()));
+ std::vector<PageEncodingStats>::const_iterator _iter107;
+ for (_iter107 = this->encoding_stats.begin(); _iter107 != this->encoding_stats.end(); ++_iter107)
+ {
+ xfer += (*_iter107).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.bloom_filter_offset) {
+ xfer += oprot->writeFieldBegin("bloom_filter_offset", ::apache::thrift::protocol::T_I64, 14);
+ xfer += oprot->writeI64(this->bloom_filter_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnMetaData &a, ColumnMetaData &b) {
+ using ::std::swap;
+ swap(a.type, b.type);
+ swap(a.encodings, b.encodings);
+ swap(a.path_in_schema, b.path_in_schema);
+ swap(a.codec, b.codec);
+ swap(a.num_values, b.num_values);
+ swap(a.total_uncompressed_size, b.total_uncompressed_size);
+ swap(a.total_compressed_size, b.total_compressed_size);
+ swap(a.key_value_metadata, b.key_value_metadata);
+ swap(a.data_page_offset, b.data_page_offset);
+ swap(a.index_page_offset, b.index_page_offset);
+ swap(a.dictionary_page_offset, b.dictionary_page_offset);
+ swap(a.statistics, b.statistics);
+ swap(a.encoding_stats, b.encoding_stats);
+ swap(a.bloom_filter_offset, b.bloom_filter_offset);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnMetaData::ColumnMetaData(const ColumnMetaData& other108) {
+ type = other108.type;
+ encodings = other108.encodings;
+ path_in_schema = other108.path_in_schema;
+ codec = other108.codec;
+ num_values = other108.num_values;
+ total_uncompressed_size = other108.total_uncompressed_size;
+ total_compressed_size = other108.total_compressed_size;
+ key_value_metadata = other108.key_value_metadata;
+ data_page_offset = other108.data_page_offset;
+ index_page_offset = other108.index_page_offset;
+ dictionary_page_offset = other108.dictionary_page_offset;
+ statistics = other108.statistics;
+ encoding_stats = other108.encoding_stats;
+ bloom_filter_offset = other108.bloom_filter_offset;
+ __isset = other108.__isset;
+}
+ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other109) {
+ type = other109.type;
+ encodings = other109.encodings;
+ path_in_schema = other109.path_in_schema;
+ codec = other109.codec;
+ num_values = other109.num_values;
+ total_uncompressed_size = other109.total_uncompressed_size;
+ total_compressed_size = other109.total_compressed_size;
+ key_value_metadata = other109.key_value_metadata;
+ data_page_offset = other109.data_page_offset;
+ index_page_offset = other109.index_page_offset;
+ dictionary_page_offset = other109.dictionary_page_offset;
+ statistics = other109.statistics;
+ encoding_stats = other109.encoding_stats;
+ bloom_filter_offset = other109.bloom_filter_offset;
+ __isset = other109.__isset;
+ return *this;
+}
+void ColumnMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnMetaData(";
+ out << "type=" << to_string(type);
+ out << ", " << "encodings=" << to_string(encodings);
+ out << ", " << "path_in_schema=" << to_string(path_in_schema);
+ out << ", " << "codec=" << to_string(codec);
+ out << ", " << "num_values=" << to_string(num_values);
+ out << ", " << "total_uncompressed_size=" << to_string(total_uncompressed_size);
+ out << ", " << "total_compressed_size=" << to_string(total_compressed_size);
+ out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "<null>"));
+ out << ", " << "data_page_offset=" << to_string(data_page_offset);
+ out << ", " << "index_page_offset="; (__isset.index_page_offset ? (out << to_string(index_page_offset)) : (out << "<null>"));
+ out << ", " << "dictionary_page_offset="; (__isset.dictionary_page_offset ? (out << to_string(dictionary_page_offset)) : (out << "<null>"));
+ out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
+ out << ", " << "encoding_stats="; (__isset.encoding_stats ? (out << to_string(encoding_stats)) : (out << "<null>"));
+ out << ", " << "bloom_filter_offset="; (__isset.bloom_filter_offset ? (out << to_string(bloom_filter_offset)) : (out << "<null>"));
+ out << ")";
+}
+
+
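+// EncryptionWithFooterKey carries no fields: the column is encrypted with the
+// file's footer key, so read() merely skips fields until T_STOP and write()
+// emits an empty struct.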
+EncryptionWithFooterKey::~EncryptionWithFooterKey() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const EncryptionWithFooterKey& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t EncryptionWithFooterKey::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t EncryptionWithFooterKey::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EncryptionWithFooterKey");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other110) {
+ (void) other110;
+}
+EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other111) {
+ (void) other111;
+ return *this;
+}
+void EncryptionWithFooterKey::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EncryptionWithFooterKey(";
+ out << ")";
+}
+
+
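+// EncryptionWithColumnKey: the column is encrypted with its own key. The
+// column path is required; key_metadata is optional and serialized as binary.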
+EncryptionWithColumnKey::~EncryptionWithColumnKey() noexcept {
+}
+
+
+void EncryptionWithColumnKey::__set_path_in_schema(const std::vector<std::string> & val) {
+ this->path_in_schema = val;
+}
+
+void EncryptionWithColumnKey::__set_key_metadata(const std::string& val) {
+ this->key_metadata = val;
+  __isset.key_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const EncryptionWithColumnKey& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t EncryptionWithColumnKey::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_path_in_schema = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->path_in_schema.clear();
+ uint32_t _size112;
+ ::apache::thrift::protocol::TType _etype115;
+ xfer += iprot->readListBegin(_etype115, _size112);
+ this->path_in_schema.resize(_size112);
+ uint32_t _i116;
+ for (_i116 = 0; _i116 < _size112; ++_i116)
+ {
+ xfer += iprot->readString(this->path_in_schema[_i116]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_path_in_schema = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->key_metadata);
+ this->__isset.key_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_path_in_schema)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t EncryptionWithColumnKey::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EncryptionWithColumnKey");
+
+ xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->path_in_schema.size()));
+ std::vector<std::string> ::const_iterator _iter117;
+ for (_iter117 = this->path_in_schema.begin(); _iter117 != this->path_in_schema.end(); ++_iter117)
+ {
+ xfer += oprot->writeString((*_iter117));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_metadata) {
+ xfer += oprot->writeFieldBegin("key_metadata", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->key_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b) {
+ using ::std::swap;
+ swap(a.path_in_schema, b.path_in_schema);
+ swap(a.key_metadata, b.key_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other118) {
+ path_in_schema = other118.path_in_schema;
+ key_metadata = other118.key_metadata;
+ __isset = other118.__isset;
+}
+EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other119) {
+ path_in_schema = other119.path_in_schema;
+ key_metadata = other119.key_metadata;
+ __isset = other119.__isset;
+ return *this;
+}
+void EncryptionWithColumnKey::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EncryptionWithColumnKey(";
+ out << "path_in_schema=" << to_string(path_in_schema);
+ out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
+
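+// ColumnCryptoMetaData is a Thrift union, generated as a plain struct with
+// __isset flags; exactly one of the two alternatives is expected to be set.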
+ColumnCryptoMetaData::~ColumnCryptoMetaData() noexcept {
+}
+
+
+void ColumnCryptoMetaData::__set_ENCRYPTION_WITH_FOOTER_KEY(const EncryptionWithFooterKey& val) {
+ this->ENCRYPTION_WITH_FOOTER_KEY = val;
+  __isset.ENCRYPTION_WITH_FOOTER_KEY = true;
+}
+
+void ColumnCryptoMetaData::__set_ENCRYPTION_WITH_COLUMN_KEY(const EncryptionWithColumnKey& val) {
+ this->ENCRYPTION_WITH_COLUMN_KEY = val;
+  __isset.ENCRYPTION_WITH_COLUMN_KEY = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnCryptoMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnCryptoMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->ENCRYPTION_WITH_FOOTER_KEY.read(iprot);
+ this->__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->ENCRYPTION_WITH_COLUMN_KEY.read(iprot);
+ this->__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t ColumnCryptoMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnCryptoMetaData");
+
+ if (this->__isset.ENCRYPTION_WITH_FOOTER_KEY) {
+ xfer += oprot->writeFieldBegin("ENCRYPTION_WITH_FOOTER_KEY", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->ENCRYPTION_WITH_FOOTER_KEY.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.ENCRYPTION_WITH_COLUMN_KEY) {
+ xfer += oprot->writeFieldBegin("ENCRYPTION_WITH_COLUMN_KEY", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->ENCRYPTION_WITH_COLUMN_KEY.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b) {
+ using ::std::swap;
+ swap(a.ENCRYPTION_WITH_FOOTER_KEY, b.ENCRYPTION_WITH_FOOTER_KEY);
+ swap(a.ENCRYPTION_WITH_COLUMN_KEY, b.ENCRYPTION_WITH_COLUMN_KEY);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other120) {
+ ENCRYPTION_WITH_FOOTER_KEY = other120.ENCRYPTION_WITH_FOOTER_KEY;
+ ENCRYPTION_WITH_COLUMN_KEY = other120.ENCRYPTION_WITH_COLUMN_KEY;
+ __isset = other120.__isset;
+}
+ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other121) {
+ ENCRYPTION_WITH_FOOTER_KEY = other121.ENCRYPTION_WITH_FOOTER_KEY;
+ ENCRYPTION_WITH_COLUMN_KEY = other121.ENCRYPTION_WITH_COLUMN_KEY;
+ __isset = other121.__isset;
+ return *this;
+}
+void ColumnCryptoMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnCryptoMetaData(";
+ out << "ENCRYPTION_WITH_FOOTER_KEY="; (__isset.ENCRYPTION_WITH_FOOTER_KEY ? (out << to_string(ENCRYPTION_WITH_FOOTER_KEY)) : (out << "<null>"));
+ out << ", " << "ENCRYPTION_WITH_COLUMN_KEY="; (__isset.ENCRYPTION_WITH_COLUMN_KEY ? (out << to_string(ENCRYPTION_WITH_COLUMN_KEY)) : (out << "<null>"));
+ out << ")";
+}
+
+
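+// ColumnChunk locates one column's data within a row group: the containing
+// file (file_path/file_offset), its ColumnMetaData, optional column/offset
+// index locations, and optional encryption metadata. Only file_offset is
+// required; read() throws INVALID_DATA when it is absent.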
+ColumnChunk::~ColumnChunk() noexcept {
+}
+
+
+void ColumnChunk::__set_file_path(const std::string& val) {
+ this->file_path = val;
+  __isset.file_path = true;
+}
+
+void ColumnChunk::__set_file_offset(const int64_t val) {
+ this->file_offset = val;
+}
+
+void ColumnChunk::__set_meta_data(const ColumnMetaData& val) {
+ this->meta_data = val;
+  __isset.meta_data = true;
+}
+
+void ColumnChunk::__set_offset_index_offset(const int64_t val) {
+ this->offset_index_offset = val;
+  __isset.offset_index_offset = true;
+}
+
+void ColumnChunk::__set_offset_index_length(const int32_t val) {
+ this->offset_index_length = val;
+  __isset.offset_index_length = true;
+}
+
+void ColumnChunk::__set_column_index_offset(const int64_t val) {
+ this->column_index_offset = val;
+  __isset.column_index_offset = true;
+}
+
+void ColumnChunk::__set_column_index_length(const int32_t val) {
+ this->column_index_length = val;
+  __isset.column_index_length = true;
+}
+
+void ColumnChunk::__set_crypto_metadata(const ColumnCryptoMetaData& val) {
+ this->crypto_metadata = val;
+  __isset.crypto_metadata = true;
+}
+
+void ColumnChunk::__set_encrypted_column_metadata(const std::string& val) {
+ this->encrypted_column_metadata = val;
+  __isset.encrypted_column_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnChunk& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnChunk::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_file_offset = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->file_path);
+ this->__isset.file_path = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->file_offset);
+ isset_file_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->meta_data.read(iprot);
+ this->__isset.meta_data = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->offset_index_offset);
+ this->__isset.offset_index_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->offset_index_length);
+ this->__isset.offset_index_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->column_index_offset);
+ this->__isset.column_index_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->column_index_length);
+ this->__isset.column_index_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->crypto_metadata.read(iprot);
+ this->__isset.crypto_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->encrypted_column_metadata);
+ this->__isset.encrypted_column_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_file_offset)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t ColumnChunk::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnChunk");
+
+ if (this->__isset.file_path) {
+ xfer += oprot->writeFieldBegin("file_path", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeString(this->file_path);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldBegin("file_offset", ::apache::thrift::protocol::T_I64, 2);
+ xfer += oprot->writeI64(this->file_offset);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.meta_data) {
+ xfer += oprot->writeFieldBegin("meta_data", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->meta_data.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.offset_index_offset) {
+ xfer += oprot->writeFieldBegin("offset_index_offset", ::apache::thrift::protocol::T_I64, 4);
+ xfer += oprot->writeI64(this->offset_index_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.offset_index_length) {
+ xfer += oprot->writeFieldBegin("offset_index_length", ::apache::thrift::protocol::T_I32, 5);
+ xfer += oprot->writeI32(this->offset_index_length);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.column_index_offset) {
+ xfer += oprot->writeFieldBegin("column_index_offset", ::apache::thrift::protocol::T_I64, 6);
+ xfer += oprot->writeI64(this->column_index_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.column_index_length) {
+ xfer += oprot->writeFieldBegin("column_index_length", ::apache::thrift::protocol::T_I32, 7);
+ xfer += oprot->writeI32(this->column_index_length);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.crypto_metadata) {
+ xfer += oprot->writeFieldBegin("crypto_metadata", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->crypto_metadata.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.encrypted_column_metadata) {
+ xfer += oprot->writeFieldBegin("encrypted_column_metadata", ::apache::thrift::protocol::T_STRING, 9);
+ xfer += oprot->writeBinary(this->encrypted_column_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnChunk &a, ColumnChunk &b) {
+ using ::std::swap;
+ swap(a.file_path, b.file_path);
+ swap(a.file_offset, b.file_offset);
+ swap(a.meta_data, b.meta_data);
+ swap(a.offset_index_offset, b.offset_index_offset);
+ swap(a.offset_index_length, b.offset_index_length);
+ swap(a.column_index_offset, b.column_index_offset);
+ swap(a.column_index_length, b.column_index_length);
+ swap(a.crypto_metadata, b.crypto_metadata);
+ swap(a.encrypted_column_metadata, b.encrypted_column_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnChunk::ColumnChunk(const ColumnChunk& other122) {
+ file_path = other122.file_path;
+ file_offset = other122.file_offset;
+ meta_data = other122.meta_data;
+ offset_index_offset = other122.offset_index_offset;
+ offset_index_length = other122.offset_index_length;
+ column_index_offset = other122.column_index_offset;
+ column_index_length = other122.column_index_length;
+ crypto_metadata = other122.crypto_metadata;
+ encrypted_column_metadata = other122.encrypted_column_metadata;
+ __isset = other122.__isset;
+}
+ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other123) {
+ file_path = other123.file_path;
+ file_offset = other123.file_offset;
+ meta_data = other123.meta_data;
+ offset_index_offset = other123.offset_index_offset;
+ offset_index_length = other123.offset_index_length;
+ column_index_offset = other123.column_index_offset;
+ column_index_length = other123.column_index_length;
+ crypto_metadata = other123.crypto_metadata;
+ encrypted_column_metadata = other123.encrypted_column_metadata;
+ __isset = other123.__isset;
+ return *this;
+}
+void ColumnChunk::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnChunk(";
+ out << "file_path="; (__isset.file_path ? (out << to_string(file_path)) : (out << "<null>"));
+ out << ", " << "file_offset=" << to_string(file_offset);
+ out << ", " << "meta_data="; (__isset.meta_data ? (out << to_string(meta_data)) : (out << "<null>"));
+ out << ", " << "offset_index_offset="; (__isset.offset_index_offset ? (out << to_string(offset_index_offset)) : (out << "<null>"));
+ out << ", " << "offset_index_length="; (__isset.offset_index_length ? (out << to_string(offset_index_length)) : (out << "<null>"));
+ out << ", " << "column_index_offset="; (__isset.column_index_offset ? (out << to_string(column_index_offset)) : (out << "<null>"));
+ out << ", " << "column_index_length="; (__isset.column_index_length ? (out << to_string(column_index_length)) : (out << "<null>"));
+ out << ", " << "crypto_metadata="; (__isset.crypto_metadata ? (out << to_string(crypto_metadata)) : (out << "<null>"));
+ out << ", " << "encrypted_column_metadata="; (__isset.encrypted_column_metadata ? (out << to_string(encrypted_column_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
+
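+// RowGroup: a horizontal slice of the file holding one ColumnChunk per
+// column. columns, total_byte_size and num_rows are required fields, and
+// read() validates all three before returning.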
+RowGroup::~RowGroup() noexcept {
+}
+
+
+void RowGroup::__set_columns(const std::vector<ColumnChunk> & val) {
+ this->columns = val;
+}
+
+void RowGroup::__set_total_byte_size(const int64_t val) {
+ this->total_byte_size = val;
+}
+
+void RowGroup::__set_num_rows(const int64_t val) {
+ this->num_rows = val;
+}
+
+void RowGroup::__set_sorting_columns(const std::vector<SortingColumn> & val) {
+ this->sorting_columns = val;
+  __isset.sorting_columns = true;
+}
+
+void RowGroup::__set_file_offset(const int64_t val) {
+ this->file_offset = val;
+  __isset.file_offset = true;
+}
+
+void RowGroup::__set_total_compressed_size(const int64_t val) {
+ this->total_compressed_size = val;
+  __isset.total_compressed_size = true;
+}
+
+void RowGroup::__set_ordinal(const int16_t val) {
+ this->ordinal = val;
+  __isset.ordinal = true;
+}
+std::ostream& operator<<(std::ostream& out, const RowGroup& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_columns = false;
+ bool isset_total_byte_size = false;
+ bool isset_num_rows = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->columns.clear();
+ uint32_t _size124;
+ ::apache::thrift::protocol::TType _etype127;
+ xfer += iprot->readListBegin(_etype127, _size124);
+ this->columns.resize(_size124);
+ uint32_t _i128;
+ for (_i128 = 0; _i128 < _size124; ++_i128)
+ {
+ xfer += this->columns[_i128].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_columns = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_byte_size);
+ isset_total_byte_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->num_rows);
+ isset_num_rows = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->sorting_columns.clear();
+ uint32_t _size129;
+ ::apache::thrift::protocol::TType _etype132;
+ xfer += iprot->readListBegin(_etype132, _size129);
+ this->sorting_columns.resize(_size129);
+ uint32_t _i133;
+ for (_i133 = 0; _i133 < _size129; ++_i133)
+ {
+ xfer += this->sorting_columns[_i133].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.sorting_columns = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->file_offset);
+ this->__isset.file_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_compressed_size);
+ this->__isset.total_compressed_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I16) {
+ xfer += iprot->readI16(this->ordinal);
+ this->__isset.ordinal = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_columns)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_total_byte_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_rows)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("RowGroup");
+
+ xfer += oprot->writeFieldBegin("columns", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->columns.size()));
+ std::vector<ColumnChunk> ::const_iterator _iter134;
+ for (_iter134 = this->columns.begin(); _iter134 != this->columns.end(); ++_iter134)
+ {
+ xfer += (*_iter134).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("total_byte_size", ::apache::thrift::protocol::T_I64, 2);
+ xfer += oprot->writeI64(this->total_byte_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->num_rows);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.sorting_columns) {
+ xfer += oprot->writeFieldBegin("sorting_columns", ::apache::thrift::protocol::T_LIST, 4);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->sorting_columns.size()));
+ std::vector<SortingColumn> ::const_iterator _iter135;
+ for (_iter135 = this->sorting_columns.begin(); _iter135 != this->sorting_columns.end(); ++_iter135)
+ {
+ xfer += (*_iter135).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.file_offset) {
+ xfer += oprot->writeFieldBegin("file_offset", ::apache::thrift::protocol::T_I64, 5);
+ xfer += oprot->writeI64(this->file_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.total_compressed_size) {
+ xfer += oprot->writeFieldBegin("total_compressed_size", ::apache::thrift::protocol::T_I64, 6);
+ xfer += oprot->writeI64(this->total_compressed_size);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.ordinal) {
+ xfer += oprot->writeFieldBegin("ordinal", ::apache::thrift::protocol::T_I16, 7);
+ xfer += oprot->writeI16(this->ordinal);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(RowGroup &a, RowGroup &b) {
+ using ::std::swap;
+ swap(a.columns, b.columns);
+ swap(a.total_byte_size, b.total_byte_size);
+ swap(a.num_rows, b.num_rows);
+ swap(a.sorting_columns, b.sorting_columns);
+ swap(a.file_offset, b.file_offset);
+ swap(a.total_compressed_size, b.total_compressed_size);
+ swap(a.ordinal, b.ordinal);
+ swap(a.__isset, b.__isset);
+}
+
+RowGroup::RowGroup(const RowGroup& other136) {
+ columns = other136.columns;
+ total_byte_size = other136.total_byte_size;
+ num_rows = other136.num_rows;
+ sorting_columns = other136.sorting_columns;
+ file_offset = other136.file_offset;
+ total_compressed_size = other136.total_compressed_size;
+ ordinal = other136.ordinal;
+ __isset = other136.__isset;
+}
+RowGroup& RowGroup::operator=(const RowGroup& other137) {
+ columns = other137.columns;
+ total_byte_size = other137.total_byte_size;
+ num_rows = other137.num_rows;
+ sorting_columns = other137.sorting_columns;
+ file_offset = other137.file_offset;
+ total_compressed_size = other137.total_compressed_size;
+ ordinal = other137.ordinal;
+ __isset = other137.__isset;
+ return *this;
+}
+void RowGroup::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "RowGroup(";
+ out << "columns=" << to_string(columns);
+ out << ", " << "total_byte_size=" << to_string(total_byte_size);
+ out << ", " << "num_rows=" << to_string(num_rows);
+ out << ", " << "sorting_columns="; (__isset.sorting_columns ? (out << to_string(sorting_columns)) : (out << "<null>"));
+ out << ", " << "file_offset="; (__isset.file_offset ? (out << to_string(file_offset)) : (out << "<null>"));
+ out << ", " << "total_compressed_size="; (__isset.total_compressed_size ? (out << to_string(total_compressed_size)) : (out << "<null>"));
+ out << ", " << "ordinal="; (__isset.ordinal ? (out << to_string(ordinal)) : (out << "<null>"));
+ out << ")";
+}
+
+
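+// TypeDefinedOrder is an empty marker struct: min/max statistics are ordered
+// according to the column type's own comparison.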
+TypeDefinedOrder::~TypeDefinedOrder() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const TypeDefinedOrder& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TypeDefinedOrder::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t TypeDefinedOrder::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TypeDefinedOrder");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TypeDefinedOrder &a, TypeDefinedOrder &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other138) {
+ (void) other138;
+}
+TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other139) {
+ (void) other139;
+ return *this;
+}
+void TypeDefinedOrder::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TypeDefinedOrder(";
+ out << ")";
+}
+
+
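+// ColumnOrder is a union selecting the ordering used for min/max statistics;
+// TYPE_ORDER is currently its only alternative. Callers should test
+// __isset.TYPE_ORDER before relying on the member.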
+ColumnOrder::~ColumnOrder() noexcept {
+}
+
+
+void ColumnOrder::__set_TYPE_ORDER(const TypeDefinedOrder& val) {
+ this->TYPE_ORDER = val;
+  __isset.TYPE_ORDER = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnOrder& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnOrder::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->TYPE_ORDER.read(iprot);
+ this->__isset.TYPE_ORDER = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t ColumnOrder::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnOrder");
+
+ if (this->__isset.TYPE_ORDER) {
+ xfer += oprot->writeFieldBegin("TYPE_ORDER", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->TYPE_ORDER.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnOrder &a, ColumnOrder &b) {
+ using ::std::swap;
+ swap(a.TYPE_ORDER, b.TYPE_ORDER);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnOrder::ColumnOrder(const ColumnOrder& other140) {
+ TYPE_ORDER = other140.TYPE_ORDER;
+ __isset = other140.__isset;
+}
+ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other141) {
+ TYPE_ORDER = other141.TYPE_ORDER;
+ __isset = other141.__isset;
+ return *this;
+}
+void ColumnOrder::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnOrder(";
+ out << "TYPE_ORDER="; (__isset.TYPE_ORDER ? (out << to_string(TYPE_ORDER)) : (out << "<null>"));
+ out << ")";
+}
+
+
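+// PageLocation: one entry of an OffsetIndex, giving a page's byte offset and
+// compressed size in the file plus the index of its first row. All three
+// fields are required.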
+PageLocation::~PageLocation() noexcept {
+}
+
+
+void PageLocation::__set_offset(const int64_t val) {
+ this->offset = val;
+}
+
+void PageLocation::__set_compressed_page_size(const int32_t val) {
+ this->compressed_page_size = val;
+}
+
+void PageLocation::__set_first_row_index(const int64_t val) {
+ this->first_row_index = val;
+}
+std::ostream& operator<<(std::ostream& out, const PageLocation& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t PageLocation::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_offset = false;
+ bool isset_compressed_page_size = false;
+ bool isset_first_row_index = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->offset);
+ isset_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->compressed_page_size);
+ isset_compressed_page_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->first_row_index);
+ isset_first_row_index = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_offset)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_compressed_page_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_first_row_index)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t PageLocation::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("PageLocation");
+
+ xfer += oprot->writeFieldBegin("offset", ::apache::thrift::protocol::T_I64, 1);
+ xfer += oprot->writeI64(this->offset);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("compressed_page_size", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->compressed_page_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("first_row_index", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->first_row_index);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(PageLocation &a, PageLocation &b) {
+ using ::std::swap;
+ swap(a.offset, b.offset);
+ swap(a.compressed_page_size, b.compressed_page_size);
+ swap(a.first_row_index, b.first_row_index);
+}
+
+PageLocation::PageLocation(const PageLocation& other142) {
+ offset = other142.offset;
+ compressed_page_size = other142.compressed_page_size;
+ first_row_index = other142.first_row_index;
+}
+PageLocation& PageLocation::operator=(const PageLocation& other143) {
+ offset = other143.offset;
+ compressed_page_size = other143.compressed_page_size;
+ first_row_index = other143.first_row_index;
+ return *this;
+}
+void PageLocation::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "PageLocation(";
+ out << "offset=" << to_string(offset);
+ out << ", " << "compressed_page_size=" << to_string(compressed_page_size);
+ out << ", " << "first_row_index=" << to_string(first_row_index);
+ out << ")";
+}
+
+
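+// OffsetIndex: the ordered list of PageLocation entries for one column chunk,
+// enabling page-level seeks without scanning page headers.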
+OffsetIndex::~OffsetIndex() noexcept {
+}
+
+
+void OffsetIndex::__set_page_locations(const std::vector<PageLocation> & val) {
+ this->page_locations = val;
+}
+std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t OffsetIndex::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_page_locations = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->page_locations.clear();
+ uint32_t _size144;
+ ::apache::thrift::protocol::TType _etype147;
+ xfer += iprot->readListBegin(_etype147, _size144);
+ this->page_locations.resize(_size144);
+ uint32_t _i148;
+ for (_i148 = 0; _i148 < _size144; ++_i148)
+ {
+ xfer += this->page_locations[_i148].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_page_locations = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_page_locations)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t OffsetIndex::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("OffsetIndex");
+
+ xfer += oprot->writeFieldBegin("page_locations", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->page_locations.size()));
+ std::vector<PageLocation> ::const_iterator _iter149;
+ for (_iter149 = this->page_locations.begin(); _iter149 != this->page_locations.end(); ++_iter149)
+ {
+ xfer += (*_iter149).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(OffsetIndex &a, OffsetIndex &b) {
+ using ::std::swap;
+ swap(a.page_locations, b.page_locations);
+}
+
+OffsetIndex::OffsetIndex(const OffsetIndex& other150) {
+ page_locations = other150.page_locations;
+}
+OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other151) {
+ page_locations = other151.page_locations;
+ return *this;
+}
+void OffsetIndex::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "OffsetIndex(";
+ out << "page_locations=" << to_string(page_locations);
+ out << ")";
+}
+
+
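+// ColumnIndex: per-page statistics for one column chunk: null_pages flags,
+// min/max values as binary, the boundary ordering, and optional per-page
+// null counts. The null_pages loop below reads through a bool temporary
+// because std::vector<bool> elements are proxy objects that readBool()
+// cannot bind to directly.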
+ColumnIndex::~ColumnIndex() noexcept {
+}
+
+
+void ColumnIndex::__set_null_pages(const std::vector<bool> & val) {
+ this->null_pages = val;
+}
+
+void ColumnIndex::__set_min_values(const std::vector<std::string> & val) {
+ this->min_values = val;
+}
+
+void ColumnIndex::__set_max_values(const std::vector<std::string> & val) {
+ this->max_values = val;
+}
+
+void ColumnIndex::__set_boundary_order(const BoundaryOrder::type val) {
+ this->boundary_order = val;
+}
+
+void ColumnIndex::__set_null_counts(const std::vector<int64_t> & val) {
+ this->null_counts = val;
+  __isset.null_counts = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_null_pages = false;
+ bool isset_min_values = false;
+ bool isset_max_values = false;
+ bool isset_boundary_order = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->null_pages.clear();
+ uint32_t _size152;
+ ::apache::thrift::protocol::TType _etype155;
+ xfer += iprot->readListBegin(_etype155, _size152);
+ this->null_pages.resize(_size152);
+ uint32_t _i156;
+ for (_i156 = 0; _i156 < _size152; ++_i156)
+ {
+ bool result;
+ xfer += iprot->readBool(result);
+ this->null_pages[_i156] = result;
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_null_pages = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->min_values.clear();
+ uint32_t _size157;
+ ::apache::thrift::protocol::TType _etype160;
+ xfer += iprot->readListBegin(_etype160, _size157);
+ this->min_values.resize(_size157);
+ uint32_t _i161;
+ for (_i161 = 0; _i161 < _size157; ++_i161)
+ {
+ xfer += iprot->readBinary(this->min_values[_i161]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_min_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->max_values.clear();
+ uint32_t _size162;
+ ::apache::thrift::protocol::TType _etype165;
+ xfer += iprot->readListBegin(_etype165, _size162);
+ this->max_values.resize(_size162);
+ uint32_t _i166;
+ for (_i166 = 0; _i166 < _size162; ++_i166)
+ {
+ xfer += iprot->readBinary(this->max_values[_i166]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_max_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast167;
+ xfer += iprot->readI32(ecast167);
+ this->boundary_order = (BoundaryOrder::type)ecast167;
+ isset_boundary_order = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->null_counts.clear();
+ uint32_t _size168;
+ ::apache::thrift::protocol::TType _etype171;
+ xfer += iprot->readListBegin(_etype171, _size168);
+ this->null_counts.resize(_size168);
+ uint32_t _i172;
+ for (_i172 = 0; _i172 < _size168; ++_i172)
+ {
+ xfer += iprot->readI64(this->null_counts[_i172]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.null_counts = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_null_pages)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_min_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_max_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_boundary_order)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnIndex");
+
+ xfer += oprot->writeFieldBegin("null_pages", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_BOOL, static_cast<uint32_t>(this->null_pages.size()));
+ std::vector<bool> ::const_iterator _iter173;
+ for (_iter173 = this->null_pages.begin(); _iter173 != this->null_pages.end(); ++_iter173)
+ {
+ xfer += oprot->writeBool((*_iter173));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("min_values", ::apache::thrift::protocol::T_LIST, 2);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->min_values.size()));
+ std::vector<std::string> ::const_iterator _iter174;
+ for (_iter174 = this->min_values.begin(); _iter174 != this->min_values.end(); ++_iter174)
+ {
+ xfer += oprot->writeBinary((*_iter174));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("max_values", ::apache::thrift::protocol::T_LIST, 3);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->max_values.size()));
+ std::vector<std::string> ::const_iterator _iter175;
+ for (_iter175 = this->max_values.begin(); _iter175 != this->max_values.end(); ++_iter175)
+ {
+ xfer += oprot->writeBinary((*_iter175));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("boundary_order", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->boundary_order);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.null_counts) {
+ xfer += oprot->writeFieldBegin("null_counts", ::apache::thrift::protocol::T_LIST, 5);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast<uint32_t>(this->null_counts.size()));
+ std::vector<int64_t> ::const_iterator _iter176;
+ for (_iter176 = this->null_counts.begin(); _iter176 != this->null_counts.end(); ++_iter176)
+ {
+ xfer += oprot->writeI64((*_iter176));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnIndex &a, ColumnIndex &b) {
+ using ::std::swap;
+ swap(a.null_pages, b.null_pages);
+ swap(a.min_values, b.min_values);
+ swap(a.max_values, b.max_values);
+ swap(a.boundary_order, b.boundary_order);
+ swap(a.null_counts, b.null_counts);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnIndex::ColumnIndex(const ColumnIndex& other177) {
+ null_pages = other177.null_pages;
+ min_values = other177.min_values;
+ max_values = other177.max_values;
+ boundary_order = other177.boundary_order;
+ null_counts = other177.null_counts;
+ __isset = other177.__isset;
+}
+ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other178) {
+ null_pages = other178.null_pages;
+ min_values = other178.min_values;
+ max_values = other178.max_values;
+ boundary_order = other178.boundary_order;
+ null_counts = other178.null_counts;
+ __isset = other178.__isset;
+ return *this;
+}
+void ColumnIndex::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnIndex(";
+ out << "null_pages=" << to_string(null_pages);
+ out << ", " << "min_values=" << to_string(min_values);
+ out << ", " << "max_values=" << to_string(max_values);
+ out << ", " << "boundary_order=" << to_string(boundary_order);
+ out << ", " << "null_counts="; (__isset.null_counts ? (out << to_string(null_counts)) : (out << "<null>"));
+ out << ")";
+}
+
+
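+// AesGcmV1: parameters for the AES_GCM_V1 encryption algorithm. All fields
+// are optional: an AAD prefix, a file-unique AAD suffix, and a flag telling
+// readers they must supply the AAD prefix themselves.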
+AesGcmV1::~AesGcmV1() noexcept {
+}
+
+
+void AesGcmV1::__set_aad_prefix(const std::string& val) {
+ this->aad_prefix = val;
+  __isset.aad_prefix = true;
+}
+
+void AesGcmV1::__set_aad_file_unique(const std::string& val) {
+ this->aad_file_unique = val;
+  __isset.aad_file_unique = true;
+}
+
+void AesGcmV1::__set_supply_aad_prefix(const bool val) {
+ this->supply_aad_prefix = val;
+  __isset.supply_aad_prefix = true;
+}
+std::ostream& operator<<(std::ostream& out, const AesGcmV1& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t AesGcmV1::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_prefix);
+ this->__isset.aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_file_unique);
+ this->__isset.aad_file_unique = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->supply_aad_prefix);
+ this->__isset.supply_aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t AesGcmV1::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("AesGcmV1");
+
+ if (this->__isset.aad_prefix) {
+ xfer += oprot->writeFieldBegin("aad_prefix", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeBinary(this->aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.aad_file_unique) {
+ xfer += oprot->writeFieldBegin("aad_file_unique", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->aad_file_unique);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.supply_aad_prefix) {
+ xfer += oprot->writeFieldBegin("supply_aad_prefix", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->supply_aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(AesGcmV1 &a, AesGcmV1 &b) {
+ using ::std::swap;
+ swap(a.aad_prefix, b.aad_prefix);
+ swap(a.aad_file_unique, b.aad_file_unique);
+ swap(a.supply_aad_prefix, b.supply_aad_prefix);
+ swap(a.__isset, b.__isset);
+}
+
+AesGcmV1::AesGcmV1(const AesGcmV1& other179) {
+ aad_prefix = other179.aad_prefix;
+ aad_file_unique = other179.aad_file_unique;
+ supply_aad_prefix = other179.supply_aad_prefix;
+ __isset = other179.__isset;
+}
+AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other180) {
+ aad_prefix = other180.aad_prefix;
+ aad_file_unique = other180.aad_file_unique;
+ supply_aad_prefix = other180.supply_aad_prefix;
+ __isset = other180.__isset;
+ return *this;
+}
+void AesGcmV1::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "AesGcmV1(";
+ out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "<null>"));
+ out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "<null>"));
+ out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "<null>"));
+ out << ")";
+}
+
+
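+// AesGcmCtrV1 mirrors AesGcmV1 field-for-field but selects the GCM-CTR
+// scheme, in which pages are encrypted with AES-CTR instead of AES-GCM.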
+AesGcmCtrV1::~AesGcmCtrV1() noexcept {
+}
+
+
+void AesGcmCtrV1::__set_aad_prefix(const std::string& val) {
+ this->aad_prefix = val;
+  __isset.aad_prefix = true;
+}
+
+void AesGcmCtrV1::__set_aad_file_unique(const std::string& val) {
+ this->aad_file_unique = val;
+  __isset.aad_file_unique = true;
+}
+
+void AesGcmCtrV1::__set_supply_aad_prefix(const bool val) {
+ this->supply_aad_prefix = val;
+  __isset.supply_aad_prefix = true;
+}
+std::ostream& operator<<(std::ostream& out, const AesGcmCtrV1& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t AesGcmCtrV1::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_prefix);
+ this->__isset.aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_file_unique);
+ this->__isset.aad_file_unique = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->supply_aad_prefix);
+ this->__isset.supply_aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t AesGcmCtrV1::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("AesGcmCtrV1");
+
+ if (this->__isset.aad_prefix) {
+ xfer += oprot->writeFieldBegin("aad_prefix", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeBinary(this->aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.aad_file_unique) {
+ xfer += oprot->writeFieldBegin("aad_file_unique", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->aad_file_unique);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.supply_aad_prefix) {
+ xfer += oprot->writeFieldBegin("supply_aad_prefix", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->supply_aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b) {
+ using ::std::swap;
+ swap(a.aad_prefix, b.aad_prefix);
+ swap(a.aad_file_unique, b.aad_file_unique);
+ swap(a.supply_aad_prefix, b.supply_aad_prefix);
+ swap(a.__isset, b.__isset);
+}
+
+AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other181) {
+ aad_prefix = other181.aad_prefix;
+ aad_file_unique = other181.aad_file_unique;
+ supply_aad_prefix = other181.supply_aad_prefix;
+ __isset = other181.__isset;
+}
+AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other182) {
+ aad_prefix = other182.aad_prefix;
+ aad_file_unique = other182.aad_file_unique;
+ supply_aad_prefix = other182.supply_aad_prefix;
+ __isset = other182.__isset;
+ return *this;
+}
+void AesGcmCtrV1::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "AesGcmCtrV1(";
+ out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "<null>"));
+ out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "<null>"));
+ out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "<null>"));
+ out << ")";
+}
+
+
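+// EncryptionAlgorithm: a union over the supported algorithms; exactly one of
+// AES_GCM_V1 or AES_GCM_CTR_V1 is expected to be set.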
+EncryptionAlgorithm::~EncryptionAlgorithm() noexcept {
+}
+
+
+void EncryptionAlgorithm::__set_AES_GCM_V1(const AesGcmV1& val) {
+ this->AES_GCM_V1 = val;
+  __isset.AES_GCM_V1 = true;
+}
+
+void EncryptionAlgorithm::__set_AES_GCM_CTR_V1(const AesGcmCtrV1& val) {
+ this->AES_GCM_CTR_V1 = val;
+  __isset.AES_GCM_CTR_V1 = true;
+}
+std::ostream& operator<<(std::ostream& out, const EncryptionAlgorithm& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t EncryptionAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->AES_GCM_V1.read(iprot);
+ this->__isset.AES_GCM_V1 = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->AES_GCM_CTR_V1.read(iprot);
+ this->__isset.AES_GCM_CTR_V1 = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t EncryptionAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EncryptionAlgorithm");
+
+ if (this->__isset.AES_GCM_V1) {
+ xfer += oprot->writeFieldBegin("AES_GCM_V1", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->AES_GCM_V1.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.AES_GCM_CTR_V1) {
+ xfer += oprot->writeFieldBegin("AES_GCM_CTR_V1", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->AES_GCM_CTR_V1.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b) {
+ using ::std::swap;
+ swap(a.AES_GCM_V1, b.AES_GCM_V1);
+ swap(a.AES_GCM_CTR_V1, b.AES_GCM_CTR_V1);
+ swap(a.__isset, b.__isset);
+}
+
+EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other183) {
+ AES_GCM_V1 = other183.AES_GCM_V1;
+ AES_GCM_CTR_V1 = other183.AES_GCM_CTR_V1;
+ __isset = other183.__isset;
+}
+EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other184) {
+ AES_GCM_V1 = other184.AES_GCM_V1;
+ AES_GCM_CTR_V1 = other184.AES_GCM_CTR_V1;
+ __isset = other184.__isset;
+ return *this;
+}
+void EncryptionAlgorithm::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EncryptionAlgorithm(";
+ out << "AES_GCM_V1="; (__isset.AES_GCM_V1 ? (out << to_string(AES_GCM_V1)) : (out << "<null>"));
+ out << ", " << "AES_GCM_CTR_V1="; (__isset.AES_GCM_CTR_V1 ? (out << to_string(AES_GCM_CTR_V1)) : (out << "<null>"));
+ out << ")";
+}
+
+
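+// FileMetaData is the Parquet file footer. version, schema, num_rows and
+// row_groups are required (read() below throws INVALID_DATA if any is
+// missing); key_value_metadata, created_by, column_orders and the encryption
+// fields are optional.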
+FileMetaData::~FileMetaData() noexcept {
+}
+
+
+void FileMetaData::__set_version(const int32_t val) {
+ this->version = val;
+}
+
+void FileMetaData::__set_schema(const std::vector<SchemaElement> & val) {
+ this->schema = val;
+}
+
+void FileMetaData::__set_num_rows(const int64_t val) {
+ this->num_rows = val;
+}
+
+void FileMetaData::__set_row_groups(const std::vector<RowGroup> & val) {
+ this->row_groups = val;
+}
+
+void FileMetaData::__set_key_value_metadata(const std::vector<KeyValue> & val) {
+ this->key_value_metadata = val;
+ __isset.key_value_metadata = true;
+}
+
+void FileMetaData::__set_created_by(const std::string& val) {
+ this->created_by = val;
+ __isset.created_by = true;
+}
+
+void FileMetaData::__set_column_orders(const std::vector<ColumnOrder> & val) {
+ this->column_orders = val;
+ __isset.column_orders = true;
+}
+
+void FileMetaData::__set_encryption_algorithm(const EncryptionAlgorithm& val) {
+ this->encryption_algorithm = val;
+ __isset.encryption_algorithm = true;
+}
+
+void FileMetaData::__set_footer_signing_key_metadata(const std::string& val) {
+ this->footer_signing_key_metadata = val;
+ __isset.footer_signing_key_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const FileMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_version = false;
+ bool isset_schema = false;
+ bool isset_num_rows = false;
+ bool isset_row_groups = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->version);
+ isset_version = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->schema.clear();
+ uint32_t _size185;
+ ::apache::thrift::protocol::TType _etype188;
+ xfer += iprot->readListBegin(_etype188, _size185);
+ this->schema.resize(_size185);
+ uint32_t _i189;
+ for (_i189 = 0; _i189 < _size185; ++_i189)
+ {
+ xfer += this->schema[_i189].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_schema = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->num_rows);
+ isset_num_rows = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->row_groups.clear();
+ uint32_t _size190;
+ ::apache::thrift::protocol::TType _etype193;
+ xfer += iprot->readListBegin(_etype193, _size190);
+ this->row_groups.resize(_size190);
+ uint32_t _i194;
+ for (_i194 = 0; _i194 < _size190; ++_i194)
+ {
+ xfer += this->row_groups[_i194].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_row_groups = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->key_value_metadata.clear();
+ uint32_t _size195;
+ ::apache::thrift::protocol::TType _etype198;
+ xfer += iprot->readListBegin(_etype198, _size195);
+ this->key_value_metadata.resize(_size195);
+ uint32_t _i199;
+ for (_i199 = 0; _i199 < _size195; ++_i199)
+ {
+ xfer += this->key_value_metadata[_i199].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.key_value_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->created_by);
+ this->__isset.created_by = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->column_orders.clear();
+ uint32_t _size200;
+ ::apache::thrift::protocol::TType _etype203;
+ xfer += iprot->readListBegin(_etype203, _size200);
+ this->column_orders.resize(_size200);
+ uint32_t _i204;
+ for (_i204 = 0; _i204 < _size200; ++_i204)
+ {
+ xfer += this->column_orders[_i204].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.column_orders = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->encryption_algorithm.read(iprot);
+ this->__isset.encryption_algorithm = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->footer_signing_key_metadata);
+ this->__isset.footer_signing_key_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_version)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_schema)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_rows)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_row_groups)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("FileMetaData");
+
+ xfer += oprot->writeFieldBegin("version", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->version);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("schema", ::apache::thrift::protocol::T_LIST, 2);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->schema.size()));
+ std::vector<SchemaElement>::const_iterator _iter205;
+ for (_iter205 = this->schema.begin(); _iter205 != this->schema.end(); ++_iter205)
+ {
+ xfer += (*_iter205).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->num_rows);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("row_groups", ::apache::thrift::protocol::T_LIST, 4);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->row_groups.size()));
+ std::vector<RowGroup>::const_iterator _iter206;
+ for (_iter206 = this->row_groups.begin(); _iter206 != this->row_groups.end(); ++_iter206)
+ {
+ xfer += (*_iter206).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_value_metadata) {
+ xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 5);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
+ std::vector<KeyValue>::const_iterator _iter207;
+ for (_iter207 = this->key_value_metadata.begin(); _iter207 != this->key_value_metadata.end(); ++_iter207)
+ {
+ xfer += (*_iter207).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.created_by) {
+ xfer += oprot->writeFieldBegin("created_by", ::apache::thrift::protocol::T_STRING, 6);
+ xfer += oprot->writeString(this->created_by);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.column_orders) {
+ xfer += oprot->writeFieldBegin("column_orders", ::apache::thrift::protocol::T_LIST, 7);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->column_orders.size()));
+ std::vector<ColumnOrder>::const_iterator _iter208;
+ for (_iter208 = this->column_orders.begin(); _iter208 != this->column_orders.end(); ++_iter208)
+ {
+ xfer += (*_iter208).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.encryption_algorithm) {
+ xfer += oprot->writeFieldBegin("encryption_algorithm", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->encryption_algorithm.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.footer_signing_key_metadata) {
+ xfer += oprot->writeFieldBegin("footer_signing_key_metadata", ::apache::thrift::protocol::T_STRING, 9);
+ xfer += oprot->writeBinary(this->footer_signing_key_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(FileMetaData &a, FileMetaData &b) {
+ using ::std::swap;
+ swap(a.version, b.version);
+ swap(a.schema, b.schema);
+ swap(a.num_rows, b.num_rows);
+ swap(a.row_groups, b.row_groups);
+ swap(a.key_value_metadata, b.key_value_metadata);
+ swap(a.created_by, b.created_by);
+ swap(a.column_orders, b.column_orders);
+ swap(a.encryption_algorithm, b.encryption_algorithm);
+ swap(a.footer_signing_key_metadata, b.footer_signing_key_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+FileMetaData::FileMetaData(const FileMetaData& other209) {
+ version = other209.version;
+ schema = other209.schema;
+ num_rows = other209.num_rows;
+ row_groups = other209.row_groups;
+ key_value_metadata = other209.key_value_metadata;
+ created_by = other209.created_by;
+ column_orders = other209.column_orders;
+ encryption_algorithm = other209.encryption_algorithm;
+ footer_signing_key_metadata = other209.footer_signing_key_metadata;
+ __isset = other209.__isset;
+}
+FileMetaData& FileMetaData::operator=(const FileMetaData& other210) {
+ version = other210.version;
+ schema = other210.schema;
+ num_rows = other210.num_rows;
+ row_groups = other210.row_groups;
+ key_value_metadata = other210.key_value_metadata;
+ created_by = other210.created_by;
+ column_orders = other210.column_orders;
+ encryption_algorithm = other210.encryption_algorithm;
+ footer_signing_key_metadata = other210.footer_signing_key_metadata;
+ __isset = other210.__isset;
+ return *this;
+}
+void FileMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "FileMetaData(";
+ out << "version=" << to_string(version);
+ out << ", " << "schema=" << to_string(schema);
+ out << ", " << "num_rows=" << to_string(num_rows);
+ out << ", " << "row_groups=" << to_string(row_groups);
+ out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "<null>"));
+ out << ", " << "created_by="; (__isset.created_by ? (out << to_string(created_by)) : (out << "<null>"));
+ out << ", " << "column_orders="; (__isset.column_orders ? (out << to_string(column_orders)) : (out << "<null>"));
+ out << ", " << "encryption_algorithm="; (__isset.encryption_algorithm ? (out << to_string(encryption_algorithm)) : (out << "<null>"));
+ out << ", " << "footer_signing_key_metadata="; (__isset.footer_signing_key_metadata ? (out << to_string(footer_signing_key_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
+
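+// FileCryptoMetaData stands in for the plaintext footer when a file is
+// written in encrypted-footer mode: encryption_algorithm is required
+// (enforced in read()), key_metadata is optional.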
+FileCryptoMetaData::~FileCryptoMetaData() noexcept {
+}
+
+
+void FileCryptoMetaData::__set_encryption_algorithm(const EncryptionAlgorithm& val) {
+ this->encryption_algorithm = val;
+}
+
+void FileCryptoMetaData::__set_key_metadata(const std::string& val) {
+ this->key_metadata = val;
+ __isset.key_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t FileCryptoMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_encryption_algorithm = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->encryption_algorithm.read(iprot);
+ isset_encryption_algorithm = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->key_metadata);
+ this->__isset.key_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_encryption_algorithm)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t FileCryptoMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("FileCryptoMetaData");
+
+ xfer += oprot->writeFieldBegin("encryption_algorithm", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->encryption_algorithm.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_metadata) {
+ xfer += oprot->writeFieldBegin("key_metadata", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->key_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(FileCryptoMetaData &a, FileCryptoMetaData &b) {
+ using ::std::swap;
+ swap(a.encryption_algorithm, b.encryption_algorithm);
+ swap(a.key_metadata, b.key_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other211) {
+ encryption_algorithm = other211.encryption_algorithm;
+ key_metadata = other211.key_metadata;
+ __isset = other211.__isset;
+}
+FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other212) {
+ encryption_algorithm = other212.encryption_algorithm;
+ key_metadata = other212.key_metadata;
+ __isset = other212.__isset;
+ return *this;
+}
+void FileCryptoMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "FileCryptoMetaData(";
+ out << "encryption_algorithm=" << to_string(encryption_algorithm);
+ out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
+}} // namespace
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h
new file mode 100644
index 00000000000..3d7edd40983
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h
@@ -0,0 +1,2917 @@
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#ifndef parquet_TYPES_H
+#define parquet_TYPES_H
+
+#include <iosfwd>
+
+#include <thrift/Thrift.h>
+#include <thrift/TApplicationException.h>
+#include <thrift/TBase.h>
+#include <thrift/protocol/TProtocol.h>
+#include <thrift/transport/TTransport.h>
+
+#include <functional>
+#include <memory>
+
+#include "parquet/windows_compatibility.h"
+
+namespace parquet { namespace format {
+
+struct Type {
+ enum type {
+ BOOLEAN = 0,
+ INT32 = 1,
+ INT64 = 2,
+ INT96 = 3,
+ FLOAT = 4,
+ DOUBLE = 5,
+ BYTE_ARRAY = 6,
+ FIXED_LEN_BYTE_ARRAY = 7
+ };
+};
+
+extern const std::map<int, const char*> _Type_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const Type::type& val);
+
+std::string to_string(const Type::type& val);
+
+struct ConvertedType {
+ enum type {
+ UTF8 = 0,
+ MAP = 1,
+ MAP_KEY_VALUE = 2,
+ LIST = 3,
+ ENUM = 4,
+ DECIMAL = 5,
+ DATE = 6,
+ TIME_MILLIS = 7,
+ TIME_MICROS = 8,
+ TIMESTAMP_MILLIS = 9,
+ TIMESTAMP_MICROS = 10,
+ UINT_8 = 11,
+ UINT_16 = 12,
+ UINT_32 = 13,
+ UINT_64 = 14,
+ INT_8 = 15,
+ INT_16 = 16,
+ INT_32 = 17,
+ INT_64 = 18,
+ JSON = 19,
+ BSON = 20,
+ INTERVAL = 21
+ };
+};
+
+extern const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val);
+
+std::string to_string(const ConvertedType::type& val);
+
+struct FieldRepetitionType {
+ enum type {
+ REQUIRED = 0,
+ OPTIONAL = 1,
+ REPEATED = 2
+ };
+};
+
+extern const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val);
+
+std::string to_string(const FieldRepetitionType::type& val);
+
+struct Encoding {
+ enum type {
+ PLAIN = 0,
+ PLAIN_DICTIONARY = 2,
+ RLE = 3,
+ BIT_PACKED = 4,
+ DELTA_BINARY_PACKED = 5,
+ DELTA_LENGTH_BYTE_ARRAY = 6,
+ DELTA_BYTE_ARRAY = 7,
+ RLE_DICTIONARY = 8,
+ BYTE_STREAM_SPLIT = 9
+ };
+};
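+// Note: enum value 1 is deliberately unused; it was reserved for the
+// deprecated GROUP_VAR_INT encoding in parquet.thrift.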
+
+extern const std::map<int, const char*> _Encoding_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const Encoding::type& val);
+
+std::string to_string(const Encoding::type& val);
+
+struct CompressionCodec {
+ enum type {
+ UNCOMPRESSED = 0,
+ SNAPPY = 1,
+ GZIP = 2,
+ LZO = 3,
+ BROTLI = 4,
+ LZ4 = 5,
+ ZSTD = 6,
+ LZ4_RAW = 7
+ };
+};
+
+extern const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val);
+
+std::string to_string(const CompressionCodec::type& val);
+
+struct PageType {
+ enum type {
+ DATA_PAGE = 0,
+ INDEX_PAGE = 1,
+ DICTIONARY_PAGE = 2,
+ DATA_PAGE_V2 = 3
+ };
+};
+
+extern const std::map<int, const char*> _PageType_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const PageType::type& val);
+
+std::string to_string(const PageType::type& val);
+
+struct BoundaryOrder {
+ enum type {
+ UNORDERED = 0,
+ ASCENDING = 1,
+ DESCENDING = 2
+ };
+};
+
+extern const std::map<int, const char*> _BoundaryOrder_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val);
+
+std::string to_string(const BoundaryOrder::type& val);
+
+class Statistics;
+
+class StringType;
+
+class UUIDType;
+
+class MapType;
+
+class ListType;
+
+class EnumType;
+
+class DateType;
+
+class NullType;
+
+class DecimalType;
+
+class MilliSeconds;
+
+class MicroSeconds;
+
+class NanoSeconds;
+
+class TimeUnit;
+
+class TimestampType;
+
+class TimeType;
+
+class IntType;
+
+class JsonType;
+
+class BsonType;
+
+class LogicalType;
+
+class SchemaElement;
+
+class DataPageHeader;
+
+class IndexPageHeader;
+
+class DictionaryPageHeader;
+
+class DataPageHeaderV2;
+
+class SplitBlockAlgorithm;
+
+class BloomFilterAlgorithm;
+
+class XxHash;
+
+class BloomFilterHash;
+
+class Uncompressed;
+
+class BloomFilterCompression;
+
+class BloomFilterHeader;
+
+class PageHeader;
+
+class KeyValue;
+
+class SortingColumn;
+
+class PageEncodingStats;
+
+class ColumnMetaData;
+
+class EncryptionWithFooterKey;
+
+class EncryptionWithColumnKey;
+
+class ColumnCryptoMetaData;
+
+class ColumnChunk;
+
+class RowGroup;
+
+class TypeDefinedOrder;
+
+class ColumnOrder;
+
+class PageLocation;
+
+class OffsetIndex;
+
+class ColumnIndex;
+
+class AesGcmV1;
+
+class AesGcmCtrV1;
+
+class EncryptionAlgorithm;
+
+class FileMetaData;
+
+class FileCryptoMetaData;
+
+typedef struct _Statistics__isset {
+ _Statistics__isset() : max(false), min(false), null_count(false), distinct_count(false), max_value(false), min_value(false) {}
+ bool max :1;
+ bool min :1;
+ bool null_count :1;
+ bool distinct_count :1;
+ bool max_value :1;
+ bool min_value :1;
+} _Statistics__isset;
+
+class Statistics : public virtual ::apache::thrift::TBase {
+ public:
+
+ Statistics(const Statistics&);
+ Statistics& operator=(const Statistics&);
+ Statistics() : max(), min(), null_count(0), distinct_count(0), max_value(), min_value() {
+ }
+
+ virtual ~Statistics() noexcept;
+ std::string max;
+ std::string min;
+ int64_t null_count;
+ int64_t distinct_count;
+ std::string max_value;
+ std::string min_value;
+
+ _Statistics__isset __isset;
+
+ void __set_max(const std::string& val);
+
+ void __set_min(const std::string& val);
+
+ void __set_null_count(const int64_t val);
+
+ void __set_distinct_count(const int64_t val);
+
+ void __set_max_value(const std::string& val);
+
+ void __set_min_value(const std::string& val);
+
+ bool operator == (const Statistics & rhs) const
+ {
+ if (__isset.max != rhs.__isset.max)
+ return false;
+ else if (__isset.max && !(max == rhs.max))
+ return false;
+ if (__isset.min != rhs.__isset.min)
+ return false;
+ else if (__isset.min && !(min == rhs.min))
+ return false;
+ if (__isset.null_count != rhs.__isset.null_count)
+ return false;
+ else if (__isset.null_count && !(null_count == rhs.null_count))
+ return false;
+ if (__isset.distinct_count != rhs.__isset.distinct_count)
+ return false;
+ else if (__isset.distinct_count && !(distinct_count == rhs.distinct_count))
+ return false;
+ if (__isset.max_value != rhs.__isset.max_value)
+ return false;
+ else if (__isset.max_value && !(max_value == rhs.max_value))
+ return false;
+ if (__isset.min_value != rhs.__isset.min_value)
+ return false;
+ else if (__isset.min_value && !(min_value == rhs.min_value))
+ return false;
+ return true;
+ }
+ bool operator != (const Statistics &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const Statistics & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(Statistics &a, Statistics &b);
+
+std::ostream& operator<<(std::ostream& out, const Statistics& obj);
+
+
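+// StringType and the other empty logical-type structs that follow are pure
+// markers: they carry no fields, and any two instances compare equal.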
+class StringType : public virtual ::apache::thrift::TBase {
+ public:
+
+ StringType(const StringType&);
+ StringType& operator=(const StringType&);
+ StringType() {
+ }
+
+ virtual ~StringType() noexcept;
+
+ bool operator == (const StringType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const StringType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const StringType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(StringType &a, StringType &b);
+
+std::ostream& operator<<(std::ostream& out, const StringType& obj);
+
+
+class UUIDType : public virtual ::apache::thrift::TBase {
+ public:
+
+ UUIDType(const UUIDType&);
+ UUIDType& operator=(const UUIDType&);
+ UUIDType() {
+ }
+
+ virtual ~UUIDType() noexcept;
+
+ bool operator == (const UUIDType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const UUIDType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const UUIDType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(UUIDType &a, UUIDType &b);
+
+std::ostream& operator<<(std::ostream& out, const UUIDType& obj);
+
+
+class MapType : public virtual ::apache::thrift::TBase {
+ public:
+
+ MapType(const MapType&);
+ MapType& operator=(const MapType&);
+ MapType() {
+ }
+
+ virtual ~MapType() noexcept;
+
+ bool operator == (const MapType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const MapType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const MapType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(MapType &a, MapType &b);
+
+std::ostream& operator<<(std::ostream& out, const MapType& obj);
+
+
+class ListType : public virtual ::apache::thrift::TBase {
+ public:
+
+ ListType(const ListType&);
+ ListType& operator=(const ListType&);
+ ListType() {
+ }
+
+ virtual ~ListType() noexcept;
+
+ bool operator == (const ListType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const ListType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ListType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ListType &a, ListType &b);
+
+std::ostream& operator<<(std::ostream& out, const ListType& obj);
+
+
+class EnumType : public virtual ::apache::thrift::TBase {
+ public:
+
+ EnumType(const EnumType&);
+ EnumType& operator=(const EnumType&);
+ EnumType() {
+ }
+
+ virtual ~EnumType() noexcept;
+
+ bool operator == (const EnumType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const EnumType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EnumType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EnumType &a, EnumType &b);
+
+std::ostream& operator<<(std::ostream& out, const EnumType& obj);
+
+
+class DateType : public virtual ::apache::thrift::TBase {
+ public:
+
+ DateType(const DateType&);
+ DateType& operator=(const DateType&);
+ DateType() {
+ }
+
+ virtual ~DateType() noexcept;
+
+ bool operator == (const DateType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const DateType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DateType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DateType &a, DateType &b);
+
+std::ostream& operator<<(std::ostream& out, const DateType& obj);
+
+
+class NullType : public virtual ::apache::thrift::TBase {
+ public:
+
+ NullType(const NullType&);
+ NullType& operator=(const NullType&);
+ NullType() {
+ }
+
+ virtual ~NullType() noexcept;
+
+ bool operator == (const NullType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const NullType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const NullType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(NullType &a, NullType &b);
+
+std::ostream& operator<<(std::ostream& out, const NullType& obj);
+
+
+class DecimalType : public virtual ::apache::thrift::TBase {
+ public:
+
+ DecimalType(const DecimalType&);
+ DecimalType& operator=(const DecimalType&);
+ DecimalType() : scale(0), precision(0) {
+ }
+
+ virtual ~DecimalType() noexcept;
+ int32_t scale;
+ int32_t precision;
+
+ void __set_scale(const int32_t val);
+
+ void __set_precision(const int32_t val);
+
+ bool operator == (const DecimalType & rhs) const
+ {
+ if (!(scale == rhs.scale))
+ return false;
+ if (!(precision == rhs.precision))
+ return false;
+ return true;
+ }
+ bool operator != (const DecimalType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DecimalType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DecimalType &a, DecimalType &b);
+
+std::ostream& operator<<(std::ostream& out, const DecimalType& obj);
+
+
+class MilliSeconds : public virtual ::apache::thrift::TBase {
+ public:
+
+ MilliSeconds(const MilliSeconds&);
+ MilliSeconds& operator=(const MilliSeconds&);
+ MilliSeconds() {
+ }
+
+ virtual ~MilliSeconds() noexcept;
+
+ bool operator == (const MilliSeconds & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const MilliSeconds &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const MilliSeconds & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(MilliSeconds &a, MilliSeconds &b);
+
+std::ostream& operator<<(std::ostream& out, const MilliSeconds& obj);
+
+
+class MicroSeconds : public virtual ::apache::thrift::TBase {
+ public:
+
+ MicroSeconds(const MicroSeconds&);
+ MicroSeconds& operator=(const MicroSeconds&);
+ MicroSeconds() {
+ }
+
+ virtual ~MicroSeconds() noexcept;
+
+ bool operator == (const MicroSeconds & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const MicroSeconds &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const MicroSeconds & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(MicroSeconds &a, MicroSeconds &b);
+
+std::ostream& operator<<(std::ostream& out, const MicroSeconds& obj);
+
+
+class NanoSeconds : public virtual ::apache::thrift::TBase {
+ public:
+
+ NanoSeconds(const NanoSeconds&);
+ NanoSeconds& operator=(const NanoSeconds&);
+ NanoSeconds() {
+ }
+
+ virtual ~NanoSeconds() noexcept;
+
+ bool operator == (const NanoSeconds & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const NanoSeconds &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const NanoSeconds & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(NanoSeconds &a, NanoSeconds &b);
+
+std::ostream& operator<<(std::ostream& out, const NanoSeconds& obj);
+
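+// TimeUnit is a Thrift union: exactly one of MILLIS / MICROS / NANOS should
+// be set, as tracked by _TimeUnit__isset.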
+typedef struct _TimeUnit__isset {
+ _TimeUnit__isset() : MILLIS(false), MICROS(false), NANOS(false) {}
+ bool MILLIS :1;
+ bool MICROS :1;
+ bool NANOS :1;
+} _TimeUnit__isset;
+
+class TimeUnit : public virtual ::apache::thrift::TBase {
+ public:
+
+ TimeUnit(const TimeUnit&);
+ TimeUnit& operator=(const TimeUnit&);
+ TimeUnit() {
+ }
+
+ virtual ~TimeUnit() noexcept;
+ MilliSeconds MILLIS;
+ MicroSeconds MICROS;
+ NanoSeconds NANOS;
+
+ _TimeUnit__isset __isset;
+
+ void __set_MILLIS(const MilliSeconds& val);
+
+ void __set_MICROS(const MicroSeconds& val);
+
+ void __set_NANOS(const NanoSeconds& val);
+
+ bool operator == (const TimeUnit & rhs) const
+ {
+ if (__isset.MILLIS != rhs.__isset.MILLIS)
+ return false;
+ else if (__isset.MILLIS && !(MILLIS == rhs.MILLIS))
+ return false;
+ if (__isset.MICROS != rhs.__isset.MICROS)
+ return false;
+ else if (__isset.MICROS && !(MICROS == rhs.MICROS))
+ return false;
+ if (__isset.NANOS != rhs.__isset.NANOS)
+ return false;
+ else if (__isset.NANOS && !(NANOS == rhs.NANOS))
+ return false;
+ return true;
+ }
+ bool operator != (const TimeUnit &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TimeUnit & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TimeUnit &a, TimeUnit &b);
+
+std::ostream& operator<<(std::ostream& out, const TimeUnit& obj);
+
+
+class TimestampType : public virtual ::apache::thrift::TBase {
+ public:
+
+ TimestampType(const TimestampType&);
+ TimestampType& operator=(const TimestampType&);
+ TimestampType() : isAdjustedToUTC(0) {
+ }
+
+ virtual ~TimestampType() noexcept;
+ bool isAdjustedToUTC;
+ TimeUnit unit;
+
+ void __set_isAdjustedToUTC(const bool val);
+
+ void __set_unit(const TimeUnit& val);
+
+ bool operator == (const TimestampType & rhs) const
+ {
+ if (!(isAdjustedToUTC == rhs.isAdjustedToUTC))
+ return false;
+ if (!(unit == rhs.unit))
+ return false;
+ return true;
+ }
+ bool operator != (const TimestampType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TimestampType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TimestampType &a, TimestampType &b);
+
+std::ostream& operator<<(std::ostream& out, const TimestampType& obj);
+
+
+class TimeType : public virtual ::apache::thrift::TBase {
+ public:
+
+ TimeType(const TimeType&);
+ TimeType& operator=(const TimeType&);
+ TimeType() : isAdjustedToUTC(0) {
+ }
+
+ virtual ~TimeType() noexcept;
+ bool isAdjustedToUTC;
+ TimeUnit unit;
+
+ void __set_isAdjustedToUTC(const bool val);
+
+ void __set_unit(const TimeUnit& val);
+
+ bool operator == (const TimeType & rhs) const
+ {
+ if (!(isAdjustedToUTC == rhs.isAdjustedToUTC))
+ return false;
+ if (!(unit == rhs.unit))
+ return false;
+ return true;
+ }
+ bool operator != (const TimeType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TimeType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TimeType &a, TimeType &b);
+
+std::ostream& operator<<(std::ostream& out, const TimeType& obj);
+
+
+class IntType : public virtual ::apache::thrift::TBase {
+ public:
+
+ IntType(const IntType&);
+ IntType& operator=(const IntType&);
+ IntType() : bitWidth(0), isSigned(0) {
+ }
+
+ virtual ~IntType() noexcept;
+ int8_t bitWidth;
+ bool isSigned;
+
+ void __set_bitWidth(const int8_t val);
+
+ void __set_isSigned(const bool val);
+
+ bool operator == (const IntType & rhs) const
+ {
+ if (!(bitWidth == rhs.bitWidth))
+ return false;
+ if (!(isSigned == rhs.isSigned))
+ return false;
+ return true;
+ }
+ bool operator != (const IntType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const IntType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(IntType &a, IntType &b);
+
+std::ostream& operator<<(std::ostream& out, const IntType& obj);
+
+
+class JsonType : public virtual ::apache::thrift::TBase {
+ public:
+
+ JsonType(const JsonType&);
+ JsonType& operator=(const JsonType&);
+ JsonType() {
+ }
+
+ virtual ~JsonType() noexcept;
+
+ bool operator == (const JsonType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const JsonType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const JsonType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(JsonType &a, JsonType &b);
+
+std::ostream& operator<<(std::ostream& out, const JsonType& obj);
+
+
+class BsonType : public virtual ::apache::thrift::TBase {
+ public:
+
+ BsonType(const BsonType&);
+ BsonType& operator=(const BsonType&);
+ BsonType() {
+ }
+
+ virtual ~BsonType() noexcept;
+
+ bool operator == (const BsonType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const BsonType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BsonType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BsonType &a, BsonType &b);
+
+std::ostream& operator<<(std::ostream& out, const BsonType& obj);
+
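+// LogicalType is a Thrift union over the logical-type annotations; at most
+// one of the members below is set at a time.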
+typedef struct _LogicalType__isset {
+ _LogicalType__isset() : STRING(false), MAP(false), LIST(false), ENUM(false), DECIMAL(false), DATE(false), TIME(false), TIMESTAMP(false), INTEGER(false), UNKNOWN(false), JSON(false), BSON(false), UUID(false) {}
+ bool STRING :1;
+ bool MAP :1;
+ bool LIST :1;
+ bool ENUM :1;
+ bool DECIMAL :1;
+ bool DATE :1;
+ bool TIME :1;
+ bool TIMESTAMP :1;
+ bool INTEGER :1;
+ bool UNKNOWN :1;
+ bool JSON :1;
+ bool BSON :1;
+ bool UUID :1;
+} _LogicalType__isset;
+
+class LogicalType : public virtual ::apache::thrift::TBase {
+ public:
+
+ LogicalType(const LogicalType&);
+ LogicalType& operator=(const LogicalType&);
+ LogicalType() {
+ }
+
+ virtual ~LogicalType() noexcept;
+ StringType STRING;
+ MapType MAP;
+ ListType LIST;
+ EnumType ENUM;
+ DecimalType DECIMAL;
+ DateType DATE;
+ TimeType TIME;
+ TimestampType TIMESTAMP;
+ IntType INTEGER;
+ NullType UNKNOWN;
+ JsonType JSON;
+ BsonType BSON;
+ UUIDType UUID;
+
+ _LogicalType__isset __isset;
+
+ void __set_STRING(const StringType& val);
+
+ void __set_MAP(const MapType& val);
+
+ void __set_LIST(const ListType& val);
+
+ void __set_ENUM(const EnumType& val);
+
+ void __set_DECIMAL(const DecimalType& val);
+
+ void __set_DATE(const DateType& val);
+
+ void __set_TIME(const TimeType& val);
+
+ void __set_TIMESTAMP(const TimestampType& val);
+
+ void __set_INTEGER(const IntType& val);
+
+ void __set_UNKNOWN(const NullType& val);
+
+ void __set_JSON(const JsonType& val);
+
+ void __set_BSON(const BsonType& val);
+
+ void __set_UUID(const UUIDType& val);
+
+ bool operator == (const LogicalType & rhs) const
+ {
+ if (__isset.STRING != rhs.__isset.STRING)
+ return false;
+ else if (__isset.STRING && !(STRING == rhs.STRING))
+ return false;
+ if (__isset.MAP != rhs.__isset.MAP)
+ return false;
+ else if (__isset.MAP && !(MAP == rhs.MAP))
+ return false;
+ if (__isset.LIST != rhs.__isset.LIST)
+ return false;
+ else if (__isset.LIST && !(LIST == rhs.LIST))
+ return false;
+ if (__isset.ENUM != rhs.__isset.ENUM)
+ return false;
+ else if (__isset.ENUM && !(ENUM == rhs.ENUM))
+ return false;
+ if (__isset.DECIMAL != rhs.__isset.DECIMAL)
+ return false;
+ else if (__isset.DECIMAL && !(DECIMAL == rhs.DECIMAL))
+ return false;
+ if (__isset.DATE != rhs.__isset.DATE)
+ return false;
+ else if (__isset.DATE && !(DATE == rhs.DATE))
+ return false;
+ if (__isset.TIME != rhs.__isset.TIME)
+ return false;
+ else if (__isset.TIME && !(TIME == rhs.TIME))
+ return false;
+ if (__isset.TIMESTAMP != rhs.__isset.TIMESTAMP)
+ return false;
+ else if (__isset.TIMESTAMP && !(TIMESTAMP == rhs.TIMESTAMP))
+ return false;
+ if (__isset.INTEGER != rhs.__isset.INTEGER)
+ return false;
+ else if (__isset.INTEGER && !(INTEGER == rhs.INTEGER))
+ return false;
+ if (__isset.UNKNOWN != rhs.__isset.UNKNOWN)
+ return false;
+ else if (__isset.UNKNOWN && !(UNKNOWN == rhs.UNKNOWN))
+ return false;
+ if (__isset.JSON != rhs.__isset.JSON)
+ return false;
+ else if (__isset.JSON && !(JSON == rhs.JSON))
+ return false;
+ if (__isset.BSON != rhs.__isset.BSON)
+ return false;
+ else if (__isset.BSON && !(BSON == rhs.BSON))
+ return false;
+ if (__isset.UUID != rhs.__isset.UUID)
+ return false;
+ else if (__isset.UUID && !(UUID == rhs.UUID))
+ return false;
+ return true;
+ }
+ bool operator != (const LogicalType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const LogicalType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(LogicalType &a, LogicalType &b);
+
+std::ostream& operator<<(std::ostream& out, const LogicalType& obj);
+
+typedef struct _SchemaElement__isset {
+ _SchemaElement__isset() : type(false), type_length(false), repetition_type(false), num_children(false), converted_type(false), scale(false), precision(false), field_id(false), logicalType(false) {}
+ bool type :1;
+ bool type_length :1;
+ bool repetition_type :1;
+ bool num_children :1;
+ bool converted_type :1;
+ bool scale :1;
+ bool precision :1;
+ bool field_id :1;
+ bool logicalType :1;
+} _SchemaElement__isset;
+
+class SchemaElement : public virtual ::apache::thrift::TBase {
+ public:
+
+ SchemaElement(const SchemaElement&);
+ SchemaElement& operator=(const SchemaElement&);
+ SchemaElement() : type((Type::type)0), type_length(0), repetition_type((FieldRepetitionType::type)0), name(), num_children(0), converted_type((ConvertedType::type)0), scale(0), precision(0), field_id(0) {
+ }
+
+ virtual ~SchemaElement() noexcept;
+ Type::type type;
+ int32_t type_length;
+ FieldRepetitionType::type repetition_type;
+ std::string name;
+ int32_t num_children;
+ ConvertedType::type converted_type;
+ int32_t scale;
+ int32_t precision;
+ int32_t field_id;
+ LogicalType logicalType;
+
+ _SchemaElement__isset __isset;
+
+ void __set_type(const Type::type val);
+
+ void __set_type_length(const int32_t val);
+
+ void __set_repetition_type(const FieldRepetitionType::type val);
+
+ void __set_name(const std::string& val);
+
+ void __set_num_children(const int32_t val);
+
+ void __set_converted_type(const ConvertedType::type val);
+
+ void __set_scale(const int32_t val);
+
+ void __set_precision(const int32_t val);
+
+ void __set_field_id(const int32_t val);
+
+ void __set_logicalType(const LogicalType& val);
+
+ bool operator == (const SchemaElement & rhs) const
+ {
+ if (__isset.type != rhs.__isset.type)
+ return false;
+ else if (__isset.type && !(type == rhs.type))
+ return false;
+ if (__isset.type_length != rhs.__isset.type_length)
+ return false;
+ else if (__isset.type_length && !(type_length == rhs.type_length))
+ return false;
+ if (__isset.repetition_type != rhs.__isset.repetition_type)
+ return false;
+ else if (__isset.repetition_type && !(repetition_type == rhs.repetition_type))
+ return false;
+ if (!(name == rhs.name))
+ return false;
+ if (__isset.num_children != rhs.__isset.num_children)
+ return false;
+ else if (__isset.num_children && !(num_children == rhs.num_children))
+ return false;
+ if (__isset.converted_type != rhs.__isset.converted_type)
+ return false;
+ else if (__isset.converted_type && !(converted_type == rhs.converted_type))
+ return false;
+ if (__isset.scale != rhs.__isset.scale)
+ return false;
+ else if (__isset.scale && !(scale == rhs.scale))
+ return false;
+ if (__isset.precision != rhs.__isset.precision)
+ return false;
+ else if (__isset.precision && !(precision == rhs.precision))
+ return false;
+ if (__isset.field_id != rhs.__isset.field_id)
+ return false;
+ else if (__isset.field_id && !(field_id == rhs.field_id))
+ return false;
+ if (__isset.logicalType != rhs.__isset.logicalType)
+ return false;
+ else if (__isset.logicalType && !(logicalType == rhs.logicalType))
+ return false;
+ return true;
+ }
+ bool operator != (const SchemaElement &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const SchemaElement & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(SchemaElement &a, SchemaElement &b);
+
+std::ostream& operator<<(std::ostream& out, const SchemaElement& obj);
+
+typedef struct _DataPageHeader__isset {
+ _DataPageHeader__isset() : statistics(false) {}
+ bool statistics :1;
+} _DataPageHeader__isset;
+
+class DataPageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ DataPageHeader(const DataPageHeader&);
+ DataPageHeader& operator=(const DataPageHeader&);
+ DataPageHeader() : num_values(0), encoding((Encoding::type)0), definition_level_encoding((Encoding::type)0), repetition_level_encoding((Encoding::type)0) {
+ }
+
+ virtual ~DataPageHeader() noexcept;
+ int32_t num_values;
+ Encoding::type encoding;
+ Encoding::type definition_level_encoding;
+ Encoding::type repetition_level_encoding;
+ Statistics statistics;
+
+ _DataPageHeader__isset __isset;
+
+ void __set_num_values(const int32_t val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_definition_level_encoding(const Encoding::type val);
+
+ void __set_repetition_level_encoding(const Encoding::type val);
+
+ void __set_statistics(const Statistics& val);
+
+ bool operator == (const DataPageHeader & rhs) const
+ {
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (!(definition_level_encoding == rhs.definition_level_encoding))
+ return false;
+ if (!(repetition_level_encoding == rhs.repetition_level_encoding))
+ return false;
+ if (__isset.statistics != rhs.__isset.statistics)
+ return false;
+ else if (__isset.statistics && !(statistics == rhs.statistics))
+ return false;
+ return true;
+ }
+ bool operator != (const DataPageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DataPageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DataPageHeader &a, DataPageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const DataPageHeader& obj);
+
+
+class IndexPageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ IndexPageHeader(const IndexPageHeader&);
+ IndexPageHeader& operator=(const IndexPageHeader&);
+ IndexPageHeader() {
+ }
+
+ virtual ~IndexPageHeader() noexcept;
+
+ bool operator == (const IndexPageHeader & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const IndexPageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const IndexPageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(IndexPageHeader &a, IndexPageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const IndexPageHeader& obj);
+
+typedef struct _DictionaryPageHeader__isset {
+ _DictionaryPageHeader__isset() : is_sorted(false) {}
+ bool is_sorted :1;
+} _DictionaryPageHeader__isset;
+
+class DictionaryPageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ DictionaryPageHeader(const DictionaryPageHeader&);
+ DictionaryPageHeader& operator=(const DictionaryPageHeader&);
+ DictionaryPageHeader() : num_values(0), encoding((Encoding::type)0), is_sorted(0) {
+ }
+
+ virtual ~DictionaryPageHeader() noexcept;
+ int32_t num_values;
+ Encoding::type encoding;
+ bool is_sorted;
+
+ _DictionaryPageHeader__isset __isset;
+
+ void __set_num_values(const int32_t val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_is_sorted(const bool val);
+
+ bool operator == (const DictionaryPageHeader & rhs) const
+ {
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (__isset.is_sorted != rhs.__isset.is_sorted)
+ return false;
+ else if (__isset.is_sorted && !(is_sorted == rhs.is_sorted))
+ return false;
+ return true;
+ }
+ bool operator != (const DictionaryPageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DictionaryPageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DictionaryPageHeader &a, DictionaryPageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const DictionaryPageHeader& obj);
+
+typedef struct _DataPageHeaderV2__isset {
+ _DataPageHeaderV2__isset() : is_compressed(true), statistics(false) {}
+ bool is_compressed :1;
+ bool statistics :1;
+} _DataPageHeaderV2__isset;
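+// is_compressed carries a default of true in parquet.thrift, so its __isset
+// bit starts raised (see the initializer above) and the field is emitted by
+// default even when the caller never touches it.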
+
+class DataPageHeaderV2 : public virtual ::apache::thrift::TBase {
+ public:
+
+ DataPageHeaderV2(const DataPageHeaderV2&);
+ DataPageHeaderV2& operator=(const DataPageHeaderV2&);
+ DataPageHeaderV2() : num_values(0), num_nulls(0), num_rows(0), encoding((Encoding::type)0), definition_levels_byte_length(0), repetition_levels_byte_length(0), is_compressed(true) {
+ }
+
+ virtual ~DataPageHeaderV2() noexcept;
+ int32_t num_values;
+ int32_t num_nulls;
+ int32_t num_rows;
+ Encoding::type encoding;
+ int32_t definition_levels_byte_length;
+ int32_t repetition_levels_byte_length;
+ bool is_compressed;
+ Statistics statistics;
+
+ _DataPageHeaderV2__isset __isset;
+
+ void __set_num_values(const int32_t val);
+
+ void __set_num_nulls(const int32_t val);
+
+ void __set_num_rows(const int32_t val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_definition_levels_byte_length(const int32_t val);
+
+ void __set_repetition_levels_byte_length(const int32_t val);
+
+ void __set_is_compressed(const bool val);
+
+ void __set_statistics(const Statistics& val);
+
+ bool operator == (const DataPageHeaderV2 & rhs) const
+ {
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(num_nulls == rhs.num_nulls))
+ return false;
+ if (!(num_rows == rhs.num_rows))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (!(definition_levels_byte_length == rhs.definition_levels_byte_length))
+ return false;
+ if (!(repetition_levels_byte_length == rhs.repetition_levels_byte_length))
+ return false;
+ if (__isset.is_compressed != rhs.__isset.is_compressed)
+ return false;
+ else if (__isset.is_compressed && !(is_compressed == rhs.is_compressed))
+ return false;
+ if (__isset.statistics != rhs.__isset.statistics)
+ return false;
+ else if (__isset.statistics && !(statistics == rhs.statistics))
+ return false;
+ return true;
+ }
+ bool operator != (const DataPageHeaderV2 &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DataPageHeaderV2 & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b);
+
+std::ostream& operator<<(std::ostream& out, const DataPageHeaderV2& obj);
+
+
+class SplitBlockAlgorithm : public virtual ::apache::thrift::TBase {
+ public:
+
+ SplitBlockAlgorithm(const SplitBlockAlgorithm&);
+ SplitBlockAlgorithm& operator=(const SplitBlockAlgorithm&);
+ SplitBlockAlgorithm() {
+ }
+
+ virtual ~SplitBlockAlgorithm() noexcept;
+
+ bool operator == (const SplitBlockAlgorithm & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const SplitBlockAlgorithm &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const SplitBlockAlgorithm & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b);
+
+std::ostream& operator<<(std::ostream& out, const SplitBlockAlgorithm& obj);
+
+typedef struct _BloomFilterAlgorithm__isset {
+ _BloomFilterAlgorithm__isset() : BLOCK(false) {}
+ bool BLOCK :1;
+} _BloomFilterAlgorithm__isset;
+
+class BloomFilterAlgorithm : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterAlgorithm(const BloomFilterAlgorithm&);
+ BloomFilterAlgorithm& operator=(const BloomFilterAlgorithm&);
+ BloomFilterAlgorithm() {
+ }
+
+ virtual ~BloomFilterAlgorithm() noexcept;
+ SplitBlockAlgorithm BLOCK;
+
+ _BloomFilterAlgorithm__isset __isset;
+
+ void __set_BLOCK(const SplitBlockAlgorithm& val);
+
+ bool operator == (const BloomFilterAlgorithm & rhs) const
+ {
+ if (__isset.BLOCK != rhs.__isset.BLOCK)
+ return false;
+ else if (__isset.BLOCK && !(BLOCK == rhs.BLOCK))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterAlgorithm &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterAlgorithm & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterAlgorithm& obj);
+
+
+class XxHash : public virtual ::apache::thrift::TBase {
+ public:
+
+ XxHash(const XxHash&);
+ XxHash& operator=(const XxHash&);
+ XxHash() {
+ }
+
+ virtual ~XxHash() noexcept;
+
+ bool operator == (const XxHash & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const XxHash &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const XxHash & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(XxHash &a, XxHash &b);
+
+std::ostream& operator<<(std::ostream& out, const XxHash& obj);
+
+typedef struct _BloomFilterHash__isset {
+ _BloomFilterHash__isset() : XXHASH(false) {}
+ bool XXHASH :1;
+} _BloomFilterHash__isset;
+
+class BloomFilterHash : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterHash(const BloomFilterHash&);
+ BloomFilterHash& operator=(const BloomFilterHash&);
+ BloomFilterHash() {
+ }
+
+ virtual ~BloomFilterHash() noexcept;
+ XxHash XXHASH;
+
+ _BloomFilterHash__isset __isset;
+
+ void __set_XXHASH(const XxHash& val);
+
+ bool operator == (const BloomFilterHash & rhs) const
+ {
+ if (__isset.XXHASH != rhs.__isset.XXHASH)
+ return false;
+ else if (__isset.XXHASH && !(XXHASH == rhs.XXHASH))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterHash &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterHash & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterHash &a, BloomFilterHash &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterHash& obj);
+
+
+class Uncompressed : public virtual ::apache::thrift::TBase {
+ public:
+
+ Uncompressed(const Uncompressed&);
+ Uncompressed& operator=(const Uncompressed&);
+ Uncompressed() {
+ }
+
+ virtual ~Uncompressed() noexcept;
+
+ bool operator == (const Uncompressed & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const Uncompressed &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const Uncompressed & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(Uncompressed &a, Uncompressed &b);
+
+std::ostream& operator<<(std::ostream& out, const Uncompressed& obj);
+
+typedef struct _BloomFilterCompression__isset {
+ _BloomFilterCompression__isset() : UNCOMPRESSED(false) {}
+ bool UNCOMPRESSED :1;
+} _BloomFilterCompression__isset;
+
+class BloomFilterCompression : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterCompression(const BloomFilterCompression&);
+ BloomFilterCompression& operator=(const BloomFilterCompression&);
+ BloomFilterCompression() {
+ }
+
+ virtual ~BloomFilterCompression() noexcept;
+ Uncompressed UNCOMPRESSED;
+
+ _BloomFilterCompression__isset __isset;
+
+ void __set_UNCOMPRESSED(const Uncompressed& val);
+
+ bool operator == (const BloomFilterCompression & rhs) const
+ {
+ if (__isset.UNCOMPRESSED != rhs.__isset.UNCOMPRESSED)
+ return false;
+ else if (__isset.UNCOMPRESSED && !(UNCOMPRESSED == rhs.UNCOMPRESSED))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterCompression &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterCompression & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterCompression &a, BloomFilterCompression &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterCompression& obj);
+
+
+class BloomFilterHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterHeader(const BloomFilterHeader&);
+ BloomFilterHeader& operator=(const BloomFilterHeader&);
+ BloomFilterHeader() : numBytes(0) {
+ }
+
+ virtual ~BloomFilterHeader() noexcept;
+ int32_t numBytes;
+ BloomFilterAlgorithm algorithm;
+ BloomFilterHash hash;
+ BloomFilterCompression compression;
+
+ void __set_numBytes(const int32_t val);
+
+ void __set_algorithm(const BloomFilterAlgorithm& val);
+
+ void __set_hash(const BloomFilterHash& val);
+
+ void __set_compression(const BloomFilterCompression& val);
+
+ bool operator == (const BloomFilterHeader & rhs) const
+ {
+ if (!(numBytes == rhs.numBytes))
+ return false;
+ if (!(algorithm == rhs.algorithm))
+ return false;
+ if (!(hash == rhs.hash))
+ return false;
+ if (!(compression == rhs.compression))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterHeader &a, BloomFilterHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterHeader& obj);
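+
+// Illustrative usage sketch (not part of the generated code): the generated
+// __set_* helpers assign a field and, for optional fields, flip the matching
+// __isset bit, which operator== and write() consult. A minimal sketch for a
+// 1024-byte split-block filter hashed with xxHash:
+//
+//   BloomFilterHeader header;
+//   header.__set_numBytes(1024);
+//   BloomFilterAlgorithm algorithm;
+//   algorithm.__set_BLOCK(SplitBlockAlgorithm());
+//   header.__set_algorithm(algorithm);
+//   BloomFilterHash hash;
+//   hash.__set_XXHASH(XxHash());
+//   header.__set_hash(hash);
+//   BloomFilterCompression compression;
+//   compression.__set_UNCOMPRESSED(Uncompressed());
+//   header.__set_compression(compression);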
+
+typedef struct _PageHeader__isset {
+ _PageHeader__isset() : crc(false), data_page_header(false), index_page_header(false), dictionary_page_header(false), data_page_header_v2(false) {}
+ bool crc :1;
+ bool data_page_header :1;
+ bool index_page_header :1;
+ bool dictionary_page_header :1;
+ bool data_page_header_v2 :1;
+} _PageHeader__isset;
+
+class PageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ PageHeader(const PageHeader&);
+ PageHeader& operator=(const PageHeader&);
+ PageHeader() : type((PageType::type)0), uncompressed_page_size(0), compressed_page_size(0), crc(0) {
+ }
+
+ virtual ~PageHeader() noexcept;
+ PageType::type type;
+ int32_t uncompressed_page_size;
+ int32_t compressed_page_size;
+ int32_t crc;
+ DataPageHeader data_page_header;
+ IndexPageHeader index_page_header;
+ DictionaryPageHeader dictionary_page_header;
+ DataPageHeaderV2 data_page_header_v2;
+
+ _PageHeader__isset __isset;
+
+ void __set_type(const PageType::type val);
+
+ void __set_uncompressed_page_size(const int32_t val);
+
+ void __set_compressed_page_size(const int32_t val);
+
+ void __set_crc(const int32_t val);
+
+ void __set_data_page_header(const DataPageHeader& val);
+
+ void __set_index_page_header(const IndexPageHeader& val);
+
+ void __set_dictionary_page_header(const DictionaryPageHeader& val);
+
+ void __set_data_page_header_v2(const DataPageHeaderV2& val);
+
+ bool operator == (const PageHeader & rhs) const
+ {
+ if (!(type == rhs.type))
+ return false;
+ if (!(uncompressed_page_size == rhs.uncompressed_page_size))
+ return false;
+ if (!(compressed_page_size == rhs.compressed_page_size))
+ return false;
+ if (__isset.crc != rhs.__isset.crc)
+ return false;
+ else if (__isset.crc && !(crc == rhs.crc))
+ return false;
+ if (__isset.data_page_header != rhs.__isset.data_page_header)
+ return false;
+ else if (__isset.data_page_header && !(data_page_header == rhs.data_page_header))
+ return false;
+ if (__isset.index_page_header != rhs.__isset.index_page_header)
+ return false;
+ else if (__isset.index_page_header && !(index_page_header == rhs.index_page_header))
+ return false;
+ if (__isset.dictionary_page_header != rhs.__isset.dictionary_page_header)
+ return false;
+ else if (__isset.dictionary_page_header && !(dictionary_page_header == rhs.dictionary_page_header))
+ return false;
+ if (__isset.data_page_header_v2 != rhs.__isset.data_page_header_v2)
+ return false;
+ else if (__isset.data_page_header_v2 && !(data_page_header_v2 == rhs.data_page_header_v2))
+ return false;
+ return true;
+ }
+ bool operator != (const PageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const PageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(PageHeader &a, PageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const PageHeader& obj);
+
+typedef struct _KeyValue__isset {
+ _KeyValue__isset() : value(false) {}
+ bool value :1;
+} _KeyValue__isset;
+
+class KeyValue : public virtual ::apache::thrift::TBase {
+ public:
+
+ KeyValue(const KeyValue&);
+ KeyValue& operator=(const KeyValue&);
+ KeyValue() : key(), value() {
+ }
+
+ virtual ~KeyValue() noexcept;
+ std::string key;
+ std::string value;
+
+ _KeyValue__isset __isset;
+
+ void __set_key(const std::string& val);
+
+ void __set_value(const std::string& val);
+
+ bool operator == (const KeyValue & rhs) const
+ {
+ if (!(key == rhs.key))
+ return false;
+ if (__isset.value != rhs.__isset.value)
+ return false;
+ else if (__isset.value && !(value == rhs.value))
+ return false;
+ return true;
+ }
+ bool operator != (const KeyValue &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const KeyValue & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(KeyValue &a, KeyValue &b);
+
+std::ostream& operator<<(std::ostream& out, const KeyValue& obj);
+
+
+class SortingColumn : public virtual ::apache::thrift::TBase {
+ public:
+
+ SortingColumn(const SortingColumn&);
+ SortingColumn& operator=(const SortingColumn&);
+ SortingColumn() : column_idx(0), descending(0), nulls_first(0) {
+ }
+
+ virtual ~SortingColumn() noexcept;
+ int32_t column_idx;
+ bool descending;
+ bool nulls_first;
+
+ void __set_column_idx(const int32_t val);
+
+ void __set_descending(const bool val);
+
+ void __set_nulls_first(const bool val);
+
+ bool operator == (const SortingColumn & rhs) const
+ {
+ if (!(column_idx == rhs.column_idx))
+ return false;
+ if (!(descending == rhs.descending))
+ return false;
+ if (!(nulls_first == rhs.nulls_first))
+ return false;
+ return true;
+ }
+ bool operator != (const SortingColumn &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const SortingColumn & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(SortingColumn &a, SortingColumn &b);
+
+std::ostream& operator<<(std::ostream& out, const SortingColumn& obj);
+
+
+class PageEncodingStats : public virtual ::apache::thrift::TBase {
+ public:
+
+ PageEncodingStats(const PageEncodingStats&);
+ PageEncodingStats& operator=(const PageEncodingStats&);
+ PageEncodingStats() : page_type((PageType::type)0), encoding((Encoding::type)0), count(0) {
+ }
+
+ virtual ~PageEncodingStats() noexcept;
+ PageType::type page_type;
+ Encoding::type encoding;
+ int32_t count;
+
+ void __set_page_type(const PageType::type val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_count(const int32_t val);
+
+ bool operator == (const PageEncodingStats & rhs) const
+ {
+ if (!(page_type == rhs.page_type))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (!(count == rhs.count))
+ return false;
+ return true;
+ }
+ bool operator != (const PageEncodingStats &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const PageEncodingStats & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(PageEncodingStats &a, PageEncodingStats &b);
+
+std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj);
+
+typedef struct _ColumnMetaData__isset {
+ _ColumnMetaData__isset() : key_value_metadata(false), index_page_offset(false), dictionary_page_offset(false), statistics(false), encoding_stats(false), bloom_filter_offset(false) {}
+ bool key_value_metadata :1;
+ bool index_page_offset :1;
+ bool dictionary_page_offset :1;
+ bool statistics :1;
+ bool encoding_stats :1;
+ bool bloom_filter_offset :1;
+} _ColumnMetaData__isset;
+
+class ColumnMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnMetaData(const ColumnMetaData&);
+ ColumnMetaData& operator=(const ColumnMetaData&);
+ ColumnMetaData() : type((Type::type)0), codec((CompressionCodec::type)0), num_values(0), total_uncompressed_size(0), total_compressed_size(0), data_page_offset(0), index_page_offset(0), dictionary_page_offset(0), bloom_filter_offset(0) {
+ }
+
+ virtual ~ColumnMetaData() noexcept;
+ Type::type type;
+ std::vector<Encoding::type> encodings;
+ std::vector<std::string> path_in_schema;
+ CompressionCodec::type codec;
+ int64_t num_values;
+ int64_t total_uncompressed_size;
+ int64_t total_compressed_size;
+ std::vector<KeyValue> key_value_metadata;
+ int64_t data_page_offset;
+ int64_t index_page_offset;
+ int64_t dictionary_page_offset;
+ Statistics statistics;
+ std::vector<PageEncodingStats> encoding_stats;
+ int64_t bloom_filter_offset;
+
+ _ColumnMetaData__isset __isset;
+
+ void __set_type(const Type::type val);
+
+ void __set_encodings(const std::vector<Encoding::type> & val);
+
+ void __set_path_in_schema(const std::vector<std::string> & val);
+
+ void __set_codec(const CompressionCodec::type val);
+
+ void __set_num_values(const int64_t val);
+
+ void __set_total_uncompressed_size(const int64_t val);
+
+ void __set_total_compressed_size(const int64_t val);
+
+ void __set_key_value_metadata(const std::vector<KeyValue> & val);
+
+ void __set_data_page_offset(const int64_t val);
+
+ void __set_index_page_offset(const int64_t val);
+
+ void __set_dictionary_page_offset(const int64_t val);
+
+ void __set_statistics(const Statistics& val);
+
+ void __set_encoding_stats(const std::vector<PageEncodingStats> & val);
+
+ void __set_bloom_filter_offset(const int64_t val);
+
+ bool operator == (const ColumnMetaData & rhs) const
+ {
+ if (!(type == rhs.type))
+ return false;
+ if (!(encodings == rhs.encodings))
+ return false;
+ if (!(path_in_schema == rhs.path_in_schema))
+ return false;
+ if (!(codec == rhs.codec))
+ return false;
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(total_uncompressed_size == rhs.total_uncompressed_size))
+ return false;
+ if (!(total_compressed_size == rhs.total_compressed_size))
+ return false;
+ if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
+ return false;
+ else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
+ return false;
+ if (!(data_page_offset == rhs.data_page_offset))
+ return false;
+ if (__isset.index_page_offset != rhs.__isset.index_page_offset)
+ return false;
+ else if (__isset.index_page_offset && !(index_page_offset == rhs.index_page_offset))
+ return false;
+ if (__isset.dictionary_page_offset != rhs.__isset.dictionary_page_offset)
+ return false;
+ else if (__isset.dictionary_page_offset && !(dictionary_page_offset == rhs.dictionary_page_offset))
+ return false;
+ if (__isset.statistics != rhs.__isset.statistics)
+ return false;
+ else if (__isset.statistics && !(statistics == rhs.statistics))
+ return false;
+ if (__isset.encoding_stats != rhs.__isset.encoding_stats)
+ return false;
+ else if (__isset.encoding_stats && !(encoding_stats == rhs.encoding_stats))
+ return false;
+ if (__isset.bloom_filter_offset != rhs.__isset.bloom_filter_offset)
+ return false;
+ else if (__isset.bloom_filter_offset && !(bloom_filter_offset == rhs.bloom_filter_offset))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnMetaData &a, ColumnMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj);
+
+
+class EncryptionWithFooterKey : public virtual ::apache::thrift::TBase {
+ public:
+
+ EncryptionWithFooterKey(const EncryptionWithFooterKey&);
+ EncryptionWithFooterKey& operator=(const EncryptionWithFooterKey&);
+ EncryptionWithFooterKey() {
+ }
+
+ virtual ~EncryptionWithFooterKey() noexcept;
+
+ bool operator == (const EncryptionWithFooterKey & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const EncryptionWithFooterKey &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EncryptionWithFooterKey & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b);
+
+std::ostream& operator<<(std::ostream& out, const EncryptionWithFooterKey& obj);
+
+typedef struct _EncryptionWithColumnKey__isset {
+ _EncryptionWithColumnKey__isset() : key_metadata(false) {}
+ bool key_metadata :1;
+} _EncryptionWithColumnKey__isset;
+
+class EncryptionWithColumnKey : public virtual ::apache::thrift::TBase {
+ public:
+
+ EncryptionWithColumnKey(const EncryptionWithColumnKey&);
+ EncryptionWithColumnKey& operator=(const EncryptionWithColumnKey&);
+ EncryptionWithColumnKey() : key_metadata() {
+ }
+
+ virtual ~EncryptionWithColumnKey() noexcept;
+ std::vector<std::string> path_in_schema;
+ std::string key_metadata;
+
+ _EncryptionWithColumnKey__isset __isset;
+
+ void __set_path_in_schema(const std::vector<std::string> & val);
+
+ void __set_key_metadata(const std::string& val);
+
+ bool operator == (const EncryptionWithColumnKey & rhs) const
+ {
+ if (!(path_in_schema == rhs.path_in_schema))
+ return false;
+ if (__isset.key_metadata != rhs.__isset.key_metadata)
+ return false;
+ else if (__isset.key_metadata && !(key_metadata == rhs.key_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const EncryptionWithColumnKey &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EncryptionWithColumnKey & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b);
+
+std::ostream& operator<<(std::ostream& out, const EncryptionWithColumnKey& obj);
+
+typedef struct _ColumnCryptoMetaData__isset {
+ _ColumnCryptoMetaData__isset() : ENCRYPTION_WITH_FOOTER_KEY(false), ENCRYPTION_WITH_COLUMN_KEY(false) {}
+ bool ENCRYPTION_WITH_FOOTER_KEY :1;
+ bool ENCRYPTION_WITH_COLUMN_KEY :1;
+} _ColumnCryptoMetaData__isset;
+
+class ColumnCryptoMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnCryptoMetaData(const ColumnCryptoMetaData&);
+ ColumnCryptoMetaData& operator=(const ColumnCryptoMetaData&);
+ ColumnCryptoMetaData() {
+ }
+
+ virtual ~ColumnCryptoMetaData() noexcept;
+ EncryptionWithFooterKey ENCRYPTION_WITH_FOOTER_KEY;
+ EncryptionWithColumnKey ENCRYPTION_WITH_COLUMN_KEY;
+
+ _ColumnCryptoMetaData__isset __isset;
+
+ void __set_ENCRYPTION_WITH_FOOTER_KEY(const EncryptionWithFooterKey& val);
+
+ void __set_ENCRYPTION_WITH_COLUMN_KEY(const EncryptionWithColumnKey& val);
+
+ bool operator == (const ColumnCryptoMetaData & rhs) const
+ {
+ if (__isset.ENCRYPTION_WITH_FOOTER_KEY != rhs.__isset.ENCRYPTION_WITH_FOOTER_KEY)
+ return false;
+ else if (__isset.ENCRYPTION_WITH_FOOTER_KEY && !(ENCRYPTION_WITH_FOOTER_KEY == rhs.ENCRYPTION_WITH_FOOTER_KEY))
+ return false;
+ if (__isset.ENCRYPTION_WITH_COLUMN_KEY != rhs.__isset.ENCRYPTION_WITH_COLUMN_KEY)
+ return false;
+ else if (__isset.ENCRYPTION_WITH_COLUMN_KEY && !(ENCRYPTION_WITH_COLUMN_KEY == rhs.ENCRYPTION_WITH_COLUMN_KEY))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnCryptoMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnCryptoMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnCryptoMetaData& obj);
+
+typedef struct _ColumnChunk__isset {
+ _ColumnChunk__isset() : file_path(false), meta_data(false), offset_index_offset(false), offset_index_length(false), column_index_offset(false), column_index_length(false), crypto_metadata(false), encrypted_column_metadata(false) {}
+ bool file_path :1;
+ bool meta_data :1;
+ bool offset_index_offset :1;
+ bool offset_index_length :1;
+ bool column_index_offset :1;
+ bool column_index_length :1;
+ bool crypto_metadata :1;
+ bool encrypted_column_metadata :1;
+} _ColumnChunk__isset;
+
+class ColumnChunk : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnChunk(const ColumnChunk&);
+ ColumnChunk& operator=(const ColumnChunk&);
+ ColumnChunk() : file_path(), file_offset(0), offset_index_offset(0), offset_index_length(0), column_index_offset(0), column_index_length(0), encrypted_column_metadata() {
+ }
+
+ virtual ~ColumnChunk() noexcept;
+ std::string file_path;
+ int64_t file_offset;
+ ColumnMetaData meta_data;
+ int64_t offset_index_offset;
+ int32_t offset_index_length;
+ int64_t column_index_offset;
+ int32_t column_index_length;
+ ColumnCryptoMetaData crypto_metadata;
+ std::string encrypted_column_metadata;
+
+ _ColumnChunk__isset __isset;
+
+ void __set_file_path(const std::string& val);
+
+ void __set_file_offset(const int64_t val);
+
+ void __set_meta_data(const ColumnMetaData& val);
+
+ void __set_offset_index_offset(const int64_t val);
+
+ void __set_offset_index_length(const int32_t val);
+
+ void __set_column_index_offset(const int64_t val);
+
+ void __set_column_index_length(const int32_t val);
+
+ void __set_crypto_metadata(const ColumnCryptoMetaData& val);
+
+ void __set_encrypted_column_metadata(const std::string& val);
+
+ bool operator == (const ColumnChunk & rhs) const
+ {
+ if (__isset.file_path != rhs.__isset.file_path)
+ return false;
+ else if (__isset.file_path && !(file_path == rhs.file_path))
+ return false;
+ if (!(file_offset == rhs.file_offset))
+ return false;
+ if (__isset.meta_data != rhs.__isset.meta_data)
+ return false;
+ else if (__isset.meta_data && !(meta_data == rhs.meta_data))
+ return false;
+ if (__isset.offset_index_offset != rhs.__isset.offset_index_offset)
+ return false;
+ else if (__isset.offset_index_offset && !(offset_index_offset == rhs.offset_index_offset))
+ return false;
+ if (__isset.offset_index_length != rhs.__isset.offset_index_length)
+ return false;
+ else if (__isset.offset_index_length && !(offset_index_length == rhs.offset_index_length))
+ return false;
+ if (__isset.column_index_offset != rhs.__isset.column_index_offset)
+ return false;
+ else if (__isset.column_index_offset && !(column_index_offset == rhs.column_index_offset))
+ return false;
+ if (__isset.column_index_length != rhs.__isset.column_index_length)
+ return false;
+ else if (__isset.column_index_length && !(column_index_length == rhs.column_index_length))
+ return false;
+ if (__isset.crypto_metadata != rhs.__isset.crypto_metadata)
+ return false;
+ else if (__isset.crypto_metadata && !(crypto_metadata == rhs.crypto_metadata))
+ return false;
+ if (__isset.encrypted_column_metadata != rhs.__isset.encrypted_column_metadata)
+ return false;
+ else if (__isset.encrypted_column_metadata && !(encrypted_column_metadata == rhs.encrypted_column_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnChunk &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnChunk & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnChunk &a, ColumnChunk &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnChunk& obj);
+
+typedef struct _RowGroup__isset {
+ _RowGroup__isset() : sorting_columns(false), file_offset(false), total_compressed_size(false), ordinal(false) {}
+ bool sorting_columns :1;
+ bool file_offset :1;
+ bool total_compressed_size :1;
+ bool ordinal :1;
+} _RowGroup__isset;
+
+class RowGroup : public virtual ::apache::thrift::TBase {
+ public:
+
+ RowGroup(const RowGroup&);
+ RowGroup& operator=(const RowGroup&);
+ RowGroup() : total_byte_size(0), num_rows(0), file_offset(0), total_compressed_size(0), ordinal(0) {
+ }
+
+ virtual ~RowGroup() noexcept;
+ std::vector<ColumnChunk> columns;
+ int64_t total_byte_size;
+ int64_t num_rows;
+ std::vector<SortingColumn> sorting_columns;
+ int64_t file_offset;
+ int64_t total_compressed_size;
+ int16_t ordinal;
+
+ _RowGroup__isset __isset;
+
+ void __set_columns(const std::vector<ColumnChunk> & val);
+
+ void __set_total_byte_size(const int64_t val);
+
+ void __set_num_rows(const int64_t val);
+
+ void __set_sorting_columns(const std::vector<SortingColumn> & val);
+
+ void __set_file_offset(const int64_t val);
+
+ void __set_total_compressed_size(const int64_t val);
+
+ void __set_ordinal(const int16_t val);
+
+ bool operator == (const RowGroup & rhs) const
+ {
+ if (!(columns == rhs.columns))
+ return false;
+ if (!(total_byte_size == rhs.total_byte_size))
+ return false;
+ if (!(num_rows == rhs.num_rows))
+ return false;
+ if (__isset.sorting_columns != rhs.__isset.sorting_columns)
+ return false;
+ else if (__isset.sorting_columns && !(sorting_columns == rhs.sorting_columns))
+ return false;
+ if (__isset.file_offset != rhs.__isset.file_offset)
+ return false;
+ else if (__isset.file_offset && !(file_offset == rhs.file_offset))
+ return false;
+ if (__isset.total_compressed_size != rhs.__isset.total_compressed_size)
+ return false;
+ else if (__isset.total_compressed_size && !(total_compressed_size == rhs.total_compressed_size))
+ return false;
+ if (__isset.ordinal != rhs.__isset.ordinal)
+ return false;
+ else if (__isset.ordinal && !(ordinal == rhs.ordinal))
+ return false;
+ return true;
+ }
+ bool operator != (const RowGroup &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const RowGroup & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(RowGroup &a, RowGroup &b);
+
+std::ostream& operator<<(std::ostream& out, const RowGroup& obj);
+
+
+class TypeDefinedOrder : public virtual ::apache::thrift::TBase {
+ public:
+
+ TypeDefinedOrder(const TypeDefinedOrder&);
+ TypeDefinedOrder& operator=(const TypeDefinedOrder&);
+ TypeDefinedOrder() {
+ }
+
+ virtual ~TypeDefinedOrder() noexcept;
+
+ bool operator == (const TypeDefinedOrder & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const TypeDefinedOrder &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TypeDefinedOrder & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TypeDefinedOrder &a, TypeDefinedOrder &b);
+
+std::ostream& operator<<(std::ostream& out, const TypeDefinedOrder& obj);
+
+typedef struct _ColumnOrder__isset {
+ _ColumnOrder__isset() : TYPE_ORDER(false) {}
+ bool TYPE_ORDER :1;
+} _ColumnOrder__isset;
+
+class ColumnOrder : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnOrder(const ColumnOrder&);
+ ColumnOrder& operator=(const ColumnOrder&);
+ ColumnOrder() {
+ }
+
+ virtual ~ColumnOrder() noexcept;
+ TypeDefinedOrder TYPE_ORDER;
+
+ _ColumnOrder__isset __isset;
+
+ void __set_TYPE_ORDER(const TypeDefinedOrder& val);
+
+ bool operator == (const ColumnOrder & rhs) const
+ {
+ if (__isset.TYPE_ORDER != rhs.__isset.TYPE_ORDER)
+ return false;
+ else if (__isset.TYPE_ORDER && !(TYPE_ORDER == rhs.TYPE_ORDER))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnOrder &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnOrder & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnOrder &a, ColumnOrder &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnOrder& obj);
+
+
+class PageLocation : public virtual ::apache::thrift::TBase {
+ public:
+
+ PageLocation(const PageLocation&);
+ PageLocation& operator=(const PageLocation&);
+ PageLocation() : offset(0), compressed_page_size(0), first_row_index(0) {
+ }
+
+ virtual ~PageLocation() noexcept;
+ int64_t offset;
+ int32_t compressed_page_size;
+ int64_t first_row_index;
+
+ void __set_offset(const int64_t val);
+
+ void __set_compressed_page_size(const int32_t val);
+
+ void __set_first_row_index(const int64_t val);
+
+ bool operator == (const PageLocation & rhs) const
+ {
+ if (!(offset == rhs.offset))
+ return false;
+ if (!(compressed_page_size == rhs.compressed_page_size))
+ return false;
+ if (!(first_row_index == rhs.first_row_index))
+ return false;
+ return true;
+ }
+ bool operator != (const PageLocation &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const PageLocation & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(PageLocation &a, PageLocation &b);
+
+std::ostream& operator<<(std::ostream& out, const PageLocation& obj);
+
+
+class OffsetIndex : public virtual ::apache::thrift::TBase {
+ public:
+
+ OffsetIndex(const OffsetIndex&);
+ OffsetIndex& operator=(const OffsetIndex&);
+ OffsetIndex() {
+ }
+
+ virtual ~OffsetIndex() noexcept;
+ std::vector<PageLocation> page_locations;
+
+ void __set_page_locations(const std::vector<PageLocation> & val);
+
+ bool operator == (const OffsetIndex & rhs) const
+ {
+ if (!(page_locations == rhs.page_locations))
+ return false;
+ return true;
+ }
+ bool operator != (const OffsetIndex &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const OffsetIndex & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(OffsetIndex &a, OffsetIndex &b);
+
+std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj);
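+
+// Illustrative note (not part of the generated code): an OffsetIndex enables
+// page-level seeks within a column chunk: page_locations[k].offset is the
+// file offset of the k-th data page, compressed_page_size its on-disk size,
+// and first_row_index the index of the page's first row within the row group.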
+
+typedef struct _ColumnIndex__isset {
+ _ColumnIndex__isset() : null_counts(false) {}
+ bool null_counts :1;
+} _ColumnIndex__isset;
+
+class ColumnIndex : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnIndex(const ColumnIndex&);
+ ColumnIndex& operator=(const ColumnIndex&);
+ ColumnIndex() : boundary_order((BoundaryOrder::type)0) {
+ }
+
+ virtual ~ColumnIndex() noexcept;
+ std::vector<bool> null_pages;
+ std::vector<std::string> min_values;
+ std::vector<std::string> max_values;
+ BoundaryOrder::type boundary_order;
+ std::vector<int64_t> null_counts;
+
+ _ColumnIndex__isset __isset;
+
+ void __set_null_pages(const std::vector<bool> & val);
+
+ void __set_min_values(const std::vector<std::string> & val);
+
+ void __set_max_values(const std::vector<std::string> & val);
+
+ void __set_boundary_order(const BoundaryOrder::type val);
+
+ void __set_null_counts(const std::vector<int64_t> & val);
+
+ bool operator == (const ColumnIndex & rhs) const
+ {
+ if (!(null_pages == rhs.null_pages))
+ return false;
+ if (!(min_values == rhs.min_values))
+ return false;
+ if (!(max_values == rhs.max_values))
+ return false;
+ if (!(boundary_order == rhs.boundary_order))
+ return false;
+ if (__isset.null_counts != rhs.__isset.null_counts)
+ return false;
+ else if (__isset.null_counts && !(null_counts == rhs.null_counts))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnIndex &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnIndex & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnIndex &a, ColumnIndex &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj);
+
+typedef struct _AesGcmV1__isset {
+ _AesGcmV1__isset() : aad_prefix(false), aad_file_unique(false), supply_aad_prefix(false) {}
+ bool aad_prefix :1;
+ bool aad_file_unique :1;
+ bool supply_aad_prefix :1;
+} _AesGcmV1__isset;
+
+class AesGcmV1 : public virtual ::apache::thrift::TBase {
+ public:
+
+ AesGcmV1(const AesGcmV1&);
+ AesGcmV1& operator=(const AesGcmV1&);
+ AesGcmV1() : aad_prefix(), aad_file_unique(), supply_aad_prefix(0) {
+ }
+
+ virtual ~AesGcmV1() noexcept;
+ std::string aad_prefix;
+ std::string aad_file_unique;
+ bool supply_aad_prefix;
+
+ _AesGcmV1__isset __isset;
+
+ void __set_aad_prefix(const std::string& val);
+
+ void __set_aad_file_unique(const std::string& val);
+
+ void __set_supply_aad_prefix(const bool val);
+
+ bool operator == (const AesGcmV1 & rhs) const
+ {
+ if (__isset.aad_prefix != rhs.__isset.aad_prefix)
+ return false;
+ else if (__isset.aad_prefix && !(aad_prefix == rhs.aad_prefix))
+ return false;
+ if (__isset.aad_file_unique != rhs.__isset.aad_file_unique)
+ return false;
+ else if (__isset.aad_file_unique && !(aad_file_unique == rhs.aad_file_unique))
+ return false;
+ if (__isset.supply_aad_prefix != rhs.__isset.supply_aad_prefix)
+ return false;
+ else if (__isset.supply_aad_prefix && !(supply_aad_prefix == rhs.supply_aad_prefix))
+ return false;
+ return true;
+ }
+ bool operator != (const AesGcmV1 &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const AesGcmV1 & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(AesGcmV1 &a, AesGcmV1 &b);
+
+std::ostream& operator<<(std::ostream& out, const AesGcmV1& obj);
+
+typedef struct _AesGcmCtrV1__isset {
+ _AesGcmCtrV1__isset() : aad_prefix(false), aad_file_unique(false), supply_aad_prefix(false) {}
+ bool aad_prefix :1;
+ bool aad_file_unique :1;
+ bool supply_aad_prefix :1;
+} _AesGcmCtrV1__isset;
+
+class AesGcmCtrV1 : public virtual ::apache::thrift::TBase {
+ public:
+
+ AesGcmCtrV1(const AesGcmCtrV1&);
+ AesGcmCtrV1& operator=(const AesGcmCtrV1&);
+ AesGcmCtrV1() : aad_prefix(), aad_file_unique(), supply_aad_prefix(0) {
+ }
+
+ virtual ~AesGcmCtrV1() noexcept;
+ std::string aad_prefix;
+ std::string aad_file_unique;
+ bool supply_aad_prefix;
+
+ _AesGcmCtrV1__isset __isset;
+
+ void __set_aad_prefix(const std::string& val);
+
+ void __set_aad_file_unique(const std::string& val);
+
+ void __set_supply_aad_prefix(const bool val);
+
+ bool operator == (const AesGcmCtrV1 & rhs) const
+ {
+ if (__isset.aad_prefix != rhs.__isset.aad_prefix)
+ return false;
+ else if (__isset.aad_prefix && !(aad_prefix == rhs.aad_prefix))
+ return false;
+ if (__isset.aad_file_unique != rhs.__isset.aad_file_unique)
+ return false;
+ else if (__isset.aad_file_unique && !(aad_file_unique == rhs.aad_file_unique))
+ return false;
+ if (__isset.supply_aad_prefix != rhs.__isset.supply_aad_prefix)
+ return false;
+ else if (__isset.supply_aad_prefix && !(supply_aad_prefix == rhs.supply_aad_prefix))
+ return false;
+ return true;
+ }
+ bool operator != (const AesGcmCtrV1 &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const AesGcmCtrV1 & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b);
+
+std::ostream& operator<<(std::ostream& out, const AesGcmCtrV1& obj);
+
+typedef struct _EncryptionAlgorithm__isset {
+ _EncryptionAlgorithm__isset() : AES_GCM_V1(false), AES_GCM_CTR_V1(false) {}
+ bool AES_GCM_V1 :1;
+ bool AES_GCM_CTR_V1 :1;
+} _EncryptionAlgorithm__isset;
+
+class EncryptionAlgorithm : public virtual ::apache::thrift::TBase {
+ public:
+
+ EncryptionAlgorithm(const EncryptionAlgorithm&);
+ EncryptionAlgorithm& operator=(const EncryptionAlgorithm&);
+ EncryptionAlgorithm() {
+ }
+
+ virtual ~EncryptionAlgorithm() noexcept;
+ AesGcmV1 AES_GCM_V1;
+ AesGcmCtrV1 AES_GCM_CTR_V1;
+
+ _EncryptionAlgorithm__isset __isset;
+
+ void __set_AES_GCM_V1(const AesGcmV1& val);
+
+ void __set_AES_GCM_CTR_V1(const AesGcmCtrV1& val);
+
+ bool operator == (const EncryptionAlgorithm & rhs) const
+ {
+ if (__isset.AES_GCM_V1 != rhs.__isset.AES_GCM_V1)
+ return false;
+ else if (__isset.AES_GCM_V1 && !(AES_GCM_V1 == rhs.AES_GCM_V1))
+ return false;
+ if (__isset.AES_GCM_CTR_V1 != rhs.__isset.AES_GCM_CTR_V1)
+ return false;
+ else if (__isset.AES_GCM_CTR_V1 && !(AES_GCM_CTR_V1 == rhs.AES_GCM_CTR_V1))
+ return false;
+ return true;
+ }
+ bool operator != (const EncryptionAlgorithm &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EncryptionAlgorithm & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b);
+
+std::ostream& operator<<(std::ostream& out, const EncryptionAlgorithm& obj);
+
+typedef struct _FileMetaData__isset {
+ _FileMetaData__isset() : key_value_metadata(false), created_by(false), column_orders(false), encryption_algorithm(false), footer_signing_key_metadata(false) {}
+ bool key_value_metadata :1;
+ bool created_by :1;
+ bool column_orders :1;
+ bool encryption_algorithm :1;
+ bool footer_signing_key_metadata :1;
+} _FileMetaData__isset;
+
+class FileMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ FileMetaData(const FileMetaData&);
+ FileMetaData& operator=(const FileMetaData&);
+ FileMetaData() : version(0), num_rows(0), created_by(), footer_signing_key_metadata() {
+ }
+
+ virtual ~FileMetaData() noexcept;
+ int32_t version;
+ std::vector<SchemaElement> schema;
+ int64_t num_rows;
+ std::vector<RowGroup> row_groups;
+ std::vector<KeyValue> key_value_metadata;
+ std::string created_by;
+ std::vector<ColumnOrder> column_orders;
+ EncryptionAlgorithm encryption_algorithm;
+ std::string footer_signing_key_metadata;
+
+ _FileMetaData__isset __isset;
+
+ void __set_version(const int32_t val);
+
+ void __set_schema(const std::vector<SchemaElement> & val);
+
+ void __set_num_rows(const int64_t val);
+
+ void __set_row_groups(const std::vector<RowGroup> & val);
+
+ void __set_key_value_metadata(const std::vector<KeyValue> & val);
+
+ void __set_created_by(const std::string& val);
+
+ void __set_column_orders(const std::vector<ColumnOrder> & val);
+
+ void __set_encryption_algorithm(const EncryptionAlgorithm& val);
+
+ void __set_footer_signing_key_metadata(const std::string& val);
+
+ bool operator == (const FileMetaData & rhs) const
+ {
+ if (!(version == rhs.version))
+ return false;
+ if (!(schema == rhs.schema))
+ return false;
+ if (!(num_rows == rhs.num_rows))
+ return false;
+ if (!(row_groups == rhs.row_groups))
+ return false;
+ if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
+ return false;
+ else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
+ return false;
+ if (__isset.created_by != rhs.__isset.created_by)
+ return false;
+ else if (__isset.created_by && !(created_by == rhs.created_by))
+ return false;
+ if (__isset.column_orders != rhs.__isset.column_orders)
+ return false;
+ else if (__isset.column_orders && !(column_orders == rhs.column_orders))
+ return false;
+ if (__isset.encryption_algorithm != rhs.__isset.encryption_algorithm)
+ return false;
+ else if (__isset.encryption_algorithm && !(encryption_algorithm == rhs.encryption_algorithm))
+ return false;
+ if (__isset.footer_signing_key_metadata != rhs.__isset.footer_signing_key_metadata)
+ return false;
+ else if (__isset.footer_signing_key_metadata && !(footer_signing_key_metadata == rhs.footer_signing_key_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const FileMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const FileMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(FileMetaData &a, FileMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const FileMetaData& obj);
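+
+// Illustrative usage sketch (not part of the generated code): Parquet footers
+// are serialized with Thrift's compact protocol, so a FileMetaData can be
+// deserialized from a raw footer buffer along these lines (buffer and length
+// are assumed to come from the caller):
+//
+//   auto transport = std::make_shared<
+//       apache::thrift::transport::TMemoryBuffer>(buffer, length);
+//   apache::thrift::protocol::TCompactProtocol protocol(transport);
+//   FileMetaData metadata;
+//   metadata.read(&protocol);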
+
+typedef struct _FileCryptoMetaData__isset {
+ _FileCryptoMetaData__isset() : key_metadata(false) {}
+ bool key_metadata :1;
+} _FileCryptoMetaData__isset;
+
+class FileCryptoMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ FileCryptoMetaData(const FileCryptoMetaData&);
+ FileCryptoMetaData& operator=(const FileCryptoMetaData&);
+ FileCryptoMetaData() : key_metadata() {
+ }
+
+ virtual ~FileCryptoMetaData() noexcept;
+ EncryptionAlgorithm encryption_algorithm;
+ std::string key_metadata;
+
+ _FileCryptoMetaData__isset __isset;
+
+ void __set_encryption_algorithm(const EncryptionAlgorithm& val);
+
+ void __set_key_metadata(const std::string& val);
+
+ bool operator == (const FileCryptoMetaData & rhs) const
+ {
+ if (!(encryption_algorithm == rhs.encryption_algorithm))
+ return false;
+ if (__isset.key_metadata != rhs.__isset.key_metadata)
+ return false;
+ else if (__isset.key_metadata && !(key_metadata == rhs.key_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const FileCryptoMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const FileCryptoMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(FileCryptoMetaData &a, FileCryptoMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj);
+
+}} // namespace
+
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/README b/contrib/libs/apache/arrow/cpp/src/parquet/README
new file mode 100644
index 00000000000..fc16a46ca08
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/README
@@ -0,0 +1,10 @@
+The CompatibilityTest in bloom_filter-test.cc tests cross-compatibility of
+Bloom filters between parquet-mr and parquet-cpp. It reads a Bloom filter binary
+generated by the Bloom filter class in the parquet-mr project and checks that the
+values inserted earlier are reported as possibly present.
+
+The Bloom filter binary is generated by parquet-mr in three steps:
+Step 1: Construct a Bloom filter with a 1024-byte bitset.
+Step 2: Insert the hashes of the strings "hello", "parquet", "bloom", and "filter"
+into the Bloom filter by calling its hash and insert APIs.
+Step 3: Call the writeTo API to write the filter to a file.
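+
+For reference, the C++ side of the CompatibilityTest then checks membership
+against the deserialized filter. A minimal sketch, assuming the
+BlockSplitBloomFilter API from parquet/bloom_filter.h (names approximate):
+
+  parquet::BlockSplitBloomFilter filter;
+  filter.Init(bitset, /*num_bytes=*/1024);  // bitset read from the binary file
+  parquet::ByteArray value(5, reinterpret_cast<const uint8_t*>("hello"));
+  bool maybe_present = filter.FindHash(filter.Hash(&value));  // expect true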
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc
new file mode 100644
index 00000000000..a51773c44d3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc
@@ -0,0 +1,900 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Overview.
+//
+// The strategy used by this code for repetition/definition levels
+// is to dissect the top level array into a list of paths
+// from the top level array to the final primitive (possibly
+// dictionary encoded) array. It then evaluates each one of
+// those paths to produce results for the callback iteratively.
+//
+// This approach was taken to reduce the aggregate memory required if we were
+// to build all def/rep levels in parallel as part of a tree traversal. It
+// also allows for straightforward parallelization at the path level if that is
+// desired in the future.
+//
+// The main downside to this approach is it duplicates effort for nodes
+// that share common ancestors. This can be mitigated to some degree
+// by adding in optimizations that detect leaf arrays that share
+// the same common list ancestor and reuse the repetition levels
+// from the first leaf encountered (only definition levels greater than
+// the list ancestor's need to be re-evaluated). This is left for future
+// work.
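+//
+// As an illustration (not from the original comment): an array of type
+// struct<a: int32, b: list<int64>> is dissected into two paths, one per leaf:
+//   path 1: Nullable(struct) -> Nullable(a) -> int32 leaf
+//   path 2: Nullable(struct) -> Nullable(b) -> List -> int64 leaf
+// Each path is then evaluated independently to produce rep/def levels for
+// its leaf column.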
+//
+// Algorithm.
+//
+// As mentioned above this code dissects arrays into constituent parts:
+// nullability data, and list offset data. It tries to optimize for
+// some special cases, where it is known ahead of time that a step
+// can be skipped (e.g. a nullable array happens to have all of its
+// values present) or batch filled (a nullable array has all null values).
+// One further optimization that is not implemented but could be done
+// in the future is special handling for nested list arrays that
+// have some intermediate data which indicates the final array contains only
+// nulls.
+//
+// In general, the algorithm attempts to batch work at each node as much
+// as possible. For nullability nodes this means finding runs of null
+// values and batch filling those interspersed with finding runs of non-null values
+// to process in batch at the next column.
+//
+// Similarly, runs of empty lists are all processed in one batch
+// followed by either:
+// - A single list entry for non-terminal lists (i.e. the upper part of a nested list)
+// - Runs of non-empty lists for the terminal list (i.e. the lowest part of a nested
+// list).
+//
+// This makes use of the following observations.
+// 1. Null values at any node on the path are terminal (repetition and definition
+// level can be set directly when a Null value is encountered).
+// 2. Empty lists share this eager termination property with Null values.
+// 3. In order to keep repetition/definition levels populated the algorithm is lazy
+//    in assigning repetition levels. The algorithm tracks whether it is currently
+//    in the middle of a list by comparing the lengths of repetition/definition levels.
+//    If it is currently in the middle of a list, the number of repetition levels
+//    populated will be greater than definition levels (the start of a List requires
+//    adding the first element). If there are equal numbers of definition and repetition
+//    levels populated, this indicates a list is waiting to be started and the next list
+//    encountered will have its repetition level signify the beginning of the list
+//    (see the worked example below).
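+//
+// Worked example (illustrative, not from the original comment): for a
+// nullable list<int32 not null> column (max rep level 1, max def level 2)
+// with values [[1, 2], [], null, [3]], the emitted levels are:
+//   (rep=0, def=2) (rep=1, def=2)   <- [1, 2]
+//   (rep=0, def=1)                  <- [] (empty lists terminate eagerly)
+//   (rep=0, def=0)                  <- null (nulls terminate eagerly)
+//   (rep=0, def=2)                  <- [3]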
+//
+// Other implementation notes.
+//
+// This code hasn't been benchmarked (or assembly analyzed) but did the following
+// as optimizations (yes premature optimization is the root of all evil).
+// - This code does not use recursion, instead it constructs its own stack and manages
+// updating elements accordingly.
+// - It tries to avoid using Status for common return states.
+// - Avoids virtual dispatch in favor of if/else statements on a set of well known
+// classes.
+
+#include "parquet/arrow/path_internal.h"
+
+#include <atomic>
+#include <cstddef>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/extension_type.h"
+#include "arrow/memory_pool.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_visit.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/variant.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/properties.h"
+
+namespace parquet {
+namespace arrow {
+
+namespace {
+
+using ::arrow::Array;
+using ::arrow::Status;
+using ::arrow::TypedBufferBuilder;
+
+constexpr static int16_t kLevelNotSet = -1;
+
+/// \brief Simple result of iterating over a column to determine values.
+enum IterationResult {
+ /// Processing is done at this node. Move back up the path
+ /// to continue processing.
+ kDone = -1,
+ /// Move down towards the leaf for processing.
+ kNext = 1,
+ /// An error occurred while processing.
+ kError = 2
+};
+
+#define RETURN_IF_ERROR(iteration_result) \
+ do { \
+ if (ARROW_PREDICT_FALSE(iteration_result == kError)) { \
+ return iteration_result; \
+ } \
+ } while (false)
+
+int64_t LazyNullCount(const Array& array) { return array.data()->null_count.load(); }
+
+bool LazyNoNulls(const Array& array) {
+ int64_t null_count = LazyNullCount(array);
+ return null_count == 0 ||
+         // kUnknownNullCount comparison is needed to account
+ // for null arrays.
+ (null_count == ::arrow::kUnknownNullCount &&
+ array.null_bitmap_data() == nullptr);
+}
+
+struct PathWriteContext {
+ PathWriteContext(::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::ResizableBuffer> def_levels_buffer)
+ : rep_levels(pool), def_levels(std::move(def_levels_buffer), pool) {}
+ IterationResult ReserveDefLevels(int64_t elements) {
+ last_status = def_levels.Reserve(elements);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ IterationResult AppendDefLevel(int16_t def_level) {
+ last_status = def_levels.Append(def_level);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ IterationResult AppendDefLevels(int64_t count, int16_t def_level) {
+ last_status = def_levels.Append(count, def_level);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ void UnsafeAppendDefLevel(int16_t def_level) { def_levels.UnsafeAppend(def_level); }
+
+ IterationResult AppendRepLevel(int16_t rep_level) {
+ last_status = rep_levels.Append(rep_level);
+
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ IterationResult AppendRepLevels(int64_t count, int16_t rep_level) {
+ last_status = rep_levels.Append(count, rep_level);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ bool EqualRepDefLevelsLengths() const {
+ return rep_levels.length() == def_levels.length();
+ }
+
+  // Incorporates |range| into visited elements. If the |range| is contiguous
+  // with the last range, extend the last range; otherwise add |range| separately
+  // to the list.
+ void RecordPostListVisit(const ElementRange& range) {
+ if (!visited_elements.empty() && range.start == visited_elements.back().end) {
+ visited_elements.back().end = range.end;
+ return;
+ }
+ visited_elements.push_back(range);
+ }
+
+ Status last_status;
+ TypedBufferBuilder<int16_t> rep_levels;
+ TypedBufferBuilder<int16_t> def_levels;
+ std::vector<ElementRange> visited_elements;
+};
+
+IterationResult FillRepLevels(int64_t count, int16_t rep_level,
+ PathWriteContext* context) {
+ if (rep_level == kLevelNotSet) {
+ return kDone;
+ }
+ int64_t fill_count = count;
+  // This condition (the rep and def level counts being equal) occurs
+  // in a few cases:
+ // 1. Before any list is encountered.
+ // 2. After rep-level has been filled in due to null/empty
+ // values above it.
+ // 3. After finishing a list.
+ if (!context->EqualRepDefLevelsLengths()) {
+ fill_count--;
+ }
+ return context->AppendRepLevels(fill_count, rep_level);
+}
+
+// A node for handling an array that is discovered to have all
+// null elements. It is referred to as a TerminalNode because
+// traversal of nodes will not continue past it when generating
+// rep/def levels. However, there could be many nested children
+// elements beyond it in the Array that is being processed.
+class AllNullsTerminalNode {
+ public:
+ explicit AllNullsTerminalNode(int16_t def_level, int16_t rep_level = kLevelNotSet)
+ : def_level_(def_level), rep_level_(rep_level) {}
+ void SetRepLevelIfNull(int16_t rep_level) { rep_level_ = rep_level; }
+ IterationResult Run(const ElementRange& range, PathWriteContext* context) {
+ int64_t size = range.Size();
+ RETURN_IF_ERROR(FillRepLevels(size, rep_level_, context));
+ return context->AppendDefLevels(size, def_level_);
+ }
+
+ private:
+ int16_t def_level_;
+ int16_t rep_level_;
+};
+
+// Handles the case where all remaining arrays until the leaf have no nulls
+// (and are not interrupted by lists). Unlike AllNullsTerminalNode this is
+// always the last node in a path. We don't need an analogue to the AllNullsTerminalNode
+// because if all values are present at an intermediate array no node is added for it
+// (the def-level for the next nullable node is incremented).
+struct AllPresentTerminalNode {
+ IterationResult Run(const ElementRange& range, PathWriteContext* context) {
+ return context->AppendDefLevels(range.end - range.start, def_level);
+ // No need to worry about rep levels, because this state should
+    // only be applicable after all list/repeated values
+ // have been evaluated in the path.
+ }
+ int16_t def_level;
+};
+
+/// Node for handling the case when the leaf-array is nullable
+/// and contains null elements.
+struct NullableTerminalNode {
+ NullableTerminalNode() = default;
+
+ NullableTerminalNode(const uint8_t* bitmap, int64_t element_offset,
+ int16_t def_level_if_present)
+ : bitmap_(bitmap),
+ element_offset_(element_offset),
+ def_level_if_present_(def_level_if_present),
+ def_level_if_null_(def_level_if_present - 1) {}
+
+ IterationResult Run(const ElementRange& range, PathWriteContext* context) {
+ int64_t elements = range.Size();
+ RETURN_IF_ERROR(context->ReserveDefLevels(elements));
+
+ DCHECK_GT(elements, 0);
+
+ auto bit_visitor = [&](bool is_set) {
+ context->UnsafeAppendDefLevel(is_set ? def_level_if_present_ : def_level_if_null_);
+ };
+
+ if (elements > 16) { // 16 guarantees at least one unrolled loop.
+ ::arrow::internal::VisitBitsUnrolled(bitmap_, range.start + element_offset_,
+ elements, bit_visitor);
+ } else {
+ ::arrow::internal::VisitBits(bitmap_, range.start + element_offset_, elements,
+ bit_visitor);
+ }
+ return kDone;
+ }
+ const uint8_t* bitmap_;
+ int64_t element_offset_;
+ int16_t def_level_if_present_;
+ int16_t def_level_if_null_;
+};
+
+// List nodes handle populating rep_level for Arrow Lists and def-level for empty lists.
+// Nullability (both list and children) is handled by other Nodes. By
+// construction all list nodes will be intermediate nodes (they will always be followed by
+// at least one other node).
+//
+// Type parameters:
+//   |RangeSelector| - A strategy for determining the range of the child node to
+//   process. This varies depending on the type of list (int32_t* offsets,
+//   int64_t* offsets, or fixed-size lists).
+template <typename RangeSelector>
+class ListPathNode {
+ public:
+ ListPathNode(RangeSelector selector, int16_t rep_lev, int16_t def_level_if_empty)
+ : selector_(std::move(selector)),
+ prev_rep_level_(rep_lev - 1),
+ rep_level_(rep_lev),
+ def_level_if_empty_(def_level_if_empty) {}
+
+ int16_t rep_level() const { return rep_level_; }
+
+ IterationResult Run(ElementRange* range, ElementRange* child_range,
+ PathWriteContext* context) {
+ if (range->Empty()) {
+ return kDone;
+ }
+
+ // Find the first non-empty list (skipping a run of empties).
+ int64_t start = range->start;
+ // Retrieves the range of elements that this list contains.
+ // Uses the strategy pattern to distinguish between the different
+ // lists that are supported in Arrow (fixed size, normal and "large").
+ *child_range = selector_.GetRange(range->start);
+ while (child_range->Empty() && !range->Empty()) {
+ ++range->start;
+ *child_range = selector_.GetRange(range->start);
+ }
+    // Loop post-conditions:
+    // * range is either empty (we are done processing at this node)
+    //   or its start corresponds to a non-empty list.
+    // * If range is non-empty, child_range contains
+    //   the bounds of a non-empty list.
+
+ // Handle any skipped over empty lists.
+ int64_t empty_elements = range->start - start;
+ if (empty_elements > 0) {
+ RETURN_IF_ERROR(FillRepLevels(empty_elements, prev_rep_level_, context));
+ RETURN_IF_ERROR(context->AppendDefLevels(empty_elements, def_level_if_empty_));
+ }
+ // Start of a new list. Note that for nested lists adding the element
+ // here effectively suppresses this code until we either encounter null
+ // elements or empty lists between here and the innermost list (since
+    // we make the repetition and definition levels unequal).
+ // Similarly when we are backtracking up the stack the repetition and
+ // definition levels are again equal so if we encounter an intermediate list
+ // with more elements this will detect it as a new list.
+ if (context->EqualRepDefLevelsLengths() && !range->Empty()) {
+ RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
+ }
+
+ if (range->Empty()) {
+ return kDone;
+ }
+
+ ++range->start;
+ if (is_last_) {
+      // If this is the last repeated node, we can try to extend
+      // the child range as wide as possible before
+ // continuing to the next node.
+ return FillForLast(range, child_range, context);
+ }
+ return kNext;
+ }
+
+ void SetLast() { is_last_ = true; }
+
+ private:
+ IterationResult FillForLast(ElementRange* range, ElementRange* child_range,
+ PathWriteContext* context) {
+    // First fill in the remainder of the list.
+ RETURN_IF_ERROR(FillRepLevels(child_range->Size(), rep_level_, context));
+ // Once we've reached this point the following preconditions should hold:
+ // 1. There are no more repeated path nodes to deal with.
+ // 2. All elements in |range| represent contiguous elements in the
+ // child array (Null values would have shortened the range to ensure
+ // all remaining list elements are present (though they may be empty lists)).
+ // 3. No element of range spans a parent list (intermediate
+ // list nodes only handle one list entry at a time).
+ //
+ // Given these preconditions it should be safe to fill runs on non-empty
+ // lists here and expand the range in the child node accordingly.
+
+ while (!range->Empty()) {
+ ElementRange size_check = selector_.GetRange(range->start);
+ if (size_check.Empty()) {
+ // The empty range will need to be handled after we pass down the accumulated
+ // range because it affects def_level placement and we need to get the children
+ // def_levels entered first.
+ break;
+ }
+ // This is the start of a new list. We can be sure it only applies
+ // to the previous list (and doesn't jump to the start of any list
+ // further up in nesting due to the constraints mentioned at the start
+ // of the function).
+ RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
+ RETURN_IF_ERROR(context->AppendRepLevels(size_check.Size() - 1, rep_level_));
+ DCHECK_EQ(size_check.start, child_range->end);
+ child_range->end = size_check.end;
+ ++range->start;
+ }
+
+ // Do book-keeping to track the elements of the arrays that are actually visited
+ // beyond this point. This is necessary to identify "gaps" in values that should
+ // not be processed (written out to parquet).
+ context->RecordPostListVisit(*child_range);
+ return kNext;
+ }
+
+ RangeSelector selector_;
+ int16_t prev_rep_level_;
+ int16_t rep_level_;
+ int16_t def_level_if_empty_;
+ bool is_last_ = false;
+};
+
+template <typename OffsetType>
+struct VarRangeSelector {
+ ElementRange GetRange(int64_t index) const {
+ return ElementRange{offsets[index], offsets[index + 1]};
+ }
+
+ // Either int32_t* or int64_t*.
+ const OffsetType* offsets;
+};
+
+struct FixedSizedRangeSelector {
+ ElementRange GetRange(int64_t index) const {
+ int64_t start = index * list_size;
+ return ElementRange{start, start + list_size};
+ }
+ int list_size;
+};
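+
+// An illustrative sketch (not part of the library) of how the selectors map a
+// list index to a child range: for a variable-length list array
+// [[0, 1], [], [2]] the offsets buffer is {0, 2, 2, 3}, so
+// VarRangeSelector::GetRange(1) yields the empty range {2, 2} (an empty list)
+// and GetRange(2) yields {2, 3}. For FixedSizedRangeSelector with
+// list_size = 3, GetRange(2) always yields {6, 9}.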
+
+// An intermediate node that handles null values.
+class NullableNode {
+ public:
+ NullableNode(const uint8_t* null_bitmap, int64_t entry_offset,
+ int16_t def_level_if_null, int16_t rep_level_if_null = kLevelNotSet)
+ : null_bitmap_(null_bitmap),
+ entry_offset_(entry_offset),
+ valid_bits_reader_(MakeReader(ElementRange{0, 0})),
+ def_level_if_null_(def_level_if_null),
+ rep_level_if_null_(rep_level_if_null),
+ new_range_(true) {}
+
+ void SetRepLevelIfNull(int16_t rep_level) { rep_level_if_null_ = rep_level; }
+
+ ::arrow::internal::BitRunReader MakeReader(const ElementRange& range) {
+ return ::arrow::internal::BitRunReader(null_bitmap_, entry_offset_ + range.start,
+ range.Size());
+ }
+
+ IterationResult Run(ElementRange* range, ElementRange* child_range,
+ PathWriteContext* context) {
+ if (new_range_) {
+ // Reset the reader each time we are starting fresh on a range.
+ // We can't rely on continuity because nulls above can
+ // cause discontinuities.
+ valid_bits_reader_ = MakeReader(*range);
+ }
+ child_range->start = range->start;
+ ::arrow::internal::BitRun run = valid_bits_reader_.NextRun();
+ if (!run.set) {
+ range->start += run.length;
+ RETURN_IF_ERROR(FillRepLevels(run.length, rep_level_if_null_, context));
+ RETURN_IF_ERROR(context->AppendDefLevels(run.length, def_level_if_null_));
+ run = valid_bits_reader_.NextRun();
+ }
+ if (range->Empty()) {
+ new_range_ = true;
+ return kDone;
+ }
+ child_range->end = child_range->start = range->start;
+ child_range->end += run.length;
+
+ DCHECK(!child_range->Empty());
+ range->start += child_range->Size();
+ new_range_ = false;
+ return kNext;
+ }
+
+ const uint8_t* null_bitmap_;
+ int64_t entry_offset_;
+ ::arrow::internal::BitRunReader valid_bits_reader_;
+ int16_t def_level_if_null_;
+ int16_t rep_level_if_null_;
+
+ // Whether the next invocation will be a new range.
+ bool new_range_ = true;
+};
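+
+// An illustrative walk-through (informal, not library code): given the
+// validity bitmap 1 1 0 0 1 and range {0, 5}, the first call to Run() sees a
+// set run of length 2 and emits child_range {0, 2}; the second call consumes
+// the null run of length 2 (appending two def/rep levels) and then emits
+// child_range {4, 5}; the third call finds the range empty and returns kDone.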
+
+using ListNode = ListPathNode<VarRangeSelector<int32_t>>;
+using LargeListNode = ListPathNode<VarRangeSelector<int64_t>>;
+using FixedSizeListNode = ListPathNode<FixedSizedRangeSelector>;
+
+// Contains static information derived from traversing the schema.
+struct PathInfo {
+  // The vectors are expected to be the same length.
+
+ // Note index order matters here.
+ using Node = ::arrow::util::Variant<NullableTerminalNode, ListNode, LargeListNode,
+ FixedSizeListNode, NullableNode,
+ AllPresentTerminalNode, AllNullsTerminalNode>;
+
+ std::vector<Node> path;
+ std::shared_ptr<Array> primitive_array;
+ int16_t max_def_level = 0;
+ int16_t max_rep_level = 0;
+ bool has_dictionary = false;
+ bool leaf_is_nullable = false;
+};
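+
+// For intuition (an informal example): a nullable list<int32 not null> column
+// yields max_rep_level == 1 and max_def_level == 2, where def level 0 encodes
+// a null list, 1 an empty list and 2 a present value.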
+
+/// Contains logic for writing a single leaf node to parquet.
+/// This tracks the path from root to leaf.
+///
+/// |writer| will be called after all of the definition/repetition
+/// values have been calculated for root_range with the calculated
+/// values. It is intended to abstract the complexity of writing
+/// the levels and values to parquet.
+Status WritePath(ElementRange root_range, PathInfo* path_info,
+ ArrowWriteContext* arrow_context,
+ MultipathLevelBuilder::CallbackFunction writer) {
+ std::vector<ElementRange> stack(path_info->path.size());
+ MultipathLevelBuilderResult builder_result;
+ builder_result.leaf_array = path_info->primitive_array;
+ builder_result.leaf_is_nullable = path_info->leaf_is_nullable;
+
+ if (path_info->max_def_level == 0) {
+ // This case only occurs when there are no nullable or repeated
+ // columns in the path from the root to leaf.
+ int64_t leaf_length = builder_result.leaf_array->length();
+ builder_result.def_rep_level_count = leaf_length;
+ builder_result.post_list_visited_elements.push_back({0, leaf_length});
+ return writer(builder_result);
+ }
+ stack[0] = root_range;
+ RETURN_NOT_OK(
+ arrow_context->def_levels_buffer->Resize(/*new_size=*/0, /*shrink_to_fit*/ false));
+ PathWriteContext context(arrow_context->memory_pool, arrow_context->def_levels_buffer);
+  // We will need at least this many entries, so reserve the space ahead of time.
+ RETURN_NOT_OK(context.def_levels.Reserve(root_range.Size()));
+ if (path_info->max_rep_level > 0) {
+ RETURN_NOT_OK(context.rep_levels.Reserve(root_range.Size()));
+ }
+
+ auto stack_base = &stack[0];
+ auto stack_position = stack_base;
+  // This is the main loop for calculating rep/def levels. The nodes
+ // in the path implement a chain-of-responsibility like pattern
+ // where each node can add some number of repetition/definition
+ // levels to PathWriteContext and also delegate to the next node
+ // in the path to add values. The values are added through each Run(...)
+ // call and the choice to delegate to the next node (or return to the
+ // previous node) is communicated by the return value of Run(...).
+ // The loop terminates after the first node indicates all values in
+ // |root_range| are processed.
+ while (stack_position >= stack_base) {
+ PathInfo::Node& node = path_info->path[stack_position - stack_base];
+ struct {
+ IterationResult operator()(NullableNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ IterationResult operator()(ListNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ IterationResult operator()(NullableTerminalNode* node) {
+ return node->Run(*stack_position, context);
+ }
+ IterationResult operator()(FixedSizeListNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ IterationResult operator()(AllPresentTerminalNode* node) {
+ return node->Run(*stack_position, context);
+ }
+ IterationResult operator()(AllNullsTerminalNode* node) {
+ return node->Run(*stack_position, context);
+ }
+ IterationResult operator()(LargeListNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ ElementRange* stack_position;
+ PathWriteContext* context;
+ } visitor = {stack_position, &context};
+
+ IterationResult result = ::arrow::util::visit(visitor, &node);
+
+ if (ARROW_PREDICT_FALSE(result == kError)) {
+ DCHECK(!context.last_status.ok());
+ return context.last_status;
+ }
+ stack_position += static_cast<int>(result);
+ }
+ RETURN_NOT_OK(context.last_status);
+ builder_result.def_rep_level_count = context.def_levels.length();
+
+ if (context.rep_levels.length() > 0) {
+ // This case only occurs when there was a repeated element that needs to be
+ // processed.
+ builder_result.rep_levels = context.rep_levels.data();
+ std::swap(builder_result.post_list_visited_elements, context.visited_elements);
+    // It is possible when processing lists that all lists were empty. In this
+    // case no elements would have been added to post_list_visited_elements. By
+    // adding an empty element we avoid special-casing in downstream consumers.
+ if (builder_result.post_list_visited_elements.empty()) {
+ builder_result.post_list_visited_elements.push_back({0, 0});
+ }
+ } else {
+ builder_result.post_list_visited_elements.push_back(
+ {0, builder_result.leaf_array->length()});
+ builder_result.rep_levels = nullptr;
+ }
+
+ builder_result.def_levels = context.def_levels.data();
+ return writer(builder_result);
+}
+
+struct FixupVisitor {
+ int max_rep_level = -1;
+ int16_t rep_level_if_null = kLevelNotSet;
+
+ template <typename T>
+ void HandleListNode(T* arg) {
+ if (arg->rep_level() == max_rep_level) {
+ arg->SetLast();
+      // After the last list node we don't need to fill
+      // rep levels on null.
+ rep_level_if_null = kLevelNotSet;
+ } else {
+ rep_level_if_null = arg->rep_level();
+ }
+ }
+ void operator()(ListNode* node) { HandleListNode(node); }
+ void operator()(LargeListNode* node) { HandleListNode(node); }
+ void operator()(FixedSizeListNode* node) { HandleListNode(node); }
+
+ // For non-list intermediate nodes.
+ template <typename T>
+ void HandleIntermediateNode(T* arg) {
+ if (rep_level_if_null != kLevelNotSet) {
+ arg->SetRepLevelIfNull(rep_level_if_null);
+ }
+ }
+
+ void operator()(NullableNode* arg) { HandleIntermediateNode(arg); }
+
+ void operator()(AllNullsTerminalNode* arg) {
+ // Even though no processing happens past this point we
+ // still need to adjust it if a list occurred after an
+ // all null array.
+ HandleIntermediateNode(arg);
+ }
+
+ void operator()(NullableTerminalNode*) {}
+ void operator()(AllPresentTerminalNode*) {}
+};
+
+PathInfo Fixup(PathInfo info) {
+  // We only need to fix up the path if there were repeated
+  // elements on it.
+ if (info.max_rep_level == 0) {
+ return info;
+ }
+ FixupVisitor visitor;
+ visitor.max_rep_level = info.max_rep_level;
+ if (visitor.max_rep_level > 0) {
+ visitor.rep_level_if_null = 0;
+ }
+ for (size_t x = 0; x < info.path.size(); x++) {
+ ::arrow::util::visit(visitor, &info.path[x]);
+ }
+ return info;
+}
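+
+// For example (illustrative only): for a list<list<int32>> column,
+// max_rep_level is 2, so only the inner list node (rep_level == 2) is marked
+// with SetLast(); the outer list node instead propagates its rep_level as
+// rep_level_if_null to the intermediate nodes that follow it.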
+
+class PathBuilder {
+ public:
+ explicit PathBuilder(bool start_nullable) : nullable_in_parent_(start_nullable) {}
+ template <typename T>
+ void AddTerminalInfo(const T& array) {
+ info_.leaf_is_nullable = nullable_in_parent_;
+ if (nullable_in_parent_) {
+ info_.max_def_level++;
+ }
+ // We don't use null_count() because if the null_count isn't known
+ // and the array does in fact contain nulls, we will end up
+ // traversing the null bitmap twice (once here and once when calculating
+ // rep/def levels).
+ if (LazyNoNulls(array)) {
+ info_.path.emplace_back(AllPresentTerminalNode{info_.max_def_level});
+ } else if (LazyNullCount(array) == array.length()) {
+ info_.path.emplace_back(AllNullsTerminalNode(info_.max_def_level - 1));
+ } else {
+ info_.path.emplace_back(NullableTerminalNode(array.null_bitmap_data(),
+ array.offset(), info_.max_def_level));
+ }
+ info_.primitive_array = std::make_shared<T>(array.data());
+ paths_.push_back(Fixup(info_));
+ }
+
+ template <typename T>
+ ::arrow::enable_if_t<std::is_base_of<::arrow::FlatArray, T>::value, Status> Visit(
+ const T& array) {
+ AddTerminalInfo(array);
+ return Status::OK();
+ }
+
+ template <typename T>
+ ::arrow::enable_if_t<std::is_same<::arrow::ListArray, T>::value ||
+ std::is_same<::arrow::LargeListArray, T>::value,
+ Status>
+ Visit(const T& array) {
+ MaybeAddNullable(array);
+ // Increment necessary due to empty lists.
+ info_.max_def_level++;
+ info_.max_rep_level++;
+ // raw_value_offsets() accounts for any slice offset.
+ ListPathNode<VarRangeSelector<typename T::offset_type>> node(
+ VarRangeSelector<typename T::offset_type>{array.raw_value_offsets()},
+ info_.max_rep_level, info_.max_def_level - 1);
+ info_.path.emplace_back(std::move(node));
+ nullable_in_parent_ = array.list_type()->value_field()->nullable();
+ return VisitInline(*array.values());
+ }
+
+ Status Visit(const ::arrow::DictionaryArray& array) {
+    // We currently only handle DictionaryArray where the dictionary is a
+    // primitive type.
+ if (array.dict_type()->value_type()->num_fields() > 0) {
+ return Status::NotImplemented(
+ "Writing DictionaryArray with nested dictionary "
+ "type not yet supported");
+ }
+ if (array.dictionary()->null_count() > 0) {
+ return Status::NotImplemented(
+ "Writing DictionaryArray with null encoded in dictionary "
+ "type not yet supported");
+ }
+ AddTerminalInfo(array);
+ return Status::OK();
+ }
+
+ void MaybeAddNullable(const Array& array) {
+ if (!nullable_in_parent_) {
+ return;
+ }
+ info_.max_def_level++;
+ // We don't use null_count() because if the null_count isn't known
+ // and the array does in fact contain nulls, we will end up
+ // traversing the null bitmap twice (once here and once when calculating
+ // rep/def levels). Because this isn't terminal this might not be
+ // the right decision for structs that share the same nullable
+ // parents.
+ if (LazyNoNulls(array)) {
+ // Don't add anything because there won't be any point checking
+ // null values for the array. There will always be at least
+ // one more array to handle nullability.
+ return;
+ }
+ if (LazyNullCount(array) == array.length()) {
+ info_.path.emplace_back(AllNullsTerminalNode(info_.max_def_level - 1));
+ return;
+ }
+ info_.path.emplace_back(
+ NullableNode(array.null_bitmap_data(), array.offset(),
+ /* def_level_if_null = */ info_.max_def_level - 1));
+ }
+
+ Status VisitInline(const Array& array);
+
+ Status Visit(const ::arrow::MapArray& array) {
+ return Visit(static_cast<const ::arrow::ListArray&>(array));
+ }
+
+ Status Visit(const ::arrow::StructArray& array) {
+ MaybeAddNullable(array);
+ PathInfo info_backup = info_;
+ for (int x = 0; x < array.num_fields(); x++) {
+ nullable_in_parent_ = array.type()->field(x)->nullable();
+ RETURN_NOT_OK(VisitInline(*array.field(x)));
+ info_ = info_backup;
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const ::arrow::FixedSizeListArray& array) {
+ MaybeAddNullable(array);
+ int32_t list_size = array.list_type()->list_size();
+    // Technically we could encode fixed-size lists with a two-level encoding,
+    // but since we always use the three-level encoding we increment def levels
+    // as well.
+ info_.max_def_level++;
+ info_.max_rep_level++;
+ info_.path.emplace_back(FixedSizeListNode(FixedSizedRangeSelector{list_size},
+ info_.max_rep_level, info_.max_def_level));
+ nullable_in_parent_ = array.list_type()->value_field()->nullable();
+ if (array.offset() > 0) {
+ return VisitInline(*array.values()->Slice(array.value_offset(0)));
+ }
+ return VisitInline(*array.values());
+ }
+
+ Status Visit(const ::arrow::ExtensionArray& array) {
+ return VisitInline(*array.storage());
+ }
+
+#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
+ Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
+ return Status::NotImplemented("Level generation for " #ArrowTypePrefix \
+ " not supported yet"); \
+ }
+
+ // Union types aren't supported in Parquet.
+ NOT_IMPLEMENTED_VISIT(Union)
+
+#undef NOT_IMPLEMENTED_VISIT
+ std::vector<PathInfo>& paths() { return paths_; }
+
+ private:
+ PathInfo info_;
+ std::vector<PathInfo> paths_;
+ bool nullable_in_parent_;
+};
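+
+// A hypothetical example of the builder's output shape: visiting a
+// struct<a: int32, b: list<int32>> array produces two PathInfo entries in
+// paths() (one per leaf), sharing the struct's nullability prefix but
+// diverging below it.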
+
+Status PathBuilder::VisitInline(const Array& array) {
+ return ::arrow::VisitArrayInline(array, this);
+}
+
+#undef RETURN_IF_ERROR
+} // namespace
+
+class MultipathLevelBuilderImpl : public MultipathLevelBuilder {
+ public:
+ MultipathLevelBuilderImpl(std::shared_ptr<::arrow::ArrayData> data,
+ std::unique_ptr<PathBuilder> path_builder)
+ : root_range_{0, data->length},
+ data_(std::move(data)),
+ path_builder_(std::move(path_builder)) {}
+
+ int GetLeafCount() const override {
+ return static_cast<int>(path_builder_->paths().size());
+ }
+
+ ::arrow::Status Write(int leaf_index, ArrowWriteContext* context,
+ CallbackFunction write_leaf_callback) override {
+ DCHECK_GE(leaf_index, 0);
+ DCHECK_LT(leaf_index, GetLeafCount());
+ return WritePath(root_range_, &path_builder_->paths()[leaf_index], context,
+ std::move(write_leaf_callback));
+ }
+
+ private:
+ ElementRange root_range_;
+ // Reference holder to ensure the data stays valid.
+ std::shared_ptr<::arrow::ArrayData> data_;
+ std::unique_ptr<PathBuilder> path_builder_;
+};
+
+// static
+::arrow::Result<std::unique_ptr<MultipathLevelBuilder>> MultipathLevelBuilder::Make(
+ const ::arrow::Array& array, bool array_field_nullable) {
+ auto constructor = ::arrow::internal::make_unique<PathBuilder>(array_field_nullable);
+ RETURN_NOT_OK(VisitArrayInline(array, constructor.get()));
+ return ::arrow::internal::make_unique<MultipathLevelBuilderImpl>(
+ array.data(), std::move(constructor));
+}
+
+// static
+Status MultipathLevelBuilder::Write(const Array& array, bool array_field_nullable,
+ ArrowWriteContext* context,
+ MultipathLevelBuilder::CallbackFunction callback) {
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<MultipathLevelBuilder> builder,
+ MultipathLevelBuilder::Make(array, array_field_nullable));
+ for (int leaf_idx = 0; leaf_idx < builder->GetLeafCount(); leaf_idx++) {
+ RETURN_NOT_OK(builder->Write(leaf_idx, context, callback));
+ }
+ return Status::OK();
+}
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h
new file mode 100644
index 00000000000..c5b7fdfdac3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "parquet/platform.h"
+
+namespace arrow {
+
+class Array;
+
+} // namespace arrow
+
+namespace parquet {
+
+struct ArrowWriteContext;
+
+namespace arrow {
+
+// This file contains internal implementation details and should not be considered
+// part of the public API.
+
+// The MultipathLevelBuilder is intended to fully support all Arrow nested types that
+// map to parquet types (i.e. everything but unions).
+
+/// \brief Half open range of elements in an array.
+struct ElementRange {
+  /// Lower bound of range (inclusive)
+ int64_t start;
+ /// Upper bound of range (exclusive)
+ int64_t end;
+
+ bool Empty() const { return start == end; }
+
+ int64_t Size() const { return end - start; }
+};
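+
+// For example, ElementRange{2, 5} covers the elements at indices 2, 3 and 4:
+// Size() == 3 and Empty() is false. A range is empty exactly when
+// start == end.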
+
+/// \brief Result for a single leaf array when running the builder on
+/// its root.
+struct MultipathLevelBuilderResult {
+  /// \brief The Array containing only the values to write (after all nesting has
+  /// been processed).
+ ///
+ /// No additional processing is done on this array (it is copied as is when
+ /// visited via a DFS).
+ std::shared_ptr<::arrow::Array> leaf_array;
+
+ /// \brief Might be null.
+ const int16_t* def_levels = nullptr;
+
+ /// \brief Might be null.
+ const int16_t* rep_levels = nullptr;
+
+ /// \brief Number of items (int16_t) contained in def/rep_levels when present.
+ int64_t def_rep_level_count = 0;
+
+  /// \brief Contains the element ranges that must be visited on the
+  /// descendants of the final list ancestor for any leaf node.
+ ///
+ /// The algorithm will attempt to consolidate visited ranges into
+ /// the smallest number possible.
+ ///
+ /// This data is necessary to pass along because after producing
+ /// def-rep levels for each leaf array it is impossible to determine
+ /// which values have to be sent to parquet when a null list value
+ /// in a nullable ListArray is non-empty.
+ ///
+  /// This allows the parquet writer to determine which values ultimately
+  /// need to be written.
+ std::vector<ElementRange> post_list_visited_elements;
+
+ /// Whether the leaf array is nullable.
+ bool leaf_is_nullable;
+};
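+
+// A worked example (informal, assuming the standard three-level list
+// encoding): for a nullable list<nullable int32> array [[1, null], null, []]
+// the builder produces def_levels {3, 2, 0, 1} and rep_levels {0, 1, 0, 0},
+// with leaf_array holding the two values {1, null}.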
+
+/// \brief Logic for being able to write out nesting (rep/def level) data that is
+/// needed for writing to parquet.
+class PARQUET_EXPORT MultipathLevelBuilder {
+ public:
+  /// \brief A callback function that will receive results from the call to
+  /// Write(...) below. The MultipathLevelBuilderResult passed in will
+  /// only remain valid for the duration of the call (i.e. storing it and
+  /// relying on its data to remain consistent afterwards will result in
+  /// undefined behavior).
+ using CallbackFunction =
+ std::function<::arrow::Status(const MultipathLevelBuilderResult&)>;
+
+ /// \brief Determine rep/def level information for the array.
+ ///
+ /// The callback will be invoked for each leaf Array that is a
+  /// descendant of array. Each leaf array is processed in depth-first
+  /// traversal order.
+ ///
+ /// \param[in] array The array to process.
+ /// \param[in] array_field_nullable Whether the algorithm should consider
+  /// the array column as nullable (as determined by its type's parent
+ /// field).
+ /// \param[in, out] context for use when allocating memory, etc.
+ /// \param[out] write_leaf_callback Callback to receive results.
+ /// There will be one call to the write_leaf_callback for each leaf node.
+ static ::arrow::Status Write(const ::arrow::Array& array, bool array_field_nullable,
+ ArrowWriteContext* context,
+ CallbackFunction write_leaf_callback);
+
+ /// \brief Construct a new instance of the builder.
+ ///
+ /// \param[in] array The array to process.
+ /// \param[in] array_field_nullable Whether the algorithm should consider
+  /// the array column as nullable (as determined by its type's parent
+ /// field).
+ static ::arrow::Result<std::unique_ptr<MultipathLevelBuilder>> Make(
+ const ::arrow::Array& array, bool array_field_nullable);
+
+ virtual ~MultipathLevelBuilder() = default;
+
+ /// \brief Returns the number of leaf columns that need to be written
+ /// to Parquet.
+ virtual int GetLeafCount() const = 0;
+
+ /// \brief Calls write_leaf_callback with the MultipathLevelBuilderResult corresponding
+ /// to |leaf_index|.
+ ///
+ /// \param[in] leaf_index The index of the leaf column to write. Must be in the range
+  /// [0, GetLeafCount()).
+ /// \param[in, out] context for use when allocating memory, etc.
+ /// \param[out] write_leaf_callback Callback to receive the result.
+ virtual ::arrow::Status Write(int leaf_index, ArrowWriteContext* context,
+ CallbackFunction write_leaf_callback) = 0;
+};
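+
+// A minimal usage sketch (assuming a valid ArrowWriteContext* ctx; the
+// callback shown is hypothetical):
+//
+//   RETURN_NOT_OK(MultipathLevelBuilder::Write(
+//       *array, /*array_field_nullable=*/true, ctx,
+//       [](const MultipathLevelBuilderResult& result) {
+//         // result.def_levels / result.rep_levels are only valid for the
+//         // duration of this call.
+//         return ::arrow::Status::OK();
+//       }));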
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc
new file mode 100644
index 00000000000..4f5f79c964a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc
@@ -0,0 +1,1248 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/reader.h"
+
+#include <algorithm>
+#include <cstring>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/extension_type.h"
+#include "arrow/io/memory.h"
+#include "arrow/record_batch.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/future.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/parallel.h"
+#include "arrow/util/range.h"
+#include "parquet/arrow/reader_internal.h"
+#include "parquet/column_reader.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+
+using arrow::Array;
+using arrow::ArrayData;
+using arrow::BooleanArray;
+using arrow::ChunkedArray;
+using arrow::DataType;
+using arrow::ExtensionType;
+using arrow::Field;
+using arrow::Future;
+using arrow::Int32Array;
+using arrow::ListArray;
+using arrow::MemoryPool;
+using arrow::RecordBatchReader;
+using arrow::ResizableBuffer;
+using arrow::Status;
+using arrow::StructArray;
+using arrow::Table;
+using arrow::TimestampArray;
+
+using arrow::internal::checked_cast;
+using arrow::internal::Iota;
+
+// Help reduce verbosity
+using ParquetReader = parquet::ParquetFileReader;
+
+using parquet::internal::RecordReader;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+namespace arrow {
+namespace {
+
+::arrow::Result<std::shared_ptr<ArrayData>> ChunksToSingle(const ChunkedArray& chunked) {
+ switch (chunked.num_chunks()) {
+ case 0: {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> array,
+ ::arrow::MakeArrayOfNull(chunked.type(), 0));
+ return array->data();
+ }
+ case 1:
+ return chunked.chunk(0)->data();
+ default:
+ // ARROW-3762(wesm): If item reader yields a chunked array, we reject as
+ // this is not yet implemented
+ return Status::NotImplemented(
+ "Nested data conversions not implemented for chunked array outputs");
+ }
+}
+
+} // namespace
+
+class ColumnReaderImpl : public ColumnReader {
+ public:
+ virtual Status GetDefLevels(const int16_t** data, int64_t* length) = 0;
+ virtual Status GetRepLevels(const int16_t** data, int64_t* length) = 0;
+ virtual const std::shared_ptr<Field> field() = 0;
+
+ ::arrow::Status NextBatch(int64_t batch_size,
+ std::shared_ptr<::arrow::ChunkedArray>* out) final {
+ RETURN_NOT_OK(LoadBatch(batch_size));
+ RETURN_NOT_OK(BuildArray(batch_size, out));
+ for (int x = 0; x < (*out)->num_chunks(); x++) {
+ RETURN_NOT_OK((*out)->chunk(x)->Validate());
+ }
+ return Status::OK();
+ }
+
+ virtual ::arrow::Status LoadBatch(int64_t num_records) = 0;
+
+ virtual ::arrow::Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+ virtual bool IsOrHasRepeatedChild() const = 0;
+};
+
+namespace {
+
+std::shared_ptr<std::unordered_set<int>> VectorToSharedSet(
+ const std::vector<int>& values) {
+ std::shared_ptr<std::unordered_set<int>> result(new std::unordered_set<int>());
+ result->insert(values.begin(), values.end());
+ return result;
+}
+
+// Forward declaration
+Status GetReader(const SchemaField& field, const std::shared_ptr<ReaderContext>& context,
+ std::unique_ptr<ColumnReaderImpl>* out);
+
+// ----------------------------------------------------------------------
+// FileReaderImpl forward declaration
+
+class FileReaderImpl : public FileReader {
+ public:
+ FileReaderImpl(MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader,
+ ArrowReaderProperties properties)
+ : pool_(pool),
+ reader_(std::move(reader)),
+ reader_properties_(std::move(properties)) {}
+
+ Status Init() {
+ return SchemaManifest::Make(reader_->metadata()->schema(),
+ reader_->metadata()->key_value_metadata(),
+ reader_properties_, &manifest_);
+ }
+
+ FileColumnIteratorFactory SomeRowGroupsFactory(std::vector<int> row_groups) {
+ return [row_groups](int i, ParquetFileReader* reader) {
+ return new FileColumnIterator(i, reader, row_groups);
+ };
+ }
+
+ FileColumnIteratorFactory AllRowGroupsFactory() {
+ return SomeRowGroupsFactory(Iota(reader_->metadata()->num_row_groups()));
+ }
+
+ Status BoundsCheckColumn(int column) {
+ if (column < 0 || column >= this->num_columns()) {
+ return Status::Invalid("Column index out of bounds (got ", column,
+ ", should be "
+ "between 0 and ",
+ this->num_columns() - 1, ")");
+ }
+ return Status::OK();
+ }
+
+ Status BoundsCheckRowGroup(int row_group) {
+ // row group indices check
+ if (row_group < 0 || row_group >= num_row_groups()) {
+ return Status::Invalid("Some index in row_group_indices is ", row_group,
+ ", which is either < 0 or >= num_row_groups(",
+ num_row_groups(), ")");
+ }
+ return Status::OK();
+ }
+
+ Status BoundsCheck(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices) {
+ for (int i : row_groups) {
+ RETURN_NOT_OK(BoundsCheckRowGroup(i));
+ }
+ for (int i : column_indices) {
+ RETURN_NOT_OK(BoundsCheckColumn(i));
+ }
+ return Status::OK();
+ }
+
+ std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) override;
+
+ Status ReadTable(const std::vector<int>& indices,
+ std::shared_ptr<Table>* out) override {
+ return ReadRowGroups(Iota(reader_->metadata()->num_row_groups()), indices, out);
+ }
+
+ Status GetFieldReader(int i,
+ const std::shared_ptr<std::unordered_set<int>>& included_leaves,
+ const std::vector<int>& row_groups,
+ std::unique_ptr<ColumnReaderImpl>* out) {
+ auto ctx = std::make_shared<ReaderContext>();
+ ctx->reader = reader_.get();
+ ctx->pool = pool_;
+ ctx->iterator_factory = SomeRowGroupsFactory(row_groups);
+ ctx->filter_leaves = true;
+ ctx->included_leaves = included_leaves;
+ return GetReader(manifest_.schema_fields[i], ctx, out);
+ }
+
+ Status GetFieldReaders(const std::vector<int>& column_indices,
+ const std::vector<int>& row_groups,
+ std::vector<std::shared_ptr<ColumnReaderImpl>>* out,
+ std::shared_ptr<::arrow::Schema>* out_schema) {
+    // We only need to read the schema fields that have columns indicated
+    // in the indices vector.
+ ARROW_ASSIGN_OR_RAISE(std::vector<int> field_indices,
+ manifest_.GetFieldIndices(column_indices));
+
+ auto included_leaves = VectorToSharedSet(column_indices);
+
+ out->resize(field_indices.size());
+ ::arrow::FieldVector out_fields(field_indices.size());
+ for (size_t i = 0; i < out->size(); ++i) {
+ std::unique_ptr<ColumnReaderImpl> reader;
+ RETURN_NOT_OK(
+ GetFieldReader(field_indices[i], included_leaves, row_groups, &reader));
+
+ out_fields[i] = reader->field();
+ out->at(i) = std::move(reader);
+ }
+
+ *out_schema = ::arrow::schema(std::move(out_fields), manifest_.schema_metadata);
+ return Status::OK();
+ }
+
+ Status GetColumn(int i, FileColumnIteratorFactory iterator_factory,
+ std::unique_ptr<ColumnReader>* out);
+
+ Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) override {
+ return GetColumn(i, AllRowGroupsFactory(), out);
+ }
+
+ Status GetSchema(std::shared_ptr<::arrow::Schema>* out) override {
+ return FromParquetSchema(reader_->metadata()->schema(), reader_properties_,
+ reader_->metadata()->key_value_metadata(), out);
+ }
+
+ Status ReadSchemaField(int i, std::shared_ptr<ChunkedArray>* out) override {
+ auto included_leaves = VectorToSharedSet(Iota(reader_->metadata()->num_columns()));
+ std::vector<int> row_groups = Iota(reader_->metadata()->num_row_groups());
+
+ std::unique_ptr<ColumnReaderImpl> reader;
+ RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, &reader));
+
+ return ReadColumn(i, row_groups, reader.get(), out);
+ }
+
+ Status ReadColumn(int i, const std::vector<int>& row_groups, ColumnReader* reader,
+ std::shared_ptr<ChunkedArray>* out) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ // TODO(wesm): This calculation doesn't make much sense when we have repeated
+ // schema nodes
+ int64_t records_to_read = 0;
+ for (auto row_group : row_groups) {
+ // Can throw exception
+ records_to_read +=
+ reader_->metadata()->RowGroup(row_group)->ColumnChunk(i)->num_values();
+ }
+ return reader->NextBatch(records_to_read, out);
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ Status ReadColumn(int i, const std::vector<int>& row_groups,
+ std::shared_ptr<ChunkedArray>* out) {
+ std::unique_ptr<ColumnReader> flat_column_reader;
+ RETURN_NOT_OK(GetColumn(i, SomeRowGroupsFactory(row_groups), &flat_column_reader));
+ return ReadColumn(i, row_groups, flat_column_reader.get(), out);
+ }
+
+ Status ReadColumn(int i, std::shared_ptr<ChunkedArray>* out) override {
+ return ReadColumn(i, Iota(reader_->metadata()->num_row_groups()), out);
+ }
+
+ Status ReadTable(std::shared_ptr<Table>* table) override {
+ return ReadTable(Iota(reader_->metadata()->num_columns()), table);
+ }
+
+ Status ReadRowGroups(const std::vector<int>& row_groups,
+ const std::vector<int>& indices,
+ std::shared_ptr<Table>* table) override;
+
+ // Helper method used by ReadRowGroups - read the given row groups/columns, skipping
+ // bounds checks and pre-buffering. Takes a shared_ptr to self to keep the reader
+ // alive in async contexts.
+ Future<std::shared_ptr<Table>> DecodeRowGroups(
+ std::shared_ptr<FileReaderImpl> self, const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices, ::arrow::internal::Executor* cpu_executor);
+
+ Status ReadRowGroups(const std::vector<int>& row_groups,
+ std::shared_ptr<Table>* table) override {
+ return ReadRowGroups(row_groups, Iota(reader_->metadata()->num_columns()), table);
+ }
+
+ Status ReadRowGroup(int row_group_index, const std::vector<int>& column_indices,
+ std::shared_ptr<Table>* out) override {
+ return ReadRowGroups({row_group_index}, column_indices, out);
+ }
+
+ Status ReadRowGroup(int i, std::shared_ptr<Table>* table) override {
+ return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table);
+ }
+
+ Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ const std::vector<int>& column_indices,
+ std::unique_ptr<RecordBatchReader>* out) override;
+
+ Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ std::unique_ptr<RecordBatchReader>* out) override {
+ return GetRecordBatchReader(row_group_indices,
+ Iota(reader_->metadata()->num_columns()), out);
+ }
+
+ ::arrow::Result<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>>
+ GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
+ const std::vector<int> row_group_indices,
+ const std::vector<int> column_indices,
+ ::arrow::internal::Executor* cpu_executor) override;
+
+ int num_columns() const { return reader_->metadata()->num_columns(); }
+
+ ParquetFileReader* parquet_reader() const override { return reader_.get(); }
+
+ int num_row_groups() const override { return reader_->metadata()->num_row_groups(); }
+
+ void set_use_threads(bool use_threads) override {
+ reader_properties_.set_use_threads(use_threads);
+ }
+
+ void set_batch_size(int64_t batch_size) override {
+ reader_properties_.set_batch_size(batch_size);
+ }
+
+ const ArrowReaderProperties& properties() const override { return reader_properties_; }
+
+ const SchemaManifest& manifest() const override { return manifest_; }
+
+ Status ScanContents(std::vector<int> columns, const int32_t column_batch_size,
+ int64_t* num_rows) override {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ *num_rows = ScanFileContents(columns, column_batch_size, reader_.get());
+ return Status::OK();
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ MemoryPool* pool_;
+ std::unique_ptr<ParquetFileReader> reader_;
+ ArrowReaderProperties reader_properties_;
+
+ SchemaManifest manifest_;
+};
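+
+// A typical read path through this implementation (a sketch using the public
+// FileReader API from parquet/arrow/reader.h; infile is assumed to be an
+// already-opened ::arrow::io::RandomAccessFile and error handling is elided):
+//
+//   std::unique_ptr<FileReader> reader;
+//   RETURN_NOT_OK(OpenFile(infile, ::arrow::default_memory_pool(), &reader));
+//   std::shared_ptr<::arrow::Table> table;
+//   RETURN_NOT_OK(reader->ReadTable(&table));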
+
+class RowGroupRecordBatchReader : public ::arrow::RecordBatchReader {
+ public:
+ RowGroupRecordBatchReader(::arrow::RecordBatchIterator batches,
+ std::shared_ptr<::arrow::Schema> schema)
+ : batches_(std::move(batches)), schema_(std::move(schema)) {}
+
+ ~RowGroupRecordBatchReader() override {}
+
+ Status ReadNext(std::shared_ptr<::arrow::RecordBatch>* out) override {
+ return batches_.Next().Value(out);
+ }
+
+ std::shared_ptr<::arrow::Schema> schema() const override { return schema_; }
+
+ private:
+ ::arrow::Iterator<std::shared_ptr<::arrow::RecordBatch>> batches_;
+ std::shared_ptr<::arrow::Schema> schema_;
+};
+
+class ColumnChunkReaderImpl : public ColumnChunkReader {
+ public:
+ ColumnChunkReaderImpl(FileReaderImpl* impl, int row_group_index, int column_index)
+ : impl_(impl), column_index_(column_index), row_group_index_(row_group_index) {}
+
+ Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) override {
+ return impl_->ReadColumn(column_index_, {row_group_index_}, out);
+ }
+
+ private:
+ FileReaderImpl* impl_;
+ int column_index_;
+ int row_group_index_;
+};
+
+class RowGroupReaderImpl : public RowGroupReader {
+ public:
+ RowGroupReaderImpl(FileReaderImpl* impl, int row_group_index)
+ : impl_(impl), row_group_index_(row_group_index) {}
+
+ std::shared_ptr<ColumnChunkReader> Column(int column_index) override {
+ return std::shared_ptr<ColumnChunkReader>(
+ new ColumnChunkReaderImpl(impl_, row_group_index_, column_index));
+ }
+
+ Status ReadTable(const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) override {
+ return impl_->ReadRowGroup(row_group_index_, column_indices, out);
+ }
+
+ Status ReadTable(std::shared_ptr<::arrow::Table>* out) override {
+ return impl_->ReadRowGroup(row_group_index_, out);
+ }
+
+ private:
+ FileReaderImpl* impl_;
+ int row_group_index_;
+};
+
+// ----------------------------------------------------------------------
+// Column reader implementations
+
+// Leaf reader is for primitive arrays and primitive children of nested arrays
+class LeafReader : public ColumnReaderImpl {
+ public:
+ LeafReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
+ std::unique_ptr<FileColumnIterator> input,
+ ::parquet::internal::LevelInfo leaf_info)
+ : ctx_(std::move(ctx)),
+ field_(std::move(field)),
+ input_(std::move(input)),
+ descr_(input_->descr()) {
+ record_reader_ = RecordReader::Make(
+ descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY);
+ NextRowGroup();
+ }
+
+ Status GetDefLevels(const int16_t** data, int64_t* length) final {
+ *data = record_reader_->def_levels();
+ *length = record_reader_->levels_position();
+ return Status::OK();
+ }
+
+ Status GetRepLevels(const int16_t** data, int64_t* length) final {
+ *data = record_reader_->rep_levels();
+ *length = record_reader_->levels_position();
+ return Status::OK();
+ }
+
+ bool IsOrHasRepeatedChild() const final { return false; }
+
+ Status LoadBatch(int64_t records_to_read) final {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ out_ = nullptr;
+ record_reader_->Reset();
+ // Pre-allocation gives much better performance for flat columns
+ record_reader_->Reserve(records_to_read);
+ while (records_to_read > 0) {
+ if (!record_reader_->HasMoreData()) {
+ break;
+ }
+ int64_t records_read = record_reader_->ReadRecords(records_to_read);
+ records_to_read -= records_read;
+ if (records_read == 0) {
+ NextRowGroup();
+ }
+ }
+ RETURN_NOT_OK(TransferColumnData(record_reader_.get(), field_->type(), descr_,
+ ctx_->pool, &out_));
+ return Status::OK();
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ ::arrow::Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<::arrow::ChunkedArray>* out) final {
+ *out = out_;
+ return Status::OK();
+ }
+
+ const std::shared_ptr<Field> field() override { return field_; }
+
+ private:
+ std::shared_ptr<ChunkedArray> out_;
+ void NextRowGroup() {
+ std::unique_ptr<PageReader> page_reader = input_->NextChunk();
+ record_reader_->SetPageReader(std::move(page_reader));
+ }
+
+ std::shared_ptr<ReaderContext> ctx_;
+ std::shared_ptr<Field> field_;
+ std::unique_ptr<FileColumnIterator> input_;
+ const ColumnDescriptor* descr_;
+ std::shared_ptr<RecordReader> record_reader_;
+};
+
+// Column reader for extension arrays
+class ExtensionReader : public ColumnReaderImpl {
+ public:
+ ExtensionReader(std::shared_ptr<Field> field,
+ std::unique_ptr<ColumnReaderImpl> storage_reader)
+ : field_(std::move(field)), storage_reader_(std::move(storage_reader)) {}
+
+ Status GetDefLevels(const int16_t** data, int64_t* length) override {
+ return storage_reader_->GetDefLevels(data, length);
+ }
+
+ Status GetRepLevels(const int16_t** data, int64_t* length) override {
+ return storage_reader_->GetRepLevels(data, length);
+ }
+
+ Status LoadBatch(int64_t number_of_records) final {
+ return storage_reader_->LoadBatch(number_of_records);
+ }
+
+ Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) override {
+ std::shared_ptr<ChunkedArray> storage;
+ RETURN_NOT_OK(storage_reader_->BuildArray(length_upper_bound, &storage));
+ *out = ExtensionType::WrapArray(field_->type(), storage);
+ return Status::OK();
+ }
+
+ bool IsOrHasRepeatedChild() const final {
+ return storage_reader_->IsOrHasRepeatedChild();
+ }
+
+ const std::shared_ptr<Field> field() override { return field_; }
+
+ private:
+ std::shared_ptr<Field> field_;
+ std::unique_ptr<ColumnReaderImpl> storage_reader_;
+};
+
+template <typename IndexType>
+class ListReader : public ColumnReaderImpl {
+ public:
+ ListReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
+ ::parquet::internal::LevelInfo level_info,
+ std::unique_ptr<ColumnReaderImpl> child_reader)
+ : ctx_(std::move(ctx)),
+ field_(std::move(field)),
+ level_info_(level_info),
+ item_reader_(std::move(child_reader)) {}
+
+ Status GetDefLevels(const int16_t** data, int64_t* length) override {
+ return item_reader_->GetDefLevels(data, length);
+ }
+
+ Status GetRepLevels(const int16_t** data, int64_t* length) override {
+ return item_reader_->GetRepLevels(data, length);
+ }
+
+ bool IsOrHasRepeatedChild() const final { return true; }
+
+ Status LoadBatch(int64_t number_of_records) final {
+ return item_reader_->LoadBatch(number_of_records);
+ }
+
+ virtual ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
+ std::shared_ptr<ArrayData> data) {
+ if (field_->type()->id() == ::arrow::Type::MAP) {
+ // Error out if data is not map-compliant instead of aborting in MakeArray below
+ RETURN_NOT_OK(::arrow::MapArray::ValidateChildData(data->child_data));
+ }
+ std::shared_ptr<Array> result = ::arrow::MakeArray(data);
+ return std::make_shared<ChunkedArray>(result);
+ }
+
+ Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) override {
+ const int16_t* def_levels;
+ const int16_t* rep_levels;
+ int64_t num_levels;
+ RETURN_NOT_OK(item_reader_->GetDefLevels(&def_levels, &num_levels));
+ RETURN_NOT_OK(item_reader_->GetRepLevels(&rep_levels, &num_levels));
+
+ std::shared_ptr<ResizableBuffer> validity_buffer;
+ ::parquet::internal::ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = length_upper_bound;
+ if (field_->nullable()) {
+ ARROW_ASSIGN_OR_RAISE(
+ validity_buffer,
+ AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
+ validity_io.valid_bits = validity_buffer->mutable_data();
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<ResizableBuffer> offsets_buffer,
+ AllocateResizableBuffer(
+ sizeof(IndexType) * std::max(int64_t{1}, length_upper_bound + 1),
+ ctx_->pool));
+    // Ensure zero initialization in case we have reached a zero-length list (and
+    // because the first entry is always zero).
+ IndexType* offset_data = reinterpret_cast<IndexType*>(offsets_buffer->mutable_data());
+ offset_data[0] = 0;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ ::parquet::internal::DefRepLevelsToList(def_levels, rep_levels, num_levels,
+ level_info_, &validity_io, offset_data);
+ END_PARQUET_CATCH_EXCEPTIONS
+
+ RETURN_NOT_OK(item_reader_->BuildArray(offset_data[validity_io.values_read], out));
+
+ // Resize to actual number of elements returned.
+ RETURN_NOT_OK(
+ offsets_buffer->Resize((validity_io.values_read + 1) * sizeof(IndexType)));
+ if (validity_buffer != nullptr) {
+ RETURN_NOT_OK(
+ validity_buffer->Resize(BitUtil::BytesForBits(validity_io.values_read)));
+ validity_buffer->ZeroPadding();
+ }
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> item_chunk, ChunksToSingle(**out));
+
+ std::vector<std::shared_ptr<Buffer>> buffers{
+ validity_io.null_count > 0 ? validity_buffer : nullptr, offsets_buffer};
+ auto data = std::make_shared<ArrayData>(
+ field_->type(),
+ /*length=*/validity_io.values_read, std::move(buffers),
+ std::vector<std::shared_ptr<ArrayData>>{item_chunk}, validity_io.null_count);
+
+ ARROW_ASSIGN_OR_RAISE(*out, AssembleArray(std::move(data)));
+ return Status::OK();
+ }
+
+ const std::shared_ptr<Field> field() override { return field_; }
+
+ private:
+ std::shared_ptr<ReaderContext> ctx_;
+ std::shared_ptr<Field> field_;
+ ::parquet::internal::LevelInfo level_info_;
+ std::unique_ptr<ColumnReaderImpl> item_reader_;
+};
+
+class PARQUET_NO_EXPORT FixedSizeListReader : public ListReader<int32_t> {
+ public:
+ FixedSizeListReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
+ ::parquet::internal::LevelInfo level_info,
+ std::unique_ptr<ColumnReaderImpl> child_reader)
+ : ListReader(std::move(ctx), std::move(field), level_info,
+ std::move(child_reader)) {}
+ ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
+ std::shared_ptr<ArrayData> data) final {
+ DCHECK_EQ(data->buffers.size(), 2);
+ DCHECK_EQ(field()->type()->id(), ::arrow::Type::FIXED_SIZE_LIST);
+ const auto& type = checked_cast<::arrow::FixedSizeListType&>(*field()->type());
+ const int32_t* offsets = reinterpret_cast<const int32_t*>(data->buffers[1]->data());
+ for (int x = 1; x <= data->length; x++) {
+ int32_t size = offsets[x] - offsets[x - 1];
+ if (size != type.list_size()) {
+ return Status::Invalid("Expected all lists to be of size=", type.list_size(),
+ " but index ", x, " had size=", size);
+ }
+ }
+ data->buffers.resize(1);
+ std::shared_ptr<Array> result = ::arrow::MakeArray(data);
+ return std::make_shared<ChunkedArray>(result);
+ }
+};
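+
+// Illustrative check (not library code): for list_size == 3 the reconstructed
+// offsets must look like {0, 3, 6, ...}; offsets {0, 3, 5} would fail the
+// validation above with size == 2 at index 2.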
+
+class PARQUET_NO_EXPORT StructReader : public ColumnReaderImpl {
+ public:
+ explicit StructReader(std::shared_ptr<ReaderContext> ctx,
+ std::shared_ptr<Field> filtered_field,
+ ::parquet::internal::LevelInfo level_info,
+ std::vector<std::unique_ptr<ColumnReaderImpl>> children)
+ : ctx_(std::move(ctx)),
+ filtered_field_(std::move(filtered_field)),
+ level_info_(level_info),
+ children_(std::move(children)) {
+    // There could be a mix of children: some might be repeated, some might not
+    // be. If possible use one that isn't, since that is guaranteed to have the
+    // fewest levels needed to reconstruct a nullability bitmap.
+ auto result = std::find_if(children_.begin(), children_.end(),
+ [](const std::unique_ptr<ColumnReaderImpl>& child) {
+ return !child->IsOrHasRepeatedChild();
+ });
+ if (result != children_.end()) {
+ def_rep_level_child_ = result->get();
+ has_repeated_child_ = false;
+ } else if (!children_.empty()) {
+ def_rep_level_child_ = children_.front().get();
+ has_repeated_child_ = true;
+ }
+ }
+
+ bool IsOrHasRepeatedChild() const final { return has_repeated_child_; }
+
+ Status LoadBatch(int64_t records_to_read) override {
+ for (const std::unique_ptr<ColumnReaderImpl>& reader : children_) {
+ RETURN_NOT_OK(reader->LoadBatch(records_to_read));
+ }
+ return Status::OK();
+ }
+ Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) override;
+ Status GetDefLevels(const int16_t** data, int64_t* length) override;
+ Status GetRepLevels(const int16_t** data, int64_t* length) override;
+ const std::shared_ptr<Field> field() override { return filtered_field_; }
+
+ private:
+ const std::shared_ptr<ReaderContext> ctx_;
+ const std::shared_ptr<Field> filtered_field_;
+ const ::parquet::internal::LevelInfo level_info_;
+ const std::vector<std::unique_ptr<ColumnReaderImpl>> children_;
+ ColumnReaderImpl* def_rep_level_child_ = nullptr;
+ bool has_repeated_child_;
+};
+
+Status StructReader::GetDefLevels(const int16_t** data, int64_t* length) {
+ *data = nullptr;
+ if (children_.size() == 0) {
+ *length = 0;
+ return Status::Invalid("StructReader had no children");
+ }
+
+  // This method should only be called when this struct (or one of its
+  // parents) is optional/repeated, or when it has a repeated child.
+  // This means all children must have rep/def levels associated
+  // with them.
+ RETURN_NOT_OK(def_rep_level_child_->GetDefLevels(data, length));
+ return Status::OK();
+}
+
+Status StructReader::GetRepLevels(const int16_t** data, int64_t* length) {
+ *data = nullptr;
+ if (children_.size() == 0) {
+ *length = 0;
+ return Status::Invalid("StructReader had no childre");
+ }
+
+  // This method should only be called when this struct (or one of its
+  // parents) is optional/repeated, or when it has a repeated child.
+  // This means all children must have rep/def levels associated
+  // with them.
+ RETURN_NOT_OK(def_rep_level_child_->GetRepLevels(data, length));
+ return Status::OK();
+}
+
+Status StructReader::BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) {
+ std::vector<std::shared_ptr<ArrayData>> children_array_data;
+ std::shared_ptr<ResizableBuffer> null_bitmap;
+
+ ::parquet::internal::ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = length_upper_bound;
+ // This simplifies accounting below.
+ validity_io.values_read = length_upper_bound;
+
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ const int16_t* def_levels;
+ const int16_t* rep_levels;
+ int64_t num_levels;
+
+ if (has_repeated_child_) {
+ ARROW_ASSIGN_OR_RAISE(
+ null_bitmap,
+ AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
+ validity_io.valid_bits = null_bitmap->mutable_data();
+ RETURN_NOT_OK(GetDefLevels(&def_levels, &num_levels));
+ RETURN_NOT_OK(GetRepLevels(&rep_levels, &num_levels));
+ DefRepLevelsToBitmap(def_levels, rep_levels, num_levels, level_info_, &validity_io);
+ } else if (filtered_field_->nullable()) {
+ ARROW_ASSIGN_OR_RAISE(
+ null_bitmap,
+ AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
+ validity_io.valid_bits = null_bitmap->mutable_data();
+ RETURN_NOT_OK(GetDefLevels(&def_levels, &num_levels));
+ DefLevelsToBitmap(def_levels, num_levels, level_info_, &validity_io);
+ }
+
+ // Ensure all values are initialized.
+ if (null_bitmap) {
+ RETURN_NOT_OK(null_bitmap->Resize(BitUtil::BytesForBits(validity_io.values_read)));
+ null_bitmap->ZeroPadding();
+ }
+
+ END_PARQUET_CATCH_EXCEPTIONS
+ // Gather children arrays and def levels
+ for (auto& child : children_) {
+ std::shared_ptr<ChunkedArray> field;
+ RETURN_NOT_OK(child->BuildArray(validity_io.values_read, &field));
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> array_data, ChunksToSingle(*field));
+ children_array_data.push_back(std::move(array_data));
+ }
+
+ if (!filtered_field_->nullable() && !has_repeated_child_) {
+ validity_io.values_read = children_array_data.front()->length;
+ }
+
+ std::vector<std::shared_ptr<Buffer>> buffers{validity_io.null_count > 0 ? null_bitmap
+ : nullptr};
+ auto data =
+ std::make_shared<ArrayData>(filtered_field_->type(),
+ /*length=*/validity_io.values_read, std::move(buffers),
+ std::move(children_array_data));
+ std::shared_ptr<Array> result = ::arrow::MakeArray(data);
+
+ *out = std::make_shared<ChunkedArray>(result);
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// File reader implementation
+
+Status GetReader(const SchemaField& field, const std::shared_ptr<Field>& arrow_field,
+ const std::shared_ptr<ReaderContext>& ctx,
+ std::unique_ptr<ColumnReaderImpl>* out) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+
+ auto type_id = arrow_field->type()->id();
+
+ if (type_id == ::arrow::Type::EXTENSION) {
+ auto storage_field = arrow_field->WithType(
+ checked_cast<const ExtensionType&>(*arrow_field->type()).storage_type());
+ RETURN_NOT_OK(GetReader(field, storage_field, ctx, out));
+ out->reset(new ExtensionReader(arrow_field, std::move(*out)));
+ return Status::OK();
+ }
+
+ if (field.children.size() == 0) {
+ if (!field.is_leaf()) {
+ return Status::Invalid("Parquet non-leaf node has no children");
+ }
+ if (!ctx->IncludesLeaf(field.column_index)) {
+ *out = nullptr;
+ return Status::OK();
+ }
+ std::unique_ptr<FileColumnIterator> input(
+ ctx->iterator_factory(field.column_index, ctx->reader));
+ out->reset(new LeafReader(ctx, arrow_field, std::move(input), field.level_info));
+ } else if (type_id == ::arrow::Type::LIST || type_id == ::arrow::Type::MAP ||
+ type_id == ::arrow::Type::FIXED_SIZE_LIST ||
+ type_id == ::arrow::Type::LARGE_LIST) {
+ auto list_field = arrow_field;
+ auto child = &field.children[0];
+ std::unique_ptr<ColumnReaderImpl> child_reader;
+ RETURN_NOT_OK(GetReader(*child, ctx, &child_reader));
+ if (child_reader == nullptr) {
+ *out = nullptr;
+ return Status::OK();
+ }
+ if (type_id == ::arrow::Type::LIST ||
+ type_id == ::arrow::Type::MAP) { // Map can be reconstructed as list of structs.
+ if (type_id == ::arrow::Type::MAP &&
+ child_reader->field()->type()->num_fields() != 2) {
+ // This case applies if either key or value is filtered.
+ list_field = list_field->WithType(::arrow::list(child_reader->field()));
+ }
+ out->reset(new ListReader<int32_t>(ctx, list_field, field.level_info,
+ std::move(child_reader)));
+ } else if (type_id == ::arrow::Type::LARGE_LIST) {
+ out->reset(new ListReader<int64_t>(ctx, list_field, field.level_info,
+ std::move(child_reader)));
+
+ } else if (type_id == ::arrow::Type::FIXED_SIZE_LIST) {
+ out->reset(new FixedSizeListReader(ctx, list_field, field.level_info,
+ std::move(child_reader)));
+ } else {
+ return Status::UnknownError("Unknown list type: ", field.field->ToString());
+ }
+ } else if (type_id == ::arrow::Type::STRUCT) {
+ std::vector<std::shared_ptr<Field>> child_fields;
+ std::vector<std::unique_ptr<ColumnReaderImpl>> child_readers;
+ for (const auto& child : field.children) {
+ std::unique_ptr<ColumnReaderImpl> child_reader;
+ RETURN_NOT_OK(GetReader(child, ctx, &child_reader));
+ if (!child_reader) {
+ // If all children were pruned, then we do not try to read this field
+ continue;
+ }
+ child_fields.push_back(child.field);
+ child_readers.emplace_back(std::move(child_reader));
+ }
+ if (child_fields.size() == 0) {
+ *out = nullptr;
+ return Status::OK();
+ }
+ auto filtered_field =
+ ::arrow::field(arrow_field->name(), ::arrow::struct_(child_fields),
+ arrow_field->nullable(), arrow_field->metadata());
+ out->reset(new StructReader(ctx, filtered_field, field.level_info,
+ std::move(child_readers)));
+ } else {
+ return Status::Invalid("Unsupported nested type: ", arrow_field->ToString());
+ }
+ return Status::OK();
+
+ END_PARQUET_CATCH_EXCEPTIONS
+}
+
+Status GetReader(const SchemaField& field, const std::shared_ptr<ReaderContext>& ctx,
+ std::unique_ptr<ColumnReaderImpl>* out) {
+ return GetReader(field, field.field, ctx, out);
+}
+
+} // namespace
+
+Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ std::unique_ptr<RecordBatchReader>* out) {
+ RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
+
+ if (reader_properties_.pre_buffer()) {
+ // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ reader_->PreBuffer(row_groups, column_indices, reader_properties_.io_context(),
+ reader_properties_.cache_options());
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
+ std::shared_ptr<::arrow::Schema> batch_schema;
+ RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &batch_schema));
+
+ if (readers.empty()) {
+ // Just generate all batches right now; they're cheap since they have no columns.
+ int64_t batch_size = properties().batch_size();
+ auto max_sized_batch =
+ ::arrow::RecordBatch::Make(batch_schema, batch_size, ::arrow::ArrayVector{});
+
+ ::arrow::RecordBatchVector batches;
+
+ for (int row_group : row_groups) {
+ int64_t num_rows = parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
+
+ batches.insert(batches.end(), num_rows / batch_size, max_sized_batch);
+
+ if (int64_t trailing_rows = num_rows % batch_size) {
+ batches.push_back(max_sized_batch->Slice(0, trailing_rows));
+ }
+ }
+
+ *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
+ ::arrow::MakeVectorIterator(std::move(batches)), std::move(batch_schema));
+
+ return Status::OK();
+ }
+
+ int64_t num_rows = 0;
+ for (int row_group : row_groups) {
+ num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
+ }
+
+ using ::arrow::RecordBatchIterator;
+
+ // NB: This lambda will be invoked outside the scope of this call to
+ // `GetRecordBatchReader()`, so it must capture `readers` and `batch_schema` by value.
+ // `this` is a non-owning pointer so we are relying on the parent FileReader outliving
+ // this RecordBatchReader.
+ ::arrow::Iterator<RecordBatchIterator> batches = ::arrow::MakeFunctionIterator(
+ [readers, batch_schema, num_rows,
+ this]() mutable -> ::arrow::Result<RecordBatchIterator> {
+ ::arrow::ChunkedArrayVector columns(readers.size());
+
+ // don't reserve more rows than necessary
+ int64_t batch_size = std::min(properties().batch_size(), num_rows);
+ num_rows -= batch_size;
+
+ RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
+ reader_properties_.use_threads(), static_cast<int>(readers.size()),
+ [&](int i) { return readers[i]->NextBatch(batch_size, &columns[i]); }));
+
+ for (const auto& column : columns) {
+ if (column == nullptr || column->length() == 0) {
+ return ::arrow::IterationTraits<RecordBatchIterator>::End();
+ }
+ }
+
+ auto table = ::arrow::Table::Make(batch_schema, std::move(columns));
+ auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table);
+
+          // NB: explicitly capture `table` so that it outlives `table_reader`
+ return ::arrow::MakeFunctionIterator(
+ [table, table_reader] { return table_reader->Next(); });
+ });
+
+ *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
+ ::arrow::MakeFlattenIterator(std::move(batches)), std::move(batch_schema));
+
+ return Status::OK();
+}
+
+/// Given a file reader and a list of row groups, this is a generator of record
+/// batch generators (where each sub-generator is the contents of a single row group).
+class RowGroupGenerator {
+ public:
+ using RecordBatchGenerator =
+ ::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>;
+
+ explicit RowGroupGenerator(std::shared_ptr<FileReaderImpl> arrow_reader,
+ ::arrow::internal::Executor* cpu_executor,
+ std::vector<int> row_groups, std::vector<int> column_indices)
+ : arrow_reader_(std::move(arrow_reader)),
+ cpu_executor_(cpu_executor),
+ row_groups_(std::move(row_groups)),
+ column_indices_(std::move(column_indices)),
+ index_(0) {}
+
+ ::arrow::Future<RecordBatchGenerator> operator()() {
+ if (index_ >= row_groups_.size()) {
+ return ::arrow::AsyncGeneratorEnd<RecordBatchGenerator>();
+ }
+ int row_group = row_groups_[index_++];
+ std::vector<int> column_indices = column_indices_;
+ auto reader = arrow_reader_;
+ if (!reader->properties().pre_buffer()) {
+ return SubmitRead(cpu_executor_, reader, row_group, column_indices);
+ }
+ auto ready = reader->parquet_reader()->WhenBuffered({row_group}, column_indices);
+ if (cpu_executor_) ready = cpu_executor_->TransferAlways(ready);
+ return ready.Then([=]() -> ::arrow::Future<RecordBatchGenerator> {
+ return ReadOneRowGroup(cpu_executor_, reader, row_group, column_indices);
+ });
+ }
+
+ private:
+ // Synchronous fallback for when pre-buffer isn't enabled.
+ //
+ // Making the Parquet reader truly asynchronous requires heavy refactoring, so the
+ // generator piggybacks on ReadRangeCache. The lazy ReadRangeCache can be used for
+ // async I/O without forcing readahead.
+ static ::arrow::Future<RecordBatchGenerator> SubmitRead(
+ ::arrow::internal::Executor* cpu_executor, std::shared_ptr<FileReaderImpl> self,
+ const int row_group, const std::vector<int>& column_indices) {
+ if (!cpu_executor) {
+ return ReadOneRowGroup(cpu_executor, self, row_group, column_indices);
+ }
+ // If we have an executor, then force transfer (even if I/O was complete)
+ return ::arrow::DeferNotOk(cpu_executor->Submit(ReadOneRowGroup, cpu_executor, self,
+ row_group, column_indices));
+ }
+
+ static ::arrow::Future<RecordBatchGenerator> ReadOneRowGroup(
+ ::arrow::internal::Executor* cpu_executor, std::shared_ptr<FileReaderImpl> self,
+ const int row_group, const std::vector<int>& column_indices) {
+    // Skips bounds checks/pre-buffering, since we've done that already
+ return self->DecodeRowGroups(self, {row_group}, column_indices, cpu_executor)
+ .Then([](const std::shared_ptr<Table>& table)
+ -> ::arrow::Result<RecordBatchGenerator> {
+ ::arrow::TableBatchReader table_reader(*table);
+ ::arrow::RecordBatchVector batches;
+ RETURN_NOT_OK(table_reader.ReadAll(&batches));
+ return ::arrow::MakeVectorGenerator(std::move(batches));
+ });
+ }
+
+ std::shared_ptr<FileReaderImpl> arrow_reader_;
+ ::arrow::internal::Executor* cpu_executor_;
+ std::vector<int> row_groups_;
+ std::vector<int> column_indices_;
+ size_t index_;
+};
+
+::arrow::Result<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>>
+FileReaderImpl::GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
+ const std::vector<int> row_group_indices,
+ const std::vector<int> column_indices,
+ ::arrow::internal::Executor* cpu_executor) {
+ RETURN_NOT_OK(BoundsCheck(row_group_indices, column_indices));
+ if (reader_properties_.pre_buffer()) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ reader_->PreBuffer(row_group_indices, column_indices, reader_properties_.io_context(),
+ reader_properties_.cache_options());
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+ ::arrow::AsyncGenerator<RowGroupGenerator::RecordBatchGenerator> row_group_generator =
+ RowGroupGenerator(::arrow::internal::checked_pointer_cast<FileReaderImpl>(reader),
+ cpu_executor, row_group_indices, column_indices);
+ return ::arrow::MakeConcatenatedGenerator(std::move(row_group_generator));
+}
+
+Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_factory,
+ std::unique_ptr<ColumnReader>* out) {
+ RETURN_NOT_OK(BoundsCheckColumn(i));
+ auto ctx = std::make_shared<ReaderContext>();
+ ctx->reader = reader_.get();
+ ctx->pool = pool_;
+ ctx->iterator_factory = iterator_factory;
+ ctx->filter_leaves = false;
+ std::unique_ptr<ColumnReaderImpl> result;
+ RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result));
+ out->reset(result.release());
+ return Status::OK();
+}
+
+Status FileReaderImpl::ReadRowGroups(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<Table>* out) {
+ RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
+
+ // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
+ if (reader_properties_.pre_buffer()) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ parquet_reader()->PreBuffer(row_groups, column_indices,
+ reader_properties_.io_context(),
+ reader_properties_.cache_options());
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices,
+ /*cpu_executor=*/nullptr);
+ ARROW_ASSIGN_OR_RAISE(*out, fut.MoveResult());
+ return Status::OK();
+}
+
+Future<std::shared_ptr<Table>> FileReaderImpl::DecodeRowGroups(
+ std::shared_ptr<FileReaderImpl> self, const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices, ::arrow::internal::Executor* cpu_executor) {
+  // `self` is used solely to keep `this` alive in an async context - but we
+  // use this in a sync context too, so we use `this` rather than `self` below
+ std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
+ std::shared_ptr<::arrow::Schema> result_schema;
+ RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &result_schema));
+ // OptionalParallelForAsync requires an executor
+ if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool();
+
+ auto read_column = [row_groups, self, this](size_t i,
+ std::shared_ptr<ColumnReaderImpl> reader)
+ -> ::arrow::Result<std::shared_ptr<::arrow::ChunkedArray>> {
+ std::shared_ptr<::arrow::ChunkedArray> column;
+ RETURN_NOT_OK(ReadColumn(static_cast<int>(i), row_groups, reader.get(), &column));
+ return column;
+ };
+ auto make_table = [result_schema, row_groups, self,
+ this](const ::arrow::ChunkedArrayVector& columns)
+ -> ::arrow::Result<std::shared_ptr<Table>> {
+ int64_t num_rows = 0;
+ if (!columns.empty()) {
+ num_rows = columns[0]->length();
+ } else {
+ for (int i : row_groups) {
+ num_rows += parquet_reader()->metadata()->RowGroup(i)->num_rows();
+ }
+ }
+ auto table = Table::Make(std::move(result_schema), columns, num_rows);
+ RETURN_NOT_OK(table->Validate());
+ return table;
+ };
+ return ::arrow::internal::OptionalParallelForAsync(reader_properties_.use_threads(),
+ std::move(readers), read_column,
+ cpu_executor)
+ .Then(std::move(make_table));
+}
+
+std::shared_ptr<RowGroupReader> FileReaderImpl::RowGroup(int row_group_index) {
+ return std::make_shared<RowGroupReaderImpl>(this, row_group_index);
+}
+
+// ----------------------------------------------------------------------
+// Public factory functions
+
+Status FileReader::GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ std::unique_ptr<RecordBatchReader> tmp;
+ ARROW_RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, &tmp));
+ out->reset(tmp.release());
+ return Status::OK();
+}
+
+Status FileReader::GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ std::unique_ptr<RecordBatchReader> tmp;
+ ARROW_RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, column_indices, &tmp));
+ out->reset(tmp.release());
+ return Status::OK();
+}
+
+Status FileReader::Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ const ArrowReaderProperties& properties,
+ std::unique_ptr<FileReader>* out) {
+ out->reset(new FileReaderImpl(pool, std::move(reader), properties));
+ return static_cast<FileReaderImpl*>(out->get())->Init();
+}
+
+Status FileReader::Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ std::unique_ptr<FileReader>* out) {
+ return Make(pool, std::move(reader), default_arrow_reader_properties(), out);
+}
+
+FileReaderBuilder::FileReaderBuilder()
+ : pool_(::arrow::default_memory_pool()),
+ properties_(default_arrow_reader_properties()) {}
+
+Status FileReaderBuilder::Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
+ const ReaderProperties& properties,
+ std::shared_ptr<FileMetaData> metadata) {
+ PARQUET_CATCH_NOT_OK(raw_reader_ = ParquetReader::Open(std::move(file), properties,
+ std::move(metadata)));
+ return Status::OK();
+}
+
+FileReaderBuilder* FileReaderBuilder::memory_pool(::arrow::MemoryPool* pool) {
+ pool_ = pool;
+ return this;
+}
+
+FileReaderBuilder* FileReaderBuilder::properties(
+ const ArrowReaderProperties& arg_properties) {
+ properties_ = arg_properties;
+ return this;
+}
+
+Status FileReaderBuilder::Build(std::unique_ptr<FileReader>* out) {
+ return FileReader::Make(pool_, std::move(raw_reader_), properties_, out);
+}
+
+Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool,
+ std::unique_ptr<FileReader>* reader) {
+ FileReaderBuilder builder;
+ RETURN_NOT_OK(builder.Open(std::move(file)));
+ return builder.memory_pool(pool)->Build(reader);
+}
+
+namespace internal {
+
+Status FuzzReader(std::unique_ptr<FileReader> reader) {
+ auto st = Status::OK();
+ for (int i = 0; i < reader->num_row_groups(); ++i) {
+ std::shared_ptr<Table> table;
+ auto row_group_status = reader->ReadRowGroup(i, &table);
+ if (row_group_status.ok()) {
+ row_group_status &= table->ValidateFull();
+ }
+ st &= row_group_status;
+ }
+ return st;
+}
+
+Status FuzzReader(const uint8_t* data, int64_t size) {
+ auto buffer = std::make_shared<::arrow::Buffer>(data, size);
+ auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
+ FileReaderBuilder builder;
+ RETURN_NOT_OK(builder.Open(std::move(file)));
+
+ std::unique_ptr<FileReader> reader;
+ RETURN_NOT_OK(builder.Build(&reader));
+ return FuzzReader(std::move(reader));
+}
+
+} // namespace internal
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h
new file mode 100644
index 00000000000..2d6a5ef2c3e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h
@@ -0,0 +1,343 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+// N.B. we don't include async_generator.h as it's relatively heavy
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "parquet/file_reader.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace arrow {
+
+class ChunkedArray;
+class KeyValueMetadata;
+class RecordBatchReader;
+struct Scalar;
+class Schema;
+class Table;
+class RecordBatch;
+
+} // namespace arrow
+
+namespace parquet {
+
+class FileMetaData;
+class SchemaDescriptor;
+
+namespace arrow {
+
+class ColumnChunkReader;
+class ColumnReader;
+struct SchemaManifest;
+class RowGroupReader;
+
+/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
+///
+/// This interface caters to different use cases and thus provides several
+/// entry points. In its simplest form, we cater to a user that wants to
+/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
+///
+/// More advanced users that also want to implement parallelism on top of each
+/// single Parquet file should do this at the RowGroup level. For this, they can
+/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
+/// RowGroup as a table.
+///
+/// In the most advanced situation, where a consumer wants to independently read
+/// RowGroups in parallel and consume each column individually, they can call
+/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an
+/// `arrow::ChunkedArray` instance.
+///
+/// The parquet format supports an optional integer field_id which can be assigned
+/// to a field. Arrow will convert these field IDs to a metadata key named
+/// PARQUET:field_id on the appropriate field.
+// TODO(wesm): nested data does not always make sense with this user
+// interface unless you are only reading a single leaf node from a branch of
+// a table. For example:
+//
+// repeated group data {
+// optional group record {
+// optional int32 val1;
+// optional byte_array val2;
+// optional bool val3;
+// }
+// optional int32 val4;
+// }
+//
+// In the Parquet file, there are 4 leaf nodes:
+//
+// * data.record.val1
+// * data.record.val2
+// * data.record.val3
+// * data.val4
+//
+// When materializing this data in an Arrow array, we would have:
+//
+// data: list<struct<
+// record: struct<
+// val1: int32,
+// val2: string (= list<uint8>),
+// val3: bool,
+// >,
+// val4: int32
+// >>
+//
+// However, in the Parquet format, each leaf node has its own repetition and
+// definition levels describing the structure of the intermediate nodes in
+// this array structure. Thus, we will need to scan the leaf data for a group
+// of leaf nodes part of the same type tree to create a single result Arrow
+// nested array structure.
+//
+// This is additionally complicated by "chunky" repeated fields or very large
+// byte arrays.
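+//
+// As a quick orientation, a minimal whole-file read might look like the
+// following sketch (assuming `input` is an already-opened
+// ::arrow::io::RandomAccessFile; see OpenFile() below):
+//
+//   std::unique_ptr<FileReader> reader;
+//   PARQUET_THROW_NOT_OK(
+//       OpenFile(input, ::arrow::default_memory_pool(), &reader));
+//   std::shared_ptr<::arrow::Table> table;
+//   PARQUET_THROW_NOT_OK(reader->ReadTable(&table));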
+class PARQUET_EXPORT FileReader {
+ public:
+ /// Factory function to create a FileReader from a ParquetFileReader and properties
+ static ::arrow::Status Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ const ArrowReaderProperties& properties,
+ std::unique_ptr<FileReader>* out);
+
+ /// Factory function to create a FileReader from a ParquetFileReader
+ static ::arrow::Status Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ std::unique_ptr<FileReader>* out);
+
+ // Since the distribution of columns amongst a Parquet file's row groups may
+ // be uneven (the number of values in each column chunk can be different), we
+ // provide a column-oriented read interface. The ColumnReader hides the
+ // details of paging through the file's row groups and yielding
+ // fully-materialized arrow::Array instances
+ //
+ // Returns error status if the column of interest is not flat.
+ virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;
+
+ /// \brief Return arrow schema for all the columns.
+ virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;
+
+ /// \brief Read column as a whole into a chunked array.
+ ///
+ /// The indicated column index is relative to the schema
+ virtual ::arrow::Status ReadColumn(int i,
+ std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+
+ // NOTE: Experimental API
+ // Reads a specific top level schema field into an Array
+  // The index i refers to the index of the top level schema field, which may
+ // be nested or flat - e.g.
+ //
+ // 0 foo.bar
+ // foo.bar.baz
+ // foo.qux
+ // 1 foo2
+ // 2 foo3
+ //
+  // i=0 will read the entire foo struct, i=1 the foo2 primitive column, etc.
+ virtual ::arrow::Status ReadSchemaField(
+ int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+
+ /// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
+ ///
+ /// Note that the ordering in row_group_indices matters. FileReaders must outlive
+ /// their RecordBatchReaders.
+ ///
+ /// \returns error Status if row_group_indices contains an invalid index
+ virtual ::arrow::Status GetRecordBatchReader(
+ const std::vector<int>& row_group_indices,
+ std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
+
+ ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ std::shared_ptr<::arrow::RecordBatchReader>* out);
+
+ /// \brief Return a RecordBatchReader of row groups selected from
+ /// row_group_indices, whose columns are selected by column_indices.
+ ///
+ /// Note that the ordering in row_group_indices and column_indices
+ /// matter. FileReaders must outlive their RecordBatchReaders.
+ ///
+ /// \returns error Status if either row_group_indices or column_indices
+ /// contains an invalid index
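+  ///
+  /// A sketch of streaming the selected data (assuming `reader` is a live
+  /// FileReader and the indices are valid):
+  ///
+  ///   std::unique_ptr<::arrow::RecordBatchReader> rb_reader;
+  ///   RETURN_NOT_OK(reader->GetRecordBatchReader({0, 1}, {0}, &rb_reader));
+  ///   std::shared_ptr<::arrow::RecordBatch> batch;
+  ///   while (true) {
+  ///     RETURN_NOT_OK(rb_reader->ReadNext(&batch));
+  ///     if (batch == nullptr) break;  // stream exhausted
+  ///     // ... consume batch ...
+  ///   }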
+ virtual ::arrow::Status GetRecordBatchReader(
+ const std::vector<int>& row_group_indices, const std::vector<int>& column_indices,
+ std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
+
+ /// \brief Return a generator of record batches.
+ ///
+ /// The FileReader must outlive the generator, so this requires that you pass in a
+ /// shared_ptr.
+ ///
+ /// \returns error Result if either row_group_indices or column_indices contains an
+ /// invalid index
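+  ///
+  /// A sketch of pulling one batch from the generator (assuming `reader` is a
+  /// std::shared_ptr<FileReader>; `result()` blocks on the Future and is shown
+  /// here only for brevity):
+  ///
+  ///   ARROW_ASSIGN_OR_RAISE(
+  ///       auto batch_gen, reader->GetRecordBatchGenerator(reader, {0}, {0}));
+  ///   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<::arrow::RecordBatch> batch,
+  ///                         batch_gen().result());
+  ///   // a null batch signals the end of the stream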
+ virtual ::arrow::Result<
+ std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
+ GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
+ const std::vector<int> row_group_indices,
+ const std::vector<int> column_indices,
+ ::arrow::internal::Executor* cpu_executor = NULLPTR) = 0;
+
+ ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::RecordBatchReader>* out);
+
+ /// Read all columns into a Table
+ virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
+
+ /// \brief Read the given columns into a Table
+ ///
+ /// The indicated column indices are relative to the schema
+ virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ /// \brief Scan file contents with one thread, return number of rows
+ virtual ::arrow::Status ScanContents(std::vector<int> columns,
+ const int32_t column_batch_size,
+ int64_t* num_rows) = 0;
+
+  /// \brief Return a reader for the RowGroup; this object must not outlive
+  ///   the FileReader.
+ virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;
+
+ /// \brief The number of row groups in the file
+ virtual int num_row_groups() const = 0;
+
+ virtual ParquetFileReader* parquet_reader() const = 0;
+
+ /// Set whether to use multiple threads during reads of multiple columns.
+ /// By default only one thread is used.
+ virtual void set_use_threads(bool use_threads) = 0;
+
+ /// Set number of records to read per batch for the RecordBatchReader.
+ virtual void set_batch_size(int64_t batch_size) = 0;
+
+ virtual const ArrowReaderProperties& properties() const = 0;
+
+ virtual const SchemaManifest& manifest() const = 0;
+
+ virtual ~FileReader() = default;
+};
+
+class RowGroupReader {
+ public:
+ virtual ~RowGroupReader() = default;
+ virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
+ virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+ virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
+
+ private:
+ struct Iterator;
+};
+
+class ColumnChunkReader {
+ public:
+ virtual ~ColumnChunkReader() = default;
+ virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+};
+
+// At this point, the column reader is a stream iterator. It only knows how to
+// read the next batch of values for a particular column from the file until it
+// runs out.
+//
+// We also do not expose any internal Parquet details, such as row groups. This
+// might change in the future.
+class PARQUET_EXPORT ColumnReader {
+ public:
+ virtual ~ColumnReader() = default;
+
+  // Scan the next array of the indicated size. The actual size of the
+  // returned array may be less than the passed size depending on how much
+  // data is available in the file.
+ //
+ // When all the data in the file has been exhausted, the result is set to
+ // nullptr.
+ //
+ // Returns Status::OK on a successful read, including if you have exhausted
+ // the data available in the file.
+ virtual ::arrow::Status NextBatch(int64_t batch_size,
+ std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+};
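+
+// For instance, scanning a single column in slices (a sketch; `file_reader`
+// is a FileReader obtained elsewhere):
+//
+//   std::unique_ptr<ColumnReader> column;
+//   RETURN_NOT_OK(file_reader->GetColumn(0, &column));
+//   std::shared_ptr<::arrow::ChunkedArray> chunk;
+//   do {
+//     RETURN_NOT_OK(column->NextBatch(4096, &chunk));
+//     // ... consume chunk if non-null ...
+//   } while (chunk != nullptr);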
+
+/// \brief Experimental helper class for bindings (like Python) that struggle
+/// with either std::move or C++ exceptions
+class PARQUET_EXPORT FileReaderBuilder {
+ public:
+ FileReaderBuilder();
+
+ /// Create FileReaderBuilder from Arrow file and optional properties / metadata
+ ::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
+ const ReaderProperties& properties = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ ParquetFileReader* raw_reader() { return raw_reader_.get(); }
+
+ /// Set Arrow MemoryPool for memory allocation
+ FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
+ /// Set Arrow reader properties
+ FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
+ /// Build FileReader instance
+ ::arrow::Status Build(std::unique_ptr<FileReader>* out);
+
+ private:
+ ::arrow::MemoryPool* pool_;
+ ArrowReaderProperties properties_;
+ std::unique_ptr<ParquetFileReader> raw_reader_;
+};
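+
+// OpenFile() below is implemented in terms of this builder; a caller wanting
+// non-default properties could do roughly the following (a sketch, with
+// `file`, `reader_props` and `arrow_props` supplied by the caller):
+//
+//   FileReaderBuilder builder;
+//   RETURN_NOT_OK(builder.Open(file, reader_props));
+//   std::unique_ptr<FileReader> reader;
+//   RETURN_NOT_OK(builder.properties(arrow_props)->Build(&reader));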
+
+/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
+///
+/// @{
+
+/// \brief Build FileReader from Arrow file and MemoryPool
+///
+/// Advanced settings are supported through the FileReaderBuilder class.
+PARQUET_EXPORT
+::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>,
+ ::arrow::MemoryPool* allocator,
+ std::unique_ptr<FileReader>* reader);
+
+/// @}
+
+PARQUET_EXPORT
+::arrow::Status StatisticsAsScalars(const Statistics& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max);
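+
+// e.g. reading the min/max of a column chunk from the file metadata (a
+// sketch; `md` is assumed to be a FileMetaData obtained from the reader):
+//
+//   auto stats = md->RowGroup(0)->ColumnChunk(0)->statistics();
+//   std::shared_ptr<::arrow::Scalar> min, max;
+//   if (stats != nullptr && stats->HasMinMax()) {
+//     RETURN_NOT_OK(StatisticsAsScalars(*stats, &min, &max));
+//   }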
+
+namespace internal {
+
+PARQUET_EXPORT
+::arrow::Status FuzzReader(const uint8_t* data, int64_t size);
+
+} // namespace internal
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc
new file mode 100644
index 00000000000..f13687079d4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc
@@ -0,0 +1,791 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/reader_internal.h"
+
+#include <algorithm>
+#include <climits>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/compute/api.h"
+#include "arrow/datum.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/base64.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/arrow/reader.h"
+#include "parquet/arrow/schema.h"
+#include "parquet/arrow/schema_internal.h"
+#include "parquet/column_reader.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+// Required after "arrow/util/int_util_internal.h" (for OPTIONAL)
+#include "parquet/windows_compatibility.h"
+
+using arrow::Array;
+using arrow::BooleanArray;
+using arrow::ChunkedArray;
+using arrow::DataType;
+using arrow::Datum;
+using arrow::Decimal128;
+using arrow::Decimal128Array;
+using arrow::Decimal128Type;
+using arrow::Decimal256;
+using arrow::Decimal256Array;
+using arrow::Decimal256Type;
+using arrow::Field;
+using arrow::Int32Array;
+using arrow::ListArray;
+using arrow::MemoryPool;
+using arrow::ResizableBuffer;
+using arrow::Status;
+using arrow::StructArray;
+using arrow::Table;
+using arrow::TimestampArray;
+
+using ::arrow::BitUtil::FromBigEndian;
+using ::arrow::internal::checked_cast;
+using ::arrow::internal::checked_pointer_cast;
+using ::arrow::internal::SafeLeftShift;
+using ::arrow::util::SafeLoadAs;
+
+using parquet::internal::BinaryRecordReader;
+using parquet::internal::DictionaryRecordReader;
+using parquet::internal::RecordReader;
+using parquet::schema::GroupNode;
+using parquet::schema::Node;
+using parquet::schema::PrimitiveNode;
+using ParquetType = parquet::Type;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+namespace arrow {
+namespace {
+
+template <typename ArrowType>
+using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
+
+template <typename CType, typename StatisticsType>
+Status MakeMinMaxScalar(const StatisticsType& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ *min = ::arrow::MakeScalar(static_cast<CType>(statistics.min()));
+ *max = ::arrow::MakeScalar(static_cast<CType>(statistics.max()));
+ return Status::OK();
+}
+
+template <typename CType, typename StatisticsType>
+Status MakeMinMaxTypedScalar(const StatisticsType& statistics,
+ std::shared_ptr<DataType> type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min()));
+ ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max()));
+ return Status::OK();
+}
+
+template <typename StatisticsType>
+Status MakeMinMaxIntegralScalar(const StatisticsType& statistics,
+ const ::arrow::DataType& arrow_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ const auto column_desc = statistics.descr();
+ const auto& logical_type = column_desc->logical_type();
+ const auto& integer = checked_pointer_cast<const IntLogicalType>(logical_type);
+ const bool is_signed = integer->is_signed();
+
+ switch (integer->bit_width()) {
+ case 8:
+ return is_signed ? MakeMinMaxScalar<int8_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint8_t>(statistics, min, max);
+ case 16:
+ return is_signed ? MakeMinMaxScalar<int16_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint16_t>(statistics, min, max);
+ case 32:
+ return is_signed ? MakeMinMaxScalar<int32_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint32_t>(statistics, min, max);
+ case 64:
+ return is_signed ? MakeMinMaxScalar<int64_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint64_t>(statistics, min, max);
+ }
+
+ return Status::OK();
+}
+
+static Status FromInt32Statistics(const Int32Statistics& statistics,
+ const LogicalType& logical_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ ARROW_ASSIGN_OR_RAISE(auto type, FromInt32(logical_type));
+
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeMinMaxIntegralScalar(statistics, *type, min, max);
+ break;
+ case LogicalType::Type::DATE:
+ case LogicalType::Type::TIME:
+ case LogicalType::Type::NONE:
+ return MakeMinMaxTypedScalar<int32_t>(statistics, type, min, max);
+ break;
+ default:
+ break;
+ }
+
+  return Status::NotImplemented("Cannot extract statistics for type ",
+                                logical_type.ToString());
+}
+
+static Status FromInt64Statistics(const Int64Statistics& statistics,
+ const LogicalType& logical_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ ARROW_ASSIGN_OR_RAISE(auto type, FromInt64(logical_type));
+
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeMinMaxIntegralScalar(statistics, *type, min, max);
+ break;
+ case LogicalType::Type::TIME:
+ case LogicalType::Type::TIMESTAMP:
+ case LogicalType::Type::NONE:
+ return MakeMinMaxTypedScalar<int64_t>(statistics, type, min, max);
+ break;
+ default:
+ break;
+ }
+
+  return Status::NotImplemented("Cannot extract statistics for type ",
+                                logical_type.ToString());
+}
+
+template <typename DecimalType>
+Result<std::shared_ptr<::arrow::Scalar>> FromBigEndianString(
+ const std::string& data, std::shared_ptr<DataType> arrow_type) {
+ ARROW_ASSIGN_OR_RAISE(
+ DecimalType decimal,
+ DecimalType::FromBigEndian(reinterpret_cast<const uint8_t*>(data.data()),
+ static_cast<int32_t>(data.size())));
+ return ::arrow::MakeScalar(std::move(arrow_type), decimal);
+}
+
+// Extracts min and max scalars from byte-like types (i.e. types where the
+// decimal is encoded as big-endian bytes).
+Status ExtractDecimalMinMaxFromBytesType(const Statistics& statistics,
+ const LogicalType& logical_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ const DecimalLogicalType& decimal_type =
+ checked_cast<const DecimalLogicalType&>(logical_type);
+
+ Result<std::shared_ptr<DataType>> maybe_type =
+ Decimal128Type::Make(decimal_type.precision(), decimal_type.scale());
+ std::shared_ptr<DataType> arrow_type;
+ if (maybe_type.ok()) {
+ arrow_type = maybe_type.ValueOrDie();
+ ARROW_ASSIGN_OR_RAISE(
+ *min, FromBigEndianString<Decimal128>(statistics.EncodeMin(), arrow_type));
+ ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString<Decimal128>(statistics.EncodeMax(),
+ std::move(arrow_type)));
+ return Status::OK();
+ }
+ // Fallback to see if Decimal256 can represent the type.
+ ARROW_ASSIGN_OR_RAISE(
+ arrow_type, Decimal256Type::Make(decimal_type.precision(), decimal_type.scale()));
+ ARROW_ASSIGN_OR_RAISE(
+ *min, FromBigEndianString<Decimal256>(statistics.EncodeMin(), arrow_type));
+ ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString<Decimal256>(statistics.EncodeMax(),
+ std::move(arrow_type)));
+
+ return Status::OK();
+}
+
+Status ByteArrayStatisticsAsScalars(const Statistics& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ auto logical_type = statistics.descr()->logical_type();
+ if (logical_type->type() == LogicalType::Type::DECIMAL) {
+ return ExtractDecimalMinMaxFromBytesType(statistics, *logical_type, min, max);
+ }
+ std::shared_ptr<::arrow::DataType> type;
+ if (statistics.descr()->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
+ type = ::arrow::fixed_size_binary(statistics.descr()->type_length());
+ } else {
+ type = logical_type->type() == LogicalType::Type::STRING ? ::arrow::utf8()
+ : ::arrow::binary();
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ *min, ::arrow::MakeScalar(type, Buffer::FromString(statistics.EncodeMin())));
+ ARROW_ASSIGN_OR_RAISE(
+ *max, ::arrow::MakeScalar(type, Buffer::FromString(statistics.EncodeMax())));
+
+ return Status::OK();
+}
+
+} // namespace
+
+Status StatisticsAsScalars(const Statistics& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ if (!statistics.HasMinMax()) {
+ return Status::Invalid("Statistics has no min max.");
+ }
+
+ auto column_desc = statistics.descr();
+ if (column_desc == nullptr) {
+ return Status::Invalid("Statistics carries no descriptor, can't infer arrow type.");
+ }
+
+ auto physical_type = column_desc->physical_type();
+ auto logical_type = column_desc->logical_type();
+ switch (physical_type) {
+ case Type::BOOLEAN:
+ return MakeMinMaxScalar<bool, BoolStatistics>(
+ checked_cast<const BoolStatistics&>(statistics), min, max);
+ case Type::FLOAT:
+ return MakeMinMaxScalar<float, FloatStatistics>(
+ checked_cast<const FloatStatistics&>(statistics), min, max);
+ case Type::DOUBLE:
+ return MakeMinMaxScalar<double, DoubleStatistics>(
+ checked_cast<const DoubleStatistics&>(statistics), min, max);
+ case Type::INT32:
+ return FromInt32Statistics(checked_cast<const Int32Statistics&>(statistics),
+ *logical_type, min, max);
+ case Type::INT64:
+ return FromInt64Statistics(checked_cast<const Int64Statistics&>(statistics),
+ *logical_type, min, max);
+ case Type::BYTE_ARRAY:
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return ByteArrayStatisticsAsScalars(statistics, min, max);
+ default:
+      return Status::NotImplemented(
+          "Extracting statistics is unsupported for physical_type ", physical_type);
+ }
+
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Primitive types
+
+namespace {
+
+template <typename ArrowType, typename ParquetType>
+Status TransferInt(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ using ArrowCType = typename ArrowType::c_type;
+ using ParquetCType = typename ParquetType::c_type;
+ int64_t length = reader->values_written();
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ ::arrow::AllocateBuffer(length * sizeof(ArrowCType), pool));
+
+ auto values = reinterpret_cast<const ParquetCType*>(reader->values());
+ auto out_ptr = reinterpret_cast<ArrowCType*>(data->mutable_data());
+ std::copy(values, values + length, out_ptr);
+ *out = std::make_shared<ArrayType<ArrowType>>(
+ type, length, std::move(data), reader->ReleaseIsValid(), reader->null_count());
+ return Status::OK();
+}
+
+std::shared_ptr<Array> TransferZeroCopy(RecordReader* reader,
+ const std::shared_ptr<DataType>& type) {
+ std::vector<std::shared_ptr<Buffer>> buffers = {reader->ReleaseIsValid(),
+ reader->ReleaseValues()};
+ auto data = std::make_shared<::arrow::ArrayData>(type, reader->values_written(),
+ buffers, reader->null_count());
+ return ::arrow::MakeArray(data);
+}
+
+Status TransferBool(RecordReader* reader, MemoryPool* pool, Datum* out) {
+ int64_t length = reader->values_written();
+
+ const int64_t buffer_size = BitUtil::BytesForBits(length);
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(buffer_size, pool));
+
+ // Transfer boolean values to packed bitmap
+ auto values = reinterpret_cast<const bool*>(reader->values());
+ uint8_t* data_ptr = data->mutable_data();
+ memset(data_ptr, 0, buffer_size);
+
+ for (int64_t i = 0; i < length; i++) {
+ if (values[i]) {
+ ::arrow::BitUtil::SetBit(data_ptr, i);
+ }
+ }
+
+ *out = std::make_shared<BooleanArray>(length, std::move(data), reader->ReleaseIsValid(),
+ reader->null_count());
+ return Status::OK();
+}
+
+Status TransferInt96(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out,
+ const ::arrow::TimeUnit::type int96_arrow_time_unit) {
+ int64_t length = reader->values_written();
+ auto values = reinterpret_cast<const Int96*>(reader->values());
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ ::arrow::AllocateBuffer(length * sizeof(int64_t), pool));
+ auto data_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
+ for (int64_t i = 0; i < length; i++) {
+ if (values[i].value[2] == 0) {
+ // Happens for null entries: avoid triggering UBSAN as that Int96 timestamp
+ // isn't representable as a 64-bit Unix timestamp.
+ *data_ptr++ = 0;
+ } else {
+ switch (int96_arrow_time_unit) {
+ case ::arrow::TimeUnit::NANO:
+ *data_ptr++ = Int96GetNanoSeconds(values[i]);
+ break;
+ case ::arrow::TimeUnit::MICRO:
+ *data_ptr++ = Int96GetMicroSeconds(values[i]);
+ break;
+ case ::arrow::TimeUnit::MILLI:
+ *data_ptr++ = Int96GetMilliSeconds(values[i]);
+ break;
+ case ::arrow::TimeUnit::SECOND:
+ *data_ptr++ = Int96GetSeconds(values[i]);
+ break;
+ }
+ }
+ }
+ *out = std::make_shared<TimestampArray>(type, length, std::move(data),
+ reader->ReleaseIsValid(), reader->null_count());
+ return Status::OK();
+}
+
+Status TransferDate64(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ int64_t length = reader->values_written();
+ auto values = reinterpret_cast<const int32_t*>(reader->values());
+
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ ::arrow::AllocateBuffer(length * sizeof(int64_t), pool));
+ auto out_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
+
+ for (int64_t i = 0; i < length; i++) {
+ *out_ptr++ = static_cast<int64_t>(values[i]) * kMillisecondsPerDay;
+ }
+
+ *out = std::make_shared<::arrow::Date64Array>(
+ type, length, std::move(data), reader->ReleaseIsValid(), reader->null_count());
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Binary, direct to dictionary-encoded
+
+Status TransferDictionary(RecordReader* reader,
+ const std::shared_ptr<DataType>& logical_value_type,
+ std::shared_ptr<ChunkedArray>* out) {
+ auto dict_reader = dynamic_cast<DictionaryRecordReader*>(reader);
+ DCHECK(dict_reader);
+ *out = dict_reader->GetResult();
+ if (!logical_value_type->Equals(*(*out)->type())) {
+ ARROW_ASSIGN_OR_RAISE(*out, (*out)->View(logical_value_type));
+ }
+ return Status::OK();
+}
+
+Status TransferBinary(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& logical_value_type,
+ std::shared_ptr<ChunkedArray>* out) {
+ if (reader->read_dictionary()) {
+ return TransferDictionary(
+ reader, ::arrow::dictionary(::arrow::int32(), logical_value_type), out);
+ }
+ ::arrow::compute::ExecContext ctx(pool);
+ ::arrow::compute::CastOptions cast_options;
+ cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data
+
+ auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
+ DCHECK(binary_reader);
+ auto chunks = binary_reader->GetBuilderChunks();
+ for (auto& chunk : chunks) {
+ if (!chunk->type()->Equals(*logical_value_type)) {
+ // XXX: if a LargeBinary chunk is larger than 2GB, the MSBs of offsets
+ // will be lost because they are first created as int32 and then cast to int64.
+ ARROW_ASSIGN_OR_RAISE(
+ chunk, ::arrow::compute::Cast(*chunk, logical_value_type, cast_options, &ctx));
+ }
+ }
+ *out = std::make_shared<ChunkedArray>(chunks, logical_value_type);
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// INT32 / INT64 / BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 || Decimal256
+
+template <typename DecimalType>
+Status RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width,
+ uint8_t* out_buf) {
+ ARROW_ASSIGN_OR_RAISE(DecimalType t, DecimalType::FromBigEndian(value, byte_width));
+ t.ToBytes(out_buf);
+ return ::arrow::Status::OK();
+}
+
+template <typename DecimalArrayType>
+struct DecimalTypeTrait;
+
+template <>
+struct DecimalTypeTrait<::arrow::Decimal128Array> {
+ using value = ::arrow::Decimal128;
+};
+
+template <>
+struct DecimalTypeTrait<::arrow::Decimal256Array> {
+ using value = ::arrow::Decimal256;
+};
+
+template <typename DecimalArrayType, typename ParquetType>
+struct DecimalConverter {
+ static inline Status ConvertToDecimal(const Array& array,
+ const std::shared_ptr<DataType>&,
+ MemoryPool* pool, std::shared_ptr<Array>*) {
+ return Status::NotImplemented("not implemented");
+ }
+};
+
+template <typename DecimalArrayType>
+struct DecimalConverter<DecimalArrayType, FLBAType> {
+ static inline Status ConvertToDecimal(const Array& array,
+ const std::shared_ptr<DataType>& type,
+ MemoryPool* pool, std::shared_ptr<Array>* out) {
+ const auto& fixed_size_binary_array =
+ checked_cast<const ::arrow::FixedSizeBinaryArray&>(array);
+
+ // The byte width of each decimal value
+ const int32_t type_length =
+ checked_cast<const ::arrow::DecimalType&>(*type).byte_width();
+
+ // number of elements in the entire array
+ const int64_t length = fixed_size_binary_array.length();
+
+ // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time
+ // this will be different from the decimal array width because we write the minimum
+ // number of bytes necessary to represent a given precision
+ const int32_t byte_width =
+ checked_cast<const ::arrow::FixedSizeBinaryType&>(*fixed_size_binary_array.type())
+ .byte_width();
+ // allocate memory for the decimal array
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
+
+ // raw bytes that we can write to
+ uint8_t* out_ptr = data->mutable_data();
+
+ // convert each FixedSizeBinary value to valid decimal bytes
+ const int64_t null_count = fixed_size_binary_array.null_count();
+
+ using DecimalType = typename DecimalTypeTrait<DecimalArrayType>::value;
+ if (null_count > 0) {
+ for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
+ if (!fixed_size_binary_array.IsNull(i)) {
+ RETURN_NOT_OK(RawBytesToDecimalBytes<DecimalType>(
+ fixed_size_binary_array.GetValue(i), byte_width, out_ptr));
+ } else {
+ std::memset(out_ptr, 0, type_length);
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
+ RETURN_NOT_OK(RawBytesToDecimalBytes<DecimalType>(
+ fixed_size_binary_array.GetValue(i), byte_width, out_ptr));
+ }
+ }
+
+ *out = std::make_shared<DecimalArrayType>(
+ type, length, std::move(data), fixed_size_binary_array.null_bitmap(), null_count);
+
+ return Status::OK();
+ }
+};
+
+template <typename DecimalArrayType>
+struct DecimalConverter<DecimalArrayType, ByteArrayType> {
+ static inline Status ConvertToDecimal(const Array& array,
+ const std::shared_ptr<DataType>& type,
+ MemoryPool* pool, std::shared_ptr<Array>* out) {
+ const auto& binary_array = checked_cast<const ::arrow::BinaryArray&>(array);
+ const int64_t length = binary_array.length();
+
+ const auto& decimal_type = checked_cast<const ::arrow::DecimalType&>(*type);
+ const int64_t type_length = decimal_type.byte_width();
+
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
+
+ // raw bytes that we can write to
+ uint8_t* out_ptr = data->mutable_data();
+
+ const int64_t null_count = binary_array.null_count();
+
+ // convert each BinaryArray value to valid decimal bytes
+ for (int64_t i = 0; i < length; i++, out_ptr += type_length) {
+ int32_t record_len = 0;
+ const uint8_t* record_loc = binary_array.GetValue(i, &record_len);
+
+ if (record_len < 0 || record_len > type_length) {
+ return Status::Invalid("Invalid BYTE_ARRAY length for ", type->ToString());
+ }
+
+ auto out_ptr_view = reinterpret_cast<uint64_t*>(out_ptr);
+ out_ptr_view[0] = 0;
+ out_ptr_view[1] = 0;
+
+      // If there are nulls, convert only the rows that are not null;
+      // otherwise convert every row.
+ if ((null_count > 0 && !binary_array.IsNull(i)) || null_count <= 0) {
+ using DecimalType = typename DecimalTypeTrait<DecimalArrayType>::value;
+ RETURN_NOT_OK(
+ RawBytesToDecimalBytes<DecimalType>(record_loc, record_len, out_ptr));
+ }
+ }
+ *out = std::make_shared<DecimalArrayType>(type, length, std::move(data),
+ binary_array.null_bitmap(), null_count);
+ return Status::OK();
+ }
+};
+
+/// \brief Convert an Int32 or Int64 array into a Decimal128Array
+/// The parquet spec allows systems to write decimals as int32 or int64 if the
+/// values are small enough to fit in 4 or 8 bytes, respectively.
+/// This function implements the conversion from int32 and int64 arrays to decimal arrays.
+template <
+ typename ParquetIntegerType,
+ typename = ::arrow::enable_if_t<std::is_same<ParquetIntegerType, Int32Type>::value ||
+ std::is_same<ParquetIntegerType, Int64Type>::value>>
+static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ // Decimal128 and Decimal256 are only Arrow constructs. Parquet does not
+ // specifically distinguish between decimal byte widths.
+ // Decimal256 isn't relevant here because the Arrow-Parquet C++ bindings never
+ // write Decimal values as integers and if the decimal value can fit in an
+ // integer it is wasteful to use Decimal256. Put another way, the only
+ // way an integer column could be construed as Decimal256 is if an arrow
+ // schema was stored as metadata in the file indicating the column was
+ // Decimal256. The current Arrow-Parquet C++ bindings will never do this.
+ DCHECK(type->id() == ::arrow::Type::DECIMAL128);
+
+ const int64_t length = reader->values_written();
+
+ using ElementType = typename ParquetIntegerType::c_type;
+ static_assert(std::is_same<ElementType, int32_t>::value ||
+ std::is_same<ElementType, int64_t>::value,
+ "ElementType must be int32_t or int64_t");
+
+ const auto values = reinterpret_cast<const ElementType*>(reader->values());
+
+ const auto& decimal_type = checked_cast<const ::arrow::DecimalType&>(*type);
+ const int64_t type_length = decimal_type.byte_width();
+
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
+ uint8_t* out_ptr = data->mutable_data();
+
+ using ::arrow::BitUtil::FromLittleEndian;
+
+ for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
+ // sign/zero extend int32_t values, otherwise a no-op
+ const auto value = static_cast<int64_t>(values[i]);
+
+ ::arrow::Decimal128 decimal(value);
+ decimal.ToBytes(out_ptr);
+ }
+
+ if (reader->nullable_values()) {
+ std::shared_ptr<ResizableBuffer> is_valid = reader->ReleaseIsValid();
+ *out = std::make_shared<Decimal128Array>(type, length, std::move(data), is_valid,
+ reader->null_count());
+ } else {
+ *out = std::make_shared<Decimal128Array>(type, length, std::move(data));
+ }
+ return Status::OK();
+}
+
+/// \brief Convert an arrow::BinaryArray to an arrow::Decimal{128,256}Array
+/// We do this by:
+/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
+/// 2. Allocating a buffer for the arrow::Decimal{128,256}Array
+/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers
+/// representing the high and low bits of each decimal value.
+template <typename DecimalArrayType, typename ParquetType>
+Status TransferDecimal(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
+ DCHECK(binary_reader);
+ ::arrow::ArrayVector chunks = binary_reader->GetBuilderChunks();
+ for (size_t i = 0; i < chunks.size(); ++i) {
+ std::shared_ptr<Array> chunk_as_decimal;
+ auto fn = &DecimalConverter<DecimalArrayType, ParquetType>::ConvertToDecimal;
+ RETURN_NOT_OK(fn(*chunks[i], type, pool, &chunk_as_decimal));
+ // Replace the chunk, which will hopefully also free memory as we go
+ chunks[i] = chunk_as_decimal;
+ }
+ *out = std::make_shared<ChunkedArray>(chunks, type);
+ return Status::OK();
+}
+
+} // namespace
+
+#define TRANSFER_INT32(ENUM, ArrowType) \
+ case ::arrow::Type::ENUM: { \
+ Status s = TransferInt<ArrowType, Int32Type>(reader, pool, value_type, &result); \
+ RETURN_NOT_OK(s); \
+ } break;
+
+#define TRANSFER_INT64(ENUM, ArrowType) \
+ case ::arrow::Type::ENUM: { \
+ Status s = TransferInt<ArrowType, Int64Type>(reader, pool, value_type, &result); \
+ RETURN_NOT_OK(s); \
+ } break;
+
+Status TransferColumnData(RecordReader* reader, std::shared_ptr<DataType> value_type,
+ const ColumnDescriptor* descr, MemoryPool* pool,
+ std::shared_ptr<ChunkedArray>* out) {
+ Datum result;
+ std::shared_ptr<ChunkedArray> chunked_result;
+ switch (value_type->id()) {
+ case ::arrow::Type::DICTIONARY: {
+ RETURN_NOT_OK(TransferDictionary(reader, value_type, &chunked_result));
+ result = chunked_result;
+ } break;
+ case ::arrow::Type::NA: {
+ result = std::make_shared<::arrow::NullArray>(reader->values_written());
+ break;
+ }
+ case ::arrow::Type::INT32:
+ case ::arrow::Type::INT64:
+ case ::arrow::Type::FLOAT:
+ case ::arrow::Type::DOUBLE:
+ result = TransferZeroCopy(reader, value_type);
+ break;
+ case ::arrow::Type::BOOL:
+ RETURN_NOT_OK(TransferBool(reader, pool, &result));
+ break;
+ TRANSFER_INT32(UINT8, ::arrow::UInt8Type);
+ TRANSFER_INT32(INT8, ::arrow::Int8Type);
+ TRANSFER_INT32(UINT16, ::arrow::UInt16Type);
+ TRANSFER_INT32(INT16, ::arrow::Int16Type);
+ TRANSFER_INT32(UINT32, ::arrow::UInt32Type);
+ TRANSFER_INT64(UINT64, ::arrow::UInt64Type);
+ TRANSFER_INT32(DATE32, ::arrow::Date32Type);
+ TRANSFER_INT32(TIME32, ::arrow::Time32Type);
+ TRANSFER_INT64(TIME64, ::arrow::Time64Type);
+ case ::arrow::Type::DATE64:
+ RETURN_NOT_OK(TransferDate64(reader, pool, value_type, &result));
+ break;
+ case ::arrow::Type::FIXED_SIZE_BINARY:
+ case ::arrow::Type::BINARY:
+ case ::arrow::Type::STRING:
+ case ::arrow::Type::LARGE_BINARY:
+ case ::arrow::Type::LARGE_STRING: {
+ RETURN_NOT_OK(TransferBinary(reader, pool, value_type, &chunked_result));
+ result = chunked_result;
+ } break;
+ case ::arrow::Type::DECIMAL128: {
+ switch (descr->physical_type()) {
+ case ::parquet::Type::INT32: {
+          auto fn = &DecimalIntegerTransfer<Int32Type>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::INT64: {
+ auto fn = &DecimalIntegerTransfer<Int64Type>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal128Array, ByteArrayType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal128Array, FLBAType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ default:
+ return Status::Invalid(
+ "Physical type for decimal128 must be int32, int64, byte array, or fixed "
+ "length binary");
+ }
+ } break;
+ case ::arrow::Type::DECIMAL256:
+ switch (descr->physical_type()) {
+ case ::parquet::Type::BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal256Array, ByteArrayType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal256Array, FLBAType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ default:
+ return Status::Invalid(
+ "Physical type for decimal256 must be fixed length binary");
+ }
+ break;
+
+ case ::arrow::Type::TIMESTAMP: {
+ const ::arrow::TimestampType& timestamp_type =
+ checked_cast<::arrow::TimestampType&>(*value_type);
+ if (descr->physical_type() == ::parquet::Type::INT96) {
+ RETURN_NOT_OK(
+ TransferInt96(reader, pool, value_type, &result, timestamp_type.unit()));
+ } else {
+ switch (timestamp_type.unit()) {
+ case ::arrow::TimeUnit::MILLI:
+ case ::arrow::TimeUnit::MICRO:
+ case ::arrow::TimeUnit::NANO:
+ result = TransferZeroCopy(reader, value_type);
+ break;
+ default:
+ return Status::NotImplemented("TimeUnit not supported");
+ }
+ }
+ } break;
+ default:
+ return Status::NotImplemented("No support for reading columns of type ",
+ value_type->ToString());
+ }
+
+ if (result.kind() == Datum::ARRAY) {
+ *out = std::make_shared<ChunkedArray>(result.make_array());
+ } else if (result.kind() == Datum::CHUNKED_ARRAY) {
+ *out = result.chunked_array();
+ } else {
+ DCHECK(false) << "Should be impossible, result was " << result.ToString();
+ }
+
+ return Status::OK();
+}
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h
new file mode 100644
index 00000000000..ad0b781576f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "parquet/arrow/schema.h"
+#include "parquet/column_reader.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class DataType;
+class Field;
+class KeyValueMetadata;
+class Schema;
+
+} // namespace arrow
+
+using arrow::Status;
+
+namespace parquet {
+
+class ArrowReaderProperties;
+
+namespace arrow {
+
+class ColumnReaderImpl;
+
+// ----------------------------------------------------------------------
+// Iteration utilities
+
+// Abstraction to decouple row group iteration details from the ColumnReader,
+// so we can read only a single row group if we want
+class FileColumnIterator {
+ public:
+ explicit FileColumnIterator(int column_index, ParquetFileReader* reader,
+ std::vector<int> row_groups)
+ : column_index_(column_index),
+ reader_(reader),
+ schema_(reader->metadata()->schema()),
+ row_groups_(row_groups.begin(), row_groups.end()) {}
+
+ virtual ~FileColumnIterator() {}
+
+ std::unique_ptr<::parquet::PageReader> NextChunk() {
+ if (row_groups_.empty()) {
+ return nullptr;
+ }
+
+ auto row_group_reader = reader_->RowGroup(row_groups_.front());
+ row_groups_.pop_front();
+ return row_group_reader->GetColumnPageReader(column_index_);
+ }
+
+ const SchemaDescriptor* schema() const { return schema_; }
+
+ const ColumnDescriptor* descr() const { return schema_->Column(column_index_); }
+
+ std::shared_ptr<FileMetaData> metadata() const { return reader_->metadata(); }
+
+ int column_index() const { return column_index_; }
+
+ protected:
+ int column_index_;
+ ParquetFileReader* reader_;
+ const SchemaDescriptor* schema_;
+ std::deque<int> row_groups_;
+};
+
+using FileColumnIteratorFactory =
+ std::function<FileColumnIterator*(int, ParquetFileReader*)>;
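+
+// For example, a factory visiting every row group of the file might look like
+// this sketch (illustrative only; callers usually pass an explicit subset):
+//
+//   FileColumnIteratorFactory all_row_groups_factory =
+//       [](int i, ParquetFileReader* reader) {
+//         std::vector<int> row_groups(reader->metadata()->num_row_groups());
+//         std::iota(row_groups.begin(), row_groups.end(), 0);  // <numeric>
+//         return new FileColumnIterator(i, reader, std::move(row_groups));
+//       };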
+
+Status TransferColumnData(::parquet::internal::RecordReader* reader,
+ std::shared_ptr<::arrow::DataType> value_type,
+ const ColumnDescriptor* descr, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::ChunkedArray>* out);
+
+struct ReaderContext {
+ ParquetFileReader* reader;
+ ::arrow::MemoryPool* pool;
+ FileColumnIteratorFactory iterator_factory;
+ bool filter_leaves;
+ std::shared_ptr<std::unordered_set<int>> included_leaves;
+
+ bool IncludesLeaf(int leaf_index) const {
+ if (this->filter_leaves) {
+ return this->included_leaves->find(leaf_index) != this->included_leaves->end();
+ }
+ return true;
+ }
+};
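+
+// Illustrative example: with filter_leaves = true and included_leaves = {0, 2},
+// IncludesLeaf(0) and IncludesLeaf(2) return true while IncludesLeaf(1)
+// returns false; with filter_leaves = false every leaf is included regardless
+// of the set's contents.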
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc
new file mode 100644
index 00000000000..eb7fd628dfc
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc
@@ -0,0 +1,1087 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/schema.h"
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "arrow/extension_type.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/api.h"
+#include "arrow/result_internal.h"
+#include "arrow/type.h"
+#include "arrow/util/base64.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/value_parsing.h"
+
+#include "parquet/arrow/schema_internal.h"
+#include "parquet/exception.h"
+#include "parquet/properties.h"
+#include "parquet/types.h"
+
+using arrow::DecimalType;
+using arrow::Field;
+using arrow::FieldVector;
+using arrow::KeyValueMetadata;
+using arrow::Status;
+using arrow::internal::checked_cast;
+
+using ArrowType = arrow::DataType;
+using ArrowTypeId = arrow::Type;
+
+using parquet::Repetition;
+using parquet::schema::GroupNode;
+using parquet::schema::Node;
+using parquet::schema::NodePtr;
+using parquet::schema::PrimitiveNode;
+
+using ParquetType = parquet::Type;
+using parquet::ConvertedType;
+using parquet::LogicalType;
+
+using parquet::internal::LevelInfo;
+
+namespace parquet {
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Parquet to Arrow schema conversion
+
+namespace {
+
+Repetition::type RepetitionFromNullable(bool is_nullable) {
+ return is_nullable ? Repetition::OPTIONAL : Repetition::REQUIRED;
+}
+
+Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out);
+
+Status ListToNode(const std::shared_ptr<::arrow::BaseListType>& type,
+ const std::string& name, bool nullable,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ NodePtr element;
+ std::string value_name =
+ arrow_properties.compliant_nested_types() ? "element" : type->value_field()->name();
+ RETURN_NOT_OK(FieldToNode(value_name, type->value_field(), properties, arrow_properties,
+ &element));
+
+ NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {list},
+ LogicalType::List());
+ return Status::OK();
+}
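+
+// Illustrative example: with compliant_nested_types enabled, an Arrow field
+// `numbers: list<item: int32>` passed through the function above yields the
+// three-level Parquet structure
+//
+//   optional group numbers (LIST) {
+//     repeated group list {
+//       optional int32 element;
+//     }
+//   }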
+
+Status MapToNode(const std::shared_ptr<::arrow::MapType>& type, const std::string& name,
+ bool nullable, const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ // TODO: Should we offer a non-compliant mode that forwards the type names?
+ NodePtr key_node;
+ RETURN_NOT_OK(
+ FieldToNode("key", type->key_field(), properties, arrow_properties, &key_node));
+
+ NodePtr value_node;
+ RETURN_NOT_OK(FieldToNode("value", type->item_field(), properties, arrow_properties,
+ &value_node));
+
+ NodePtr key_value =
+ GroupNode::Make("key_value", Repetition::REPEATED, {key_node, value_node});
+ *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {key_value},
+ LogicalType::Map());
+ return Status::OK();
+}
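+
+// Illustrative example: an Arrow field `scores: map<string, int32>` passed
+// through the function above yields
+//
+//   optional group scores (MAP) {
+//     repeated group key_value {
+//       required binary key (STRING);
+//       optional int32 value;
+//     }
+//   }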
+
+Status StructToNode(const std::shared_ptr<::arrow::StructType>& type,
+ const std::string& name, bool nullable,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ std::vector<NodePtr> children(type->num_fields());
+ if (type->num_fields() != 0) {
+ for (int i = 0; i < type->num_fields(); i++) {
+ RETURN_NOT_OK(FieldToNode(type->field(i)->name(), type->field(i), properties,
+ arrow_properties, &children[i]));
+ }
+ } else {
+ // XXX (ARROW-10928) We could add a dummy primitive node but that would
+ // require special handling when writing and reading, to avoid column index
+ // mismatches.
+ return Status::NotImplemented("Cannot write struct type '", name,
+ "' with no child field to Parquet. "
+ "Consider adding a dummy child field.");
+ }
+
+ *out = GroupNode::Make(name, RepetitionFromNullable(nullable), std::move(children));
+ return Status::OK();
+}
+
+static std::shared_ptr<const LogicalType> TimestampLogicalTypeFromArrowTimestamp(
+ const ::arrow::TimestampType& timestamp_type, ::arrow::TimeUnit::type time_unit) {
+ const bool utc = !(timestamp_type.timezone().empty());
+ // ARROW-5878(wesm): for forward compatibility reasons, and because
+ // there's no other way to signal to old readers that values are
+ // timestamps, we force the ConvertedType field to be set to the
+ // corresponding TIMESTAMP_* value. This does cause some ambiguity
+ // as Parquet readers have not been consistent about the
+ // interpretation of TIMESTAMP_* values as being UTC-normalized.
+ switch (time_unit) {
+ case ::arrow::TimeUnit::MILLI:
+ return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MILLIS,
+ /*is_from_converted_type=*/false,
+ /*force_set_converted_type=*/true);
+ case ::arrow::TimeUnit::MICRO:
+ return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MICROS,
+ /*is_from_converted_type=*/false,
+ /*force_set_converted_type=*/true);
+ case ::arrow::TimeUnit::NANO:
+ return LogicalType::Timestamp(utc, LogicalType::TimeUnit::NANOS);
+ case ::arrow::TimeUnit::SECOND:
+ // No equivalent parquet logical type.
+ break;
+ }
+ return LogicalType::None();
+}
+
+static Status GetTimestampMetadata(const ::arrow::TimestampType& type,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ ParquetType::type* physical_type,
+ std::shared_ptr<const LogicalType>* logical_type) {
+ const bool coerce = arrow_properties.coerce_timestamps_enabled();
+ const auto target_unit =
+ coerce ? arrow_properties.coerce_timestamps_unit() : type.unit();
+
+ // The user is explicitly asking for Impala int96 encoding; in that case
+ // there is no logical type.
+ if (arrow_properties.support_deprecated_int96_timestamps()) {
+ *physical_type = ParquetType::INT96;
+ return Status::OK();
+ }
+
+ *physical_type = ParquetType::INT64;
+ *logical_type = TimestampLogicalTypeFromArrowTimestamp(type, target_unit);
+
+ // The user is explicitly asking for timestamp data to be converted to the
+ // specified units (target_unit).
+ if (coerce) {
+ if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
+ switch (target_unit) {
+ case ::arrow::TimeUnit::MILLI:
+ case ::arrow::TimeUnit::MICRO:
+ break;
+ case ::arrow::TimeUnit::NANO:
+ case ::arrow::TimeUnit::SECOND:
+ return Status::NotImplemented(
+ "For Parquet version 1.0 files, can only coerce Arrow timestamps to "
+ "milliseconds or microseconds");
+ }
+ } else {
+ switch (target_unit) {
+ case ::arrow::TimeUnit::MILLI:
+ case ::arrow::TimeUnit::MICRO:
+ case ::arrow::TimeUnit::NANO:
+ break;
+ case ::arrow::TimeUnit::SECOND:
+ return Status::NotImplemented(
+ "For Parquet files, can only coerce Arrow timestamps to milliseconds, "
+ "microseconds, or nanoseconds");
+ }
+ }
+ return Status::OK();
+ }
+
+ // The user implicitly wants timestamp data to retain its original time units;
+ // however, the ConvertedType field used to indicate logical types for Parquet
+ // version 1.0 fields does not allow for nanosecond time units, so nanoseconds
+ // must be coerced to microseconds.
+ if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0 &&
+ type.unit() == ::arrow::TimeUnit::NANO) {
+ *logical_type =
+ TimestampLogicalTypeFromArrowTimestamp(type, ::arrow::TimeUnit::MICRO);
+ return Status::OK();
+ }
+
+ // The user implicitly wants timestamp data to retain its original time units;
+ // however, the Arrow seconds time unit cannot be represented (annotated) in
+ // any version of Parquet, so it must be coerced to milliseconds.
+ if (type.unit() == ::arrow::TimeUnit::SECOND) {
+ *logical_type =
+ TimestampLogicalTypeFromArrowTimestamp(type, ::arrow::TimeUnit::MILLI);
+ return Status::OK();
+ }
+
+ return Status::OK();
+}
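+
+// To summarize the no-coercion, non-int96 path above: seconds are stored as
+// INT64 TIMESTAMP(MILLIS), nanoseconds under Parquet 1.0 are stored as INT64
+// TIMESTAMP(MICROS), and every other unit/version combination keeps its
+// original unit.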
+
+static constexpr char FIELD_ID_KEY[] = "PARQUET:field_id";
+
+std::shared_ptr<::arrow::KeyValueMetadata> FieldIdMetadata(int field_id) {
+ if (field_id >= 0) {
+ return ::arrow::key_value_metadata({FIELD_ID_KEY}, {std::to_string(field_id)});
+ } else {
+ return nullptr;
+ }
+}
+
+int FieldIdFromMetadata(
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+ if (!metadata) {
+ return -1;
+ }
+ int key = metadata->FindKey(FIELD_ID_KEY);
+ if (key < 0) {
+ return -1;
+ }
+ std::string field_id_str = metadata->value(key);
+ int field_id;
+ if (::arrow::internal::ParseValue<::arrow::Int32Type>(
+ field_id_str.c_str(), field_id_str.length(), &field_id)) {
+ if (field_id < 0) {
+ // Thrift should convert any negative value to null, but normalize to -1 here
+ // in case later logic checks for it.
+ return -1;
+ }
+ return field_id;
+ } else {
+ return -1;
+ }
+}
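+
+// Illustrative example: a field carrying
+//   ::arrow::key_value_metadata({"PARQUET:field_id"}, {"7"})
+// yields FieldIdFromMetadata(field->metadata()) == 7, while absent,
+// unparsable, or negative ids all normalize to -1.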
+
+Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ std::shared_ptr<const LogicalType> logical_type = LogicalType::None();
+ ParquetType::type type;
+ Repetition::type repetition = RepetitionFromNullable(field->nullable());
+
+ int length = -1;
+ int precision = -1;
+ int scale = -1;
+
+ switch (field->type()->id()) {
+ case ArrowTypeId::NA: {
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Null();
+ if (repetition != Repetition::OPTIONAL) {
+ return Status::Invalid("NullType Arrow field must be nullable");
+ }
+ } break;
+ case ArrowTypeId::BOOL:
+ type = ParquetType::BOOLEAN;
+ break;
+ case ArrowTypeId::UINT8:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(8, false);
+ break;
+ case ArrowTypeId::INT8:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(8, true);
+ break;
+ case ArrowTypeId::UINT16:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(16, false);
+ break;
+ case ArrowTypeId::INT16:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(16, true);
+ break;
+ case ArrowTypeId::UINT32:
+ if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
+ type = ParquetType::INT64;
+ } else {
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(32, false);
+ }
+ break;
+ case ArrowTypeId::INT32:
+ type = ParquetType::INT32;
+ break;
+ case ArrowTypeId::UINT64:
+ type = ParquetType::INT64;
+ logical_type = LogicalType::Int(64, false);
+ break;
+ case ArrowTypeId::INT64:
+ type = ParquetType::INT64;
+ break;
+ case ArrowTypeId::FLOAT:
+ type = ParquetType::FLOAT;
+ break;
+ case ArrowTypeId::DOUBLE:
+ type = ParquetType::DOUBLE;
+ break;
+ case ArrowTypeId::LARGE_STRING:
+ case ArrowTypeId::STRING:
+ type = ParquetType::BYTE_ARRAY;
+ logical_type = LogicalType::String();
+ break;
+ case ArrowTypeId::LARGE_BINARY:
+ case ArrowTypeId::BINARY:
+ type = ParquetType::BYTE_ARRAY;
+ break;
+ case ArrowTypeId::FIXED_SIZE_BINARY: {
+ type = ParquetType::FIXED_LEN_BYTE_ARRAY;
+ const auto& fixed_size_binary_type =
+ static_cast<const ::arrow::FixedSizeBinaryType&>(*field->type());
+ length = fixed_size_binary_type.byte_width();
+ } break;
+ case ArrowTypeId::DECIMAL128:
+ case ArrowTypeId::DECIMAL256: {
+ type = ParquetType::FIXED_LEN_BYTE_ARRAY;
+ const auto& decimal_type = static_cast<const ::arrow::DecimalType&>(*field->type());
+ precision = decimal_type.precision();
+ scale = decimal_type.scale();
+ length = DecimalType::DecimalSize(precision);
+ PARQUET_CATCH_NOT_OK(logical_type = LogicalType::Decimal(precision, scale));
+ } break;
+ case ArrowTypeId::DATE32:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Date();
+ break;
+ case ArrowTypeId::DATE64:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Date();
+ break;
+ case ArrowTypeId::TIMESTAMP:
+ RETURN_NOT_OK(
+ GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()),
+ properties, arrow_properties, &type, &logical_type));
+ break;
+ case ArrowTypeId::TIME32:
+ type = ParquetType::INT32;
+ logical_type =
+ LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MILLIS);
+ break;
+ case ArrowTypeId::TIME64: {
+ type = ParquetType::INT64;
+ auto time_type = static_cast<::arrow::Time64Type*>(field->type().get());
+ if (time_type->unit() == ::arrow::TimeUnit::NANO) {
+ logical_type =
+ LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::NANOS);
+ } else {
+ logical_type =
+ LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MICROS);
+ }
+ } break;
+ case ArrowTypeId::STRUCT: {
+ auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type());
+ return StructToNode(struct_type, name, field->nullable(), properties,
+ arrow_properties, out);
+ }
+ case ArrowTypeId::FIXED_SIZE_LIST:
+ case ArrowTypeId::LARGE_LIST:
+ case ArrowTypeId::LIST: {
+ auto list_type = std::static_pointer_cast<::arrow::BaseListType>(field->type());
+ return ListToNode(list_type, name, field->nullable(), properties, arrow_properties,
+ out);
+ }
+ case ArrowTypeId::DICTIONARY: {
+ // Parquet has no Dictionary type; dictionary encoding is handled at the
+ // encoding level, not the schema level.
+ const ::arrow::DictionaryType& dict_type =
+ static_cast<const ::arrow::DictionaryType&>(*field->type());
+ std::shared_ptr<::arrow::Field> unpacked_field = ::arrow::field(
+ name, dict_type.value_type(), field->nullable(), field->metadata());
+ return FieldToNode(name, unpacked_field, properties, arrow_properties, out);
+ }
+ case ArrowTypeId::EXTENSION: {
+ auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type());
+ std::shared_ptr<::arrow::Field> storage_field = ::arrow::field(
+ name, ext_type->storage_type(), field->nullable(), field->metadata());
+ return FieldToNode(name, storage_field, properties, arrow_properties, out);
+ }
+ case ArrowTypeId::MAP: {
+ auto map_type = std::static_pointer_cast<::arrow::MapType>(field->type());
+ return MapToNode(map_type, name, field->nullable(), properties, arrow_properties,
+ out);
+ }
+
+ default: {
+ // TODO: DENSE_UNION, SPARSE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR
+ return Status::NotImplemented(
+ "Unhandled type for Arrow to Parquet schema conversion: ",
+ field->type()->ToString());
+ }
+ }
+
+ int field_id = FieldIdFromMetadata(field->metadata());
+ PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(name, repetition, logical_type, type,
+ length, field_id));
+
+ return Status::OK();
+}
+
+struct SchemaTreeContext {
+ SchemaManifest* manifest;
+ ArrowReaderProperties properties;
+ const SchemaDescriptor* schema;
+
+ void LinkParent(const SchemaField* child, const SchemaField* parent) {
+ manifest->child_to_parent[child] = parent;
+ }
+
+ void RecordLeaf(const SchemaField* leaf) {
+ manifest->column_index_to_field[leaf->column_index] = leaf;
+ }
+};
+
+bool IsDictionaryReadSupported(const ArrowType& type) {
+ // Currently only supported for BYTE_ARRAY types
+ return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING;
+}
+
+// ----------------------------------------------------------------------
+// Schema logic
+
+::arrow::Result<std::shared_ptr<ArrowType>> GetTypeForNode(
+ int column_index, const schema::PrimitiveNode& primitive_node,
+ SchemaTreeContext* ctx) {
+ ASSIGN_OR_RAISE(
+ std::shared_ptr<ArrowType> storage_type,
+ GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit()));
+ if (ctx->properties.read_dictionary(column_index) &&
+ IsDictionaryReadSupported(*storage_type)) {
+ return ::arrow::dictionary(::arrow::int32(), storage_type);
+ }
+ return storage_type;
+}
+
+Status NodeToSchemaField(const Node& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out);
+
+Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out);
+
+Status PopulateLeaf(int column_index, const std::shared_ptr<Field>& field,
+ LevelInfo current_levels, SchemaTreeContext* ctx,
+ const SchemaField* parent, SchemaField* out) {
+ out->field = field;
+ out->column_index = column_index;
+ out->level_info = current_levels;
+ ctx->RecordLeaf(out);
+ ctx->LinkParent(out, parent);
+ return Status::OK();
+}
+
+// Special case mentioned in the format spec:
+// If the name is array or ends in _tuple, this should be a list of struct
+// even for single child elements.
+bool HasStructListName(const GroupNode& node) {
+ ::arrow::util::string_view name{node.name()};
+ return name == "array" || name.ends_with("_tuple");
+}
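+
+// For example, "array" and "prices_tuple" match, while "array_of_things"
+// does not (it neither equals "array" nor ends in "_tuple").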
+
+Status GroupToStruct(const GroupNode& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ std::vector<std::shared_ptr<Field>> arrow_fields;
+ out->children.resize(node.field_count());
+ // All level increments for the node are expected to be performed by callers.
+ // This is required because repeated elements need to have their own
+ // SchemaField.
+
+ for (int i = 0; i < node.field_count(); i++) {
+ RETURN_NOT_OK(
+ NodeToSchemaField(*node.field(i), current_levels, ctx, out, &out->children[i]));
+ arrow_fields.push_back(out->children[i].field);
+ }
+ auto struct_type = ::arrow::struct_(arrow_fields);
+ out->field = ::arrow::field(node.name(), struct_type, node.is_optional(),
+ FieldIdMetadata(node.field_id()));
+ out->level_info = current_levels;
+ return Status::OK();
+}
+
+Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out);
+
+Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ if (group.field_count() != 1) {
+ return Status::Invalid("MAP-annotated groups must have a single child.");
+ }
+ if (group.is_repeated()) {
+ return Status::Invalid("MAP-annotated groups must not be repeated.");
+ }
+
+ const Node& key_value_node = *group.field(0);
+
+ if (!key_value_node.is_repeated()) {
+ return Status::Invalid(
+ "Non-repeated key value in a MAP-annotated group are not supported.");
+ }
+
+ if (!key_value_node.is_group()) {
+ return Status::Invalid("Key-value node must be a group.");
+ }
+
+ const GroupNode& key_value = checked_cast<const GroupNode&>(key_value_node);
+ if (key_value.field_count() != 1 && key_value.field_count() != 2) {
+ return Status::Invalid("Key-value map node must have 1 or 2 child elements. Found: ",
+ key_value.field_count());
+ }
+ const Node& key_node = *key_value.field(0);
+ if (!key_node.is_required()) {
+ return Status::Invalid("Map keys must be annotated as required.");
+ }
+ // Arrow doesn't support one-column maps (i.e. sets). The options are either
+ // to make the values column nullable or to process the map as a list. We
+ // choose the latter as it is simpler.
+ if (key_value.field_count() == 1) {
+ return ListToSchemaField(group, current_levels, ctx, parent, out);
+ }
+
+ current_levels.Increment(group);
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+
+ out->children.resize(1);
+ SchemaField* key_value_field = &out->children[0];
+
+ key_value_field->children.resize(2);
+ SchemaField* key_field = &key_value_field->children[0];
+ SchemaField* value_field = &key_value_field->children[1];
+
+ ctx->LinkParent(out, parent);
+ ctx->LinkParent(key_value_field, out);
+ ctx->LinkParent(key_field, key_value_field);
+ ctx->LinkParent(value_field, key_value_field);
+
+ // required/optional group name=whatever {
+ //   repeated group name=key_value {
+ //     required TYPE key;
+ //     required/optional TYPE value;
+ //   }
+ // }
+
+ RETURN_NOT_OK(NodeToSchemaField(*key_value.field(0), current_levels, ctx,
+ key_value_field, key_field));
+ RETURN_NOT_OK(NodeToSchemaField(*key_value.field(1), current_levels, ctx,
+ key_value_field, value_field));
+
+ key_value_field->field = ::arrow::field(
+ group.name(), ::arrow::struct_({key_field->field, value_field->field}),
+ /*nullable=*/false, FieldIdMetadata(key_value.field_id()));
+ key_value_field->level_info = current_levels;
+
+ out->field = ::arrow::field(group.name(),
+ ::arrow::map(key_field->field->type(), value_field->field),
+ group.is_optional(), FieldIdMetadata(group.field_id()));
+ out->level_info = current_levels;
+ // At this point current_levels contains the def level for this list;
+ // we need to reset to the prior parent.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+}
+
+Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ if (group.field_count() != 1) {
+ return Status::Invalid("LIST-annotated groups must have a single child.");
+ }
+ if (group.is_repeated()) {
+ return Status::Invalid("LIST-annotated groups must not be repeated.");
+ }
+ current_levels.Increment(group);
+
+ out->children.resize(group.field_count());
+ SchemaField* child_field = &out->children[0];
+
+ ctx->LinkParent(out, parent);
+ ctx->LinkParent(child_field, out);
+
+ const Node& list_node = *group.field(0);
+
+ if (!list_node.is_repeated()) {
+ return Status::Invalid(
+ "Non-repeated nodes in a LIST-annotated group are not supported.");
+ }
+
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+ if (list_node.is_group()) {
+ // Resolve 3-level encoding
+ //
+ // required/optional group name=whatever {
+ // repeated group name=list {
+ // required/optional TYPE item;
+ // }
+ // }
+ //
+ // yields list<item: TYPE ?nullable> ?nullable
+ //
+ // We distinguish the special case that we have
+ //
+ // required/optional group name=whatever {
+ // repeated group name=array or $SOMETHING_tuple {
+ // required/optional TYPE item;
+ // }
+ // }
+ //
+ // In this latter case, the inner type of the list should be a struct
+ // rather than a primitive value
+ //
+ // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
+ const auto& list_group = static_cast<const GroupNode&>(list_node);
+ // Special case mentioned in the format spec:
+ // If the name is array or ends in _tuple, this should be a list of struct
+ // even for single child elements.
+ if (list_group.field_count() == 1 && !HasStructListName(list_group)) {
+ // List of primitive type
+ RETURN_NOT_OK(
+ NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field));
+ } else {
+ RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
+ }
+ } else {
+ // Two-level list encoding
+ //
+ // required/optional group LIST {
+ // repeated TYPE;
+ // }
+ const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
+ int column_index = ctx->schema->GetColumnIndex(primitive_node);
+ ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
+ GetTypeForNode(column_index, primitive_node, ctx));
+ auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
+ FieldIdMetadata(list_node.field_id()));
+ RETURN_NOT_OK(
+ PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field));
+ }
+ out->field = ::arrow::field(group.name(), ::arrow::list(child_field->field),
+ group.is_optional(), FieldIdMetadata(group.field_id()));
+ out->level_info = current_levels;
+ // At this point current_levels contains the def level for this list;
+ // we need to reset to the prior parent.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+}
+
+Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ if (node.logical_type()->is_list()) {
+ return ListToSchemaField(node, current_levels, ctx, parent, out);
+ } else if (node.logical_type()->is_map()) {
+ return MapToSchemaField(node, current_levels, ctx, parent, out);
+ }
+ std::shared_ptr<ArrowType> type;
+ if (node.is_repeated()) {
+ // Simple repeated struct
+ //
+ // repeated group $NAME {
+ // r/o TYPE[0] f0
+ // r/o TYPE[1] f1
+ // }
+ out->children.resize(1);
+
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+ RETURN_NOT_OK(GroupToStruct(node, current_levels, ctx, out, &out->children[0]));
+ out->field = ::arrow::field(node.name(), ::arrow::list(out->children[0].field),
+ /*nullable=*/false, FieldIdMetadata(node.field_id()));
+
+ ctx->LinkParent(&out->children[0], out);
+ out->level_info = current_levels;
+ // At this point current_levels contains this list as the def level; we need
+ // to use the previous ancestor of this list.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+ } else {
+ current_levels.Increment(node);
+ return GroupToStruct(node, current_levels, ctx, parent, out);
+ }
+}
+
+Status NodeToSchemaField(const Node& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ // Workhorse function for converting a Parquet schema node to an Arrow
+ // type. Handles different conventions for nested data.
+
+ ctx->LinkParent(out, parent);
+
+ // Now, walk the schema and create a ColumnDescriptor for each leaf node
+ if (node.is_group()) {
+ // A nested field, but we don't know what kind yet
+ return GroupToSchemaField(static_cast<const GroupNode&>(node), current_levels, ctx,
+ parent, out);
+ } else {
+ // Either a normal flat primitive type, or a list type encoded with 1-level
+ // list encoding. Note that the 3-level encoding is the form recommended by
+ // the parquet specification, but technically we can have either
+ //
+ // required/optional $TYPE $FIELD_NAME
+ //
+ // or
+ //
+ // repeated $TYPE $FIELD_NAME
+ const auto& primitive_node = static_cast<const PrimitiveNode&>(node);
+ int column_index = ctx->schema->GetColumnIndex(primitive_node);
+ ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
+ GetTypeForNode(column_index, primitive_node, ctx));
+ if (node.is_repeated()) {
+ // One-level list encoding, e.g.
+ // a: repeated int32;
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+ out->children.resize(1);
+ auto child_field = ::arrow::field(node.name(), type, /*nullable=*/false);
+ RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_levels, ctx, out,
+ &out->children[0]));
+
+ out->field = ::arrow::field(node.name(), ::arrow::list(child_field),
+ /*nullable=*/false, FieldIdMetadata(node.field_id()));
+ out->level_info = current_levels;
+ // At this point current_levels has considered this list the ancestor, so
+ // restore the actual ancestor.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+ } else {
+ current_levels.Increment(node);
+ // A normal (required/optional) primitive node
+ return PopulateLeaf(column_index,
+ ::arrow::field(node.name(), type, node.is_optional(),
+ FieldIdMetadata(node.field_id())),
+ current_levels, ctx, parent, out);
+ }
+ }
+}
+
+// Get the original Arrow schema, as serialized in the Parquet metadata
+Status GetOriginSchema(const std::shared_ptr<const KeyValueMetadata>& metadata,
+ std::shared_ptr<const KeyValueMetadata>* clean_metadata,
+ std::shared_ptr<::arrow::Schema>* out) {
+ if (metadata == nullptr) {
+ *out = nullptr;
+ *clean_metadata = nullptr;
+ return Status::OK();
+ }
+
+ static const std::string kArrowSchemaKey = "ARROW:schema";
+ int schema_index = metadata->FindKey(kArrowSchemaKey);
+ if (schema_index == -1) {
+ *out = nullptr;
+ *clean_metadata = metadata;
+ return Status::OK();
+ }
+
+ // The original Arrow schema was serialized using the store_schema option.
+ // We deserialize it here and use it to inform read options such as
+ // dictionary-encoded fields.
+ auto decoded = ::arrow::util::base64_decode(metadata->value(schema_index));
+ auto schema_buf = std::make_shared<Buffer>(decoded);
+
+ ::arrow::ipc::DictionaryMemo dict_memo;
+ ::arrow::io::BufferReader input(schema_buf);
+
+ ARROW_ASSIGN_OR_RAISE(*out, ::arrow::ipc::ReadSchema(&input, &dict_memo));
+
+ if (metadata->size() > 1) {
+ // Copy the metadata without the schema key
+ auto new_metadata = ::arrow::key_value_metadata({}, {});
+ new_metadata->reserve(metadata->size() - 1);
+ for (int64_t i = 0; i < metadata->size(); ++i) {
+ if (i == schema_index) continue;
+ new_metadata->Append(metadata->key(i), metadata->value(i));
+ }
+ *clean_metadata = new_metadata;
+ } else {
+ // No other keys, let metadata be null
+ *clean_metadata = nullptr;
+ }
+ return Status::OK();
+}
+
+// Restore original Arrow field information that was serialized as Parquet metadata
+// but that is not necessarily present in the field reconstituted from Parquet data
+// (for example, Parquet timestamp types don't carry timezone information).
+
+Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred);
+
+std::function<std::shared_ptr<::arrow::DataType>(FieldVector)> GetNestedFactory(
+ const ArrowType& origin_type, const ArrowType& inferred_type) {
+ switch (inferred_type.id()) {
+ case ::arrow::Type::STRUCT:
+ if (origin_type.id() == ::arrow::Type::STRUCT) {
+ return ::arrow::struct_;
+ }
+ break;
+ case ::arrow::Type::LIST:
+ if (origin_type.id() == ::arrow::Type::LIST) {
+ return [](FieldVector fields) {
+ DCHECK_EQ(fields.size(), 1);
+ return ::arrow::list(std::move(fields[0]));
+ };
+ }
+ if (origin_type.id() == ::arrow::Type::LARGE_LIST) {
+ return [](FieldVector fields) {
+ DCHECK_EQ(fields.size(), 1);
+ return ::arrow::large_list(std::move(fields[0]));
+ };
+ }
+ if (origin_type.id() == ::arrow::Type::FIXED_SIZE_LIST) {
+ const auto list_size =
+ checked_cast<const ::arrow::FixedSizeListType&>(origin_type).list_size();
+ return [list_size](FieldVector fields) {
+ DCHECK_EQ(fields.size(), 1);
+ return ::arrow::fixed_size_list(std::move(fields[0]), list_size);
+ };
+ }
+ break;
+ default:
+ break;
+ }
+ return {};
+}
+
+Result<bool> ApplyOriginalStorageMetadata(const Field& origin_field,
+ SchemaField* inferred) {
+ bool modified = false;
+
+ auto origin_type = origin_field.type();
+ auto inferred_type = inferred->field->type();
+
+ const int num_children = inferred_type->num_fields();
+
+ if (num_children > 0 && origin_type->num_fields() == num_children) {
+ DCHECK_EQ(static_cast<int>(inferred->children.size()), num_children);
+ const auto factory = GetNestedFactory(*origin_type, *inferred_type);
+ if (factory) {
+ // The type may be modified (e.g. LargeList) while the children stay the same
+ modified |= origin_type->id() != inferred_type->id();
+
+ // Apply original metadata recursively to children
+ for (int i = 0; i < inferred_type->num_fields(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ const bool child_modified,
+ ApplyOriginalMetadata(*origin_type->field(i), &inferred->children[i]));
+ modified |= child_modified;
+ }
+ if (modified) {
+ // Recreate this field using the modified child fields
+ ::arrow::FieldVector modified_children(inferred_type->num_fields());
+ for (int i = 0; i < inferred_type->num_fields(); ++i) {
+ modified_children[i] = inferred->children[i].field;
+ }
+ inferred->field =
+ inferred->field->WithType(factory(std::move(modified_children)));
+ }
+ }
+ }
+
+ if (origin_type->id() == ::arrow::Type::TIMESTAMP &&
+ inferred_type->id() == ::arrow::Type::TIMESTAMP) {
+ // Restore time zone, if any
+ const auto& ts_type = checked_cast<const ::arrow::TimestampType&>(*inferred_type);
+ const auto& ts_origin_type =
+ checked_cast<const ::arrow::TimestampType&>(*origin_type);
+
+ // If the data is tz-aware, then set the original time zone, since Parquet
+ // has no native storage for timezones
+ if (ts_type.timezone() == "UTC" && ts_origin_type.timezone() != "") {
+ if (ts_type.unit() == ts_origin_type.unit()) {
+ inferred->field = inferred->field->WithType(origin_type);
+ } else {
+ auto ts_type_new = ::arrow::timestamp(ts_type.unit(), ts_origin_type.timezone());
+ inferred->field = inferred->field->WithType(ts_type_new);
+ }
+ }
+ modified = true;
+ }
+
+ if (origin_type->id() == ::arrow::Type::DICTIONARY &&
+ inferred_type->id() != ::arrow::Type::DICTIONARY &&
+ IsDictionaryReadSupported(*inferred_type)) {
+ // Direct dictionary reads are currently only supported for a couple of
+ // primitive types, so there is no need to recurse on value types.
+ const auto& dict_origin_type =
+ checked_cast<const ::arrow::DictionaryType&>(*origin_type);
+ inferred->field = inferred->field->WithType(
+ ::arrow::dictionary(::arrow::int32(), inferred_type, dict_origin_type.ordered()));
+ modified = true;
+ }
+
+ if ((origin_type->id() == ::arrow::Type::LARGE_BINARY &&
+ inferred_type->id() == ::arrow::Type::BINARY) ||
+ (origin_type->id() == ::arrow::Type::LARGE_STRING &&
+ inferred_type->id() == ::arrow::Type::STRING)) {
+ // Read back binary-like arrays with the intended offset width.
+ inferred->field = inferred->field->WithType(origin_type);
+ modified = true;
+ }
+
+ if (origin_type->id() == ::arrow::Type::DECIMAL256 &&
+ inferred_type->id() == ::arrow::Type::DECIMAL128) {
+ inferred->field = inferred->field->WithType(origin_type);
+ modified = true;
+ }
+
+ // Restore field metadata
+ std::shared_ptr<const KeyValueMetadata> field_metadata = origin_field.metadata();
+ if (field_metadata != nullptr) {
+ if (inferred->field->metadata()) {
+ // Prefer the metadata keys (like field_id) from the current metadata
+ field_metadata = field_metadata->Merge(*inferred->field->metadata());
+ }
+ inferred->field = inferred->field->WithMetadata(field_metadata);
+ modified = true;
+ }
+
+ return modified;
+}
+
+Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred) {
+ bool modified = false;
+
+ auto origin_type = origin_field.type();
+ auto inferred_type = inferred->field->type();
+
+ if (origin_type->id() == ::arrow::Type::EXTENSION) {
+ const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*origin_type);
+ auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
+
+ // Apply metadata recursively to storage type
+ RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred));
+
+ // Restore extension type, if the storage type is the same as inferred
+ // from the Parquet type
+ if (ex_type.storage_type()->Equals(*inferred->field->type())) {
+ inferred->field = inferred->field->WithType(origin_type);
+ }
+ modified = true;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(modified, ApplyOriginalStorageMetadata(origin_field, inferred));
+ }
+
+ return modified;
+}
+
+} // namespace
+
+Status FieldToNode(const std::shared_ptr<Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ return FieldToNode(field->name(), field, properties, arrow_properties, out);
+}
+
+Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ std::shared_ptr<SchemaDescriptor>* out) {
+ std::vector<NodePtr> nodes(arrow_schema->num_fields());
+ for (int i = 0; i < arrow_schema->num_fields(); i++) {
+ RETURN_NOT_OK(
+ FieldToNode(arrow_schema->field(i), properties, arrow_properties, &nodes[i]));
+ }
+
+ NodePtr schema = GroupNode::Make("schema", Repetition::REQUIRED, nodes);
+ *out = std::make_shared<::parquet::SchemaDescriptor>();
+ PARQUET_CATCH_NOT_OK((*out)->Init(schema));
+
+ return Status::OK();
+}
+
+Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ std::shared_ptr<SchemaDescriptor>* out) {
+ return ToParquetSchema(arrow_schema, properties, *default_arrow_writer_properties(),
+ out);
+}
+
+Status FromParquetSchema(
+ const SchemaDescriptor* schema, const ArrowReaderProperties& properties,
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata,
+ std::shared_ptr<::arrow::Schema>* out) {
+ SchemaManifest manifest;
+ RETURN_NOT_OK(SchemaManifest::Make(schema, key_value_metadata, properties, &manifest));
+ std::vector<std::shared_ptr<Field>> fields(manifest.schema_fields.size());
+
+ for (int i = 0; i < static_cast<int>(fields.size()); i++) {
+ const auto& schema_field = manifest.schema_fields[i];
+ fields[i] = schema_field.field;
+ }
+ if (manifest.origin_schema) {
+ // ARROW-8980: If the ARROW:schema was in the input metadata, then
+ // manifest.origin_schema will have it scrubbed out
+ *out = ::arrow::schema(fields, manifest.origin_schema->metadata());
+ } else {
+ *out = ::arrow::schema(fields, key_value_metadata);
+ }
+ return Status::OK();
+}
+
+Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ const ArrowReaderProperties& properties,
+ std::shared_ptr<::arrow::Schema>* out) {
+ return FromParquetSchema(parquet_schema, properties, nullptr, out);
+}
+
+Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ std::shared_ptr<::arrow::Schema>* out) {
+ ArrowReaderProperties properties;
+ return FromParquetSchema(parquet_schema, properties, nullptr, out);
+}
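+
+// Illustrative round-trip sketch (hypothetical usage, not part of this file's
+// API): convert an Arrow schema to a Parquet schema and back; `arrow_schema`
+// is an assumed std::shared_ptr<::arrow::Schema>.
+//
+//   std::shared_ptr<SchemaDescriptor> parquet_schema;
+//   RETURN_NOT_OK(ToParquetSchema(arrow_schema.get(),
+//                                 *default_writer_properties(),
+//                                 &parquet_schema));
+//   std::shared_ptr<::arrow::Schema> roundtripped;
+//   RETURN_NOT_OK(FromParquetSchema(parquet_schema.get(), &roundtripped));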
+
+Status SchemaManifest::Make(const SchemaDescriptor* schema,
+ const std::shared_ptr<const KeyValueMetadata>& metadata,
+ const ArrowReaderProperties& properties,
+ SchemaManifest* manifest) {
+ SchemaTreeContext ctx;
+ ctx.manifest = manifest;
+ ctx.properties = properties;
+ ctx.schema = schema;
+ const GroupNode& schema_node = *schema->group_node();
+ manifest->descr = schema;
+ manifest->schema_fields.resize(schema_node.field_count());
+
+ // Try to deserialize original Arrow schema
+ RETURN_NOT_OK(
+ GetOriginSchema(metadata, &manifest->schema_metadata, &manifest->origin_schema));
+ // Ignore original schema if it's not compatible with the Parquet schema
+ if (manifest->origin_schema != nullptr &&
+ manifest->origin_schema->num_fields() != schema_node.field_count()) {
+ manifest->origin_schema = nullptr;
+ }
+
+ for (int i = 0; i < static_cast<int>(schema_node.field_count()); ++i) {
+ SchemaField* out_field = &manifest->schema_fields[i];
+ RETURN_NOT_OK(NodeToSchemaField(*schema_node.field(i), LevelInfo(), &ctx,
+ /*parent=*/nullptr, out_field));
+
+ // TODO(wesm): as follow up to ARROW-3246, we should really pass the origin
+ // schema (if any) through all functions in the schema reconstruction, but
+ // I'm being lazy and just setting dictionary fields at the top level for
+ // now
+ if (manifest->origin_schema == nullptr) {
+ continue;
+ }
+
+ auto origin_field = manifest->origin_schema->field(i);
+ RETURN_NOT_OK(ApplyOriginalMetadata(*origin_field, out_field));
+ }
+ return Status::OK();
+}
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h
new file mode 100644
index 00000000000..dd60fde4342
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h
@@ -0,0 +1,184 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+
+#include "parquet/level_conversion.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+class ArrowReaderProperties;
+class ArrowWriterProperties;
+class WriterProperties;
+
+namespace arrow {
+
+/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
+/// schema into a Parquet schema.
+///
+/// @{
+
+PARQUET_EXPORT
+::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ schema::NodePtr* out);
+
+PARQUET_EXPORT
+::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ std::shared_ptr<SchemaDescriptor>* out);
+
+PARQUET_EXPORT
+::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ std::shared_ptr<SchemaDescriptor>* out);
+
+/// @}
+
+/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
+/// schema into an Arrow schema.
+///
+/// @{
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(
+ const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
+ std::shared_ptr<::arrow::Schema>* out);
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ const ArrowReaderProperties& properties,
+ std::shared_ptr<::arrow::Schema>* out);
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ std::shared_ptr<::arrow::Schema>* out);
+
+/// @}
+
+/// \brief Bridge between an arrow::Field and parquet column indices.
+struct PARQUET_EXPORT SchemaField {
+ std::shared_ptr<::arrow::Field> field;
+ std::vector<SchemaField> children;
+
+ // Only set for leaf nodes
+ int column_index = -1;
+
+ parquet::internal::LevelInfo level_info;
+
+ bool is_leaf() const { return column_index != -1; }
+};
+
+/// \brief Bridge between a parquet Schema and an arrow Schema.
+///
+/// Exposes Parquet columns as a tree structure. Useful for traversing and
+/// linking between Arrow's Schema and Parquet's Schema.
+struct PARQUET_EXPORT SchemaManifest {
+ static ::arrow::Status Make(
+ const SchemaDescriptor* schema,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
+ const ArrowReaderProperties& properties, SchemaManifest* manifest);
+
+ const SchemaDescriptor* descr;
+ std::shared_ptr<::arrow::Schema> origin_schema;
+ std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
+ std::vector<SchemaField> schema_fields;
+
+ std::unordered_map<int, const SchemaField*> column_index_to_field;
+ std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
+
+ ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
+ auto it = column_index_to_field.find(column_index);
+ if (it == column_index_to_field.end()) {
+ return ::arrow::Status::KeyError("Column index ", column_index,
+ " not found in schema manifest, may be malformed");
+ }
+ *out = it->second;
+ return ::arrow::Status::OK();
+ }
+
+ const SchemaField* GetParent(const SchemaField* field) const {
+ // Also returns nullptr if the field is not found
+ auto it = child_to_parent.find(field);
+ if (it == child_to_parent.end()) {
+ return NULLPTR;
+ }
+ return it->second;
+ }
+
+ /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
+ /// correspond to the column root (first node below the parquet schema's root group) of
+ /// each leaf referenced in column_indices.
+ ///
+ /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
+ /// the roots are `a` and `i` (return=[0,2]).
+ ///
+ /// root
+ /// -- a <------
+ /// -- -- b | |
+ /// -- -- -- c |
+ /// -- -- -- d |
+ /// -- -- -- -- e
+ /// -- f
+ /// -- -- g
+ /// -- -- -- h
+ /// -- i <---
+ /// -- -- j |
+ /// -- -- -- k
+ ::arrow::Result<std::vector<int>> GetFieldIndices(
+ const std::vector<int>& column_indices) const {
+ const schema::GroupNode* group = descr->group_node();
+ std::unordered_set<int> already_added;
+
+ std::vector<int> out;
+ for (int column_idx : column_indices) {
+ if (column_idx < 0 || column_idx >= descr->num_columns()) {
+ return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
+ }
+
+ auto field_node = descr->GetColumnRoot(column_idx);
+ auto field_idx = group->FieldIndex(*field_node);
+ if (field_idx == -1) {
+ return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
+ }
+
+ if (already_added.insert(field_idx).second) {
+ out.push_back(field_idx);
+ }
+ }
+ return out;
+ }
+};
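+
+// Illustrative usage sketch (hypothetical): build a manifest from an
+// already-opened ParquetFileReader named `reader`, then resolve a leaf
+// column back to its Arrow field.
+//
+//   SchemaManifest manifest;
+//   RETURN_NOT_OK(SchemaManifest::Make(reader->metadata()->schema(),
+//                                      reader->metadata()->key_value_metadata(),
+//                                      ArrowReaderProperties(), &manifest));
+//   const SchemaField* field;
+//   RETURN_NOT_OK(manifest.GetColumnField(/*column_index=*/0, &field));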
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc
new file mode 100644
index 00000000000..064bf4f55cc
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc
@@ -0,0 +1,222 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/schema_internal.h"
+
+#include "arrow/type.h"
+
+using ArrowType = ::arrow::DataType;
+using ArrowTypeId = ::arrow::Type;
+using ParquetType = parquet::Type;
+
+namespace parquet {
+
+namespace arrow {
+
+using ::arrow::Result;
+using ::arrow::Status;
+using ::arrow::internal::checked_cast;
+
+Result<std::shared_ptr<ArrowType>> MakeArrowDecimal(const LogicalType& logical_type) {
+ const auto& decimal = checked_cast<const DecimalLogicalType&>(logical_type);
+ if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) {
+ return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale());
+ }
+ return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale());
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowInt(const LogicalType& logical_type) {
+ const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
+ switch (integer.bit_width()) {
+ case 8:
+ return integer.is_signed() ? ::arrow::int8() : ::arrow::uint8();
+ case 16:
+ return integer.is_signed() ? ::arrow::int16() : ::arrow::uint16();
+ case 32:
+ return integer.is_signed() ? ::arrow::int32() : ::arrow::uint32();
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Int32");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowInt64(const LogicalType& logical_type) {
+ const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
+ switch (integer.bit_width()) {
+ case 64:
+ return integer.is_signed() ? ::arrow::int64() : ::arrow::uint64();
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Int64");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowTime32(const LogicalType& logical_type) {
+ const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
+ switch (time.time_unit()) {
+ case LogicalType::TimeUnit::MILLIS:
+ return ::arrow::time32(::arrow::TimeUnit::MILLI);
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Time32");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowTime64(const LogicalType& logical_type) {
+ const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
+ switch (time.time_unit()) {
+ case LogicalType::TimeUnit::MICROS:
+ return ::arrow::time64(::arrow::TimeUnit::MICRO);
+ case LogicalType::TimeUnit::NANOS:
+ return ::arrow::time64(::arrow::TimeUnit::NANO);
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Time64");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical_type) {
+ const auto& timestamp = checked_cast<const TimestampLogicalType&>(logical_type);
+ const bool utc_normalized =
+ timestamp.is_from_converted_type() ? false : timestamp.is_adjusted_to_utc();
+ static const char* utc_timezone = "UTC";
+ switch (timestamp.time_unit()) {
+ case LogicalType::TimeUnit::MILLIS:
+ return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc_timezone)
+ : ::arrow::timestamp(::arrow::TimeUnit::MILLI));
+ case LogicalType::TimeUnit::MICROS:
+ return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc_timezone)
+ : ::arrow::timestamp(::arrow::TimeUnit::MICRO));
+ case LogicalType::TimeUnit::NANOS:
+ return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc_timezone)
+ : ::arrow::timestamp(::arrow::TimeUnit::NANO));
+ default:
+ return Status::TypeError("Unrecognized time unit in timestamp logical_type: ",
+ logical_type.ToString());
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::STRING:
+ return ::arrow::utf8();
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::NONE:
+ case LogicalType::Type::ENUM:
+ case LogicalType::Type::JSON:
+ case LogicalType::Type::BSON:
+ return ::arrow::binary();
+ default:
+ return Status::NotImplemented("Unhandled logical logical_type ",
+ logical_type.ToString(), " for binary array");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
+ int32_t physical_length) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::NONE:
+ case LogicalType::Type::INTERVAL:
+ case LogicalType::Type::UUID:
+ return ::arrow::fixed_size_binary(physical_length);
+ default:
+ return Status::NotImplemented("Unhandled logical logical_type ",
+ logical_type.ToString(),
+ " for fixed-length binary array");
+ }
+}
+
+::arrow::Result<std::shared_ptr<ArrowType>> FromInt32(const LogicalType& logical_type) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeArrowInt(logical_type);
+ case LogicalType::Type::DATE:
+ return ::arrow::date32();
+ case LogicalType::Type::TIME:
+ return MakeArrowTime32(logical_type);
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::NONE:
+ return ::arrow::int32();
+ default:
+ return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
+ " for INT32");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeArrowInt64(logical_type);
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::TIMESTAMP:
+ return MakeArrowTimestamp(logical_type);
+ case LogicalType::Type::TIME:
+ return MakeArrowTime64(logical_type);
+ case LogicalType::Type::NONE:
+ return ::arrow::int64();
+ default:
+ return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
+ " for INT64");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> GetArrowType(
+ Type::type physical_type, const LogicalType& logical_type, int type_length,
+ const ::arrow::TimeUnit::type int96_arrow_time_unit) {
+ if (logical_type.is_invalid() || logical_type.is_null()) {
+ return ::arrow::null();
+ }
+
+ switch (physical_type) {
+ case ParquetType::BOOLEAN:
+ return ::arrow::boolean();
+ case ParquetType::INT32:
+ return FromInt32(logical_type);
+ case ParquetType::INT64:
+ return FromInt64(logical_type);
+ case ParquetType::INT96:
+ return ::arrow::timestamp(int96_arrow_time_unit);
+ case ParquetType::FLOAT:
+ return ::arrow::float32();
+ case ParquetType::DOUBLE:
+ return ::arrow::float64();
+ case ParquetType::BYTE_ARRAY:
+ return FromByteArray(logical_type);
+ case ParquetType::FIXED_LEN_BYTE_ARRAY:
+ return FromFLBA(logical_type, type_length);
+ default: {
+ // PARQUET-1565: This can occur if the file is corrupt
+ return Status::IOError("Invalid physical column type: ",
+ TypeToString(physical_type));
+ }
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> GetArrowType(
+ const schema::PrimitiveNode& primitive,
+ const ::arrow::TimeUnit::type int96_arrow_time_unit) {
+ return GetArrowType(primitive.physical_type(), *primitive.logical_type(),
+ primitive.type_length(), int96_arrow_time_unit);
+}
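+
+// Illustrative example: an INT64 primitive node annotated with
+// LogicalType::Timestamp(/*is_adjusted_to_utc=*/true,
+// LogicalType::TimeUnit::MICROS) resolves to
+// ::arrow::timestamp(::arrow::TimeUnit::MICRO, "UTC"), while an unannotated
+// INT96 node resolves to ::arrow::timestamp(int96_arrow_time_unit).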
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h
new file mode 100644
index 00000000000..fb837c3ee6c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/result.h"
+#include "parquet/schema.h"
+
+namespace arrow {
+class DataType;
+}
+
+namespace parquet {
+namespace arrow {
+
+using ::arrow::Result;
+
+Result<std::shared_ptr<::arrow::DataType>> FromByteArray(const LogicalType& logical_type);
+Result<std::shared_ptr<::arrow::DataType>> FromFLBA(const LogicalType& logical_type,
+ int32_t physical_length);
+Result<std::shared_ptr<::arrow::DataType>> FromInt32(const LogicalType& logical_type);
+Result<std::shared_ptr<::arrow::DataType>> FromInt64(const LogicalType& logical_type);
+
+Result<std::shared_ptr<::arrow::DataType>> GetArrowType(Type::type physical_type,
+ const LogicalType& logical_type,
+ int type_length);
+
+Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
+ Type::type physical_type, const LogicalType& logical_type, int type_length,
+ ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
+
+Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
+ const schema::PrimitiveNode& primitive,
+ ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc
new file mode 100644
index 00000000000..2fbebf27fce
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc
@@ -0,0 +1,482 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/writer.h"
+
+#include <algorithm>
+#include <deque>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/extension_type.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/util/base64.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+#include "parquet/arrow/path_internal.h"
+#include "parquet/arrow/reader_internal.h"
+#include "parquet/arrow/schema.h"
+#include "parquet/column_writer.h"
+#include "parquet/exception.h"
+#include "parquet/file_writer.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+using arrow::Array;
+using arrow::BinaryArray;
+using arrow::BooleanArray;
+using arrow::ChunkedArray;
+using arrow::DataType;
+using arrow::DictionaryArray;
+using arrow::ExtensionArray;
+using arrow::ExtensionType;
+using arrow::Field;
+using arrow::FixedSizeBinaryArray;
+using arrow::ListArray;
+using arrow::MemoryPool;
+using arrow::NumericArray;
+using arrow::PrimitiveArray;
+using arrow::ResizableBuffer;
+using arrow::Status;
+using arrow::Table;
+using arrow::TimeUnit;
+
+using arrow::internal::checked_cast;
+
+using parquet::ParquetFileWriter;
+using parquet::ParquetVersion;
+using parquet::schema::GroupNode;
+
+namespace parquet {
+namespace arrow {
+
+namespace {
+
+int CalculateLeafCount(const DataType* type) {
+ if (type->id() == ::arrow::Type::EXTENSION) {
+ type = checked_cast<const ExtensionType&>(*type).storage_type().get();
+ }
+ // Note num_fields() can be 0 for an empty struct type
+ if (!::arrow::is_nested(type->id())) {
+ // Primitive type.
+ return 1;
+ }
+
+ int num_leaves = 0;
+ for (const auto& field : type->fields()) {
+ num_leaves += CalculateLeafCount(field->type().get());
+ }
+ return num_leaves;
+}
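+
+// For example (illustrative): struct<a: int32, b: list<struct<c: utf8, d: float64>>>
+// has three leaf columns (a, c and d), so CalculateLeafCount returns 3.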
+
+// Determines if the |schema_field|'s root ancestor is nullable.
+bool HasNullableRoot(const SchemaManifest& schema_manifest,
+ const SchemaField* schema_field) {
+ DCHECK(schema_field != nullptr);
+ const SchemaField* current_field = schema_field;
+ bool nullable = schema_field->field->nullable();
+ while (current_field != nullptr) {
+ nullable = current_field->field->nullable();
+ current_field = schema_manifest.GetParent(current_field);
+ }
+ return nullable;
+}
+
+// Manages writing nested parquet columns with support for all nested types
+// supported by parquet.
+class ArrowColumnWriterV2 {
+ public:
+ // Constructs a new object (use the Make() method below to construct from
+ // a ChunkedArray).
+ // level_builders should contain one MultipathLevelBuilder per chunk of the
+ // Arrow column to write.
+ ArrowColumnWriterV2(std::vector<std::unique_ptr<MultipathLevelBuilder>> level_builders,
+ int leaf_count, RowGroupWriter* row_group_writer)
+ : level_builders_(std::move(level_builders)),
+ leaf_count_(leaf_count),
+ row_group_writer_(row_group_writer) {}
+
+ // Writes out all leaf parquet columns to the RowGroupWriter that this
+ // object was constructed with. Each leaf column is written fully before
+ // the next column is written (i.e. no buffering is assumed).
+ //
+ // Columns are written in DFS order.
+ Status Write(ArrowWriteContext* ctx) {
+ for (int leaf_idx = 0; leaf_idx < leaf_count_; leaf_idx++) {
+ ColumnWriter* column_writer;
+ PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn());
+ for (auto& level_builder : level_builders_) {
+ RETURN_NOT_OK(level_builder->Write(
+ leaf_idx, ctx, [&](const MultipathLevelBuilderResult& result) {
+ size_t visited_component_size = result.post_list_visited_elements.size();
+ DCHECK_GT(visited_component_size, 0);
+ if (visited_component_size != 1) {
+ return Status::NotImplemented(
+ "Lists with non-zero length null components are not supported");
+ }
+ const ElementRange& range = result.post_list_visited_elements[0];
+ std::shared_ptr<Array> values_array =
+ result.leaf_array->Slice(range.start, range.Size());
+
+ return column_writer->WriteArrow(result.def_levels, result.rep_levels,
+ result.def_rep_level_count, *values_array,
+ ctx, result.leaf_is_nullable);
+ }));
+ }
+
+ PARQUET_CATCH_NOT_OK(column_writer->Close());
+ }
+ return Status::OK();
+ }
+
+ // Make a new object by converting each chunk in |data| to a MultipathLevelBuilder.
+ //
+ // A new builder is needed per array because the MultipathLevelBuilder
+ // extracts the data necessary for writing each leaf column at construction
+ // time (it optimizes based on null count), and slicing via |offset| creates
+ // ephemeral chunks that need to be tracked across each leaf-column write.
+ // This decision could potentially be revisited if we wanted to use "buffered"
+ // RowGroupWriters (we could construct each builder on demand in that case).
+ static ::arrow::Result<std::unique_ptr<ArrowColumnWriterV2>> Make(
+ const ChunkedArray& data, int64_t offset, const int64_t size,
+ const SchemaManifest& schema_manifest, RowGroupWriter* row_group_writer) {
+ int64_t absolute_position = 0;
+ int chunk_index = 0;
+ int64_t chunk_offset = 0;
+ if (data.length() == 0) {
+ return ::arrow::internal::make_unique<ArrowColumnWriterV2>(
+ std::vector<std::unique_ptr<MultipathLevelBuilder>>{},
+ CalculateLeafCount(data.type().get()), row_group_writer);
+ }
+ while (chunk_index < data.num_chunks() && absolute_position < offset) {
+ const int64_t chunk_length = data.chunk(chunk_index)->length();
+ if (absolute_position + chunk_length > offset) {
+ // Relative offset into the chunk to reach the desired start offset for
+ // writing
+ chunk_offset = offset - absolute_position;
+ break;
+ } else {
+ ++chunk_index;
+ absolute_position += chunk_length;
+ }
+ }
+
+ if (absolute_position >= data.length()) {
+ return Status::Invalid("Cannot write data at offset past end of chunked array");
+ }
+
+ int64_t values_written = 0;
+ std::vector<std::unique_ptr<MultipathLevelBuilder>> builders;
+ const int leaf_count = CalculateLeafCount(data.type().get());
+ bool is_nullable = false;
+ // The row_group_writer hasn't been advanced yet, so add 1 to the current
+ // column index; that is the column this instance will start writing.
+ int column_index = row_group_writer->current_column() + 1;
+ for (int leaf_offset = 0; leaf_offset < leaf_count; ++leaf_offset) {
+ const SchemaField* schema_field = nullptr;
+ RETURN_NOT_OK(
+ schema_manifest.GetColumnField(column_index + leaf_offset, &schema_field));
+ bool nullable_root = HasNullableRoot(schema_manifest, schema_field);
+ if (leaf_offset == 0) {
+ is_nullable = nullable_root;
+ }
+
+// Only validate common ancestry for all leaves in debug builds; skip the
+// check entirely when NDEBUG is defined.
+#ifdef NDEBUG
+ break;
+#else
+ if (is_nullable != nullable_root) {
+ return Status::UnknownError(
+ "Unexpected mismatched nullability between column index ",
+ column_index + leaf_offset, " and ", column_index);
+ }
+#endif
+ }
+ while (values_written < size) {
+ const Array& chunk = *data.chunk(chunk_index);
+ const int64_t available_values = chunk.length() - chunk_offset;
+ const int64_t chunk_write_size = std::min(size - values_written, available_values);
+
+ // The chunk offset here will be 0 except for possibly the first chunk
+ // because of the advancing logic above
+ std::shared_ptr<Array> array_to_write = chunk.Slice(chunk_offset, chunk_write_size);
+
+ if (array_to_write->length() > 0) {
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<MultipathLevelBuilder> builder,
+ MultipathLevelBuilder::Make(*array_to_write, is_nullable));
+ if (leaf_count != builder->GetLeafCount()) {
+ return Status::UnknownError("data type leaf_count != builder_leaf_count",
+ leaf_count, " ", builder->GetLeafCount());
+ }
+ builders.emplace_back(std::move(builder));
+ }
+
+ if (chunk_write_size == available_values) {
+ chunk_offset = 0;
+ ++chunk_index;
+ }
+ values_written += chunk_write_size;
+ }
+ return ::arrow::internal::make_unique<ArrowColumnWriterV2>(
+ std::move(builders), leaf_count, row_group_writer);
+ }
+
+ private:
+ // One builder per column-chunk.
+ std::vector<std::unique_ptr<MultipathLevelBuilder>> level_builders_;
+ int leaf_count_;
+ RowGroupWriter* row_group_writer_;
+};
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// FileWriter implementation
+
+class FileWriterImpl : public FileWriter {
+ public:
+ FileWriterImpl(std::shared_ptr<::arrow::Schema> schema, MemoryPool* pool,
+ std::unique_ptr<ParquetFileWriter> writer,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties)
+ : schema_(std::move(schema)),
+ writer_(std::move(writer)),
+ row_group_writer_(nullptr),
+ column_write_context_(pool, arrow_properties.get()),
+ arrow_properties_(std::move(arrow_properties)),
+ closed_(false) {}
+
+ Status Init() {
+ return SchemaManifest::Make(writer_->schema(), /*schema_metadata=*/nullptr,
+ default_arrow_reader_properties(), &schema_manifest_);
+ }
+
+ Status NewRowGroup(int64_t chunk_size) override {
+ if (row_group_writer_ != nullptr) {
+ PARQUET_CATCH_NOT_OK(row_group_writer_->Close());
+ }
+ PARQUET_CATCH_NOT_OK(row_group_writer_ = writer_->AppendRowGroup());
+ return Status::OK();
+ }
+
+ Status Close() override {
+ if (!closed_) {
+ // Make idempotent
+ closed_ = true;
+ if (row_group_writer_ != nullptr) {
+ PARQUET_CATCH_NOT_OK(row_group_writer_->Close());
+ }
+ PARQUET_CATCH_NOT_OK(writer_->Close());
+ }
+ return Status::OK();
+ }
+
+ Status WriteColumnChunk(const Array& data) override {
+ // A bit awkward here, since we cannot instantiate a ChunkedArray from a const Array&.
+ auto chunk = ::arrow::MakeArray(data.data());
+ auto chunked_array = std::make_shared<::arrow::ChunkedArray>(chunk);
+ return WriteColumnChunk(chunked_array, 0, data.length());
+ }
+
+ Status WriteColumnChunk(const std::shared_ptr<ChunkedArray>& data, int64_t offset,
+ int64_t size) override {
+ if (arrow_properties_->engine_version() == ArrowWriterProperties::V2 ||
+ arrow_properties_->engine_version() == ArrowWriterProperties::V1) {
+ ARROW_ASSIGN_OR_RAISE(
+ std::unique_ptr<ArrowColumnWriterV2> writer,
+ ArrowColumnWriterV2::Make(*data, offset, size, schema_manifest_,
+ row_group_writer_));
+ return writer->Write(&column_write_context_);
+ }
+ return Status::NotImplemented("Unknown engine version.");
+ }
+
+ Status WriteColumnChunk(const std::shared_ptr<::arrow::ChunkedArray>& data) override {
+ return WriteColumnChunk(data, 0, data->length());
+ }
+
+ std::shared_ptr<::arrow::Schema> schema() const override { return schema_; }
+
+ Status WriteTable(const Table& table, int64_t chunk_size) override {
+ RETURN_NOT_OK(table.Validate());
+
+ if (chunk_size <= 0 && table.num_rows() > 0) {
+ return Status::Invalid("chunk size per row_group must be greater than 0");
+ } else if (!table.schema()->Equals(*schema_, false)) {
+ return Status::Invalid("table schema does not match this writer's. table:'",
+ table.schema()->ToString(), "' this:'", schema_->ToString(),
+ "'");
+ } else if (chunk_size > this->properties().max_row_group_length()) {
+ chunk_size = this->properties().max_row_group_length();
+ }
+
+ auto WriteRowGroup = [&](int64_t offset, int64_t size) {
+ RETURN_NOT_OK(NewRowGroup(size));
+ for (int i = 0; i < table.num_columns(); i++) {
+ RETURN_NOT_OK(WriteColumnChunk(table.column(i), offset, size));
+ }
+ return Status::OK();
+ };
+
+ if (table.num_rows() == 0) {
+ // Append a row group with 0 rows
+ RETURN_NOT_OK_ELSE(WriteRowGroup(0, 0), PARQUET_IGNORE_NOT_OK(Close()));
+ return Status::OK();
+ }
+
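+ // Each iteration below writes one row group of at most chunk_size rows.
+ // For example (illustrative), a 10-row table with chunk_size == 4 yields
+ // row groups covering rows [0, 4), [4, 8) and [8, 10).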
+ for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) {
+ int64_t offset = chunk * chunk_size;
+ RETURN_NOT_OK_ELSE(
+ WriteRowGroup(offset, std::min(chunk_size, table.num_rows() - offset)),
+ PARQUET_IGNORE_NOT_OK(Close()));
+ }
+ return Status::OK();
+ }
+
+ const WriterProperties& properties() const { return *writer_->properties(); }
+
+ ::arrow::MemoryPool* memory_pool() const override {
+ return column_write_context_.memory_pool;
+ }
+
+ const std::shared_ptr<FileMetaData> metadata() const override {
+ return writer_->metadata();
+ }
+
+ private:
+ friend class FileWriter;
+
+ std::shared_ptr<::arrow::Schema> schema_;
+
+ SchemaManifest schema_manifest_;
+
+ std::unique_ptr<ParquetFileWriter> writer_;
+ RowGroupWriter* row_group_writer_;
+ ArrowWriteContext column_write_context_;
+ std::shared_ptr<ArrowWriterProperties> arrow_properties_;
+ bool closed_;
+};
+
+FileWriter::~FileWriter() {}
+
+Status FileWriter::Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileWriter> writer,
+ std::shared_ptr<::arrow::Schema> schema,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* out) {
+ std::unique_ptr<FileWriterImpl> impl(new FileWriterImpl(
+ std::move(schema), pool, std::move(writer), std::move(arrow_properties)));
+ RETURN_NOT_OK(impl->Init());
+ *out = std::move(impl);
+ return Status::OK();
+}
+
+Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::unique_ptr<FileWriter>* writer) {
+ return Open(std::move(schema), pool, std::move(sink), std::move(properties),
+ default_arrow_writer_properties(), writer);
+}
+
+Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+ const ArrowWriterProperties& properties,
+ std::shared_ptr<const KeyValueMetadata>* out) {
+ if (!properties.store_schema()) {
+ *out = nullptr;
+ return Status::OK();
+ }
+
+ static const std::string kArrowSchemaKey = "ARROW:schema";
+ std::shared_ptr<KeyValueMetadata> result;
+ if (schema.metadata()) {
+ result = schema.metadata()->Copy();
+ } else {
+ result = ::arrow::key_value_metadata({}, {});
+ }
+
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> serialized,
+ ::arrow::ipc::SerializeSchema(schema, pool));
+
+ // The serialized schema is not UTF-8, but Thrift requires UTF-8, so base64-encode it.
+ std::string schema_as_string = serialized->ToString();
+ std::string schema_base64 = ::arrow::util::base64_encode(
+ reinterpret_cast<const unsigned char*>(schema_as_string.data()),
+ static_cast<unsigned int>(schema_as_string.size()));
+ result->Append(kArrowSchemaKey, schema_base64);
+ *out = result;
+ return Status::OK();
+}
+
+Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* writer) {
+ std::shared_ptr<SchemaDescriptor> parquet_schema;
+ RETURN_NOT_OK(
+ ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema));
+
+ auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());
+
+ std::shared_ptr<const KeyValueMetadata> metadata;
+ RETURN_NOT_OK(GetSchemaMetadata(schema, pool, *arrow_properties, &metadata));
+
+ std::unique_ptr<ParquetFileWriter> base_writer;
+ PARQUET_CATCH_NOT_OK(base_writer = ParquetFileWriter::Open(std::move(sink), schema_node,
+ std::move(properties),
+ std::move(metadata)));
+
+ auto schema_ptr = std::make_shared<::arrow::Schema>(schema);
+ return Make(pool, std::move(base_writer), std::move(schema_ptr),
+ std::move(arrow_properties), writer);
+}
+
+Status WriteFileMetaData(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink) {
+ PARQUET_CATCH_NOT_OK(::parquet::WriteFileMetaData(file_metadata, sink));
+ return Status::OK();
+}
+
+Status WriteMetaDataFile(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink) {
+ PARQUET_CATCH_NOT_OK(::parquet::WriteMetaDataFile(file_metadata, sink));
+ return Status::OK();
+}
+
+Status WriteTable(const ::arrow::Table& table, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties) {
+ std::unique_ptr<FileWriter> writer;
+ RETURN_NOT_OK(FileWriter::Open(*table.schema(), pool, std::move(sink),
+ std::move(properties), std::move(arrow_properties),
+ &writer));
+ RETURN_NOT_OK(writer->WriteTable(table, chunk_size));
+ return writer->Close();
+}
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h
new file mode 100644
index 00000000000..f31f3d03def
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class Schema;
+class Table;
+
+} // namespace arrow
+
+namespace parquet {
+
+class FileMetaData;
+class ParquetFileWriter;
+
+namespace arrow {
+
+/// \brief Iterative FileWriter class
+///
+/// Start a new RowGroup or Chunk with NewRowGroup,
+/// then write each whole column chunk, column by column.
+///
+/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
+/// value is a nonnegative integer, then it will be used as the field_id in the parquet
+/// file.
+class PARQUET_EXPORT FileWriter {
+ public:
+ static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
+ std::shared_ptr<::arrow::Schema> schema,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* out);
+
+ static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::unique_ptr<FileWriter>* writer);
+
+ static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* writer);
+
+ virtual std::shared_ptr<::arrow::Schema> schema() const = 0;
+
+ /// \brief Write a Table to Parquet.
+ virtual ::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size) = 0;
+
+ virtual ::arrow::Status NewRowGroup(int64_t chunk_size) = 0;
+ virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;
+
+ /// \brief Write a ColumnChunk into the current row group using a slice of a ChunkedArray
+ virtual ::arrow::Status WriteColumnChunk(
+ const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
+ int64_t size) = 0;
+
+ virtual ::arrow::Status WriteColumnChunk(
+ const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;
+ virtual ::arrow::Status Close() = 0;
+ virtual ~FileWriter();
+
+ virtual MemoryPool* memory_pool() const = 0;
+ virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
+};
+
+/// \brief Write only the Parquet file metadata to the indicated Arrow OutputStream
+PARQUET_EXPORT
+::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+/// \brief Write a metadata-only Parquet file to the indicated Arrow OutputStream
+PARQUET_EXPORT
+::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+/// \brief Write a Table to Parquet.
+::arrow::Status PARQUET_EXPORT
+WriteTable(const ::arrow::Table& table, MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
+ std::shared_ptr<WriterProperties> properties = default_writer_properties(),
+ std::shared_ptr<ArrowWriterProperties> arrow_properties =
+ default_arrow_writer_properties());
+
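+// A minimal usage sketch, assuming an existing ::arrow::Table named `table`
+// and an output path chosen by the caller:
+//
+//   ARROW_ASSIGN_OR_RAISE(auto sink,
+//                         ::arrow::io::FileOutputStream::Open("/tmp/out.parquet"));
+//   ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+//       *table, ::arrow::default_memory_pool(), sink, /*chunk_size=*/64 * 1024));
+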
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc
new file mode 100644
index 00000000000..f6f6d327d06
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/result.h"
+#include "arrow/util/logging.h"
+#include "parquet/bloom_filter.h"
+#include "parquet/exception.h"
+#include "parquet/murmur3.h"
+
+namespace parquet {
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+ : pool_(::arrow::default_memory_pool()),
+ hash_strategy_(HashStrategy::MURMUR3_X64_128),
+ algorithm_(Algorithm::BLOCK) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+ if (num_bytes < kMinimumBloomFilterBytes) {
+ num_bytes = kMinimumBloomFilterBytes;
+ }
+
+ // Round up to the next power of 2 if num_bytes is not already one.
+ if ((num_bytes & (num_bytes - 1)) != 0) {
+ num_bytes = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bytes));
+ }
+
+ if (num_bytes > kMaximumBloomFilterBytes) {
+ num_bytes = kMaximumBloomFilterBytes;
+ }
+
+ num_bytes_ = num_bytes;
+ PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
+ memset(data_->mutable_data(), 0, num_bytes_);
+
+ this->hasher_.reset(new MurmurHash3());
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+ DCHECK(bitset != nullptr);
+
+ if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+ (num_bytes & (num_bytes - 1)) != 0) {
+ throw ParquetException("Given length of bitset is illegal");
+ }
+
+ num_bytes_ = num_bytes;
+ PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
+ memcpy(data_->mutable_data(), bitset, num_bytes_);
+
+ this->hasher_.reset(new MurmurHash3());
+}
+
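+// The stream layout consumed here mirrors what WriteTo() produces: a 4-byte
+// bitset length, a 4-byte hash strategy, a 4-byte algorithm id, and then the
+// bitset bytes themselves.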
+BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(ArrowInputStream* input) {
+ uint32_t len, hash, algorithm;
+ int64_t bytes_available;
+
+ PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &len));
+ if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
+ throw ParquetException("Failed to deserialize from input stream");
+ }
+
+ PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &hash));
+ if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
+ throw ParquetException("Failed to deserialize from input stream");
+ }
+ if (static_cast<HashStrategy>(hash) != HashStrategy::MURMUR3_X64_128) {
+ throw ParquetException("Unsupported hash strategy");
+ }
+
+ PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &algorithm));
+ if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
+ throw ParquetException("Failed to deserialize from input stream");
+ }
+ if (static_cast<Algorithm>(algorithm) != BloomFilter::Algorithm::BLOCK) {
+ throw ParquetException("Unsupported Bloom filter algorithm");
+ }
+
+ BlockSplitBloomFilter bloom_filter;
+
+ PARQUET_ASSIGN_OR_THROW(auto buffer, input->Read(len));
+ bloom_filter.Init(buffer->data(), len);
+ return bloom_filter;
+}
+
+void BlockSplitBloomFilter::WriteTo(ArrowOutputStream* sink) const {
+ DCHECK(sink != nullptr);
+
+ PARQUET_THROW_NOT_OK(
+ sink->Write(reinterpret_cast<const uint8_t*>(&num_bytes_), sizeof(num_bytes_)));
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<const uint8_t*>(&hash_strategy_),
+ sizeof(hash_strategy_)));
+ PARQUET_THROW_NOT_OK(
+ sink->Write(reinterpret_cast<const uint8_t*>(&algorithm_), sizeof(algorithm_)));
+ PARQUET_THROW_NOT_OK(sink->Write(data_->mutable_data(), num_bytes_));
+}
+
+void BlockSplitBloomFilter::SetMask(uint32_t key, BlockMask& block_mask) const {
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ block_mask.item[i] = key * SALT[i];
+ }
+
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ block_mask.item[i] = block_mask.item[i] >> 27;
+ }
+
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ block_mask.item[i] = UINT32_C(0x1) << block_mask.item[i];
+ }
+}
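+
+// Net effect of SetMask(): each of the eight 32-bit mask words has exactly one
+// bit set, selected by the top five bits of key * SALT[i] (a deliberate
+// wrap-around uint32 multiply), as in the split-block Bloom filter scheme.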
+
+bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
+ const uint32_t bucket_index =
+ static_cast<uint32_t>((hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1));
+ uint32_t key = static_cast<uint32_t>(hash);
+ uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
+
+ // Calculate mask for bucket.
+ BlockMask block_mask;
+ SetMask(key, block_mask);
+
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ if (0 == (bitset32[kBitsSetPerBlock * bucket_index + i] & block_mask.item[i])) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void BlockSplitBloomFilter::InsertHash(uint64_t hash) {
+ const uint32_t bucket_index =
+ static_cast<uint32_t>(hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1);
+ uint32_t key = static_cast<uint32_t>(hash);
+ uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
+
+ // Calculate mask for bucket.
+ BlockMask block_mask;
+ SetMask(key, block_mask);
+
+ for (int i = 0; i < kBitsSetPerBlock; i++) {
+ bitset32[bucket_index * kBitsSetPerBlock + i] |= block_mask.item[i];
+ }
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h
new file mode 100644
index 00000000000..39f9561ae5b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h
@@ -0,0 +1,247 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+#include <memory>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/logging.h"
+#include "parquet/hasher.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+// A Bloom filter is a compact structure that indicates whether an item is
+// definitely not in a set or probably in it. A Bloom filter usually consists
+// of a bitset that represents a set of elements, a hash strategy and a Bloom
+// filter algorithm.
+class PARQUET_EXPORT BloomFilter {
+ public:
+ // Maximum Bloom filter size; set to the HDFS default block size (128 MB).
+ // This value will be reconsidered when implementing the Bloom filter producer.
+ static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
+
+ /// Determine whether an element exists in the set or not.
+ ///
+ /// @param hash the hash of the element to check.
+ /// @return false if the value is definitely not in the set; true means the
+ /// value is PROBABLY in the set.
+ virtual bool FindHash(uint64_t hash) const = 0;
+
+ /// Insert an element into the set represented by the Bloom filter bitset.
+ /// @param hash the hash of the value to insert into the Bloom filter.
+ virtual void InsertHash(uint64_t hash) = 0;
+
+ /// Write this Bloom filter to an output stream. A Bloom filter structure should
+ /// include bitset length, hash strategy, algorithm, and bitset.
+ ///
+ /// @param sink the output stream to write to
+ virtual void WriteTo(ArrowOutputStream* sink) const = 0;
+
+ /// Get the number of bytes of bitset
+ virtual uint32_t GetBitsetSize() const = 0;
+
+ /// Compute hash for a 32-bit value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int32_t value) const = 0;
+
+ /// Compute hash for a 64-bit value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int64_t value) const = 0;
+
+ /// Compute hash for float value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(float value) const = 0;
+
+ /// Compute hash for double value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(double value) const = 0;
+
+ /// Compute hash for Int96 value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const Int96* value) const = 0;
+
+ /// Compute hash for ByteArray value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const ByteArray* value) const = 0;
+
+ /// Compute hash for fixed byte array value by using its plain encoding result.
+ ///
+ /// @param value the value address.
+ /// @param len the value length.
+ /// @return hash result.
+ virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
+
+ virtual ~BloomFilter() {}
+
+ protected:
+ // Hash strategy available for Bloom filter.
+ enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };
+
+ // Bloom filter algorithm.
+ enum class Algorithm : uint32_t { BLOCK = 0 };
+};
+
+// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
+// Putze et al.'s "Cache-, Hash- and Space-Efficient Bloom Filters". The basic
+// idea is to hash the item to a tiny Bloom filter whose size fits a single
+// cache line or smaller.
+//
+// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
+// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
+class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
+ public:
+ /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
+ BlockSplitBloomFilter();
+
+ /// Initialize the BlockSplitBloomFilter. num_bytes should be within
+ /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes]; out-of-range values
+ /// are clamped to the nearest bound, and the result is rounded up to a
+ /// power of 2.
+ ///
+ /// @param num_bytes The number of bytes to store Bloom filter bitset.
+ void Init(uint32_t num_bytes);
+
+ /// Initialize the BlockSplitBloomFilter. The given bitset is copied into an
+ /// internal buffer because it may not satisfy the 32-byte alignment
+ /// requirement, which could lead to a segfault when executing SIMD
+ /// instructions. It is the caller's responsibility to free the bitset passed
+ /// in. This is used when reconstructing a Bloom filter from a parquet file.
+ ///
+ /// @param bitset The given bitset to initialize the Bloom filter.
+ /// @param num_bytes The number of bytes of given bitset.
+ void Init(const uint8_t* bitset, uint32_t num_bytes);
+
+ // Minimum Bloom filter size; set to 32 bytes, the size of a single tiny Bloom filter.
+ static constexpr uint32_t kMinimumBloomFilterBytes = 32;
+
+ /// Calculate the optimal bitset size, in bits, according to the number of
+ /// distinct values and the false positive probability.
+ ///
+ /// @param ndv The number of distinct values.
+ /// @param fpp The false positive probability.
+ /// @return a number of bits between kMinimumBloomFilterBytes * 8 and
+ /// kMaximumBloomFilterBytes * 8; the return value is always a power of 2.
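+ /// As a rough worked example (illustrative): ndv = 1,000,000 and fpp = 0.01
+ /// give m of roughly 9.7 million bits, which rounds up to 2^24 bits, i.e. a
+ /// 2 MiB bitset.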
+ static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
+ DCHECK(fpp > 0.0 && fpp < 1.0);
+ const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
+ uint32_t num_bits;
+
+ // Handle overflow.
+ if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
+ num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
+ } else {
+ num_bits = static_cast<uint32_t>(m);
+ }
+
+ // Clamp to the lower bound.
+ if (num_bits < kMinimumBloomFilterBytes << 3) {
+ num_bits = kMinimumBloomFilterBytes << 3;
+ }
+
+ // Round up to the next power of 2 if num_bits is not already one.
+ if ((num_bits & (num_bits - 1)) != 0) {
+ num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
+ }
+
+ // Clamp to the upper bound.
+ if (num_bits > kMaximumBloomFilterBytes << 3) {
+ num_bits = kMaximumBloomFilterBytes << 3;
+ }
+
+ return num_bits;
+ }
+
+ bool FindHash(uint64_t hash) const override;
+ void InsertHash(uint64_t hash) override;
+ void WriteTo(ArrowOutputStream* sink) const override;
+ uint32_t GetBitsetSize() const override { return num_bytes_; }
+
+ uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(float value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(double value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(const FLBA* value, uint32_t len) const override {
+ return hasher_->Hash(value, len);
+ }
+
+ /// Deserialize the Bloom filter from an input stream. It is used when
+ /// reconstructing a Bloom filter from a parquet file.
+ ///
+ /// @param input_stream The input stream from which to construct the Bloom filter
+ /// @return The BlockSplitBloomFilter.
+ static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);
+
+ private:
+ // Bytes in a tiny Bloom filter block.
+ static constexpr int kBytesPerFilterBlock = 32;
+
+ // The number of bits to be set in each tiny Bloom filter
+ static constexpr int kBitsSetPerBlock = 8;
+
+ // A mask structure used to set bits in each tiny Bloom filter.
+ struct BlockMask {
+ uint32_t item[kBitsSetPerBlock];
+ };
+
+ // The block-based algorithm needs eight odd SALT values to calculate eight
+ // bit indexes to set, one bit in each 32-bit word.
+ static constexpr uint32_t SALT[kBitsSetPerBlock] = {
+ 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
+ 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
+
+ /// Set bits in the mask array according to the input key.
+ /// @param key the value used to calculate the mask values.
+ /// @param mask the mask array to be set for a block.
+ void SetMask(uint32_t key, BlockMask& mask) const;
+
+ // Memory pool to allocate aligned buffer for bitset
+ ::arrow::MemoryPool* pool_;
+
+ // The underlying buffer of bitset.
+ std::shared_ptr<Buffer> data_;
+
+ // The number of bytes of Bloom filter bitset.
+ uint32_t num_bytes_;
+
+ // Hash strategy used in this Bloom filter.
+ HashStrategy hash_strategy_;
+
+ // Algorithm used in this Bloom filter.
+ Algorithm algorithm_;
+
+ // The hash pointer points to actual hash class used.
+ std::unique_ptr<Hasher> hasher_;
+};
+
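+// A minimal usage sketch (illustrative values, int64 keys):
+//
+//   BlockSplitBloomFilter filter;
+//   filter.Init(BlockSplitBloomFilter::OptimalNumOfBits(/*ndv=*/1000,
+//                                                       /*fpp=*/0.01) / 8);
+//   filter.InsertHash(filter.Hash(static_cast<int64_t>(42)));
+//   bool maybe_present = filter.FindHash(filter.Hash(static_cast<int64_t>(42)));
+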
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h
new file mode 100644
index 00000000000..2fab77ed01a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines an abstract interface for iterating through pages in a
+// Parquet column chunk within a row group. It could be extended in the future
+// to iterate through all data pages in all chunks in a file.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+// TODO: Parallel processing is not yet safe because of memory-ownership
+// semantics (the PageReader may or may not own the memory referenced by a
+// page)
+//
+// TODO(wesm): In the future Parquet implementations may store the crc code
+// in format::PageHeader. parquet-mr currently does not, so we also skip it
+// here, both on the read and write path
+class Page {
+ public:
+ Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
+ : buffer_(buffer), type_(type) {}
+
+ PageType::type type() const { return type_; }
+
+ std::shared_ptr<Buffer> buffer() const { return buffer_; }
+
+ // @returns: a pointer to the page's data
+ const uint8_t* data() const { return buffer_->data(); }
+
+ // @returns: the total size in bytes of the page's data buffer
+ int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
+
+ private:
+ std::shared_ptr<Buffer> buffer_;
+ PageType::type type_;
+};
+
+/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
+class DataPage : public Page {
+ public:
+ int32_t num_values() const { return num_values_; }
+ Encoding::type encoding() const { return encoding_; }
+ int64_t uncompressed_size() const { return uncompressed_size_; }
+ const EncodedStatistics& statistics() const { return statistics_; }
+
+ virtual ~DataPage() = default;
+
+ protected:
+ DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+ Encoding::type encoding, int64_t uncompressed_size,
+ const EncodedStatistics& statistics = EncodedStatistics())
+ : Page(buffer, type),
+ num_values_(num_values),
+ encoding_(encoding),
+ uncompressed_size_(uncompressed_size),
+ statistics_(statistics) {}
+
+ int32_t num_values_;
+ Encoding::type encoding_;
+ int64_t uncompressed_size_;
+ EncodedStatistics statistics_;
+};
+
+class DataPageV1 : public DataPage {
+ public:
+ DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+ Encoding::type encoding, Encoding::type definition_level_encoding,
+ Encoding::type repetition_level_encoding, int64_t uncompressed_size,
+ const EncodedStatistics& statistics = EncodedStatistics())
+ : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
+ statistics),
+ definition_level_encoding_(definition_level_encoding),
+ repetition_level_encoding_(repetition_level_encoding) {}
+
+ Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
+
+ Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
+
+ private:
+ Encoding::type definition_level_encoding_;
+ Encoding::type repetition_level_encoding_;
+};
+
+class DataPageV2 : public DataPage {
+ public:
+ DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
+ int32_t num_rows, Encoding::type encoding,
+ int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
+ int64_t uncompressed_size, bool is_compressed = false,
+ const EncodedStatistics& statistics = EncodedStatistics())
+ : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
+ statistics),
+ num_nulls_(num_nulls),
+ num_rows_(num_rows),
+ definition_levels_byte_length_(definition_levels_byte_length),
+ repetition_levels_byte_length_(repetition_levels_byte_length),
+ is_compressed_(is_compressed) {}
+
+ int32_t num_nulls() const { return num_nulls_; }
+
+ int32_t num_rows() const { return num_rows_; }
+
+ int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
+
+ int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
+
+ bool is_compressed() const { return is_compressed_; }
+
+ private:
+ int32_t num_nulls_;
+ int32_t num_rows_;
+ int32_t definition_levels_byte_length_;
+ int32_t repetition_levels_byte_length_;
+ bool is_compressed_;
+};
+
+class DictionaryPage : public Page {
+ public:
+ DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+ Encoding::type encoding, bool is_sorted = false)
+ : Page(buffer, PageType::DICTIONARY_PAGE),
+ num_values_(num_values),
+ encoding_(encoding),
+ is_sorted_(is_sorted) {}
+
+ int32_t num_values() const { return num_values_; }
+
+ Encoding::type encoding() const { return encoding_; }
+
+ bool is_sorted() const { return is_sorted_; }
+
+ private:
+ int32_t num_values_;
+ Encoding::type encoding_;
+ bool is_sorted_;
+};
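+
+// A consumer typically dispatches on Page::type(): for example (sketch), a
+// PageType::DICTIONARY_PAGE page is cast to DictionaryPage to inspect its
+// encoding, while DATA_PAGE / DATA_PAGE_V2 pages are cast to DataPageV1 /
+// DataPageV2 to read value counts, encodings and level metadata.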
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc
new file mode 100644
index 00000000000..047d99fed9a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc
@@ -0,0 +1,1802 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/column_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/chunked_array.h"
+#include "arrow/type.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/rle_encoding.h"
+#include "parquet/column_page.h"
+#include "parquet/encoding.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/level_comparison.h"
+#include "parquet/level_conversion.h"
+#include "parquet/properties.h"
+#include "parquet/statistics.h"
+#include "parquet/thrift_internal.h" // IWYU pragma: keep
+// Required after "arrow/util/int_util_internal.h" (for OPTIONAL)
+#include "parquet/windows_compatibility.h"
+
+using arrow::MemoryPool;
+using arrow::internal::AddWithOverflow;
+using arrow::internal::checked_cast;
+using arrow::internal::MultiplyWithOverflow;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+namespace {
+inline bool HasSpacedValues(const ColumnDescriptor* descr) {
+ if (descr->max_repetition_level() > 0) {
+ // repeated+flat case
+ return !descr->schema_node()->is_required();
+ } else {
+ // non-repeated+nested case
+ // Find if a node forces nulls in the lowest level along the hierarchy
+ const schema::Node* node = descr->schema_node().get();
+ while (node) {
+ if (node->is_optional()) {
+ return true;
+ }
+ node = node->parent();
+ }
+ return false;
+ }
+}
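+
+// For example (illustrative): a required leaf in the repeated+flat case needs
+// no spaced values, while any optional node along the path to the leaf forces
+// spaced (null-aware) reads.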
+} // namespace
+
+LevelDecoder::LevelDecoder() : num_values_remaining_(0) {}
+
+LevelDecoder::~LevelDecoder() {}
+
+int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values, const uint8_t* data,
+ int32_t data_size) {
+ max_level_ = max_level;
+ int32_t num_bytes = 0;
+ encoding_ = encoding;
+ num_values_remaining_ = num_buffered_values;
+ bit_width_ = BitUtil::Log2(max_level + 1);
+ switch (encoding) {
+ case Encoding::RLE: {
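+ // RLE-encoded levels are laid out as a 4-byte length prefix followed by
+ // that many bytes of RLE/bit-packed hybrid data.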
+ if (data_size < 4) {
+ throw ParquetException("Received invalid levels (corrupt data page?)");
+ }
+ num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data);
+ if (num_bytes < 0 || num_bytes > data_size - 4) {
+ throw ParquetException("Received invalid number of bytes (corrupt data page?)");
+ }
+ const uint8_t* decoder_data = data + 4;
+ if (!rle_decoder_) {
+ rle_decoder_.reset(
+ new ::arrow::util::RleDecoder(decoder_data, num_bytes, bit_width_));
+ } else {
+ rle_decoder_->Reset(decoder_data, num_bytes, bit_width_);
+ }
+ return 4 + num_bytes;
+ }
+ case Encoding::BIT_PACKED: {
+ int num_bits = 0;
+ if (MultiplyWithOverflow(num_buffered_values, bit_width_, &num_bits)) {
+ throw ParquetException(
+ "Number of buffered values too large (corrupt data page?)");
+ }
+ num_bytes = static_cast<int32_t>(BitUtil::BytesForBits(num_bits));
+ if (num_bytes < 0 || num_bytes > data_size - 4) {
+ throw ParquetException("Received invalid number of bytes (corrupt data page?)");
+ }
+ if (!bit_packed_decoder_) {
+ bit_packed_decoder_.reset(new ::arrow::BitUtil::BitReader(data, num_bytes));
+ } else {
+ bit_packed_decoder_->Reset(data, num_bytes);
+ }
+ return num_bytes;
+ }
+ default:
+ throw ParquetException("Unknown encoding type for levels.");
+ }
+ return -1;
+}
+
+void LevelDecoder::SetDataV2(int32_t num_bytes, int16_t max_level,
+ int num_buffered_values, const uint8_t* data) {
+ max_level_ = max_level;
+ // Repetition and definition levels always use RLE encoding
+ // in the DataPageV2 format.
+ if (num_bytes < 0) {
+ throw ParquetException("Invalid page header (corrupt data page?)");
+ }
+ encoding_ = Encoding::RLE;
+ num_values_remaining_ = num_buffered_values;
+ bit_width_ = BitUtil::Log2(max_level + 1);
+
+ if (!rle_decoder_) {
+ rle_decoder_.reset(new ::arrow::util::RleDecoder(data, num_bytes, bit_width_));
+ } else {
+ rle_decoder_->Reset(data, num_bytes, bit_width_);
+ }
+}
+
+int LevelDecoder::Decode(int batch_size, int16_t* levels) {
+ int num_decoded = 0;
+
+ int num_values = std::min(num_values_remaining_, batch_size);
+ if (encoding_ == Encoding::RLE) {
+ num_decoded = rle_decoder_->GetBatch(levels, num_values);
+ } else {
+ num_decoded = bit_packed_decoder_->GetBatch(bit_width_, levels, num_values);
+ }
+ if (num_decoded > 0) {
+ internal::MinMax min_max = internal::FindMinMax(levels, num_decoded);
+ if (ARROW_PREDICT_FALSE(min_max.min < 0 || min_max.max > max_level_)) {
+ std::stringstream ss;
+ ss << "Malformed levels. min: " << min_max.min << " max: " << min_max.max
+ << " out of range. Max Level: " << max_level_;
+ throw ParquetException(ss.str());
+ }
+ }
+ num_values_remaining_ -= num_decoded;
+ return num_decoded;
+}
+
+ReaderProperties default_reader_properties() {
+ static ReaderProperties default_reader_properties;
+ return default_reader_properties;
+}
+
+namespace {
+
+// Extracts encoded statistics from V1 and V2 data page headers
+template <typename H>
+EncodedStatistics ExtractStatsFromHeader(const H& header) {
+ EncodedStatistics page_statistics;
+ if (!header.__isset.statistics) {
+ return page_statistics;
+ }
+ const format::Statistics& stats = header.statistics;
+ if (stats.__isset.max) {
+ page_statistics.set_max(stats.max);
+ }
+ if (stats.__isset.min) {
+ page_statistics.set_min(stats.min);
+ }
+ if (stats.__isset.null_count) {
+ page_statistics.set_null_count(stats.null_count);
+ }
+ if (stats.__isset.distinct_count) {
+ page_statistics.set_distinct_count(stats.distinct_count);
+ }
+ return page_statistics;
+}
+
+// ----------------------------------------------------------------------
+// SerializedPageReader deserializes Thrift metadata and pages that have been
+// assembled in a serialized stream for storage in a Parquet file.
+
+// This subclass delimits pages appearing in a serialized stream, each preceded
+// by a serialized Thrift format::PageHeader indicating the type of each page
+// and the page metadata.
+class SerializedPageReader : public PageReader {
+ public:
+ SerializedPageReader(std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
+ Compression::type codec, ::arrow::MemoryPool* pool,
+ const CryptoContext* crypto_ctx)
+ : stream_(std::move(stream)),
+ decompression_buffer_(AllocateBuffer(pool, 0)),
+ page_ordinal_(0),
+ seen_num_rows_(0),
+ total_num_rows_(total_num_rows),
+ decryption_buffer_(AllocateBuffer(pool, 0)) {
+ if (crypto_ctx != nullptr) {
+ crypto_ctx_ = *crypto_ctx;
+ InitDecryption();
+ }
+ max_page_header_size_ = kDefaultMaxPageHeaderSize;
+ decompressor_ = GetCodec(codec);
+ }
+
+ // Implement the PageReader interface
+ std::shared_ptr<Page> NextPage() override;
+
+ void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; }
+
+ private:
+ void UpdateDecryption(const std::shared_ptr<Decryptor>& decryptor, int8_t module_type,
+ const std::string& page_aad);
+
+ void InitDecryption();
+
+ std::shared_ptr<Buffer> DecompressIfNeeded(std::shared_ptr<Buffer> page_buffer,
+ int compressed_len, int uncompressed_len,
+ int levels_byte_len = 0);
+
+ std::shared_ptr<ArrowInputStream> stream_;
+
+ format::PageHeader current_page_header_;
+ std::shared_ptr<Page> current_page_;
+
+ // Compression codec to use.
+ std::unique_ptr<::arrow::util::Codec> decompressor_;
+ std::shared_ptr<ResizableBuffer> decompression_buffer_;
+
+ // The fields below are used for calculation of AAD (additional authenticated data)
+ // suffix which is part of the Parquet Modular Encryption.
+ // The AAD suffix for a parquet module is built internally by
+ // concatenating different parts, some of which include
+ // the row group ordinal, column ordinal and page ordinal.
+ // Please refer to the encryption specification for more details:
+ // https://github.com/apache/parquet-format/blob/encryption/Encryption.md#44-additional-authenticated-data
+
+ // The ordinal fields in the context below are used for AAD suffix calculation.
+ CryptoContext crypto_ctx_;
+ int16_t page_ordinal_; // page ordinal does not count the dictionary page
+
+ // Maximum allowed page header size
+ uint32_t max_page_header_size_;
+
+ // Number of rows read in data pages so far
+ int64_t seen_num_rows_;
+
+ // Number of rows in all the data pages
+ int64_t total_num_rows_;
+
+ // data_page_aad_ and data_page_header_aad_ contain the AAD for the data pages
+ // and the data page headers of a single column, respectively.
+ // While calculating the AAD for different pages in a single column, only the
+ // page ordinal part of the AAD is updated.
+ std::string data_page_aad_;
+ std::string data_page_header_aad_;
+ // Encryption
+ std::shared_ptr<ResizableBuffer> decryption_buffer_;
+};
+
+void SerializedPageReader::InitDecryption() {
+ // Prepare the AAD for quick update later.
+ if (crypto_ctx_.data_decryptor != nullptr) {
+ DCHECK(!crypto_ctx_.data_decryptor->file_aad().empty());
+ data_page_aad_ = encryption::CreateModuleAad(
+ crypto_ctx_.data_decryptor->file_aad(), encryption::kDataPage,
+ crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal);
+ }
+ if (crypto_ctx_.meta_decryptor != nullptr) {
+ DCHECK(!crypto_ctx_.meta_decryptor->file_aad().empty());
+ data_page_header_aad_ = encryption::CreateModuleAad(
+ crypto_ctx_.meta_decryptor->file_aad(), encryption::kDataPageHeader,
+ crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal);
+ }
+}
+
+void SerializedPageReader::UpdateDecryption(const std::shared_ptr<Decryptor>& decryptor,
+ int8_t module_type,
+ const std::string& page_aad) {
+ DCHECK(decryptor != nullptr);
+ if (crypto_ctx_.start_decrypt_with_dictionary_page) {
+ std::string aad = encryption::CreateModuleAad(
+ decryptor->file_aad(), module_type, crypto_ctx_.row_group_ordinal,
+ crypto_ctx_.column_ordinal, kNonPageOrdinal);
+ decryptor->UpdateAad(aad);
+ } else {
+ encryption::QuickUpdatePageAad(page_aad, page_ordinal_);
+ decryptor->UpdateAad(page_aad);
+ }
+}
+
+std::shared_ptr<Page> SerializedPageReader::NextPage() {
+ // Loop here because there may be unhandled page types that we skip over
+ // until we find a page we know how to handle.
+
+ while (seen_num_rows_ < total_num_rows_) {
+ uint32_t header_size = 0;
+ uint32_t allowed_page_size = kDefaultPageHeaderSize;
+
+ // Page headers can be very large because of page statistics, so we
+ // progressively retry deserialization with a larger buffer, up to the
+ // maximum allowed header size.
+ while (true) {
+ PARQUET_ASSIGN_OR_THROW(auto view, stream_->Peek(allowed_page_size));
+ if (view.size() == 0) {
+ return std::shared_ptr<Page>(nullptr);
+ }
+
+ // header_size is passed in as the number of available bytes and is
+ // overwritten with the actual header size by DeserializeThriftMsg.
+ header_size = static_cast<uint32_t>(view.size());
+ try {
+ if (crypto_ctx_.meta_decryptor != nullptr) {
+ UpdateDecryption(crypto_ctx_.meta_decryptor, encryption::kDictionaryPageHeader,
+ data_page_header_aad_);
+ }
+ DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(view.data()), &header_size,
+ &current_page_header_, crypto_ctx_.meta_decryptor);
+ break;
+ } catch (std::exception& e) {
+ // Failed to deserialize. Double the allowed page header size and try again
+ std::stringstream ss;
+ ss << e.what();
+ allowed_page_size *= 2;
+ if (allowed_page_size > max_page_header_size_) {
+ ss << "Deserializing page header failed.\n";
+ throw ParquetException(ss.str());
+ }
+ }
+ }
+ // Advance the stream offset
+ PARQUET_THROW_NOT_OK(stream_->Advance(header_size));
+
+ int compressed_len = current_page_header_.compressed_page_size;
+ int uncompressed_len = current_page_header_.uncompressed_page_size;
+ if (compressed_len < 0 || uncompressed_len < 0) {
+ throw ParquetException("Invalid page header");
+ }
+
+ if (crypto_ctx_.data_decryptor != nullptr) {
+ UpdateDecryption(crypto_ctx_.data_decryptor, encryption::kDictionaryPage,
+ data_page_aad_);
+ }
+
+ // Read the compressed data page.
+ PARQUET_ASSIGN_OR_THROW(auto page_buffer, stream_->Read(compressed_len));
+ if (page_buffer->size() != compressed_len) {
+ std::stringstream ss;
+ ss << "Page was smaller (" << page_buffer->size() << ") than expected ("
+ << compressed_len << ")";
+ ParquetException::EofException(ss.str());
+ }
+
+ // Decrypt it if we need to
+ if (crypto_ctx_.data_decryptor != nullptr) {
+ PARQUET_THROW_NOT_OK(decryption_buffer_->Resize(
+ compressed_len - crypto_ctx_.data_decryptor->CiphertextSizeDelta(), false));
+ compressed_len = crypto_ctx_.data_decryptor->Decrypt(
+ page_buffer->data(), compressed_len, decryption_buffer_->mutable_data());
+
+ page_buffer = decryption_buffer_;
+ }
+
+ const PageType::type page_type = LoadEnumSafe(&current_page_header_.type);
+
+ if (page_type == PageType::DICTIONARY_PAGE) {
+ crypto_ctx_.start_decrypt_with_dictionary_page = false;
+ const format::DictionaryPageHeader& dict_header =
+ current_page_header_.dictionary_page_header;
+
+ bool is_sorted = dict_header.__isset.is_sorted ? dict_header.is_sorted : false;
+ if (dict_header.num_values < 0) {
+ throw ParquetException("Invalid page header (negative number of values)");
+ }
+
+ // Uncompress if needed
+ page_buffer =
+ DecompressIfNeeded(std::move(page_buffer), compressed_len, uncompressed_len);
+
+ return std::make_shared<DictionaryPage>(page_buffer, dict_header.num_values,
+ LoadEnumSafe(&dict_header.encoding),
+ is_sorted);
+ } else if (page_type == PageType::DATA_PAGE) {
+ ++page_ordinal_;
+ const format::DataPageHeader& header = current_page_header_.data_page_header;
+
+ if (header.num_values < 0) {
+ throw ParquetException("Invalid page header (negative number of values)");
+ }
+ EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
+ seen_num_rows_ += header.num_values;
+
+ // Uncompress if needed
+ page_buffer =
+ DecompressIfNeeded(std::move(page_buffer), compressed_len, uncompressed_len);
+
+ return std::make_shared<DataPageV1>(page_buffer, header.num_values,
+ LoadEnumSafe(&header.encoding),
+ LoadEnumSafe(&header.definition_level_encoding),
+ LoadEnumSafe(&header.repetition_level_encoding),
+ uncompressed_len, page_statistics);
+ } else if (page_type == PageType::DATA_PAGE_V2) {
+ ++page_ordinal_;
+ const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2;
+
+ if (header.num_values < 0) {
+ throw ParquetException("Invalid page header (negative number of values)");
+ }
+ if (header.definition_levels_byte_length < 0 ||
+ header.repetition_levels_byte_length < 0) {
+ throw ParquetException("Invalid page header (negative levels byte length)");
+ }
+ bool is_compressed = header.__isset.is_compressed ? header.is_compressed : false;
+ EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
+ seen_num_rows_ += header.num_values;
+
+ // Uncompress if needed
+ int levels_byte_len;
+ if (AddWithOverflow(header.definition_levels_byte_length,
+ header.repetition_levels_byte_length, &levels_byte_len)) {
+ throw ParquetException("Levels size too large (corrupt file?)");
+ }
+ // DecompressIfNeeded doesn't take `is_compressed` into account as
+ // it's page type-agnostic.
+ if (is_compressed) {
+ page_buffer = DecompressIfNeeded(std::move(page_buffer), compressed_len,
+ uncompressed_len, levels_byte_len);
+ }
+
+ return std::make_shared<DataPageV2>(
+ page_buffer, header.num_values, header.num_nulls, header.num_rows,
+ LoadEnumSafe(&header.encoding), header.definition_levels_byte_length,
+ header.repetition_levels_byte_length, uncompressed_len, is_compressed,
+ page_statistics);
+ } else {
+ // We don't know what this page type is. We're allowed to skip non-data
+ // pages.
+ continue;
+ }
+ }
+ return std::shared_ptr<Page>(nullptr);
+}
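+
+// Usage sketch (illustrative only): callers drain the page stream until
+// NextPage() returns a null shared_ptr, which signals end-of-stream:
+//
+//   std::shared_ptr<Page> page;
+//   while ((page = pager->NextPage()) != nullptr) {
+//     // dispatch on page->type() ...
+//   }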
+
+std::shared_ptr<Buffer> SerializedPageReader::DecompressIfNeeded(
+ std::shared_ptr<Buffer> page_buffer, int compressed_len, int uncompressed_len,
+ int levels_byte_len) {
+ if (decompressor_ == nullptr) {
+ return page_buffer;
+ }
+ if (compressed_len < levels_byte_len || uncompressed_len < levels_byte_len) {
+ throw ParquetException("Invalid page header");
+ }
+
+ // Grow the uncompressed buffer if we need to.
+ if (uncompressed_len > static_cast<int>(decompression_buffer_->size())) {
+ PARQUET_THROW_NOT_OK(decompression_buffer_->Resize(uncompressed_len, false));
+ }
+
+ if (levels_byte_len > 0) {
+ // First copy the levels as-is
+ uint8_t* decompressed = decompression_buffer_->mutable_data();
+ memcpy(decompressed, page_buffer->data(), levels_byte_len);
+ }
+
+ // Decompress the values
+ PARQUET_THROW_NOT_OK(decompressor_->Decompress(
+ compressed_len - levels_byte_len, page_buffer->data() + levels_byte_len,
+ uncompressed_len - levels_byte_len,
+ decompression_buffer_->mutable_data() + levels_byte_len));
+
+ return decompression_buffer_;
+}
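+
+// Worked example for the arithmetic above, assuming a V2 page with
+// compressed_len = 100, uncompressed_len = 400 and levels_byte_len = 24:
+// bytes [0, 24) of the page hold the (never-compressed) levels and are
+// copied verbatim, while bytes [24, 100) are decompressed into
+// 400 - 24 = 376 bytes starting at offset 24 of the output buffer.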
+
+} // namespace
+
+std::unique_ptr<PageReader> PageReader::Open(std::shared_ptr<ArrowInputStream> stream,
+ int64_t total_num_rows,
+ Compression::type codec,
+ ::arrow::MemoryPool* pool,
+ const CryptoContext* ctx) {
+ return std::unique_ptr<PageReader>(
+ new SerializedPageReader(std::move(stream), total_num_rows, codec, pool, ctx));
+}
+
+namespace {
+
+// ----------------------------------------------------------------------
+// Impl base class for TypedColumnReader and RecordReader
+
+// PLAIN_DICTIONARY is deprecated but was formerly used as a dictionary
+// index encoding.
+static bool IsDictionaryIndexEncoding(const Encoding::type& e) {
+ return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY;
+}
+
+template <typename DType>
+class ColumnReaderImplBase {
+ public:
+ using T = typename DType::c_type;
+
+ ColumnReaderImplBase(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool)
+ : descr_(descr),
+ max_def_level_(descr->max_definition_level()),
+ max_rep_level_(descr->max_repetition_level()),
+ num_buffered_values_(0),
+ num_decoded_values_(0),
+ pool_(pool),
+ current_decoder_(nullptr),
+ current_encoding_(Encoding::UNKNOWN) {}
+
+ virtual ~ColumnReaderImplBase() = default;
+
+ protected:
+ // Read up to batch_size values from the current data page into the
+ // pre-allocated memory T*
+ //
+ // @returns: the number of values read into the out buffer
+ int64_t ReadValues(int64_t batch_size, T* out) {
+ int64_t num_decoded = current_decoder_->Decode(out, static_cast<int>(batch_size));
+ return num_decoded;
+ }
+
+ // Read up to batch_size values from the current data page into the
+ // pre-allocated memory T*, leaving spaces for null entries according
+ // to the def_levels.
+ //
+ // @returns: the number of values read into the out buffer
+ int64_t ReadValuesSpaced(int64_t batch_size, T* out, int64_t null_count,
+ uint8_t* valid_bits, int64_t valid_bits_offset) {
+ return current_decoder_->DecodeSpaced(out, static_cast<int>(batch_size),
+ static_cast<int>(null_count), valid_bits,
+ valid_bits_offset);
+ }
+
+ // Read multiple definition levels into preallocated memory
+ //
+ // Returns the number of decoded definition levels
+ int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels) {
+ if (max_def_level_ == 0) {
+ return 0;
+ }
+ return definition_level_decoder_.Decode(static_cast<int>(batch_size), levels);
+ }
+
+ bool HasNextInternal() {
+ // Either there is no data page available yet, or the data page has been
+ // exhausted
+ if (num_buffered_values_ == 0 || num_decoded_values_ == num_buffered_values_) {
+ if (!ReadNewPage() || num_buffered_values_ == 0) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Read multiple repetition levels into preallocated memory
+ // Returns the number of decoded repetition levels
+ int64_t ReadRepetitionLevels(int64_t batch_size, int16_t* levels) {
+ if (max_rep_level_ == 0) {
+ return 0;
+ }
+ return repetition_level_decoder_.Decode(static_cast<int>(batch_size), levels);
+ }
+
+ // Advance to the next data page
+ bool ReadNewPage() {
+ // Loop until we find the next data page.
+ while (true) {
+ current_page_ = pager_->NextPage();
+ if (!current_page_) {
+ // EOS
+ return false;
+ }
+
+ if (current_page_->type() == PageType::DICTIONARY_PAGE) {
+ ConfigureDictionary(static_cast<const DictionaryPage*>(current_page_.get()));
+ continue;
+ } else if (current_page_->type() == PageType::DATA_PAGE) {
+ const auto page = std::static_pointer_cast<DataPageV1>(current_page_);
+ const int64_t levels_byte_size = InitializeLevelDecoders(
+ *page, page->repetition_level_encoding(), page->definition_level_encoding());
+ InitializeDataDecoder(*page, levels_byte_size);
+ return true;
+ } else if (current_page_->type() == PageType::DATA_PAGE_V2) {
+ const auto page = std::static_pointer_cast<DataPageV2>(current_page_);
+ int64_t levels_byte_size = InitializeLevelDecodersV2(*page);
+ InitializeDataDecoder(*page, levels_byte_size);
+ return true;
+ } else {
+ // We don't know what this page type is. We're allowed to skip non-data
+ // pages.
+ continue;
+ }
+ }
+ return true;
+ }
+
+ void ConfigureDictionary(const DictionaryPage* page) {
+ int encoding = static_cast<int>(page->encoding());
+ if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
+ page->encoding() == Encoding::PLAIN) {
+ encoding = static_cast<int>(Encoding::RLE_DICTIONARY);
+ }
+
+ auto it = decoders_.find(encoding);
+ if (it != decoders_.end()) {
+ throw ParquetException("Column cannot have more than one dictionary.");
+ }
+
+ if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
+ page->encoding() == Encoding::PLAIN) {
+ auto dictionary = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
+ dictionary->SetData(page->num_values(), page->data(), page->size());
+
+ // The dictionary is fully decoded during DictionaryDecoder::Init, so the
+ // DictionaryPage buffer is no longer required after this step
+ //
+ // TODO(wesm): investigate whether this all-or-nothing decoding of the
+ // dictionary makes sense and whether performance can be improved
+
+ std::unique_ptr<DictDecoder<DType>> decoder = MakeDictDecoder<DType>(descr_, pool_);
+ decoder->SetDict(dictionary.get());
+ decoders_[encoding] =
+ std::unique_ptr<DecoderType>(dynamic_cast<DecoderType*>(decoder.release()));
+ } else {
+ ParquetException::NYI("only plain dictionary encoding has been implemented");
+ }
+
+ new_dictionary_ = true;
+ current_decoder_ = decoders_[encoding].get();
+ DCHECK(current_decoder_);
+ }
+
+ // Initialize repetition and definition level decoders on the next data page.
+
+ // If the data page includes repetition and definition levels, we
+ // initialize the level decoders and return the number of encoded level bytes.
+ // The return value helps determine the number of bytes in the encoded data.
+ int64_t InitializeLevelDecoders(const DataPage& page,
+ Encoding::type repetition_level_encoding,
+ Encoding::type definition_level_encoding) {
+ // Read a data page.
+ num_buffered_values_ = page.num_values();
+
+ // Have not decoded any values from the data page yet
+ num_decoded_values_ = 0;
+
+ const uint8_t* buffer = page.data();
+ int32_t levels_byte_size = 0;
+ int32_t max_size = page.size();
+
+ // Data page layout: repetition levels - definition levels - encoded values.
+ // Levels are encoded as RLE or bit-packed.
+ // Init repetition levels
+ if (max_rep_level_ > 0) {
+ int32_t rep_levels_bytes = repetition_level_decoder_.SetData(
+ repetition_level_encoding, max_rep_level_,
+ static_cast<int>(num_buffered_values_), buffer, max_size);
+ buffer += rep_levels_bytes;
+ levels_byte_size += rep_levels_bytes;
+ max_size -= rep_levels_bytes;
+ }
+ // TODO: figure out a way to set max_def_level_ to 0
+ // if the initial value is invalid
+
+ // Init definition levels
+ if (max_def_level_ > 0) {
+ int32_t def_levels_bytes = definition_level_decoder_.SetData(
+ definition_level_encoding, max_def_level_,
+ static_cast<int>(num_buffered_values_), buffer, max_size);
+ levels_byte_size += def_levels_bytes;
+ max_size -= def_levels_bytes;
+ }
+
+ return levels_byte_size;
+ }
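+
+ // Worked example for the V1 layout handled above: with max_rep_level_ = 1
+ // and max_def_level_ = 2, the page buffer is
+ //
+ //   [ rep levels | def levels | encoded values ]
+ //
+ // Each SetData() call returns the byte length of one level run (for RLE,
+ // a 4-byte length prefix plus the RLE payload), so the encoded values
+ // start at page.data() + levels_byte_size (see InitializeDataDecoder).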
+
+ int64_t InitializeLevelDecodersV2(const DataPageV2& page) {
+ // Read a data page.
+ num_buffered_values_ = page.num_values();
+
+ // Have not decoded any values from the data page yet
+ num_decoded_values_ = 0;
+ const uint8_t* buffer = page.data();
+
+ const int64_t total_levels_length =
+ static_cast<int64_t>(page.repetition_levels_byte_length()) +
+ page.definition_levels_byte_length();
+
+ if (total_levels_length > page.size()) {
+ throw ParquetException("Data page too small for levels (corrupt header?)");
+ }
+
+ if (max_rep_level_ > 0) {
+ repetition_level_decoder_.SetDataV2(page.repetition_levels_byte_length(),
+ max_rep_level_,
+ static_cast<int>(num_buffered_values_), buffer);
+ buffer += page.repetition_levels_byte_length();
+ }
+
+ if (max_def_level_ > 0) {
+ definition_level_decoder_.SetDataV2(page.definition_levels_byte_length(),
+ max_def_level_,
+ static_cast<int>(num_buffered_values_), buffer);
+ }
+
+ return total_levels_length;
+ }
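+
+ // Note: unlike V1, a V2 page records the level byte lengths in the page
+ // header itself, so SetDataV2 consumes exactly the advertised number of
+ // bytes and the RLE levels carry no 4-byte length prefix.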
+
+ // Get a decoder object for this page or create a new decoder if this is the
+ // first page with this encoding.
+ void InitializeDataDecoder(const DataPage& page, int64_t levels_byte_size) {
+ const uint8_t* buffer = page.data() + levels_byte_size;
+ const int64_t data_size = page.size() - levels_byte_size;
+
+ if (data_size < 0) {
+ throw ParquetException("Page smaller than size of encoded levels");
+ }
+
+ Encoding::type encoding = page.encoding();
+
+ if (IsDictionaryIndexEncoding(encoding)) {
+ encoding = Encoding::RLE_DICTIONARY;
+ }
+
+ auto it = decoders_.find(static_cast<int>(encoding));
+ if (it != decoders_.end()) {
+ DCHECK(it->second.get() != nullptr);
+ if (encoding == Encoding::RLE_DICTIONARY) {
+ DCHECK(current_decoder_->encoding() == Encoding::RLE_DICTIONARY);
+ }
+ current_decoder_ = it->second.get();
+ } else {
+ switch (encoding) {
+ case Encoding::PLAIN: {
+ auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
+ current_decoder_ = decoder.get();
+ decoders_[static_cast<int>(encoding)] = std::move(decoder);
+ break;
+ }
+ case Encoding::BYTE_STREAM_SPLIT: {
+ auto decoder = MakeTypedDecoder<DType>(Encoding::BYTE_STREAM_SPLIT, descr_);
+ current_decoder_ = decoder.get();
+ decoders_[static_cast<int>(encoding)] = std::move(decoder);
+ break;
+ }
+ case Encoding::RLE_DICTIONARY:
+ throw ParquetException("Dictionary page must be before data page.");
+
+ case Encoding::DELTA_BINARY_PACKED:
+ case Encoding::DELTA_LENGTH_BYTE_ARRAY:
+ case Encoding::DELTA_BYTE_ARRAY:
+ ParquetException::NYI("Unsupported encoding");
+
+ default:
+ throw ParquetException("Unknown encoding type.");
+ }
+ }
+ current_encoding_ = encoding;
+ current_decoder_->SetData(static_cast<int>(num_buffered_values_), buffer,
+ static_cast<int>(data_size));
+ }
+
+ const ColumnDescriptor* descr_;
+ const int16_t max_def_level_;
+ const int16_t max_rep_level_;
+
+ std::unique_ptr<PageReader> pager_;
+ std::shared_ptr<Page> current_page_;
+
+ // Not set if full schema for this field has no optional or repeated elements
+ LevelDecoder definition_level_decoder_;
+
+ // Not set for flat schemas.
+ LevelDecoder repetition_level_decoder_;
+
+ // The total number of values stored in the data page. This is the maximum of
+ // the number of encoded definition levels or encoded values. For
+ // non-repeated, required columns, this is equal to the number of encoded
+ // values. For repeated or optional values, there may be fewer data values
+ // than levels, and this tells you how many encoded levels there are in that
+ // case.
+ int64_t num_buffered_values_;
+
+ // The number of values from the current data page that have been decoded
+ // into memory
+ int64_t num_decoded_values_;
+
+ ::arrow::MemoryPool* pool_;
+
+ using DecoderType = TypedDecoder<DType>;
+ DecoderType* current_decoder_;
+ Encoding::type current_encoding_;
+
+ /// Flag to signal when a new dictionary has been set, for the benefit of
+ /// DictionaryRecordReader
+ bool new_dictionary_;
+
+ // The exposed encoding
+ ExposedEncoding exposed_encoding_ = ExposedEncoding::NO_ENCODING;
+
+ // Map of encoding type to the respective decoder object. For example, a
+ // column chunk's data pages may include both dictionary-encoded and
+ // plain-encoded data.
+ std::unordered_map<int, std::unique_ptr<DecoderType>> decoders_;
+
+ void ConsumeBufferedValues(int64_t num_values) { num_decoded_values_ += num_values; }
+};
+
+// ----------------------------------------------------------------------
+// TypedColumnReader implementations
+
+template <typename DType>
+class TypedColumnReaderImpl : public TypedColumnReader<DType>,
+ public ColumnReaderImplBase<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ TypedColumnReaderImpl(const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
+ ::arrow::MemoryPool* pool)
+ : ColumnReaderImplBase<DType>(descr, pool) {
+ this->pager_ = std::move(pager);
+ }
+
+ bool HasNext() override { return this->HasNextInternal(); }
+
+ int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ T* values, int64_t* values_read) override;
+
+ int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ T* values, uint8_t* valid_bits, int64_t valid_bits_offset,
+ int64_t* levels_read, int64_t* values_read,
+ int64_t* null_count) override;
+
+ int64_t Skip(int64_t num_rows_to_skip) override;
+
+ Type::type type() const override { return this->descr_->physical_type(); }
+
+ const ColumnDescriptor* descr() const override { return this->descr_; }
+
+ ExposedEncoding GetExposedEncoding() override { return this->exposed_encoding_; }
+
+ int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, int32_t* indices,
+ int64_t* indices_read, const T** dict,
+ int32_t* dict_len) override;
+
+ protected:
+ void SetExposedEncoding(ExposedEncoding encoding) override {
+ this->exposed_encoding_ = encoding;
+ }
+
+ private:
+ // Read dictionary indices. Similar to ReadValues but decodes data to dictionary indices.
+ // This function is called only by ReadBatchWithDictionary().
+ int64_t ReadDictionaryIndices(int64_t indices_to_read, int32_t* indices) {
+ auto decoder = dynamic_cast<DictDecoder<DType>*>(this->current_decoder_);
+ return decoder->DecodeIndices(static_cast<int>(indices_to_read), indices);
+ }
+
+ // Get dictionary. The dictionary should have been set by SetDict(). The dictionary is
+ // owned by the internal decoder and is destroyed when the reader is destroyed. This
+ // function is called only by ReadBatchWithDictionary() after the dictionary is configured.
+ void GetDictionary(const T** dictionary, int32_t* dictionary_length) {
+ auto decoder = dynamic_cast<DictDecoder<DType>*>(this->current_decoder_);
+ decoder->GetDictionary(dictionary, dictionary_length);
+ }
+
+ // Read definition and repetition levels. Also return the number of definition levels
+ // and number of values to read. This function is called before reading values.
+ void ReadLevels(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ int64_t* num_def_levels, int64_t* values_to_read) {
+ batch_size =
+ std::min(batch_size, this->num_buffered_values_ - this->num_decoded_values_);
+
+ // If the field is required and non-repeated, there are no definition levels
+ if (this->max_def_level_ > 0 && def_levels != nullptr) {
+ *num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
+ // TODO(wesm): this tallying of values-to-decode can be performed with better
+ // cache-efficiency if fused with the level decoding.
+ for (int64_t i = 0; i < *num_def_levels; ++i) {
+ if (def_levels[i] == this->max_def_level_) {
+ ++(*values_to_read);
+ }
+ }
+ } else {
+ // Required field, read all values
+ *values_to_read = batch_size;
+ }
+
+ // Not present for non-repeated fields
+ if (this->max_rep_level_ > 0 && rep_levels != nullptr) {
+ int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
+ if (def_levels != nullptr && *num_def_levels != num_rep_levels) {
+ throw ParquetException("Number of decoded rep / def levels did not match");
+ }
+ }
+ }
+};
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::ReadBatchWithDictionary(
+ int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, int32_t* indices,
+ int64_t* indices_read, const T** dict, int32_t* dict_len) {
+ bool has_dict_output = dict != nullptr && dict_len != nullptr;
+ // Similar logic to ReadValues for fetching pages.
+ if (!HasNext()) {
+ *indices_read = 0;
+ if (has_dict_output) {
+ *dict = nullptr;
+ *dict_len = 0;
+ }
+ return 0;
+ }
+
+ // Verify the current data page is dictionary encoded.
+ if (this->current_encoding_ != Encoding::RLE_DICTIONARY) {
+ std::stringstream ss;
+ ss << "Data page is not dictionary encoded. Encoding: "
+ << EncodingToString(this->current_encoding_);
+ throw ParquetException(ss.str());
+ }
+
+ // Get dictionary pointer and length.
+ if (has_dict_output) {
+ GetDictionary(dict, dict_len);
+ }
+
+ // Similar logic to ReadValues for reading def levels and rep levels.
+ int64_t num_def_levels = 0;
+ int64_t indices_to_read = 0;
+ ReadLevels(batch_size, def_levels, rep_levels, &num_def_levels, &indices_to_read);
+
+ // Read dictionary indices.
+ *indices_read = ReadDictionaryIndices(indices_to_read, indices);
+ int64_t total_indices = std::max(num_def_levels, *indices_read);
+ this->ConsumeBufferedValues(total_indices);
+
+ return total_indices;
+}
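+
+// Usage sketch (illustrative only): reading indices plus the shared
+// dictionary from a BYTE_ARRAY reader whose GetExposedEncoding() returned
+// ExposedEncoding::DICTIONARY:
+//
+//   std::vector<int32_t> indices(1024);
+//   int64_t indices_read = 0;
+//   const ByteArray* dict = nullptr;
+//   int32_t dict_len = 0;
+//   reader->ReadBatchWithDictionary(1024, /*def_levels=*/nullptr,
+//                                   /*rep_levels=*/nullptr, indices.data(),
+//                                   &indices_read, &dict, &dict_len);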
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::ReadBatch(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, T* values,
+ int64_t* values_read) {
+ // HasNext invokes ReadNewPage
+ if (!HasNext()) {
+ *values_read = 0;
+ return 0;
+ }
+
+ // TODO(wesm): keep reading data pages until batch_size is reached, or the
+ // row group is finished
+ int64_t num_def_levels = 0;
+ int64_t values_to_read = 0;
+ ReadLevels(batch_size, def_levels, rep_levels, &num_def_levels, &values_to_read);
+
+ *values_read = this->ReadValues(values_to_read, values);
+ int64_t total_values = std::max(num_def_levels, *values_read);
+ this->ConsumeBufferedValues(total_values);
+
+ return total_values;
+}
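+
+// Usage sketch (illustrative only): draining a column, assuming `reader`
+// is a TypedColumnReader<Int32Type>:
+//
+//   std::vector<int16_t> defs(1024), reps(1024);
+//   std::vector<int32_t> values(1024);
+//   int64_t values_read = 0;
+//   while (reader->HasNext()) {
+//     reader->ReadBatch(1024, defs.data(), reps.data(), values.data(),
+//                       &values_read);
+//     // values[0, values_read) now holds the decoded non-null values
+//   }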
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::ReadBatchSpaced(
+ int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, T* values,
+ uint8_t* valid_bits, int64_t valid_bits_offset, int64_t* levels_read,
+ int64_t* values_read, int64_t* null_count_out) {
+ // HasNext invokes ReadNewPage
+ if (!HasNext()) {
+ *levels_read = 0;
+ *values_read = 0;
+ *null_count_out = 0;
+ return 0;
+ }
+
+ int64_t total_values;
+ // TODO(wesm): keep reading data pages until batch_size is reached, or the
+ // row group is finished
+ batch_size =
+ std::min(batch_size, this->num_buffered_values_ - this->num_decoded_values_);
+
+ // If the field is required and non-repeated, there are no definition levels
+ if (this->max_def_level_ > 0) {
+ int64_t num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
+
+ // Not present for non-repeated fields
+ if (this->max_rep_level_ > 0) {
+ int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
+ if (num_def_levels != num_rep_levels) {
+ throw ParquetException("Number of decoded rep / def levels did not match");
+ }
+ }
+
+ const bool has_spaced_values = HasSpacedValues(this->descr_);
+ int64_t null_count = 0;
+ if (!has_spaced_values) {
+ int values_to_read = 0;
+ for (int64_t i = 0; i < num_def_levels; ++i) {
+ if (def_levels[i] == this->max_def_level_) {
+ ++values_to_read;
+ }
+ }
+ total_values = this->ReadValues(values_to_read, values);
+ ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset,
+ /*length=*/total_values,
+ /*bits_are_set=*/true);
+ *values_read = total_values;
+ } else {
+ internal::LevelInfo info;
+ info.repeated_ancestor_def_level = this->max_def_level_ - 1;
+ info.def_level = this->max_def_level_;
+ info.rep_level = this->max_rep_level_;
+ internal::ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = num_def_levels;
+ validity_io.valid_bits = valid_bits;
+ validity_io.valid_bits_offset = valid_bits_offset;
+ validity_io.null_count = null_count;
+ validity_io.values_read = *values_read;
+
+ internal::DefLevelsToBitmap(def_levels, num_def_levels, info, &validity_io);
+ null_count = validity_io.null_count;
+ *values_read = validity_io.values_read;
+
+ total_values =
+ this->ReadValuesSpaced(*values_read, values, static_cast<int>(null_count),
+ valid_bits, valid_bits_offset);
+ }
+ *levels_read = num_def_levels;
+ *null_count_out = null_count;
+
+ } else {
+ // Required field, read all values
+ total_values = this->ReadValues(batch_size, values);
+ ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset,
+ /*length=*/total_values,
+ /*bits_are_set=*/true);
+ *null_count_out = 0;
+ *values_read = total_values;
+ *levels_read = total_values;
+ }
+
+ this->ConsumeBufferedValues(*levels_read);
+ return total_values;
+}
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::Skip(int64_t num_rows_to_skip) {
+ int64_t rows_to_skip = num_rows_to_skip;
+ while (HasNext() && rows_to_skip > 0) {
+ // If the number of rows to skip is more than the number of undecoded values, skip the
+ // Page.
+ if (rows_to_skip > (this->num_buffered_values_ - this->num_decoded_values_)) {
+ rows_to_skip -= this->num_buffered_values_ - this->num_decoded_values_;
+ this->num_decoded_values_ = this->num_buffered_values_;
+ } else {
+ // We need to read this Page
+ // Jump to the right offset in the Page
+ int64_t batch_size = 1024; // use small batches to limit ReadBatch's memory footprint
+ int64_t values_read = 0;
+
+ // This will be enough scratch space to accommodate 16-bit levels or any
+ // value type
+ std::shared_ptr<ResizableBuffer> scratch = AllocateBuffer(
+ this->pool_, batch_size * type_traits<DType::type_num>::value_byte_size);
+
+ do {
+ batch_size = std::min(batch_size, rows_to_skip);
+ values_read =
+ ReadBatch(static_cast<int>(batch_size),
+ reinterpret_cast<int16_t*>(scratch->mutable_data()),
+ reinterpret_cast<int16_t*>(scratch->mutable_data()),
+ reinterpret_cast<T*>(scratch->mutable_data()), &values_read);
+ rows_to_skip -= values_read;
+ } while (values_read > 0 && rows_to_skip > 0);
+ }
+ }
+ return num_rows_to_skip - rows_to_skip;
+}
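+
+// Note: the single scratch buffer above is deliberately passed as the def
+// level, rep level and value output of one ReadBatch call; this is safe
+// only because the decoded results are discarded immediately.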
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// Dynamic column reader constructor
+
+std::shared_ptr<ColumnReader> ColumnReader::Make(const ColumnDescriptor* descr,
+ std::unique_ptr<PageReader> pager,
+ MemoryPool* pool) {
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedColumnReaderImpl<BooleanType>>(descr, std::move(pager),
+ pool);
+ case Type::INT32:
+ return std::make_shared<TypedColumnReaderImpl<Int32Type>>(descr, std::move(pager),
+ pool);
+ case Type::INT64:
+ return std::make_shared<TypedColumnReaderImpl<Int64Type>>(descr, std::move(pager),
+ pool);
+ case Type::INT96:
+ return std::make_shared<TypedColumnReaderImpl<Int96Type>>(descr, std::move(pager),
+ pool);
+ case Type::FLOAT:
+ return std::make_shared<TypedColumnReaderImpl<FloatType>>(descr, std::move(pager),
+ pool);
+ case Type::DOUBLE:
+ return std::make_shared<TypedColumnReaderImpl<DoubleType>>(descr, std::move(pager),
+ pool);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedColumnReaderImpl<ByteArrayType>>(
+ descr, std::move(pager), pool);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedColumnReaderImpl<FLBAType>>(descr, std::move(pager),
+ pool);
+ default:
+ ParquetException::NYI("type reader not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return std::shared_ptr<ColumnReader>(nullptr);
+}
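+
+// Usage sketch (illustrative only): the returned base pointer is normally
+// downcast to the typed interface matching descr->physical_type():
+//
+//   auto reader = ColumnReader::Make(descr, std::move(pager));
+//   auto typed =
+//       std::static_pointer_cast<TypedColumnReader<Int32Type>>(reader);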
+
+// ----------------------------------------------------------------------
+// RecordReader
+
+namespace internal {
+namespace {
+
+// The minimum number of repetition/definition levels to decode at a time, for
+// better vectorized performance when doing many smaller record reads
+constexpr int64_t kMinLevelBatchSize = 1024;
+
+template <typename DType>
+class TypedRecordReader : public ColumnReaderImplBase<DType>,
+ virtual public RecordReader {
+ public:
+ using T = typename DType::c_type;
+ using BASE = ColumnReaderImplBase<DType>;
+ TypedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, MemoryPool* pool)
+ : BASE(descr, pool) {
+ leaf_info_ = leaf_info;
+ nullable_values_ = leaf_info.HasNullableValues();
+ at_record_start_ = true;
+ records_read_ = 0;
+ values_written_ = 0;
+ values_capacity_ = 0;
+ null_count_ = 0;
+ levels_written_ = 0;
+ levels_position_ = 0;
+ levels_capacity_ = 0;
+ uses_values_ = !(descr->physical_type() == Type::BYTE_ARRAY);
+
+ if (uses_values_) {
+ values_ = AllocateBuffer(pool);
+ }
+ valid_bits_ = AllocateBuffer(pool);
+ def_levels_ = AllocateBuffer(pool);
+ rep_levels_ = AllocateBuffer(pool);
+ Reset();
+ }
+
+ int64_t available_values_current_page() const {
+ return this->num_buffered_values_ - this->num_decoded_values_;
+ }
+
+ // Compute the values capacity in bytes for the given number of elements
+ int64_t bytes_for_values(int64_t nitems) const {
+ int64_t type_size = GetTypeByteSize(this->descr_->physical_type());
+ int64_t bytes_for_values = -1;
+ if (MultiplyWithOverflow(nitems, type_size, &bytes_for_values)) {
+ throw ParquetException("Total size of items too large");
+ }
+ return bytes_for_values;
+ }
+
+ int64_t ReadRecords(int64_t num_records) override {
+ // Delimit records, then read values at the end
+ int64_t records_read = 0;
+
+ if (levels_position_ < levels_written_) {
+ records_read += ReadRecordData(num_records);
+ }
+
+ int64_t level_batch_size = std::max(kMinLevelBatchSize, num_records);
+
+ // If we are in the middle of a record, we continue until we reach the
+ // desired number of records, or until we hit the end of the current
+ // record once enough records have been found
+ while (!at_record_start_ || records_read < num_records) {
+ // Is there more data to read in this row group?
+ if (!this->HasNextInternal()) {
+ if (!at_record_start_) {
+ // We ended the row group while inside a record that we haven't seen
+ // the end of yet. So increment the record count for the last record in
+ // the row group
+ ++records_read;
+ at_record_start_ = true;
+ }
+ break;
+ }
+
+ /// We perform multiple batch reads until we either exhaust the row group
+ /// or observe the desired number of records
+ int64_t batch_size = std::min(level_batch_size, available_values_current_page());
+
+ // No more data in column
+ if (batch_size == 0) {
+ break;
+ }
+
+ if (this->max_def_level_ > 0) {
+ ReserveLevels(batch_size);
+
+ int16_t* def_levels = this->def_levels() + levels_written_;
+ int16_t* rep_levels = this->rep_levels() + levels_written_;
+
+ // Not present for non-repeated fields
+ int64_t levels_read = 0;
+ if (this->max_rep_level_ > 0) {
+ levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
+ if (this->ReadRepetitionLevels(batch_size, rep_levels) != levels_read) {
+ throw ParquetException("Number of decoded rep / def levels did not match");
+ }
+ } else if (this->max_def_level_ > 0) {
+ levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
+ }
+
+ // Exhausted column chunk
+ if (levels_read == 0) {
+ break;
+ }
+
+ levels_written_ += levels_read;
+ records_read += ReadRecordData(num_records - records_read);
+ } else {
+ // No repetition or definition levels
+ batch_size = std::min(num_records - records_read, batch_size);
+ records_read += ReadRecordData(batch_size);
+ }
+ }
+
+ return records_read;
+ }
+
+ // We may appear to have exhausted a column chunk when in fact we are
+ // still in the middle of processing the last batch
+ bool has_values_to_process() const { return levels_position_ < levels_written_; }
+
+ std::shared_ptr<ResizableBuffer> ReleaseValues() override {
+ if (uses_values_) {
+ auto result = values_;
+ PARQUET_THROW_NOT_OK(result->Resize(bytes_for_values(values_written_), true));
+ values_ = AllocateBuffer(this->pool_);
+ values_capacity_ = 0;
+ return result;
+ } else {
+ return nullptr;
+ }
+ }
+
+ std::shared_ptr<ResizableBuffer> ReleaseIsValid() override {
+ if (leaf_info_.HasNullableValues()) {
+ auto result = valid_bits_;
+ PARQUET_THROW_NOT_OK(result->Resize(BitUtil::BytesForBits(values_written_), true));
+ valid_bits_ = AllocateBuffer(this->pool_);
+ return result;
+ } else {
+ return nullptr;
+ }
+ }
+
+ // Process written repetition/definition levels to reach the end of
+ // records. Process no more levels than necessary to delimit the indicated
+ // number of logical records. Updates internal state of RecordReader
+ //
+ // \return Number of records delimited
+ int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) {
+ int64_t values_to_read = 0;
+ int64_t records_read = 0;
+
+ const int16_t* def_levels = this->def_levels() + levels_position_;
+ const int16_t* rep_levels = this->rep_levels() + levels_position_;
+
+ DCHECK_GT(this->max_rep_level_, 0);
+
+ // Count logical records and number of values to read
+ while (levels_position_ < levels_written_) {
+ const int16_t rep_level = *rep_levels++;
+ if (rep_level == 0) {
+ // If at_record_start_ is true, we are seeing the start of a record
+ // for the second time, such as after repeated calls to
+ // DelimitRecords. In this case we must continue until we find
+ // another record start or exhaust the ColumnChunk
+ if (!at_record_start_) {
+ // We've reached the end of a record; increment the record count.
+ ++records_read;
+ if (records_read == num_records) {
+ // We've found the number of records we were looking for. Set
+ // at_record_start_ to true and break
+ at_record_start_ = true;
+ break;
+ }
+ }
+ }
+ // We have decided to consume the level at this position; therefore we
+ // must advance until we find another record boundary
+ at_record_start_ = false;
+
+ const int16_t def_level = *def_levels++;
+ if (def_level == this->max_def_level_) {
+ ++values_to_read;
+ }
+ ++levels_position_;
+ }
+ *values_seen = values_to_read;
+ return records_read;
+ }
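+
+ // Worked example: with rep_levels = [0, 1, 1, 0, 1] and num_records = 1,
+ // the loop above consumes positions 0..2, stops at the second 0 (the
+ // start of the next record) without consuming it, sets at_record_start_
+ // back to true and returns 1; *values_seen counts the consumed positions
+ // whose def level equals max_def_level_.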
+
+ void Reserve(int64_t capacity) override {
+ ReserveLevels(capacity);
+ ReserveValues(capacity);
+ }
+
+ int64_t UpdateCapacity(int64_t capacity, int64_t size, int64_t extra_size) {
+ if (extra_size < 0) {
+ throw ParquetException("Negative size (corrupt file?)");
+ }
+ int64_t target_size = -1;
+ if (AddWithOverflow(size, extra_size, &target_size)) {
+ throw ParquetException("Allocation size too large (corrupt file?)");
+ }
+ if (target_size >= (1LL << 62)) {
+ throw ParquetException("Allocation size too large (corrupt file?)");
+ }
+ if (capacity >= target_size) {
+ return capacity;
+ }
+ return BitUtil::NextPower2(target_size);
+ }
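+
+ // Example: UpdateCapacity(/*capacity=*/1024, /*size=*/1000,
+ // /*extra_size=*/100) needs 1100 slots, which exceeds the current
+ // capacity, so it returns BitUtil::NextPower2(1100) == 2048.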
+
+ void ReserveLevels(int64_t extra_levels) {
+ if (this->max_def_level_ > 0) {
+ const int64_t new_levels_capacity =
+ UpdateCapacity(levels_capacity_, levels_written_, extra_levels);
+ if (new_levels_capacity > levels_capacity_) {
+ constexpr auto kItemSize = static_cast<int64_t>(sizeof(int16_t));
+ int64_t capacity_in_bytes = -1;
+ if (MultiplyWithOverflow(new_levels_capacity, kItemSize, &capacity_in_bytes)) {
+ throw ParquetException("Allocation size too large (corrupt file?)");
+ }
+ PARQUET_THROW_NOT_OK(def_levels_->Resize(capacity_in_bytes, false));
+ if (this->max_rep_level_ > 0) {
+ PARQUET_THROW_NOT_OK(rep_levels_->Resize(capacity_in_bytes, false));
+ }
+ levels_capacity_ = new_levels_capacity;
+ }
+ }
+ }
+
+ void ReserveValues(int64_t extra_values) {
+ const int64_t new_values_capacity =
+ UpdateCapacity(values_capacity_, values_written_, extra_values);
+ if (new_values_capacity > values_capacity_) {
+ // XXX(wesm): A hack to avoid memory allocation when reading directly
+ // into builder classes
+ if (uses_values_) {
+ PARQUET_THROW_NOT_OK(
+ values_->Resize(bytes_for_values(new_values_capacity), false));
+ }
+ values_capacity_ = new_values_capacity;
+ }
+ if (leaf_info_.HasNullableValues()) {
+ int64_t valid_bytes_new = BitUtil::BytesForBits(values_capacity_);
+ if (valid_bits_->size() < valid_bytes_new) {
+ int64_t valid_bytes_old = BitUtil::BytesForBits(values_written_);
+ PARQUET_THROW_NOT_OK(valid_bits_->Resize(valid_bytes_new, false));
+
+ // Avoid valgrind warnings
+ memset(valid_bits_->mutable_data() + valid_bytes_old, 0,
+ valid_bytes_new - valid_bytes_old);
+ }
+ }
+ }
+
+ void Reset() override {
+ ResetValues();
+
+ if (levels_written_ > 0) {
+ const int64_t levels_remaining = levels_written_ - levels_position_;
+ // Shift remaining levels to beginning of buffer and trim to only the number
+ // of decoded levels remaining
+ int16_t* def_data = def_levels();
+ int16_t* rep_data = rep_levels();
+
+ std::copy(def_data + levels_position_, def_data + levels_written_, def_data);
+ PARQUET_THROW_NOT_OK(
+ def_levels_->Resize(levels_remaining * sizeof(int16_t), false));
+
+ if (this->max_rep_level_ > 0) {
+ std::copy(rep_data + levels_position_, rep_data + levels_written_, rep_data);
+ PARQUET_THROW_NOT_OK(
+ rep_levels_->Resize(levels_remaining * sizeof(int16_t), false));
+ }
+
+ levels_written_ -= levels_position_;
+ levels_position_ = 0;
+ levels_capacity_ = levels_remaining;
+ }
+
+ records_read_ = 0;
+
+ // Call Finish on the binary builders to reset them
+ }
+
+ void SetPageReader(std::unique_ptr<PageReader> reader) override {
+ at_record_start_ = true;
+ this->pager_ = std::move(reader);
+ ResetDecoders();
+ }
+
+ bool HasMoreData() const override { return this->pager_ != nullptr; }
+
+ // Dictionary decoders must be reset when advancing row groups
+ void ResetDecoders() { this->decoders_.clear(); }
+
+ virtual void ReadValuesSpaced(int64_t values_with_nulls, int64_t null_count) {
+ uint8_t* valid_bits = valid_bits_->mutable_data();
+ const int64_t valid_bits_offset = values_written_;
+
+ int64_t num_decoded = this->current_decoder_->DecodeSpaced(
+ ValuesHead<T>(), static_cast<int>(values_with_nulls),
+ static_cast<int>(null_count), valid_bits, valid_bits_offset);
+ DCHECK_EQ(num_decoded, values_with_nulls);
+ }
+
+ virtual void ReadValuesDense(int64_t values_to_read) {
+ int64_t num_decoded =
+ this->current_decoder_->Decode(ValuesHead<T>(), static_cast<int>(values_to_read));
+ DCHECK_EQ(num_decoded, values_to_read);
+ }
+
+ // Return number of logical records read
+ int64_t ReadRecordData(int64_t num_records) {
+ // Conservative upper bound
+ const int64_t possible_num_values =
+ std::max(num_records, levels_written_ - levels_position_);
+ ReserveValues(possible_num_values);
+
+ const int64_t start_levels_position = levels_position_;
+
+ int64_t values_to_read = 0;
+ int64_t records_read = 0;
+ if (this->max_rep_level_ > 0) {
+ records_read = DelimitRecords(num_records, &values_to_read);
+ } else if (this->max_def_level_ > 0) {
+ // No repetition levels, skip delimiting logic. Each level represents a
+ // null or not null entry
+ records_read = std::min(levels_written_ - levels_position_, num_records);
+
+ // This is advanced by DelimitRecords, which we skipped
+ levels_position_ += records_read;
+ } else {
+ records_read = values_to_read = num_records;
+ }
+
+ int64_t null_count = 0;
+ if (leaf_info_.HasNullableValues()) {
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = levels_position_ - start_levels_position;
+ validity_io.valid_bits = valid_bits_->mutable_data();
+ validity_io.valid_bits_offset = values_written_;
+
+ DefLevelsToBitmap(def_levels() + start_levels_position,
+ levels_position_ - start_levels_position, leaf_info_,
+ &validity_io);
+ values_to_read = validity_io.values_read - validity_io.null_count;
+ null_count = validity_io.null_count;
+ DCHECK_GE(values_to_read, 0);
+ ReadValuesSpaced(validity_io.values_read, null_count);
+ } else {
+ DCHECK_GE(values_to_read, 0);
+ ReadValuesDense(values_to_read);
+ }
+ if (this->leaf_info_.def_level > 0) {
+ // Optional, repeated, or some mix thereof
+ this->ConsumeBufferedValues(levels_position_ - start_levels_position);
+ } else {
+ // Flat, non-repeated
+ this->ConsumeBufferedValues(values_to_read);
+ }
+ // Total values, including null spaces, if any
+ values_written_ += values_to_read + null_count;
+ null_count_ += null_count;
+
+ return records_read;
+ }
+
+ void DebugPrintState() override {
+ const int16_t* def_levels = this->def_levels();
+ const int16_t* rep_levels = this->rep_levels();
+ const int64_t total_levels_read = levels_position_;
+
+ const T* vals = reinterpret_cast<const T*>(this->values());
+
+ std::cout << "def levels: ";
+ for (int64_t i = 0; i < total_levels_read; ++i) {
+ std::cout << def_levels[i] << " ";
+ }
+ std::cout << std::endl;
+
+ std::cout << "rep levels: ";
+ for (int64_t i = 0; i < total_levels_read; ++i) {
+ std::cout << rep_levels[i] << " ";
+ }
+ std::cout << std::endl;
+
+ std::cout << "values: ";
+ for (int64_t i = 0; i < this->values_written(); ++i) {
+ std::cout << vals[i] << " ";
+ }
+ std::cout << std::endl;
+ }
+
+ void ResetValues() {
+ if (values_written_ > 0) {
+ // Resize to 0, but do not shrink to fit
+ if (uses_values_) {
+ PARQUET_THROW_NOT_OK(values_->Resize(0, false));
+ }
+ PARQUET_THROW_NOT_OK(valid_bits_->Resize(0, false));
+ values_written_ = 0;
+ values_capacity_ = 0;
+ null_count_ = 0;
+ }
+ }
+
+ protected:
+ template <typename T>
+ T* ValuesHead() {
+ return reinterpret_cast<T*>(values_->mutable_data()) + values_written_;
+ }
+ LevelInfo leaf_info_;
+};
+
+class FLBARecordReader : public TypedRecordReader<FLBAType>,
+ virtual public BinaryRecordReader {
+ public:
+ FLBARecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool)
+ : TypedRecordReader<FLBAType>(descr, leaf_info, pool), builder_(nullptr) {
+ DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY);
+ int byte_width = descr_->type_length();
+ std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width);
+ builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, this->pool_));
+ }
+
+ ::arrow::ArrayVector GetBuilderChunks() override {
+ std::shared_ptr<::arrow::Array> chunk;
+ PARQUET_THROW_NOT_OK(builder_->Finish(&chunk));
+ return ::arrow::ArrayVector({chunk});
+ }
+
+ void ReadValuesDense(int64_t values_to_read) override {
+ auto values = ValuesHead<FLBA>();
+ int64_t num_decoded =
+ this->current_decoder_->Decode(values, static_cast<int>(values_to_read));
+ DCHECK_EQ(num_decoded, values_to_read);
+
+ for (int64_t i = 0; i < num_decoded; i++) {
+ PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
+ }
+ ResetValues();
+ }
+
+ void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
+ uint8_t* valid_bits = valid_bits_->mutable_data();
+ const int64_t valid_bits_offset = values_written_;
+ auto values = ValuesHead<FLBA>();
+
+ int64_t num_decoded = this->current_decoder_->DecodeSpaced(
+ values, static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits, valid_bits_offset);
+ DCHECK_EQ(num_decoded, values_to_read);
+
+ for (int64_t i = 0; i < num_decoded; i++) {
+ if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
+ PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
+ } else {
+ PARQUET_THROW_NOT_OK(builder_->AppendNull());
+ }
+ }
+ ResetValues();
+ }
+
+ private:
+ std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
+};
+
+class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
+ virtual public BinaryRecordReader {
+ public:
+ ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool)
+ : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool) {
+ DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
+ accumulator_.builder.reset(new ::arrow::BinaryBuilder(pool));
+ }
+
+ ::arrow::ArrayVector GetBuilderChunks() override {
+ ::arrow::ArrayVector result = accumulator_.chunks;
+ if (result.size() == 0 || accumulator_.builder->length() > 0) {
+ std::shared_ptr<::arrow::Array> last_chunk;
+ PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk));
+ result.push_back(std::move(last_chunk));
+ }
+ accumulator_.chunks = {};
+ return result;
+ }
+
+ void ReadValuesDense(int64_t values_to_read) override {
+ int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull(
+ static_cast<int>(values_to_read), &accumulator_);
+ DCHECK_EQ(num_decoded, values_to_read);
+ ResetValues();
+ }
+
+ void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
+ int64_t num_decoded = this->current_decoder_->DecodeArrow(
+ static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits_->mutable_data(), values_written_, &accumulator_);
+ DCHECK_EQ(num_decoded, values_to_read - null_count);
+ ResetValues();
+ }
+
+ private:
+ // Helper data structure for accumulating builder chunks
+ typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
+};
+
+class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
+ virtual public DictionaryRecordReader {
+ public:
+ ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool)
+ : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool), builder_(pool) {
+ this->read_dictionary_ = true;
+ }
+
+ std::shared_ptr<::arrow::ChunkedArray> GetResult() override {
+ FlushBuilder();
+ std::vector<std::shared_ptr<::arrow::Array>> result;
+ std::swap(result, result_chunks_);
+ return std::make_shared<::arrow::ChunkedArray>(std::move(result), builder_.type());
+ }
+
+ void FlushBuilder() {
+ if (builder_.length() > 0) {
+ std::shared_ptr<::arrow::Array> chunk;
+ PARQUET_THROW_NOT_OK(builder_.Finish(&chunk));
+ result_chunks_.emplace_back(std::move(chunk));
+
+ // Also clears the dictionary memo table
+ builder_.Reset();
+ }
+ }
+
+ void MaybeWriteNewDictionary() {
+ if (this->new_dictionary_) {
+ /// If there is a new dictionary, we may need to flush the builder, then
+ /// insert the new dictionary values
+ FlushBuilder();
+ builder_.ResetFull();
+ auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
+ decoder->InsertDictionary(&builder_);
+ this->new_dictionary_ = false;
+ }
+ }
+
+ void ReadValuesDense(int64_t values_to_read) override {
+ int64_t num_decoded = 0;
+ if (current_encoding_ == Encoding::RLE_DICTIONARY) {
+ MaybeWriteNewDictionary();
+ auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
+ num_decoded = decoder->DecodeIndices(static_cast<int>(values_to_read), &builder_);
+ } else {
+ num_decoded = this->current_decoder_->DecodeArrowNonNull(
+ static_cast<int>(values_to_read), &builder_);
+
+ /// Flush values since they have been copied into the builder
+ ResetValues();
+ }
+ DCHECK_EQ(num_decoded, values_to_read);
+ }
+
+ void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
+ int64_t num_decoded = 0;
+ if (current_encoding_ == Encoding::RLE_DICTIONARY) {
+ MaybeWriteNewDictionary();
+ auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
+ num_decoded = decoder->DecodeIndicesSpaced(
+ static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits_->mutable_data(), values_written_, &builder_);
+ } else {
+ num_decoded = this->current_decoder_->DecodeArrow(
+ static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits_->mutable_data(), values_written_, &builder_);
+
+ /// Flush values since they have been copied into the builder
+ ResetValues();
+ }
+ DCHECK_EQ(num_decoded, values_to_read - null_count);
+ }
+
+ private:
+ using BinaryDictDecoder = DictDecoder<ByteArrayType>;
+
+ ::arrow::BinaryDictionary32Builder builder_;
+ std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
+};
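+
+// Note: because MaybeWriteNewDictionary() flushes the builder whenever a
+// new dictionary page is seen, each chunk returned by GetResult() above
+// references a single dictionary.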
+
+// TODO(wesm): Implement these to some satisfaction
+template <>
+void TypedRecordReader<Int96Type>::DebugPrintState() {}
+
+template <>
+void TypedRecordReader<ByteArrayType>::DebugPrintState() {}
+
+template <>
+void TypedRecordReader<FLBAType>::DebugPrintState() {}
+
+std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const ColumnDescriptor* descr,
+ LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool,
+ bool read_dictionary) {
+ if (read_dictionary) {
+ return std::make_shared<ByteArrayDictionaryRecordReader>(descr, leaf_info, pool);
+ } else {
+ return std::make_shared<ByteArrayChunkedRecordReader>(descr, leaf_info, pool);
+ }
+}
+
+} // namespace
+
+std::shared_ptr<RecordReader> RecordReader::Make(const ColumnDescriptor* descr,
+ LevelInfo leaf_info, MemoryPool* pool,
+ const bool read_dictionary) {
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedRecordReader<BooleanType>>(descr, leaf_info, pool);
+ case Type::INT32:
+ return std::make_shared<TypedRecordReader<Int32Type>>(descr, leaf_info, pool);
+ case Type::INT64:
+ return std::make_shared<TypedRecordReader<Int64Type>>(descr, leaf_info, pool);
+ case Type::INT96:
+ return std::make_shared<TypedRecordReader<Int96Type>>(descr, leaf_info, pool);
+ case Type::FLOAT:
+ return std::make_shared<TypedRecordReader<FloatType>>(descr, leaf_info, pool);
+ case Type::DOUBLE:
+ return std::make_shared<TypedRecordReader<DoubleType>>(descr, leaf_info, pool);
+ case Type::BYTE_ARRAY:
+ return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<FLBARecordReader>(descr, leaf_info, pool);
+ default: {
+ // PARQUET-1481: This can occur if the file is corrupt
+ std::stringstream ss;
+ ss << "Invalid physical column type: " << static_cast<int>(descr->physical_type());
+ throw ParquetException(ss.str());
+ }
+ }
+ // Unreachable code, but suppress compiler warning
+ return nullptr;
+}
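+
+// Usage sketch (illustrative only): a RecordReader is driven one row group
+// at a time, with the page reader obtained from
+// RowGroupReader::GetColumnPageReader:
+//
+//   auto rr = internal::RecordReader::Make(descr, leaf_info);
+//   rr->SetPageReader(row_group_reader->GetColumnPageReader(column_index));
+//   while (rr->ReadRecords(/*num_records=*/1000) > 0) {
+//     // consume rr->values(), rr->def_levels(), ..., then rr->Reset()
+//   }
+//   auto values = rr->ReleaseValues();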
+
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h
new file mode 100644
index 00000000000..8c48e4d7843
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "parquet/exception.h"
+#include "parquet/level_conversion.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+
+namespace BitUtil {
+class BitReader;
+} // namespace BitUtil
+
+namespace util {
+class RleDecoder;
+} // namespace util
+
+} // namespace arrow
+
+namespace parquet {
+
+class Decryptor;
+class Page;
+
+// 16 MB is the default maximum page header size
+static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;
+
+// 16 KB is the default expected page header size
+static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;
+
+class PARQUET_EXPORT LevelDecoder {
+ public:
+ LevelDecoder();
+ ~LevelDecoder();
+
+ // Initialize the LevelDecoder state with new data
+ // and return the number of bytes consumed
+ int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values,
+ const uint8_t* data, int32_t data_size);
+
+ void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values,
+ const uint8_t* data);
+
+ // Decodes a batch of levels into an array and returns the number of levels decoded
+ int Decode(int batch_size, int16_t* levels);
+
+ private:
+ int bit_width_;
+ int num_values_remaining_;
+ Encoding::type encoding_;
+ std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_;
+ std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_;
+ int16_t max_level_;
+};
+
+struct CryptoContext {
+ CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal,
+ std::shared_ptr<Decryptor> meta, std::shared_ptr<Decryptor> data)
+ : start_decrypt_with_dictionary_page(start_with_dictionary_page),
+ row_group_ordinal(rg_ordinal),
+ column_ordinal(col_ordinal),
+ meta_decryptor(std::move(meta)),
+ data_decryptor(std::move(data)) {}
+ CryptoContext() {}
+
+ bool start_decrypt_with_dictionary_page = false;
+ int16_t row_group_ordinal = -1;
+ int16_t column_ordinal = -1;
+ std::shared_ptr<Decryptor> meta_decryptor;
+ std::shared_ptr<Decryptor> data_decryptor;
+};
+
+// Abstract page iterator interface. This way, we can feed column pages to the
+// ColumnReader through whatever mechanism we choose
+class PARQUET_EXPORT PageReader {
+ public:
+ virtual ~PageReader() = default;
+
+ static std::unique_ptr<PageReader> Open(
+ std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
+ Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+ const CryptoContext* ctx = NULLPTR);
+
+ // @returns: shared_ptr<Page>(nullptr) on EOS, std::shared_ptr<Page>
+ // containing new Page otherwise
+ virtual std::shared_ptr<Page> NextPage() = 0;
+
+ virtual void set_max_page_header_size(uint32_t size) = 0;
+};
+
+class PARQUET_EXPORT ColumnReader {
+ public:
+ virtual ~ColumnReader() = default;
+
+ static std::shared_ptr<ColumnReader> Make(
+ const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ // Returns true if there are still values in this column.
+ virtual bool HasNext() = 0;
+
+ virtual Type::type type() const = 0;
+
+ virtual const ColumnDescriptor* descr() const = 0;
+
+ // Get the encoding that can be exposed by this reader. If it returns
+ // dictionary encoding, then ReadBatchWithDictionary can be used to read data.
+ //
+ // \note API EXPERIMENTAL
+ virtual ExposedEncoding GetExposedEncoding() = 0;
+
+ protected:
+ friend class RowGroupReader;
+ // Set the encoding that can be exposed by this reader.
+ //
+ // \note API EXPERIMENTAL
+ virtual void SetExposedEncoding(ExposedEncoding encoding) = 0;
+};
+
+// API to read values from a single column. This is a main client facing API.
+template <typename DType>
+class TypedColumnReader : public ColumnReader {
+ public:
+ typedef typename DType::c_type T;
+
+ // Read a batch of repetition levels, definition levels, and values from the
+ // column.
+ //
+ // Since null values are not stored in the values, the number of values read
+ // may be less than the number of repetition and definition levels. With
+ // nested data this is almost certainly true.
+ //
+ // Set def_levels or rep_levels to nullptr if you want to skip reading them.
+ // This is only safe if you know through some other source that there are no
+ // undefined values.
+ //
+ // To fully exhaust a row group, you must read batches until the number of
+ // values read reaches the number of stored values according to the metadata.
+ //
+ // This API is the same for both V1 and V2 of the DataPage
+ //
+ // @returns: actual number of levels read (see values_read for number of values read)
+ virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ T* values, int64_t* values_read) = 0;
+
+ /// Read a batch of repetition levels, definition levels, and values from the
+ /// column and leave spaces for null entries on the lowest level in the values
+ /// buffer.
+ ///
+ /// In comparison to ReadBatch, the length of the repetition and definition
+ /// levels is the same as the number of values read when
+ /// max_definition_level == 1. When max_definition_level > 1, the repetition
+ /// and definition levels are longer than the values, but the values include
+ /// the null entries with definition_level == (max_definition_level - 1).
+ ///
+ /// To fully exhaust a row group, you must read batches until the number of
+ /// values read reaches the number of stored values according to the metadata.
+ ///
+ /// @param batch_size the number of levels to read
+ /// @param[out] def_levels The Parquet definition levels, output has
+ /// the length levels_read.
+ /// @param[out] rep_levels The Parquet repetition levels, output has
+ /// the length levels_read.
+ /// @param[out] values The values in the lowest nested level including
+ /// spacing for nulls on the lowest levels; output has the length
+ /// values_read.
+ /// @param[out] valid_bits Memory allocated for a bitmap that indicates if
+ /// the row is null or on the maximum definition level. For performance
+ /// reasons the underlying buffer should be able to store 1 bit more than
+ /// required. If this requires an additional byte, this byte is only read
+ /// but never written to.
+ /// @param valid_bits_offset The offset in bits of the valid_bits where the
+ /// first relevant bit resides.
+ /// @param[out] levels_read The number of repetition/definition levels that were read.
+ /// @param[out] values_read The number of values read, this includes all
+ /// non-null entries as well as all null-entries on the lowest level
+ /// (i.e. definition_level == max_definition_level - 1)
+ /// @param[out] null_count The number of nulls on the lowest levels.
+ /// (i.e. (values_read - null_count) is total number of non-null entries)
+ ///
+ /// \deprecated Since 4.0.0
+ ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.")
+ virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, T* values, uint8_t* valid_bits,
+ int64_t valid_bits_offset, int64_t* levels_read,
+ int64_t* values_read, int64_t* null_count) = 0;
+
+ // Skip reading levels
+ // Returns the number of levels skipped
+ virtual int64_t Skip(int64_t num_rows_to_skip) = 0;
+
+ // Read a batch of repetition levels, definition levels, and indices from the
+ // column, and read the dictionary if a dictionary page is encountered while
+ // reading pages. This API is similar to ReadBatch(), with the added ability
+ // to read the dictionary and indices. It is only valid to call this method
+ // when the reader can expose dictionary encoding (i.e., the reader's
+ // GetExposedEncoding() returns DICTIONARY).
+ //
+ // The dictionary is read along with the data page. When there's no data page,
+ // the dictionary won't be returned.
+ //
+ // @param batch_size The batch size to read
+ // @param[out] def_levels The Parquet definition levels.
+ // @param[out] rep_levels The Parquet repetition levels.
+ // @param[out] indices The dictionary indices.
+ // @param[out] indices_read The number of indices read.
+  // @param[out] dict The pointer to the dictionary values. It is set to
+  // nullptr if there's no data page. Each column chunk has only one dictionary
+  // page. The dictionary is owned by the reader, so the caller is responsible
+  // for copying the dictionary values before the reader is destroyed.
+  // @param[out] dict_len The dictionary length. It is set to 0 if there's no
+  // data page.
+  // @returns: actual number of levels read (see indices_read for the number
+  // of indices read)
+ //
+ // \note API EXPERIMENTAL
+ virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, int32_t* indices,
+ int64_t* indices_read, const T** dict,
+ int32_t* dict_len) = 0;
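+
+  // Example (an illustrative sketch, not part of the API; assumes the reader
+  // exposes dictionary encoding, i.e. GetExposedEncoding() returns DICTIONARY,
+  // and that the level and index buffers hold at least 1024 entries):
+  //
+  //   const T* dict = nullptr;
+  //   int32_t dict_len = 0;
+  //   int64_t indices_read = 0;
+  //   int64_t levels_read = reader->ReadBatchWithDictionary(
+  //       1024, def_levels, rep_levels, indices, &indices_read, &dict, &dict_len);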
+};
+
+namespace internal {
+
+/// \brief Stateful column reader that delimits semantic records for both flat
+/// and nested columns
+///
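+/// A typical read loop (an illustrative sketch; `page_reader` is assumed to
+/// be obtained from RowGroupReader::GetColumnPageReader):
+///
+///   auto reader = RecordReader::Make(descr, leaf_info);
+///   reader->SetPageReader(std::move(page_reader));
+///   while (reader->HasMoreData()) {
+///     reader->ReadRecords(/*num_records=*/1024);
+///   }
+///   auto values = reader->ReleaseValues();
+///   auto is_valid = reader->ReleaseIsValid();
+///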
+/// \note API EXPERIMENTAL
+/// \since 1.3.0
+class RecordReader {
+ public:
+ static std::shared_ptr<RecordReader> Make(
+ const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+ const bool read_dictionary = false);
+
+ virtual ~RecordReader() = default;
+
+ /// \brief Attempt to read indicated number of records from column chunk
+ /// \return number of records read
+ virtual int64_t ReadRecords(int64_t num_records) = 0;
+
+ /// \brief Pre-allocate space for data. Results in better flat read performance
+ virtual void Reserve(int64_t num_values) = 0;
+
+ /// \brief Clear consumed values and repetition/definition levels as the
+ /// result of calling ReadRecords
+ virtual void Reset() = 0;
+
+ /// \brief Transfer filled values buffer to caller. A new one will be
+ /// allocated in subsequent ReadRecords calls
+ virtual std::shared_ptr<ResizableBuffer> ReleaseValues() = 0;
+
+ /// \brief Transfer filled validity bitmap buffer to caller. A new one will
+ /// be allocated in subsequent ReadRecords calls
+ virtual std::shared_ptr<ResizableBuffer> ReleaseIsValid() = 0;
+
+ /// \brief Return true if the record reader has more internal data yet to
+ /// process
+ virtual bool HasMoreData() const = 0;
+
+ /// \brief Advance record reader to the next row group
+ /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader
+ virtual void SetPageReader(std::unique_ptr<PageReader> reader) = 0;
+
+ virtual void DebugPrintState() = 0;
+
+ /// \brief Decoded definition levels
+ int16_t* def_levels() const {
+ return reinterpret_cast<int16_t*>(def_levels_->mutable_data());
+ }
+
+ /// \brief Decoded repetition levels
+ int16_t* rep_levels() const {
+ return reinterpret_cast<int16_t*>(rep_levels_->mutable_data());
+ }
+
+ /// \brief Decoded values, including nulls, if any
+ uint8_t* values() const { return values_->mutable_data(); }
+
+ /// \brief Number of values written including nulls (if any)
+ int64_t values_written() const { return values_written_; }
+
+ /// \brief Number of definition / repetition levels (from those that have
+ /// been decoded) that have been consumed inside the reader.
+ int64_t levels_position() const { return levels_position_; }
+
+ /// \brief Number of definition / repetition levels that have been written
+ /// internally in the reader
+ int64_t levels_written() const { return levels_written_; }
+
+ /// \brief Number of nulls in the leaf
+ int64_t null_count() const { return null_count_; }
+
+ /// \brief True if the leaf values are nullable
+ bool nullable_values() const { return nullable_values_; }
+
+ /// \brief True if reading directly as Arrow dictionary-encoded
+ bool read_dictionary() const { return read_dictionary_; }
+
+ protected:
+ bool nullable_values_;
+
+ bool at_record_start_;
+ int64_t records_read_;
+
+ int64_t values_written_;
+ int64_t values_capacity_;
+ int64_t null_count_;
+
+ int64_t levels_written_;
+ int64_t levels_position_;
+ int64_t levels_capacity_;
+
+ std::shared_ptr<::arrow::ResizableBuffer> values_;
+  // If false, the values buffer is not allocated (values are read directly
+  // into builder classes).
+ bool uses_values_;
+
+ std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;
+ std::shared_ptr<::arrow::ResizableBuffer> def_levels_;
+ std::shared_ptr<::arrow::ResizableBuffer> rep_levels_;
+
+ bool read_dictionary_ = false;
+};
+
+class BinaryRecordReader : virtual public RecordReader {
+ public:
+ virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
+};
+
+/// \brief Read records directly to dictionary-encoded Arrow form (int32
+/// indices). Only valid for BYTE_ARRAY columns
+class DictionaryRecordReader : virtual public RecordReader {
+ public:
+ virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0;
+};
+
+} // namespace internal
+
+using BoolReader = TypedColumnReader<BooleanType>;
+using Int32Reader = TypedColumnReader<Int32Type>;
+using Int64Reader = TypedColumnReader<Int64Type>;
+using Int96Reader = TypedColumnReader<Int96Type>;
+using FloatReader = TypedColumnReader<FloatType>;
+using DoubleReader = TypedColumnReader<DoubleType>;
+using ByteArrayReader = TypedColumnReader<ByteArrayType>;
+using FixedLenByteArrayReader = TypedColumnReader<FLBAType>;
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc
new file mode 100644
index 00000000000..9ab1663ccd7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/column_scanner.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "parquet/column_reader.h"
+
+using arrow::MemoryPool;
+
+namespace parquet {
+
+std::shared_ptr<Scanner> Scanner::Make(std::shared_ptr<ColumnReader> col_reader,
+ int64_t batch_size, MemoryPool* pool) {
+ switch (col_reader->type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<BoolScanner>(std::move(col_reader), batch_size, pool);
+ case Type::INT32:
+ return std::make_shared<Int32Scanner>(std::move(col_reader), batch_size, pool);
+ case Type::INT64:
+ return std::make_shared<Int64Scanner>(std::move(col_reader), batch_size, pool);
+ case Type::INT96:
+ return std::make_shared<Int96Scanner>(std::move(col_reader), batch_size, pool);
+ case Type::FLOAT:
+ return std::make_shared<FloatScanner>(std::move(col_reader), batch_size, pool);
+ case Type::DOUBLE:
+ return std::make_shared<DoubleScanner>(std::move(col_reader), batch_size, pool);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<ByteArrayScanner>(std::move(col_reader), batch_size, pool);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<FixedLenByteArrayScanner>(std::move(col_reader), batch_size,
+ pool);
+ default:
+ ParquetException::NYI("type reader not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return std::shared_ptr<Scanner>(nullptr);
+}
+
+int64_t ScanAllValues(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ uint8_t* values, int64_t* values_buffered,
+ parquet::ColumnReader* reader) {
+ switch (reader->type()) {
+ case parquet::Type::BOOLEAN:
+ return ScanAll<parquet::BoolReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::INT32:
+ return ScanAll<parquet::Int32Reader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::INT64:
+ return ScanAll<parquet::Int64Reader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::INT96:
+ return ScanAll<parquet::Int96Reader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::FLOAT:
+ return ScanAll<parquet::FloatReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::DOUBLE:
+ return ScanAll<parquet::DoubleReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::BYTE_ARRAY:
+ return ScanAll<parquet::ByteArrayReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::FIXED_LEN_BYTE_ARRAY:
+ return ScanAll<parquet::FixedLenByteArrayReader>(batch_size, def_levels, rep_levels,
+ values, values_buffered, reader);
+ default:
+ parquet::ParquetException::NYI("type reader not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return 0;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h
new file mode 100644
index 00000000000..d53435f03cd
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h
@@ -0,0 +1,262 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdio.h>
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "parquet/column_reader.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128;
+
+class PARQUET_EXPORT Scanner {
+ public:
+ explicit Scanner(std::shared_ptr<ColumnReader> reader,
+ int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+ : batch_size_(batch_size),
+ level_offset_(0),
+ levels_buffered_(0),
+ value_buffer_(AllocateBuffer(pool)),
+ value_offset_(0),
+ values_buffered_(0),
+ reader_(std::move(reader)) {
+ def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
+ rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
+ }
+
+ virtual ~Scanner() {}
+
+ static std::shared_ptr<Scanner> Make(
+ std::shared_ptr<ColumnReader> col_reader,
+ int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;
+
+ bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }
+
+ const ColumnDescriptor* descr() const { return reader_->descr(); }
+
+ int64_t batch_size() const { return batch_size_; }
+
+ void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; }
+
+ protected:
+ int64_t batch_size_;
+
+ std::vector<int16_t> def_levels_;
+ std::vector<int16_t> rep_levels_;
+ int level_offset_;
+ int levels_buffered_;
+
+ std::shared_ptr<ResizableBuffer> value_buffer_;
+ int value_offset_;
+ int64_t values_buffered_;
+ std::shared_ptr<ColumnReader> reader_;
+};
+
+template <typename DType>
+class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
+ public:
+ typedef typename DType::c_type T;
+
+ explicit TypedScanner(std::shared_ptr<ColumnReader> reader,
+ int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+ : Scanner(std::move(reader), batch_size, pool) {
+ typed_reader_ = static_cast<TypedColumnReader<DType>*>(reader_.get());
+ int value_byte_size = type_traits<DType::type_num>::value_byte_size;
+ PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size));
+ values_ = reinterpret_cast<T*>(value_buffer_->mutable_data());
+ }
+
+ virtual ~TypedScanner() {}
+
+ bool NextLevels(int16_t* def_level, int16_t* rep_level) {
+ if (level_offset_ == levels_buffered_) {
+ levels_buffered_ = static_cast<int>(
+ typed_reader_->ReadBatch(static_cast<int>(batch_size_), def_levels_.data(),
+ rep_levels_.data(), values_, &values_buffered_));
+
+ value_offset_ = 0;
+ level_offset_ = 0;
+ if (!levels_buffered_) {
+ return false;
+ }
+ }
+ *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0;
+ *rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0;
+ level_offset_++;
+ return true;
+ }
+
+ bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) {
+ if (level_offset_ == levels_buffered_) {
+ if (!HasNext()) {
+ // Out of data pages
+ return false;
+ }
+ }
+
+ NextLevels(def_level, rep_level);
+ *is_null = *def_level < descr()->max_definition_level();
+
+ if (*is_null) {
+ return true;
+ }
+
+ if (value_offset_ == values_buffered_) {
+ throw ParquetException("Value was non-null, but has not been buffered");
+ }
+ *val = values_[value_offset_++];
+ return true;
+ }
+
+ // Returns true if there is a next value
+ bool NextValue(T* val, bool* is_null) {
+ if (level_offset_ == levels_buffered_) {
+ if (!HasNext()) {
+ // Out of data pages
+ return false;
+ }
+ }
+
+ int16_t def_level = -1;
+ int16_t rep_level = -1;
+ NextLevels(&def_level, &rep_level);
+ *is_null = def_level < descr()->max_definition_level();
+
+ if (*is_null) {
+ return true;
+ }
+
+ if (value_offset_ == values_buffered_) {
+ throw ParquetException("Value was non-null, but has not been buffered");
+ }
+ *val = values_[value_offset_++];
+ return true;
+ }
+
+ virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
+ T val{};
+ int16_t def_level = -1;
+ int16_t rep_level = -1;
+ bool is_null = false;
+ char buffer[80];
+
+ if (!Next(&val, &def_level, &rep_level, &is_null)) {
+ throw ParquetException("No more values buffered");
+ }
+
+ if (with_levels) {
+ out << " D:" << def_level << " R:" << rep_level << " ";
+ if (!is_null) {
+ out << "V:";
+ }
+ }
+
+ if (is_null) {
+ std::string null_fmt = format_fwf<ByteArrayType>(width);
+ snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
+ } else {
+ FormatValue(&val, buffer, sizeof(buffer), width);
+ }
+ out << buffer;
+ }
+
+ private:
+  // The ownership of this object is expressed through the reader_ variable in
+  // the base class.
+ TypedColumnReader<DType>* typed_reader_;
+
+ inline void FormatValue(void* val, char* buffer, int bufsize, int width);
+
+ T* values_;
+};
+
+template <typename DType>
+inline void TypedScanner<DType>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<DType>(width);
+ snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
+}
+
+template <>
+inline void TypedScanner<Int96Type>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<Int96Type>(width);
+ std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
+ snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+template <>
+inline void TypedScanner<ByteArrayType>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<ByteArrayType>(width);
+ std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
+ snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+template <>
+inline void TypedScanner<FLBAType>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<FLBAType>(width);
+ std::string result = FixedLenByteArrayToString(
+ *reinterpret_cast<FixedLenByteArray*>(val), descr()->type_length());
+ snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+typedef TypedScanner<BooleanType> BoolScanner;
+typedef TypedScanner<Int32Type> Int32Scanner;
+typedef TypedScanner<Int64Type> Int64Scanner;
+typedef TypedScanner<Int96Type> Int96Scanner;
+typedef TypedScanner<FloatType> FloatScanner;
+typedef TypedScanner<DoubleType> DoubleScanner;
+typedef TypedScanner<ByteArrayType> ByteArrayScanner;
+typedef TypedScanner<FLBAType> FixedLenByteArrayScanner;
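+
+// Example (an illustrative sketch; `col_reader` is assumed to be a
+// std::shared_ptr<ColumnReader> obtained from a RowGroupReader):
+//
+//   std::shared_ptr<Scanner> scanner = Scanner::Make(col_reader);
+//   while (scanner->HasNext()) {
+//     scanner->PrintNext(std::cout, /*width=*/12);
+//     std::cout << "\n";
+//   }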
+
+template <typename RType>
+int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ uint8_t* values, int64_t* values_buffered,
+ parquet::ColumnReader* reader) {
+ typedef typename RType::T Type;
+ auto typed_reader = static_cast<RType*>(reader);
+ auto vals = reinterpret_cast<Type*>(&values[0]);
+ return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals,
+ values_buffered);
+}
+
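+// Type-erased variant of ScanAll that dispatches on reader->type(); `values`
+// must be large enough to hold `batch_size` values of the column's physical
+// type.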
+int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, uint8_t* values,
+ int64_t* values_buffered,
+ parquet::ColumnReader* reader);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc
new file mode 100644
index 00000000000..446fe25e644
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc
@@ -0,0 +1,2067 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/column_writer.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/compute/api.h"
+#include "arrow/io/memory.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/rle_encoding.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/column_page.h"
+#include "parquet/encoding.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/level_conversion.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/statistics.h"
+#include "parquet/thrift_internal.h"
+#include "parquet/types.h"
+
+using arrow::Array;
+using arrow::ArrayData;
+using arrow::Datum;
+using arrow::Result;
+using arrow::Status;
+using arrow::BitUtil::BitWriter;
+using arrow::internal::checked_cast;
+using arrow::internal::checked_pointer_cast;
+using arrow::util::RleEncoder;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+
+namespace {
+
+// Visitor that extracts the value buffer from a FlatArray at a given offset.
+struct ValueBufferSlicer {
+ template <typename T>
+ ::arrow::enable_if_base_binary<typename T::TypeClass, Status> Visit(const T& array) {
+ auto data = array.data();
+ buffer_ =
+ SliceBuffer(data->buffers[1], data->offset * sizeof(typename T::offset_type),
+ data->length * sizeof(typename T::offset_type));
+ return Status::OK();
+ }
+
+ template <typename T>
+ ::arrow::enable_if_fixed_size_binary<typename T::TypeClass, Status> Visit(
+ const T& array) {
+ auto data = array.data();
+ buffer_ = SliceBuffer(data->buffers[1], data->offset * array.byte_width(),
+ data->length * array.byte_width());
+ return Status::OK();
+ }
+
+ template <typename T>
+ ::arrow::enable_if_t<::arrow::has_c_type<typename T::TypeClass>::value &&
+ !std::is_same<BooleanType, typename T::TypeClass>::value,
+ Status>
+ Visit(const T& array) {
+ auto data = array.data();
+ buffer_ = SliceBuffer(
+ data->buffers[1],
+ ::arrow::TypeTraits<typename T::TypeClass>::bytes_required(data->offset),
+ ::arrow::TypeTraits<typename T::TypeClass>::bytes_required(data->length));
+ return Status::OK();
+ }
+
+ Status Visit(const ::arrow::BooleanArray& array) {
+ auto data = array.data();
+ if (BitUtil::IsMultipleOf8(data->offset)) {
+ buffer_ = SliceBuffer(data->buffers[1], BitUtil::BytesForBits(data->offset),
+ BitUtil::BytesForBits(data->length));
+ return Status::OK();
+ }
+ PARQUET_ASSIGN_OR_THROW(buffer_,
+ ::arrow::internal::CopyBitmap(pool_, data->buffers[1]->data(),
+ data->offset, data->length));
+ return Status::OK();
+ }
+#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
+ Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
+ return Status::NotImplemented("Slicing not implemented for " #ArrowTypePrefix); \
+ }
+
+ NOT_IMPLEMENTED_VISIT(Null);
+ NOT_IMPLEMENTED_VISIT(Union);
+ NOT_IMPLEMENTED_VISIT(List);
+ NOT_IMPLEMENTED_VISIT(LargeList);
+ NOT_IMPLEMENTED_VISIT(Struct);
+ NOT_IMPLEMENTED_VISIT(FixedSizeList);
+ NOT_IMPLEMENTED_VISIT(Dictionary);
+ NOT_IMPLEMENTED_VISIT(Extension);
+
+#undef NOT_IMPLEMENTED_VISIT
+
+ MemoryPool* pool_;
+ std::shared_ptr<Buffer> buffer_;
+};
+
+internal::LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) {
+ internal::LevelInfo level_info;
+ level_info.def_level = descr->max_definition_level();
+ level_info.rep_level = descr->max_repetition_level();
+
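+  // Walk up the schema from the leaf to the nearest repeated ancestor (or the
+  // root); each optional node along the way contributes one definition level.
+  // For example (an assumed schema, for illustration only), with
+  // `repeated group r { optional int32 v }` the leaf has def_level == 2 and
+  // the repeated ancestor def level computed below is 1.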
+ int16_t min_spaced_def_level = descr->max_definition_level();
+ const ::parquet::schema::Node* node = descr->schema_node().get();
+ while (node != nullptr && !node->is_repeated()) {
+ if (node->is_optional()) {
+ min_spaced_def_level--;
+ }
+ node = node->parent();
+ }
+ level_info.repeated_ancestor_def_level = min_spaced_def_level;
+ return level_info;
+}
+
+template <class T>
+inline const T* AddIfNotNull(const T* base, int64_t offset) {
+ if (base != nullptr) {
+ return base + offset;
+ }
+ return nullptr;
+}
+
+} // namespace
+
+LevelEncoder::LevelEncoder() {}
+LevelEncoder::~LevelEncoder() {}
+
+void LevelEncoder::Init(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values, uint8_t* data, int data_size) {
+ bit_width_ = BitUtil::Log2(max_level + 1);
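+  // e.g. max_level = 3 (levels 0..3) requires bit_width_ = 2.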
+ encoding_ = encoding;
+ switch (encoding) {
+ case Encoding::RLE: {
+ rle_encoder_.reset(new RleEncoder(data, data_size, bit_width_));
+ break;
+ }
+ case Encoding::BIT_PACKED: {
+ int num_bytes =
+ static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width_));
+ bit_packed_encoder_.reset(new BitWriter(data, num_bytes));
+ break;
+ }
+ default:
+ throw ParquetException("Unknown encoding type for levels.");
+ }
+}
+
+int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values) {
+ int bit_width = BitUtil::Log2(max_level + 1);
+ int num_bytes = 0;
+ switch (encoding) {
+ case Encoding::RLE: {
+ // TODO: Due to the way we currently check if the buffer is full enough,
+      // we need to have MinBufferSize as headroom.
+ num_bytes = RleEncoder::MaxBufferSize(bit_width, num_buffered_values) +
+ RleEncoder::MinBufferSize(bit_width);
+ break;
+ }
+ case Encoding::BIT_PACKED: {
+ num_bytes =
+ static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width));
+ break;
+ }
+ default:
+ throw ParquetException("Unknown encoding type for levels.");
+ }
+ return num_bytes;
+}
+
+int LevelEncoder::Encode(int batch_size, const int16_t* levels) {
+ int num_encoded = 0;
+ if (!rle_encoder_ && !bit_packed_encoder_) {
+ throw ParquetException("Level encoders are not initialized.");
+ }
+
+ if (encoding_ == Encoding::RLE) {
+ for (int i = 0; i < batch_size; ++i) {
+ if (!rle_encoder_->Put(*(levels + i))) {
+ break;
+ }
+ ++num_encoded;
+ }
+ rle_encoder_->Flush();
+ rle_length_ = rle_encoder_->len();
+ } else {
+ for (int i = 0; i < batch_size; ++i) {
+ if (!bit_packed_encoder_->PutValue(*(levels + i), bit_width_)) {
+ break;
+ }
+ ++num_encoded;
+ }
+ bit_packed_encoder_->Flush();
+ }
+ return num_encoded;
+}
+
+// ----------------------------------------------------------------------
+// PageWriter implementation
+
+// This subclass delimits pages appearing in a serialized stream, each preceded
+// by a serialized Thrift format::PageHeader indicating the type of each page
+// and the page metadata.
+class SerializedPageWriter : public PageWriter {
+ public:
+ SerializedPageWriter(std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal, int16_t column_chunk_ordinal,
+ MemoryPool* pool = ::arrow::default_memory_pool(),
+ std::shared_ptr<Encryptor> meta_encryptor = nullptr,
+ std::shared_ptr<Encryptor> data_encryptor = nullptr)
+ : sink_(std::move(sink)),
+ metadata_(metadata),
+ pool_(pool),
+ num_values_(0),
+ dictionary_page_offset_(0),
+ data_page_offset_(0),
+ total_uncompressed_size_(0),
+ total_compressed_size_(0),
+ page_ordinal_(0),
+ row_group_ordinal_(row_group_ordinal),
+ column_ordinal_(column_chunk_ordinal),
+ meta_encryptor_(std::move(meta_encryptor)),
+ data_encryptor_(std::move(data_encryptor)),
+ encryption_buffer_(AllocateBuffer(pool, 0)) {
+ if (data_encryptor_ != nullptr || meta_encryptor_ != nullptr) {
+ InitEncryption();
+ }
+ compressor_ = GetCodec(codec, compression_level);
+ thrift_serializer_.reset(new ThriftSerializer);
+ }
+
+ int64_t WriteDictionaryPage(const DictionaryPage& page) override {
+ int64_t uncompressed_size = page.size();
+ std::shared_ptr<Buffer> compressed_data;
+ if (has_compressor()) {
+ auto buffer = std::static_pointer_cast<ResizableBuffer>(
+ AllocateBuffer(pool_, uncompressed_size));
+ Compress(*(page.buffer().get()), buffer.get());
+ compressed_data = std::static_pointer_cast<Buffer>(buffer);
+ } else {
+ compressed_data = page.buffer();
+ }
+
+ format::DictionaryPageHeader dict_page_header;
+ dict_page_header.__set_num_values(page.num_values());
+ dict_page_header.__set_encoding(ToThrift(page.encoding()));
+ dict_page_header.__set_is_sorted(page.is_sorted());
+
+ const uint8_t* output_data_buffer = compressed_data->data();
+ int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
+
+ if (data_encryptor_.get()) {
+ UpdateEncryption(encryption::kDictionaryPage);
+ PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
+ data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
+ output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
+ encryption_buffer_->mutable_data());
+ output_data_buffer = encryption_buffer_->data();
+ }
+
+ format::PageHeader page_header;
+ page_header.__set_type(format::PageType::DICTIONARY_PAGE);
+ page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
+ page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
+ page_header.__set_dictionary_page_header(dict_page_header);
+ // TODO(PARQUET-594) crc checksum
+
+ PARQUET_ASSIGN_OR_THROW(int64_t start_pos, sink_->Tell());
+ if (dictionary_page_offset_ == 0) {
+ dictionary_page_offset_ = start_pos;
+ }
+
+ if (meta_encryptor_) {
+ UpdateEncryption(encryption::kDictionaryPageHeader);
+ }
+ const int64_t header_size =
+ thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_);
+
+ PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));
+
+ total_uncompressed_size_ += uncompressed_size + header_size;
+ total_compressed_size_ += output_data_len + header_size;
+ ++dict_encoding_stats_[page.encoding()];
+ return uncompressed_size + header_size;
+ }
+
+ void Close(bool has_dictionary, bool fallback) override {
+ if (meta_encryptor_ != nullptr) {
+ UpdateEncryption(encryption::kColumnMetaData);
+ }
+    // index_page_offset = -1 since index pages are not supported
+ metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_,
+ total_compressed_size_, total_uncompressed_size_, has_dictionary,
+ fallback, dict_encoding_stats_, data_encoding_stats_,
+ meta_encryptor_);
+ // Write metadata at end of column chunk
+ metadata_->WriteTo(sink_.get());
+ }
+
+  // Compress a buffer.
+ void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) override {
+ DCHECK(compressor_ != nullptr);
+
+ // Compress the data
+ int64_t max_compressed_size =
+ compressor_->MaxCompressedLen(src_buffer.size(), src_buffer.data());
+
+    // Pass shrink_to_fit = false: the underlying buffer only keeps growing, and
+    // resizing to a smaller size does not reallocate.
+ PARQUET_THROW_NOT_OK(dest_buffer->Resize(max_compressed_size, false));
+
+ PARQUET_ASSIGN_OR_THROW(
+ int64_t compressed_size,
+ compressor_->Compress(src_buffer.size(), src_buffer.data(), max_compressed_size,
+ dest_buffer->mutable_data()));
+ PARQUET_THROW_NOT_OK(dest_buffer->Resize(compressed_size, false));
+ }
+
+ int64_t WriteDataPage(const DataPage& page) override {
+ const int64_t uncompressed_size = page.uncompressed_size();
+ std::shared_ptr<Buffer> compressed_data = page.buffer();
+ const uint8_t* output_data_buffer = compressed_data->data();
+ int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
+
+ if (data_encryptor_.get()) {
+ PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
+ data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
+ UpdateEncryption(encryption::kDataPage);
+ output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
+ encryption_buffer_->mutable_data());
+ output_data_buffer = encryption_buffer_->data();
+ }
+
+ format::PageHeader page_header;
+ page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
+ page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
+ // TODO(PARQUET-594) crc checksum
+
+ if (page.type() == PageType::DATA_PAGE) {
+ const DataPageV1& v1_page = checked_cast<const DataPageV1&>(page);
+ SetDataPageHeader(page_header, v1_page);
+ } else if (page.type() == PageType::DATA_PAGE_V2) {
+ const DataPageV2& v2_page = checked_cast<const DataPageV2&>(page);
+ SetDataPageV2Header(page_header, v2_page);
+ } else {
+ throw ParquetException("Unexpected page type");
+ }
+
+ PARQUET_ASSIGN_OR_THROW(int64_t start_pos, sink_->Tell());
+ if (page_ordinal_ == 0) {
+ data_page_offset_ = start_pos;
+ }
+
+ if (meta_encryptor_) {
+ UpdateEncryption(encryption::kDataPageHeader);
+ }
+ const int64_t header_size =
+ thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_);
+ PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));
+
+ total_uncompressed_size_ += uncompressed_size + header_size;
+ total_compressed_size_ += output_data_len + header_size;
+ num_values_ += page.num_values();
+ ++data_encoding_stats_[page.encoding()];
+ ++page_ordinal_;
+ return uncompressed_size + header_size;
+ }
+
+ void SetDataPageHeader(format::PageHeader& page_header, const DataPageV1& page) {
+ format::DataPageHeader data_page_header;
+ data_page_header.__set_num_values(page.num_values());
+ data_page_header.__set_encoding(ToThrift(page.encoding()));
+ data_page_header.__set_definition_level_encoding(
+ ToThrift(page.definition_level_encoding()));
+ data_page_header.__set_repetition_level_encoding(
+ ToThrift(page.repetition_level_encoding()));
+ data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+ page_header.__set_type(format::PageType::DATA_PAGE);
+ page_header.__set_data_page_header(data_page_header);
+ }
+
+  void SetDataPageV2Header(format::PageHeader& page_header, const DataPageV2& page) {
+ format::DataPageHeaderV2 data_page_header;
+ data_page_header.__set_num_values(page.num_values());
+ data_page_header.__set_num_nulls(page.num_nulls());
+ data_page_header.__set_num_rows(page.num_rows());
+ data_page_header.__set_encoding(ToThrift(page.encoding()));
+
+ data_page_header.__set_definition_levels_byte_length(
+ page.definition_levels_byte_length());
+ data_page_header.__set_repetition_levels_byte_length(
+ page.repetition_levels_byte_length());
+
+ data_page_header.__set_is_compressed(page.is_compressed());
+ data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+ page_header.__set_type(format::PageType::DATA_PAGE_V2);
+ page_header.__set_data_page_header_v2(data_page_header);
+ }
+
+ bool has_compressor() override { return (compressor_ != nullptr); }
+
+ int64_t num_values() { return num_values_; }
+
+ int64_t dictionary_page_offset() { return dictionary_page_offset_; }
+
+ int64_t data_page_offset() { return data_page_offset_; }
+
+ int64_t total_compressed_size() { return total_compressed_size_; }
+
+ int64_t total_uncompressed_size() { return total_uncompressed_size_; }
+
+ private:
+ // To allow UpdateEncryption on Close
+ friend class BufferedPageWriter;
+
+ void InitEncryption() {
+ // Prepare the AAD for quick update later.
+ if (data_encryptor_ != nullptr) {
+ data_page_aad_ = encryption::CreateModuleAad(
+ data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_,
+ column_ordinal_, kNonPageOrdinal);
+ }
+ if (meta_encryptor_ != nullptr) {
+ data_page_header_aad_ = encryption::CreateModuleAad(
+ meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_,
+ column_ordinal_, kNonPageOrdinal);
+ }
+ }
+
+ void UpdateEncryption(int8_t module_type) {
+ switch (module_type) {
+ case encryption::kColumnMetaData: {
+ meta_encryptor_->UpdateAad(encryption::CreateModuleAad(
+ meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
+ kNonPageOrdinal));
+ break;
+ }
+ case encryption::kDataPage: {
+ encryption::QuickUpdatePageAad(data_page_aad_, page_ordinal_);
+ data_encryptor_->UpdateAad(data_page_aad_);
+ break;
+ }
+ case encryption::kDataPageHeader: {
+ encryption::QuickUpdatePageAad(data_page_header_aad_, page_ordinal_);
+ meta_encryptor_->UpdateAad(data_page_header_aad_);
+ break;
+ }
+ case encryption::kDictionaryPageHeader: {
+ meta_encryptor_->UpdateAad(encryption::CreateModuleAad(
+ meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
+ kNonPageOrdinal));
+ break;
+ }
+ case encryption::kDictionaryPage: {
+ data_encryptor_->UpdateAad(encryption::CreateModuleAad(
+ data_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
+ kNonPageOrdinal));
+ break;
+ }
+ default:
+ throw ParquetException("Unknown module type in UpdateEncryption");
+ }
+ }
+
+ std::shared_ptr<ArrowOutputStream> sink_;
+ ColumnChunkMetaDataBuilder* metadata_;
+ MemoryPool* pool_;
+ int64_t num_values_;
+ int64_t dictionary_page_offset_;
+ int64_t data_page_offset_;
+ int64_t total_uncompressed_size_;
+ int64_t total_compressed_size_;
+ int16_t page_ordinal_;
+ int16_t row_group_ordinal_;
+ int16_t column_ordinal_;
+
+ std::unique_ptr<ThriftSerializer> thrift_serializer_;
+
+ // Compression codec to use.
+ std::unique_ptr<::arrow::util::Codec> compressor_;
+
+ std::string data_page_aad_;
+ std::string data_page_header_aad_;
+
+ std::shared_ptr<Encryptor> meta_encryptor_;
+ std::shared_ptr<Encryptor> data_encryptor_;
+
+ std::shared_ptr<ResizableBuffer> encryption_buffer_;
+
+ std::map<Encoding::type, int32_t> dict_encoding_stats_;
+ std::map<Encoding::type, int32_t> data_encoding_stats_;
+};
+
+// This implementation of the PageWriter writes to the final sink on Close().
+class BufferedPageWriter : public PageWriter {
+ public:
+ BufferedPageWriter(std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal, int16_t current_column_ordinal,
+ MemoryPool* pool = ::arrow::default_memory_pool(),
+ std::shared_ptr<Encryptor> meta_encryptor = nullptr,
+ std::shared_ptr<Encryptor> data_encryptor = nullptr)
+ : final_sink_(std::move(sink)), metadata_(metadata), has_dictionary_pages_(false) {
+ in_memory_sink_ = CreateOutputStream(pool);
+ pager_ = std::unique_ptr<SerializedPageWriter>(
+ new SerializedPageWriter(in_memory_sink_, codec, compression_level, metadata,
+ row_group_ordinal, current_column_ordinal, pool,
+ std::move(meta_encryptor), std::move(data_encryptor)));
+ }
+
+ int64_t WriteDictionaryPage(const DictionaryPage& page) override {
+ has_dictionary_pages_ = true;
+ return pager_->WriteDictionaryPage(page);
+ }
+
+ void Close(bool has_dictionary, bool fallback) override {
+ if (pager_->meta_encryptor_ != nullptr) {
+ pager_->UpdateEncryption(encryption::kColumnMetaData);
+ }
+    // index_page_offset = -1 since index pages are not supported
+ PARQUET_ASSIGN_OR_THROW(int64_t final_position, final_sink_->Tell());
+ // dictionary page offset should be 0 iff there are no dictionary pages
+ auto dictionary_page_offset =
+ has_dictionary_pages_ ? pager_->dictionary_page_offset() + final_position : 0;
+ metadata_->Finish(pager_->num_values(), dictionary_page_offset, -1,
+ pager_->data_page_offset() + final_position,
+ pager_->total_compressed_size(), pager_->total_uncompressed_size(),
+ has_dictionary, fallback, pager_->dict_encoding_stats_,
+ pager_->data_encoding_stats_, pager_->meta_encryptor_);
+
+ // Write metadata at end of column chunk
+ metadata_->WriteTo(in_memory_sink_.get());
+
+ // flush everything to the serialized sink
+ PARQUET_ASSIGN_OR_THROW(auto buffer, in_memory_sink_->Finish());
+ PARQUET_THROW_NOT_OK(final_sink_->Write(buffer));
+ }
+
+ int64_t WriteDataPage(const DataPage& page) override {
+ return pager_->WriteDataPage(page);
+ }
+
+ void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) override {
+ pager_->Compress(src_buffer, dest_buffer);
+ }
+
+ bool has_compressor() override { return pager_->has_compressor(); }
+
+ private:
+ std::shared_ptr<ArrowOutputStream> final_sink_;
+ ColumnChunkMetaDataBuilder* metadata_;
+ std::shared_ptr<::arrow::io::BufferOutputStream> in_memory_sink_;
+ std::unique_ptr<SerializedPageWriter> pager_;
+ bool has_dictionary_pages_;
+};
+
+std::unique_ptr<PageWriter> PageWriter::Open(
+ std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool,
+ bool buffered_row_group, std::shared_ptr<Encryptor> meta_encryptor,
+ std::shared_ptr<Encryptor> data_encryptor) {
+ if (buffered_row_group) {
+ return std::unique_ptr<PageWriter>(
+ new BufferedPageWriter(std::move(sink), codec, compression_level, metadata,
+ row_group_ordinal, column_chunk_ordinal, pool,
+ std::move(meta_encryptor), std::move(data_encryptor)));
+ } else {
+ return std::unique_ptr<PageWriter>(
+ new SerializedPageWriter(std::move(sink), codec, compression_level, metadata,
+ row_group_ordinal, column_chunk_ordinal, pool,
+ std::move(meta_encryptor), std::move(data_encryptor)));
+ }
+}
+
+// ----------------------------------------------------------------------
+// ColumnWriter
+
+const std::shared_ptr<WriterProperties>& default_writer_properties() {
+ static std::shared_ptr<WriterProperties> default_writer_properties =
+ WriterProperties::Builder().build();
+ return default_writer_properties;
+}
+
+class ColumnWriterImpl {
+ public:
+ ColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata,
+ std::unique_ptr<PageWriter> pager, const bool use_dictionary,
+ Encoding::type encoding, const WriterProperties* properties)
+ : metadata_(metadata),
+ descr_(metadata->descr()),
+ level_info_(ComputeLevelInfo(metadata->descr())),
+ pager_(std::move(pager)),
+ has_dictionary_(use_dictionary),
+ encoding_(encoding),
+ properties_(properties),
+ allocator_(properties->memory_pool()),
+ num_buffered_values_(0),
+ num_buffered_encoded_values_(0),
+ rows_written_(0),
+ total_bytes_written_(0),
+ total_compressed_bytes_(0),
+ closed_(false),
+ fallback_(false),
+ definition_levels_sink_(allocator_),
+ repetition_levels_sink_(allocator_) {
+ definition_levels_rle_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+ repetition_levels_rle_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+ uncompressed_data_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+
+ if (pager_->has_compressor()) {
+ compressor_temp_buffer_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+ }
+ }
+
+ virtual ~ColumnWriterImpl() = default;
+
+ int64_t Close();
+
+ protected:
+ virtual std::shared_ptr<Buffer> GetValuesBuffer() = 0;
+
+ // Serializes Dictionary Page if enabled
+ virtual void WriteDictionaryPage() = 0;
+
+ // Plain-encoded statistics of the current page
+ virtual EncodedStatistics GetPageStatistics() = 0;
+
+ // Plain-encoded statistics of the whole chunk
+ virtual EncodedStatistics GetChunkStatistics() = 0;
+
+ // Merges page statistics into chunk statistics, then resets the values
+ virtual void ResetPageStatistics() = 0;
+
+  // Adds Data Pages to an in-memory buffer in dictionary encoding mode
+ // Serializes the Data Pages in other encoding modes
+ void AddDataPage();
+
+ void BuildDataPageV1(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size, int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values);
+ void BuildDataPageV2(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size, int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values);
+
+ // Serializes Data Pages
+ void WriteDataPage(const DataPage& page) {
+ total_bytes_written_ += pager_->WriteDataPage(page);
+ }
+
+ // Write multiple definition levels
+ void WriteDefinitionLevels(int64_t num_levels, const int16_t* levels) {
+ DCHECK(!closed_);
+ PARQUET_THROW_NOT_OK(
+ definition_levels_sink_.Append(levels, sizeof(int16_t) * num_levels));
+ }
+
+ // Write multiple repetition levels
+ void WriteRepetitionLevels(int64_t num_levels, const int16_t* levels) {
+ DCHECK(!closed_);
+ PARQUET_THROW_NOT_OK(
+ repetition_levels_sink_.Append(levels, sizeof(int16_t) * num_levels));
+ }
+
+ // RLE encode the src_buffer into dest_buffer and return the encoded size
+ int64_t RleEncodeLevels(const void* src_buffer, ResizableBuffer* dest_buffer,
+ int16_t max_level, bool include_length_prefix = true);
+
+ // Serialize the buffered Data Pages
+ void FlushBufferedDataPages();
+
+ ColumnChunkMetaDataBuilder* metadata_;
+ const ColumnDescriptor* descr_;
+ // scratch buffer if validity bits need to be recalculated.
+ std::shared_ptr<ResizableBuffer> bits_buffer_;
+ const internal::LevelInfo level_info_;
+
+ std::unique_ptr<PageWriter> pager_;
+
+ bool has_dictionary_;
+ Encoding::type encoding_;
+ const WriterProperties* properties_;
+
+ LevelEncoder level_encoder_;
+
+ MemoryPool* allocator_;
+
+ // The total number of values stored in the data page. This is the maximum of
+ // the number of encoded definition levels or encoded values. For
+ // non-repeated, required columns, this is equal to the number of encoded
+ // values. For repeated or optional values, there may be fewer data values
+ // than levels, and this tells you how many encoded levels there are in that
+ // case.
+ int64_t num_buffered_values_;
+
+ // The total number of stored values. For repeated or optional values, this
+ // number may be lower than num_buffered_values_.
+ int64_t num_buffered_encoded_values_;
+
+ // Total number of rows written with this ColumnWriter
+ int rows_written_;
+
+ // Records the total number of uncompressed bytes written by the serializer
+ int64_t total_bytes_written_;
+
+ // Records the current number of compressed bytes in a column
+ int64_t total_compressed_bytes_;
+
+ // Flag to check if the Writer has been closed
+ bool closed_;
+
+ // Flag to infer if dictionary encoding has fallen back to PLAIN
+ bool fallback_;
+
+ ::arrow::BufferBuilder definition_levels_sink_;
+ ::arrow::BufferBuilder repetition_levels_sink_;
+
+ std::shared_ptr<ResizableBuffer> definition_levels_rle_;
+ std::shared_ptr<ResizableBuffer> repetition_levels_rle_;
+
+ std::shared_ptr<ResizableBuffer> uncompressed_data_;
+ std::shared_ptr<ResizableBuffer> compressor_temp_buffer_;
+
+ std::vector<std::unique_ptr<DataPage>> data_pages_;
+
+ private:
+ void InitSinks() {
+ definition_levels_sink_.Rewind(0);
+ repetition_levels_sink_.Rewind(0);
+ }
+
+ // Concatenate the encoded levels and values into one buffer
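+  // in the order: repetition levels, definition levels, values.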
+ void ConcatenateBuffers(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size,
+ const std::shared_ptr<Buffer>& values, uint8_t* combined) {
+ memcpy(combined, repetition_levels_rle_->data(), repetition_levels_rle_size);
+ combined += repetition_levels_rle_size;
+ memcpy(combined, definition_levels_rle_->data(), definition_levels_rle_size);
+ combined += definition_levels_rle_size;
+ memcpy(combined, values->data(), values->size());
+ }
+};
+
+// Returns the size of the encoded buffer.
+int64_t ColumnWriterImpl::RleEncodeLevels(const void* src_buffer,
+ ResizableBuffer* dest_buffer, int16_t max_level,
+ bool include_length_prefix) {
+ // V1 DataPage includes the length of the RLE level as a prefix.
+ int32_t prefix_size = include_length_prefix ? sizeof(int32_t) : 0;
+
+  // TODO: This only works due to some RLE specifics
+ int64_t rle_size = LevelEncoder::MaxBufferSize(Encoding::RLE, max_level,
+ static_cast<int>(num_buffered_values_)) +
+ prefix_size;
+
+  // Pass shrink_to_fit = false: the underlying buffer only keeps growing, and
+  // resizing to a smaller size does not reallocate.
+ PARQUET_THROW_NOT_OK(dest_buffer->Resize(rle_size, false));
+
+ level_encoder_.Init(Encoding::RLE, max_level, static_cast<int>(num_buffered_values_),
+ dest_buffer->mutable_data() + prefix_size,
+ static_cast<int>(dest_buffer->size() - prefix_size));
+ int encoded = level_encoder_.Encode(static_cast<int>(num_buffered_values_),
+ reinterpret_cast<const int16_t*>(src_buffer));
+ DCHECK_EQ(encoded, num_buffered_values_);
+
+ if (include_length_prefix) {
+ reinterpret_cast<int32_t*>(dest_buffer->mutable_data())[0] = level_encoder_.len();
+ }
+
+ return level_encoder_.len() + prefix_size;
+}
+
+void ColumnWriterImpl::AddDataPage() {
+ int64_t definition_levels_rle_size = 0;
+ int64_t repetition_levels_rle_size = 0;
+
+ std::shared_ptr<Buffer> values = GetValuesBuffer();
+ bool is_v1_data_page = properties_->data_page_version() == ParquetDataPageVersion::V1;
+
+ if (descr_->max_definition_level() > 0) {
+ definition_levels_rle_size = RleEncodeLevels(
+ definition_levels_sink_.data(), definition_levels_rle_.get(),
+ descr_->max_definition_level(), /*include_length_prefix=*/is_v1_data_page);
+ }
+
+ if (descr_->max_repetition_level() > 0) {
+ repetition_levels_rle_size = RleEncodeLevels(
+ repetition_levels_sink_.data(), repetition_levels_rle_.get(),
+ descr_->max_repetition_level(), /*include_length_prefix=*/is_v1_data_page);
+ }
+
+ int64_t uncompressed_size =
+ definition_levels_rle_size + repetition_levels_rle_size + values->size();
+
+ if (is_v1_data_page) {
+ BuildDataPageV1(definition_levels_rle_size, repetition_levels_rle_size,
+ uncompressed_size, values);
+ } else {
+ BuildDataPageV2(definition_levels_rle_size, repetition_levels_rle_size,
+ uncompressed_size, values);
+ }
+
+ // Re-initialize the sinks for next Page.
+ InitSinks();
+ num_buffered_values_ = 0;
+ num_buffered_encoded_values_ = 0;
+}
+
+void ColumnWriterImpl::BuildDataPageV1(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size,
+ int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values) {
+  // Pass shrink_to_fit = false: the underlying buffer only keeps growing, and
+  // resizing to a smaller size does not reallocate.
+ PARQUET_THROW_NOT_OK(uncompressed_data_->Resize(uncompressed_size, false));
+ ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size, values,
+ uncompressed_data_->mutable_data());
+
+ EncodedStatistics page_stats = GetPageStatistics();
+ page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path()));
+ page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
+ ResetPageStatistics();
+
+ std::shared_ptr<Buffer> compressed_data;
+ if (pager_->has_compressor()) {
+ pager_->Compress(*(uncompressed_data_.get()), compressor_temp_buffer_.get());
+ compressed_data = compressor_temp_buffer_;
+ } else {
+ compressed_data = uncompressed_data_;
+ }
+
+ // Write the page to OutputStream eagerly if there is no dictionary or
+ // if dictionary encoding has fallen back to PLAIN
+ if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding
+ PARQUET_ASSIGN_OR_THROW(
+ auto compressed_data_copy,
+ compressed_data->CopySlice(0, compressed_data->size(), allocator_));
+ std::unique_ptr<DataPage> page_ptr(new DataPageV1(
+ compressed_data_copy, static_cast<int32_t>(num_buffered_values_), encoding_,
+ Encoding::RLE, Encoding::RLE, uncompressed_size, page_stats));
+ total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader);
+
+ data_pages_.push_back(std::move(page_ptr));
+ } else { // Eagerly write pages
+ DataPageV1 page(compressed_data, static_cast<int32_t>(num_buffered_values_),
+ encoding_, Encoding::RLE, Encoding::RLE, uncompressed_size,
+ page_stats);
+ WriteDataPage(page);
+ }
+}
+
+void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size,
+ int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values) {
+ // Compress the values if needed. Repetition and definition levels are uncompressed in
+ // V2.
+ std::shared_ptr<Buffer> compressed_values;
+ if (pager_->has_compressor()) {
+ pager_->Compress(*values, compressor_temp_buffer_.get());
+ compressed_values = compressor_temp_buffer_;
+ } else {
+ compressed_values = values;
+ }
+
+ // Concatenate uncompressed levels and the possibly compressed values
+ int64_t combined_size =
+ definition_levels_rle_size + repetition_levels_rle_size + compressed_values->size();
+ std::shared_ptr<ResizableBuffer> combined = AllocateBuffer(allocator_, combined_size);
+
+ ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size,
+ compressed_values, combined->mutable_data());
+
+ EncodedStatistics page_stats = GetPageStatistics();
+ page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path()));
+ page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
+ ResetPageStatistics();
+
+ int32_t num_values = static_cast<int32_t>(num_buffered_values_);
+ int32_t null_count = static_cast<int32_t>(page_stats.null_count);
+ int32_t def_levels_byte_length = static_cast<int32_t>(definition_levels_rle_size);
+ int32_t rep_levels_byte_length = static_cast<int32_t>(repetition_levels_rle_size);
+
+ // Write the page to OutputStream eagerly if there is no dictionary or
+ // if dictionary encoding has fallen back to PLAIN
+ if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding
+ PARQUET_ASSIGN_OR_THROW(auto data_copy,
+ combined->CopySlice(0, combined->size(), allocator_));
+    std::unique_ptr<DataPage> page_ptr(new DataPageV2(
+        data_copy, num_values, null_count, num_values, encoding_,
+        def_levels_byte_length, rep_levels_byte_length, uncompressed_size,
+        pager_->has_compressor()));
+ total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader);
+ data_pages_.push_back(std::move(page_ptr));
+ } else {
+ DataPageV2 page(combined, num_values, null_count, num_values, encoding_,
+ def_levels_byte_length, rep_levels_byte_length, uncompressed_size,
+ pager_->has_compressor());
+ WriteDataPage(page);
+ }
+}
+
+int64_t ColumnWriterImpl::Close() {
+ if (!closed_) {
+ closed_ = true;
+ if (has_dictionary_ && !fallback_) {
+ WriteDictionaryPage();
+ }
+
+ FlushBufferedDataPages();
+
+ EncodedStatistics chunk_statistics = GetChunkStatistics();
+ chunk_statistics.ApplyStatSizeLimits(
+ properties_->max_statistics_size(descr_->path()));
+ chunk_statistics.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
+
+ // Write stats only if the column has at least one row written
+ if (rows_written_ > 0 && chunk_statistics.is_set()) {
+ metadata_->SetStatistics(chunk_statistics);
+ }
+ pager_->Close(has_dictionary_, fallback_);
+ }
+
+ return total_bytes_written_;
+}
+
+void ColumnWriterImpl::FlushBufferedDataPages() {
+ // Write all outstanding data to a new page
+ if (num_buffered_values_ > 0) {
+ AddDataPage();
+ }
+ for (const auto& page_ptr : data_pages_) {
+ WriteDataPage(*page_ptr);
+ }
+ data_pages_.clear();
+ total_compressed_bytes_ = 0;
+}
+
+// ----------------------------------------------------------------------
+// TypedColumnWriter
+
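+// Invoke action(offset, size) over `total` values in chunks of at most
+// `batch_size`; e.g. total = 10, batch_size = 4 yields action(0, 4),
+// action(4, 4), action(8, 2).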
+template <typename Action>
+inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) {
+  int64_t num_batches = total / batch_size;
+ for (int round = 0; round < num_batches; round++) {
+ action(round * batch_size, batch_size);
+ }
+ // Write the remaining values
+ if (total % batch_size > 0) {
+ action(num_batches * batch_size, total % batch_size);
+ }
+}
+
+bool DictionaryDirectWriteSupported(const ::arrow::Array& array) {
+ DCHECK_EQ(array.type_id(), ::arrow::Type::DICTIONARY);
+ const ::arrow::DictionaryType& dict_type =
+ static_cast<const ::arrow::DictionaryType&>(*array.type());
+ return ::arrow::is_base_binary_like(dict_type.value_type()->id());
+}
+
+Status ConvertDictionaryToDense(const ::arrow::Array& array, MemoryPool* pool,
+ std::shared_ptr<::arrow::Array>* out) {
+ const ::arrow::DictionaryType& dict_type =
+ static_cast<const ::arrow::DictionaryType&>(*array.type());
+
+ ::arrow::compute::ExecContext ctx(pool);
+ ARROW_ASSIGN_OR_RAISE(Datum cast_output,
+ ::arrow::compute::Cast(array.data(), dict_type.value_type(),
+ ::arrow::compute::CastOptions(), &ctx));
+ *out = cast_output.make_array();
+ return Status::OK();
+}
+
+static inline bool IsDictionaryEncoding(Encoding::type encoding) {
+ return encoding == Encoding::PLAIN_DICTIONARY;
+}
+
+template <typename DType>
+class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ TypedColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata,
+ std::unique_ptr<PageWriter> pager, const bool use_dictionary,
+ Encoding::type encoding, const WriterProperties* properties)
+ : ColumnWriterImpl(metadata, std::move(pager), use_dictionary, encoding,
+ properties) {
+ current_encoder_ = MakeEncoder(DType::type_num, encoding, use_dictionary, descr_,
+ properties->memory_pool());
+
+ if (properties->statistics_enabled(descr_->path()) &&
+ (SortOrder::UNKNOWN != descr_->sort_order())) {
+ page_statistics_ = MakeStatistics<DType>(descr_, allocator_);
+ chunk_statistics_ = MakeStatistics<DType>(descr_, allocator_);
+ }
+ }
+
+ int64_t Close() override { return ColumnWriterImpl::Close(); }
+
+ int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const T* values) override {
+    // We check for DataPage limits only after we have inserted the values. If a
+    // user writes a large number of values in one call, the DataPage size could
+    // grow well beyond the limit. Chunking the batch bounds this: it ensures
+    // AddDataPage() is called at a reasonable page size limit.
+ int64_t value_offset = 0;
+
+ auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t values_to_write = WriteLevels(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+
+ // PARQUET-780
+ if (values_to_write > 0) {
+ DCHECK_NE(nullptr, values);
+ }
+ WriteValues(AddIfNotNull(values, value_offset), values_to_write,
+ batch_size - values_to_write);
+ CommitWriteAndCheckPageLimit(batch_size, values_to_write);
+ value_offset += values_to_write;
+
+ // Dictionary size checked separately from data page size since we
+ // circumvent this check when writing ::arrow::DictionaryArray directly
+ CheckDictionarySizeLimit();
+ };
+ DoInBatches(num_values, properties_->write_batch_size(), WriteChunk);
+ return value_offset;
+ }
+
+ void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, const T* values) override {
+ // Like WriteBatch, but for spaced values
+ int64_t value_offset = 0;
+ auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t batch_num_values = 0;
+ int64_t batch_num_spaced_values = 0;
+ int64_t null_count;
+ MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
+ &batch_num_values, &batch_num_spaced_values,
+ &null_count);
+
+ WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+ if (bits_buffer_ != nullptr) {
+ WriteValuesSpaced(AddIfNotNull(values, value_offset), batch_num_values,
+ batch_num_spaced_values, bits_buffer_->data(), /*offset=*/0);
+ } else {
+ WriteValuesSpaced(AddIfNotNull(values, value_offset), batch_num_values,
+ batch_num_spaced_values, valid_bits,
+ valid_bits_offset + value_offset);
+ }
+ CommitWriteAndCheckPageLimit(batch_size, batch_num_spaced_values);
+ value_offset += batch_num_spaced_values;
+
+ // Dictionary size checked separately from data page size since we
+ // circumvent this check when writing ::arrow::DictionaryArray directly
+ CheckDictionarySizeLimit();
+ };
+ DoInBatches(num_values, properties_->write_batch_size(), WriteChunk);
+ }
+
+ Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& leaf_array,
+ ArrowWriteContext* ctx, bool leaf_field_nullable) override {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ // Leaf nulls are canonical when there is only a single null element after a list
+ // and it is at the leaf.
+ bool single_nullable_element =
+ (level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) &&
+ leaf_field_nullable;
+ bool maybe_parent_nulls = level_info_.HasNullableValues() && !single_nullable_element;
+ if (maybe_parent_nulls) {
+ ARROW_ASSIGN_OR_RAISE(
+ bits_buffer_,
+ ::arrow::AllocateResizableBuffer(
+ BitUtil::BytesForBits(properties_->write_batch_size()), ctx->memory_pool));
+ bits_buffer_->ZeroPadding();
+ }
+
+ if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) {
+ return WriteArrowDictionary(def_levels, rep_levels, num_levels, leaf_array, ctx,
+ maybe_parent_nulls);
+ } else {
+ return WriteArrowDense(def_levels, rep_levels, num_levels, leaf_array, ctx,
+ maybe_parent_nulls);
+ }
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ int64_t EstimatedBufferedValueBytes() const override {
+ return current_encoder_->EstimatedDataEncodedSize();
+ }
+
+ protected:
+ std::shared_ptr<Buffer> GetValuesBuffer() override {
+ return current_encoder_->FlushValues();
+ }
+
+ // Internal function to handle direct writing of ::arrow::DictionaryArray,
+ // since the standard logic concerning dictionary size limits and fallback to
+ // plain encoding is circumvented
+ Status WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& array,
+ ArrowWriteContext* context, bool maybe_parent_nulls);
+
+ Status WriteArrowDense(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& array,
+ ArrowWriteContext* context, bool maybe_parent_nulls);
+
+ void WriteDictionaryPage() override {
+ // We have to dynamic cast here because of TypedEncoder<Type> as
+ // some compilers don't want to cast through virtual inheritance
+ auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+ DCHECK(dict_encoder);
+ std::shared_ptr<ResizableBuffer> buffer =
+ AllocateBuffer(properties_->memory_pool(), dict_encoder->dict_encoded_size());
+ dict_encoder->WriteDict(buffer->mutable_data());
+
+ DictionaryPage page(buffer, dict_encoder->num_entries(),
+ properties_->dictionary_page_encoding());
+ total_bytes_written_ += pager_->WriteDictionaryPage(page);
+ }
+
+ EncodedStatistics GetPageStatistics() override {
+ EncodedStatistics result;
+ if (page_statistics_) result = page_statistics_->Encode();
+ return result;
+ }
+
+ EncodedStatistics GetChunkStatistics() override {
+ EncodedStatistics result;
+ if (chunk_statistics_) result = chunk_statistics_->Encode();
+ return result;
+ }
+
+ void ResetPageStatistics() override {
+ if (chunk_statistics_ != nullptr) {
+ chunk_statistics_->Merge(*page_statistics_);
+ page_statistics_->Reset();
+ }
+ }
+
+ Type::type type() const override { return descr_->physical_type(); }
+
+ const ColumnDescriptor* descr() const override { return descr_; }
+
+ int64_t rows_written() const override { return rows_written_; }
+
+ int64_t total_compressed_bytes() const override { return total_compressed_bytes_; }
+
+ int64_t total_bytes_written() const override { return total_bytes_written_; }
+
+ const WriterProperties* properties() override { return properties_; }
+
+ private:
+ using ValueEncoderType = typename EncodingTraits<DType>::Encoder;
+ using TypedStats = TypedStatistics<DType>;
+ std::unique_ptr<Encoder> current_encoder_;
+ std::shared_ptr<TypedStats> page_statistics_;
+ std::shared_ptr<TypedStats> chunk_statistics_;
+
+ // If writing a sequence of ::arrow::DictionaryArray to the writer, we keep the
+ // dictionary passed to DictEncoder<T>::PutDictionary so we can check
+  // subsequent array chunks to see whether materialization is required (in
+  // which case we fall back to the dense write path).
+ std::shared_ptr<::arrow::Array> preserved_dictionary_;
+
+ int64_t WriteLevels(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels) {
+ int64_t values_to_write = 0;
+ // If the field is required and non-repeated, there are no definition levels
+ if (descr_->max_definition_level() > 0) {
+ for (int64_t i = 0; i < num_values; ++i) {
+ if (def_levels[i] == descr_->max_definition_level()) {
+ ++values_to_write;
+ }
+ }
+
+ WriteDefinitionLevels(num_values, def_levels);
+ } else {
+ // Required field, write all values
+ values_to_write = num_values;
+ }
+
+ // Not present for non-repeated fields
+ if (descr_->max_repetition_level() > 0) {
+ // A row could include more than one value
+ // Count the occasions where we start a new row
+ for (int64_t i = 0; i < num_values; ++i) {
+ if (rep_levels[i] == 0) {
+ rows_written_++;
+ }
+ }
+
+ WriteRepetitionLevels(num_values, rep_levels);
+ } else {
+ // Each value is exactly one row
+ rows_written_ += static_cast<int>(num_values);
+ }
+ return values_to_write;
+ }
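+
+  // Example (illustrative): for an optional, non-repeated column
+  // (max_definition_level == 1), def_levels {1, 0, 1} yields values_to_write == 2,
+  // while rows_written_ advances by 3 since each level is exactly one row.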
+
+ // This method will always update the three output parameters,
+ // out_values_to_write, out_spaced_values_to_write and null_count. Additionally
+ // it will update the validity bitmap if required (i.e. if at least one level
+ // of nullable structs directly precede the leaf node).
+ void MaybeCalculateValidityBits(const int16_t* def_levels, int64_t batch_size,
+ int64_t* out_values_to_write,
+ int64_t* out_spaced_values_to_write,
+ int64_t* null_count) {
+ if (bits_buffer_ == nullptr) {
+ if (level_info_.def_level == 0) {
+ // In this case def levels should be null and we only
+ // need to output counts which will always be equal to
+ // the batch size passed in (max def_level == 0 indicates
+ // there cannot be repeated or null fields).
+ DCHECK_EQ(def_levels, nullptr);
+ *out_values_to_write = batch_size;
+ *out_spaced_values_to_write = batch_size;
+ *null_count = 0;
+ } else {
+ for (int x = 0; x < batch_size; x++) {
+ *out_values_to_write += def_levels[x] == level_info_.def_level ? 1 : 0;
+ *out_spaced_values_to_write +=
+ def_levels[x] >= level_info_.repeated_ancestor_def_level ? 1 : 0;
+ }
+ *null_count = *out_values_to_write - *out_spaced_values_to_write;
+ }
+ return;
+ }
+    // Shrinking to fit possibly causes another allocation, and would only be
+    // necessary on the last batch.
+ int64_t new_bitmap_size = BitUtil::BytesForBits(batch_size);
+ if (new_bitmap_size != bits_buffer_->size()) {
+ PARQUET_THROW_NOT_OK(
+ bits_buffer_->Resize(new_bitmap_size, /*shrink_to_fit=*/false));
+ bits_buffer_->ZeroPadding();
+ }
+ internal::ValidityBitmapInputOutput io;
+ io.valid_bits = bits_buffer_->mutable_data();
+ io.values_read_upper_bound = batch_size;
+ internal::DefLevelsToBitmap(def_levels, batch_size, level_info_, &io);
+ *out_values_to_write = io.values_read - io.null_count;
+ *out_spaced_values_to_write = io.values_read;
+ *null_count = io.null_count;
+ }
+
+ Result<std::shared_ptr<Array>> MaybeReplaceValidity(std::shared_ptr<Array> array,
+ int64_t new_null_count,
+ ::arrow::MemoryPool* memory_pool) {
+ if (bits_buffer_ == nullptr) {
+ return array;
+ }
+ std::vector<std::shared_ptr<Buffer>> buffers = array->data()->buffers;
+ if (buffers.empty()) {
+ return array;
+ }
+ buffers[0] = bits_buffer_;
+ // Should be a leaf array.
+ DCHECK_GT(buffers.size(), 1);
+ ValueBufferSlicer slicer{memory_pool, /*buffer=*/nullptr};
+ if (array->data()->offset > 0) {
+ RETURN_NOT_OK(::arrow::VisitArrayInline(*array, &slicer));
+ buffers[1] = slicer.buffer_;
+ }
+ return ::arrow::MakeArray(std::make_shared<ArrayData>(
+ array->type(), array->length(), std::move(buffers), new_null_count));
+ }
+
+ void WriteLevelsSpaced(int64_t num_levels, const int16_t* def_levels,
+ const int16_t* rep_levels) {
+ // If the field is required and non-repeated, there are no definition levels
+ if (descr_->max_definition_level() > 0) {
+ WriteDefinitionLevels(num_levels, def_levels);
+ }
+ // Not present for non-repeated fields
+ if (descr_->max_repetition_level() > 0) {
+ // A row could include more than one value
+ // Count the occasions where we start a new row
+ for (int64_t i = 0; i < num_levels; ++i) {
+ if (rep_levels[i] == 0) {
+ rows_written_++;
+ }
+ }
+ WriteRepetitionLevels(num_levels, rep_levels);
+ } else {
+ // Each value is exactly one row
+ rows_written_ += static_cast<int>(num_levels);
+ }
+ }
+
+ void CommitWriteAndCheckPageLimit(int64_t num_levels, int64_t num_values) {
+ num_buffered_values_ += num_levels;
+ num_buffered_encoded_values_ += num_values;
+
+ if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) {
+ AddDataPage();
+ }
+ }
+
+ void FallbackToPlainEncoding() {
+ if (IsDictionaryEncoding(current_encoder_->encoding())) {
+ WriteDictionaryPage();
+ // Serialize the buffered Dictionary Indices
+ FlushBufferedDataPages();
+ fallback_ = true;
+ // Only PLAIN encoding is supported for fallback in V1
+ current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_,
+ properties_->memory_pool());
+ encoding_ = Encoding::PLAIN;
+ }
+ }
+
+  // Checks if the dictionary page size limit is reached. If so, the dictionary
+  // and the buffered data pages are serialized and the encoding is switched to
+  // PLAIN; only one dictionary page is ever written per column chunk.
+ void CheckDictionarySizeLimit() {
+ if (!has_dictionary_ || fallback_) {
+ // Either not using dictionary encoding, or we have already fallen back
+ // to PLAIN encoding because the size threshold was reached
+ return;
+ }
+
+    // We have to dynamic_cast here because of TypedEncoder<Type>, as some
+    // compilers don't want to cast through virtual inheritance
+ auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+ if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) {
+ FallbackToPlainEncoding();
+ }
+ }
+
+ void WriteValues(const T* values, int64_t num_values, int64_t num_nulls) {
+ dynamic_cast<ValueEncoderType*>(current_encoder_.get())
+ ->Put(values, static_cast<int>(num_values));
+ if (page_statistics_ != nullptr) {
+ page_statistics_->Update(values, num_values, num_nulls);
+ }
+ }
+
+ void WriteValuesSpaced(const T* values, int64_t num_values, int64_t num_spaced_values,
+ const uint8_t* valid_bits, int64_t valid_bits_offset) {
+ if (num_values != num_spaced_values) {
+ dynamic_cast<ValueEncoderType*>(current_encoder_.get())
+ ->PutSpaced(values, static_cast<int>(num_spaced_values), valid_bits,
+ valid_bits_offset);
+ } else {
+ dynamic_cast<ValueEncoderType*>(current_encoder_.get())
+ ->Put(values, static_cast<int>(num_values));
+ }
+ if (page_statistics_ != nullptr) {
+ const int64_t num_nulls = num_spaced_values - num_values;
+ page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, num_values,
+ num_nulls);
+ }
+ }
+};
+
+template <typename DType>
+Status TypedColumnWriterImpl<DType>::WriteArrowDictionary(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+  // If this is the first time writing a DictionaryArray, then there are
+  // a few possible paths to take:
+ //
+ // - If dictionary encoding is not enabled, convert to densely
+ // encoded and call WriteArrow
+ // - Dictionary encoding enabled
+ // - If this is the first time this is called, then we call
+ // PutDictionary into the encoder and then PutIndices on each
+ // chunk. We store the dictionary that was written in
+ // preserved_dictionary_ so that subsequent calls to this method
+ // can make sure the dictionary has not changed
+ // - On subsequent calls, we have to check whether the dictionary
+ // has changed. If it has, then we trigger the varying
+ // dictionary path and materialize each chunk and then call
+ // WriteArrow with that
+ auto WriteDense = [&] {
+ std::shared_ptr<::arrow::Array> dense_array;
+ RETURN_NOT_OK(
+ ConvertDictionaryToDense(array, properties_->memory_pool(), &dense_array));
+ return WriteArrowDense(def_levels, rep_levels, num_levels, *dense_array, ctx,
+ maybe_parent_nulls);
+ };
+
+ if (!IsDictionaryEncoding(current_encoder_->encoding()) ||
+ !DictionaryDirectWriteSupported(array)) {
+ // No longer dictionary-encoding for whatever reason, maybe we never were
+ // or we decided to stop. Note that WriteArrow can be invoked multiple
+ // times with both dense and dictionary-encoded versions of the same data
+    // without a problem. Any dense data will be hashed to indices until the
+    // dictionary page limit is reached, at which point everything (dictionary
+    // and dense) will fall back to plain encoding.
+ return WriteDense();
+ }
+
+ auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+ const auto& data = checked_cast<const ::arrow::DictionaryArray&>(array);
+ std::shared_ptr<::arrow::Array> dictionary = data.dictionary();
+ std::shared_ptr<::arrow::Array> indices = data.indices();
+
+ int64_t value_offset = 0;
+ auto WriteIndicesChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t batch_num_values = 0;
+ int64_t batch_num_spaced_values = 0;
+ int64_t null_count = ::arrow::kUnknownNullCount;
+    // The validity bitmap is non-null for nullable values. At this point we can't
+    // determine whether the leaf array has the same null values as any parents it
+    // might have had, so we need to recompute it from the def levels.
+ MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
+ &batch_num_values, &batch_num_spaced_values, &null_count);
+ WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+ std::shared_ptr<Array> writeable_indices =
+ indices->Slice(value_offset, batch_num_spaced_values);
+ PARQUET_ASSIGN_OR_THROW(
+ writeable_indices,
+ MaybeReplaceValidity(writeable_indices, null_count, ctx->memory_pool));
+ dict_encoder->PutIndices(*writeable_indices);
+ CommitWriteAndCheckPageLimit(batch_size, batch_num_values);
+ value_offset += batch_num_spaced_values;
+ };
+
+ // Handle seeing dictionary for the first time
+ if (!preserved_dictionary_) {
+ // It's a new dictionary. Call PutDictionary and keep track of it
+ PARQUET_CATCH_NOT_OK(dict_encoder->PutDictionary(*dictionary));
+
+    // If there were duplicate values in the dictionary, the encoder's memo table
+    // will be out of sync with the indices in the Arrow array.
+    // The easiest solution for this uncommon case is to fall back to plain encoding.
+ if (dict_encoder->num_entries() != dictionary->length()) {
+ PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
+ return WriteDense();
+ }
+
+ // TODO(wesm): If some dictionary values are unobserved, then the
+ // statistics will be inaccurate. Do we care enough to fix it?
+ if (page_statistics_ != nullptr) {
+ PARQUET_CATCH_NOT_OK(page_statistics_->Update(*dictionary));
+ }
+ preserved_dictionary_ = dictionary;
+ } else if (!dictionary->Equals(*preserved_dictionary_)) {
+ // Dictionary has changed
+ PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
+ return WriteDense();
+ }
+
+ PARQUET_CATCH_NOT_OK(
+ DoInBatches(num_levels, properties_->write_batch_size(), WriteIndicesChunk));
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Direct Arrow write path
+
+template <typename ParquetType, typename ArrowType, typename Enable = void>
+struct SerializeFunctor {
+ using ArrowCType = typename ArrowType::c_type;
+ using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
+ using ParquetCType = typename ParquetType::c_type;
+ Status Serialize(const ArrayType& array, ArrowWriteContext*, ParquetCType* out) {
+ const ArrowCType* input = array.raw_values();
+ if (array.null_count() > 0) {
+      for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = static_cast<ParquetCType>(input[i]);
+ }
+ } else {
+ std::copy(input, input + array.length(), out);
+ }
+ return Status::OK();
+ }
+};
+
+template <typename ParquetType, typename ArrowType>
+Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels,
+ const int16_t* def_levels, const int16_t* rep_levels,
+ ArrowWriteContext* ctx, TypedColumnWriter<ParquetType>* writer,
+ bool maybe_parent_nulls) {
+ using ParquetCType = typename ParquetType::c_type;
+ using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
+
+ ParquetCType* buffer = nullptr;
+ PARQUET_THROW_NOT_OK(ctx->GetScratchData<ParquetCType>(array.length(), &buffer));
+
+ SerializeFunctor<ParquetType, ArrowType> functor;
+ RETURN_NOT_OK(functor.Serialize(checked_cast<const ArrayType&>(array), ctx, buffer));
+ bool no_nulls =
+ writer->descr()->schema_node()->is_required() || (array.null_count() == 0);
+ if (!maybe_parent_nulls && no_nulls) {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatch(num_levels, def_levels, rep_levels, buffer));
+ } else {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(num_levels, def_levels, rep_levels,
+ array.null_bitmap_data(),
+ array.offset(), buffer));
+ }
+ return Status::OK();
+}
+
+template <typename ParquetType>
+Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels,
+ const int16_t* def_levels, const int16_t* rep_levels,
+ ArrowWriteContext* ctx, TypedColumnWriter<ParquetType>* writer,
+ bool maybe_parent_nulls) {
+ using T = typename ParquetType::c_type;
+ const auto& data = static_cast<const ::arrow::PrimitiveArray&>(array);
+ const T* values = nullptr;
+ // The values buffer may be null if the array is empty (ARROW-2744)
+ if (data.values() != nullptr) {
+ values = reinterpret_cast<const T*>(data.values()->data()) + data.offset();
+ } else {
+ DCHECK_EQ(data.length(), 0);
+ }
+ bool no_nulls =
+ writer->descr()->schema_node()->is_required() || (array.null_count() == 0);
+
+ if (!maybe_parent_nulls && no_nulls) {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatch(num_levels, def_levels, rep_levels, values));
+ } else {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(num_levels, def_levels, rep_levels,
+ data.null_bitmap_data(), data.offset(),
+ values));
+ }
+ return Status::OK();
+}
+
+#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \
+ case ::arrow::Type::ArrowEnum: \
+ return WriteArrowSerialize<ParquetType, ::arrow::ArrowType>( \
+ array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
+
+#define WRITE_ZERO_COPY_CASE(ArrowEnum, ArrowType, ParquetType) \
+ case ::arrow::Type::ArrowEnum: \
+ return WriteArrowZeroCopy<ParquetType>(array, num_levels, def_levels, rep_levels, \
+ ctx, this, maybe_parent_nulls);
+
+#define ARROW_UNSUPPORTED() \
+ std::stringstream ss; \
+ ss << "Arrow type " << array.type()->ToString() \
+ << " cannot be written to Parquet type " << descr_->ToString(); \
+ return Status::Invalid(ss.str());
+
+// ----------------------------------------------------------------------
+// Write Arrow to BooleanType
+
+template <>
+struct SerializeFunctor<BooleanType, ::arrow::BooleanType> {
+ Status Serialize(const ::arrow::BooleanArray& data, ArrowWriteContext*, bool* out) {
+    for (int64_t i = 0; i < data.length(); i++) {
+ *out++ = data.Value(i);
+ }
+ return Status::OK();
+ }
+};
+
+template <>
+Status TypedColumnWriterImpl<BooleanType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::BOOL) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowSerialize<BooleanType, ::arrow::BooleanType>(
+ array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow types to INT32
+
+template <>
+struct SerializeFunctor<Int32Type, ::arrow::Date64Type> {
+ Status Serialize(const ::arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) {
+ const int64_t* input = array.raw_values();
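+    // 86400000 is the number of milliseconds per day: Date64 stores milliseconds
+    // since the UNIX epoch, while the INT32 target stores days since the epoch.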
+    for (int64_t i = 0; i < array.length(); i++) {
+ *out++ = static_cast<int32_t>(*input++ / 86400000);
+ }
+ return Status::OK();
+ }
+};
+
+template <>
+struct SerializeFunctor<Int32Type, ::arrow::Time32Type> {
+ Status Serialize(const ::arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) {
+ const int32_t* input = array.raw_values();
+ const auto& type = static_cast<const ::arrow::Time32Type&>(*array.type());
+ if (type.unit() == ::arrow::TimeUnit::SECOND) {
+      for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = input[i] * 1000;
+ }
+ } else {
+ std::copy(input, input + array.length(), out);
+ }
+ return Status::OK();
+ }
+};
+
+template <>
+Status TypedColumnWriterImpl<Int32Type>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ switch (array.type()->id()) {
+ case ::arrow::Type::NA: {
+ PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, nullptr));
+ } break;
+ WRITE_SERIALIZE_CASE(INT8, Int8Type, Int32Type)
+ WRITE_SERIALIZE_CASE(UINT8, UInt8Type, Int32Type)
+ WRITE_SERIALIZE_CASE(INT16, Int16Type, Int32Type)
+ WRITE_SERIALIZE_CASE(UINT16, UInt16Type, Int32Type)
+ WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int32Type)
+ WRITE_ZERO_COPY_CASE(INT32, Int32Type, Int32Type)
+ WRITE_ZERO_COPY_CASE(DATE32, Date32Type, Int32Type)
+ WRITE_SERIALIZE_CASE(DATE64, Date64Type, Int32Type)
+ WRITE_SERIALIZE_CASE(TIME32, Time32Type, Int32Type)
+ default:
+ ARROW_UNSUPPORTED()
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow to Int64 and Int96
+
+#define INT96_CONVERT_LOOP(ConversionFunction) \
+ for (int64_t i = 0; i < array.length(); i++) ConversionFunction(input[i], &out[i]);
+
+template <>
+struct SerializeFunctor<Int96Type, ::arrow::TimestampType> {
+ Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) {
+ const int64_t* input = array.raw_values();
+ const auto& type = static_cast<const ::arrow::TimestampType&>(*array.type());
+ switch (type.unit()) {
+ case ::arrow::TimeUnit::NANO:
+ INT96_CONVERT_LOOP(internal::NanosecondsToImpalaTimestamp);
+ break;
+ case ::arrow::TimeUnit::MICRO:
+ INT96_CONVERT_LOOP(internal::MicrosecondsToImpalaTimestamp);
+ break;
+ case ::arrow::TimeUnit::MILLI:
+ INT96_CONVERT_LOOP(internal::MillisecondsToImpalaTimestamp);
+ break;
+ case ::arrow::TimeUnit::SECOND:
+ INT96_CONVERT_LOOP(internal::SecondsToImpalaTimestamp);
+ break;
+ }
+ return Status::OK();
+ }
+};
+
+#define COERCE_DIVIDE -1
+#define COERCE_INVALID 0
+#define COERCE_MULTIPLY +1
+
+static std::pair<int, int64_t> kTimestampCoercionFactors[4][4] = {
+ // from seconds ...
+ {{COERCE_INVALID, 0}, // ... to seconds
+ {COERCE_MULTIPLY, 1000}, // ... to millis
+ {COERCE_MULTIPLY, 1000000}, // ... to micros
+ {COERCE_MULTIPLY, INT64_C(1000000000)}}, // ... to nanos
+ // from millis ...
+ {{COERCE_INVALID, 0},
+ {COERCE_MULTIPLY, 1},
+ {COERCE_MULTIPLY, 1000},
+ {COERCE_MULTIPLY, 1000000}},
+ // from micros ...
+ {{COERCE_INVALID, 0},
+ {COERCE_DIVIDE, 1000},
+ {COERCE_MULTIPLY, 1},
+ {COERCE_MULTIPLY, 1000}},
+ // from nanos ...
+ {{COERCE_INVALID, 0},
+ {COERCE_DIVIDE, 1000000},
+ {COERCE_DIVIDE, 1000},
+ {COERCE_MULTIPLY, 1}}};
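+
+// Example (illustrative): coercing micros to millis looks up
+// kTimestampCoercionFactors[TimeUnit::MICRO][TimeUnit::MILLI], i.e.
+// {COERCE_DIVIDE, 1000}: each value is divided by 1000, with a truncation
+// check unless truncated timestamps are allowed.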
+
+template <>
+struct SerializeFunctor<Int64Type, ::arrow::TimestampType> {
+ Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext* ctx,
+ int64_t* out) {
+ const auto& source_type = static_cast<const ::arrow::TimestampType&>(*array.type());
+ auto source_unit = source_type.unit();
+ const int64_t* values = array.raw_values();
+
+ ::arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit();
+ auto target_type = ::arrow::timestamp(target_unit);
+ bool truncation_allowed = ctx->properties->truncated_timestamps_allowed();
+
+ auto DivideBy = [&](const int64_t factor) {
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (!truncation_allowed && array.IsValid(i) && (values[i] % factor != 0)) {
+ return Status::Invalid("Casting from ", source_type.ToString(), " to ",
+ target_type->ToString(),
+ " would lose data: ", values[i]);
+ }
+ out[i] = values[i] / factor;
+ }
+ return Status::OK();
+ };
+
+ auto MultiplyBy = [&](const int64_t factor) {
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = values[i] * factor;
+ }
+ return Status::OK();
+ };
+
+ const auto& coercion = kTimestampCoercionFactors[static_cast<int>(source_unit)]
+ [static_cast<int>(target_unit)];
+
+ // .first -> coercion operation; .second -> scale factor
+ DCHECK_NE(coercion.first, COERCE_INVALID);
+ return coercion.first == COERCE_DIVIDE ? DivideBy(coercion.second)
+ : MultiplyBy(coercion.second);
+ }
+};
+
+#undef COERCE_DIVIDE
+#undef COERCE_INVALID
+#undef COERCE_MULTIPLY
+
+Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels,
+ const int16_t* def_levels, const int16_t* rep_levels,
+ ArrowWriteContext* ctx, TypedColumnWriter<Int64Type>* writer,
+ bool maybe_parent_nulls) {
+ const auto& source_type = static_cast<const ::arrow::TimestampType&>(*values.type());
+
+ auto WriteCoerce = [&](const ArrowWriterProperties* properties) {
+ ArrowWriteContext temp_ctx = *ctx;
+ temp_ctx.properties = properties;
+ return WriteArrowSerialize<Int64Type, ::arrow::TimestampType>(
+ values, num_levels, def_levels, rep_levels, &temp_ctx, writer,
+ maybe_parent_nulls);
+ };
+
+ if (ctx->properties->coerce_timestamps_enabled()) {
+ // User explicitly requested coercion to specific unit
+ if (source_type.unit() == ctx->properties->coerce_timestamps_unit()) {
+ // No data conversion necessary
+ return WriteArrowZeroCopy<Int64Type>(values, num_levels, def_levels, rep_levels,
+ ctx, writer, maybe_parent_nulls);
+ } else {
+ return WriteCoerce(ctx->properties);
+ }
+ } else if (writer->properties()->version() == ParquetVersion::PARQUET_1_0 &&
+ source_type.unit() == ::arrow::TimeUnit::NANO) {
+ // Absent superseding user instructions, when writing Parquet version 1.0 files,
+ // timestamps in nanoseconds are coerced to microseconds
+ std::shared_ptr<ArrowWriterProperties> properties =
+ (ArrowWriterProperties::Builder())
+ .coerce_timestamps(::arrow::TimeUnit::MICRO)
+ ->disallow_truncated_timestamps()
+ ->build();
+ return WriteCoerce(properties.get());
+ } else if (source_type.unit() == ::arrow::TimeUnit::SECOND) {
+ // Absent superseding user instructions, timestamps in seconds are coerced to
+ // milliseconds
+ std::shared_ptr<ArrowWriterProperties> properties =
+ (ArrowWriterProperties::Builder())
+ .coerce_timestamps(::arrow::TimeUnit::MILLI)
+ ->build();
+ return WriteCoerce(properties.get());
+ } else {
+ // No data conversion necessary
+ return WriteArrowZeroCopy<Int64Type>(values, num_levels, def_levels, rep_levels, ctx,
+ writer, maybe_parent_nulls);
+ }
+}
+
+template <>
+Status TypedColumnWriterImpl<Int64Type>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ switch (array.type()->id()) {
+ case ::arrow::Type::TIMESTAMP:
+ return WriteTimestamps(array, num_levels, def_levels, rep_levels, ctx, this,
+ maybe_parent_nulls);
+ WRITE_ZERO_COPY_CASE(INT64, Int64Type, Int64Type)
+ WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int64Type)
+ WRITE_SERIALIZE_CASE(UINT64, UInt64Type, Int64Type)
+ WRITE_ZERO_COPY_CASE(TIME64, Time64Type, Int64Type)
+ default:
+ ARROW_UNSUPPORTED();
+ }
+}
+
+template <>
+Status TypedColumnWriterImpl<Int96Type>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::TIMESTAMP) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowSerialize<Int96Type, ::arrow::TimestampType>(
+ array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
+}
+
+// ----------------------------------------------------------------------
+// Floating point types
+
+template <>
+Status TypedColumnWriterImpl<FloatType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::FLOAT) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowZeroCopy<FloatType>(array, num_levels, def_levels, rep_levels, ctx,
+ this, maybe_parent_nulls);
+}
+
+template <>
+Status TypedColumnWriterImpl<DoubleType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::DOUBLE) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowZeroCopy<DoubleType>(array, num_levels, def_levels, rep_levels, ctx,
+ this, maybe_parent_nulls);
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow to BYTE_ARRAY
+
+template <>
+Status TypedColumnWriterImpl<ByteArrayType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (!::arrow::is_base_binary_like(array.type()->id())) {
+ ARROW_UNSUPPORTED();
+ }
+
+ int64_t value_offset = 0;
+ auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t batch_num_values = 0;
+ int64_t batch_num_spaced_values = 0;
+ int64_t null_count = 0;
+
+ MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
+ &batch_num_values, &batch_num_spaced_values, &null_count);
+ WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+ std::shared_ptr<Array> data_slice =
+ array.Slice(value_offset, batch_num_spaced_values);
+ PARQUET_ASSIGN_OR_THROW(
+ data_slice, MaybeReplaceValidity(data_slice, null_count, ctx->memory_pool));
+
+ current_encoder_->Put(*data_slice);
+ if (page_statistics_ != nullptr) {
+ page_statistics_->Update(*data_slice);
+ }
+ CommitWriteAndCheckPageLimit(batch_size, batch_num_values);
+ CheckDictionarySizeLimit();
+ value_offset += batch_num_spaced_values;
+ };
+
+ PARQUET_CATCH_NOT_OK(
+ DoInBatches(num_levels, properties_->write_batch_size(), WriteChunk));
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow to FIXED_LEN_BYTE_ARRAY
+
+template <typename ParquetType, typename ArrowType>
+struct SerializeFunctor<
+ ParquetType, ArrowType,
+ ::arrow::enable_if_t<::arrow::is_fixed_size_binary_type<ArrowType>::value &&
+ !::arrow::is_decimal_type<ArrowType>::value>> {
+ Status Serialize(const ::arrow::FixedSizeBinaryArray& array, ArrowWriteContext*,
+ FLBA* out) {
+ if (array.null_count() == 0) {
+ // no nulls, just dump the data
+ // todo(advancedxy): use a writeBatch to avoid this step
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = FixedLenByteArray(array.GetValue(i));
+ }
+ } else {
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (array.IsValid(i)) {
+ out[i] = FixedLenByteArray(array.GetValue(i));
+ }
+ }
+ }
+ return Status::OK();
+ }
+};
+
+// ----------------------------------------------------------------------
+// Write Arrow to Decimal128
+
+// Requires a custom serializer because decimals in Parquet are stored in
+// big-endian format. Thus, a temporary local buffer is required.
+template <typename ParquetType, typename ArrowType>
+struct SerializeFunctor<ParquetType, ArrowType, ::arrow::enable_if_decimal<ArrowType>> {
+ Status Serialize(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
+ ArrowWriteContext* ctx, FLBA* out) {
+ AllocateScratch(array, ctx);
+ auto offset = Offset(array);
+
+ if (array.null_count() == 0) {
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = FixDecimalEndianess<ArrowType::kByteWidth>(array.GetValue(i), offset);
+ }
+ } else {
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = array.IsValid(i) ? FixDecimalEndianess<ArrowType::kByteWidth>(
+ array.GetValue(i), offset)
+ : FixedLenByteArray();
+ }
+ }
+
+ return Status::OK();
+ }
+
+  // Parquet's Decimals are stored as FixedLength values whose length is
+  // proportional to the precision. Arrow's Decimals are always stored with 16 or
+  // 32 bytes. Thus the internal FLBA pointer must be adjusted by the offset
+  // calculated here.
+ int32_t Offset(const Array& array) {
+ auto decimal_type = checked_pointer_cast<::arrow::DecimalType>(array.type());
+ return decimal_type->byte_width() -
+ ::arrow::DecimalType::DecimalSize(decimal_type->precision());
+ }
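+
+  // Worked example (illustrative, assuming Decimal128): precision 5 occupies
+  // DecimalSize(5) == 3 bytes in Parquet, so Offset returns 16 - 3 == 13 and
+  // the returned FLBA points 13 bytes into the 16-byte big-endian scratch value.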
+
+ void AllocateScratch(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
+ ArrowWriteContext* ctx) {
+ int64_t non_null_count = array.length() - array.null_count();
+ int64_t size = non_null_count * ArrowType::kByteWidth;
+ scratch_buffer = AllocateBuffer(ctx->memory_pool, size);
+ scratch = reinterpret_cast<int64_t*>(scratch_buffer->mutable_data());
+ }
+
+ template <int byte_width>
+ FixedLenByteArray FixDecimalEndianess(const uint8_t* in, int64_t offset) {
+ const auto* u64_in = reinterpret_cast<const int64_t*>(in);
+ auto out = reinterpret_cast<const uint8_t*>(scratch) + offset;
+ static_assert(byte_width == 16 || byte_width == 32,
+ "only 16 and 32 byte Decimals supported");
+ if (byte_width == 32) {
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[3]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[2]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[1]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[0]);
+ } else {
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[1]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[0]);
+ }
+ return FixedLenByteArray(out);
+ }
+
+ std::shared_ptr<ResizableBuffer> scratch_buffer;
+ int64_t* scratch;
+};
+
+template <>
+Status TypedColumnWriterImpl<FLBAType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ switch (array.type()->id()) {
+ WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType)
+ WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType)
+ WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType)
+    default:
+      ARROW_UNSUPPORTED();
+  }
+  return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Dynamic column writer constructor
+
+std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* metadata,
+ std::unique_ptr<PageWriter> pager,
+ const WriterProperties* properties) {
+ const ColumnDescriptor* descr = metadata->descr();
+ const bool use_dictionary = properties->dictionary_enabled(descr->path()) &&
+ descr->physical_type() != Type::BOOLEAN;
+ Encoding::type encoding = properties->encoding(descr->path());
+ if (use_dictionary) {
+ encoding = properties->dictionary_index_encoding();
+ }
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedColumnWriterImpl<BooleanType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::INT32:
+ return std::make_shared<TypedColumnWriterImpl<Int32Type>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::INT64:
+ return std::make_shared<TypedColumnWriterImpl<Int64Type>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::INT96:
+ return std::make_shared<TypedColumnWriterImpl<Int96Type>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::FLOAT:
+ return std::make_shared<TypedColumnWriterImpl<FloatType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::DOUBLE:
+ return std::make_shared<TypedColumnWriterImpl<DoubleType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedColumnWriterImpl<ByteArrayType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedColumnWriterImpl<FLBAType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ default:
+      ParquetException::NYI("type writer not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return std::shared_ptr<ColumnWriter>(nullptr);
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h
new file mode 100644
index 00000000000..0a609021739
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h
@@ -0,0 +1,270 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+
+namespace BitUtil {
+class BitWriter;
+} // namespace BitUtil
+
+namespace util {
+class RleEncoder;
+} // namespace util
+
+} // namespace arrow
+
+namespace parquet {
+
+struct ArrowWriteContext;
+class ColumnDescriptor;
+class DataPage;
+class DictionaryPage;
+class ColumnChunkMetaDataBuilder;
+class Encryptor;
+class WriterProperties;
+
+class PARQUET_EXPORT LevelEncoder {
+ public:
+ LevelEncoder();
+ ~LevelEncoder();
+
+ static int MaxBufferSize(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values);
+
+ // Initialize the LevelEncoder.
+ void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
+ uint8_t* data, int data_size);
+
+ // Encodes a batch of levels from an array and returns the number of levels encoded
+ int Encode(int batch_size, const int16_t* levels);
+
+ int32_t len() {
+ if (encoding_ != Encoding::RLE) {
+ throw ParquetException("Only implemented for RLE encoding");
+ }
+ return rle_length_;
+ }
+
+ private:
+ int bit_width_;
+ int rle_length_;
+ Encoding::type encoding_;
+ std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_;
+ std::unique_ptr<::arrow::BitUtil::BitWriter> bit_packed_encoder_;
+};
+
+class PARQUET_EXPORT PageWriter {
+ public:
+ virtual ~PageWriter() {}
+
+ static std::unique_ptr<PageWriter> Open(
+ std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+ bool buffered_row_group = false,
+ std::shared_ptr<Encryptor> header_encryptor = NULLPTR,
+ std::shared_ptr<Encryptor> data_encryptor = NULLPTR);
+
+  // The ColumnWriter decides whether dictionary encoding is used (has_dictionary)
+  // and whether the dictionary encoding has fallen back to the default encoding
+  // on reaching the dictionary page limit (fallback)
+ virtual void Close(bool has_dictionary, bool fallback) = 0;
+
+ // Return the number of uncompressed bytes written (including header size)
+ virtual int64_t WriteDataPage(const DataPage& page) = 0;
+
+ // Return the number of uncompressed bytes written (including header size)
+ virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0;
+
+ virtual bool has_compressor() = 0;
+
+ virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0;
+};
+
+static constexpr int WRITE_BATCH_SIZE = 1000;
+class PARQUET_EXPORT ColumnWriter {
+ public:
+ virtual ~ColumnWriter() = default;
+
+ static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
+ std::unique_ptr<PageWriter>,
+ const WriterProperties* properties);
+
+ /// \brief Closes the ColumnWriter, commits any buffered values to pages.
+ /// \return Total size of the column in bytes
+ virtual int64_t Close() = 0;
+
+ /// \brief The physical Parquet type of the column
+ virtual Type::type type() const = 0;
+
+ /// \brief The schema for the column
+ virtual const ColumnDescriptor* descr() const = 0;
+
+ /// \brief The number of rows written so far
+ virtual int64_t rows_written() const = 0;
+
+ /// \brief The total size of the compressed pages + page headers. Some values
+  /// might still be buffered and not written to a page yet
+ virtual int64_t total_compressed_bytes() const = 0;
+
+ /// \brief The total number of bytes written as serialized data and
+ /// dictionary pages to the ColumnChunk so far
+ virtual int64_t total_bytes_written() const = 0;
+
+ /// \brief The file-level writer properties
+ virtual const WriterProperties* properties() = 0;
+
+ /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
+ /// error status if the array data type is not compatible with the concrete
+ /// writer type.
+ ///
+  /// leaf_array is always a primitive array (possibly of dictionary-encoded type).
+  /// leaf_field_nullable indicates whether the leaf array is considered nullable
+  /// according to its schema in a Table or in its parent array.
+ virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& leaf_array,
+ ArrowWriteContext* ctx,
+ bool leaf_field_nullable) = 0;
+};
+
+// API to write values to a single column. This is the main client facing API.
+template <typename DType>
+class TypedColumnWriter : public ColumnWriter {
+ public:
+ using T = typename DType::c_type;
+
+ // Write a batch of repetition levels, definition levels, and values to the
+ // column.
+ // `num_values` is the number of logical leaf values.
+ // `def_levels` (resp. `rep_levels`) can be null if the column's max definition level
+ // (resp. max repetition level) is 0.
+  // If not null, each of `def_levels` and `rep_levels` must contain at least
+  // `num_values` entries.
+  //
+  // The number of physical values written (taken from `values`) is returned.
+  // It can be smaller than `num_values` if there are some undefined values.
+ virtual int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const T* values) = 0;
+
+ /// Write a batch of repetition levels, definition levels, and values to the
+ /// column.
+ ///
+  /// In comparison to WriteBatch, the length of the repetition and definition
+  /// levels is the same as the number of values when max_definition_level == 1.
+  /// When max_definition_level > 1, the repetition and definition levels are
+  /// longer than the values, but the values include the null entries with
+  /// definition_level == (max_definition_level - 1). Thus we have to differentiate
+  /// in the parameters of this function whether the input has the length of
+  /// num_values or the _number of rows in the lowest nesting level_.
+  ///
+  /// If the inner-most node in the Parquet schema is required, the _number of rows
+  /// in the lowest nesting level_ is equal to the number of non-null values. If the
+  /// inner-most schema node is optional, the _number of rows in the lowest nesting
+  /// level_ also includes all values with definition_level == (max_definition_level - 1).
+ ///
+ /// @param num_values number of levels to write.
+ /// @param def_levels The Parquet definition levels, length is num_values
+ /// @param rep_levels The Parquet repetition levels, length is num_values
+  /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
+  /// level. The length is the number of rows in the lowest nesting level.
+ /// @param valid_bits_offset The offset in bits of the valid_bits where the
+ /// first relevant bit resides.
+ /// @param values The values in the lowest nested level including
+ /// spacing for nulls on the lowest levels; input has the length
+ /// of the number of rows on the lowest nesting level.
+ virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, const T* values) = 0;
+
+ // Estimated size of the values that are not written to a page yet
+ virtual int64_t EstimatedBufferedValueBytes() const = 0;
+};
+
+using BoolWriter = TypedColumnWriter<BooleanType>;
+using Int32Writer = TypedColumnWriter<Int32Type>;
+using Int64Writer = TypedColumnWriter<Int64Type>;
+using Int96Writer = TypedColumnWriter<Int96Type>;
+using FloatWriter = TypedColumnWriter<FloatType>;
+using DoubleWriter = TypedColumnWriter<DoubleType>;
+using ByteArrayWriter = TypedColumnWriter<ByteArrayType>;
+using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>;
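+
+// A minimal usage sketch (hypothetical setup: `metadata`, `pager` and `properties`
+// would come from a RowGroupMetaDataBuilder, PageWriter::Open and
+// WriterProperties::Builder, respectively):
+//
+//   std::shared_ptr<ColumnWriter> writer =
+//       ColumnWriter::Make(metadata, std::move(pager), properties);
+//   auto* typed = static_cast<Int64Writer*>(writer.get());
+//   int64_t values[] = {1, 2, 3};
+//   // Required, non-repeated column: def/rep levels may be null.
+//   typed->WriteBatch(3, /*def_levels=*/nullptr, /*rep_levels=*/nullptr, values);
+//   writer->Close();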
+
+namespace internal {
+
+/**
+ * Timestamp conversion constants
+ */
+constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588);
+
+template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
+inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) {
+ int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
+ (*impala_timestamp).value[2] = (uint32_t)julian_days;
+
+ int64_t last_day_units = time % UnitPerDay;
+ auto last_day_nanos = last_day_units * NanosecondsPerUnit;
+ // impala_timestamp will be unaligned every other entry so do memcpy instead
+ // of assign and reinterpret cast to avoid undefined behavior.
+ std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
+}
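+
+// Worked example (illustrative): SecondsToImpalaTimestamp(86401) stores the
+// Julian day 86401 / 86400 + 2440588 == 2440589 in value[2], and
+// (86401 % 86400) * 1000000000 == 1000000000 nanoseconds in the low 8 bytes.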
+
+constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
+
+inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds,
+ impala_timestamp);
+}
+
+constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000);
+
+inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds,
+ Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>(
+ milliseconds, impala_timestamp);
+}
+
+constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000);
+
+inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds,
+ Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>(
+ microseconds, impala_timestamp);
+}
+
+constexpr int64_t kNanosecondsInNanos = INT64_C(1);
+
+inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds,
+ Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>(
+ nanoseconds, impala_timestamp);
+}
+
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc
new file mode 100644
index 00000000000..6e8f7ee5491
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc
@@ -0,0 +1,2547 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encoding.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/stl_allocator.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/byte_stream_split.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/rle_encoding.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace BitUtil = arrow::BitUtil;
+
+using arrow::Status;
+using arrow::VisitNullBitmapInline;
+using arrow::internal::checked_cast;
+
+template <typename T>
+using ArrowPoolVector = std::vector<T, ::arrow::stl::allocator<T>>;
+
+namespace parquet {
+namespace {
+
+constexpr int64_t kInMemoryDefaultCapacity = 1024;
+// The Parquet spec isn't very clear whether ByteArray lengths are signed or
+// unsigned, but the Java implementation uses signed ints.
+constexpr size_t kMaxByteArraySize = std::numeric_limits<int32_t>::max();
+
+class EncoderImpl : virtual public Encoder {
+ public:
+ EncoderImpl(const ColumnDescriptor* descr, Encoding::type encoding, MemoryPool* pool)
+ : descr_(descr),
+ encoding_(encoding),
+ pool_(pool),
+ type_length_(descr ? descr->type_length() : -1) {}
+
+ Encoding::type encoding() const override { return encoding_; }
+
+ MemoryPool* memory_pool() const override { return pool_; }
+
+ protected:
+ // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
+ const ColumnDescriptor* descr_;
+ const Encoding::type encoding_;
+ MemoryPool* pool_;
+
+ /// Type length from descr
+ int type_length_;
+};
+
+// ----------------------------------------------------------------------
+// Plain encoder implementation
+
+template <typename DType>
+class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool)
+ : EncoderImpl(descr, Encoding::PLAIN, pool), sink_(pool) {}
+
+ int64_t EstimatedDataEncodedSize() override { return sink_.length(); }
+
+ std::shared_ptr<Buffer> FlushValues() override {
+ std::shared_ptr<Buffer> buffer;
+ PARQUET_THROW_NOT_OK(sink_.Finish(&buffer));
+ return buffer;
+ }
+
+ using TypedEncoder<DType>::Put;
+
+ void Put(const T* buffer, int num_values) override;
+
+ void Put(const ::arrow::Array& values) override;
+
+ void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ if (valid_bits != NULLPTR) {
+ PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
+ this->memory_pool()));
+ T* data = reinterpret_cast<T*>(buffer->mutable_data());
+ int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
+ src, num_values, valid_bits, valid_bits_offset, data);
+ Put(data, num_valid_values);
+ } else {
+ Put(src, num_values);
+ }
+ }
+
+ void UnsafePutByteArray(const void* data, uint32_t length) {
+ DCHECK(length == 0 || data != nullptr) << "Value ptr cannot be NULL";
+ sink_.UnsafeAppend(&length, sizeof(uint32_t));
+ sink_.UnsafeAppend(data, static_cast<int64_t>(length));
+ }
+
+ void Put(const ByteArray& val) {
+ // Write the result to the output stream
+ const int64_t increment = static_cast<int64_t>(val.len + sizeof(uint32_t));
+ if (ARROW_PREDICT_FALSE(sink_.length() + increment > sink_.capacity())) {
+ PARQUET_THROW_NOT_OK(sink_.Reserve(increment));
+ }
+ UnsafePutByteArray(val.ptr, val.len);
+ }
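+
+  // On the wire, a PLAIN-encoded BYTE_ARRAY value is a 4-byte length prefix
+  // followed by the raw bytes, e.g. "abc" becomes 03 00 00 00 61 62 63 (the
+  // Parquet spec mandates a little-endian length; this code appends it in host
+  // byte order, which matches on little-endian machines).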
+
+ protected:
+ template <typename ArrayType>
+ void PutBinaryArray(const ArrayType& array) {
+ const int64_t total_bytes =
+ array.value_offset(array.length()) - array.value_offset(0);
+ PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes + array.length() * sizeof(uint32_t)));
+
+ PARQUET_THROW_NOT_OK(::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
+ *array.data(),
+ [&](::arrow::util::string_view view) {
+ if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
+ return Status::Invalid("Parquet cannot store strings with size 2GB or more");
+ }
+ UnsafePutByteArray(view.data(), static_cast<uint32_t>(view.size()));
+ return Status::OK();
+ },
+ []() { return Status::OK(); }));
+ }
+
+ ::arrow::BufferBuilder sink_;
+};
+
+template <typename DType>
+void PlainEncoder<DType>::Put(const T* buffer, int num_values) {
+ if (num_values > 0) {
+ PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
+ }
+}
+
+template <>
+inline void PlainEncoder<ByteArrayType>::Put(const ByteArray* src, int num_values) {
+ for (int i = 0; i < num_values; ++i) {
+ Put(src[i]);
+ }
+}
+
+template <typename ArrayType>
+void DirectPutImpl(const ::arrow::Array& values, ::arrow::BufferBuilder* sink) {
+ if (values.type_id() != ArrayType::TypeClass::type_id) {
+ std::string type_name = ArrayType::TypeClass::type_name();
+ throw ParquetException("direct put to " + type_name + " from " +
+ values.type()->ToString() + " not supported");
+ }
+
+ using value_type = typename ArrayType::value_type;
+ constexpr auto value_size = sizeof(value_type);
+ auto raw_values = checked_cast<const ArrayType&>(values).raw_values();
+
+ if (values.null_count() == 0) {
+ // no nulls, just dump the data
+ PARQUET_THROW_NOT_OK(sink->Append(raw_values, values.length() * value_size));
+ } else {
+ PARQUET_THROW_NOT_OK(
+ sink->Reserve((values.length() - values.null_count()) * value_size));
+
+ for (int64_t i = 0; i < values.length(); i++) {
+ if (values.IsValid(i)) {
+ sink->UnsafeAppend(&raw_values[i], value_size);
+ }
+ }
+ }
+}
+
+template <>
+void PlainEncoder<Int32Type>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::Int32Array>(values, &sink_);
+}
+
+template <>
+void PlainEncoder<Int64Type>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::Int64Array>(values, &sink_);
+}
+
+template <>
+void PlainEncoder<Int96Type>::Put(const ::arrow::Array& values) {
+ ParquetException::NYI("direct put to Int96");
+}
+
+template <>
+void PlainEncoder<FloatType>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::FloatArray>(values, &sink_);
+}
+
+template <>
+void PlainEncoder<DoubleType>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::DoubleArray>(values, &sink_);
+}
+
+template <typename DType>
+void PlainEncoder<DType>::Put(const ::arrow::Array& values) {
+ ParquetException::NYI("direct put of " + values.type()->ToString());
+}
+
+void AssertBaseBinary(const ::arrow::Array& values) {
+ if (!::arrow::is_base_binary_like(values.type_id())) {
+ throw ParquetException("Only BaseBinaryArray and subclasses supported");
+ }
+}
+
+template <>
+inline void PlainEncoder<ByteArrayType>::Put(const ::arrow::Array& values) {
+ AssertBaseBinary(values);
+
+ if (::arrow::is_binary_like(values.type_id())) {
+ PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
+ }
+}
+
+void AssertFixedSizeBinary(const ::arrow::Array& values, int type_length) {
+ if (values.type_id() != ::arrow::Type::FIXED_SIZE_BINARY &&
+ values.type_id() != ::arrow::Type::DECIMAL) {
+ throw ParquetException("Only FixedSizeBinaryArray and subclasses supported");
+ }
+ if (checked_cast<const ::arrow::FixedSizeBinaryType&>(*values.type()).byte_width() !=
+ type_length) {
+ throw ParquetException("Size mismatch: " + values.type()->ToString() +
+ " should have been " + std::to_string(type_length) + " wide");
+ }
+}
+
+template <>
+inline void PlainEncoder<FLBAType>::Put(const ::arrow::Array& values) {
+ AssertFixedSizeBinary(values, descr_->type_length());
+ const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
+
+ if (data.null_count() == 0) {
+ // no nulls, just dump the data
+ PARQUET_THROW_NOT_OK(
+ sink_.Append(data.raw_values(), data.length() * data.byte_width()));
+ } else {
+ const int64_t total_bytes =
+ data.length() * data.byte_width() - data.null_count() * data.byte_width();
+ PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes));
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ sink_.UnsafeAppend(data.Value(i), data.byte_width());
+ }
+ }
+ }
+}
+
+template <>
+inline void PlainEncoder<FLBAType>::Put(const FixedLenByteArray* src, int num_values) {
+ if (descr_->type_length() == 0) {
+ return;
+ }
+ for (int i = 0; i < num_values; ++i) {
+ // Write the result to the output stream
+ DCHECK(src[i].ptr != nullptr) << "Value ptr cannot be NULL";
+ PARQUET_THROW_NOT_OK(sink_.Append(src[i].ptr, descr_->type_length()));
+ }
+}
+
+template <>
+class PlainEncoder<BooleanType> : public EncoderImpl, virtual public BooleanEncoder {
+ public:
+ explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool)
+ : EncoderImpl(descr, Encoding::PLAIN, pool),
+ bits_available_(kInMemoryDefaultCapacity * 8),
+ bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)),
+ sink_(pool),
+ bit_writer_(bits_buffer_->mutable_data(),
+ static_cast<int>(bits_buffer_->size())) {}
+
+ int64_t EstimatedDataEncodedSize() override;
+ std::shared_ptr<Buffer> FlushValues() override;
+
+ void Put(const bool* src, int num_values) override;
+
+ void Put(const std::vector<bool>& src, int num_values) override;
+
+ void PutSpaced(const bool* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ if (valid_bits != NULLPTR) {
+ PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
+ this->memory_pool()));
+ T* data = reinterpret_cast<T*>(buffer->mutable_data());
+ int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
+ src, num_values, valid_bits, valid_bits_offset, data);
+ Put(data, num_valid_values);
+ } else {
+ Put(src, num_values);
+ }
+ }
+
+ void Put(const ::arrow::Array& values) override {
+ if (values.type_id() != ::arrow::Type::BOOL) {
+ throw ParquetException("direct put to boolean from " + values.type()->ToString() +
+ " not supported");
+ }
+
+ const auto& data = checked_cast<const ::arrow::BooleanArray&>(values);
+ if (data.null_count() == 0) {
+ PARQUET_THROW_NOT_OK(sink_.Reserve(BitUtil::BytesForBits(data.length())));
+ // no nulls, just dump the data
+ ::arrow::internal::CopyBitmap(data.data()->GetValues<uint8_t>(1), data.offset(),
+ data.length(), sink_.mutable_data(), sink_.length());
+ } else {
+ auto n_valid = BitUtil::BytesForBits(data.length() - data.null_count());
+ PARQUET_THROW_NOT_OK(sink_.Reserve(n_valid));
+ ::arrow::internal::FirstTimeBitmapWriter writer(sink_.mutable_data(),
+ sink_.length(), n_valid);
+
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ if (data.Value(i)) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
+ writer.Next();
+ }
+ }
+ writer.Finish();
+ }
+ sink_.UnsafeAdvance(data.length());
+ }
+
+ private:
+ int bits_available_;
+ std::shared_ptr<ResizableBuffer> bits_buffer_;
+ ::arrow::BufferBuilder sink_;
+ ::arrow::BitUtil::BitWriter bit_writer_;
+
+ template <typename SequenceType>
+ void PutImpl(const SequenceType& src, int num_values);
+};
+
+template <typename SequenceType>
+void PlainEncoder<BooleanType>::PutImpl(const SequenceType& src, int num_values) {
+ int bit_offset = 0;
+ if (bits_available_ > 0) {
+ int bits_to_write = std::min(bits_available_, num_values);
+ for (int i = 0; i < bits_to_write; i++) {
+ bit_writer_.PutValue(src[i], 1);
+ }
+ bits_available_ -= bits_to_write;
+ bit_offset = bits_to_write;
+
+ if (bits_available_ == 0) {
+ bit_writer_.Flush();
+ PARQUET_THROW_NOT_OK(
+ sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
+ bit_writer_.Clear();
+ }
+ }
+
+ int bits_remaining = num_values - bit_offset;
+ while (bit_offset < num_values) {
+ bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
+
+ int bits_to_write = std::min(bits_available_, bits_remaining);
+ for (int i = bit_offset; i < bit_offset + bits_to_write; i++) {
+ bit_writer_.PutValue(src[i], 1);
+ }
+ bit_offset += bits_to_write;
+ bits_available_ -= bits_to_write;
+ bits_remaining -= bits_to_write;
+
+ if (bits_available_ == 0) {
+ bit_writer_.Flush();
+ PARQUET_THROW_NOT_OK(
+ sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
+ bit_writer_.Clear();
+ }
+ }
+}
+
+int64_t PlainEncoder<BooleanType>::EstimatedDataEncodedSize() {
+ int64_t position = sink_.length();
+ return position + bit_writer_.bytes_written();
+}
+
+std::shared_ptr<Buffer> PlainEncoder<BooleanType>::FlushValues() {
+ if (bits_available_ > 0) {
+ bit_writer_.Flush();
+ PARQUET_THROW_NOT_OK(sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
+ bit_writer_.Clear();
+ bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
+ }
+
+ std::shared_ptr<Buffer> buffer;
+ PARQUET_THROW_NOT_OK(sink_.Finish(&buffer));
+ return buffer;
+}
+
+void PlainEncoder<BooleanType>::Put(const bool* src, int num_values) {
+ PutImpl(src, num_values);
+}
+
+void PlainEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
+ PutImpl(src, num_values);
+}
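+
+// Illustrative sketch (not part of the original change): PLAIN-encoded
+// booleans are bit-packed one value per bit, LSB-first within each byte,
+// which is what PutImpl's BitWriter produces above. ToyPlainEncodeBooleans
+// is a hypothetical standalone model, not the library API.
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+std::vector<uint8_t> ToyPlainEncodeBooleans(const std::vector<bool>& vals) {
+  std::vector<uint8_t> out((vals.size() + 7) / 8, 0);
+  for (std::size_t i = 0; i < vals.size(); ++i) {
+    if (vals[i]) {
+      out[i / 8] |= static_cast<uint8_t>(1u << (i % 8));  // LSB-first packing
+    }
+  }
+  return out;  // {true, false, true, true} -> 0b00001101
+}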
+
+// ----------------------------------------------------------------------
+// DictEncoder<T> implementations
+
+template <typename DType>
+struct DictEncoderTraits {
+ using c_type = typename DType::c_type;
+ using MemoTableType = ::arrow::internal::ScalarMemoTable<c_type>;
+};
+
+template <>
+struct DictEncoderTraits<ByteArrayType> {
+ using MemoTableType = ::arrow::internal::BinaryMemoTable<::arrow::BinaryBuilder>;
+};
+
+template <>
+struct DictEncoderTraits<FLBAType> {
+ using MemoTableType = ::arrow::internal::BinaryMemoTable<::arrow::BinaryBuilder>;
+};
+
+// Initially 1024 elements
+static constexpr int32_t kInitialHashTableSize = 1 << 10;
+
+/// See the dictionary encoding section of
+/// https://github.com/Parquet/parquet-format. The encoding supports
+/// streaming: values are encoded as they are added while the
+/// dictionary is being constructed. At any time, the buffered values
+/// can be written out with the current dictionary size. More values
+/// can then be added to the encoder, including new dictionary
+/// entries.
+template <typename DType>
+class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
+ using MemoTableType = typename DictEncoderTraits<DType>::MemoTableType;
+
+ public:
+ typedef typename DType::c_type T;
+
+ explicit DictEncoderImpl(const ColumnDescriptor* desc, MemoryPool* pool)
+ : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool),
+ buffered_indices_(::arrow::stl::allocator<int32_t>(pool)),
+ dict_encoded_size_(0),
+ memo_table_(pool, kInitialHashTableSize) {}
+
+ ~DictEncoderImpl() override { DCHECK(buffered_indices_.empty()); }
+
+ int dict_encoded_size() override { return dict_encoded_size_; }
+
+ int WriteIndices(uint8_t* buffer, int buffer_len) override {
+ // Write bit width in first byte
+ *buffer = static_cast<uint8_t>(bit_width());
+ ++buffer;
+ --buffer_len;
+
+ ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width());
+
+ for (int32_t index : buffered_indices_) {
+ if (!encoder.Put(index)) return -1;
+ }
+ encoder.Flush();
+
+ ClearIndices();
+ return 1 + encoder.len();
+ }
+
+ void set_type_length(int type_length) { this->type_length_ = type_length; }
+
+ /// Returns a conservative estimate of the number of bytes needed to encode the buffered
+ /// indices. Used to size the buffer passed to WriteIndices().
+ int64_t EstimatedDataEncodedSize() override {
+    // Note: because of the way RleEncoder::CheckBufferFull() is called, we have
+    // to reserve an extra "RleEncoder::MinBufferSize" bytes. These extra bytes
+    // won't be used but not reserving them would cause the encoder to fail.
+ return 1 +
+ ::arrow::util::RleEncoder::MaxBufferSize(
+ bit_width(), static_cast<int>(buffered_indices_.size())) +
+ ::arrow::util::RleEncoder::MinBufferSize(bit_width());
+ }
+
+ /// The minimum bit width required to encode the currently buffered indices.
+ int bit_width() const override {
+ if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0;
+ if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1;
+ return BitUtil::Log2(num_entries());
+ }
+
+ /// Encode value. Note that this does not actually write any data, just
+ /// buffers the value's index to be written later.
+ inline void Put(const T& value);
+
+ // Not implemented for other data types
+ inline void PutByteArray(const void* ptr, int32_t length);
+
+ void Put(const T* src, int num_values) override {
+ for (int32_t i = 0; i < num_values; i++) {
+ Put(src[i]);
+ }
+ }
+
+ void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ ::arrow::internal::VisitSetBitRunsVoid(valid_bits, valid_bits_offset, num_values,
+ [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; i++) {
+ Put(src[i + position]);
+ }
+ });
+ }
+
+ using TypedEncoder<DType>::Put;
+
+ void Put(const ::arrow::Array& values) override;
+ void PutDictionary(const ::arrow::Array& values) override;
+
+ template <typename ArrowType, typename T = typename ArrowType::c_type>
+ void PutIndicesTyped(const ::arrow::Array& data) {
+ auto values = data.data()->GetValues<T>(1);
+ size_t buffer_position = buffered_indices_.size();
+ buffered_indices_.resize(buffer_position +
+ static_cast<size_t>(data.length() - data.null_count()));
+ ::arrow::internal::VisitSetBitRunsVoid(
+ data.null_bitmap_data(), data.offset(), data.length(),
+ [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; ++i) {
+ buffered_indices_[buffer_position++] =
+ static_cast<int32_t>(values[i + position]);
+ }
+ });
+ }
+
+ void PutIndices(const ::arrow::Array& data) override {
+ switch (data.type()->id()) {
+ case ::arrow::Type::UINT8:
+ case ::arrow::Type::INT8:
+ return PutIndicesTyped<::arrow::UInt8Type>(data);
+ case ::arrow::Type::UINT16:
+ case ::arrow::Type::INT16:
+ return PutIndicesTyped<::arrow::UInt16Type>(data);
+ case ::arrow::Type::UINT32:
+ case ::arrow::Type::INT32:
+ return PutIndicesTyped<::arrow::UInt32Type>(data);
+ case ::arrow::Type::UINT64:
+ case ::arrow::Type::INT64:
+ return PutIndicesTyped<::arrow::UInt64Type>(data);
+ default:
+ throw ParquetException("Passed non-integer array to PutIndices");
+ }
+ }
+
+ std::shared_ptr<Buffer> FlushValues() override {
+ std::shared_ptr<ResizableBuffer> buffer =
+ AllocateBuffer(this->pool_, EstimatedDataEncodedSize());
+ int result_size = WriteIndices(buffer->mutable_data(),
+ static_cast<int>(EstimatedDataEncodedSize()));
+ PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false));
+ return std::move(buffer);
+ }
+
+ /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
+ /// dict_encoded_size() bytes.
+ void WriteDict(uint8_t* buffer) override;
+
+ /// The number of entries in the dictionary.
+ int num_entries() const override { return memo_table_.size(); }
+
+ private:
+ /// Clears all the indices (but leaves the dictionary).
+ void ClearIndices() { buffered_indices_.clear(); }
+
+  /// Indices that have not yet been written out by WriteIndices().
+ ArrowPoolVector<int32_t> buffered_indices_;
+
+ template <typename ArrayType>
+ void PutBinaryArray(const ArrayType& array) {
+ PARQUET_THROW_NOT_OK(::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
+ *array.data(),
+ [&](::arrow::util::string_view view) {
+ if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
+ return Status::Invalid("Parquet cannot store strings with size 2GB or more");
+ }
+ PutByteArray(view.data(), static_cast<uint32_t>(view.size()));
+ return Status::OK();
+ },
+ []() { return Status::OK(); }));
+ }
+
+ template <typename ArrayType>
+ void PutBinaryDictionaryArray(const ArrayType& array) {
+ DCHECK_EQ(array.null_count(), 0);
+ for (int64_t i = 0; i < array.length(); i++) {
+ auto v = array.GetView(i);
+ if (ARROW_PREDICT_FALSE(v.size() > kMaxByteArraySize)) {
+ throw ParquetException("Parquet cannot store strings with size 2GB or more");
+ }
+ dict_encoded_size_ += static_cast<int>(v.size() + sizeof(uint32_t));
+ int32_t unused_memo_index;
+ PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(
+ v.data(), static_cast<int32_t>(v.size()), &unused_memo_index));
+ }
+ }
+
+ /// The number of bytes needed to encode the dictionary.
+ int dict_encoded_size_;
+
+ MemoTableType memo_table_;
+};
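+
+// Illustrative sketch (not part of the original change): a conceptual model
+// of the streaming behavior documented above, with std::unordered_map
+// standing in for the memo table. Values are mapped to indices as they
+// arrive; the dictionary (cf. WriteDict) and the buffered indices
+// (cf. WriteIndices) are flushed separately. ToyDictEncoder is hypothetical,
+// not the library API.
+#include <cstdint>
+#include <unordered_map>
+#include <vector>
+
+struct ToyDictEncoder {
+  std::unordered_map<int32_t, int32_t> memo;  // value -> dictionary index
+  std::vector<int32_t> dictionary;            // values in first-seen order
+  std::vector<int32_t> buffered_indices;      // one index per encoded value
+
+  void Put(int32_t v) {
+    auto it = memo.find(v);
+    if (it == memo.end()) {
+      // New value: grow the dictionary and remember its index.
+      it = memo.emplace(v, static_cast<int32_t>(dictionary.size())).first;
+      dictionary.push_back(v);
+    }
+    buffered_indices.push_back(it->second);
+  }
+};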
+
+template <typename DType>
+void DictEncoderImpl<DType>::WriteDict(uint8_t* buffer) {
+ // For primitive types, only a memcpy
+ DCHECK_EQ(static_cast<size_t>(dict_encoded_size_), sizeof(T) * memo_table_.size());
+ memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer));
+}
+
+// ByteArray and FLBA already have the dictionary encoded in their data heaps
+template <>
+void DictEncoderImpl<ByteArrayType>::WriteDict(uint8_t* buffer) {
+ memo_table_.VisitValues(0, [&buffer](const ::arrow::util::string_view& v) {
+ uint32_t len = static_cast<uint32_t>(v.length());
+ memcpy(buffer, &len, sizeof(len));
+ buffer += sizeof(len);
+ memcpy(buffer, v.data(), len);
+ buffer += len;
+ });
+}
+
+template <>
+void DictEncoderImpl<FLBAType>::WriteDict(uint8_t* buffer) {
+ memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) {
+ DCHECK_EQ(v.length(), static_cast<size_t>(type_length_));
+ memcpy(buffer, v.data(), type_length_);
+ buffer += type_length_;
+ });
+}
+
+template <typename DType>
+inline void DictEncoderImpl<DType>::Put(const T& v) {
+ // Put() implementation for primitive types
+ auto on_found = [](int32_t memo_index) {};
+ auto on_not_found = [this](int32_t memo_index) {
+ dict_encoded_size_ += static_cast<int>(sizeof(T));
+ };
+
+ int32_t memo_index;
+ PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(v, on_found, on_not_found, &memo_index));
+ buffered_indices_.push_back(memo_index);
+}
+
+template <typename DType>
+inline void DictEncoderImpl<DType>::PutByteArray(const void* ptr, int32_t length) {
+ DCHECK(false);
+}
+
+template <>
+inline void DictEncoderImpl<ByteArrayType>::PutByteArray(const void* ptr,
+ int32_t length) {
+ static const uint8_t empty[] = {0};
+
+ auto on_found = [](int32_t memo_index) {};
+ auto on_not_found = [&](int32_t memo_index) {
+ dict_encoded_size_ += static_cast<int>(length + sizeof(uint32_t));
+ };
+
+ DCHECK(ptr != nullptr || length == 0);
+ ptr = (ptr != nullptr) ? ptr : empty;
+ int32_t memo_index;
+ PARQUET_THROW_NOT_OK(
+ memo_table_.GetOrInsert(ptr, length, on_found, on_not_found, &memo_index));
+ buffered_indices_.push_back(memo_index);
+}
+
+template <>
+inline void DictEncoderImpl<ByteArrayType>::Put(const ByteArray& val) {
+ return PutByteArray(val.ptr, static_cast<int32_t>(val.len));
+}
+
+template <>
+inline void DictEncoderImpl<FLBAType>::Put(const FixedLenByteArray& v) {
+ static const uint8_t empty[] = {0};
+
+ auto on_found = [](int32_t memo_index) {};
+ auto on_not_found = [this](int32_t memo_index) { dict_encoded_size_ += type_length_; };
+
+ DCHECK(v.ptr != nullptr || type_length_ == 0);
+ const void* ptr = (v.ptr != nullptr) ? v.ptr : empty;
+ int32_t memo_index;
+ PARQUET_THROW_NOT_OK(
+ memo_table_.GetOrInsert(ptr, type_length_, on_found, on_not_found, &memo_index));
+ buffered_indices_.push_back(memo_index);
+}
+
+template <>
+void DictEncoderImpl<Int96Type>::Put(const ::arrow::Array& values) {
+ ParquetException::NYI("Direct put to Int96");
+}
+
+template <>
+void DictEncoderImpl<Int96Type>::PutDictionary(const ::arrow::Array& values) {
+ ParquetException::NYI("Direct put to Int96");
+}
+
+template <typename DType>
+void DictEncoderImpl<DType>::Put(const ::arrow::Array& values) {
+ using ArrayType = typename ::arrow::CTypeTraits<typename DType::c_type>::ArrayType;
+ const auto& data = checked_cast<const ArrayType&>(values);
+ if (data.null_count() == 0) {
+ // no nulls, just dump the data
+ for (int64_t i = 0; i < data.length(); i++) {
+ Put(data.Value(i));
+ }
+ } else {
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ Put(data.Value(i));
+ }
+ }
+ }
+}
+
+template <>
+void DictEncoderImpl<FLBAType>::Put(const ::arrow::Array& values) {
+ AssertFixedSizeBinary(values, type_length_);
+ const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
+ if (data.null_count() == 0) {
+ // no nulls, just dump the data
+ for (int64_t i = 0; i < data.length(); i++) {
+ Put(FixedLenByteArray(data.Value(i)));
+ }
+ } else {
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ Put(FixedLenByteArray(data.Value(i)));
+ }
+ }
+ }
+}
+
+template <>
+void DictEncoderImpl<ByteArrayType>::Put(const ::arrow::Array& values) {
+ AssertBaseBinary(values);
+ if (::arrow::is_binary_like(values.type_id())) {
+ PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
+ }
+}
+
+template <typename DType>
+void AssertCanPutDictionary(DictEncoderImpl<DType>* encoder, const ::arrow::Array& dict) {
+ if (dict.null_count() > 0) {
+ throw ParquetException("Inserted dictionary cannot cannot contain nulls");
+ }
+
+ if (encoder->num_entries() > 0) {
+ throw ParquetException("Can only call PutDictionary on an empty DictEncoder");
+ }
+}
+
+template <typename DType>
+void DictEncoderImpl<DType>::PutDictionary(const ::arrow::Array& values) {
+ AssertCanPutDictionary(this, values);
+
+ using ArrayType = typename ::arrow::CTypeTraits<typename DType::c_type>::ArrayType;
+ const auto& data = checked_cast<const ArrayType&>(values);
+
+ dict_encoded_size_ += static_cast<int>(sizeof(typename DType::c_type) * data.length());
+ for (int64_t i = 0; i < data.length(); i++) {
+ int32_t unused_memo_index;
+ PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(data.Value(i), &unused_memo_index));
+ }
+}
+
+template <>
+void DictEncoderImpl<FLBAType>::PutDictionary(const ::arrow::Array& values) {
+ AssertFixedSizeBinary(values, type_length_);
+ AssertCanPutDictionary(this, values);
+
+ const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
+
+ dict_encoded_size_ += static_cast<int>(type_length_ * data.length());
+ for (int64_t i = 0; i < data.length(); i++) {
+ int32_t unused_memo_index;
+ PARQUET_THROW_NOT_OK(
+ memo_table_.GetOrInsert(data.Value(i), type_length_, &unused_memo_index));
+ }
+}
+
+template <>
+void DictEncoderImpl<ByteArrayType>::PutDictionary(const ::arrow::Array& values) {
+ AssertBaseBinary(values);
+ AssertCanPutDictionary(this, values);
+
+ if (::arrow::is_binary_like(values.type_id())) {
+ PutBinaryDictionaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ PutBinaryDictionaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
+ }
+}
+
+// ----------------------------------------------------------------------
+// ByteStreamSplitEncoder<T> implementations
+
+template <typename DType>
+class ByteStreamSplitEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
+ public:
+ using T = typename DType::c_type;
+ using TypedEncoder<DType>::Put;
+
+ explicit ByteStreamSplitEncoder(
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ int64_t EstimatedDataEncodedSize() override;
+ std::shared_ptr<Buffer> FlushValues() override;
+
+ void Put(const T* buffer, int num_values) override;
+ void Put(const ::arrow::Array& values) override;
+ void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override;
+
+ protected:
+ template <typename ArrowType>
+ void PutImpl(const ::arrow::Array& values) {
+ if (values.type_id() != ArrowType::type_id) {
+ throw ParquetException(std::string() + "direct put to " + ArrowType::type_name() +
+ " from " + values.type()->ToString() + " not supported");
+ }
+ const auto& data = *values.data();
+ PutSpaced(data.GetValues<typename ArrowType::c_type>(1),
+ static_cast<int>(data.length), data.GetValues<uint8_t>(0, 0), data.offset);
+ }
+
+ ::arrow::BufferBuilder sink_;
+ int64_t num_values_in_buffer_;
+};
+
+template <typename DType>
+ByteStreamSplitEncoder<DType>::ByteStreamSplitEncoder(const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool)
+ : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool),
+ sink_{pool},
+ num_values_in_buffer_{0} {}
+
+template <typename DType>
+int64_t ByteStreamSplitEncoder<DType>::EstimatedDataEncodedSize() {
+ return sink_.length();
+}
+
+template <typename DType>
+std::shared_ptr<Buffer> ByteStreamSplitEncoder<DType>::FlushValues() {
+ std::shared_ptr<ResizableBuffer> output_buffer =
+ AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize());
+ uint8_t* output_buffer_raw = output_buffer->mutable_data();
+ const uint8_t* raw_values = sink_.data();
+ ::arrow::util::internal::ByteStreamSplitEncode<T>(raw_values, num_values_in_buffer_,
+ output_buffer_raw);
+ sink_.Reset();
+ num_values_in_buffer_ = 0;
+ return std::move(output_buffer);
+}
+
+template <typename DType>
+void ByteStreamSplitEncoder<DType>::Put(const T* buffer, int num_values) {
+ if (num_values > 0) {
+ PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
+ num_values_in_buffer_ += num_values;
+ }
+}
+
+template <>
+void ByteStreamSplitEncoder<FloatType>::Put(const ::arrow::Array& values) {
+ PutImpl<::arrow::FloatType>(values);
+}
+
+template <>
+void ByteStreamSplitEncoder<DoubleType>::Put(const ::arrow::Array& values) {
+ PutImpl<::arrow::DoubleType>(values);
+}
+
+template <typename DType>
+void ByteStreamSplitEncoder<DType>::PutSpaced(const T* src, int num_values,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) {
+ if (valid_bits != NULLPTR) {
+ PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
+ this->memory_pool()));
+ T* data = reinterpret_cast<T*>(buffer->mutable_data());
+ int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
+ src, num_values, valid_bits, valid_bits_offset, data);
+ Put(data, num_valid_values);
+ } else {
+ Put(src, num_values);
+ }
+}
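+
+// Illustrative sketch (not part of the original change): BYTE_STREAM_SPLIT
+// scatters the k-th byte of every value into the k-th of sizeof(T)
+// contiguous streams, which tends to compress better for floating-point
+// data. ToyByteStreamSplit is a hypothetical standalone model of the
+// transform performed by ByteStreamSplitEncode above, not the library API.
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+template <typename T>
+std::vector<uint8_t> ToyByteStreamSplit(const std::vector<T>& values) {
+  const std::size_t n = values.size();
+  std::vector<uint8_t> out(n * sizeof(T));
+  for (std::size_t i = 0; i < n; ++i) {
+    uint8_t raw[sizeof(T)];
+    std::memcpy(raw, &values[i], sizeof(T));
+    for (std::size_t k = 0; k < sizeof(T); ++k) {
+      out[k * n + i] = raw[k];  // stream k holds byte k of every value
+    }
+  }
+  return out;
+}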
+
+class DecoderImpl : virtual public Decoder {
+ public:
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ data_ = data;
+ len_ = len;
+ }
+
+ int values_left() const override { return num_values_; }
+ Encoding::type encoding() const override { return encoding_; }
+
+ protected:
+ explicit DecoderImpl(const ColumnDescriptor* descr, Encoding::type encoding)
+ : descr_(descr), encoding_(encoding), num_values_(0), data_(NULLPTR), len_(0) {}
+
+ // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
+ const ColumnDescriptor* descr_;
+
+ const Encoding::type encoding_;
+ int num_values_;
+ const uint8_t* data_;
+ int len_;
+ int type_length_;
+};
+
+template <typename DType>
+class PlainDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+ using T = typename DType::c_type;
+ explicit PlainDecoder(const ColumnDescriptor* descr);
+
+ int Decode(T* buffer, int max_values) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) override;
+};
+
+template <>
+inline int PlainDecoder<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::Accumulator* builder) {
+ ParquetException::NYI("DecodeArrow not supported for Int96");
+}
+
+template <>
+inline int PlainDecoder<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow not supported for Int96");
+}
+
+template <>
+inline int PlainDecoder<BooleanType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
+ ParquetException::NYI("dictionaries of BooleanType");
+}
+
+template <typename DType>
+int PlainDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) {
+ using value_type = typename DType::c_type;
+
+ constexpr int value_size = static_cast<int>(sizeof(value_type));
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ builder->UnsafeAppend(::arrow::util::SafeLoadAs<value_type>(data_));
+ data_ += sizeof(value_type);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ num_values_ -= values_decoded;
+ len_ -= sizeof(value_type) * values_decoded;
+ return values_decoded;
+}
+
+template <typename DType>
+int PlainDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ using value_type = typename DType::c_type;
+
+ constexpr int value_size = static_cast<int>(sizeof(value_type));
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ PARQUET_THROW_NOT_OK(
+ builder->Append(::arrow::util::SafeLoadAs<value_type>(data_)));
+ data_ += sizeof(value_type);
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ num_values_ -= values_decoded;
+ len_ -= sizeof(value_type) * values_decoded;
+ return values_decoded;
+}
+
+// Decode routine templated on C++ type rather than type enum
+template <typename T>
+inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values,
+ int type_length, T* out) {
+ int64_t bytes_to_decode = num_values * static_cast<int64_t>(sizeof(T));
+ if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
+ ParquetException::EofException();
+ }
+ // If bytes_to_decode == 0, data could be null
+ if (bytes_to_decode > 0) {
+ memcpy(out, data, bytes_to_decode);
+ }
+ return static_cast<int>(bytes_to_decode);
+}
+
+template <typename DType>
+PlainDecoder<DType>::PlainDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::PLAIN) {
+ if (descr_ && descr_->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
+ type_length_ = descr_->type_length();
+ } else {
+ type_length_ = -1;
+ }
+}
+
+// Template specialization for BYTE_ARRAY. The written values do not own their
+// own data.
+
+static inline int64_t ReadByteArray(const uint8_t* data, int64_t data_size,
+ ByteArray* out) {
+ if (ARROW_PREDICT_FALSE(data_size < 4)) {
+ ParquetException::EofException();
+ }
+ const int32_t len = ::arrow::util::SafeLoadAs<int32_t>(data);
+ if (len < 0) {
+ throw ParquetException("Invalid BYTE_ARRAY value");
+ }
+ const int64_t consumed_length = static_cast<int64_t>(len) + 4;
+ if (ARROW_PREDICT_FALSE(data_size < consumed_length)) {
+ ParquetException::EofException();
+ }
+ *out = ByteArray{static_cast<uint32_t>(len), data + 4};
+ return consumed_length;
+}
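+
+// Illustrative sketch (not part of the original change): ToyScanByteArrays
+// walks a PLAIN BYTE_ARRAY buffer with the same bounds checks as
+// ReadByteArray above, assuming a little-endian host. It is a hypothetical
+// helper, not the library API.
+#include <cstdint>
+#include <cstring>
+#include <utility>
+#include <vector>
+
+std::vector<std::pair<const uint8_t*, uint32_t>> ToyScanByteArrays(const uint8_t* data,
+                                                                   int64_t size) {
+  std::vector<std::pair<const uint8_t*, uint32_t>> out;
+  while (size >= 4) {
+    int32_t len;
+    std::memcpy(&len, data, sizeof(len));  // 4-byte little-endian length prefix
+    if (len < 0 || size - 4 < static_cast<int64_t>(len)) {
+      break;  // negative length or truncated value: stop, as EofException would
+    }
+    out.emplace_back(data + 4, static_cast<uint32_t>(len));
+    data += static_cast<int64_t>(len) + 4;
+    size -= static_cast<int64_t>(len) + 4;
+  }
+  return out;
+}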
+
+template <>
+inline int DecodePlain<ByteArray>(const uint8_t* data, int64_t data_size, int num_values,
+ int type_length, ByteArray* out) {
+ int bytes_decoded = 0;
+ for (int i = 0; i < num_values; ++i) {
+ const auto increment = ReadByteArray(data, data_size, out + i);
+ if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) {
+ throw ParquetException("BYTE_ARRAY chunk too large");
+ }
+ data += increment;
+ data_size -= increment;
+ bytes_decoded += static_cast<int>(increment);
+ }
+ return bytes_decoded;
+}
+
+// Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not
+// own their own data.
+template <>
+inline int DecodePlain<FixedLenByteArray>(const uint8_t* data, int64_t data_size,
+ int num_values, int type_length,
+ FixedLenByteArray* out) {
+ int64_t bytes_to_decode = static_cast<int64_t>(type_length) * num_values;
+ if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
+ ParquetException::EofException();
+ }
+ for (int i = 0; i < num_values; ++i) {
+ out[i].ptr = data;
+ data += type_length;
+ data_size -= type_length;
+ }
+ return static_cast<int>(bytes_to_decode);
+}
+
+template <typename DType>
+int PlainDecoder<DType>::Decode(T* buffer, int max_values) {
+ max_values = std::min(max_values, num_values_);
+ int bytes_consumed = DecodePlain<T>(data_, len_, max_values, type_length_, buffer);
+ data_ += bytes_consumed;
+ len_ -= bytes_consumed;
+ num_values_ -= max_values;
+ return max_values;
+}
+
+class PlainBooleanDecoder : public DecoderImpl,
+ virtual public TypedDecoder<BooleanType>,
+ virtual public BooleanDecoder {
+ public:
+ explicit PlainBooleanDecoder(const ColumnDescriptor* descr);
+ void SetData(int num_values, const uint8_t* data, int len) override;
+
+ // Two flavors of bool decoding
+ int Decode(uint8_t* buffer, int max_values) override;
+ int Decode(bool* buffer, int max_values) override;
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::Accumulator* out) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* out) override;
+
+ private:
+ std::unique_ptr<::arrow::BitUtil::BitReader> bit_reader_;
+};
+
+PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::PLAIN) {}
+
+void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) {
+ num_values_ = num_values;
+ bit_reader_.reset(new BitUtil::BitReader(data, len));
+}
+
+int PlainBooleanDecoder::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::Accumulator* builder) {
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ bool value;
+ ARROW_IGNORE_EXPR(bit_reader_->GetValue(1, &value));
+ builder->UnsafeAppend(value);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ num_values_ -= values_decoded;
+ return values_decoded;
+}
+
+inline int PlainBooleanDecoder::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
+ ParquetException::NYI("dictionaries of BooleanType");
+}
+
+int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) {
+ max_values = std::min(max_values, num_values_);
+ bool val;
+ ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
+ for (int i = 0; i < max_values; ++i) {
+ if (!bit_reader_->GetValue(1, &val)) {
+ ParquetException::EofException();
+ }
+ if (val) {
+ bit_writer.Set();
+ }
+ bit_writer.Next();
+ }
+ bit_writer.Finish();
+ num_values_ -= max_values;
+ return max_values;
+}
+
+int PlainBooleanDecoder::Decode(bool* buffer, int max_values) {
+ max_values = std::min(max_values, num_values_);
+ if (bit_reader_->GetBatch(1, buffer, max_values) != max_values) {
+ ParquetException::EofException();
+ }
+ num_values_ -= max_values;
+ return max_values;
+}
+
+struct ArrowBinaryHelper {
+ explicit ArrowBinaryHelper(typename EncodingTraits<ByteArrayType>::Accumulator* out) {
+ this->out = out;
+ this->builder = out->builder.get();
+ this->chunk_space_remaining =
+ ::arrow::kBinaryMemoryLimit - this->builder->value_data_length();
+ }
+
+ Status PushChunk() {
+ std::shared_ptr<::arrow::Array> result;
+ RETURN_NOT_OK(builder->Finish(&result));
+ out->chunks.push_back(result);
+ chunk_space_remaining = ::arrow::kBinaryMemoryLimit;
+ return Status::OK();
+ }
+
+ bool CanFit(int64_t length) const { return length <= chunk_space_remaining; }
+
+ void UnsafeAppend(const uint8_t* data, int32_t length) {
+ chunk_space_remaining -= length;
+ builder->UnsafeAppend(data, length);
+ }
+
+ void UnsafeAppendNull() { builder->UnsafeAppendNull(); }
+
+ Status Append(const uint8_t* data, int32_t length) {
+ chunk_space_remaining -= length;
+ return builder->Append(data, length);
+ }
+
+ Status AppendNull() { return builder->AppendNull(); }
+
+ typename EncodingTraits<ByteArrayType>::Accumulator* out;
+ ::arrow::BinaryBuilder* builder;
+ int64_t chunk_space_remaining;
+};
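+
+// Illustrative sketch (not part of the original change): the helper above
+// starts a new output chunk whenever a value would push the builder past
+// ::arrow::kBinaryMemoryLimit, the offset limit of 32-bit binary arrays.
+// ToyChunkSizes models that splitting with a hypothetical tiny limit; it is
+// not the library API.
+#include <cstdint>
+#include <vector>
+
+constexpr int64_t kToyChunkLimit = 8;  // stand-in for kBinaryMemoryLimit
+
+// Returns the chunk sizes produced for a sequence of value lengths.
+std::vector<int64_t> ToyChunkSizes(const std::vector<int64_t>& value_lengths) {
+  std::vector<int64_t> chunks;
+  int64_t current = 0;
+  for (int64_t len : value_lengths) {
+    if (len > kToyChunkLimit - current) {  // value can't fit: close this chunk
+      chunks.push_back(current);
+      current = 0;
+    }
+    current += len;
+  }
+  chunks.push_back(current);
+  return chunks;  // {3, 4, 5} -> chunks of {7, 5}
+}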
+
+template <>
+inline int PlainDecoder<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* builder) {
+ ParquetException::NYI();
+}
+
+template <>
+inline int PlainDecoder<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) {
+ ParquetException::NYI();
+}
+
+template <>
+inline int PlainDecoder<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::Accumulator* builder) {
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ builder->UnsafeAppend(data_);
+ data_ += descr_->type_length();
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ num_values_ -= values_decoded;
+ len_ -= descr_->type_length() * values_decoded;
+ return values_decoded;
+}
+
+template <>
+inline int PlainDecoder<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::DictAccumulator* builder) {
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ PARQUET_THROW_NOT_OK(builder->Append(data_));
+ data_ += descr_->type_length();
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ num_values_ -= values_decoded;
+ len_ -= descr_->type_length() * values_decoded;
+ return values_decoded;
+}
+
+class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
+ virtual public ByteArrayDecoder {
+ public:
+ using Base = PlainDecoder<ByteArrayType>;
+ using Base::DecodeSpaced;
+ using Base::PlainDecoder;
+
+ // ----------------------------------------------------------------------
+ // Dictionary read paths
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ ::arrow::BinaryDictionary32Builder* builder) override {
+ int result = 0;
+ PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
+ valid_bits_offset, builder, &result));
+ return result;
+ }
+
+ // ----------------------------------------------------------------------
+ // Optimized dense binary read paths
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+ int result = 0;
+ PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
+ valid_bits_offset, out, &result));
+ return result;
+ }
+
+ private:
+ Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out,
+ int* out_values_decoded) {
+ ArrowBinaryHelper helper(out);
+ int values_decoded = 0;
+
+ RETURN_NOT_OK(helper.builder->Reserve(num_values));
+ RETURN_NOT_OK(helper.builder->ReserveData(
+ std::min<int64_t>(len_, helper.chunk_space_remaining)));
+
+ int i = 0;
+ RETURN_NOT_OK(VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ if (ARROW_PREDICT_FALSE(len_ < 4)) {
+ ParquetException::EofException();
+ }
+ auto value_len = ::arrow::util::SafeLoadAs<int32_t>(data_);
+ if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
+ return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
+ }
+ auto increment = value_len + 4;
+ if (ARROW_PREDICT_FALSE(len_ < increment)) {
+ ParquetException::EofException();
+ }
+ if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) {
+ // This element would exceed the capacity of a chunk
+ RETURN_NOT_OK(helper.PushChunk());
+ RETURN_NOT_OK(helper.builder->Reserve(num_values - i));
+ RETURN_NOT_OK(helper.builder->ReserveData(
+ std::min<int64_t>(len_, helper.chunk_space_remaining)));
+ }
+ helper.UnsafeAppend(data_ + 4, value_len);
+ data_ += increment;
+ len_ -= increment;
+ ++values_decoded;
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ helper.UnsafeAppendNull();
+ ++i;
+ return Status::OK();
+ }));
+
+ num_values_ -= values_decoded;
+ *out_values_decoded = values_decoded;
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, BuilderType* builder,
+ int* out_values_decoded) {
+ RETURN_NOT_OK(builder->Reserve(num_values));
+ int values_decoded = 0;
+
+ RETURN_NOT_OK(VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ if (ARROW_PREDICT_FALSE(len_ < 4)) {
+ ParquetException::EofException();
+ }
+ auto value_len = ::arrow::util::SafeLoadAs<int32_t>(data_);
+ if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
+ return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
+ }
+ auto increment = value_len + 4;
+ if (ARROW_PREDICT_FALSE(len_ < increment)) {
+ ParquetException::EofException();
+ }
+ RETURN_NOT_OK(builder->Append(data_ + 4, value_len));
+ data_ += increment;
+ len_ -= increment;
+ ++values_decoded;
+ return Status::OK();
+ },
+ [&]() { return builder->AppendNull(); }));
+
+ num_values_ -= values_decoded;
+ *out_values_decoded = values_decoded;
+ return Status::OK();
+ }
+};
+
+class PlainFLBADecoder : public PlainDecoder<FLBAType>, virtual public FLBADecoder {
+ public:
+ using Base = PlainDecoder<FLBAType>;
+ using Base::PlainDecoder;
+};
+
+// ----------------------------------------------------------------------
+// Dictionary encoding and decoding
+
+template <typename Type>
+class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
+ public:
+ typedef typename Type::c_type T;
+
+  // Initializes the dictionary with values from 'dictionary'. The data in
+  // 'dictionary' is not guaranteed to persist in memory after this call, so the
+  // dictionary decoder needs to copy the data out if necessary.
+ explicit DictDecoderImpl(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::RLE_DICTIONARY),
+ dictionary_(AllocateBuffer(pool, 0)),
+ dictionary_length_(0),
+ byte_array_data_(AllocateBuffer(pool, 0)),
+ byte_array_offsets_(AllocateBuffer(pool, 0)),
+ indices_scratch_space_(AllocateBuffer(pool, 0)) {}
+
+  // Perform type-specific initialization
+ void SetDict(TypedDecoder<Type>* dictionary) override;
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ if (len == 0) {
+ // Initialize dummy decoder to avoid crashes later on
+ idx_decoder_ = ::arrow::util::RleDecoder(data, len, /*bit_width=*/1);
+ return;
+ }
+ uint8_t bit_width = *data;
+ if (ARROW_PREDICT_FALSE(bit_width >= 64)) {
+ throw ParquetException("Invalid or corrupted bit_width");
+ }
+ idx_decoder_ = ::arrow::util::RleDecoder(++data, --len, bit_width);
+ }
+
+ int Decode(T* buffer, int num_values) override {
+ num_values = std::min(num_values, num_values_);
+ int decoded_values =
+ idx_decoder_.GetBatchWithDict(reinterpret_cast<const T*>(dictionary_->data()),
+ dictionary_length_, buffer, num_values);
+ if (decoded_values != num_values) {
+ ParquetException::EofException();
+ }
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ num_values = std::min(num_values, num_values_);
+ if (num_values != idx_decoder_.GetBatchWithDictSpaced(
+ reinterpret_cast<const T*>(dictionary_->data()),
+ dictionary_length_, buffer, num_values, null_count, valid_bits,
+ valid_bits_offset)) {
+ ParquetException::EofException();
+ }
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<Type>::Accumulator* out) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<Type>::DictAccumulator* out) override;
+
+ void InsertDictionary(::arrow::ArrayBuilder* builder) override;
+
+ int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ ::arrow::ArrayBuilder* builder) override {
+ if (num_values > 0) {
+ // TODO(wesm): Refactor to batch reads for improved memory use. It is not
+ // trivial because the null_count is relative to the entire bitmap
+ PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
+ num_values, /*shrink_to_fit=*/false));
+ }
+
+ auto indices_buffer =
+ reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
+
+ if (num_values != idx_decoder_.GetBatchSpaced(num_values, null_count, valid_bits,
+ valid_bits_offset, indices_buffer)) {
+ ParquetException::EofException();
+ }
+
+ /// XXX(wesm): Cannot append "valid bits" directly to the builder
+ std::vector<uint8_t> valid_bytes(num_values);
+ ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+ for (int64_t i = 0; i < num_values; ++i) {
+ valid_bytes[i] = static_cast<uint8_t>(bit_reader.IsSet());
+ bit_reader.Next();
+ }
+
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
+ PARQUET_THROW_NOT_OK(
+ binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data()));
+ num_values_ -= num_values - null_count;
+ return num_values - null_count;
+ }
+
+ int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override {
+ num_values = std::min(num_values, num_values_);
+ if (num_values > 0) {
+ // TODO(wesm): Refactor to batch reads for improved memory use. This is
+ // relatively simple here because we don't have to do any bookkeeping of
+ // nulls
+ PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
+ num_values, /*shrink_to_fit=*/false));
+ }
+ auto indices_buffer =
+ reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
+ if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) {
+ ParquetException::EofException();
+ }
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
+ PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values));
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ int DecodeIndices(int num_values, int32_t* indices) override {
+ if (num_values != idx_decoder_.GetBatch(indices, num_values)) {
+ ParquetException::EofException();
+ }
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ void GetDictionary(const T** dictionary, int32_t* dictionary_length) override {
+ *dictionary_length = dictionary_length_;
+ *dictionary = reinterpret_cast<T*>(dictionary_->mutable_data());
+ }
+
+ protected:
+ Status IndexInBounds(int32_t index) {
+ if (ARROW_PREDICT_TRUE(0 <= index && index < dictionary_length_)) {
+ return Status::OK();
+ }
+ return Status::Invalid("Index not in dictionary bounds");
+ }
+
+ inline void DecodeDict(TypedDecoder<Type>* dictionary) {
+ dictionary_length_ = static_cast<int32_t>(dictionary->values_left());
+ PARQUET_THROW_NOT_OK(dictionary_->Resize(dictionary_length_ * sizeof(T),
+ /*shrink_to_fit=*/false));
+ dictionary->Decode(reinterpret_cast<T*>(dictionary_->mutable_data()),
+ dictionary_length_);
+ }
+
+ // Only one is set.
+ std::shared_ptr<ResizableBuffer> dictionary_;
+
+ int32_t dictionary_length_;
+
+ // Data that contains the byte array data (byte_array_dictionary_ just has the
+ // pointers).
+ std::shared_ptr<ResizableBuffer> byte_array_data_;
+
+ // Arrow-style byte offsets for each dictionary value. We maintain two
+ // representations of the dictionary, one as ByteArray* for non-Arrow
+  // consumers and this one for Arrow consumers. Since dictionaries are
+  // generally pretty small to begin with, this doesn't add much extra
+  // memory use in most cases.
+ std::shared_ptr<ResizableBuffer> byte_array_offsets_;
+
+ // Reusable buffer for decoding dictionary indices to be appended to a
+ // BinaryDictionary32Builder
+ std::shared_ptr<ResizableBuffer> indices_scratch_space_;
+
+ ::arrow::util::RleDecoder idx_decoder_;
+};
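+
+// Illustrative sketch (not part of the original change): an RLE_DICTIONARY
+// data page is one bit-width byte followed by RLE/bit-packed indices, which
+// is what SetData above parses. Once indices are decoded, materializing
+// values is a dictionary lookup; ToyDecodeWithDict is a hypothetical
+// conceptual model, not the library API.
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+template <typename T>
+std::vector<T> ToyDecodeWithDict(const std::vector<T>& dictionary,
+                                 const std::vector<int32_t>& indices) {
+  std::vector<T> out;
+  out.reserve(indices.size());
+  for (int32_t idx : indices) {
+    // The real decoder validates each index first (see IndexInBounds).
+    out.push_back(dictionary.at(static_cast<std::size_t>(idx)));
+  }
+  return out;
+}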
+
+template <typename Type>
+void DictDecoderImpl<Type>::SetDict(TypedDecoder<Type>* dictionary) {
+ DecodeDict(dictionary);
+}
+
+template <>
+void DictDecoderImpl<BooleanType>::SetDict(TypedDecoder<BooleanType>* dictionary) {
+ ParquetException::NYI("Dictionary encoding is not implemented for boolean values");
+}
+
+template <>
+void DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictionary) {
+ DecodeDict(dictionary);
+
+ auto dict_values = reinterpret_cast<ByteArray*>(dictionary_->mutable_data());
+
+ int total_size = 0;
+ for (int i = 0; i < dictionary_length_; ++i) {
+ total_size += dict_values[i].len;
+ }
+ PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
+ /*shrink_to_fit=*/false));
+ PARQUET_THROW_NOT_OK(
+ byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t),
+ /*shrink_to_fit=*/false));
+
+ int32_t offset = 0;
+ uint8_t* bytes_data = byte_array_data_->mutable_data();
+ int32_t* bytes_offsets =
+ reinterpret_cast<int32_t*>(byte_array_offsets_->mutable_data());
+ for (int i = 0; i < dictionary_length_; ++i) {
+ memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len);
+ bytes_offsets[i] = offset;
+ dict_values[i].ptr = bytes_data + offset;
+ offset += dict_values[i].len;
+ }
+ bytes_offsets[dictionary_length_] = offset;
+}
+
+template <>
+inline void DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>* dictionary) {
+ DecodeDict(dictionary);
+
+ auto dict_values = reinterpret_cast<FLBA*>(dictionary_->mutable_data());
+
+ int fixed_len = descr_->type_length();
+ int total_size = dictionary_length_ * fixed_len;
+
+ PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
+ /*shrink_to_fit=*/false));
+ uint8_t* bytes_data = byte_array_data_->mutable_data();
+ for (int32_t i = 0, offset = 0; i < dictionary_length_; ++i, offset += fixed_len) {
+ memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len);
+ dict_values[i].ptr = bytes_data + offset;
+ }
+}
+
+template <>
+inline int DictDecoderImpl<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::Accumulator* builder) {
+ ParquetException::NYI("DecodeArrow to Int96Type");
+}
+
+template <>
+inline int DictDecoderImpl<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow to Int96Type");
+}
+
+template <>
+inline int DictDecoderImpl<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* builder) {
+ ParquetException::NYI("DecodeArrow implemented elsewhere");
+}
+
+template <>
+inline int DictDecoderImpl<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow implemented elsewhere");
+}
+
+template <typename DType>
+int DictDecoderImpl<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const typename DType::c_type*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+ throw ParquetException("");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ PARQUET_THROW_NOT_OK(builder->Append(dict_values[index]));
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ return num_values - null_count;
+}
+
+template <>
+int DictDecoderImpl<BooleanType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
+ ParquetException::NYI("No dictionary encoding for BooleanType");
+}
+
+template <>
+inline int DictDecoderImpl<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::Accumulator* builder) {
+ if (builder->byte_width() != descr_->type_length()) {
+ throw ParquetException("Byte width mismatch: builder was " +
+ std::to_string(builder->byte_width()) + " but decoder was " +
+ std::to_string(descr_->type_length()));
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const FLBA*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+ throw ParquetException("");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ builder->UnsafeAppend(dict_values[index].ptr);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ return num_values - null_count;
+}
+
+template <>
+int DictDecoderImpl<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::DictAccumulator* builder) {
+ auto value_type =
+ checked_cast<const ::arrow::DictionaryType&>(*builder->type()).value_type();
+ auto byte_width =
+ checked_cast<const ::arrow::FixedSizeBinaryType&>(*value_type).byte_width();
+ if (byte_width != descr_->type_length()) {
+ throw ParquetException("Byte width mismatch: builder was " +
+ std::to_string(byte_width) + " but decoder was " +
+ std::to_string(descr_->type_length()));
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const FLBA*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+ throw ParquetException("");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ PARQUET_THROW_NOT_OK(builder->Append(dict_values[index].ptr));
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ return num_values - null_count;
+}
+
+template <typename Type>
+int DictDecoderImpl<Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Type>::Accumulator* builder) {
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ using value_type = typename Type::c_type;
+ auto dict_values = reinterpret_cast<const value_type*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+ throw ParquetException("");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ builder->UnsafeAppend(dict_values[index]);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ return num_values - null_count;
+}
+
+template <typename Type>
+void DictDecoderImpl<Type>::InsertDictionary(::arrow::ArrayBuilder* builder) {
+ ParquetException::NYI("InsertDictionary only implemented for BYTE_ARRAY types");
+}
+
+template <>
+void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
+
+ // Make a BinaryArray referencing the internal dictionary data
+ auto arr = std::make_shared<::arrow::BinaryArray>(
+ dictionary_length_, byte_array_offsets_, byte_array_data_);
+ PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr));
+}
+
+class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
+ virtual public ByteArrayDecoder {
+ public:
+ using BASE = DictDecoderImpl<ByteArrayType>;
+ using BASE::DictDecoderImpl;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ ::arrow::BinaryDictionary32Builder* builder) override {
+ int result = 0;
+ if (null_count == 0) {
+ PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
+ } else {
+ PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
+ valid_bits_offset, builder, &result));
+ }
+ return result;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+ int result = 0;
+ if (null_count == 0) {
+ PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result));
+ } else {
+ PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
+ valid_bits_offset, out, &result));
+ }
+ return result;
+ }
+
+ private:
+ Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out,
+ int* out_num_values) {
+ constexpr int32_t kBufferSize = 1024;
+ int32_t indices[kBufferSize];
+
+ ArrowBinaryHelper helper(out);
+
+ ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+ int values_decoded = 0;
+ int num_appended = 0;
+ while (num_appended < num_values) {
+ bool is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+
+ if (is_valid) {
+ int32_t batch_size =
+ std::min<int32_t>(kBufferSize, num_values - num_appended - null_count);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+
+ if (ARROW_PREDICT_FALSE(num_indices < 1)) {
+ return Status::Invalid("Invalid number of indices '", num_indices, "'");
+ }
+
+ int i = 0;
+ while (true) {
+ // Consume all indices
+ if (is_valid) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
+ RETURN_NOT_OK(helper.PushChunk());
+ }
+ RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
+ ++i;
+ ++values_decoded;
+ } else {
+ RETURN_NOT_OK(helper.AppendNull());
+ --null_count;
+ }
+ ++num_appended;
+ if (i == num_indices) {
+ // Do not advance the bit_reader if we have fulfilled the decode
+ // request
+ break;
+ }
+ is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+ }
+ } else {
+ RETURN_NOT_OK(helper.AppendNull());
+ --null_count;
+ ++num_appended;
+ }
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+
+ Status DecodeArrowDenseNonNull(int num_values,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out,
+ int* out_num_values) {
+ constexpr int32_t kBufferSize = 2048;
+ int32_t indices[kBufferSize];
+ int values_decoded = 0;
+
+ ArrowBinaryHelper helper(out);
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
+ while (values_decoded < num_values) {
+ int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+ if (num_indices == 0) ParquetException::EofException();
+ for (int i = 0; i < num_indices; ++i) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
+ RETURN_NOT_OK(helper.PushChunk());
+ }
+ RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
+ }
+ values_decoded += num_indices;
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, BuilderType* builder,
+ int* out_num_values) {
+ constexpr int32_t kBufferSize = 1024;
+ int32_t indices[kBufferSize];
+
+ RETURN_NOT_OK(builder->Reserve(num_values));
+ ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
+ int values_decoded = 0;
+ int num_appended = 0;
+ while (num_appended < num_values) {
+ bool is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+
+ if (is_valid) {
+ int32_t batch_size =
+ std::min<int32_t>(kBufferSize, num_values - num_appended - null_count);
+        int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+
+        if (ARROW_PREDICT_FALSE(num_indices < 1)) {
+          return Status::Invalid("Invalid number of indices '", num_indices, "'");
+        }
+
+ int i = 0;
+ while (true) {
+ // Consume all indices
+ if (is_valid) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ RETURN_NOT_OK(builder->Append(val.ptr, val.len));
+ ++i;
+ ++values_decoded;
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ --null_count;
+ }
+ ++num_appended;
+ if (i == num_indices) {
+ // Do not advance the bit_reader if we have fulfilled the decode
+ // request
+ break;
+ }
+ is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+ }
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ --null_count;
+ ++num_appended;
+ }
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) {
+ constexpr int32_t kBufferSize = 2048;
+ int32_t indices[kBufferSize];
+
+ RETURN_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
+ int values_decoded = 0;
+ while (values_decoded < num_values) {
+ int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+ if (num_indices == 0) ParquetException::EofException();
+ for (int i = 0; i < num_indices; ++i) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ RETURN_NOT_OK(builder->Append(val.ptr, val.len));
+ }
+ values_decoded += num_indices;
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+};
+
+// ----------------------------------------------------------------------
+// DeltaBitPackDecoder
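+//
+// Layout as consumed by this decoder (InitBlock (re)reads a full header for
+// every block):
+//
+//   <block size in values : VLQ> <miniblocks per block : VLQ>
+//   <total value count : VLQ> <first value of the block : zigzag VLQ>
+//   then, for the block:
+//     <min delta : zigzag VLQ>
+//     <bit width of each miniblock : 1 byte each>
+//     <miniblock deltas, bit-packed at the corresponding width>
+//
+// Each value is reconstructed in GetInternal as last_value + min_delta +
+// bit-packed delta.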
+
+template <typename DType>
+class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+  using T = typename DType::c_type;
+
+ explicit DeltaBitPackDecoder(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::DELTA_BINARY_PACKED), pool_(pool) {
+ if (DType::type_num != Type::INT32 && DType::type_num != Type::INT64) {
+ throw ParquetException("Delta bit pack encoding should only be for integer data.");
+ }
+ }
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ this->num_values_ = num_values;
+ decoder_ = ::arrow::BitUtil::BitReader(data, len);
+ values_current_block_ = 0;
+ values_current_mini_block_ = 0;
+ }
+
+ int Decode(T* buffer, int max_values) override {
+ return GetInternal(buffer, max_values);
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* out) override {
+ if (null_count != 0) {
+ ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
+ }
+ std::vector<T> values(num_values);
+ GetInternal(values.data(), num_values);
+ PARQUET_THROW_NOT_OK(out->AppendValues(values));
+ return num_values;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* out) override {
+ if (null_count != 0) {
+ ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
+ }
+ std::vector<T> values(num_values);
+ GetInternal(values.data(), num_values);
+ PARQUET_THROW_NOT_OK(out->Reserve(num_values));
+ for (T value : values) {
+ PARQUET_THROW_NOT_OK(out->Append(value));
+ }
+ return num_values;
+ }
+
+ private:
+ void InitBlock() {
+ // The number of values per block.
+ uint32_t block_size;
+ if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
+ if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
+ if (!decoder_.GetVlqInt(&values_current_block_)) {
+ ParquetException::EofException();
+ }
+ if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException();
+
+ delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
+ uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
+
+ if (!decoder_.GetZigZagVlqInt(&min_delta_)) ParquetException::EofException();
+ for (uint32_t i = 0; i < num_mini_blocks_; ++i) {
+ if (!decoder_.GetAligned<uint8_t>(1, bit_width_data + i)) {
+ ParquetException::EofException();
+ }
+ }
+    if (ARROW_PREDICT_FALSE(num_mini_blocks_ == 0)) {
+      throw ParquetException("Invalid number of miniblocks (corrupted file?)");
+    }
+    values_per_mini_block_ = block_size / num_mini_blocks_;
+ mini_block_idx_ = 0;
+ delta_bit_width_ = bit_width_data[0];
+ values_current_mini_block_ = values_per_mini_block_;
+ }
+
+ template <typename T>
+ int GetInternal(T* buffer, int max_values) {
+ max_values = std::min(max_values, this->num_values_);
+ const uint8_t* bit_width_data = delta_bit_widths_->data();
+ for (int i = 0; i < max_values; ++i) {
+ if (ARROW_PREDICT_FALSE(values_current_mini_block_ == 0)) {
+ ++mini_block_idx_;
+ if (mini_block_idx_ < static_cast<size_t>(delta_bit_widths_->size())) {
+ delta_bit_width_ = bit_width_data[mini_block_idx_];
+ values_current_mini_block_ = values_per_mini_block_;
+ } else {
+ InitBlock();
+ buffer[i] = last_value_;
+ continue;
+ }
+ }
+
+      // TODO: decode the entire miniblock at once instead of value by value;
+      // that is the key optimization for this algorithm.
+ int64_t delta;
+ if (!decoder_.GetValue(delta_bit_width_, &delta)) ParquetException::EofException();
+ delta += min_delta_;
+ last_value_ += static_cast<int32_t>(delta);
+ buffer[i] = last_value_;
+ --values_current_mini_block_;
+ }
+ this->num_values_ -= max_values;
+ return max_values;
+ }
+
+ MemoryPool* pool_;
+ ::arrow::BitUtil::BitReader decoder_;
+ uint32_t values_current_block_;
+ uint32_t num_mini_blocks_;
+ uint64_t values_per_mini_block_;
+ uint64_t values_current_mini_block_;
+
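+  // NOTE: min_delta_ and last_value_ are 32-bit, so INT64 columns whose first
+  // value or accumulated deltas exceed the int32 range are truncated (note the
+  // static_cast in GetInternal); newer upstream Arrow revisions widen these to
+  // the column's physical type.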
+ int32_t min_delta_;
+ size_t mini_block_idx_;
+ std::shared_ptr<ResizableBuffer> delta_bit_widths_;
+ int delta_bit_width_;
+
+ int32_t last_value_;
+};
+
+// ----------------------------------------------------------------------
+// DELTA_LENGTH_BYTE_ARRAY
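+//
+// Layout as consumed by this decoder (see SetData/Decode below): a 4-byte
+// little-endian length of the encoded lengths section, then the value lengths
+// as DELTA_BINARY_PACKED int32, then the concatenated value bytes. Decode()
+// slices the data area with the decoded lengths, so the returned ByteArrays
+// point into the page buffer rather than into freshly allocated memory.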
+
+class DeltaLengthByteArrayDecoder : public DecoderImpl,
+ virtual public TypedDecoder<ByteArrayType> {
+ public:
+ explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
+ len_decoder_(nullptr, pool),
+ pool_(pool) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ if (len == 0) return;
+ int total_lengths_len = ::arrow::util::SafeLoadAs<int32_t>(data);
+ data += 4;
+ this->len_decoder_.SetData(num_values, data, total_lengths_len);
+ data_ = data + total_lengths_len;
+ this->len_ = len - 4 - total_lengths_len;
+ }
+
+ int Decode(ByteArray* buffer, int max_values) override {
+ using VectorT = ArrowPoolVector<int>;
+ max_values = std::min(max_values, num_values_);
+ VectorT lengths(max_values, 0, ::arrow::stl::allocator<int>(pool_));
+ len_decoder_.Decode(lengths.data(), max_values);
+ for (int i = 0; i < max_values; ++i) {
+ buffer[i].len = lengths[i];
+ buffer[i].ptr = data_;
+ this->data_ += lengths[i];
+ this->len_ -= lengths[i];
+ }
+ this->num_values_ -= max_values;
+ return max_values;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+ ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder");
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::DictAccumulator* out) override {
+ ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder");
+ }
+
+ private:
+ DeltaBitPackDecoder<Int32Type> len_decoder_;
+ ::arrow::MemoryPool* pool_;
+};
+
+// ----------------------------------------------------------------------
+// DELTA_BYTE_ARRAY
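+//
+// Values are prefix-compressed: each value is stored as the length of the
+// prefix it shares with the previous value plus a suffix. The prefix lengths
+// are DELTA_BINARY_PACKED and the suffixes are DELTA_LENGTH_BYTE_ARRAY;
+// Decode() below rebuilds value i as prefix_of(value i-1) + suffix i.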
+
+class DeltaByteArrayDecoder : public DecoderImpl,
+ virtual public TypedDecoder<ByteArrayType> {
+ public:
+ explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY),
+ prefix_len_decoder_(nullptr, pool),
+ suffix_decoder_(nullptr, pool),
+ last_value_(0, nullptr) {}
+
+ virtual void SetData(int num_values, const uint8_t* data, int len) {
+ num_values_ = num_values;
+ if (len == 0) return;
+ int prefix_len_length = ::arrow::util::SafeLoadAs<int32_t>(data);
+ data += 4;
+ len -= 4;
+ prefix_len_decoder_.SetData(num_values, data, prefix_len_length);
+ data += prefix_len_length;
+ len -= prefix_len_length;
+ suffix_decoder_.SetData(num_values, data, len);
+ }
+
+  // TODO: this doesn't work correctly and needs memory management: each value
+  // below is written to a malloc'd buffer that is never freed. We need to
+  // allocate new, pool-managed strings to store the results.
+ virtual int Decode(ByteArray* buffer, int max_values) {
+ max_values = std::min(max_values, this->num_values_);
+ for (int i = 0; i < max_values; ++i) {
+ int prefix_len = 0;
+ prefix_len_decoder_.Decode(&prefix_len, 1);
+ ByteArray suffix = {0, nullptr};
+ suffix_decoder_.Decode(&suffix, 1);
+ buffer[i].len = prefix_len + suffix.len;
+
+ uint8_t* result = reinterpret_cast<uint8_t*>(malloc(buffer[i].len));
+ memcpy(result, last_value_.ptr, prefix_len);
+ memcpy(result + prefix_len, suffix.ptr, suffix.len);
+
+ buffer[i].ptr = result;
+ last_value_ = buffer[i];
+ }
+ this->num_values_ -= max_values;
+ return max_values;
+ }
+
+ private:
+ DeltaBitPackDecoder<Int32Type> prefix_len_decoder_;
+ DeltaLengthByteArrayDecoder suffix_decoder_;
+ ByteArray last_value_;
+};
+
+// ----------------------------------------------------------------------
+// BYTE_STREAM_SPLIT
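+//
+// The page stores the k-th byte of every value contiguously: a page of N
+// values of width W holds W streams of N bytes each, and value i is gathered
+// from byte i of every stream (see the scalar path in DecodeArrow below).
+// For example, the two floats 'abcd' and 'ABCD' are laid out as 'aAbBcCdD'.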
+
+template <typename DType>
+class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+ using T = typename DType::c_type;
+ explicit ByteStreamSplitDecoder(const ColumnDescriptor* descr);
+
+ int Decode(T* buffer, int max_values) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) override;
+
+ void SetData(int num_values, const uint8_t* data, int len) override;
+
+ T* EnsureDecodeBuffer(int64_t min_values) {
+ const int64_t size = sizeof(T) * min_values;
+ if (!decode_buffer_ || decode_buffer_->size() < size) {
+ PARQUET_ASSIGN_OR_THROW(decode_buffer_, ::arrow::AllocateBuffer(size));
+ }
+ return reinterpret_cast<T*>(decode_buffer_->mutable_data());
+ }
+
+ private:
+ int num_values_in_buffer_{0};
+ std::shared_ptr<Buffer> decode_buffer_;
+
+ static constexpr size_t kNumStreams = sizeof(T);
+};
+
+template <typename DType>
+ByteStreamSplitDecoder<DType>::ByteStreamSplitDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT) {}
+
+template <typename DType>
+void ByteStreamSplitDecoder<DType>::SetData(int num_values, const uint8_t* data,
+ int len) {
+ DecoderImpl::SetData(num_values, data, len);
+ if (num_values * static_cast<int64_t>(sizeof(T)) > len) {
+ throw ParquetException("Data size too small for number of values (corrupted file?)");
+ }
+ num_values_in_buffer_ = num_values;
+}
+
+template <typename DType>
+int ByteStreamSplitDecoder<DType>::Decode(T* buffer, int max_values) {
+ const int values_to_decode = std::min(num_values_, max_values);
+ const int num_decoded_previously = num_values_in_buffer_ - num_values_;
+ const uint8_t* data = data_ + num_decoded_previously;
+
+ ::arrow::util::internal::ByteStreamSplitDecode<T>(data, values_to_decode,
+ num_values_in_buffer_, buffer);
+ num_values_ -= values_to_decode;
+ len_ -= sizeof(T) * values_to_decode;
+ return values_to_decode;
+}
+
+template <typename DType>
+int ByteStreamSplitDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) {
+ constexpr int value_size = static_cast<int>(kNumStreams);
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ const int num_decoded_previously = num_values_in_buffer_ - num_values_;
+ const uint8_t* data = data_ + num_decoded_previously;
+ int offset = 0;
+
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+  // Use fast decoding into an intermediate buffer. This also decodes the null
+  // slots (producing garbage there), but the bulk decode is fast enough that
+  // the wasted work is negligible.
+ T* decode_out = EnsureDecodeBuffer(values_decoded);
+ ::arrow::util::internal::ByteStreamSplitDecode<T>(data, values_decoded,
+ num_values_in_buffer_, decode_out);
+
+ // XXX If null_count is 0, we could even append in bulk or decode directly into
+ // builder
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ builder->UnsafeAppend(decode_out[offset]);
+ ++offset;
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+#else
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * num_values_in_buffer_ + offset;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ builder->UnsafeAppend(::arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]));
+ ++offset;
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+#endif
+
+ num_values_ -= values_decoded;
+ len_ -= sizeof(T) * values_decoded;
+ return values_decoded;
+}
+
+template <typename DType>
+int ByteStreamSplitDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder");
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// Encoder and decoder factory functions
+
+std::unique_ptr<Encoder> MakeEncoder(Type::type type_num, Encoding::type encoding,
+ bool use_dictionary, const ColumnDescriptor* descr,
+ MemoryPool* pool) {
+ if (use_dictionary) {
+ switch (type_num) {
+ case Type::INT32:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<Int32Type>(descr, pool));
+ case Type::INT64:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<Int64Type>(descr, pool));
+ case Type::INT96:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<Int96Type>(descr, pool));
+ case Type::FLOAT:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<DoubleType>(descr, pool));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<ByteArrayType>(descr, pool));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<FLBAType>(descr, pool));
+ default:
+ DCHECK(false) << "Encoder not implemented";
+ break;
+ }
+ } else if (encoding == Encoding::PLAIN) {
+ switch (type_num) {
+ case Type::BOOLEAN:
+ return std::unique_ptr<Encoder>(new PlainEncoder<BooleanType>(descr, pool));
+ case Type::INT32:
+ return std::unique_ptr<Encoder>(new PlainEncoder<Int32Type>(descr, pool));
+ case Type::INT64:
+ return std::unique_ptr<Encoder>(new PlainEncoder<Int64Type>(descr, pool));
+ case Type::INT96:
+ return std::unique_ptr<Encoder>(new PlainEncoder<Int96Type>(descr, pool));
+ case Type::FLOAT:
+ return std::unique_ptr<Encoder>(new PlainEncoder<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Encoder>(new PlainEncoder<DoubleType>(descr, pool));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new PlainEncoder<ByteArrayType>(descr, pool));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new PlainEncoder<FLBAType>(descr, pool));
+ default:
+ DCHECK(false) << "Encoder not implemented";
+ break;
+ }
+ } else if (encoding == Encoding::BYTE_STREAM_SPLIT) {
+ switch (type_num) {
+ case Type::FLOAT:
+ return std::unique_ptr<Encoder>(
+ new ByteStreamSplitEncoder<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Encoder>(
+ new ByteStreamSplitEncoder<DoubleType>(descr, pool));
+ default:
+ throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE");
+ break;
+ }
+ } else {
+ ParquetException::NYI("Selected encoding is not supported");
+ }
+ DCHECK(false) << "Should not be able to reach this code";
+ return nullptr;
+}
+
+std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
+ const ColumnDescriptor* descr) {
+ if (encoding == Encoding::PLAIN) {
+ switch (type_num) {
+ case Type::BOOLEAN:
+ return std::unique_ptr<Decoder>(new PlainBooleanDecoder(descr));
+ case Type::INT32:
+ return std::unique_ptr<Decoder>(new PlainDecoder<Int32Type>(descr));
+ case Type::INT64:
+ return std::unique_ptr<Decoder>(new PlainDecoder<Int64Type>(descr));
+ case Type::INT96:
+ return std::unique_ptr<Decoder>(new PlainDecoder<Int96Type>(descr));
+ case Type::FLOAT:
+ return std::unique_ptr<Decoder>(new PlainDecoder<FloatType>(descr));
+ case Type::DOUBLE:
+ return std::unique_ptr<Decoder>(new PlainDecoder<DoubleType>(descr));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new PlainByteArrayDecoder(descr));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new PlainFLBADecoder(descr));
+ default:
+ break;
+ }
+ } else if (encoding == Encoding::BYTE_STREAM_SPLIT) {
+ switch (type_num) {
+ case Type::FLOAT:
+ return std::unique_ptr<Decoder>(new ByteStreamSplitDecoder<FloatType>(descr));
+ case Type::DOUBLE:
+ return std::unique_ptr<Decoder>(new ByteStreamSplitDecoder<DoubleType>(descr));
+ default:
+ throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE");
+ break;
+ }
+ } else {
+ ParquetException::NYI("Selected encoding is not supported");
+ }
+ DCHECK(false) << "Should not be able to reach this code";
+ return nullptr;
+}
+
+namespace detail {
+std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
+ const ColumnDescriptor* descr,
+ MemoryPool* pool) {
+ switch (type_num) {
+ case Type::BOOLEAN:
+ ParquetException::NYI("Dictionary encoding not implemented for boolean type");
+ case Type::INT32:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<Int32Type>(descr, pool));
+ case Type::INT64:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<Int64Type>(descr, pool));
+ case Type::INT96:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<Int96Type>(descr, pool));
+ case Type::FLOAT:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<DoubleType>(descr, pool));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new DictByteArrayDecoderImpl(descr, pool));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<FLBAType>(descr, pool));
+ default:
+ break;
+ }
+ DCHECK(false) << "Should not be able to reach this code";
+ return nullptr;
+}
+
+} // namespace detail
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h
new file mode 100644
index 00000000000..b9ca7a7ee68
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h
@@ -0,0 +1,460 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "arrow/util/spaced.h"
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+class ArrayBuilder;
+class BinaryArray;
+class BinaryBuilder;
+class BooleanBuilder;
+class Int32Type;
+class Int64Type;
+class FloatType;
+class DoubleType;
+class FixedSizeBinaryType;
+template <typename T>
+class NumericBuilder;
+class FixedSizeBinaryBuilder;
+template <typename T>
+class Dictionary32Builder;
+
+} // namespace arrow
+
+namespace parquet {
+
+template <typename DType>
+class TypedEncoder;
+
+using BooleanEncoder = TypedEncoder<BooleanType>;
+using Int32Encoder = TypedEncoder<Int32Type>;
+using Int64Encoder = TypedEncoder<Int64Type>;
+using Int96Encoder = TypedEncoder<Int96Type>;
+using FloatEncoder = TypedEncoder<FloatType>;
+using DoubleEncoder = TypedEncoder<DoubleType>;
+using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
+using FLBAEncoder = TypedEncoder<FLBAType>;
+
+template <typename DType>
+class TypedDecoder;
+
+class BooleanDecoder;
+using Int32Decoder = TypedDecoder<Int32Type>;
+using Int64Decoder = TypedDecoder<Int64Type>;
+using Int96Decoder = TypedDecoder<Int96Type>;
+using FloatDecoder = TypedDecoder<FloatType>;
+using DoubleDecoder = TypedDecoder<DoubleType>;
+using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
+class FLBADecoder;
+
+template <typename T>
+struct EncodingTraits;
+
+template <>
+struct EncodingTraits<BooleanType> {
+ using Encoder = BooleanEncoder;
+ using Decoder = BooleanDecoder;
+
+ using ArrowType = ::arrow::BooleanType;
+ using Accumulator = ::arrow::BooleanBuilder;
+ struct DictAccumulator {};
+};
+
+template <>
+struct EncodingTraits<Int32Type> {
+ using Encoder = Int32Encoder;
+ using Decoder = Int32Decoder;
+
+ using ArrowType = ::arrow::Int32Type;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
+};
+
+template <>
+struct EncodingTraits<Int64Type> {
+ using Encoder = Int64Encoder;
+ using Decoder = Int64Decoder;
+
+ using ArrowType = ::arrow::Int64Type;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
+};
+
+template <>
+struct EncodingTraits<Int96Type> {
+ using Encoder = Int96Encoder;
+ using Decoder = Int96Decoder;
+
+ struct Accumulator {};
+ struct DictAccumulator {};
+};
+
+template <>
+struct EncodingTraits<FloatType> {
+ using Encoder = FloatEncoder;
+ using Decoder = FloatDecoder;
+
+ using ArrowType = ::arrow::FloatType;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
+};
+
+template <>
+struct EncodingTraits<DoubleType> {
+ using Encoder = DoubleEncoder;
+ using Decoder = DoubleDecoder;
+
+ using ArrowType = ::arrow::DoubleType;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
+};
+
+template <>
+struct EncodingTraits<ByteArrayType> {
+ using Encoder = ByteArrayEncoder;
+ using Decoder = ByteArrayDecoder;
+
+ /// \brief Internal helper class for decoding BYTE_ARRAY data where we can
+ /// overflow the capacity of a single arrow::BinaryArray
+ struct Accumulator {
+ std::unique_ptr<::arrow::BinaryBuilder> builder;
+ std::vector<std::shared_ptr<::arrow::Array>> chunks;
+ };
+ using ArrowType = ::arrow::BinaryType;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
+};
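+
+// A minimal sketch of driving the chunked Accumulator above (the decoder,
+// counts and validity bitmap here are placeholders obtained elsewhere):
+//
+//   EncodingTraits<ByteArrayType>::Accumulator acc;
+//   acc.builder.reset(new ::arrow::BinaryBuilder);
+//   decoder->DecodeArrow(num_values, null_count, valid_bits,
+//                        /*valid_bits_offset=*/0, &acc);
+//   // Completed chunks land in acc.chunks; the builder holds the remainder.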
+
+template <>
+struct EncodingTraits<FLBAType> {
+ using Encoder = FLBAEncoder;
+ using Decoder = FLBADecoder;
+
+ using ArrowType = ::arrow::FixedSizeBinaryType;
+ using Accumulator = ::arrow::FixedSizeBinaryBuilder;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
+};
+
+class ColumnDescriptor;
+
+// Untyped base for all encoders
+class Encoder {
+ public:
+ virtual ~Encoder() = default;
+
+ virtual int64_t EstimatedDataEncodedSize() = 0;
+ virtual std::shared_ptr<Buffer> FlushValues() = 0;
+ virtual Encoding::type encoding() const = 0;
+
+ virtual void Put(const ::arrow::Array& values) = 0;
+
+ virtual MemoryPool* memory_pool() const = 0;
+};
+
+// Base class for value encoders. Since encoders may or may not have state
+// (e.g., dictionary encoding), we use a class instance to maintain any state.
+//
+// Encode interfaces are internal, subject to change without deprecation.
+template <typename DType>
+class TypedEncoder : virtual public Encoder {
+ public:
+  using T = typename DType::c_type;
+
+ using Encoder::Put;
+
+ virtual void Put(const T* src, int num_values) = 0;
+
+ virtual void Put(const std::vector<T>& src, int num_values = -1);
+
+ virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) = 0;
+};
+
+template <typename DType>
+void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
+ if (num_values == -1) {
+ num_values = static_cast<int>(src.size());
+ }
+ Put(src.data(), num_values);
+}
+
+template <>
+inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
+ // NOTE(wesm): This stub is here only to satisfy the compiler; it is
+ // overridden later with the actual implementation
+}
+
+// Base class for dictionary encoders
+template <typename DType>
+class DictEncoder : virtual public TypedEncoder<DType> {
+ public:
+ /// Writes out any buffered indices to buffer preceded by the bit width of this data.
+ /// Returns the number of bytes written.
+ /// If the supplied buffer is not big enough, returns -1.
+ /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
+ /// to size buffer.
+ virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
+
+ virtual int dict_encoded_size() = 0;
+
+ virtual int bit_width() const = 0;
+
+ /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
+ /// dict_encoded_size() bytes.
+ virtual void WriteDict(uint8_t* buffer) = 0;
+
+ virtual int num_entries() const = 0;
+
+ /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
+ /// assumed (without any boundschecking) that the indices reference
+ /// pre-existing dictionary values
+ /// \param[in] indices the dictionary index values. Only Int32Array currently
+ /// supported
+ virtual void PutIndices(const ::arrow::Array& indices) = 0;
+
+ /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
+ /// separately. Currently throws exception if the current dictionary memo is
+ /// non-empty
+ /// \param[in] values the dictionary values. Only valid for certain
+ /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
+ virtual void PutDictionary(const ::arrow::Array& values) = 0;
+};
+
+// ----------------------------------------------------------------------
+// Value decoding
+
+class Decoder {
+ public:
+ virtual ~Decoder() = default;
+
+ // Sets the data for a new page. This will be called multiple times on the same
+ // decoder and should reset all internal state.
+ virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
+
+ // Returns the number of values left (for the last call to SetData()). This is
+ // the number of values left in this page.
+ virtual int values_left() const = 0;
+ virtual Encoding::type encoding() const = 0;
+};
+
+template <typename DType>
+class TypedDecoder : virtual public Decoder {
+ public:
+ using T = typename DType::c_type;
+
+ /// \brief Decode values into a buffer
+ ///
+ /// Subclasses may override the more specialized Decode methods below.
+ ///
+ /// \param[in] buffer destination for decoded values
+ /// \param[in] max_values maximum number of values to decode
+ /// \return The number of values decoded. Should be identical to max_values except
+ /// at the end of the current data page.
+ virtual int Decode(T* buffer, int max_values) = 0;
+
+ /// \brief Decode the values in this data page but leave spaces for null entries.
+ ///
+ /// \param[in] buffer destination for decoded values
+  /// \param[in] num_values number of slots in the buffer, including the
+  /// null slots
+ /// \param[in] null_count number of null slots
+ /// \param[in] valid_bits bitmap data indicating position of valid slots
+ /// \param[in] valid_bits_offset offset into valid_bits
+ /// \return The number of values decoded, including nulls.
+ virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset) {
+ if (null_count > 0) {
+ int values_to_read = num_values - null_count;
+ int values_read = Decode(buffer, values_to_read);
+ if (values_read != values_to_read) {
+ throw ParquetException("Number of values / definition_levels read did not match");
+ }
+
+ return ::arrow::util::internal::SpacedExpand<T>(buffer, num_values, null_count,
+ valid_bits, valid_bits_offset);
+ } else {
+ return Decode(buffer, num_values);
+ }
+ }
+
+ /// \brief Decode into an ArrayBuilder or other accumulator
+ ///
+ /// This function assumes the definition levels were already decoded
+ /// as a validity bitmap in the given `valid_bits`. `null_count`
+ /// is the number of 0s in `valid_bits`.
+ /// As a space optimization, it is allowed for `valid_bits` to be null
+ /// if `null_count` is zero.
+ ///
+ /// \return number of values decoded
+ virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* out) = 0;
+
+ /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
+ ///
+ /// \return number of values decoded
+ int DecodeArrowNonNull(int num_values,
+ typename EncodingTraits<DType>::Accumulator* out) {
+ return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
+ }
+
+ /// \brief Decode into a DictionaryBuilder
+ ///
+ /// This function assumes the definition levels were already decoded
+ /// as a validity bitmap in the given `valid_bits`. `null_count`
+ /// is the number of 0s in `valid_bits`.
+ /// As a space optimization, it is allowed for `valid_bits` to be null
+ /// if `null_count` is zero.
+ ///
+ /// \return number of values decoded
+ virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
+
+ /// \brief Decode into a DictionaryBuilder ignoring nulls
+ ///
+ /// \return number of values decoded
+ int DecodeArrowNonNull(int num_values,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
+ }
+};
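+
+// A minimal sketch of DecodeSpaced usage (the decoder, counts and validity
+// bitmap are placeholders; error handling is omitted):
+//
+//   std::vector<int32_t> out(num_values);
+//   int decoded = int32_decoder->DecodeSpaced(out.data(), num_values,
+//                                             null_count, valid_bits,
+//                                             /*valid_bits_offset=*/0);
+//   // 'out' now holds num_values slots, with undefined gaps at the null
+//   // positions marked by valid_bits.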
+
+template <typename DType>
+class DictDecoder : virtual public TypedDecoder<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
+
+ /// \brief Insert dictionary values into the Arrow dictionary builder's memo,
+ /// but do not append any indices
+ virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
+
+ /// \brief Decode only dictionary indices and append to dictionary
+ /// builder. The builder must have had the dictionary from this decoder
+ /// inserted already.
+ ///
+ /// \warning Remember to reset the builder each time the dict decoder is initialized
+ /// with a new dictionary page
+ virtual int DecodeIndicesSpaced(int num_values, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset,
+ ::arrow::ArrayBuilder* builder) = 0;
+
+ /// \brief Decode only dictionary indices (no nulls)
+ ///
+ /// \warning Remember to reset the builder each time the dict decoder is initialized
+ /// with a new dictionary page
+ virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
+
+ /// \brief Decode only dictionary indices (no nulls). Same as above
+ /// DecodeIndices but target is an array instead of a builder.
+ ///
+ /// \note API EXPERIMENTAL
+ virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
+
+ /// \brief Get dictionary. The reader will call this API when it encounters a
+ /// new dictionary.
+ ///
+ /// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by
+ /// the decoder and is destroyed when the decoder is destroyed.
+ /// @param[out] dictionary_length The dictionary length.
+ ///
+ /// \note API EXPERIMENTAL
+ virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
+};
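+
+// A typical dictionary decode flow, as a sketch (page buffers, sizes and the
+// descriptor are placeholders; the factories used are declared below; error
+// handling is omitted):
+//
+//   auto dict_decoder = MakeDictDecoder<ByteArrayType>(descr, pool);
+//   // Decode the dictionary page with a PLAIN decoder, then hand it over.
+//   auto plain = MakeTypedDecoder<ByteArrayType>(Encoding::PLAIN, descr);
+//   plain->SetData(dict_size, dict_page_data, dict_page_len);
+//   dict_decoder->SetDict(plain.get());
+//   // Then decode the index data pages.
+//   dict_decoder->SetData(num_values, data_page_data, data_page_len);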
+
+// ----------------------------------------------------------------------
+// TypedEncoder specializations, traits, and factory functions
+
+class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
+ public:
+ using TypedDecoder<BooleanType>::Decode;
+ virtual int Decode(uint8_t* buffer, int max_values) = 0;
+};
+
+class FLBADecoder : virtual public TypedDecoder<FLBAType> {
+ public:
+ using TypedDecoder<FLBAType>::DecodeSpaced;
+
+ // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
+ // there is value in adding specialized read methods for
+ // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
+ // then perhaps not
+};
+
+PARQUET_EXPORT
+std::unique_ptr<Encoder> MakeEncoder(
+ Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
+ const ColumnDescriptor* descr = NULLPTR,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+template <typename DType>
+std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
+ Encoding::type encoding, bool use_dictionary = false,
+ const ColumnDescriptor* descr = NULLPTR,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ using OutType = typename EncodingTraits<DType>::Encoder;
+ std::unique_ptr<Encoder> base =
+ MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
+ return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
+}
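+
+// For example (a sketch; the values vector is a placeholder):
+//
+//   auto encoder = MakeTypedEncoder<Int32Type>(Encoding::PLAIN);
+//   encoder->Put(values.data(), static_cast<int>(values.size()));
+//   std::shared_ptr<Buffer> encoded = encoder->FlushValues();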
+
+PARQUET_EXPORT
+std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
+ const ColumnDescriptor* descr = NULLPTR);
+
+namespace detail {
+
+PARQUET_EXPORT
+std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool);
+
+} // namespace detail
+
+template <typename DType>
+std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
+ const ColumnDescriptor* descr = NULLPTR,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ using OutType = DictDecoder<DType>;
+ auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
+ return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
+}
+
+template <typename DType>
+std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
+ Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) {
+ using OutType = typename EncodingTraits<DType>::Decoder;
+ std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr);
+ return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
+}
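+
+// For example (a sketch; the page buffer and counts come from elsewhere):
+//
+//   auto decoder = MakeTypedDecoder<Int32Type>(Encoding::PLAIN);
+//   decoder->SetData(num_values, page_data, page_len);
+//   int read = decoder->Decode(out_buffer, num_values);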
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc
new file mode 100644
index 00000000000..5927503aba3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc
@@ -0,0 +1,412 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/encryption.h"
+
+#include <string.h>
+
+#include <map>
+#include <utility>
+
+#include "arrow/util/logging.h"
+#include "arrow/util/utf8.h"
+#include "parquet/encryption/encryption_internal.h"
+
+namespace parquet {
+
+// integer key retriever
+void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) {
+ key_map_.insert({key_id, key});
+}
+
+std::string IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) {
+ uint32_t key_id;
+ memcpy(reinterpret_cast<uint8_t*>(&key_id), key_metadata.c_str(), 4);
+
+ return key_map_.at(key_id);
+}
+
+// string key retriever
+void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) {
+ key_map_.insert({key_id, key});
+}
+
+std::string StringKeyIdRetriever::GetKey(const std::string& key_id) {
+ return key_map_.at(key_id);
+}
+
+ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key(
+ std::string column_key) {
+ if (column_key.empty()) return this;
+
+ DCHECK(key_.empty());
+ key_ = column_key;
+ return this;
+}
+
+ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_metadata(
+ const std::string& key_metadata) {
+ DCHECK(!key_metadata.empty());
+ DCHECK(key_metadata_.empty());
+ key_metadata_ = key_metadata;
+ return this;
+}
+
+ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id(
+ const std::string& key_id) {
+ // key_id is expected to be in UTF8 encoding
+ ::arrow::util::InitializeUTF8();
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
+ if (!::arrow::util::ValidateUTF8(data, key_id.size())) {
+ throw ParquetException("key id should be in UTF8 encoding");
+ }
+
+ DCHECK(!key_id.empty());
+ this->key_metadata(key_id);
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_keys(
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties) {
+ if (column_decryption_properties.size() == 0) return this;
+
+ if (column_decryption_properties_.size() != 0)
+ throw ParquetException("Column properties already set");
+
+ for (const auto& element : column_decryption_properties) {
+ if (element.second->is_utilized()) {
+ throw ParquetException("Column properties utilized in another file");
+ }
+ element.second->set_utilized();
+ }
+
+ column_decryption_properties_ = column_decryption_properties;
+ return this;
+}
+
+void FileDecryptionProperties::WipeOutDecryptionKeys() {
+ footer_key_.clear();
+
+ for (const auto& element : column_decryption_properties_) {
+ element.second->WipeOutDecryptionKey();
+ }
+}
+
+bool FileDecryptionProperties::is_utilized() {
+ if (footer_key_.empty() && column_decryption_properties_.size() == 0 &&
+ aad_prefix_.empty())
+ return false;
+
+ return utilized_;
+}
+
+std::shared_ptr<FileDecryptionProperties> FileDecryptionProperties::DeepClone(
+ std::string new_aad_prefix) {
+ std::string footer_key_copy = footer_key_;
+ ColumnPathToDecryptionPropertiesMap column_decryption_properties_map_copy;
+
+ for (const auto& element : column_decryption_properties_) {
+ column_decryption_properties_map_copy.insert(
+ {element.second->column_path(), element.second->DeepClone()});
+ }
+
+ if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_;
+ return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
+ footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, new_aad_prefix,
+ aad_prefix_verifier_, column_decryption_properties_map_copy,
+ plaintext_files_allowed_));
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::footer_key(
+ const std::string footer_key) {
+ if (footer_key.empty()) {
+ return this;
+ }
+ DCHECK(footer_key_.empty());
+ footer_key_ = footer_key;
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::key_retriever(
+ const std::shared_ptr<DecryptionKeyRetriever>& key_retriever) {
+ if (key_retriever == nullptr) return this;
+
+ DCHECK(key_retriever_ == nullptr);
+ key_retriever_ = key_retriever;
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix(
+ const std::string& aad_prefix) {
+ if (aad_prefix.empty()) {
+ return this;
+ }
+ DCHECK(aad_prefix_.empty());
+ aad_prefix_ = aad_prefix;
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix_verifier(
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier) {
+ if (aad_prefix_verifier == nullptr) return this;
+
+ DCHECK(aad_prefix_verifier_ == nullptr);
+ aad_prefix_verifier_ = std::move(aad_prefix_verifier);
+ return this;
+}
+
+ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key(
+ const std::string& key) {
+ if (key.empty()) return this;
+
+ key_ = key;
+ return this;
+}
+
+std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::Builder::build() {
+ return std::shared_ptr<ColumnDecryptionProperties>(
+ new ColumnDecryptionProperties(column_path_, key_));
+}
+
+void ColumnDecryptionProperties::WipeOutDecryptionKey() { key_.clear(); }
+
+std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::DeepClone() {
+ std::string key_copy = key_;
+ return std::shared_ptr<ColumnDecryptionProperties>(
+ new ColumnDecryptionProperties(column_path_, key_copy));
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_metadata(
+ const std::string& footer_key_metadata) {
+ if (footer_key_metadata.empty()) return this;
+
+ DCHECK(footer_key_metadata_.empty());
+ footer_key_metadata_ = footer_key_metadata;
+ return this;
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::encrypted_columns(
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns) {
+ if (encrypted_columns.size() == 0) return this;
+
+ if (encrypted_columns_.size() != 0)
+ throw ParquetException("Column properties already set");
+
+ for (const auto& element : encrypted_columns) {
+ if (element.second->is_utilized()) {
+ throw ParquetException("Column properties utilized in another file");
+ }
+ element.second->set_utilized();
+ }
+ encrypted_columns_ = encrypted_columns;
+ return this;
+}
+
+void FileEncryptionProperties::WipeOutEncryptionKeys() {
+ footer_key_.clear();
+ for (const auto& element : encrypted_columns_) {
+ element.second->WipeOutEncryptionKey();
+ }
+}
+
+std::shared_ptr<FileEncryptionProperties> FileEncryptionProperties::DeepClone(
+ std::string new_aad_prefix) {
+ std::string footer_key_copy = footer_key_;
+ ColumnPathToEncryptionPropertiesMap encrypted_columns_map_copy;
+
+ for (const auto& element : encrypted_columns_) {
+ encrypted_columns_map_copy.insert(
+ {element.second->column_path(), element.second->DeepClone()});
+ }
+
+ if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_;
+ return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
+ algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_,
+ new_aad_prefix, store_aad_prefix_in_file_, encrypted_columns_map_copy));
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix(
+ const std::string& aad_prefix) {
+ if (aad_prefix.empty()) return this;
+
+ DCHECK(aad_prefix_.empty());
+ aad_prefix_ = aad_prefix;
+ store_aad_prefix_in_file_ = true;
+ return this;
+}
+
+FileEncryptionProperties::Builder*
+FileEncryptionProperties::Builder::disable_aad_prefix_storage() {
+ DCHECK(!aad_prefix_.empty());
+
+ store_aad_prefix_in_file_ = false;
+ return this;
+}
+
+ColumnEncryptionProperties::ColumnEncryptionProperties(bool encrypted,
+ const std::string& column_path,
+ const std::string& key,
+ const std::string& key_metadata)
+ : column_path_(column_path) {
+ // column encryption properties object (with a column key) can be used for writing only
+ // one file.
+ // Upon completion of file writing, the encryption keys in the properties will be wiped
+ // out (set to 0 in memory).
+ utilized_ = false;
+
+ DCHECK(!column_path.empty());
+ if (!encrypted) {
+ DCHECK(key.empty() && key_metadata.empty());
+ }
+
+ if (!key.empty()) {
+ DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+ }
+
+ encrypted_with_footer_key_ = (encrypted && key.empty());
+ if (encrypted_with_footer_key_) {
+ DCHECK(key_metadata.empty());
+ }
+
+ encrypted_ = encrypted;
+ key_metadata_ = key_metadata;
+ key_ = key;
+}
+
+ColumnDecryptionProperties::ColumnDecryptionProperties(const std::string& column_path,
+ const std::string& key)
+ : column_path_(column_path) {
+ utilized_ = false;
+ DCHECK(!column_path.empty());
+
+ if (!key.empty()) {
+ DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+ }
+
+ key_ = key;
+}
+
+std::string FileDecryptionProperties::column_key(const std::string& column_path) const {
+ if (column_decryption_properties_.find(column_path) !=
+ column_decryption_properties_.end()) {
+ auto column_prop = column_decryption_properties_.at(column_path);
+ if (column_prop != nullptr) {
+ return column_prop->key();
+ }
+ }
+ return empty_string_;
+}
+
+FileDecryptionProperties::FileDecryptionProperties(
+ const std::string& footer_key, std::shared_ptr<DecryptionKeyRetriever> key_retriever,
+ bool check_plaintext_footer_integrity, const std::string& aad_prefix,
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
+ bool plaintext_files_allowed) {
+ DCHECK(!footer_key.empty() || nullptr != key_retriever ||
+ 0 != column_decryption_properties.size());
+
+ if (!footer_key.empty()) {
+ DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
+ footer_key.length() == 32);
+ }
+ if (footer_key.empty() && check_plaintext_footer_integrity) {
+ DCHECK(nullptr != key_retriever);
+ }
+ aad_prefix_verifier_ = std::move(aad_prefix_verifier);
+ footer_key_ = footer_key;
+ check_plaintext_footer_integrity_ = check_plaintext_footer_integrity;
+ key_retriever_ = std::move(key_retriever);
+ aad_prefix_ = aad_prefix;
+ column_decryption_properties_ = column_decryption_properties;
+ plaintext_files_allowed_ = plaintext_files_allowed;
+ utilized_ = false;
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id(
+ const std::string& key_id) {
+ // key_id is expected to be in UTF8 encoding
+ ::arrow::util::InitializeUTF8();
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
+ if (!::arrow::util::ValidateUTF8(data, key_id.size())) {
+ throw ParquetException("footer key id should be in UTF8 encoding");
+ }
+
+ if (key_id.empty()) {
+ return this;
+ }
+
+ return footer_key_metadata(key_id);
+}
+
+std::shared_ptr<ColumnEncryptionProperties>
+FileEncryptionProperties::column_encryption_properties(const std::string& column_path) {
+ if (encrypted_columns_.size() == 0) {
+ auto builder = std::make_shared<ColumnEncryptionProperties::Builder>(column_path);
+ return builder->build();
+ }
+ if (encrypted_columns_.find(column_path) != encrypted_columns_.end()) {
+ return encrypted_columns_[column_path];
+ }
+
+ return nullptr;
+}
+
+FileEncryptionProperties::FileEncryptionProperties(
+ ParquetCipher::type cipher, const std::string& footer_key,
+ const std::string& footer_key_metadata, bool encrypted_footer,
+ const std::string& aad_prefix, bool store_aad_prefix_in_file,
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns)
+ : footer_key_(footer_key),
+ footer_key_metadata_(footer_key_metadata),
+ encrypted_footer_(encrypted_footer),
+ aad_prefix_(aad_prefix),
+ store_aad_prefix_in_file_(store_aad_prefix_in_file),
+ encrypted_columns_(encrypted_columns) {
+ // file encryption properties object can be used for writing only one file.
+ // Upon completion of file writing, the encryption keys in the properties will be wiped
+ // out (set to 0 in memory).
+ utilized_ = false;
+
+ DCHECK(!footer_key.empty());
+ // footer_key must be either 16, 24 or 32 bytes.
+ DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
+ footer_key.length() == 32);
+
+ uint8_t aad_file_unique[kAadFileUniqueLength];
+ memset(aad_file_unique, 0, kAadFileUniqueLength);
+  encryption::RandBytes(aad_file_unique, kAadFileUniqueLength);
+ std::string aad_file_unique_str(reinterpret_cast<char const*>(aad_file_unique),
+ kAadFileUniqueLength);
+
+ bool supply_aad_prefix = false;
+ if (aad_prefix.empty()) {
+ file_aad_ = aad_file_unique_str;
+ } else {
+ file_aad_ = aad_prefix + aad_file_unique_str;
+ if (!store_aad_prefix_in_file) supply_aad_prefix = true;
+ }
+ algorithm_.algorithm = cipher;
+ algorithm_.aad.aad_file_unique = aad_file_unique_str;
+ algorithm_.aad.supply_aad_prefix = supply_aad_prefix;
+ if (!aad_prefix.empty() && store_aad_prefix_in_file) {
+ algorithm_.aad.aad_prefix = aad_prefix;
+ }
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h
new file mode 100644
index 00000000000..8fd7ec8d3d0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h
@@ -0,0 +1,510 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
+ ParquetCipher::AES_GCM_V1;
+static constexpr int32_t kMaximalAadMetadataLength = 256;
+static constexpr bool kDefaultEncryptedFooter = true;
+static constexpr bool kDefaultCheckSignature = true;
+static constexpr bool kDefaultAllowPlaintextFiles = false;
+static constexpr int32_t kAadFileUniqueLength = 8;
+
+class ColumnDecryptionProperties;
+using ColumnPathToDecryptionPropertiesMap =
+ std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;
+
+class ColumnEncryptionProperties;
+using ColumnPathToEncryptionPropertiesMap =
+ std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;
+
+class PARQUET_EXPORT DecryptionKeyRetriever {
+ public:
+ virtual std::string GetKey(const std::string& key_metadata) = 0;
+ virtual ~DecryptionKeyRetriever() {}
+};
+
+/// Simple integer key retriever
+class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
+ public:
+ void PutKey(uint32_t key_id, const std::string& key);
+ std::string GetKey(const std::string& key_metadata) override;
+
+ private:
+ std::map<uint32_t, std::string> key_map_;
+};
+
+// Simple string key retriever
+class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
+ public:
+ void PutKey(const std::string& key_id, const std::string& key);
+ std::string GetKey(const std::string& key_metadata) override;
+
+ private:
+ std::map<std::string, std::string> key_map_;
+};
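+
+// A sketch of wiring a retriever into file decryption (the
+// FileDecryptionProperties builder is declared further down in this header;
+// the key ids and key bytes are illustrative):
+//
+//   auto retriever = std::make_shared<StringKeyIdRetriever>();
+//   retriever->PutKey("kf", footer_key_16_bytes);
+//   retriever->PutKey("kc1", column_key_16_bytes);
+//   FileDecryptionProperties::Builder builder;
+//   auto decryption_props = builder.key_retriever(retriever)->build();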
+
+class PARQUET_EXPORT HiddenColumnException : public ParquetException {
+ public:
+ explicit HiddenColumnException(const std::string& columnPath)
+ : ParquetException(columnPath.c_str()) {}
+};
+
+class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
+ public:
+ explicit KeyAccessDeniedException(const std::string& columnPath)
+ : ParquetException(columnPath.c_str()) {}
+};
+
+inline const uint8_t* str2bytes(const std::string& str) {
+  if (str.empty()) return NULLPTR;
+
+  return reinterpret_cast<const uint8_t*>(str.c_str());
+}
+
+class PARQUET_EXPORT ColumnEncryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ /// Convenience builder for encrypted columns.
+ explicit Builder(const std::string& name) : Builder(name, true) {}
+
+ /// Convenience builder for encrypted columns.
+ explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
+ : Builder(path->ToDotString(), true) {}
+
+    /// Set a column-specific key.
+    /// If a key is not set on an encrypted column, the column will
+    /// be encrypted with the footer key.
+    /// The key length must be either 16, 24 or 32 bytes.
+    /// The key is copied, and will be wiped out (set to zeros) upon
+    /// completion of file writing.
+    /// The caller is responsible for wiping out the input key string.
+ Builder* key(std::string column_key);
+
+    /// Set key retrieval metadata.
+    /// Use either key_metadata() or key_id(), not both.
+ Builder* key_metadata(const std::string& key_metadata);
+
+    /// A convenience function to set the key retrieval metadata from a
+    /// string id. Use either key_metadata() or key_id(), not both.
+    /// key_id will be converted to UTF-8 key metadata.
+ Builder* key_id(const std::string& key_id);
+
+ std::shared_ptr<ColumnEncryptionProperties> build() {
+ return std::shared_ptr<ColumnEncryptionProperties>(
+ new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_));
+ }
+
+ private:
+ const std::string column_path_;
+ bool encrypted_;
+ std::string key_;
+ std::string key_metadata_;
+
+ Builder(const std::string path, bool encrypted)
+ : column_path_(path), encrypted_(encrypted) {}
+ };
+
+ std::string column_path() const { return column_path_; }
+ bool is_encrypted() const { return encrypted_; }
+ bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
+ std::string key() const { return key_; }
+ std::string key_metadata() const { return key_metadata_; }
+
+ /// Upon completion of file writing, the encryption key
+ /// will be wiped out.
+ void WipeOutEncryptionKey() { key_.clear(); }
+
+ bool is_utilized() {
+ if (key_.empty())
+ return false; // can re-use column properties without encryption keys
+ return utilized_;
+ }
+
+  /// A ColumnEncryptionProperties object can be used for writing one file only.
+  /// Mark ColumnEncryptionProperties as utilized once it is used in
+  /// FileEncryptionProperties, since the encryption key will be wiped out upon
+  /// completion of file writing.
+ void set_utilized() { utilized_ = true; }
+
+ std::shared_ptr<ColumnEncryptionProperties> DeepClone() {
+ std::string key_copy = key_;
+ return std::shared_ptr<ColumnEncryptionProperties>(new ColumnEncryptionProperties(
+ encrypted_, column_path_, key_copy, key_metadata_));
+ }
+
+ ColumnEncryptionProperties() = default;
+ ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default;
+ ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default;
+
+ private:
+ const std::string column_path_;
+ bool encrypted_;
+ bool encrypted_with_footer_key_;
+ std::string key_;
+ std::string key_metadata_;
+ bool utilized_;
+ explicit ColumnEncryptionProperties(bool encrypted, const std::string& column_path,
+ const std::string& key,
+ const std::string& key_metadata);
+};
+
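+/// A minimal sketch of building column encryption properties; the column
+/// path "a.b.c", the key id "kc1" and the column_key string are hypothetical:
+///
+///   ColumnEncryptionProperties::Builder col_builder("a.b.c");
+///   auto col_props = col_builder.key(column_key)->key_id("kc1")->build();
+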
+class PARQUET_EXPORT ColumnDecryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ explicit Builder(const std::string& name) : column_path_(name) {}
+
+ explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
+ : Builder(path->ToDotString()) {}
+
+    /// Set an explicit column key. If applied to a file that contains
+    /// key metadata for this column, the metadata will be ignored and
+    /// the column will be decrypted with this key.
+    /// The key length must be either 16, 24 or 32 bytes.
+ Builder* key(const std::string& key);
+
+ std::shared_ptr<ColumnDecryptionProperties> build();
+
+ private:
+ const std::string column_path_;
+ std::string key_;
+ };
+
+ ColumnDecryptionProperties() = default;
+ ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default;
+ ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default;
+
+ std::string column_path() const { return column_path_; }
+ std::string key() const { return key_; }
+ bool is_utilized() { return utilized_; }
+
+  /// A ColumnDecryptionProperties object can be used for reading one file only.
+  /// Mark ColumnDecryptionProperties as utilized once it is used in
+  /// FileDecryptionProperties, since the encryption key will be wiped out upon
+  /// completion of file reading.
+ void set_utilized() { utilized_ = true; }
+
+ /// Upon completion of file reading, the encryption key
+ /// will be wiped out.
+ void WipeOutDecryptionKey();
+
+ std::shared_ptr<ColumnDecryptionProperties> DeepClone();
+
+ private:
+ const std::string column_path_;
+ std::string key_;
+ bool utilized_;
+
+  /// This class is only required for setting explicit column decryption keys:
+  /// to override the key retriever, or to provide keys when key metadata
+  /// and/or a key retriever are not available.
+ explicit ColumnDecryptionProperties(const std::string& column_path,
+ const std::string& key);
+};
+
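+/// The decryption counterpart, as a sketch with hypothetical names; an
+/// explicit key overrides whatever key metadata the file carries:
+///
+///   ColumnDecryptionProperties::Builder col_builder("a.b.c");
+///   auto col_props = col_builder.key(column_key)->build();
+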
+class PARQUET_EXPORT AADPrefixVerifier {
+ public:
+  /// Verifies the identity (AAD prefix) of an individual file,
+  /// or of a file collection in a data set.
+  /// Throws an exception if an AAD prefix is wrong.
+  /// In a data set, AAD prefixes should be collected,
+  /// and then checked for missing files.
+ virtual void Verify(const std::string& aad_prefix) = 0;
+ virtual ~AADPrefixVerifier() {}
+};
+
+class PARQUET_EXPORT FileDecryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ Builder() {
+ check_plaintext_footer_integrity_ = kDefaultCheckSignature;
+ plaintext_files_allowed_ = kDefaultAllowPlaintextFiles;
+ }
+
+    /// Set an explicit footer key. If applied to a file that contains
+    /// footer key metadata, the metadata will be ignored and the footer
+    /// will be decrypted/verified with this key.
+    /// If an explicit key is not set, the footer key will be fetched from
+    /// the key retriever.
+    /// With explicit keys or an AAD prefix, a new decryption properties object
+    /// must be created for each encrypted file.
+    /// Explicit encryption keys (footer and column) are cloned.
+    /// Upon completion of file reading, the cloned encryption keys in the
+    /// properties will be wiped out (array values set to 0).
+    /// The caller is responsible for wiping out the input key array.
+    /// The footer key length must be either 16, 24 or 32 bytes.
+ Builder* footer_key(const std::string footer_key);
+
+ /// Set explicit column keys (decryption properties).
+    /// It's also possible to set a key retriever on this property object.
+ /// Upon file decryption, availability of explicit keys is checked before
+ /// invocation of the retriever callback.
+ /// If an explicit key is available for a footer or a column,
+ /// its key metadata will be ignored.
+ Builder* column_keys(
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties);
+
+    /// Set a key retriever callback. It's also possible to
+ /// set explicit footer or column keys on this file property object.
+ /// Upon file decryption, availability of explicit keys is checked before
+ /// invocation of the retriever callback.
+ /// If an explicit key is available for a footer or a column,
+ /// its key metadata will be ignored.
+ Builder* key_retriever(const std::shared_ptr<DecryptionKeyRetriever>& key_retriever);
+
+ /// Skip integrity verification of plaintext footers.
+    /// If not called, the integrity of plaintext footers will be checked at runtime,
+ /// and an exception will be thrown in the following situations:
+ /// - footer signing key is not available
+ /// (not passed, or not found by key retriever)
+ /// - footer content and signature don't match
+ Builder* disable_footer_signature_verification() {
+ check_plaintext_footer_integrity_ = false;
+ return this;
+ }
+
+ /// Explicitly supply the file AAD prefix.
+    /// Required when a prefix is used for file encryption but not stored in the file.
+    /// If the AAD prefix is stored in the file, it will be compared to the explicitly
+    /// supplied value, and an exception will be thrown if they differ.
+ Builder* aad_prefix(const std::string& aad_prefix);
+
+ /// Set callback for verification of AAD Prefixes stored in file.
+ Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);
+
+    /// By default, reading plaintext (unencrypted) files is not allowed when
+    /// using a decryptor, in order to detect files that were left unencrypted
+    /// by mistake.
+    /// However, the default behavior can be overridden by calling this method.
+    /// The caller should then use a different mechanism to ensure encryption
+    /// of files with sensitive data.
+ Builder* plaintext_files_allowed() {
+ plaintext_files_allowed_ = true;
+ return this;
+ }
+
+ std::shared_ptr<FileDecryptionProperties> build() {
+ return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
+ footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
+ aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
+ }
+
+ private:
+ std::string footer_key_;
+ std::string aad_prefix_;
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
+ ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
+
+ std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+ bool check_plaintext_footer_integrity_;
+ bool plaintext_files_allowed_;
+ };
+
+ std::string column_key(const std::string& column_path) const;
+
+ std::string footer_key() const { return footer_key_; }
+
+ std::string aad_prefix() const { return aad_prefix_; }
+
+ const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
+ return key_retriever_;
+ }
+
+ bool check_plaintext_footer_integrity() const {
+ return check_plaintext_footer_integrity_;
+ }
+
+ bool plaintext_files_allowed() const { return plaintext_files_allowed_; }
+
+ const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
+ return aad_prefix_verifier_;
+ }
+
+ /// Upon completion of file reading, the encryption keys in the properties
+ /// will be wiped out (array values set to 0).
+ void WipeOutDecryptionKeys();
+
+ bool is_utilized();
+
+  /// A FileDecryptionProperties object can be used for reading one file only.
+  /// Mark FileDecryptionProperties as utilized once it is used to read a file,
+  /// since the encryption keys will be wiped out upon completion of file reading.
+ void set_utilized() { utilized_ = true; }
+
+  /// A FileDecryptionProperties object can be used for reading one file only
+  /// (unless it holds only the key retrieval callback, with no explicit
+  /// keys or AAD prefix). At the end, the keys are wiped out in memory.
+  /// This method allows cloning identical properties for another file,
+  /// with an option to update the AAD prefix (if new_aad_prefix is empty,
+  /// the AAD prefix will be cloned too).
+ std::shared_ptr<FileDecryptionProperties> DeepClone(std::string new_aad_prefix = "");
+
+ private:
+ std::string footer_key_;
+ std::string aad_prefix_;
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
+
+ const std::string empty_string_ = "";
+ ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
+
+ std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+ bool check_plaintext_footer_integrity_;
+ bool plaintext_files_allowed_;
+ bool utilized_;
+
+ FileDecryptionProperties(
+ const std::string& footer_key,
+ std::shared_ptr<DecryptionKeyRetriever> key_retriever,
+ bool check_plaintext_footer_integrity, const std::string& aad_prefix,
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
+ bool plaintext_files_allowed);
+};
+
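+/// A minimal sketch of assembling file decryption properties from a key
+/// retriever (assuming a retriever set up as in the sketch further above):
+///
+///   FileDecryptionProperties::Builder file_builder;
+///   auto decryption_props = file_builder.key_retriever(retriever)->build();
+///
+/// The resulting properties are typically installed in the reader properties
+/// before opening an encrypted file.
+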
+class PARQUET_EXPORT FileEncryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ explicit Builder(const std::string& footer_key)
+ : parquet_cipher_(kDefaultEncryptionAlgorithm),
+ encrypted_footer_(kDefaultEncryptedFooter) {
+ footer_key_ = footer_key;
+ store_aad_prefix_in_file_ = false;
+ }
+
+    /// Create files with a plaintext footer.
+    /// If not called, the files will be created with an encrypted footer (the default).
+ Builder* set_plaintext_footer() {
+ encrypted_footer_ = false;
+ return this;
+ }
+
+    /// Set the encryption algorithm.
+    /// If not called, files will be encrypted with AES_GCM_V1 (the default).
+ Builder* algorithm(ParquetCipher::type parquet_cipher) {
+ parquet_cipher_ = parquet_cipher;
+ return this;
+ }
+
+    /// Set key retrieval metadata (converted from a string id).
+    /// Use either footer_key_metadata() or footer_key_id(), not both.
+ Builder* footer_key_id(const std::string& key_id);
+
+    /// Set key retrieval metadata.
+    /// Use either footer_key_metadata() or footer_key_id(), not both.
+ Builder* footer_key_metadata(const std::string& footer_key_metadata);
+
+ /// Set the file AAD Prefix.
+ Builder* aad_prefix(const std::string& aad_prefix);
+
+    /// Skip storing the AAD prefix in the file.
+    /// If not called, and if an AAD prefix is set, it will be stored.
+ Builder* disable_aad_prefix_storage();
+
+    /// Set the list of encrypted columns and their properties (keys etc.).
+ /// If not called, all columns will be encrypted with the footer key.
+ /// If called, the file columns not in the list will be left unencrypted.
+ Builder* encrypted_columns(
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
+
+ std::shared_ptr<FileEncryptionProperties> build() {
+ return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
+ parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
+ aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
+ }
+
+ private:
+ ParquetCipher::type parquet_cipher_;
+ bool encrypted_footer_;
+ std::string footer_key_;
+ std::string footer_key_metadata_;
+
+ std::string aad_prefix_;
+ bool store_aad_prefix_in_file_;
+ ColumnPathToEncryptionPropertiesMap encrypted_columns_;
+ };
+ bool encrypted_footer() const { return encrypted_footer_; }
+
+ EncryptionAlgorithm algorithm() const { return algorithm_; }
+
+ std::string footer_key() const { return footer_key_; }
+
+ std::string footer_key_metadata() const { return footer_key_metadata_; }
+
+ std::string file_aad() const { return file_aad_; }
+
+ std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
+ const std::string& column_path);
+
+ bool is_utilized() const { return utilized_; }
+
+  /// A FileEncryptionProperties object can be used for writing one file only.
+  /// Mark FileEncryptionProperties as utilized once it is used to write a file,
+  /// since the encryption keys will be wiped out upon completion of file writing.
+ void set_utilized() { utilized_ = true; }
+
+ /// Upon completion of file writing, the encryption keys
+ /// will be wiped out (array values set to 0).
+ void WipeOutEncryptionKeys();
+
+  /// A FileEncryptionProperties object can be used for writing one file only
+  /// (at the end, the keys are wiped out in memory).
+  /// This method allows cloning identical properties for another file,
+  /// with an option to update the AAD prefix (if new_aad_prefix is empty,
+  /// the AAD prefix will be cloned too).
+ std::shared_ptr<FileEncryptionProperties> DeepClone(std::string new_aad_prefix = "");
+
+ ColumnPathToEncryptionPropertiesMap encrypted_columns() const {
+ return encrypted_columns_;
+ }
+
+ private:
+ EncryptionAlgorithm algorithm_;
+ std::string footer_key_;
+ std::string footer_key_metadata_;
+ bool encrypted_footer_;
+ std::string file_aad_;
+ std::string aad_prefix_;
+ bool utilized_;
+ bool store_aad_prefix_in_file_;
+ ColumnPathToEncryptionPropertiesMap encrypted_columns_;
+
+ FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key,
+ const std::string& footer_key_metadata, bool encrypted_footer,
+ const std::string& aad_prefix, bool store_aad_prefix_in_file,
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
+};
+
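+/// A minimal sketch of building file encryption properties with an encrypted
+/// footer and one separately-keyed column; all ids, paths and keys below are
+/// hypothetical, and footer_key must be 16, 24 or 32 bytes long:
+///
+///   ColumnPathToEncryptionPropertiesMap encrypted_columns;
+///   encrypted_columns["a.b.c"] =
+///       ColumnEncryptionProperties::Builder("a.b.c").key_id("kc1")->build();
+///   FileEncryptionProperties::Builder file_builder(footer_key);
+///   auto encryption_props = file_builder.footer_key_id("kf")
+///                               ->encrypted_columns(encrypted_columns)
+///                               ->build();
+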
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h
new file mode 100644
index 00000000000..e50fb9d0b8a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/properties.h"
+#include "parquet/types.h"
+
+using parquet::ParquetCipher;
+
+namespace parquet {
+namespace encryption {
+
+constexpr int kGcmTagLength = 16;
+constexpr int kNonceLength = 12;
+
+// Module types
+constexpr int8_t kFooter = 0;
+constexpr int8_t kColumnMetaData = 1;
+constexpr int8_t kDataPage = 2;
+constexpr int8_t kDictionaryPage = 3;
+constexpr int8_t kDataPageHeader = 4;
+constexpr int8_t kDictionaryPageHeader = 5;
+constexpr int8_t kColumnIndex = 6;
+constexpr int8_t kOffsetIndex = 7;
+
+/// Performs AES encryption operations with GCM or CTR ciphers.
+class AesEncryptor {
+ public:
+ /// Can serve one key length only. Possible values: 16, 24, 32 bytes.
+ explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata);
+
+ static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesEncryptor*>* all_encryptors);
+
+ ~AesEncryptor();
+
+ /// Size difference between plaintext and ciphertext, for this cipher.
+ int CiphertextSizeDelta();
+
+  /// Encrypts plaintext with the key and aad. The key length is passed only for
+  /// validation; if it differs from the value given in the constructor, an
+  /// exception will be thrown.
+ int Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext);
+
+  /// Encrypts the plaintext footer, in order to compute the footer signature (tag).
+ int SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len,
+ const uint8_t* nonce, uint8_t* encrypted_footer);
+
+ void WipeOut();
+
+ private:
+ // PIMPL Idiom
+ class AesEncryptorImpl;
+ std::unique_ptr<AesEncryptorImpl> impl_;
+};
+
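+// A minimal sketch of the intended call pattern; the plaintext/key/aad
+// buffers are hypothetical, and Make() registers the encryptor in
+// all_encryptors so the whole set can be wiped collectively:
+//
+//   std::vector<AesEncryptor*> all_encryptors;
+//   AesEncryptor* enc = AesEncryptor::Make(ParquetCipher::AES_GCM_V1,
+//                                          /*key_len=*/16, /*metadata=*/false,
+//                                          &all_encryptors);
+//   std::vector<uint8_t> ciphertext(plaintext_len + enc->CiphertextSizeDelta());
+//   int written = enc->Encrypt(plaintext, plaintext_len, key, /*key_len=*/16,
+//                              aad, aad_len, ciphertext.data());
+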
+/// Performs AES decryption operations with GCM or CTR ciphers.
+class AesDecryptor {
+ public:
+ /// Can serve one key length only. Possible values: 16, 24, 32 bytes.
+ explicit AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata);
+
+ static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesDecryptor*>* all_decryptors);
+
+ ~AesDecryptor();
+ void WipeOut();
+
+ /// Size difference between plaintext and ciphertext, for this cipher.
+ int CiphertextSizeDelta();
+
+  /// Decrypts ciphertext with the key and aad. The key length is passed only for
+  /// validation; if it differs from the value given in the constructor, an
+  /// exception will be thrown.
+ int Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext);
+
+ private:
+ // PIMPL Idiom
+ class AesDecryptorImpl;
+ std::unique_ptr<AesDecryptorImpl> impl_;
+};
+
+std::string CreateModuleAad(const std::string& file_aad, int8_t module_type,
+ int16_t row_group_ordinal, int16_t column_ordinal,
+ int16_t page_ordinal);
+
+std::string CreateFooterAad(const std::string& aad_prefix_bytes);
+
+// Updates the last two bytes of a page (or page header) module AAD.
+void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal);
+
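+// A minimal sketch of computing a data-page module AAD; the ordinals are
+// hypothetical, and file_aad comes from the file encryption properties:
+//
+//   std::string aad = CreateModuleAad(file_aad, kDataPage,
+//                                     /*row_group_ordinal=*/0,
+//                                     /*column_ordinal=*/0,
+//                                     /*page_ordinal=*/0);
+//   QuickUpdatePageAad(aad, /*new_page_ordinal=*/1);  // reuse for the next page
+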
+// Wraps OpenSSL RAND_bytes function
+void RandBytes(unsigned char* buf, int num);
+
+} // namespace encryption
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc
new file mode 100644
index 00000000000..7f2edfa1d78
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/exception.h"
+
+namespace parquet {
+namespace encryption {
+
+void ThrowOpenSSLRequiredException() {
+ throw ParquetException(
+ "Calling encryption method in Arrow/Parquet built without OpenSSL");
+}
+
+class AesEncryptor::AesEncryptorImpl {};
+
+AesEncryptor::~AesEncryptor() {}
+
+int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len,
+ const uint8_t* key, int key_len, const uint8_t* aad,
+ int aad_len, const uint8_t* nonce,
+ uint8_t* encrypted_footer) {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+void AesEncryptor::WipeOut() { ThrowOpenSSLRequiredException(); }
+
+int AesEncryptor::CiphertextSizeDelta() {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len,
+ uint8_t* ciphertext) {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata) {
+ ThrowOpenSSLRequiredException();
+}
+
+class AesDecryptor::AesDecryptorImpl {};
+
+int AesDecryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len,
+                          const uint8_t* key, int key_len, const uint8_t* aad,
+                          int aad_len, uint8_t* plaintext) {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+void AesDecryptor::WipeOut() { ThrowOpenSSLRequiredException(); }
+
+AesDecryptor::~AesDecryptor() {}
+
+AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesEncryptor*>* all_encryptors) {
+ return NULLPTR;
+}
+
+AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata) {
+ ThrowOpenSSLRequiredException();
+}
+
+AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesDecryptor*>* all_decryptors) {
+ return NULLPTR;
+}
+
+int AesDecryptor::CiphertextSizeDelta() {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+std::string CreateModuleAad(const std::string& file_aad, int8_t module_type,
+ int16_t row_group_ordinal, int16_t column_ordinal,
+ int16_t page_ordinal) {
+ ThrowOpenSSLRequiredException();
+ return "";
+}
+
+std::string CreateFooterAad(const std::string& aad_prefix_bytes) {
+ ThrowOpenSSLRequiredException();
+ return "";
+}
+
+void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal) {
+ ThrowOpenSSLRequiredException();
+}
+
+void RandBytes(unsigned char* buf, int num) { ThrowOpenSSLRequiredException(); }
+
+} // namespace encryption
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc
new file mode 100644
index 00000000000..6381e4f37f7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc
@@ -0,0 +1,240 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/encryption/encryption.h"
+#include "parquet/encryption/encryption_internal.h"
+
+namespace parquet {
+
+// Decryptor
+Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool)
+ : aes_decryptor_(aes_decryptor),
+ key_(key),
+ file_aad_(file_aad),
+ aad_(aad),
+ pool_(pool) {}
+
+int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); }
+
+int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len,
+ uint8_t* plaintext) {
+ return aes_decryptor_->Decrypt(ciphertext, ciphertext_len, str2bytes(key_),
+ static_cast<int>(key_.size()), str2bytes(aad_),
+ static_cast<int>(aad_.size()), plaintext);
+}
+
+// InternalFileDecryptor
+InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties,
+ const std::string& file_aad,
+ ParquetCipher::type algorithm,
+ const std::string& footer_key_metadata,
+ ::arrow::MemoryPool* pool)
+ : properties_(properties),
+ file_aad_(file_aad),
+ algorithm_(algorithm),
+ footer_key_metadata_(footer_key_metadata),
+ pool_(pool) {
+ if (properties_->is_utilized()) {
+ throw ParquetException(
+ "Re-using decryption properties with explicit keys for another file");
+ }
+ properties_->set_utilized();
+}
+
+void InternalFileDecryptor::WipeOutDecryptionKeys() {
+ properties_->WipeOutDecryptionKeys();
+ for (auto const& i : all_decryptors_) {
+ i->WipeOut();
+ }
+}
+
+std::string InternalFileDecryptor::GetFooterKey() {
+ std::string footer_key = properties_->footer_key();
+ // ignore footer key metadata if footer key is explicitly set via API
+ if (footer_key.empty()) {
+ if (footer_key_metadata_.empty())
+ throw ParquetException("No footer key or key metadata");
+ if (properties_->key_retriever() == nullptr)
+ throw ParquetException("No footer key or key retriever");
+ try {
+ footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
+ } catch (KeyAccessDeniedException& e) {
+ std::stringstream ss;
+ ss << "Footer key: access denied " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ }
+ if (footer_key.empty()) {
+ throw ParquetException(
+ "Footer key unavailable. Could not verify "
+ "plaintext footer metadata");
+ }
+ return footer_key;
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptor() {
+ std::string aad = encryption::CreateFooterAad(file_aad_);
+ return GetFooterDecryptor(aad, true);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptorForColumnMeta(
+ const std::string& aad) {
+ return GetFooterDecryptor(aad, true);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptorForColumnData(
+ const std::string& aad) {
+ return GetFooterDecryptor(aad, false);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptor(
+ const std::string& aad, bool metadata) {
+ if (metadata) {
+ if (footer_metadata_decryptor_ != nullptr) return footer_metadata_decryptor_;
+ } else {
+ if (footer_data_decryptor_ != nullptr) return footer_data_decryptor_;
+ }
+
+ std::string footer_key = properties_->footer_key();
+ if (footer_key.empty()) {
+ if (footer_key_metadata_.empty())
+ throw ParquetException("No footer key or key metadata");
+ if (properties_->key_retriever() == nullptr)
+ throw ParquetException("No footer key or key retriever");
+ try {
+ footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
+ } catch (KeyAccessDeniedException& e) {
+ std::stringstream ss;
+ ss << "Footer key: access denied " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ }
+ if (footer_key.empty()) {
+ throw ParquetException(
+ "Invalid footer encryption key. "
+ "Could not parse footer metadata");
+ }
+
+ // Create both data and metadata decryptors to avoid redundant retrieval of key
+ // from the key_retriever.
+ auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size());
+ auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size());
+
+ footer_metadata_decryptor_ = std::make_shared<Decryptor>(
+ aes_metadata_decryptor, footer_key, file_aad_, aad, pool_);
+ footer_data_decryptor_ =
+ std::make_shared<Decryptor>(aes_data_decryptor, footer_key, file_aad_, aad, pool_);
+
+ if (metadata) return footer_metadata_decryptor_;
+ return footer_data_decryptor_;
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnMetaDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad) {
+ return GetColumnDecryptor(column_path, column_key_metadata, aad, true);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnDataDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad) {
+ return GetColumnDecryptor(column_path, column_key_metadata, aad, false);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad, bool metadata) {
+ std::string column_key;
+  // First check whether a decryptor for this column was already created.
+ if (metadata) {
+ if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
+ auto res(column_metadata_map_.at(column_path));
+ res->UpdateAad(aad);
+ return res;
+ }
+ } else {
+ if (column_data_map_.find(column_path) != column_data_map_.end()) {
+ auto res(column_data_map_.at(column_path));
+ res->UpdateAad(aad);
+ return res;
+ }
+ }
+
+ column_key = properties_->column_key(column_path);
+ // No explicit column key given via API. Retrieve via key metadata.
+ if (column_key.empty() && !column_key_metadata.empty() &&
+ properties_->key_retriever() != nullptr) {
+ try {
+ column_key = properties_->key_retriever()->GetKey(column_key_metadata);
+ } catch (KeyAccessDeniedException& e) {
+ std::stringstream ss;
+ ss << "HiddenColumnException, path=" + column_path + " " << e.what() << "\n";
+ throw HiddenColumnException(ss.str());
+ }
+ }
+ if (column_key.empty()) {
+ throw HiddenColumnException("HiddenColumnException, path=" + column_path);
+ }
+
+ // Create both data and metadata decryptors to avoid redundant retrieval of key
+ // using the key_retriever.
+ auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size());
+ auto aes_data_decryptor = GetDataAesDecryptor(column_key.size());
+
+ column_metadata_map_[column_path] = std::make_shared<Decryptor>(
+ aes_metadata_decryptor, column_key, file_aad_, aad, pool_);
+ column_data_map_[column_path] =
+ std::make_shared<Decryptor>(aes_data_decryptor, column_key, file_aad_, aad, pool_);
+
+ if (metadata) return column_metadata_map_[column_path];
+ return column_data_map_[column_path];
+}
+
+int InternalFileDecryptor::MapKeyLenToDecryptorArrayIndex(int key_len) {
+ if (key_len == 16)
+ return 0;
+ else if (key_len == 24)
+ return 1;
+ else if (key_len == 32)
+ return 2;
+ throw ParquetException("decryption key must be 16, 24 or 32 bytes in length");
+}
+
+encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToDecryptorArrayIndex(key_len);
+ if (meta_decryptor_[index] == nullptr) {
+ meta_decryptor_[index].reset(
+ encryption::AesDecryptor::Make(algorithm_, key_len, true, &all_decryptors_));
+ }
+ return meta_decryptor_[index].get();
+}
+
+encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToDecryptorArrayIndex(key_len);
+ if (data_decryptor_[index] == nullptr) {
+ data_decryptor_[index].reset(
+ encryption::AesDecryptor::Make(algorithm_, key_len, false, &all_decryptors_));
+ }
+ return data_decryptor_[index].get();
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h
new file mode 100644
index 00000000000..011c4acbeb6
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/schema.h"
+
+namespace parquet {
+
+namespace encryption {
+class AesDecryptor;
+class AesEncryptor;
+} // namespace encryption
+
+class FileDecryptionProperties;
+
+class PARQUET_EXPORT Decryptor {
+ public:
+ Decryptor(encryption::AesDecryptor* decryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool);
+
+ const std::string& file_aad() const { return file_aad_; }
+ void UpdateAad(const std::string& aad) { aad_ = aad; }
+ ::arrow::MemoryPool* pool() { return pool_; }
+
+ int CiphertextSizeDelta();
+ int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext);
+
+ private:
+ encryption::AesDecryptor* aes_decryptor_;
+ std::string key_;
+ std::string file_aad_;
+ std::string aad_;
+ ::arrow::MemoryPool* pool_;
+};
+
+class InternalFileDecryptor {
+ public:
+ explicit InternalFileDecryptor(FileDecryptionProperties* properties,
+ const std::string& file_aad,
+ ParquetCipher::type algorithm,
+ const std::string& footer_key_metadata,
+ ::arrow::MemoryPool* pool);
+
+ std::string& file_aad() { return file_aad_; }
+
+ std::string GetFooterKey();
+
+ ParquetCipher::type algorithm() { return algorithm_; }
+
+ std::string& footer_key_metadata() { return footer_key_metadata_; }
+
+ FileDecryptionProperties* properties() { return properties_; }
+
+ void WipeOutDecryptionKeys();
+
+ ::arrow::MemoryPool* pool() { return pool_; }
+
+ std::shared_ptr<Decryptor> GetFooterDecryptor();
+ std::shared_ptr<Decryptor> GetFooterDecryptorForColumnMeta(const std::string& aad = "");
+ std::shared_ptr<Decryptor> GetFooterDecryptorForColumnData(const std::string& aad = "");
+ std::shared_ptr<Decryptor> GetColumnMetaDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad = "");
+ std::shared_ptr<Decryptor> GetColumnDataDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad = "");
+
+ private:
+ FileDecryptionProperties* properties_;
+ // Concatenation of aad_prefix (if exists) and aad_file_unique
+ std::string file_aad_;
+ std::map<std::string, std::shared_ptr<Decryptor>> column_data_map_;
+ std::map<std::string, std::shared_ptr<Decryptor>> column_metadata_map_;
+
+ std::shared_ptr<Decryptor> footer_metadata_decryptor_;
+ std::shared_ptr<Decryptor> footer_data_decryptor_;
+ ParquetCipher::type algorithm_;
+ std::string footer_key_metadata_;
+ std::vector<encryption::AesDecryptor*> all_decryptors_;
+
+  // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three
+  // types of meta_decryptors and data_decryptors.
+ std::unique_ptr<encryption::AesDecryptor> meta_decryptor_[3];
+ std::unique_ptr<encryption::AesDecryptor> data_decryptor_[3];
+
+ ::arrow::MemoryPool* pool_;
+
+ std::shared_ptr<Decryptor> GetFooterDecryptor(const std::string& aad, bool metadata);
+ std::shared_ptr<Decryptor> GetColumnDecryptor(const std::string& column_path,
+ const std::string& column_key_metadata,
+ const std::string& aad,
+ bool metadata = false);
+
+ encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size);
+ encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size);
+
+ int MapKeyLenToDecryptorArrayIndex(int key_len);
+};
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc
new file mode 100644
index 00000000000..15bf52b84dd
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc
@@ -0,0 +1,170 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/encryption/encryption.h"
+#include "parquet/encryption/encryption_internal.h"
+
+namespace parquet {
+
+// Encryptor
+Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool)
+ : aes_encryptor_(aes_encryptor),
+ key_(key),
+ file_aad_(file_aad),
+ aad_(aad),
+ pool_(pool) {}
+
+int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); }
+
+int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) {
+ return aes_encryptor_->Encrypt(plaintext, plaintext_len, str2bytes(key_),
+ static_cast<int>(key_.size()), str2bytes(aad_),
+ static_cast<int>(aad_.size()), ciphertext);
+}
+
+// InternalFileEncryptor
+InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties,
+ ::arrow::MemoryPool* pool)
+ : properties_(properties), pool_(pool) {
+ if (properties_->is_utilized()) {
+ throw ParquetException("Re-using encryption properties for another file");
+ }
+ properties_->set_utilized();
+}
+
+void InternalFileEncryptor::WipeOutEncryptionKeys() {
+ properties_->WipeOutEncryptionKeys();
+
+ for (auto const& i : all_encryptors_) {
+ i->WipeOut();
+ }
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetFooterEncryptor() {
+ if (footer_encryptor_ != nullptr) {
+ return footer_encryptor_;
+ }
+
+ ParquetCipher::type algorithm = properties_->algorithm().algorithm;
+ std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad());
+ std::string footer_key = properties_->footer_key();
+ auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size());
+ footer_encryptor_ = std::make_shared<Encryptor>(
+ aes_encryptor, footer_key, properties_->file_aad(), footer_aad, pool_);
+ return footer_encryptor_;
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetFooterSigningEncryptor() {
+ if (footer_signing_encryptor_ != nullptr) {
+ return footer_signing_encryptor_;
+ }
+
+ ParquetCipher::type algorithm = properties_->algorithm().algorithm;
+ std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad());
+ std::string footer_signing_key = properties_->footer_key();
+ auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size());
+ footer_signing_encryptor_ = std::make_shared<Encryptor>(
+ aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad, pool_);
+ return footer_signing_encryptor_;
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnMetaEncryptor(
+ const std::string& column_path) {
+ return GetColumnEncryptor(column_path, true);
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnDataEncryptor(
+ const std::string& column_path) {
+ return GetColumnEncryptor(column_path, false);
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnEncryptor(
+ const std::string& column_path, bool metadata) {
+  // First check whether an encryptor for this column was already created.
+ if (metadata) {
+ if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
+ return column_metadata_map_.at(column_path);
+ }
+ } else {
+ if (column_data_map_.find(column_path) != column_data_map_.end()) {
+ return column_data_map_.at(column_path);
+ }
+ }
+ auto column_prop = properties_->column_encryption_properties(column_path);
+ if (column_prop == nullptr) {
+ return nullptr;
+ }
+
+ std::string key;
+ if (column_prop->is_encrypted_with_footer_key()) {
+ key = properties_->footer_key();
+ } else {
+ key = column_prop->key();
+ }
+
+ ParquetCipher::type algorithm = properties_->algorithm().algorithm;
+ auto aes_encryptor = metadata ? GetMetaAesEncryptor(algorithm, key.size())
+ : GetDataAesEncryptor(algorithm, key.size());
+
+ std::string file_aad = properties_->file_aad();
+ std::shared_ptr<Encryptor> encryptor =
+ std::make_shared<Encryptor>(aes_encryptor, key, file_aad, "", pool_);
+ if (metadata)
+ column_metadata_map_[column_path] = encryptor;
+ else
+ column_data_map_[column_path] = encryptor;
+
+ return encryptor;
+}
+
+int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) {
+ if (key_len == 16)
+ return 0;
+ else if (key_len == 24)
+ return 1;
+ else if (key_len == 32)
+ return 2;
+ throw ParquetException("encryption key must be 16, 24 or 32 bytes in length");
+}
+
+encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor(
+ ParquetCipher::type algorithm, size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToEncryptorArrayIndex(key_len);
+ if (meta_encryptor_[index] == nullptr) {
+ meta_encryptor_[index].reset(
+ encryption::AesEncryptor::Make(algorithm, key_len, true, &all_encryptors_));
+ }
+ return meta_encryptor_[index].get();
+}
+
+encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor(
+ ParquetCipher::type algorithm, size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToEncryptorArrayIndex(key_len);
+ if (data_encryptor_[index] == nullptr) {
+ data_encryptor_[index].reset(
+ encryption::AesEncryptor::Make(algorithm, key_len, false, &all_encryptors_));
+ }
+ return data_encryptor_[index].get();
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h
new file mode 100644
index 00000000000..3cbe53500c2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/encryption/encryption.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+namespace encryption {
+class AesEncryptor;
+} // namespace encryption
+
+class FileEncryptionProperties;
+class ColumnEncryptionProperties;
+
+class PARQUET_EXPORT Encryptor {
+ public:
+ Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool);
+ const std::string& file_aad() { return file_aad_; }
+ void UpdateAad(const std::string& aad) { aad_ = aad; }
+ ::arrow::MemoryPool* pool() { return pool_; }
+
+ int CiphertextSizeDelta();
+ int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext);
+
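+  // Decision table for the helper below (a descriptive summary of its logic):
+  //   column not encrypted                         -> false
+  //   plaintext footer                             -> true
+  //   encrypted footer + column uses footer key    -> false (footer covers it)
+  //   encrypted footer + column has its own key    -> true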
+ bool EncryptColumnMetaData(
+ bool encrypted_footer,
+ const std::shared_ptr<ColumnEncryptionProperties>& column_encryption_properties) {
+ // if column is not encrypted then do not encrypt the column metadata
+ if (!column_encryption_properties || !column_encryption_properties->is_encrypted())
+ return false;
+ // if plaintext footer then encrypt the column metadata
+ if (!encrypted_footer) return true;
+ // if column is not encrypted with footer key then encrypt the column metadata
+ return !column_encryption_properties->is_encrypted_with_footer_key();
+ }
+
+ private:
+ encryption::AesEncryptor* aes_encryptor_;
+ std::string key_;
+ std::string file_aad_;
+ std::string aad_;
+ ::arrow::MemoryPool* pool_;
+};
+
+class InternalFileEncryptor {
+ public:
+ explicit InternalFileEncryptor(FileEncryptionProperties* properties,
+ ::arrow::MemoryPool* pool);
+
+ std::shared_ptr<Encryptor> GetFooterEncryptor();
+ std::shared_ptr<Encryptor> GetFooterSigningEncryptor();
+ std::shared_ptr<Encryptor> GetColumnMetaEncryptor(const std::string& column_path);
+ std::shared_ptr<Encryptor> GetColumnDataEncryptor(const std::string& column_path);
+ void WipeOutEncryptionKeys();
+
+ private:
+ FileEncryptionProperties* properties_;
+
+ std::map<std::string, std::shared_ptr<Encryptor>> column_data_map_;
+ std::map<std::string, std::shared_ptr<Encryptor>> column_metadata_map_;
+
+ std::shared_ptr<Encryptor> footer_signing_encryptor_;
+ std::shared_ptr<Encryptor> footer_encryptor_;
+
+ std::vector<encryption::AesEncryptor*> all_encryptors_;
+
+ // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three
+ // types of meta_encryptors and data_encryptors.
+ std::unique_ptr<encryption::AesEncryptor> meta_encryptor_[3];
+ std::unique_ptr<encryption::AesEncryptor> data_encryptor_[3];
+
+ ::arrow::MemoryPool* pool_;
+
+ std::shared_ptr<Encryptor> GetColumnEncryptor(const std::string& column_path,
+ bool metadata);
+
+ encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm,
+ size_t key_len);
+ encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm,
+ size_t key_len);
+
+ int MapKeyLenToEncryptorArrayIndex(int key_len);
+};
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc b/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc
new file mode 100644
index 00000000000..c333957dd1d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/exception.h"
+
+namespace parquet {
+
+std::ostream& operator<<(std::ostream& os, const ParquetException& exception) {
+ os << exception.what();
+ return os;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/exception.h b/contrib/libs/apache/arrow/cpp/src/parquet/exception.h
new file mode 100644
index 00000000000..826f5bdc8bf
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/exception.h
@@ -0,0 +1,158 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <exception>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/string_builder.h"
+#include "parquet/platform.h"
+
+// PARQUET-1085
+#if !defined(ARROW_UNUSED)
+#define ARROW_UNUSED(x) UNUSED(x)
+#endif
+
+// Parquet exception to Arrow Status
+
+#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
+#define END_PARQUET_CATCH_EXCEPTIONS \
+ } \
+ catch (const ::parquet::ParquetStatusException& e) { \
+ return e.status(); \
+ } \
+ catch (const ::parquet::ParquetException& e) { \
+ return ::arrow::Status::IOError(e.what()); \
+ }
+
+// clang-format off
+
+#define PARQUET_CATCH_NOT_OK(s) \
+ BEGIN_PARQUET_CATCH_EXCEPTIONS \
+ (s); \
+ END_PARQUET_CATCH_EXCEPTIONS
+
+// clang-format on
+
+#define PARQUET_CATCH_AND_RETURN(s) \
+ BEGIN_PARQUET_CATCH_EXCEPTIONS \
+ return (s); \
+ END_PARQUET_CATCH_EXCEPTIONS
+
+// Arrow Status to Parquet exception
+
+#define PARQUET_IGNORE_NOT_OK(s) \
+ do { \
+ ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
+ ARROW_UNUSED(_s); \
+ } while (0)
+
+#define PARQUET_THROW_NOT_OK(s) \
+ do { \
+ ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
+ if (!_s.ok()) { \
+ throw ::parquet::ParquetStatusException(std::move(_s)); \
+ } \
+ } while (0)
+
+#define PARQUET_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
+ auto status_name = (rexpr); \
+ PARQUET_THROW_NOT_OK(status_name.status()); \
+ lhs = std::move(status_name).ValueOrDie();
+
+#define PARQUET_ASSIGN_OR_THROW(lhs, rexpr) \
+ PARQUET_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+ lhs, rexpr);
+
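+// A minimal usage sketch for the two macros above; the Arrow file API calls
+// are illustrative:
+//
+//   std::shared_ptr<::arrow::io::ReadableFile> file;
+//   PARQUET_ASSIGN_OR_THROW(file, ::arrow::io::ReadableFile::Open("f.parquet"));
+//   PARQUET_THROW_NOT_OK(file->Close());
+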
+namespace parquet {
+
+class ParquetException : public std::exception {
+ public:
+ PARQUET_NORETURN static void EofException(const std::string& msg = "") {
+ static std::string prefix = "Unexpected end of stream";
+ if (msg.empty()) {
+ throw ParquetException(prefix);
+ }
+ throw ParquetException(prefix, ": ", msg);
+ }
+
+ PARQUET_NORETURN static void NYI(const std::string& msg = "") {
+ throw ParquetException("Not yet implemented: ", msg, ".");
+ }
+
+ template <typename... Args>
+ explicit ParquetException(Args&&... args)
+ : msg_(::arrow::util::StringBuilder(std::forward<Args>(args)...)) {}
+
+ explicit ParquetException(std::string msg) : msg_(std::move(msg)) {}
+
+ explicit ParquetException(const char* msg, const std::exception&) : msg_(msg) {}
+
+ ParquetException(const ParquetException&) = default;
+ ParquetException& operator=(const ParquetException&) = default;
+ ParquetException(ParquetException&&) = default;
+ ParquetException& operator=(ParquetException&&) = default;
+
+ const char* what() const noexcept override { return msg_.c_str(); }
+
+ private:
+ std::string msg_;
+};
+
+// Support printing a ParquetException.
+// This is needed for clang-on-MSVC, where operator<< is not defined for
+// std::exception.
+PARQUET_EXPORT
+std::ostream& operator<<(std::ostream& os, const ParquetException& exception);
+
+class ParquetStatusException : public ParquetException {
+ public:
+ explicit ParquetStatusException(::arrow::Status status)
+ : ParquetException(status.ToString()), status_(std::move(status)) {}
+
+ const ::arrow::Status& status() const { return status_; }
+
+ private:
+ ::arrow::Status status_;
+};
+
+// This class exists for the purpose of detecting an invalid or corrupted file.
+class ParquetInvalidOrCorruptedFileException : public ParquetStatusException {
+ public:
+ ParquetInvalidOrCorruptedFileException(const ParquetInvalidOrCorruptedFileException&) =
+ default;
+
+ template <typename Arg,
+ typename std::enable_if<
+ !std::is_base_of<ParquetInvalidOrCorruptedFileException, Arg>::value,
+ int>::type = 0,
+ typename... Args>
+ explicit ParquetInvalidOrCorruptedFileException(Arg arg, Args&&... args)
+ : ParquetStatusException(::arrow::Status::Invalid(std::forward<Arg>(arg),
+ std::forward<Args>(args)...)) {}
+};
+
+template <typename StatusReturnBlock>
+void ThrowNotOk(StatusReturnBlock&& b) {
+ PARQUET_THROW_NOT_OK(b());
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc
new file mode 100644
index 00000000000..4e38901aa0d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc
@@ -0,0 +1,868 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#include "arrow/io/caching.h"
+#include "arrow/io/file.h"
+#include "arrow/io/memory.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/future.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/column_reader.h"
+#include "parquet/column_scanner.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/exception.h"
+#include "parquet/file_writer.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+using arrow::internal::AddWithOverflow;
+
+namespace parquet {
+
+// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
+static constexpr int64_t kDefaultFooterReadSize = 64 * 1024;
+static constexpr uint32_t kFooterSize = 8;
+
+// For PARQUET-816
+static constexpr int64_t kMaxDictHeaderSize = 100;
+
+// ----------------------------------------------------------------------
+// RowGroupReader public API
+
+RowGroupReader::RowGroupReader(std::unique_ptr<Contents> contents)
+ : contents_(std::move(contents)) {}
+
+std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
+ if (i >= metadata()->num_columns()) {
+ std::stringstream ss;
+ ss << "Trying to read column index " << i << " but row group metadata has only "
+ << metadata()->num_columns() << " columns";
+ throw ParquetException(ss.str());
+ }
+ const ColumnDescriptor* descr = metadata()->schema()->Column(i);
+
+ std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i);
+ return ColumnReader::Make(
+ descr, std::move(page_reader),
+ const_cast<ReaderProperties*>(contents_->properties())->memory_pool());
+}
+
+std::shared_ptr<ColumnReader> RowGroupReader::ColumnWithExposeEncoding(
+ int i, ExposedEncoding encoding_to_expose) {
+ std::shared_ptr<ColumnReader> reader = Column(i);
+
+ if (encoding_to_expose == ExposedEncoding::DICTIONARY) {
+ // Check the encoding_stats to see if all data pages are dictionary encoded.
+ std::unique_ptr<ColumnChunkMetaData> col = metadata()->ColumnChunk(i);
+ const std::vector<PageEncodingStats>& encoding_stats = col->encoding_stats();
+ if (encoding_stats.empty()) {
+ // Some parquet files may have empty encoding_stats. In this case we are
+ // not sure whether all data pages are dictionary encoded, so we do not
+ // enable exposing the dictionary.
+ return reader;
+ }
+ // The 1st page should be the dictionary page.
+ if (encoding_stats[0].page_type != PageType::DICTIONARY_PAGE ||
+ (encoding_stats[0].encoding != Encoding::PLAIN &&
+ encoding_stats[0].encoding != Encoding::PLAIN_DICTIONARY)) {
+ return reader;
+ }
+ // The following pages should be dictionary encoded data pages.
+ for (size_t idx = 1; idx < encoding_stats.size(); ++idx) {
+ if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY &&
+ encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) ||
+ (encoding_stats[idx].page_type != PageType::DATA_PAGE &&
+ encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) {
+ return reader;
+ }
+ }
+ } else {
+ // Exposing other encodings is not supported for now.
+ return reader;
+ }
+
+ // Set exposed encoding.
+ reader->SetExposedEncoding(encoding_to_expose);
+ return reader;
+}
+
+std::unique_ptr<PageReader> RowGroupReader::GetColumnPageReader(int i) {
+ if (i >= metadata()->num_columns()) {
+ std::stringstream ss;
+ ss << "Trying to read column index " << i << " but row group metadata has only "
+ << metadata()->num_columns() << " columns";
+ throw ParquetException(ss.str());
+ }
+ return contents_->GetColumnPageReader(i);
+}
+
+// Returns the rowgroup metadata
+const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->metadata(); }
+
+/// Compute the section of the file that should be read for the given
+/// row group and column chunk.
+::arrow::io::ReadRange ComputeColumnChunkRange(FileMetaData* file_metadata,
+ int64_t source_size, int row_group_index,
+ int column_index) {
+ auto row_group_metadata = file_metadata->RowGroup(row_group_index);
+ auto column_metadata = row_group_metadata->ColumnChunk(column_index);
+
+ int64_t col_start = column_metadata->data_page_offset();
+ if (column_metadata->has_dictionary_page() &&
+ column_metadata->dictionary_page_offset() > 0 &&
+ col_start > column_metadata->dictionary_page_offset()) {
+ col_start = column_metadata->dictionary_page_offset();
+ }
+
+ int64_t col_length = column_metadata->total_compressed_size();
+ int64_t col_end;
+ if (AddWithOverflow(col_start, col_length, &col_end) || col_end > source_size) {
+ throw ParquetException("Invalid column metadata (corrupt file?)");
+ }
+
+ // PARQUET-816 workaround for old files created by older parquet-mr
+ const ApplicationVersion& version = file_metadata->writer_version();
+ if (version.VersionLt(ApplicationVersion::PARQUET_816_FIXED_VERSION())) {
+ // The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the
+ // dictionary page header size in total_compressed_size and total_uncompressed_size
+ // (see IMPALA-694). We add padding to compensate.
+ int64_t bytes_remaining = source_size - col_end;
+ int64_t padding = std::min<int64_t>(kMaxDictHeaderSize, bytes_remaining);
+ col_length += padding;
+ }
+
+ return {col_start, col_length};
+}
+
+// RowGroupReader::Contents implementation for the Parquet file specification
+class SerializedRowGroup : public RowGroupReader::Contents {
+ public:
+ SerializedRowGroup(std::shared_ptr<ArrowInputFile> source,
+ std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source,
+ int64_t source_size, FileMetaData* file_metadata,
+ int row_group_number, const ReaderProperties& props,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
+ : source_(std::move(source)),
+ cached_source_(std::move(cached_source)),
+ source_size_(source_size),
+ file_metadata_(file_metadata),
+ properties_(props),
+ row_group_ordinal_(row_group_number),
+ file_decryptor_(file_decryptor) {
+ row_group_metadata_ = file_metadata->RowGroup(row_group_number);
+ }
+
+ const RowGroupMetaData* metadata() const override { return row_group_metadata_.get(); }
+
+ const ReaderProperties* properties() const override { return &properties_; }
+
+ std::unique_ptr<PageReader> GetColumnPageReader(int i) override {
+ // Read column chunk from the file
+ auto col = row_group_metadata_->ColumnChunk(i);
+
+ ::arrow::io::ReadRange col_range =
+ ComputeColumnChunkRange(file_metadata_, source_size_, row_group_ordinal_, i);
+ std::shared_ptr<ArrowInputStream> stream;
+ if (cached_source_) {
+ // PARQUET-1698: if read coalescing is enabled, read from pre-buffered
+ // segments.
+ PARQUET_ASSIGN_OR_THROW(auto buffer, cached_source_->Read(col_range));
+ stream = std::make_shared<::arrow::io::BufferReader>(buffer);
+ } else {
+ stream = properties_.GetStream(source_, col_range.offset, col_range.length);
+ }
+
+ std::unique_ptr<ColumnCryptoMetaData> crypto_metadata = col->crypto_metadata();
+
+ // Column is encrypted only if crypto_metadata exists.
+ if (!crypto_metadata) {
+ return PageReader::Open(stream, col->num_values(), col->compression(),
+ properties_.memory_pool());
+ }
+
+ if (file_decryptor_ == nullptr) {
+ throw ParquetException("RowGroup is noted as encrypted but no file decryptor");
+ }
+
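+ // Column ordinals are passed to the CryptoContext as int16_t, hence this limit.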
+ constexpr auto kEncryptedOrdinalLimit = 32767;
+ if (i > kEncryptedOrdinalLimit) {
+ throw ParquetException("Encrypted files cannot contain more than 32767 columns");
+ }
+
+ // The column is encrypted
+ std::shared_ptr<Decryptor> meta_decryptor;
+ std::shared_ptr<Decryptor> data_decryptor;
+ // The column is encrypted with footer key
+ if (crypto_metadata->encrypted_with_footer_key()) {
+ meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta();
+ data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData();
+ CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
+ static_cast<int16_t>(i), meta_decryptor, data_decryptor);
+ return PageReader::Open(stream, col->num_values(), col->compression(),
+ properties_.memory_pool(), &ctx);
+ }
+
+ // The column is encrypted with its own key
+ std::string column_key_metadata = crypto_metadata->key_metadata();
+ const std::string column_path = crypto_metadata->path_in_schema()->ToDotString();
+
+ meta_decryptor =
+ file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata);
+ data_decryptor =
+ file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata);
+
+ CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
+ static_cast<int16_t>(i), meta_decryptor, data_decryptor);
+ return PageReader::Open(stream, col->num_values(), col->compression(),
+ properties_.memory_pool(), &ctx);
+ }
+
+ private:
+ std::shared_ptr<ArrowInputFile> source_;
+ // Will be nullptr if PreBuffer() is not called.
+ std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_;
+ int64_t source_size_;
+ FileMetaData* file_metadata_;
+ std::unique_ptr<RowGroupMetaData> row_group_metadata_;
+ ReaderProperties properties_;
+ int row_group_ordinal_;
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+};
+
+// ----------------------------------------------------------------------
+// SerializedFile: An implementation of ParquetFileReader::Contents that deals
+// with the Parquet file structure, Thrift deserialization, and other internal
+// matters
+
+// This class takes ownership of the provided data source
+class SerializedFile : public ParquetFileReader::Contents {
+ public:
+ SerializedFile(std::shared_ptr<ArrowInputFile> source,
+ const ReaderProperties& props = default_reader_properties())
+ : source_(std::move(source)), properties_(props) {
+ PARQUET_ASSIGN_OR_THROW(source_size_, source_->GetSize());
+ }
+
+ ~SerializedFile() override {
+ try {
+ Close();
+ } catch (...) {
+ }
+ }
+
+ void Close() override {
+ if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys();
+ }
+
+ std::shared_ptr<RowGroupReader> GetRowGroup(int i) override {
+ std::unique_ptr<SerializedRowGroup> contents(
+ new SerializedRowGroup(source_, cached_source_, source_size_,
+ file_metadata_.get(), i, properties_, file_decryptor_));
+ return std::make_shared<RowGroupReader>(std::move(contents));
+ }
+
+ std::shared_ptr<FileMetaData> metadata() const override { return file_metadata_; }
+
+ void set_metadata(std::shared_ptr<FileMetaData> metadata) {
+ file_metadata_ = std::move(metadata);
+ }
+
+ void PreBuffer(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ const ::arrow::io::IOContext& ctx,
+ const ::arrow::io::CacheOptions& options) {
+ cached_source_ =
+ std::make_shared<::arrow::io::internal::ReadRangeCache>(source_, ctx, options);
+ std::vector<::arrow::io::ReadRange> ranges;
+ for (int row : row_groups) {
+ for (int col : column_indices) {
+ ranges.push_back(
+ ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col));
+ }
+ }
+ PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
+ }
+
+ ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices) const {
+ if (!cached_source_) {
+ return ::arrow::Status::Invalid("Must call PreBuffer before WhenBuffered");
+ }
+ std::vector<::arrow::io::ReadRange> ranges;
+ for (int row : row_groups) {
+ for (int col : column_indices) {
+ ranges.push_back(
+ ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col));
+ }
+ }
+ return cached_source_->WaitFor(ranges);
+ }
+
+ // Metadata/footer parsing. Split into separate sync/async paths, using
+ // exceptions for error handling (the async path converts them to Future/Status).
+
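+ // A Parquet file ends with: [serialized FileMetaData][4-byte little-endian
+ // metadata length][4-byte magic, "PAR1" or "PARE" for encrypted footers].
+ // We read up to the last 64 KB, locate the length word, and either slice
+ // the metadata out of that buffer or issue a second read.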
+ void ParseMetaData() {
+ int64_t footer_read_size = GetFooterReadSize();
+ PARQUET_ASSIGN_OR_THROW(
+ auto footer_buffer,
+ source_->ReadAt(source_size_ - footer_read_size, footer_read_size));
+ uint32_t metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
+ int64_t metadata_start = source_size_ - kFooterSize - metadata_len;
+
+ std::shared_ptr<::arrow::Buffer> metadata_buffer;
+ if (footer_read_size >= (metadata_len + kFooterSize)) {
+ metadata_buffer = SliceBuffer(
+ footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len);
+ } else {
+ PARQUET_ASSIGN_OR_THROW(metadata_buffer,
+ source_->ReadAt(metadata_start, metadata_len));
+ }
+
+ // Parse the footer depending on encryption type
+ const bool is_encrypted_footer =
+ memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0;
+ if (is_encrypted_footer) {
+ // Encrypted file with Encrypted footer.
+ const std::pair<int64_t, uint32_t> read_size =
+ ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len);
+ // Read the actual footer
+ metadata_start = read_size.first;
+ metadata_len = read_size.second;
+ PARQUET_ASSIGN_OR_THROW(metadata_buffer,
+ source_->ReadAt(metadata_start, metadata_len));
+ // Fall through
+ }
+
+ const uint32_t read_metadata_len =
+ ParseUnencryptedFileMetadata(metadata_buffer, metadata_len);
+ auto file_decryption_properties = properties_.file_decryption_properties().get();
+ if (is_encrypted_footer) {
+ // Nothing else to do here.
+ return;
+ } else if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file.
+ if (file_decryption_properties != nullptr) {
+ if (!file_decryption_properties->plaintext_files_allowed()) {
+ throw ParquetException("Applying decryption properties on plaintext file");
+ }
+ }
+ } else {
+ // Encrypted file with plaintext footer mode.
+ ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
+ }
+ }
+
+ // Validate the source size and get the initial read size.
+ int64_t GetFooterReadSize() {
+ if (source_size_ == 0) {
+ throw ParquetInvalidOrCorruptedFileException("Parquet file size is 0 bytes");
+ } else if (source_size_ < kFooterSize) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet file size is ", source_size_,
+ " bytes, smaller than the minimum file footer (", kFooterSize, " bytes)");
+ }
+ return std::min(source_size_, kDefaultFooterReadSize);
+ }
+
+ // Validate the magic bytes and get the length of the full footer.
+ uint32_t ParseFooterLength(const std::shared_ptr<::arrow::Buffer>& footer_buffer,
+ const int64_t footer_read_size) {
+ // Check that the whole footer was read and that it ends with the magic bytes.
+ if (footer_buffer->size() != footer_read_size ||
+ (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 &&
+ memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet magic bytes not found in footer. Either the file is corrupted or this "
+ "is not a parquet file.");
+ }
+ // Both encrypted/unencrypted footers have the same footer length check.
+ uint32_t metadata_len = ::arrow::util::SafeLoadAs<uint32_t>(
+ reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size -
+ kFooterSize);
+ if (metadata_len > source_size_ - kFooterSize) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet file size is ", source_size_,
+ " bytes, smaller than the size reported by footer's (", metadata_len, "bytes)");
+ }
+ return metadata_len;
+ }
+
+ // Does not throw.
+ ::arrow::Future<> ParseMetaDataAsync() {
+ int64_t footer_read_size;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ footer_read_size = GetFooterReadSize();
+ END_PARQUET_CATCH_EXCEPTIONS
+ // Assumes this SerializedFile is kept alive externally while the future runs
+ return source_->ReadAsync(source_size_ - footer_read_size, footer_read_size)
+ .Then([=](const std::shared_ptr<::arrow::Buffer>& footer_buffer)
+ -> ::arrow::Future<> {
+ uint32_t metadata_len;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
+ END_PARQUET_CATCH_EXCEPTIONS
+ int64_t metadata_start = source_size_ - kFooterSize - metadata_len;
+
+ std::shared_ptr<::arrow::Buffer> metadata_buffer;
+ if (footer_read_size >= (metadata_len + kFooterSize)) {
+ metadata_buffer =
+ SliceBuffer(footer_buffer, footer_read_size - metadata_len - kFooterSize,
+ metadata_len);
+ return ParseMaybeEncryptedMetaDataAsync(footer_buffer,
+ std::move(metadata_buffer),
+ footer_read_size, metadata_len);
+ }
+ return source_->ReadAsync(metadata_start, metadata_len)
+ .Then([=](const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
+ return ParseMaybeEncryptedMetaDataAsync(footer_buffer, metadata_buffer,
+ footer_read_size, metadata_len);
+ });
+ });
+ }
+
+ // Continuation
+ ::arrow::Future<> ParseMaybeEncryptedMetaDataAsync(
+ std::shared_ptr<::arrow::Buffer> footer_buffer,
+ std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t footer_read_size,
+ uint32_t metadata_len) {
+ // Parse the footer depending on encryption type
+ const bool is_encrypted_footer =
+ memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0;
+ if (is_encrypted_footer) {
+ // Encrypted file with Encrypted footer.
+ std::pair<int64_t, uint32_t> read_size;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ read_size =
+ ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len);
+ END_PARQUET_CATCH_EXCEPTIONS
+ // Read the actual footer
+ int64_t metadata_start = read_size.first;
+ metadata_len = read_size.second;
+ return source_->ReadAsync(metadata_start, metadata_len)
+ .Then([=](const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
+ // Continue and read the file footer
+ return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer);
+ });
+ }
+ return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len,
+ is_encrypted_footer);
+ }
+
+ // Continuation
+ ::arrow::Status ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer,
+ uint32_t metadata_len,
+ const bool is_encrypted_footer) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ const uint32_t read_metadata_len =
+ ParseUnencryptedFileMetadata(metadata_buffer, metadata_len);
+ auto file_decryption_properties = properties_.file_decryption_properties().get();
+ if (is_encrypted_footer) {
+ // Nothing else to do here.
+ return ::arrow::Status::OK();
+ } else if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file.
+ if (file_decryption_properties != nullptr) {
+ if (!file_decryption_properties->plaintext_files_allowed()) {
+ throw ParquetException("Applying decryption properties on plaintext file");
+ }
+ }
+ } else {
+ // Encrypted file with plaintext footer mode.
+ ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
+ }
+ END_PARQUET_CATCH_EXCEPTIONS
+ return ::arrow::Status::OK();
+ }
+
+ private:
+ std::shared_ptr<ArrowInputFile> source_;
+ std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_;
+ int64_t source_size_;
+ std::shared_ptr<FileMetaData> file_metadata_;
+ ReaderProperties properties_;
+
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+
+ // \return The true length of the metadata in bytes
+ uint32_t ParseUnencryptedFileMetadata(const std::shared_ptr<Buffer>& footer_buffer,
+ const uint32_t metadata_len);
+
+ std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties,
+ EncryptionAlgorithm& algo);
+
+ void ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ FileDecryptionProperties* file_decryption_properties,
+ const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len,
+ uint32_t read_metadata_len);
+
+ // \return The position and size of the actual footer
+ std::pair<int64_t, uint32_t> ParseMetaDataOfEncryptedFileWithEncryptedFooter(
+ const std::shared_ptr<Buffer>& crypto_metadata_buffer, uint32_t footer_len);
+};
+
+uint32_t SerializedFile::ParseUnencryptedFileMetadata(
+ const std::shared_ptr<Buffer>& metadata_buffer, const uint32_t metadata_len) {
+ if (metadata_buffer->size() != metadata_len) {
+ throw ParquetException("Failed reading metadata buffer (requested " +
+ std::to_string(metadata_len) + " bytes but got " +
+ std::to_string(metadata_buffer->size()) + " bytes)");
+ }
+ uint32_t read_metadata_len = metadata_len;
+ // The encrypted read path falls through to here, so pass in the decryptor
+ file_metadata_ =
+ FileMetaData::Make(metadata_buffer->data(), &read_metadata_len, file_decryptor_);
+ return read_metadata_len;
+}
+
+std::pair<int64_t, uint32_t>
+SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter(
+ const std::shared_ptr<::arrow::Buffer>& crypto_metadata_buffer,
+ // both metadata & crypto metadata length
+ const uint32_t footer_len) {
+ // encryption with encrypted footer
+ // Check if the footer_buffer contains the entire metadata
+ if (crypto_metadata_buffer->size() != footer_len) {
+ throw ParquetException("Failed reading encrypted metadata buffer (requested " +
+ std::to_string(footer_len) + " bytes but got " +
+ std::to_string(crypto_metadata_buffer->size()) + " bytes)");
+ }
+ auto file_decryption_properties = properties_.file_decryption_properties().get();
+ if (file_decryption_properties == nullptr) {
+ throw ParquetException(
+ "Could not read encrypted metadata, no decryption found in reader's properties");
+ }
+ uint32_t crypto_metadata_len = footer_len;
+ std::shared_ptr<FileCryptoMetaData> file_crypto_metadata =
+ FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len);
+ // Handle AAD prefix
+ EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm();
+ std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
+ file_decryptor_ = std::make_shared<InternalFileDecryptor>(
+ file_decryption_properties, file_aad, algo.algorithm,
+ file_crypto_metadata->key_metadata(), properties_.memory_pool());
+
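+ // Tail layout of an encrypted-footer file: [FileCryptoMetaData][encrypted
+ // FileMetaData][4-byte combined length][4-byte "PARE" magic].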
+ int64_t metadata_offset = source_size_ - kFooterSize - footer_len + crypto_metadata_len;
+ uint32_t metadata_len = footer_len - crypto_metadata_len;
+ return std::make_pair(metadata_offset, metadata_len);
+}
+
+void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ FileDecryptionProperties* file_decryption_properties,
+ const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len,
+ uint32_t read_metadata_len) {
+ // Providing decryption properties in plaintext footer mode is not mandatory,
+ // e.g. when the file is read by a legacy reader.
+ if (file_decryption_properties != nullptr) {
+ EncryptionAlgorithm algo = file_metadata_->encryption_algorithm();
+ // Handle AAD prefix
+ std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
+ file_decryptor_ = std::make_shared<InternalFileDecryptor>(
+ file_decryption_properties, file_aad, algo.algorithm,
+ file_metadata_->footer_signing_key_metadata(), properties_.memory_pool());
+ // set the InternalFileDecryptor in the metadata as well, as it's used
+ // for signature verification and for ColumnChunkMetaData creation.
+ file_metadata_->set_file_decryptor(file_decryptor_);
+
+ if (file_decryption_properties->check_plaintext_footer_integrity()) {
+ if (metadata_len - read_metadata_len !=
+ (parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Failed reading metadata for encryption signature (requested ",
+ parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength,
+ " bytes but have ", metadata_len - read_metadata_len, " bytes)");
+ }
+
+ if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet crypto signature verification failed");
+ }
+ }
+ }
+}
+
+std::string SerializedFile::HandleAadPrefix(
+ FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo) {
+ std::string aad_prefix_in_properties = file_decryption_properties->aad_prefix();
+ std::string aad_prefix = aad_prefix_in_properties;
+ bool file_has_aad_prefix = !algo.aad.aad_prefix.empty();
+ std::string aad_prefix_in_file = algo.aad.aad_prefix;
+
+ if (algo.aad.supply_aad_prefix && aad_prefix_in_properties.empty()) {
+ throw ParquetException(
+ "AAD prefix used for file encryption, "
+ "but not stored in file and not supplied "
+ "in decryption properties");
+ }
+
+ if (file_has_aad_prefix) {
+ if (!aad_prefix_in_properties.empty()) {
+ if (aad_prefix_in_properties != aad_prefix_in_file) {
+ throw ParquetException(
+ "AAD Prefix in file and in properties "
+ "is not the same");
+ }
+ }
+ aad_prefix = aad_prefix_in_file;
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier =
+ file_decryption_properties->aad_prefix_verifier();
+ if (aad_prefix_verifier != nullptr) aad_prefix_verifier->Verify(aad_prefix);
+ } else {
+ if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properties.empty()) {
+ throw ParquetException(
+ "AAD Prefix set in decryption properties, but was not used "
+ "for file encryption");
+ }
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier =
+ file_decryption_properties->aad_prefix_verifier();
+ if (aad_prefix_verifier != nullptr) {
+ throw ParquetException(
+ "AAD Prefix Verifier is set, but AAD Prefix not found in file");
+ }
+ }
+ return aad_prefix + algo.aad.aad_file_unique;
+}
+
+// ----------------------------------------------------------------------
+// ParquetFileReader public API
+
+ParquetFileReader::ParquetFileReader() {}
+
+ParquetFileReader::~ParquetFileReader() {
+ try {
+ Close();
+ } catch (...) {
+ }
+}
+
+// Open the file. If no metadata is passed, it is parsed from the footer of
+// the file
+std::unique_ptr<ParquetFileReader::Contents> ParquetFileReader::Contents::Open(
+ std::shared_ptr<ArrowInputFile> source, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ std::unique_ptr<ParquetFileReader::Contents> result(
+ new SerializedFile(std::move(source), props));
+
+ // Access the private SerializedFile API, which is otherwise unavailable
+ SerializedFile* file = static_cast<SerializedFile*>(result.get());
+
+ if (metadata == nullptr) {
+ // Validates magic bytes, parses metadata, and initializes the SchemaDescriptor
+ file->ParseMetaData();
+ } else {
+ file->set_metadata(std::move(metadata));
+ }
+
+ return result;
+}
+
+::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>
+ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
+ const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ std::unique_ptr<ParquetFileReader::Contents> result(
+ new SerializedFile(std::move(source), props));
+ SerializedFile* file = static_cast<SerializedFile*>(result.get());
+ if (metadata == nullptr) {
+ // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
+ struct {
+ ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
+ return std::move(result);
+ }
+
+ std::unique_ptr<ParquetFileReader::Contents> result;
+ } continuation;
+ continuation.result = std::move(result);
+ return file->ParseMetaDataAsync().Then(std::move(continuation));
+ } else {
+ file->set_metadata(std::move(metadata));
+ return ::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
+ std::move(result));
+ }
+ END_PARQUET_CATCH_EXCEPTIONS
+}
+
+std::unique_ptr<ParquetFileReader> ParquetFileReader::Open(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ auto contents = SerializedFile::Open(std::move(source), props, std::move(metadata));
+ std::unique_ptr<ParquetFileReader> result(new ParquetFileReader());
+ result->Open(std::move(contents));
+ return result;
+}
+
+std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile(
+ const std::string& path, bool memory_map, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ std::shared_ptr<::arrow::io::RandomAccessFile> source;
+ if (memory_map) {
+ PARQUET_ASSIGN_OR_THROW(
+ source, ::arrow::io::MemoryMappedFile::Open(path, ::arrow::io::FileMode::READ));
+ } else {
+ PARQUET_ASSIGN_OR_THROW(source,
+ ::arrow::io::ReadableFile::Open(path, props.memory_pool()));
+ }
+
+ return Open(std::move(source), props, std::move(metadata));
+}
+
+::arrow::Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ auto fut = SerializedFile::OpenAsync(std::move(source), props, std::move(metadata));
+ // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
+ auto completed = ::arrow::Future<std::unique_ptr<ParquetFileReader>>::Make();
+ fut.AddCallback([fut, completed](
+ const ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>>&
+ contents) mutable {
+ if (!contents.ok()) {
+ completed.MarkFinished(contents.status());
+ return;
+ }
+ std::unique_ptr<ParquetFileReader> result(new ParquetFileReader());
+ result->Open(fut.MoveResult().MoveValueUnsafe());
+ completed.MarkFinished(std::move(result));
+ });
+ return completed;
+ END_PARQUET_CATCH_EXCEPTIONS
+}
+
+void ParquetFileReader::Open(std::unique_ptr<ParquetFileReader::Contents> contents) {
+ contents_ = std::move(contents);
+}
+
+void ParquetFileReader::Close() {
+ if (contents_) {
+ contents_->Close();
+ }
+}
+
+std::shared_ptr<FileMetaData> ParquetFileReader::metadata() const {
+ return contents_->metadata();
+}
+
+std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
+ if (i >= metadata()->num_row_groups()) {
+ std::stringstream ss;
+ ss << "Trying to read row group " << i << " but file only has "
+ << metadata()->num_row_groups() << " row groups";
+ throw ParquetException(ss.str());
+ }
+ return contents_->GetRowGroup(i);
+}
+
+void ParquetFileReader::PreBuffer(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ const ::arrow::io::IOContext& ctx,
+ const ::arrow::io::CacheOptions& options) {
+ // Access private methods here
+ SerializedFile* file =
+ ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
+ file->PreBuffer(row_groups, column_indices, ctx, options);
+}
+
+::arrow::Future<> ParquetFileReader::WhenBuffered(
+ const std::vector<int>& row_groups, const std::vector<int>& column_indices) const {
+ // Access private methods here
+ SerializedFile* file =
+ ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
+ return file->WhenBuffered(row_groups, column_indices);
+}
+
+// ----------------------------------------------------------------------
+// File metadata helpers
+
+std::shared_ptr<FileMetaData> ReadMetaData(
+ const std::shared_ptr<::arrow::io::RandomAccessFile>& source) {
+ return ParquetFileReader::Open(source)->metadata();
+}
+
+// ----------------------------------------------------------------------
+// File scanner for performance testing
+
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+ ParquetFileReader* reader) {
+ std::vector<int16_t> rep_levels(column_batch_size);
+ std::vector<int16_t> def_levels(column_batch_size);
+
+ int num_columns = static_cast<int>(columns.size());
+
+ // Columns were not specified explicitly; scan all of them.
+ if (columns.empty()) {
+ num_columns = reader->metadata()->num_columns();
+ columns.resize(num_columns);
+ for (int i = 0; i < num_columns; i++) {
+ columns[i] = i;
+ }
+ }
+
+ std::vector<int64_t> total_rows(num_columns, 0);
+
+ for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
+ auto group_reader = reader->RowGroup(r);
+ int col = 0;
+ for (auto i : columns) {
+ std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
+ size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
+ std::vector<uint8_t> values(column_batch_size * value_byte_size);
+
+ int64_t values_read = 0;
+ while (col_reader->HasNext()) {
+ int64_t levels_read =
+ ScanAllValues(column_batch_size, def_levels.data(), rep_levels.data(),
+ values.data(), &values_read, col_reader.get());
+ if (col_reader->descr()->max_repetition_level() > 0) {
+ for (int64_t i = 0; i < levels_read; i++) {
+ if (rep_levels[i] == 0) {
+ total_rows[col]++;
+ }
+ }
+ } else {
+ total_rows[col] += levels_read;
+ }
+ }
+ col++;
+ }
+ }
+
+ for (int i = 1; i < num_columns; ++i) {
+ if (total_rows[0] != total_rows[i]) {
+ throw ParquetException("Parquet error: Total rows among columns do not match");
+ }
+ }
+
+ return total_rows[0];
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h
new file mode 100644
index 00000000000..0fc84054939
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/io/caching.h"
+#include "arrow/util/type_fwd.h"
+#include "parquet/metadata.h" // IWYU pragma: keep
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace parquet {
+
+class ColumnReader;
+class FileMetaData;
+class PageReader;
+class RowGroupMetaData;
+
+class PARQUET_EXPORT RowGroupReader {
+ public:
+ // Forward declare a virtual class 'Contents' to aid dependency injection and more
+ // easily create test fixtures
+ // An implementation of the Contents class is defined in the .cc file
+ struct Contents {
+ virtual ~Contents() {}
+ virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
+ virtual const RowGroupMetaData* metadata() const = 0;
+ virtual const ReaderProperties* properties() const = 0;
+ };
+
+ explicit RowGroupReader(std::unique_ptr<Contents> contents);
+
+ // Returns the rowgroup metadata
+ const RowGroupMetaData* metadata() const;
+
+ // Construct a ColumnReader for the indicated row group-relative
+ // column. Ownership is shared with the RowGroupReader.
+ std::shared_ptr<ColumnReader> Column(int i);
+
+ // Construct a ColumnReader, trying to enable exposed encoding.
+ //
+ // For dictionary encoding, currently we only support column chunks that are fully
+ // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
+ // If a column chunk uses dictionary encoding but then falls back to plain encoding, the
+ // encoding will not be exposed.
+ //
+ // The returned column reader provides an API GetExposedEncoding() for the
+ // users to check the exposed encoding and determine how to read the batches.
+ //
+ // \note API EXPERIMENTAL
+ std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
+ int i, ExposedEncoding encoding_to_expose);
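+ //
+ // A usage sketch (illustrative; row_group is assumed to be a RowGroupReader
+ // obtained from a ParquetFileReader):
+ //
+ //   auto reader = row_group->ColumnWithExposeEncoding(
+ //       0, ExposedEncoding::DICTIONARY);
+ //   if (reader->GetExposedEncoding() == ExposedEncoding::DICTIONARY) {
+ //     // Read batches as dictionary indices plus the dictionary itself.
+ //   } else {
+ //     // Fall back to the regular decoding path.
+ //   }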
+
+ std::unique_ptr<PageReader> GetColumnPageReader(int i);
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+};
+
+class PARQUET_EXPORT ParquetFileReader {
+ public:
+ // Declare a virtual class 'Contents' to aid dependency injection and more
+ // easily create test fixtures
+ // An implementation of the Contents class is defined in the .cc file
+ struct PARQUET_EXPORT Contents {
+ static std::unique_ptr<Contents> Open(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ virtual ~Contents() = default;
+ // Perform any cleanup associated with the file contents
+ virtual void Close() = 0;
+ virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
+ virtual std::shared_ptr<FileMetaData> metadata() const = 0;
+ };
+
+ ParquetFileReader();
+ ~ParquetFileReader();
+
+ // Create a file reader instance from an Arrow file object. Thread-safety is
+ // the responsibility of the file implementation
+ static std::unique_ptr<ParquetFileReader> Open(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ // Convenience API to open a serialized Parquet file on disk, using Arrow IO
+ // interfaces.
+ static std::unique_ptr<ParquetFileReader> OpenFile(
+ const std::string& path, bool memory_map = true,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ // Asynchronously open a file reader from an Arrow file object.
+ // Does not throw - all errors are reported through the Future.
+ static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ void Open(std::unique_ptr<Contents> contents);
+ void Close();
+
+ // The RowGroupReader is owned by the FileReader
+ std::shared_ptr<RowGroupReader> RowGroup(int i);
+
+ // Returns the file metadata. Only one instance is ever created
+ std::shared_ptr<FileMetaData> metadata() const;
+
+ /// Pre-buffer the specified column indices in all row groups.
+ ///
+ /// Readers can optionally call this to cache the necessary slices
+ /// of the file in-memory before deserialization. Arrow readers can
+ /// automatically do this via an option. This is intended to
+ /// increase performance when reading from high-latency filesystems
+ /// (e.g. Amazon S3).
+ ///
+ /// After calling this, creating readers for row groups/column
+ /// indices that were not buffered may fail. Creating multiple
+ /// readers for a subset of the buffered regions is
+ /// acceptable. This may be called again to buffer a different set
+ /// of row groups/columns.
+ ///
+ /// If memory usage is a concern, note that data will remain
+ /// buffered in memory until either \a PreBuffer() is called again,
+ /// or the reader itself is destructed. It may be useful to read, and
+ /// buffer, only one row group at a time.
+ ///
+ /// This method may throw.
+ void PreBuffer(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ const ::arrow::io::IOContext& ctx,
+ const ::arrow::io::CacheOptions& options);
+
+ /// Wait for the specified row groups and column indices to be pre-buffered.
+ ///
+ /// After the returned Future completes, reading the specified row
+ /// groups/columns will not block.
+ ///
+ /// PreBuffer must be called first. This method does not throw.
+ ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices) const;
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+};
+
+// Read only Parquet file metadata
+std::shared_ptr<FileMetaData> PARQUET_EXPORT
+ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
+
+/// \brief Scan all values in file. Useful for performance testing
+/// \param[in] columns the column numbers to scan. If empty, scans all columns
+/// \param[in] column_batch_size number of values to read at a time when scanning column
+/// \param[in] reader a ParquetFileReader instance
+/// \return number of semantic rows in file
+PARQUET_EXPORT
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+ ParquetFileReader* reader);
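+//
+// A usage sketch (the path and the batch size of 128 are illustrative):
+//
+//   auto reader = ParquetFileReader::OpenFile("example.parquet");
+//   int64_t num_rows = ScanFileContents({}, 128, reader.get());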
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc
new file mode 100644
index 00000000000..deac9586e5a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc
@@ -0,0 +1,547 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/file_writer.h"
+
+#include <cstddef>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "parquet/column_writer.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+using arrow::MemoryPool;
+
+using parquet::schema::GroupNode;
+
+namespace parquet {
+
+// ----------------------------------------------------------------------
+// RowGroupWriter public API
+
+RowGroupWriter::RowGroupWriter(std::unique_ptr<Contents> contents)
+ : contents_(std::move(contents)) {}
+
+void RowGroupWriter::Close() {
+ if (contents_) {
+ contents_->Close();
+ }
+}
+
+ColumnWriter* RowGroupWriter::NextColumn() { return contents_->NextColumn(); }
+
+ColumnWriter* RowGroupWriter::column(int i) { return contents_->column(i); }
+
+int64_t RowGroupWriter::total_compressed_bytes() const {
+ return contents_->total_compressed_bytes();
+}
+
+int64_t RowGroupWriter::total_bytes_written() const {
+ return contents_->total_bytes_written();
+}
+
+int RowGroupWriter::current_column() { return contents_->current_column(); }
+
+int RowGroupWriter::num_columns() const { return contents_->num_columns(); }
+
+int64_t RowGroupWriter::num_rows() const { return contents_->num_rows(); }
+
+inline void ThrowRowsMisMatchError(int col, int64_t curr_rows, int64_t prev_rows) {
+ std::stringstream ss;
+ ss << "Column " << col << " had " << curr_rows << " rows while previous column had "
+ << prev_rows << " rows";
+ throw ParquetException(ss.str());
+}
+
+// ----------------------------------------------------------------------
+// RowGroupSerializer
+
+// RowGroupWriter::Contents implementation for the Parquet file specification
+class RowGroupSerializer : public RowGroupWriter::Contents {
+ public:
+ RowGroupSerializer(std::shared_ptr<ArrowOutputStream> sink,
+ RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal,
+ const WriterProperties* properties, bool buffered_row_group = false,
+ InternalFileEncryptor* file_encryptor = nullptr)
+ : sink_(std::move(sink)),
+ metadata_(metadata),
+ properties_(properties),
+ total_bytes_written_(0),
+ closed_(false),
+ row_group_ordinal_(row_group_ordinal),
+ next_column_index_(0),
+ num_rows_(0),
+ buffered_row_group_(buffered_row_group),
+ file_encryptor_(file_encryptor) {
+ if (buffered_row_group) {
+ InitColumns();
+ } else {
+ column_writers_.push_back(nullptr);
+ }
+ }
+
+ int num_columns() const override { return metadata_->num_columns(); }
+
+ int64_t num_rows() const override {
+ CheckRowsWritten();
+ // CheckRowsWritten ensures num_rows_ is set correctly
+ return num_rows_;
+ }
+
+ ColumnWriter* NextColumn() override {
+ if (buffered_row_group_) {
+ throw ParquetException(
+ "NextColumn() is not supported when a RowGroup is written by size");
+ }
+
+ if (column_writers_[0]) {
+ CheckRowsWritten();
+ }
+
+ // Throws an error if we try to write more column chunks than the schema has
+ auto col_meta = metadata_->NextColumnChunk();
+
+ if (column_writers_[0]) {
+ total_bytes_written_ += column_writers_[0]->Close();
+ }
+
+ ++next_column_index_;
+
+ const auto& path = col_meta->descr()->path();
+ auto meta_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+ : nullptr;
+ auto data_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+ : nullptr;
+ std::unique_ptr<PageWriter> pager = PageWriter::Open(
+ sink_, properties_->compression(path), properties_->compression_level(path),
+ col_meta, row_group_ordinal_, static_cast<int16_t>(next_column_index_ - 1),
+ properties_->memory_pool(), false, meta_encryptor, data_encryptor);
+ column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_);
+ return column_writers_[0].get();
+ }
+
+ ColumnWriter* column(int i) override {
+ if (!buffered_row_group_) {
+ throw ParquetException(
+ "column() is only supported when a BufferedRowGroup is being written");
+ }
+
+ if (i >= 0 && i < static_cast<int>(column_writers_.size())) {
+ return column_writers_[i].get();
+ }
+ return nullptr;
+ }
+
+ int current_column() const override { return metadata_->current_column(); }
+
+ int64_t total_compressed_bytes() const override {
+ int64_t total_compressed_bytes = 0;
+ for (size_t i = 0; i < column_writers_.size(); i++) {
+ if (column_writers_[i]) {
+ total_compressed_bytes += column_writers_[i]->total_compressed_bytes();
+ }
+ }
+ return total_compressed_bytes;
+ }
+
+ int64_t total_bytes_written() const override {
+ int64_t total_bytes_written = 0;
+ for (size_t i = 0; i < column_writers_.size(); i++) {
+ if (column_writers_[i]) {
+ total_bytes_written += column_writers_[i]->total_bytes_written();
+ }
+ }
+ return total_bytes_written;
+ }
+
+ void Close() override {
+ if (!closed_) {
+ closed_ = true;
+ CheckRowsWritten();
+
+ for (size_t i = 0; i < column_writers_.size(); i++) {
+ if (column_writers_[i]) {
+ total_bytes_written_ += column_writers_[i]->Close();
+ column_writers_[i].reset();
+ }
+ }
+
+ column_writers_.clear();
+
+ // Ensures all columns have been written
+ metadata_->set_num_rows(num_rows_);
+ metadata_->Finish(total_bytes_written_, row_group_ordinal_);
+ }
+ }
+
+ private:
+ std::shared_ptr<ArrowOutputStream> sink_;
+ mutable RowGroupMetaDataBuilder* metadata_;
+ const WriterProperties* properties_;
+ int64_t total_bytes_written_;
+ bool closed_;
+ int16_t row_group_ordinal_;
+ int next_column_index_;
+ mutable int64_t num_rows_;
+ bool buffered_row_group_;
+ InternalFileEncryptor* file_encryptor_;
+
+ void CheckRowsWritten() const {
+ // Verify when only one column is written at a time
+ if (!buffered_row_group_ && column_writers_.size() > 0 && column_writers_[0]) {
+ int64_t current_col_rows = column_writers_[0]->rows_written();
+ if (num_rows_ == 0) {
+ num_rows_ = current_col_rows;
+ } else if (num_rows_ != current_col_rows) {
+ ThrowRowsMisMatchError(next_column_index_, current_col_rows, num_rows_);
+ }
+ } else if (buffered_row_group_ &&
+ column_writers_.size() > 0) { // when buffered_row_group = true
+ int64_t current_col_rows = column_writers_[0]->rows_written();
+ for (int i = 1; i < static_cast<int>(column_writers_.size()); i++) {
+ int64_t current_col_rows_i = column_writers_[i]->rows_written();
+ if (current_col_rows != current_col_rows_i) {
+ ThrowRowsMisMatchError(i, current_col_rows_i, current_col_rows);
+ }
+ }
+ num_rows_ = current_col_rows;
+ }
+ }
+
+ void InitColumns() {
+ for (int i = 0; i < num_columns(); i++) {
+ auto col_meta = metadata_->NextColumnChunk();
+ const auto& path = col_meta->descr()->path();
+ auto meta_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+ : nullptr;
+ auto data_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+ : nullptr;
+ std::unique_ptr<PageWriter> pager = PageWriter::Open(
+ sink_, properties_->compression(path), properties_->compression_level(path),
+ col_meta, static_cast<int16_t>(row_group_ordinal_),
+ static_cast<int16_t>(next_column_index_++), properties_->memory_pool(),
+ buffered_row_group_, meta_encryptor, data_encryptor);
+ column_writers_.push_back(
+ ColumnWriter::Make(col_meta, std::move(pager), properties_));
+ }
+ }
+
+ std::vector<std::shared_ptr<ColumnWriter>> column_writers_;
+};
+
+// ----------------------------------------------------------------------
+// FileSerializer
+
+// An implementation of ParquetFileWriter::Contents that deals with the Parquet
+// file structure, Thrift serialization, and other internal matters
+
+class FileSerializer : public ParquetFileWriter::Contents {
+ public:
+ static std::unique_ptr<ParquetFileWriter::Contents> Open(
+ std::shared_ptr<ArrowOutputStream> sink, std::shared_ptr<GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
+ std::unique_ptr<ParquetFileWriter::Contents> result(
+ new FileSerializer(std::move(sink), std::move(schema), std::move(properties),
+ std::move(key_value_metadata)));
+
+ return result;
+ }
+
+ void Close() override {
+ if (is_open_) {
+ // Set is_open_ to false first: if any function below raises an exception,
+ // Close() will not be called again (which could cause a segfault)
+ is_open_ = false;
+ if (row_group_writer_) {
+ num_rows_ += row_group_writer_->num_rows();
+ row_group_writer_->Close();
+ }
+ row_group_writer_.reset();
+
+ // Write magic bytes and metadata
+ auto file_encryption_properties = properties_->file_encryption_properties();
+
+ if (file_encryption_properties == nullptr) { // Non-encrypted file.
+ file_metadata_ = metadata_->Finish();
+ WriteFileMetaData(*file_metadata_, sink_.get());
+ } else { // Encrypted file
+ CloseEncryptedFile(file_encryption_properties);
+ }
+ }
+ }
+
+ int num_columns() const override { return schema_.num_columns(); }
+
+ int num_row_groups() const override { return num_row_groups_; }
+
+ int64_t num_rows() const override { return num_rows_; }
+
+ const std::shared_ptr<WriterProperties>& properties() const override {
+ return properties_;
+ }
+
+ RowGroupWriter* AppendRowGroup(bool buffered_row_group) {
+ if (row_group_writer_) {
+ row_group_writer_->Close();
+ }
+ num_row_groups_++;
+ auto rg_metadata = metadata_->AppendRowGroup();
+ std::unique_ptr<RowGroupWriter::Contents> contents(new RowGroupSerializer(
+ sink_, rg_metadata, static_cast<int16_t>(num_row_groups_ - 1), properties_.get(),
+ buffered_row_group, file_encryptor_.get()));
+ row_group_writer_.reset(new RowGroupWriter(std::move(contents)));
+ return row_group_writer_.get();
+ }
+
+ RowGroupWriter* AppendRowGroup() override { return AppendRowGroup(false); }
+
+ RowGroupWriter* AppendBufferedRowGroup() override { return AppendRowGroup(true); }
+
+ ~FileSerializer() override {
+ try {
+ Close();
+ } catch (...) {
+ }
+ }
+
+ private:
+ FileSerializer(std::shared_ptr<ArrowOutputStream> sink,
+ std::shared_ptr<GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : ParquetFileWriter::Contents(std::move(schema), std::move(key_value_metadata)),
+ sink_(std::move(sink)),
+ is_open_(true),
+ properties_(std::move(properties)),
+ num_row_groups_(0),
+ num_rows_(0),
+ metadata_(FileMetaDataBuilder::Make(&schema_, properties_, key_value_metadata_)) {
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
+ if (position == 0) {
+ StartFile();
+ } else {
+ throw ParquetException("Appending to file not implemented.");
+ }
+ }
+
+ void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) {
+ // Encrypted file with encrypted footer
+ if (file_encryption_properties->encrypted_footer()) {
+ // encrypted footer
+ file_metadata_ = metadata_->Finish();
+
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
+ uint64_t metadata_start = static_cast<uint64_t>(position);
+ auto crypto_metadata = metadata_->GetCryptoMetaData();
+ WriteFileCryptoMetaData(*crypto_metadata, sink_.get());
+
+ auto footer_encryptor = file_encryptor_->GetFooterEncryptor();
+ WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_encryptor, true);
+ PARQUET_ASSIGN_OR_THROW(position, sink_->Tell());
+ uint32_t footer_and_crypto_len = static_cast<uint32_t>(position - metadata_start);
+ PARQUET_THROW_NOT_OK(
+ sink_->Write(reinterpret_cast<uint8_t*>(&footer_and_crypto_len), 4));
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
+ } else { // Encrypted file with plaintext footer
+ file_metadata_ = metadata_->Finish();
+ auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor();
+ WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_signing_encryptor,
+ false);
+ }
+ if (file_encryptor_) {
+ file_encryptor_->WipeOutEncryptionKeys();
+ }
+ }
+
+ std::shared_ptr<ArrowOutputStream> sink_;
+ bool is_open_;
+ const std::shared_ptr<WriterProperties> properties_;
+ int num_row_groups_;
+ int64_t num_rows_;
+ std::unique_ptr<FileMetaDataBuilder> metadata_;
+ // Only one of the row group writers is active at a time
+ std::unique_ptr<RowGroupWriter> row_group_writer_;
+
+ std::unique_ptr<InternalFileEncryptor> file_encryptor_;
+
+ void StartFile() {
+ auto file_encryption_properties = properties_->file_encryption_properties();
+ if (file_encryption_properties == nullptr) {
+ // Unencrypted parquet files always start with PAR1
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
+ } else {
+ // Check that all columns in columnEncryptionProperties exist in the schema.
+ auto encrypted_columns = file_encryption_properties->encrypted_columns();
+ // If columnEncryptionProperties is empty, every column in the file schema
+ // will be encrypted with the footer key.
+ if (!encrypted_columns.empty()) {
+ std::vector<std::string> column_path_vec;
+ // First, save all column paths in schema.
+ for (int i = 0; i < num_columns(); i++) {
+ column_path_vec.push_back(schema_.Column(i)->path()->ToDotString());
+ }
+ // Check if column exists in schema.
+ for (const auto& elem : encrypted_columns) {
+ auto it = std::find(column_path_vec.begin(), column_path_vec.end(), elem.first);
+ if (it == column_path_vec.end()) {
+ std::stringstream ss;
+ ss << "Encrypted column " + elem.first + " not in file schema";
+ throw ParquetException(ss.str());
+ }
+ }
+ }
+
+ file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties,
+ properties_->memory_pool()));
+ if (file_encryption_properties->encrypted_footer()) {
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
+ } else {
+ // Encrypted file with plaintext footer mode.
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
+ }
+ }
+ }
+};
+
+// ----------------------------------------------------------------------
+// ParquetFileWriter public API
+
+ParquetFileWriter::ParquetFileWriter() {}
+
+ParquetFileWriter::~ParquetFileWriter() {
+ try {
+ Close();
+ } catch (...) {
+ }
+}
+
+std::unique_ptr<ParquetFileWriter> ParquetFileWriter::Open(
+ std::shared_ptr<::arrow::io::OutputStream> sink, std::shared_ptr<GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
+ auto contents =
+ FileSerializer::Open(std::move(sink), std::move(schema), std::move(properties),
+ std::move(key_value_metadata));
+ std::unique_ptr<ParquetFileWriter> result(new ParquetFileWriter());
+ result->Open(std::move(contents));
+ return result;
+}
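+
+// A minimal end-to-end usage sketch (out_stream and the single-column schema
+// below are illustrative):
+//
+//   schema::NodeVector fields;
+//   fields.push_back(schema::PrimitiveNode::Make(
+//       "x", Repetition::REQUIRED, Type::INT64));
+//   auto schema = std::static_pointer_cast<schema::GroupNode>(
+//       schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+//   auto writer = ParquetFileWriter::Open(out_stream, schema);
+//   RowGroupWriter* rg = writer->AppendRowGroup();
+//   // ... write each column chunk via rg->NextColumn() ...
+//   writer->Close();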
+
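+// The footer written below is: [serialized FileMetaData][4-byte little-endian
+// metadata length][4-byte "PAR1" magic].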
+void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
+ // Write MetaData
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell());
+ uint32_t metadata_len = static_cast<uint32_t>(position);
+
+ file_metadata.WriteTo(sink);
+ PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
+ metadata_len = static_cast<uint32_t>(position) - metadata_len;
+
+ // Write Footer
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
+ PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+}
+
+void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
+ PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+ return WriteFileMetaData(file_metadata, sink);
+}
+
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+ ArrowOutputStream* sink,
+ const std::shared_ptr<Encryptor>& encryptor,
+ bool encrypt_footer) {
+ if (encrypt_footer) { // Encrypted file with encrypted footer
+ // encrypt and write to sink
+ file_metadata.WriteTo(sink, encryptor);
+ } else { // Encrypted file with plaintext footer mode.
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell());
+ uint32_t metadata_len = static_cast<uint32_t>(position);
+ file_metadata.WriteTo(sink, encryptor);
+ PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
+ metadata_len = static_cast<uint32_t>(position) - metadata_len;
+
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
+ PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+ }
+}
+
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+ ArrowOutputStream* sink) {
+ crypto_metadata.WriteTo(sink);
+}
+
+const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); }
+
+const ColumnDescriptor* ParquetFileWriter::descr(int i) const {
+ return contents_->schema()->Column(i);
+}
+
+int ParquetFileWriter::num_columns() const { return contents_->num_columns(); }
+
+int64_t ParquetFileWriter::num_rows() const { return contents_->num_rows(); }
+
+int ParquetFileWriter::num_row_groups() const { return contents_->num_row_groups(); }
+
+const std::shared_ptr<const KeyValueMetadata>& ParquetFileWriter::key_value_metadata()
+ const {
+ return contents_->key_value_metadata();
+}
+
+const std::shared_ptr<FileMetaData> ParquetFileWriter::metadata() const {
+ return file_metadata_;
+}
+
+void ParquetFileWriter::Open(std::unique_ptr<ParquetFileWriter::Contents> contents) {
+ contents_ = std::move(contents);
+}
+
+void ParquetFileWriter::Close() {
+ if (contents_) {
+ contents_->Close();
+ file_metadata_ = contents_->metadata();
+ contents_.reset();
+ }
+}
+
+RowGroupWriter* ParquetFileWriter::AppendRowGroup() {
+ return contents_->AppendRowGroup();
+}
+
+RowGroupWriter* ParquetFileWriter::AppendBufferedRowGroup() {
+ return contents_->AppendBufferedRowGroup();
+}
+
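+// The num_rows argument is ignored; this overload is kept for API compatibility.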
+RowGroupWriter* ParquetFileWriter::AppendRowGroup(int64_t num_rows) {
+ return AppendRowGroup();
+}
+
+const std::shared_ptr<WriterProperties>& ParquetFileWriter::properties() const {
+ return contents_->properties();
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h
new file mode 100644
index 00000000000..4cfc24719a3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h
@@ -0,0 +1,234 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+class ColumnWriter;
+
+// FIXME: copied from reader-internal.cc
+static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
+static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
+
+class PARQUET_EXPORT RowGroupWriter {
+ public:
+  // The virtual 'Contents' interface aids dependency injection and makes it
+  // easier to create test fixtures.
+  // An implementation of the Contents class is defined in the .cc file.
+ struct Contents {
+ virtual ~Contents() = default;
+ virtual int num_columns() const = 0;
+ virtual int64_t num_rows() const = 0;
+
+ // to be used only with ParquetFileWriter::AppendRowGroup
+ virtual ColumnWriter* NextColumn() = 0;
+ // to be used only with ParquetFileWriter::AppendBufferedRowGroup
+ virtual ColumnWriter* column(int i) = 0;
+
+ virtual int current_column() const = 0;
+ virtual void Close() = 0;
+
+ // total bytes written by the page writer
+ virtual int64_t total_bytes_written() const = 0;
+ // total bytes still compressed but not written
+ virtual int64_t total_compressed_bytes() const = 0;
+ };
+
+ explicit RowGroupWriter(std::unique_ptr<Contents> contents);
+
+ /// Construct a ColumnWriter for the indicated row group-relative column.
+ ///
+ /// To be used only with ParquetFileWriter::AppendRowGroup
+ /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
+ /// valid until the next call to NextColumn or Close. As the contents are
+ /// directly written to the sink, once a new column is started, the contents
+ /// of the previous one cannot be modified anymore.
+ ColumnWriter* NextColumn();
+ /// Index of currently written column. Equal to -1 if NextColumn()
+ /// has not been called yet.
+ int current_column();
+ void Close();
+
+ int num_columns() const;
+
+ /// Construct a ColumnWriter for the indicated row group column.
+ ///
+ /// To be used only with ParquetFileWriter::AppendBufferedRowGroup
+ /// Ownership is solely within the RowGroupWriter. The ColumnWriter is
+  /// valid until Close. The contents are buffered in memory and written to
+  /// the sink on Close.
+ ColumnWriter* column(int i);
+
+  /// Number of rows that shall be written as part of this RowGroup.
+ int64_t num_rows() const;
+
+ int64_t total_bytes_written() const;
+ int64_t total_compressed_bytes() const;
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+};
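+
+// A minimal usage sketch (illustrative only): `writer` is assumed to be an
+// already-opened ParquetFileWriter, and the first column is assumed to be a
+// non-nested INT64 column (Int64Writer comes from parquet/column_writer.h):
+//
+//   parquet::RowGroupWriter* rg_writer = writer->AppendRowGroup();
+//   auto* int_writer =
+//       static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
+//   int64_t value = 42;
+//   int_writer->WriteBatch(/*num_values=*/1, nullptr, nullptr, &value);
+//   rg_writer->Close();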
+
+PARQUET_EXPORT
+void WriteFileMetaData(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+PARQUET_EXPORT
+void WriteMetaDataFile(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+PARQUET_EXPORT
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+ ArrowOutputStream* sink,
+ const std::shared_ptr<Encryptor>& encryptor,
+ bool encrypt_footer);
+
+PARQUET_EXPORT
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
+ bool encrypt_footer = false);
+
+PARQUET_EXPORT
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+ ::arrow::io::OutputStream* sink);
+
+class PARQUET_EXPORT ParquetFileWriter {
+ public:
+  // The virtual 'Contents' interface aids dependency injection and makes it
+  // easier to create test fixtures.
+  // An implementation of the Contents class is defined in the .cc file.
+ struct Contents {
+ Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
+ schema_.Init(std::move(schema));
+ }
+ virtual ~Contents() {}
+ // Perform any cleanup associated with the file contents
+ virtual void Close() = 0;
+
+ /// \note Deprecated since 1.3.0
+ RowGroupWriter* AppendRowGroup(int64_t num_rows);
+
+ virtual RowGroupWriter* AppendRowGroup() = 0;
+ virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
+
+ virtual int64_t num_rows() const = 0;
+ virtual int num_columns() const = 0;
+ virtual int num_row_groups() const = 0;
+
+ virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
+
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
+ return key_value_metadata_;
+ }
+
+ // Return const-pointer to make it clear that this object is not to be copied
+ const SchemaDescriptor* schema() const { return &schema_; }
+
+ SchemaDescriptor schema_;
+
+ /// This should be the only place this is stored. Everything else is a const reference
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
+
+ const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
+ std::shared_ptr<FileMetaData> file_metadata_;
+ };
+
+ ParquetFileWriter();
+ ~ParquetFileWriter();
+
+ static std::unique_ptr<ParquetFileWriter> Open(
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<schema::GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties = default_writer_properties(),
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
+
+ void Open(std::unique_ptr<Contents> contents);
+ void Close();
+
+ // Construct a RowGroupWriter for the indicated number of rows.
+ //
+ // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+ // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+ // @param num_rows The number of rows that are stored in the new RowGroup
+ //
+ // \deprecated Since 1.3.0
+ RowGroupWriter* AppendRowGroup(int64_t num_rows);
+
+ /// Construct a RowGroupWriter with an arbitrary number of rows.
+ ///
+ /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+ /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+ RowGroupWriter* AppendRowGroup();
+
+ /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
+ /// Use this if you want to write a RowGroup based on a certain size
+ ///
+ /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+ /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+ RowGroupWriter* AppendBufferedRowGroup();
+
+ /// Number of columns.
+ ///
+ /// This number is fixed during the lifetime of the writer as it is determined via
+ /// the schema.
+ int num_columns() const;
+
+  /// Number of rows in all RowGroups started so far.
+  ///
+  /// Changes on the addition of a new RowGroup.
+ int64_t num_rows() const;
+
+ /// Number of started RowGroups.
+ int num_row_groups() const;
+
+ /// Configuration passed to the writer, e.g. the used Parquet format version.
+ const std::shared_ptr<WriterProperties>& properties() const;
+
+ /// Returns the file schema descriptor
+ const SchemaDescriptor* schema() const;
+
+ /// Returns a column descriptor in schema
+ const ColumnDescriptor* descr(int i) const;
+
+ /// Returns the file custom metadata
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
+
+ /// Returns the file metadata, only available after calling Close().
+ const std::shared_ptr<FileMetaData> metadata() const;
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+ std::shared_ptr<FileMetaData> file_metadata_;
+};
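+
+// A minimal end-to-end sketch (illustrative only; assumes a pre-built
+// schema::GroupNode `schema` and the usual exception-based error handling):
+//
+//   std::shared_ptr<::arrow::io::FileOutputStream> out_file;
+//   PARQUET_ASSIGN_OR_THROW(
+//       out_file, ::arrow::io::FileOutputStream::Open("example.parquet"));
+//   auto writer = parquet::ParquetFileWriter::Open(out_file, schema);
+//   parquet::RowGroupWriter* rg_writer = writer->AppendRowGroup();
+//   // ... write each column via rg_writer->NextColumn() ...
+//   writer->Close();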
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h b/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h
new file mode 100644
index 00000000000..d699356a6c4
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include "parquet/types.h"
+
+namespace parquet {
+// Abstract class for hash
+class Hasher {
+ public:
+ /// Compute hash for 32 bits value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int32_t value) const = 0;
+
+ /// Compute hash for 64 bits value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int64_t value) const = 0;
+
+ /// Compute hash for float value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(float value) const = 0;
+
+ /// Compute hash for double value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(double value) const = 0;
+
+ /// Compute hash for Int96 value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const Int96* value) const = 0;
+
+ /// Compute hash for ByteArray value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const ByteArray* value) const = 0;
+
+ /// Compute hash for fixed byte array value by using its plain encoding result.
+ ///
+ /// @param value the value address.
+ /// @param len the value length.
+ virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
+
+ virtual ~Hasher() = default;
+};
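+
+// A toy sketch of the contract (illustrative only; a real implementation
+// must hash the value's plain-encoding bytes, e.g. with a 64-bit hash such
+// as xxHash; this identity version only shows the override shape):
+//
+//   class IdentityHasher : public Hasher {
+//    public:
+//     uint64_t Hash(int32_t value) const override {
+//       return static_cast<uint64_t>(value);  // placeholder, not plain-encoded
+//     }
+//     // ... remaining overloads elided ...
+//   };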
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc
new file mode 100644
index 00000000000..30614ae61fb
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/level_comparison.h"
+
+#define PARQUET_IMPL_NAMESPACE standard
+#include "parquet/level_comparison_inc.h"
+#undef PARQUET_IMPL_NAMESPACE
+
+#include <vector>
+
+#include "arrow/util/dispatch.h"
+
+namespace parquet {
+namespace internal {
+
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+// Defined in level_comparison_avx2.cc for dynamic dispatch.
+MinMax FindMinMaxAvx2(const int16_t* levels, int64_t num_levels);
+uint64_t GreaterThanBitmapAvx2(const int16_t* levels, int64_t num_levels, int16_t rhs);
+#endif
+
+namespace {
+
+using ::arrow::internal::DispatchLevel;
+using ::arrow::internal::DynamicDispatch;
+
+struct GreaterThanDynamicFunction {
+ using FunctionType = decltype(&GreaterThanBitmap);
+
+ static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+ return {
+ { DispatchLevel::NONE, standard::GreaterThanBitmapImpl }
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 }
+#endif
+ };
+ }
+};
+
+struct MinMaxDynamicFunction {
+ using FunctionType = decltype(&FindMinMax);
+
+ static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+ return {
+ { DispatchLevel::NONE, standard::FindMinMaxImpl }
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ , { DispatchLevel::AVX2, FindMinMaxAvx2 }
+#endif
+ };
+ }
+};
+
+} // namespace
+
+uint64_t GreaterThanBitmap(const int16_t* levels, int64_t num_levels, int16_t rhs) {
+ static DynamicDispatch<GreaterThanDynamicFunction> dispatch;
+ return dispatch.func(levels, num_levels, rhs);
+}
+
+MinMax FindMinMax(const int16_t* levels, int64_t num_levels) {
+ static DynamicDispatch<MinMaxDynamicFunction> dispatch;
+ return dispatch.func(levels, num_levels);
+}
+
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h
new file mode 100644
index 00000000000..38e7ef8e2ec
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h
@@ -0,0 +1,40 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+
+#include "parquet/platform.h"
+
+namespace parquet {
+namespace internal {
+
+/// Builds a bitmap where each set bit indicates the corresponding level is greater
+/// than rhs.
+uint64_t PARQUET_EXPORT GreaterThanBitmap(const int16_t* levels, int64_t num_levels,
+ int16_t rhs);
+
+struct MinMax {
+ int16_t min;
+ int16_t max;
+};
+
+MinMax FindMinMax(const int16_t* levels, int64_t num_levels);
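+
+// Worked example (illustrative): for levels = {0, 1, 2} and rhs = 0,
+// GreaterThanBitmap returns 0b110 (bits 1 and 2 set, least significant bit
+// first) and FindMinMax returns {min = 0, max = 2}.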
+
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h
new file mode 100644
index 00000000000..e21c3e5824d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "parquet/level_comparison.h"
+
+// Used to make sure the ODR (one definition rule) isn't violated.
+#ifndef PARQUET_IMPL_NAMESPACE
+#error "PARQUET_IMPL_NAMESPACE must be defined"
+#endif
+namespace parquet {
+namespace internal {
+namespace PARQUET_IMPL_NAMESPACE {
+/// Builds a bitmap by applying predicate to the level vector provided.
+///
+/// \param[in] levels Rep or def level array.
+/// \param[in] num_levels The number of levels to process (must be in [0, 64]).
+/// \param[in] predicate The predicate to apply (must have the signature `bool
+/// predicate(int16_t)`).
+/// \returns The bitmap using least significant "bit" ordering.
+///
+template <typename Predicate>
+inline uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels,
+ Predicate predicate) {
+ // Both clang and GCC can vectorize this automatically with SSE4/AVX2.
+ uint64_t mask = 0;
+ for (int x = 0; x < num_levels; x++) {
+ mask |= static_cast<uint64_t>(predicate(levels[x]) ? 1 : 0) << x;
+ }
+ return ::arrow::BitUtil::ToLittleEndian(mask);
+}
+
+inline MinMax FindMinMaxImpl(const int16_t* levels, int64_t num_levels) {
+ MinMax out{std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::min()};
+ for (int x = 0; x < num_levels; x++) {
+ out.min = std::min(levels[x], out.min);
+ out.max = std::max(levels[x], out.max);
+ }
+ return out;
+}
+
+inline uint64_t GreaterThanBitmapImpl(const int16_t* levels, int64_t num_levels,
+ int16_t rhs) {
+ return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; });
+}
+
+} // namespace PARQUET_IMPL_NAMESPACE
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc
new file mode 100644
index 00000000000..ffdca476ddd
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "parquet/level_conversion.h"
+
+#include <algorithm>
+#include <limits>
+
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "parquet/exception.h"
+
+#include "parquet/level_comparison.h"
+#define PARQUET_IMPL_NAMESPACE standard
+#include "parquet/level_conversion_inc.h"
+#undef PARQUET_IMPL_NAMESPACE
+
+namespace parquet {
+namespace internal {
+namespace {
+
+using ::arrow::internal::CpuInfo;
+using ::arrow::util::optional;
+
+template <typename OffsetType>
+void DefRepLevelsToListInfo(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output, OffsetType* offsets) {
+ OffsetType* orig_pos = offsets;
+ optional<::arrow::internal::FirstTimeBitmapWriter> valid_bits_writer;
+ if (output->valid_bits) {
+ valid_bits_writer.emplace(output->valid_bits, output->valid_bits_offset,
+ output->values_read_upper_bound);
+ }
+ for (int x = 0; x < num_def_levels; x++) {
+ // Skip items that belong to empty or null ancestor lists and further nested lists.
+ if (def_levels[x] < level_info.repeated_ancestor_def_level ||
+ rep_levels[x] > level_info.rep_level) {
+ continue;
+ }
+
+ if (rep_levels[x] == level_info.rep_level) {
+ // A continuation of an existing list.
+ // offsets can be null for structs with repeated children (we don't need to know
+ // offsets until we get to the children).
+ if (offsets != nullptr) {
+ if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
+ throw ParquetException("List index overflow.");
+ }
+ *offsets += 1;
+ }
+ } else {
+ if (ARROW_PREDICT_FALSE(
+ (valid_bits_writer.has_value() &&
+ valid_bits_writer->position() >= output->values_read_upper_bound) ||
+ (offsets - orig_pos) >= output->values_read_upper_bound)) {
+ std::stringstream ss;
+ ss << "Definition levels exceeded upper bound: "
+ << output->values_read_upper_bound;
+ throw ParquetException(ss.str());
+ }
+
+ // current_rep < list rep_level i.e. start of a list (ancestor empty lists are
+ // filtered out above).
+ // offsets can be null for structs with repeated children (we don't need to know
+ // offsets until we get to the children).
+ if (offsets != nullptr) {
+ ++offsets;
+        // Use cumulative offsets because variable size lists are more common than
+        // fixed size lists, so it should be cheaper to make these cumulative and
+        // subtract when validating fixed size lists.
+ *offsets = *(offsets - 1);
+ if (def_levels[x] >= level_info.def_level) {
+ if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
+ throw ParquetException("List index overflow.");
+ }
+ *offsets += 1;
+ }
+ }
+
+ if (valid_bits_writer.has_value()) {
+        // The level_info def level for lists reflects the element-present level;
+        // the level just below it marks a present but empty list.
+ if (def_levels[x] >= level_info.def_level - 1) {
+ valid_bits_writer->Set();
+ } else {
+ output->null_count++;
+ valid_bits_writer->Clear();
+ }
+ valid_bits_writer->Next();
+ }
+ }
+ }
+ if (valid_bits_writer.has_value()) {
+ valid_bits_writer->Finish();
+ }
+ if (offsets != nullptr) {
+ output->values_read = offsets - orig_pos;
+ } else if (valid_bits_writer.has_value()) {
+ output->values_read = valid_bits_writer->position();
+ }
+ if (output->null_count > 0 && level_info.null_slot_usage > 1) {
+ throw ParquetException(
+        "Null values with null_slot_usage > 1 not supported "
+        "(i.e. FixedSizeLists with null values are not supported)");
+ }
+}
+
+} // namespace
+
+#if defined(ARROW_HAVE_RUNTIME_BMI2)
+// defined in level_conversion_bmi2.cc for dynamic dispatch.
+void DefLevelsToBitmapBmi2WithRepeatedParent(const int16_t* def_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output);
+#endif
+
+void DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
+ LevelInfo level_info, ValidityBitmapInputOutput* output) {
+ // It is simpler to rely on rep_level here until PARQUET-1899 is done and the code
+ // is deleted in a follow-up release.
+ if (level_info.rep_level > 0) {
+#if defined(ARROW_HAVE_RUNTIME_BMI2)
+ if (CpuInfo::GetInstance()->HasEfficientBmi2()) {
+ return DefLevelsToBitmapBmi2WithRepeatedParent(def_levels, num_def_levels,
+ level_info, output);
+ }
+#endif
+ standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/true>(
+ def_levels, num_def_levels, level_info, output);
+ } else {
+ standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/false>(
+ def_levels, num_def_levels, level_info, output);
+ }
+}
+
+uint64_t TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
+ return standard::ExtractBitsSoftware(bitmap, select_bitmap);
+}
+
+void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output, int32_t* offsets) {
+ DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
+ output, offsets);
+}
+
+void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output, int64_t* offsets) {
+ DefRepLevelsToListInfo<int64_t>(def_levels, rep_levels, num_def_levels, level_info,
+ output, offsets);
+}
+
+void DefRepLevelsToBitmap(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output) {
+  // DefRepLevelsToListInfo assumes the levels are for the actual list, and this
+  // method is for parent structs, so we need to bump the def and rep levels.
+ level_info.rep_level += 1;
+ level_info.def_level += 1;
+ DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
+ output, /*offsets=*/nullptr);
+}
+
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h
new file mode 100644
index 00000000000..e45a288e8c0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h
@@ -0,0 +1,199 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/endian.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+namespace internal {
+
+struct PARQUET_EXPORT LevelInfo {
+ LevelInfo()
+ : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
+ LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
+ int32_t repeated_ancestor_definition_level)
+ : null_slot_usage(null_slots),
+ def_level(definition_level),
+ rep_level(repetition_level),
+ repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
+
+ bool operator==(const LevelInfo& b) const {
+ return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
+ rep_level == b.rep_level &&
+ repeated_ancestor_def_level == b.repeated_ancestor_def_level;
+ }
+
+ bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
+
+ // How many slots an undefined but present (i.e. null) element in
+ // parquet consumes when decoding to Arrow.
+ // "Slot" is used in the same context as the Arrow specification
+ // (i.e. a value holder).
+  // This is only ever >1 for descendants of FixedSizeList.
+ int32_t null_slot_usage = 1;
+
+ // The definition level at which the value for the field
+ // is considered not null (definition levels greater than
+ // or equal to this value indicate a not-null
+ // value for the field). For list fields definition levels
+ // greater than or equal to this field indicate a present,
+ // possibly null, child value.
+ int16_t def_level = 0;
+
+ // The repetition level corresponding to this element
+ // or the closest repeated ancestor. Any repetition
+ // level less than this indicates either a new list OR
+ // an empty list (which is determined in conjunction
+ // with definition levels).
+ int16_t rep_level = 0;
+
+ // The definition level indicating the level at which the closest
+ // repeated ancestor is not empty. This is used to discriminate
+ // between a value less than |def_level| being null or excluded entirely.
+ // For instance if we have an arrow schema like:
+  // list(struct(f0: int)). Then there are the following
+ // definition levels:
+ // 0 = null list
+ // 1 = present but empty list.
+ // 2 = a null value in the list
+ // 3 = a non null struct but null integer.
+ // 4 = a present integer.
+ // When reconstructing, the struct and integer arrays'
+ // repeated_ancestor_def_level would be 2. Any
+ // def_level < 2 indicates that there isn't a corresponding
+ // child value in the list.
+ // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
+ // has the def levels [0, 1, 2, 3, 4]. The actual
+ // struct array is only of length 3: [not-set, set, set] and
+ // the int array is also of length 3: [N/A, null, 1].
+ //
+ int16_t repeated_ancestor_def_level = 0;
+
+  /// Increments levels according to the cardinality of the node.
+ void Increment(const schema::Node& node) {
+ if (node.is_repeated()) {
+ IncrementRepeated();
+ return;
+ }
+ if (node.is_optional()) {
+ IncrementOptional();
+ return;
+ }
+ }
+
+  /// Increments the definition level for an optional node.
+ void IncrementOptional() { def_level++; }
+
+  /// Increments levels for a repeated node. Returns
+  /// the previous repeated_ancestor_def_level.
+ int16_t IncrementRepeated() {
+ int16_t last_repeated_ancestor = repeated_ancestor_def_level;
+
+ // Repeated fields add both a repetition and definition level. This is used
+ // to distinguish between an empty list and a list with an item in it.
+ ++rep_level;
+ ++def_level;
+    // For levels >= repeated_ancestor_def_level it indicates the list was
+    // non-null and had at least one element. This is important
+    // for later decoding because we need to add a slot for these
+    // values. For levels < current_def_level no slots are added
+    // to arrays.
+ repeated_ancestor_def_level = def_level;
+ return last_repeated_ancestor;
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
+ if (levels.null_slot_usage > 1) {
+ os << ", null_slot_usage=" << levels.null_slot_usage;
+ }
+ os << "}";
+ return os;
+ }
+};
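+
+// Illustrative walk-through (an assumed schema, not part of the API): for an
+// optional list of optional int32, starting from LevelInfo() and calling
+// IncrementOptional() for the list field, IncrementRepeated() for the
+// repeated node, then IncrementOptional() for the element yields
+// {def=3, rep=1, repeated_ancestor_def=2}. Def levels then decode as
+// 0 = null list, 1 = empty list, 2 = null element, 3 = present element.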
+
+// Input/Output structure for reconstructed validity bitmaps.
+struct PARQUET_EXPORT ValidityBitmapInputOutput {
+ // Input only.
+ // The maximum number of values_read expected (actual
+ // values read must be less than or equal to this value).
+ // If this number is exceeded methods will throw a
+ // ParquetException. Exceeding this limit indicates
+ // either a corrupt or incorrectly written file.
+ int64_t values_read_upper_bound = 0;
+  // Output only. The number of values decoded into the output
+  // (this is logically the element count of the resulting Arrow array).
+ int64_t values_read = 0;
+ // Input/Output. The number of nulls encountered.
+ int64_t null_count = 0;
+  // Output only. The validity bitmap to populate. May be null only
+ // for DefRepLevelsToListInfo (if all that is needed is list offsets).
+ uint8_t* valid_bits = NULLPTR;
+ // Input only, offset into valid_bits to start at.
+ int64_t valid_bits_offset = 0;
+};
+
+// Converts def_levels to validity bitmaps for non-list arrays and structs that have
+// at least one member that is not a list and has no list descendants.
+// For lists use DefRepLevelsToList; for structs where all descendants contain
+// a list use DefRepLevelsToBitmap.
+void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
+ LevelInfo level_info,
+ ValidityBitmapInputOutput* output);
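+
+// Illustrative example: with level_info.def_level == 1 and no repeated
+// parent, def_levels = {1, 0, 1} produce the validity bitmap 0b101
+// (LSB first), values_read = 3, and null_count incremented by 1.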
+
+// Reconstructs a validity bitmap and list offsets for list arrays based on
+// def/rep levels. The first element of offsets will not be modified if rep_levels
+// starts with a new list. The first element of offsets will be used when calculating
+// the next offset. See the documentation of DefLevelsToBitmap for when to use this
+// method vs the other ones in this file for reconstruction.
+//
+// Offsets must be sized to 1 + values_read_upper_bound.
+void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
+ const int16_t* rep_levels, int64_t num_def_levels,
+ LevelInfo level_info,
+ ValidityBitmapInputOutput* output,
+ int32_t* offsets);
+void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
+ const int16_t* rep_levels, int64_t num_def_levels,
+ LevelInfo level_info,
+ ValidityBitmapInputOutput* output,
+ int64_t* offsets);
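+
+// Illustrative example (the LevelInfo below is an assumed setup for an
+// optional list of required int32: def_level = 2, rep_level = 1,
+// repeated_ancestor_def_level = 0): rows [[1, 2], [], null] carry
+// def_levels = {2, 2, 1, 0} and rep_levels = {0, 1, 0, 0}; starting from
+// offsets[0] = 0 this reconstructs offsets = {0, 2, 2, 2}, the validity
+// bitmap 0b011 (LSB first), and null_count = 1.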
+
+// Reconstructs a validity bitmap for a struct whose every member is a list or
+// has a list descendant. See the documentation of DefLevelsToBitmap for more
+// details on this method compared to the other ones defined above.
+void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
+ const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output);
+
+// This is exposed to ensure we can properly test a software simulated pext function
+// (i.e. it isn't hidden by runtime dispatch).
+uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
+
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h
new file mode 100644
index 00000000000..75c7716c483
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h
@@ -0,0 +1,357 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "parquet/level_conversion.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/simd.h"
+#include "parquet/exception.h"
+#include "parquet/level_comparison.h"
+
+namespace parquet {
+namespace internal {
+#ifndef PARQUET_IMPL_NAMESPACE
+#error "PARQUET_IMPL_NAMESPACE must be defined"
+#endif
+namespace PARQUET_IMPL_NAMESPACE {
+
+// clang-format off
+/* Python code to generate lookup table:
+
+kLookupBits = 5
+count = 0
+print('constexpr int kLookupBits = {};'.format(kLookupBits))
+print('constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {')
+print(' ', end = '')
+for mask in range(1 << kLookupBits):
+ for data in range(1 << kLookupBits):
+ bit_value = 0
+ bit_len = 0
+ for i in range(kLookupBits):
+ if mask & (1 << i):
+ bit_value |= (((data >> i) & 1) << bit_len)
+ bit_len += 1
+ out = '0x{:02X},'.format(bit_value)
+ count += 1
+ if count % (1 << kLookupBits) == 1:
+ print(' {')
+ if count % 8 == 1:
+ print(' ', end = '')
+ if count % 8 == 0:
+ print(out, end = '\n')
+ else:
+ print(out, end = ' ')
+ if count % (1 << kLookupBits) == 0:
+ print(' },', end = '')
+print('\n};')
+
+*/
+// clang-format on
+
+constexpr int kLookupBits = 5;
+constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
+ 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
+ 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+ 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
+ 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
+ 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
+ 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
+ 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
+ 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
+ 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
+ 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
+ 0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+ 0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+ 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+ 0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
+ 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
+ 0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+ 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
+ 0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
+ 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
+ 0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
+ 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
+ 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+ 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
+ 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02,
+ 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
+ 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05,
+ 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03,
+ 0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
+ 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07,
+ 0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
+ 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06,
+ 0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
+ 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+ 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
+ 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05,
+ 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+ 0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04,
+ 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+ 0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09,
+ 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
+ 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05,
+ 0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
+ 0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B,
+ 0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
+ 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A,
+ 0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
+ 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+ },
+};
+
+inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
+ // A software emulation of _pext_u64
+
+ // These checks should be inline and are likely to be common cases.
+ if (select_bitmap == ~uint64_t{0}) {
+ return bitmap;
+ } else if (select_bitmap == 0) {
+ return 0;
+ }
+
+ // Fallback to lookup table method
+ uint64_t bit_value = 0;
+ int bit_len = 0;
+ constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1;
+ while (select_bitmap != 0) {
+ const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask);
+ const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask];
+ bit_value |= (value << bit_len);
+ bit_len += mask_len;
+ bitmap >>= kLookupBits;
+ select_bitmap >>= kLookupBits;
+ }
+ return bit_value;
+}
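+
+// Worked example (illustrative): ExtractBitsSoftware(0b1011, 0b1010)
+// gathers the bitmap bits at positions 1 and 3 (both set) into the low
+// bits of the result and returns 0b11, matching the _pext_u64 contract.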
+
+#ifdef ARROW_HAVE_BMI2
+
+// Use _pext_u64 on 64-bit builds, _pext_u32 on 32-bit builds.
+#if UINTPTR_MAX == 0xFFFFFFFF
+
+using extract_bitmap_t = uint32_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+ extract_bitmap_t select_bitmap) {
+ return _pext_u32(bitmap, select_bitmap);
+}
+
+#else
+
+using extract_bitmap_t = uint64_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+ extract_bitmap_t select_bitmap) {
+ return _pext_u64(bitmap, select_bitmap);
+}
+
+#endif
+
+#else // !defined(ARROW_HAVE_BMI2)
+
+// Use 64-bit pext emulation when BMI2 isn't available.
+using extract_bitmap_t = uint64_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+ extract_bitmap_t select_bitmap) {
+ return ExtractBitsSoftware(bitmap, select_bitmap);
+}
+
+#endif
+
+static constexpr int64_t kExtractBitsSize = 8 * sizeof(extract_bitmap_t);
+
+template <bool has_repeated_parent>
+int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size,
+ int64_t upper_bound_remaining, LevelInfo level_info,
+ ::arrow::internal::FirstTimeBitmapWriter* writer) {
+ DCHECK_LE(batch_size, kExtractBitsSize);
+
+ // Greater than level_info.def_level - 1 implies >= the def_level
+ auto defined_bitmap = static_cast<extract_bitmap_t>(
+ internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level - 1));
+
+ if (has_repeated_parent) {
+ // Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
+ // repeated_ancestor_def_level
+ auto present_bitmap = static_cast<extract_bitmap_t>(internal::GreaterThanBitmap(
+ def_levels, batch_size, level_info.repeated_ancestor_def_level - 1));
+ auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
+ int64_t selected_count = ::arrow::BitUtil::PopCount(present_bitmap);
+ if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
+ throw ParquetException("Values read exceeded upper bound");
+ }
+ writer->AppendWord(selected_bits, selected_count);
+ return ::arrow::BitUtil::PopCount(selected_bits);
+ } else {
+ if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) {
+ std::stringstream ss;
+ ss << "Values read exceeded upper bound";
+ throw ParquetException(ss.str());
+ }
+
+ writer->AppendWord(defined_bitmap, batch_size);
+ return ::arrow::BitUtil::PopCount(defined_bitmap);
+ }
+}
+
+template <bool has_repeated_parent>
+void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels,
+ LevelInfo level_info, ValidityBitmapInputOutput* output) {
+ ::arrow::internal::FirstTimeBitmapWriter writer(
+ output->valid_bits,
+ /*start_offset=*/output->valid_bits_offset,
+ /*length=*/num_def_levels);
+ int64_t set_count = 0;
+ output->values_read = 0;
+ int64_t values_read_remaining = output->values_read_upper_bound;
+ while (num_def_levels > kExtractBitsSize) {
+ set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
+ def_levels, kExtractBitsSize, values_read_remaining, level_info, &writer);
+ def_levels += kExtractBitsSize;
+ num_def_levels -= kExtractBitsSize;
+ values_read_remaining = output->values_read_upper_bound - writer.position();
+ }
+ set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
+ def_levels, num_def_levels, values_read_remaining, level_info, &writer);
+
+ output->values_read = writer.position();
+ output->null_count += output->values_read - set_count;
+ writer.Finish();
+}
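+
+// Batching illustration: with num_def_levels = 70 and a 64-bit
+// extract_bitmap_t, the loop above converts the first 64 levels in one
+// word and the trailing call handles the remaining 6.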
+
+} // namespace PARQUET_IMPL_NAMESPACE
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc
new file mode 100644
index 00000000000..bd9bf77c42d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc
@@ -0,0 +1,1783 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/metadata.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/io/memory.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/schema_internal.h"
+#include "parquet/statistics.h"
+#include "parquet/thrift_internal.h"
+
+namespace parquet {
+
+const ApplicationVersion& ApplicationVersion::PARQUET_251_FIXED_VERSION() {
+ static ApplicationVersion version("parquet-mr", 1, 8, 0);
+ return version;
+}
+
+const ApplicationVersion& ApplicationVersion::PARQUET_816_FIXED_VERSION() {
+ static ApplicationVersion version("parquet-mr", 1, 2, 9);
+ return version;
+}
+
+const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION() {
+ static ApplicationVersion version("parquet-cpp", 1, 3, 0);
+ return version;
+}
+
+const ApplicationVersion& ApplicationVersion::PARQUET_MR_FIXED_STATS_VERSION() {
+ static ApplicationVersion version("parquet-mr", 1, 10, 0);
+ return version;
+}
+
+std::string ParquetVersionToString(ParquetVersion::type ver) {
+ switch (ver) {
+ case ParquetVersion::PARQUET_1_0:
+ return "1.0";
+ case ParquetVersion::PARQUET_2_0:
+ return "2.0";
+ }
+
+ // This should be unreachable
+ return "UNKNOWN";
+}
+
+template <typename DType>
+static std::shared_ptr<Statistics> MakeTypedColumnStats(
+ const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
+ // If ColumnOrder is defined, return max_value and min_value
+ if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
+ return MakeStatistics<DType>(
+ descr, metadata.statistics.min_value, metadata.statistics.max_value,
+ metadata.num_values - metadata.statistics.null_count,
+ metadata.statistics.null_count, metadata.statistics.distinct_count,
+ metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value,
+ metadata.statistics.__isset.null_count,
+ metadata.statistics.__isset.distinct_count);
+ }
+ // Default behavior
+ return MakeStatistics<DType>(
+ descr, metadata.statistics.min, metadata.statistics.max,
+ metadata.num_values - metadata.statistics.null_count,
+ metadata.statistics.null_count, metadata.statistics.distinct_count,
+ metadata.statistics.__isset.max || metadata.statistics.__isset.min,
+ metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count);
+}
+
+std::shared_ptr<Statistics> MakeColumnStats(const format::ColumnMetaData& meta_data,
+ const ColumnDescriptor* descr) {
+ switch (static_cast<Type::type>(meta_data.type)) {
+ case Type::BOOLEAN:
+ return MakeTypedColumnStats<BooleanType>(meta_data, descr);
+ case Type::INT32:
+ return MakeTypedColumnStats<Int32Type>(meta_data, descr);
+ case Type::INT64:
+ return MakeTypedColumnStats<Int64Type>(meta_data, descr);
+ case Type::INT96:
+ return MakeTypedColumnStats<Int96Type>(meta_data, descr);
+ case Type::DOUBLE:
+ return MakeTypedColumnStats<DoubleType>(meta_data, descr);
+ case Type::FLOAT:
+ return MakeTypedColumnStats<FloatType>(meta_data, descr);
+ case Type::BYTE_ARRAY:
+ return MakeTypedColumnStats<ByteArrayType>(meta_data, descr);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return MakeTypedColumnStats<FLBAType>(meta_data, descr);
+ case Type::UNDEFINED:
+ break;
+ }
+ throw ParquetException("Can't decode page statistics for selected column type");
+}
+
+// MetaData Accessor
+
+// ColumnCryptoMetaData
+class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl {
+ public:
+ explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata)
+ : crypto_metadata_(crypto_metadata) {}
+
+ bool encrypted_with_footer_key() const {
+ return crypto_metadata_->__isset.ENCRYPTION_WITH_FOOTER_KEY;
+ }
+ bool encrypted_with_column_key() const {
+ return crypto_metadata_->__isset.ENCRYPTION_WITH_COLUMN_KEY;
+ }
+ std::shared_ptr<schema::ColumnPath> path_in_schema() const {
+ return std::make_shared<schema::ColumnPath>(
+ crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
+ }
+ const std::string& key_metadata() const {
+ return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
+ }
+
+ private:
+ const format::ColumnCryptoMetaData* crypto_metadata_;
+};
+
+std::unique_ptr<ColumnCryptoMetaData> ColumnCryptoMetaData::Make(
+ const uint8_t* metadata) {
+ return std::unique_ptr<ColumnCryptoMetaData>(new ColumnCryptoMetaData(metadata));
+}
+
+ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata)
+ : impl_(new ColumnCryptoMetaDataImpl(
+ reinterpret_cast<const format::ColumnCryptoMetaData*>(metadata))) {}
+
+ColumnCryptoMetaData::~ColumnCryptoMetaData() = default;
+
+std::shared_ptr<schema::ColumnPath> ColumnCryptoMetaData::path_in_schema() const {
+ return impl_->path_in_schema();
+}
+bool ColumnCryptoMetaData::encrypted_with_footer_key() const {
+ return impl_->encrypted_with_footer_key();
+}
+const std::string& ColumnCryptoMetaData::key_metadata() const {
+ return impl_->key_metadata();
+}
+
+// ColumnChunk metadata
+class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
+ public:
+ explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column,
+ const ColumnDescriptor* descr,
+ int16_t row_group_ordinal, int16_t column_ordinal,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : column_(column), descr_(descr), writer_version_(writer_version) {
+ column_metadata_ = &column->meta_data;
+ if (column->__isset.crypto_metadata) { // column metadata is encrypted
+ format::ColumnCryptoMetaData ccmd = column->crypto_metadata;
+
+ if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
+ if (file_decryptor != nullptr && file_decryptor->properties() != nullptr) {
+ // should decrypt metadata
+ std::shared_ptr<schema::ColumnPath> path = std::make_shared<schema::ColumnPath>(
+ ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
+ std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
+
+ std::string aad_column_metadata = encryption::CreateModuleAad(
+ file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal,
+ column_ordinal, static_cast<int16_t>(-1));
+ auto decryptor = file_decryptor->GetColumnMetaDecryptor(
+ path->ToDotString(), key_metadata, aad_column_metadata);
+ auto len = static_cast<uint32_t>(column->encrypted_column_metadata.size());
+ DeserializeThriftMsg(
+ reinterpret_cast<const uint8_t*>(column->encrypted_column_metadata.c_str()),
+ &len, &decrypted_metadata_, decryptor);
+ column_metadata_ = &decrypted_metadata_;
+ } else {
+ throw ParquetException(
+ "Cannot decrypt ColumnMetadata."
+            " FileDecryption is not set up correctly");
+ }
+ }
+ }
+ for (const auto& encoding : column_metadata_->encodings) {
+ encodings_.push_back(LoadEnumSafe(&encoding));
+ }
+ for (const auto& encoding_stats : column_metadata_->encoding_stats) {
+ encoding_stats_.push_back({LoadEnumSafe(&encoding_stats.page_type),
+ LoadEnumSafe(&encoding_stats.encoding),
+ encoding_stats.count});
+ }
+ possible_stats_ = nullptr;
+ }
+
+ bool Equals(const ColumnChunkMetaDataImpl& other) const {
+ return *column_metadata_ == *other.column_metadata_;
+ }
+
+ // column chunk
+ inline int64_t file_offset() const { return column_->file_offset; }
+ inline const std::string& file_path() const { return column_->file_path; }
+
+ inline Type::type type() const { return LoadEnumSafe(&column_metadata_->type); }
+
+ inline int64_t num_values() const { return column_metadata_->num_values; }
+
+ std::shared_ptr<schema::ColumnPath> path_in_schema() {
+ return std::make_shared<schema::ColumnPath>(column_metadata_->path_in_schema);
+ }
+
+ // Check if statistics are set and are valid
+ // 1) Must be set in the metadata
+ // 2) Statistics must not be corrupted
+ inline bool is_stats_set() const {
+ DCHECK(writer_version_ != nullptr);
+ // If the column statistics don't exist or column sort order is unknown
+ // we cannot use the column stats
+ if (!column_metadata_->__isset.statistics ||
+ descr_->sort_order() == SortOrder::UNKNOWN) {
+ return false;
+ }
+ if (possible_stats_ == nullptr) {
+ possible_stats_ = MakeColumnStats(*column_metadata_, descr_);
+ }
+    EncodedStatistics encoded_stats = possible_stats_->Encode();
+    return writer_version_->HasCorrectStatistics(type(), encoded_stats,
+                                                 descr_->sort_order());
+ }
+
+ inline std::shared_ptr<Statistics> statistics() const {
+ return is_stats_set() ? possible_stats_ : nullptr;
+ }
+
+ inline Compression::type compression() const {
+ return LoadEnumSafe(&column_metadata_->codec);
+ }
+
+ const std::vector<Encoding::type>& encodings() const { return encodings_; }
+
+ const std::vector<PageEncodingStats>& encoding_stats() const { return encoding_stats_; }
+
+ inline bool has_dictionary_page() const {
+ return column_metadata_->__isset.dictionary_page_offset;
+ }
+
+ inline int64_t dictionary_page_offset() const {
+ return column_metadata_->dictionary_page_offset;
+ }
+
+ inline int64_t data_page_offset() const { return column_metadata_->data_page_offset; }
+
+ inline bool has_index_page() const {
+ return column_metadata_->__isset.index_page_offset;
+ }
+
+ inline int64_t index_page_offset() const { return column_metadata_->index_page_offset; }
+
+ inline int64_t total_compressed_size() const {
+ return column_metadata_->total_compressed_size;
+ }
+
+ inline int64_t total_uncompressed_size() const {
+ return column_metadata_->total_uncompressed_size;
+ }
+
+ inline std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const {
+ if (column_->__isset.crypto_metadata) {
+ return ColumnCryptoMetaData::Make(
+ reinterpret_cast<const uint8_t*>(&column_->crypto_metadata));
+ } else {
+ return nullptr;
+ }
+ }
+
+ private:
+ mutable std::shared_ptr<Statistics> possible_stats_;
+ std::vector<Encoding::type> encodings_;
+ std::vector<PageEncodingStats> encoding_stats_;
+ const format::ColumnChunk* column_;
+ const format::ColumnMetaData* column_metadata_;
+ format::ColumnMetaData decrypted_metadata_;
+ const ColumnDescriptor* descr_;
+ const ApplicationVersion* writer_version_;
+};
+
+std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(
+ const void* metadata, const ColumnDescriptor* descr,
+ const ApplicationVersion* writer_version, int16_t row_group_ordinal,
+ int16_t column_ordinal, std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ return std::unique_ptr<ColumnChunkMetaData>(
+ new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal,
+ writer_version, std::move(file_decryptor)));
+}
+
+ColumnChunkMetaData::ColumnChunkMetaData(
+ const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
+ int16_t column_ordinal, const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : impl_{new ColumnChunkMetaDataImpl(
+ reinterpret_cast<const format::ColumnChunk*>(metadata), descr,
+ row_group_ordinal, column_ordinal, writer_version, std::move(file_decryptor))} {
+}
+
+ColumnChunkMetaData::~ColumnChunkMetaData() = default;
+
+// column chunk
+int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); }
+
+const std::string& ColumnChunkMetaData::file_path() const { return impl_->file_path(); }
+
+Type::type ColumnChunkMetaData::type() const { return impl_->type(); }
+
+int64_t ColumnChunkMetaData::num_values() const { return impl_->num_values(); }
+
+std::shared_ptr<schema::ColumnPath> ColumnChunkMetaData::path_in_schema() const {
+ return impl_->path_in_schema();
+}
+
+std::shared_ptr<Statistics> ColumnChunkMetaData::statistics() const {
+ return impl_->statistics();
+}
+
+bool ColumnChunkMetaData::is_stats_set() const { return impl_->is_stats_set(); }
+
+bool ColumnChunkMetaData::has_dictionary_page() const {
+ return impl_->has_dictionary_page();
+}
+
+int64_t ColumnChunkMetaData::dictionary_page_offset() const {
+ return impl_->dictionary_page_offset();
+}
+
+int64_t ColumnChunkMetaData::data_page_offset() const {
+ return impl_->data_page_offset();
+}
+
+bool ColumnChunkMetaData::has_index_page() const { return impl_->has_index_page(); }
+
+int64_t ColumnChunkMetaData::index_page_offset() const {
+ return impl_->index_page_offset();
+}
+
+Compression::type ColumnChunkMetaData::compression() const {
+ return impl_->compression();
+}
+
+bool ColumnChunkMetaData::can_decompress() const {
+ return ::arrow::util::Codec::IsAvailable(compression());
+}
+
+const std::vector<Encoding::type>& ColumnChunkMetaData::encodings() const {
+ return impl_->encodings();
+}
+
+const std::vector<PageEncodingStats>& ColumnChunkMetaData::encoding_stats() const {
+ return impl_->encoding_stats();
+}
+
+int64_t ColumnChunkMetaData::total_uncompressed_size() const {
+ return impl_->total_uncompressed_size();
+}
+
+int64_t ColumnChunkMetaData::total_compressed_size() const {
+ return impl_->total_compressed_size();
+}
+
+std::unique_ptr<ColumnCryptoMetaData> ColumnChunkMetaData::crypto_metadata() const {
+ return impl_->crypto_metadata();
+}
+
+bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const {
+ return impl_->Equals(*other.impl_);
+}
+
+// row-group metadata
+class RowGroupMetaData::RowGroupMetaDataImpl {
+ public:
+ explicit RowGroupMetaDataImpl(const format::RowGroup* row_group,
+ const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : row_group_(row_group),
+ schema_(schema),
+ writer_version_(writer_version),
+ file_decryptor_(std::move(file_decryptor)) {}
+
+ bool Equals(const RowGroupMetaDataImpl& other) const {
+ return *row_group_ == *other.row_group_;
+ }
+
+ inline int num_columns() const { return static_cast<int>(row_group_->columns.size()); }
+
+ inline int64_t num_rows() const { return row_group_->num_rows; }
+
+ inline int64_t total_byte_size() const { return row_group_->total_byte_size; }
+
+ inline int64_t total_compressed_size() const {
+ return row_group_->total_compressed_size;
+ }
+
+ inline int64_t file_offset() const { return row_group_->file_offset; }
+
+ inline const SchemaDescriptor* schema() const { return schema_; }
+
+ std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) {
+ if (i < num_columns()) {
+ return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i),
+ writer_version_, row_group_->ordinal,
+ static_cast<int16_t>(i), file_decryptor_);
+ }
+ throw ParquetException("The file only has ", num_columns(),
+ " columns, requested metadata for column: ", i);
+ }
+
+ private:
+ const format::RowGroup* row_group_;
+ const SchemaDescriptor* schema_;
+ const ApplicationVersion* writer_version_;
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+};
+
+std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(
+ const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ return std::unique_ptr<RowGroupMetaData>(
+ new RowGroupMetaData(metadata, schema, writer_version, std::move(file_decryptor)));
+}
+
+RowGroupMetaData::RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : impl_{new RowGroupMetaDataImpl(reinterpret_cast<const format::RowGroup*>(metadata),
+ schema, writer_version, std::move(file_decryptor))} {
+}
+
+RowGroupMetaData::~RowGroupMetaData() = default;
+
+bool RowGroupMetaData::Equals(const RowGroupMetaData& other) const {
+ return impl_->Equals(*other.impl_);
+}
+
+int RowGroupMetaData::num_columns() const { return impl_->num_columns(); }
+
+int64_t RowGroupMetaData::num_rows() const { return impl_->num_rows(); }
+
+int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_size(); }
+
+int64_t RowGroupMetaData::total_compressed_size() const {
+ return impl_->total_compressed_size();
+}
+
+int64_t RowGroupMetaData::file_offset() const { return impl_->file_offset(); }
+
+const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); }
+
+std::unique_ptr<ColumnChunkMetaData> RowGroupMetaData::ColumnChunk(int i) const {
+ return impl_->ColumnChunk(i);
+}
+
+bool RowGroupMetaData::can_decompress() const {
+ int n_columns = num_columns();
+ for (int i = 0; i < n_columns; i++) {
+ if (!ColumnChunk(i)->can_decompress()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// file metadata
+class FileMetaData::FileMetaDataImpl {
+ public:
+ FileMetaDataImpl() = default;
+
+ explicit FileMetaDataImpl(
+ const void* metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
+ : file_decryptor_(file_decryptor) {
+ metadata_.reset(new format::FileMetaData);
+
+ auto footer_decryptor =
+ file_decryptor_ != nullptr ? file_decryptor->GetFooterDecryptor() : nullptr;
+
+ DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(metadata), metadata_len,
+ metadata_.get(), footer_decryptor);
+ metadata_len_ = *metadata_len;
+
+ if (metadata_->__isset.created_by) {
+ writer_version_ = ApplicationVersion(metadata_->created_by);
+ } else {
+ writer_version_ = ApplicationVersion("unknown 0.0.0");
+ }
+
+ InitSchema();
+ InitColumnOrders();
+ InitKeyValueMetadata();
+ }
+
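+  // Verifies the footer signature of a plaintext-footer file: re-encrypts the
+  // serialized footer with the nonce stored in the signature and compares the
+  // resulting GCM tag against the stored tag.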
+ bool VerifySignature(const void* signature) {
+ // verify decryption properties are set
+ if (file_decryptor_ == nullptr) {
+ throw ParquetException("Decryption not set properly. cannot verify signature");
+ }
+ // serialize the footer
+ uint8_t* serialized_data;
+ uint32_t serialized_len = metadata_len_;
+ ThriftSerializer serializer;
+ serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data);
+
+ // encrypt with nonce
+ auto nonce = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(signature));
+ auto tag = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(signature)) +
+ encryption::kNonceLength;
+
+ std::string key = file_decryptor_->GetFooterKey();
+ std::string aad = encryption::CreateFooterAad(file_decryptor_->file_aad());
+
+ auto aes_encryptor = encryption::AesEncryptor::Make(
+ file_decryptor_->algorithm(), static_cast<int>(key.size()), true, nullptr);
+
+ std::shared_ptr<Buffer> encrypted_buffer = std::static_pointer_cast<ResizableBuffer>(
+ AllocateBuffer(file_decryptor_->pool(),
+ aes_encryptor->CiphertextSizeDelta() + serialized_len));
+ uint32_t encrypted_len = aes_encryptor->SignedFooterEncrypt(
+ serialized_data, serialized_len, str2bytes(key), static_cast<int>(key.size()),
+ str2bytes(aad), static_cast<int>(aad.size()), nonce,
+ encrypted_buffer->mutable_data());
+ // Delete AES encryptor object. It was created only to verify the footer signature.
+ aes_encryptor->WipeOut();
+ delete aes_encryptor;
+ return 0 ==
+ memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength,
+ tag, encryption::kGcmTagLength);
+ }
+
+ inline uint32_t size() const { return metadata_len_; }
+ inline int num_columns() const { return schema_.num_columns(); }
+ inline int64_t num_rows() const { return metadata_->num_rows; }
+ inline int num_row_groups() const {
+ return static_cast<int>(metadata_->row_groups.size());
+ }
+ inline int32_t version() const { return metadata_->version; }
+ inline const std::string& created_by() const { return metadata_->created_by; }
+ inline int num_schema_elements() const {
+ return static_cast<int>(metadata_->schema.size());
+ }
+
+ inline bool is_encryption_algorithm_set() const {
+ return metadata_->__isset.encryption_algorithm;
+ }
+ inline EncryptionAlgorithm encryption_algorithm() {
+ return FromThrift(metadata_->encryption_algorithm);
+ }
+ inline const std::string& footer_signing_key_metadata() {
+ return metadata_->footer_signing_key_metadata;
+ }
+
+ const ApplicationVersion& writer_version() const { return writer_version_; }
+
+ void WriteTo(::arrow::io::OutputStream* dst,
+ const std::shared_ptr<Encryptor>& encryptor) const {
+ ThriftSerializer serializer;
+    // The encryption_algorithm is only set in the footer of encrypted
+    // files with plaintext footers
+ if (is_encryption_algorithm_set()) {
+ uint8_t* serialized_data;
+ uint32_t serialized_len;
+ serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data);
+
+      // encrypt the serialized footer to produce the signature
+ std::vector<uint8_t> encrypted_data(encryptor->CiphertextSizeDelta() +
+ serialized_len);
+ unsigned encrypted_len =
+ encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data());
+
+ // write unencrypted footer
+ PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len));
+      // Write signature (nonce and tag); the nonce follows the 4-byte
+      // ciphertext-length prefix at the head of the encrypted buffer
+      PARQUET_THROW_NOT_OK(
+          dst->Write(encrypted_data.data() + 4, encryption::kNonceLength));
+ PARQUET_THROW_NOT_OK(
+ dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength,
+ encryption::kGcmTagLength));
+ } else { // either plaintext file (when encryptor is null)
+ // or encrypted file with encrypted footer
+ serializer.Serialize(metadata_.get(), dst, encryptor);
+ }
+ }
+
+ std::unique_ptr<RowGroupMetaData> RowGroup(int i) {
+ if (!(i < num_row_groups())) {
+ std::stringstream ss;
+ ss << "The file only has " << num_row_groups()
+ << " row groups, requested metadata for row group: " << i;
+ throw ParquetException(ss.str());
+ }
+ return RowGroupMetaData::Make(&metadata_->row_groups[i], &schema_, &writer_version_,
+ file_decryptor_);
+ }
+
+ bool Equals(const FileMetaDataImpl& other) const {
+ return *metadata_ == *other.metadata_;
+ }
+
+ const SchemaDescriptor* schema() const { return &schema_; }
+
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
+ return key_value_metadata_;
+ }
+
+ void set_file_path(const std::string& path) {
+ for (format::RowGroup& row_group : metadata_->row_groups) {
+ for (format::ColumnChunk& chunk : row_group.columns) {
+ chunk.__set_file_path(path);
+ }
+ }
+ }
+
+ format::RowGroup& row_group(int i) {
+ DCHECK_LT(i, num_row_groups());
+ return metadata_->row_groups[i];
+ }
+
+ void AppendRowGroups(const std::unique_ptr<FileMetaDataImpl>& other) {
+ if (!schema()->Equals(*other->schema())) {
+ throw ParquetException("AppendRowGroups requires equal schemas.");
+ }
+
+ format::RowGroup other_rg;
+ for (int i = 0; i < other->num_row_groups(); i++) {
+ other_rg = other->row_group(i);
+ metadata_->row_groups.push_back(other_rg);
+ metadata_->num_rows += other_rg.num_rows;
+ }
+ }
+
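+  // Usage note (illustrative): selection order is preserved, so Subset({2, 0})
+  // returns metadata whose row group 0 is the original row group 2 and whose
+  // row group 1 is the original row group 0.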
+ std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) {
+ for (int i : row_groups) {
+ if (i < num_row_groups()) continue;
+
+ throw ParquetException(
+ "The file only has ", num_row_groups(),
+ " row groups, but requested a subset including row group: ", i);
+ }
+
+ std::shared_ptr<FileMetaData> out(new FileMetaData());
+ out->impl_.reset(new FileMetaDataImpl());
+ out->impl_->metadata_.reset(new format::FileMetaData());
+
+ auto metadata = out->impl_->metadata_.get();
+ metadata->version = metadata_->version;
+ metadata->schema = metadata_->schema;
+
+ metadata->row_groups.resize(row_groups.size());
+ int i = 0;
+ for (int selected_index : row_groups) {
+ metadata->num_rows += row_group(selected_index).num_rows;
+ metadata->row_groups[i++] = row_group(selected_index);
+ }
+
+ metadata->key_value_metadata = metadata_->key_value_metadata;
+ metadata->created_by = metadata_->created_by;
+ metadata->column_orders = metadata_->column_orders;
+ metadata->encryption_algorithm = metadata_->encryption_algorithm;
+ metadata->footer_signing_key_metadata = metadata_->footer_signing_key_metadata;
+ metadata->__isset = metadata_->__isset;
+
+ out->impl_->schema_ = schema_;
+ out->impl_->writer_version_ = writer_version_;
+ out->impl_->key_value_metadata_ = key_value_metadata_;
+ out->impl_->file_decryptor_ = file_decryptor_;
+
+ return out;
+ }
+
+ void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ file_decryptor_ = file_decryptor;
+ }
+
+ private:
+ friend FileMetaDataBuilder;
+ uint32_t metadata_len_ = 0;
+ std::unique_ptr<format::FileMetaData> metadata_;
+ SchemaDescriptor schema_;
+ ApplicationVersion writer_version_;
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+
+ void InitSchema() {
+ if (metadata_->schema.empty()) {
+ throw ParquetException("Empty file schema (no root)");
+ }
+ schema_.Init(schema::Unflatten(&metadata_->schema[0],
+ static_cast<int>(metadata_->schema.size())));
+ }
+
+ void InitColumnOrders() {
+ // update ColumnOrder
+ std::vector<parquet::ColumnOrder> column_orders;
+ if (metadata_->__isset.column_orders) {
+ for (auto column_order : metadata_->column_orders) {
+ if (column_order.__isset.TYPE_ORDER) {
+ column_orders.push_back(ColumnOrder::type_defined_);
+ } else {
+ column_orders.push_back(ColumnOrder::undefined_);
+ }
+ }
+ } else {
+ column_orders.resize(schema_.num_columns(), ColumnOrder::undefined_);
+ }
+
+ schema_.updateColumnOrders(column_orders);
+ }
+
+ void InitKeyValueMetadata() {
+ std::shared_ptr<KeyValueMetadata> metadata = nullptr;
+ if (metadata_->__isset.key_value_metadata) {
+ metadata = std::make_shared<KeyValueMetadata>();
+ for (const auto& it : metadata_->key_value_metadata) {
+ metadata->Append(it.key, it.value);
+ }
+ }
+ key_value_metadata_ = std::move(metadata);
+ }
+};
+
+std::shared_ptr<FileMetaData> FileMetaData::Make(
+ const void* metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ // This FileMetaData ctor is private, not compatible with std::make_shared
+ return std::shared_ptr<FileMetaData>(
+ new FileMetaData(metadata, metadata_len, file_decryptor));
+}
+
+FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : impl_{std::unique_ptr<FileMetaDataImpl>(
+ new FileMetaDataImpl(metadata, metadata_len, file_decryptor))} {}
+
+FileMetaData::FileMetaData()
+ : impl_{std::unique_ptr<FileMetaDataImpl>(new FileMetaDataImpl())} {}
+
+FileMetaData::~FileMetaData() = default;
+
+bool FileMetaData::Equals(const FileMetaData& other) const {
+ return impl_->Equals(*other.impl_);
+}
+
+std::unique_ptr<RowGroupMetaData> FileMetaData::RowGroup(int i) const {
+ return impl_->RowGroup(i);
+}
+
+bool FileMetaData::VerifySignature(const void* signature) {
+ return impl_->VerifySignature(signature);
+}
+
+uint32_t FileMetaData::size() const { return impl_->size(); }
+
+int FileMetaData::num_columns() const { return impl_->num_columns(); }
+
+int64_t FileMetaData::num_rows() const { return impl_->num_rows(); }
+
+int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); }
+
+bool FileMetaData::can_decompress() const {
+ int n_row_groups = num_row_groups();
+ for (int i = 0; i < n_row_groups; i++) {
+ if (!RowGroup(i)->can_decompress()) {
+ return false;
+ }
+ }
+ return true;
+}
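+
+// A typical read-path traversal (illustrative sketch; how `fmd` is obtained is
+// outside this file, e.g. from a file reader):
+//
+//   std::shared_ptr<FileMetaData> fmd = ...;
+//   for (int r = 0; r < fmd->num_row_groups(); ++r) {
+//     std::unique_ptr<RowGroupMetaData> rg = fmd->RowGroup(r);
+//     for (int c = 0; c < rg->num_columns(); ++c) {
+//       std::unique_ptr<ColumnChunkMetaData> cc = rg->ColumnChunk(c);
+//       // e.g. cc->num_values(), cc->compression(), cc->total_compressed_size()
+//     }
+//   }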
+
+bool FileMetaData::is_encryption_algorithm_set() const {
+ return impl_->is_encryption_algorithm_set();
+}
+
+EncryptionAlgorithm FileMetaData::encryption_algorithm() const {
+ return impl_->encryption_algorithm();
+}
+
+const std::string& FileMetaData::footer_signing_key_metadata() const {
+ return impl_->footer_signing_key_metadata();
+}
+
+void FileMetaData::set_file_decryptor(
+ std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ impl_->set_file_decryptor(file_decryptor);
+}
+
+ParquetVersion::type FileMetaData::version() const {
+ switch (impl_->version()) {
+ case 1:
+ return ParquetVersion::PARQUET_1_0;
+ case 2:
+ return ParquetVersion::PARQUET_2_0;
+ default:
+ // Improperly set version, assuming Parquet 1.0
+ break;
+ }
+ return ParquetVersion::PARQUET_1_0;
+}
+
+const ApplicationVersion& FileMetaData::writer_version() const {
+ return impl_->writer_version();
+}
+
+const std::string& FileMetaData::created_by() const { return impl_->created_by(); }
+
+int FileMetaData::num_schema_elements() const { return impl_->num_schema_elements(); }
+
+const SchemaDescriptor* FileMetaData::schema() const { return impl_->schema(); }
+
+const std::shared_ptr<const KeyValueMetadata>& FileMetaData::key_value_metadata() const {
+ return impl_->key_value_metadata();
+}
+
+void FileMetaData::set_file_path(const std::string& path) { impl_->set_file_path(path); }
+
+void FileMetaData::AppendRowGroups(const FileMetaData& other) {
+ impl_->AppendRowGroups(other.impl_);
+}
+
+std::shared_ptr<FileMetaData> FileMetaData::Subset(
+ const std::vector<int>& row_groups) const {
+ return impl_->Subset(row_groups);
+}
+
+void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
+ const std::shared_ptr<Encryptor>& encryptor) const {
+ return impl_->WriteTo(dst, encryptor);
+}
+
+class FileCryptoMetaData::FileCryptoMetaDataImpl {
+ public:
+ FileCryptoMetaDataImpl() = default;
+
+ explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) {
+ metadata_.reset(new format::FileCryptoMetaData);
+ DeserializeThriftMsg(metadata, metadata_len, metadata_.get());
+ metadata_len_ = *metadata_len;
+ }
+
+ EncryptionAlgorithm encryption_algorithm() {
+ return FromThrift(metadata_->encryption_algorithm);
+ }
+ const std::string& key_metadata() { return metadata_->key_metadata; }
+ void WriteTo(::arrow::io::OutputStream* dst) const {
+ ThriftSerializer serializer;
+ serializer.Serialize(metadata_.get(), dst);
+ }
+
+ private:
+ friend FileMetaDataBuilder;
+ std::unique_ptr<format::FileCryptoMetaData> metadata_;
+ uint32_t metadata_len_;
+};
+
+EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() const {
+ return impl_->encryption_algorithm();
+}
+
+const std::string& FileCryptoMetaData::key_metadata() const {
+ return impl_->key_metadata();
+}
+
+std::shared_ptr<FileCryptoMetaData> FileCryptoMetaData::Make(
+ const uint8_t* serialized_metadata, uint32_t* metadata_len) {
+ return std::shared_ptr<FileCryptoMetaData>(
+ new FileCryptoMetaData(serialized_metadata, metadata_len));
+}
+
+FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata,
+ uint32_t* metadata_len)
+ : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {}
+
+FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) {}
+
+FileCryptoMetaData::~FileCryptoMetaData() = default;
+
+void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const {
+ impl_->WriteTo(dst);
+}
+
+std::string FileMetaData::SerializeToString() const {
+  // We need to pass an initial size; the stream grows automatically to
+  // hold the metadata, so 0 suffices.
+ PARQUET_ASSIGN_OR_THROW(auto serializer, ::arrow::io::BufferOutputStream::Create(0));
+ WriteTo(serializer.get());
+ PARQUET_ASSIGN_OR_THROW(auto metadata_buffer, serializer->Finish());
+ return metadata_buffer->ToString();
+}
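+
+// Round-trip sketch (illustrative; assumes a plaintext footer, since the
+// decryptor argument of FileMetaData::Make defaults to null):
+//
+//   std::string s = file_metadata->SerializeToString();
+//   uint32_t len = static_cast<uint32_t>(s.size());
+//   std::shared_ptr<FileMetaData> reread = FileMetaData::Make(s.data(), &len);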
+
+ApplicationVersion::ApplicationVersion(std::string application, int major, int minor,
+ int patch)
+ : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
+
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible with
+// parquet-mr's. See also:
+// * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+// * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+// "${APPLICATION_NAME}"
+// "${APPLICATION_NAME} version ${VERSION}"
+// "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+// parquet-cpp
+// parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+// parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+// "${MAJOR}"
+// "${MAJOR}.${MINOR}"
+// "${MAJOR}.${MINOR}.${PATCH}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+// 1
+// 1.5
+// 1.5.0
+// 1.5.0ab
+// 1.5.0ab-cdh5.5.0
+// 1.5.0ab-cdh5.5.0+cd
+// 1.5.0ab+cd
+// 1.5.0-cdh5.5.0
+// 1.5.0-cdh5.5.0+cd
+// 1.5.0+cd
+class ApplicationVersionParser {
+ public:
+ ApplicationVersionParser(const std::string& created_by,
+ ApplicationVersion& application_version)
+ : created_by_(created_by),
+ application_version_(application_version),
+ spaces_(" \t\v\r\n\f"),
+ digits_("0123456789") {}
+
+ void Parse() {
+ application_version_.application_ = "unknown";
+ application_version_.version = {0, 0, 0, "", "", ""};
+
+ if (!ParseApplicationName()) {
+ return;
+ }
+ if (!ParseVersion()) {
+ return;
+ }
+ if (!ParseBuildName()) {
+ return;
+ }
+ }
+
+ private:
+ bool IsSpace(const std::string& string, const size_t& offset) {
+ auto target = ::arrow::util::string_view(string).substr(offset, 1);
+ return target.find_first_of(spaces_) != ::arrow::util::string_view::npos;
+ }
+
+ void RemovePrecedingSpaces(const std::string& string, size_t& start,
+ const size_t& end) {
+ while (start < end && IsSpace(string, start)) {
+ ++start;
+ }
+ }
+
+ void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
+ while (start < (end - 1) && (end - 1) < string.size() && IsSpace(string, end - 1)) {
+ --end;
+ }
+ }
+
+ bool ParseApplicationName() {
+ std::string version_mark(" version ");
+ auto version_mark_position = created_by_.find(version_mark);
+ size_t application_name_end;
+ // No VERSION and BUILD_NAME.
+ if (version_mark_position == std::string::npos) {
+ version_start_ = std::string::npos;
+ application_name_end = created_by_.size();
+ } else {
+ version_start_ = version_mark_position + version_mark.size();
+ application_name_end = version_mark_position;
+ }
+
+ size_t application_name_start = 0;
+ RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
+ RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
+ application_version_.application_ = created_by_.substr(
+ application_name_start, application_name_end - application_name_start);
+
+ return true;
+ }
+
+ bool ParseVersion() {
+ // No VERSION.
+ if (version_start_ == std::string::npos) {
+ return false;
+ }
+
+ RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
+ version_end_ = created_by_.find(" (", version_start_);
+ // No BUILD_NAME.
+ if (version_end_ == std::string::npos) {
+ version_end_ = created_by_.size();
+ }
+ RemoveTrailingSpaces(created_by_, version_start_, version_end_);
+ // No VERSION.
+ if (version_start_ == version_end_) {
+ return false;
+ }
+ version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
+
+ if (!ParseVersionMajor()) {
+ return false;
+ }
+ if (!ParseVersionMinor()) {
+ return false;
+ }
+ if (!ParseVersionPatch()) {
+ return false;
+ }
+ if (!ParseVersionUnknown()) {
+ return false;
+ }
+ if (!ParseVersionPreRelease()) {
+ return false;
+ }
+ if (!ParseVersionBuildInfo()) {
+ return false;
+ }
+
+ return true;
+ }
+
+ bool ParseVersionMajor() {
+ size_t version_major_start = 0;
+ auto version_major_end = version_string_.find_first_not_of(digits_);
+ // MAJOR only.
+ if (version_major_end == std::string::npos) {
+ version_major_end = version_string_.size();
+ version_parsing_position_ = version_major_end;
+ } else {
+ // No ".".
+ if (version_string_[version_major_end] != '.') {
+ return false;
+ }
+ // No MAJOR.
+ if (version_major_end == version_major_start) {
+ return false;
+ }
+ version_parsing_position_ = version_major_end + 1; // +1 is for '.'.
+ }
+ auto version_major_string = version_string_.substr(
+ version_major_start, version_major_end - version_major_start);
+ application_version_.version.major = atoi(version_major_string.c_str());
+ return true;
+ }
+
+ bool ParseVersionMinor() {
+ auto version_minor_start = version_parsing_position_;
+ auto version_minor_end =
+ version_string_.find_first_not_of(digits_, version_minor_start);
+ // MAJOR.MINOR only.
+ if (version_minor_end == std::string::npos) {
+ version_minor_end = version_string_.size();
+ version_parsing_position_ = version_minor_end;
+ } else {
+ // No ".".
+ if (version_string_[version_minor_end] != '.') {
+ return false;
+ }
+ // No MINOR.
+ if (version_minor_end == version_minor_start) {
+ return false;
+ }
+ version_parsing_position_ = version_minor_end + 1; // +1 is for '.'.
+ }
+ auto version_minor_string = version_string_.substr(
+ version_minor_start, version_minor_end - version_minor_start);
+ application_version_.version.minor = atoi(version_minor_string.c_str());
+ return true;
+ }
+
+ bool ParseVersionPatch() {
+ auto version_patch_start = version_parsing_position_;
+ auto version_patch_end =
+ version_string_.find_first_not_of(digits_, version_patch_start);
+ // No UNKNOWN, PRE_RELEASE and BUILD_INFO.
+ if (version_patch_end == std::string::npos) {
+ version_patch_end = version_string_.size();
+ }
+ // No PATCH.
+ if (version_patch_end == version_patch_start) {
+ return false;
+ }
+ auto version_patch_string = version_string_.substr(
+ version_patch_start, version_patch_end - version_patch_start);
+ application_version_.version.patch = atoi(version_patch_string.c_str());
+ version_parsing_position_ = version_patch_end;
+ return true;
+ }
+
+ bool ParseVersionUnknown() {
+ // No UNKNOWN.
+ if (version_parsing_position_ == version_string_.size()) {
+ return true;
+ }
+ auto version_unknown_start = version_parsing_position_;
+ auto version_unknown_end = version_string_.find_first_of("-+", version_unknown_start);
+ // No PRE_RELEASE and BUILD_INFO
+ if (version_unknown_end == std::string::npos) {
+ version_unknown_end = version_string_.size();
+ }
+ application_version_.version.unknown = version_string_.substr(
+ version_unknown_start, version_unknown_end - version_unknown_start);
+ version_parsing_position_ = version_unknown_end;
+ return true;
+ }
+
+ bool ParseVersionPreRelease() {
+ // No PRE_RELEASE.
+ if (version_parsing_position_ == version_string_.size() ||
+ version_string_[version_parsing_position_] != '-') {
+ return true;
+ }
+
+ auto version_pre_release_start = version_parsing_position_ + 1; // +1 is for '-'.
+ auto version_pre_release_end =
+ version_string_.find_first_of("+", version_pre_release_start);
+ // No BUILD_INFO
+ if (version_pre_release_end == std::string::npos) {
+ version_pre_release_end = version_string_.size();
+ }
+ application_version_.version.pre_release = version_string_.substr(
+ version_pre_release_start, version_pre_release_end - version_pre_release_start);
+ version_parsing_position_ = version_pre_release_end;
+ return true;
+ }
+
+ bool ParseVersionBuildInfo() {
+ // No BUILD_INFO.
+ if (version_parsing_position_ == version_string_.size() ||
+ version_string_[version_parsing_position_] != '+') {
+ return true;
+ }
+
+ auto version_build_info_start = version_parsing_position_ + 1; // +1 is for '+'.
+ application_version_.version.build_info =
+ version_string_.substr(version_build_info_start);
+ return true;
+ }
+
+ bool ParseBuildName() {
+ std::string build_mark(" (build ");
+ auto build_mark_position = created_by_.find(build_mark, version_end_);
+ // No BUILD_NAME.
+ if (build_mark_position == std::string::npos) {
+ return false;
+ }
+ auto build_name_start = build_mark_position + build_mark.size();
+ RemovePrecedingSpaces(created_by_, build_name_start, created_by_.size());
+ auto build_name_end = created_by_.find_first_of(")", build_name_start);
+ // No end ")".
+ if (build_name_end == std::string::npos) {
+ return false;
+ }
+ RemoveTrailingSpaces(created_by_, build_name_start, build_name_end);
+ application_version_.build_ =
+ created_by_.substr(build_name_start, build_name_end - build_name_start);
+
+ return true;
+ }
+
+ const std::string& created_by_;
+ ApplicationVersion& application_version_;
+
+ // For parsing.
+ std::string spaces_;
+ std::string digits_;
+ size_t version_parsing_position_;
+ size_t version_start_;
+ size_t version_end_;
+ std::string version_string_;
+};
+} // namespace
+
+ApplicationVersion::ApplicationVersion(const std::string& created_by) {
+ ApplicationVersionParser parser(created_by, *this);
+ parser.Parse();
+}
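+
+// For example (illustrative), parsing the created_by string
+//   "parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)"
+// yields application_ == "parquet-mr", version == {1, 8, 0, "", "", ""} and
+// build_ == "0fda28af84b9746396014ad6a415b90592a98b3b".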
+
+bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) const {
+ if (application_ != other_version.application_) return false;
+
+ if (version.major < other_version.version.major) return true;
+ if (version.major > other_version.version.major) return false;
+ DCHECK_EQ(version.major, other_version.version.major);
+ if (version.minor < other_version.version.minor) return true;
+ if (version.minor > other_version.version.minor) return false;
+ DCHECK_EQ(version.minor, other_version.version.minor);
+ return version.patch < other_version.version.patch;
+}
+
+bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) const {
+ return application_ == other_version.application_ &&
+ version.major == other_version.version.major &&
+ version.minor == other_version.version.minor &&
+ version.patch == other_version.version.patch;
+}
+
+// Reference:
+// parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java
+// PARQUET-686 has more discussion on statistics
+bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
+ EncodedStatistics& statistics,
+ SortOrder::type sort_order) const {
+ // parquet-cpp version 1.3.0 and parquet-mr 1.10.0 onwards stats are computed
+ // correctly for all types
+ if ((application_ == "parquet-cpp" && VersionLt(PARQUET_CPP_FIXED_STATS_VERSION())) ||
+ (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
+    // Only the SIGNED sort order is valid, unless max and min are equal
+    // (in which case the sort order does not matter)
+ bool max_equals_min = statistics.has_min && statistics.has_max
+ ? statistics.min() == statistics.max()
+ : false;
+ if (SortOrder::SIGNED != sort_order && !max_equals_min) {
+ return false;
+ }
+
+ // Statistics of other types are OK
+ if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) {
+ return true;
+ }
+ }
+ // created_by is not populated, which could have been caused by
+ // parquet-mr during the same time as PARQUET-251, see PARQUET-297
+ if (application_ == "unknown") {
+ return true;
+ }
+
+ // Unknown sort order has incorrect stats
+ if (SortOrder::UNKNOWN == sort_order) {
+ return false;
+ }
+
+ // PARQUET-251
+ if (VersionLt(PARQUET_251_FIXED_VERSION())) {
+ return false;
+ }
+
+ return true;
+}
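+
+// For instance, min/max statistics for a BYTE_ARRAY column with UNSIGNED sort
+// order written by "parquet-mr version 1.9.0" are rejected above (unless min
+// equals max), while the same statistics written by 1.10.0 or later pass.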
+
+// MetaData Builders
+// column chunk metadata
+class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
+ public:
+ explicit ColumnChunkMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column)
+ : owned_column_chunk_(new format::ColumnChunk),
+ properties_(std::move(props)),
+ column_(column) {
+ Init(owned_column_chunk_.get());
+ }
+
+ explicit ColumnChunkMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column,
+ format::ColumnChunk* column_chunk)
+ : properties_(std::move(props)), column_(column) {
+ Init(column_chunk);
+ }
+
+ const void* contents() const { return column_chunk_; }
+
+ // column chunk
+ void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); }
+
+ // column metadata
+ void SetStatistics(const EncodedStatistics& val) {
+ column_chunk_->meta_data.__set_statistics(ToThrift(val));
+ }
+
+ void Finish(int64_t num_values, int64_t dictionary_page_offset,
+ int64_t index_page_offset, int64_t data_page_offset,
+ int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
+ bool dictionary_fallback,
+ const std::map<Encoding::type, int32_t>& dict_encoding_stats,
+ const std::map<Encoding::type, int32_t>& data_encoding_stats,
+ const std::shared_ptr<Encryptor>& encryptor) {
+ if (dictionary_page_offset > 0) {
+ column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
+ column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
+ } else {
+ column_chunk_->__set_file_offset(data_page_offset + compressed_size);
+ }
+ column_chunk_->__isset.meta_data = true;
+ column_chunk_->meta_data.__set_num_values(num_values);
+ if (index_page_offset >= 0) {
+ column_chunk_->meta_data.__set_index_page_offset(index_page_offset);
+ }
+ column_chunk_->meta_data.__set_data_page_offset(data_page_offset);
+ column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size);
+ column_chunk_->meta_data.__set_total_compressed_size(compressed_size);
+
+ std::vector<format::Encoding::type> thrift_encodings;
+ if (has_dictionary) {
+ thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding()));
+ if (properties_->version() == ParquetVersion::PARQUET_1_0) {
+ thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
+ } else {
+ thrift_encodings.push_back(ToThrift(properties_->dictionary_page_encoding()));
+ }
+ } else { // Dictionary not enabled
+ thrift_encodings.push_back(ToThrift(properties_->encoding(column_->path())));
+ }
+ thrift_encodings.push_back(ToThrift(Encoding::RLE));
+ // Only PLAIN encoding is supported for fallback in V1
+ // TODO(majetideepak): Use user specified encoding for V2
+ if (dictionary_fallback) {
+ thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
+ }
+ column_chunk_->meta_data.__set_encodings(thrift_encodings);
+ std::vector<format::PageEncodingStats> thrift_encoding_stats;
+ // Add dictionary page encoding stats
+ for (const auto& entry : dict_encoding_stats) {
+ format::PageEncodingStats dict_enc_stat;
+ dict_enc_stat.__set_page_type(format::PageType::DICTIONARY_PAGE);
+ dict_enc_stat.__set_encoding(ToThrift(entry.first));
+ dict_enc_stat.__set_count(entry.second);
+ thrift_encoding_stats.push_back(dict_enc_stat);
+ }
+ // Add data page encoding stats
+ for (const auto& entry : data_encoding_stats) {
+ format::PageEncodingStats data_enc_stat;
+ data_enc_stat.__set_page_type(format::PageType::DATA_PAGE);
+ data_enc_stat.__set_encoding(ToThrift(entry.first));
+ data_enc_stat.__set_count(entry.second);
+ thrift_encoding_stats.push_back(data_enc_stat);
+ }
+ column_chunk_->meta_data.__set_encoding_stats(thrift_encoding_stats);
+
+ const auto& encrypt_md =
+ properties_->column_encryption_properties(column_->path()->ToDotString());
+ // column is encrypted
+ if (encrypt_md != nullptr && encrypt_md->is_encrypted()) {
+ column_chunk_->__isset.crypto_metadata = true;
+ format::ColumnCryptoMetaData ccmd;
+ if (encrypt_md->is_encrypted_with_footer_key()) {
+ // encrypted with footer key
+ ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
+ ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey());
+ } else { // encrypted with column key
+ format::EncryptionWithColumnKey eck;
+ eck.__set_key_metadata(encrypt_md->key_metadata());
+ eck.__set_path_in_schema(column_->path()->ToDotVector());
+ ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
+ ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck);
+ }
+ column_chunk_->__set_crypto_metadata(ccmd);
+
+ bool encrypted_footer =
+ properties_->file_encryption_properties()->encrypted_footer();
+ bool encrypt_metadata =
+ !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key();
+ if (encrypt_metadata) {
+ ThriftSerializer serializer;
+        // Serialize and encrypt the ColumnMetaData separately: Thrift-serialize
+        // the structure, encrypt it with the column key, and store the result
+        // in encrypted_column_metadata
+ uint8_t* serialized_data;
+ uint32_t serialized_len;
+
+ serializer.SerializeToBuffer(&column_chunk_->meta_data, &serialized_len,
+ &serialized_data);
+
+ std::vector<uint8_t> encrypted_data(encryptor->CiphertextSizeDelta() +
+ serialized_len);
+ unsigned encrypted_len =
+ encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data());
+
+        const char* temp = reinterpret_cast<const char*>(encrypted_data.data());
+ std::string encrypted_column_metadata(temp, encrypted_len);
+ column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata);
+
+ if (encrypted_footer) {
+ column_chunk_->__isset.meta_data = false;
+ } else {
+ // Keep redacted metadata version for old readers
+ column_chunk_->__isset.meta_data = true;
+ column_chunk_->meta_data.__isset.statistics = false;
+ column_chunk_->meta_data.__isset.encoding_stats = false;
+ }
+ }
+ }
+ }
+
+ void WriteTo(::arrow::io::OutputStream* sink) {
+ ThriftSerializer serializer;
+ serializer.Serialize(column_chunk_, sink);
+ }
+
+ const ColumnDescriptor* descr() const { return column_; }
+ int64_t total_compressed_size() const {
+ return column_chunk_->meta_data.total_compressed_size;
+ }
+
+ private:
+ void Init(format::ColumnChunk* column_chunk) {
+ column_chunk_ = column_chunk;
+
+ column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type()));
+ column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector());
+ column_chunk_->meta_data.__set_codec(
+ ToThrift(properties_->compression(column_->path())));
+ }
+
+ format::ColumnChunk* column_chunk_;
+ std::unique_ptr<format::ColumnChunk> owned_column_chunk_;
+ const std::shared_ptr<WriterProperties> properties_;
+ const ColumnDescriptor* column_;
+};
+
+std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
+ void* contents) {
+ return std::unique_ptr<ColumnChunkMetaDataBuilder>(
+ new ColumnChunkMetaDataBuilder(std::move(props), column, contents));
+}
+
+std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column) {
+ return std::unique_ptr<ColumnChunkMetaDataBuilder>(
+ new ColumnChunkMetaDataBuilder(std::move(props), column));
+}
+
+ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column)
+ : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
+ new ColumnChunkMetaDataBuilderImpl(std::move(props), column))} {}
+
+ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
+ void* contents)
+ : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
+ new ColumnChunkMetaDataBuilderImpl(
+ std::move(props), column,
+ reinterpret_cast<format::ColumnChunk*>(contents)))} {}
+
+ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() = default;
+
+const void* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); }
+
+void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) {
+ impl_->set_file_path(path);
+}
+
+void ColumnChunkMetaDataBuilder::Finish(
+ int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset,
+ int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size,
+ bool has_dictionary, bool dictionary_fallback,
+ const std::map<Encoding::type, int32_t>& dict_encoding_stats,
+ const std::map<Encoding::type, int32_t>& data_encoding_stats,
+ const std::shared_ptr<Encryptor>& encryptor) {
+ impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset,
+ compressed_size, uncompressed_size, has_dictionary, dictionary_fallback,
+ dict_encoding_stats, data_encoding_stats, encryptor);
+}
+
+void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) {
+ impl_->WriteTo(sink);
+}
+
+const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const {
+ return impl_->descr();
+}
+
+void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) {
+ impl_->SetStatistics(result);
+}
+
+int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const {
+ return impl_->total_compressed_size();
+}
+
+class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
+ public:
+ explicit RowGroupMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
+ const SchemaDescriptor* schema, void* contents)
+ : properties_(std::move(props)), schema_(schema), next_column_(0) {
+ row_group_ = reinterpret_cast<format::RowGroup*>(contents);
+ InitializeColumns(schema->num_columns());
+ }
+
+ ColumnChunkMetaDataBuilder* NextColumnChunk() {
+ if (!(next_column_ < num_columns())) {
+ std::stringstream ss;
+ ss << "The schema only has " << num_columns()
+ << " columns, requested metadata for column: " << next_column_;
+ throw ParquetException(ss.str());
+ }
+ auto column = schema_->Column(next_column_);
+ auto column_builder = ColumnChunkMetaDataBuilder::Make(
+ properties_, column, &row_group_->columns[next_column_++]);
+ auto column_builder_ptr = column_builder.get();
+ column_builders_.push_back(std::move(column_builder));
+ return column_builder_ptr;
+ }
+
+ int current_column() { return next_column_ - 1; }
+
+ void Finish(int64_t total_bytes_written, int16_t row_group_ordinal) {
+ if (!(next_column_ == schema_->num_columns())) {
+ std::stringstream ss;
+ ss << "Only " << next_column_ - 1 << " out of " << schema_->num_columns()
+ << " columns are initialized";
+ throw ParquetException(ss.str());
+ }
+
+ int64_t file_offset = 0;
+ int64_t total_compressed_size = 0;
+ for (int i = 0; i < schema_->num_columns(); i++) {
+ if (!(row_group_->columns[i].file_offset >= 0)) {
+ std::stringstream ss;
+ ss << "Column " << i << " is not complete.";
+ throw ParquetException(ss.str());
+ }
+ if (i == 0) {
+ file_offset = row_group_->columns[0].file_offset;
+ }
+ // sometimes column metadata is encrypted and not available to read,
+ // so we must get total_compressed_size from column builder
+ total_compressed_size += column_builders_[i]->total_compressed_size();
+ }
+
+ row_group_->__set_file_offset(file_offset);
+ row_group_->__set_total_compressed_size(total_compressed_size);
+ row_group_->__set_total_byte_size(total_bytes_written);
+ row_group_->__set_ordinal(row_group_ordinal);
+ }
+
+ void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; }
+
+ int num_columns() { return static_cast<int>(row_group_->columns.size()); }
+
+ int64_t num_rows() { return row_group_->num_rows; }
+
+ private:
+ void InitializeColumns(int ncols) { row_group_->columns.resize(ncols); }
+
+ format::RowGroup* row_group_;
+ const std::shared_ptr<WriterProperties> properties_;
+ const SchemaDescriptor* schema_;
+ std::vector<std::unique_ptr<ColumnChunkMetaDataBuilder>> column_builders_;
+ int next_column_;
+};
+
+std::unique_ptr<RowGroupMetaDataBuilder> RowGroupMetaDataBuilder::Make(
+ std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
+ void* contents) {
+ return std::unique_ptr<RowGroupMetaDataBuilder>(
+ new RowGroupMetaDataBuilder(std::move(props), schema_, contents));
+}
+
+RowGroupMetaDataBuilder::RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const SchemaDescriptor* schema_,
+ void* contents)
+ : impl_{new RowGroupMetaDataBuilderImpl(std::move(props), schema_, contents)} {}
+
+RowGroupMetaDataBuilder::~RowGroupMetaDataBuilder() = default;
+
+ColumnChunkMetaDataBuilder* RowGroupMetaDataBuilder::NextColumnChunk() {
+ return impl_->NextColumnChunk();
+}
+
+int RowGroupMetaDataBuilder::current_column() const { return impl_->current_column(); }
+
+int RowGroupMetaDataBuilder::num_columns() { return impl_->num_columns(); }
+
+int64_t RowGroupMetaDataBuilder::num_rows() { return impl_->num_rows(); }
+
+void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) {
+ impl_->set_num_rows(num_rows);
+}
+
+void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written,
+ int16_t row_group_ordinal) {
+ impl_->Finish(total_bytes_written, row_group_ordinal);
+}
+
+// file metadata
+// TODO(PARQUET-595) Support key_value_metadata
+class FileMetaDataBuilder::FileMetaDataBuilderImpl {
+ public:
+ explicit FileMetaDataBuilderImpl(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : metadata_(new format::FileMetaData()),
+ properties_(std::move(props)),
+ schema_(schema),
+ key_value_metadata_(std::move(key_value_metadata)) {
+ if (properties_->file_encryption_properties() != nullptr &&
+ properties_->file_encryption_properties()->encrypted_footer()) {
+ crypto_metadata_.reset(new format::FileCryptoMetaData());
+ }
+ }
+
+ RowGroupMetaDataBuilder* AppendRowGroup() {
+ row_groups_.emplace_back();
+ current_row_group_builder_ =
+ RowGroupMetaDataBuilder::Make(properties_, schema_, &row_groups_.back());
+ return current_row_group_builder_.get();
+ }
+
+ std::unique_ptr<FileMetaData> Finish() {
+ int64_t total_rows = 0;
+    for (const auto& row_group : row_groups_) {
+ total_rows += row_group.num_rows;
+ }
+ metadata_->__set_num_rows(total_rows);
+ metadata_->__set_row_groups(row_groups_);
+
+ if (key_value_metadata_) {
+ metadata_->key_value_metadata.clear();
+ metadata_->key_value_metadata.reserve(key_value_metadata_->size());
+ for (int64_t i = 0; i < key_value_metadata_->size(); ++i) {
+ format::KeyValue kv_pair;
+ kv_pair.__set_key(key_value_metadata_->key(i));
+ kv_pair.__set_value(key_value_metadata_->value(i));
+ metadata_->key_value_metadata.push_back(kv_pair);
+ }
+ metadata_->__isset.key_value_metadata = true;
+ }
+
+ int32_t file_version = 0;
+ switch (properties_->version()) {
+ case ParquetVersion::PARQUET_1_0:
+ file_version = 1;
+ break;
+ case ParquetVersion::PARQUET_2_0:
+ file_version = 2;
+ break;
+ default:
+ break;
+ }
+ metadata_->__set_version(file_version);
+ metadata_->__set_created_by(properties_->created_by());
+
+    // Users cannot set the `ColumnOrder` since we do not have user-defined sort
+    // orders in the spec yet.
+    // We always default to `TYPE_DEFINED_ORDER`. We can expose it in
+    // the API once we have user-defined sort orders in the Parquet format.
+    // TypeDefinedOrder means the SortOrder is derived from the
+    // ConvertedType/PhysicalType
+ format::TypeDefinedOrder type_defined_order;
+ format::ColumnOrder column_order;
+ column_order.__set_TYPE_ORDER(type_defined_order);
+ column_order.__isset.TYPE_ORDER = true;
+ metadata_->column_orders.resize(schema_->num_columns(), column_order);
+ metadata_->__isset.column_orders = true;
+
+ // if plaintext footer, set footer signing algorithm
+ auto file_encryption_properties = properties_->file_encryption_properties();
+ if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) {
+ EncryptionAlgorithm signing_algorithm;
+ EncryptionAlgorithm algo = file_encryption_properties->algorithm();
+ signing_algorithm.aad.aad_file_unique = algo.aad.aad_file_unique;
+ signing_algorithm.aad.supply_aad_prefix = algo.aad.supply_aad_prefix;
+ if (!algo.aad.supply_aad_prefix) {
+ signing_algorithm.aad.aad_prefix = algo.aad.aad_prefix;
+ }
+ signing_algorithm.algorithm = ParquetCipher::AES_GCM_V1;
+
+ metadata_->__set_encryption_algorithm(ToThrift(signing_algorithm));
+ const std::string& footer_signing_key_metadata =
+ file_encryption_properties->footer_key_metadata();
+ if (footer_signing_key_metadata.size() > 0) {
+ metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata);
+ }
+ }
+
+ ToParquet(static_cast<parquet::schema::GroupNode*>(schema_->schema_root().get()),
+ &metadata_->schema);
+ auto file_meta_data = std::unique_ptr<FileMetaData>(new FileMetaData());
+ file_meta_data->impl_->metadata_ = std::move(metadata_);
+ file_meta_data->impl_->InitSchema();
+ file_meta_data->impl_->InitKeyValueMetadata();
+ return file_meta_data;
+ }
+
+ std::unique_ptr<FileCryptoMetaData> BuildFileCryptoMetaData() {
+ if (crypto_metadata_ == nullptr) {
+ return nullptr;
+ }
+
+ auto file_encryption_properties = properties_->file_encryption_properties();
+
+ crypto_metadata_->__set_encryption_algorithm(
+ ToThrift(file_encryption_properties->algorithm()));
+ std::string key_metadata = file_encryption_properties->footer_key_metadata();
+
+ if (!key_metadata.empty()) {
+ crypto_metadata_->__set_key_metadata(key_metadata);
+ }
+
+ std::unique_ptr<FileCryptoMetaData> file_crypto_metadata =
+ std::unique_ptr<FileCryptoMetaData>(new FileCryptoMetaData());
+ file_crypto_metadata->impl_->metadata_ = std::move(crypto_metadata_);
+
+ return file_crypto_metadata;
+ }
+
+ protected:
+ std::unique_ptr<format::FileMetaData> metadata_;
+ std::unique_ptr<format::FileCryptoMetaData> crypto_metadata_;
+
+ private:
+ const std::shared_ptr<WriterProperties> properties_;
+ std::vector<format::RowGroup> row_groups_;
+
+ std::unique_ptr<RowGroupMetaDataBuilder> current_row_group_builder_;
+ const SchemaDescriptor* schema_;
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
+};
+
+std::unique_ptr<FileMetaDataBuilder> FileMetaDataBuilder::Make(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
+ return std::unique_ptr<FileMetaDataBuilder>(
+ new FileMetaDataBuilder(schema, std::move(props), std::move(key_value_metadata)));
+}
+
+FileMetaDataBuilder::FileMetaDataBuilder(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : impl_{std::unique_ptr<FileMetaDataBuilderImpl>(new FileMetaDataBuilderImpl(
+ schema, std::move(props), std::move(key_value_metadata)))} {}
+
+FileMetaDataBuilder::~FileMetaDataBuilder() = default;
+
+RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() {
+ return impl_->AppendRowGroup();
+}
+
+std::unique_ptr<FileMetaData> FileMetaDataBuilder::Finish() { return impl_->Finish(); }
+
+std::unique_ptr<FileCryptoMetaData> FileMetaDataBuilder::GetCryptoMetaData() {
+ return impl_->BuildFileCryptoMetaData();
+}
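+
+// Typical write-path usage (illustrative sketch; the values are placeholders):
+//
+//   auto builder = FileMetaDataBuilder::Make(&schema, writer_properties);
+//   RowGroupMetaDataBuilder* rg_builder = builder->AppendRowGroup();
+//   // ... obtain builders via rg_builder->NextColumnChunk() and Finish() each ...
+//   rg_builder->set_num_rows(num_rows);
+//   rg_builder->Finish(total_bytes_written, /*row_group_ordinal=*/0);
+//   std::unique_ptr<FileMetaData> file_metadata = builder->Finish();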
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h
new file mode 100644
index 00000000000..1865115e423
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h
@@ -0,0 +1,484 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class ColumnDescriptor;
+class EncodedStatistics;
+class Statistics;
+class SchemaDescriptor;
+
+class FileCryptoMetaData;
+class InternalFileDecryptor;
+class Decryptor;
+class Encryptor;
+class FooterSigningEncryptor;
+
+namespace schema {
+
+class ColumnPath;
+
+} // namespace schema
+
+using KeyValueMetadata = ::arrow::KeyValueMetadata;
+
+class PARQUET_EXPORT ApplicationVersion {
+ public:
+ // Known Versions with Issues
+ static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
+ static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
+ static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
+ static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
+
+ // Application that wrote the file. e.g. "IMPALA"
+ std::string application_;
+ // Build name
+ std::string build_;
+
+ // Version of the application that wrote the file, expressed as
+ // (<major>.<minor>.<patch>). Unmatched parts default to 0.
+ // "1.2.3" => {1, 2, 3}
+ // "1.2" => {1, 2, 0}
+ // "1.2-cdh5" => {1, 2, 0}
+ struct {
+ int major;
+ int minor;
+ int patch;
+ std::string unknown;
+ std::string pre_release;
+ std::string build_info;
+ } version;
+
+ ApplicationVersion() = default;
+ explicit ApplicationVersion(const std::string& created_by);
+ ApplicationVersion(std::string application, int major, int minor, int patch);
+
+ // Returns true if version is strictly less than other_version
+ bool VersionLt(const ApplicationVersion& other_version) const;
+
+  // Returns true if version is equal to other_version
+ bool VersionEq(const ApplicationVersion& other_version) const;
+
+ // Checks if the Version has the correct statistics for a given column
+ bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
+ SortOrder::type sort_order = SortOrder::SIGNED) const;
+};
+
+class PARQUET_EXPORT ColumnCryptoMetaData {
+ public:
+ static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
+ ~ColumnCryptoMetaData();
+
+ bool Equals(const ColumnCryptoMetaData& other) const;
+
+ std::shared_ptr<schema::ColumnPath> path_in_schema() const;
+ bool encrypted_with_footer_key() const;
+ const std::string& key_metadata() const;
+
+ private:
+ explicit ColumnCryptoMetaData(const uint8_t* metadata);
+
+ class ColumnCryptoMetaDataImpl;
+ std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
+};
+
+/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
+struct PageEncodingStats {
+ PageType::type page_type;
+ Encoding::type encoding;
+ int32_t count;
+};
+
+/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
+class PARQUET_EXPORT ColumnChunkMetaData {
+ public:
+ // API convenience to get a MetaData accessor
+ static std::unique_ptr<ColumnChunkMetaData> Make(
+ const void* metadata, const ColumnDescriptor* descr,
+ const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
+ int16_t column_ordinal = -1,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ ~ColumnChunkMetaData();
+
+ bool Equals(const ColumnChunkMetaData& other) const;
+
+ // column chunk
+ int64_t file_offset() const;
+
+  // only set when a dataset is spread across multiple files
+ const std::string& file_path() const;
+
+ // column metadata
+ bool is_metadata_set() const;
+ Type::type type() const;
+ int64_t num_values() const;
+ std::shared_ptr<schema::ColumnPath> path_in_schema() const;
+ bool is_stats_set() const;
+ std::shared_ptr<Statistics> statistics() const;
+
+ Compression::type compression() const;
+  // Indicates whether the ColumnChunk compression is supported by the
+  // currently compiled parquet library.
+ bool can_decompress() const;
+
+ const std::vector<Encoding::type>& encodings() const;
+ const std::vector<PageEncodingStats>& encoding_stats() const;
+ bool has_dictionary_page() const;
+ int64_t dictionary_page_offset() const;
+ int64_t data_page_offset() const;
+ bool has_index_page() const;
+ int64_t index_page_offset() const;
+ int64_t total_compressed_size() const;
+ int64_t total_uncompressed_size() const;
+ std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
+
+ private:
+ explicit ColumnChunkMetaData(
+ const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
+ int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+ // PIMPL Idiom
+ class ColumnChunkMetaDataImpl;
+ std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
+};
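
For example, callers can combine the accessors above to guard against unsupported codecs before attempting a read; a small hypothetical helper:

    #include "parquet/metadata.h"

    // Sketch: true when the chunk's metadata is present and its codec is
    // compiled into this build of the library.
    bool ChunkReadable(const parquet::ColumnChunkMetaData& chunk) {
      return chunk.is_metadata_set() && chunk.can_decompress();
    }
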
+
+/// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
+class PARQUET_EXPORT RowGroupMetaData {
+ public:
+ /// \brief Create a RowGroupMetaData from a serialized thrift message.
+ static std::unique_ptr<RowGroupMetaData> Make(
+ const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version = NULLPTR,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ ~RowGroupMetaData();
+
+ bool Equals(const RowGroupMetaData& other) const;
+
+ /// \brief The number of columns in this row group. The order must match the
+ /// parent's column ordering.
+ int num_columns() const;
+
+ /// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
+ ///
+  /// WARNING: the returned object references a memory location in its parent
+  /// (RowGroupMetaData) object. Hence, the parent must outlive the returned
+  /// object.
+ ///
+  /// \param[in] index Index of the ColumnChunkMetaData to retrieve.
+  ///
+  /// \throws ParquetException if the index is out of bounds.
+ std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
+
+ /// \brief Number of rows in this row group.
+ int64_t num_rows() const;
+
+ /// \brief Total byte size of all the uncompressed column data in this row group.
+ int64_t total_byte_size() const;
+
+ /// \brief Total byte size of all the compressed (and potentially encrypted)
+ /// column data in this row group.
+ ///
+ /// This information is optional and may be 0 if omitted.
+ int64_t total_compressed_size() const;
+
+ /// \brief Byte offset from beginning of file to first page (data or
+ /// dictionary) in this row group
+ ///
+ /// The file_offset field that this method exposes is optional. This method
+ /// will return 0 if that field is not set to a meaningful value.
+ int64_t file_offset() const;
+ // Return const-pointer to make it clear that this object is not to be copied
+ const SchemaDescriptor* schema() const;
+ // Indicate if all of the RowGroup's ColumnChunks can be decompressed.
+ bool can_decompress() const;
+
+ private:
+ explicit RowGroupMetaData(
+ const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version = NULLPTR,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+ // PIMPL Idiom
+ class RowGroupMetaDataImpl;
+ std::unique_ptr<RowGroupMetaDataImpl> impl_;
+};
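
The ownership warning on ColumnChunk() matters in practice: the RowGroupMetaData must stay alive while the returned ColumnChunkMetaData is used. A hypothetical helper illustrating the safe pattern:

    #include <cstdint>
    #include <memory>
    #include "parquet/metadata.h"

    // Sketch: `rg` outlives `chunk`, so the reference into the parent stays
    // valid for the duration of the call.
    int64_t FirstChunkValueCount(const parquet::RowGroupMetaData& rg) {
      std::unique_ptr<parquet::ColumnChunkMetaData> chunk = rg.ColumnChunk(0);
      return chunk->num_values();
    }
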
+
+class FileMetaDataBuilder;
+
+/// \brief FileMetaData is a proxy around format::FileMetaData.
+class PARQUET_EXPORT FileMetaData {
+ public:
+ /// \brief Create a FileMetaData from a serialized thrift message.
+ static std::shared_ptr<FileMetaData> Make(
+ const void* serialized_metadata, uint32_t* inout_metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ ~FileMetaData();
+
+ bool Equals(const FileMetaData& other) const;
+
+ /// \brief The number of top-level columns in the schema.
+ ///
+ /// Parquet thrift definition requires that nested schema elements are
+ /// flattened. This method returns the number of columns in the un-flattened
+ /// version.
+ int num_columns() const;
+
+ /// \brief The number of flattened schema elements.
+ ///
+ /// Parquet thrift definition requires that nested schema elements are
+ /// flattened. This method returns the total number of elements in the
+ /// flattened list.
+ int num_schema_elements() const;
+
+ /// \brief The total number of rows.
+ int64_t num_rows() const;
+
+ /// \brief The number of row groups in the file.
+ int num_row_groups() const;
+
+ /// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
+ ///
+  /// WARNING: the returned object references a memory location in its parent
+  /// (FileMetaData) object. Hence, the parent must outlive the returned object.
+ ///
+  /// \param[in] index Index of the RowGroup to retrieve.
+  ///
+  /// \throws ParquetException if the index is out of bounds.
+ std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
+
+ /// \brief Return the version of the file.
+ ParquetVersion::type version() const;
+
+ /// \brief Return the application's user-agent string of the writer.
+ const std::string& created_by() const;
+
+ /// \brief Return the application's version of the writer.
+ const ApplicationVersion& writer_version() const;
+
+ /// \brief Size of the original thrift encoded metadata footer.
+ uint32_t size() const;
+
+  /// \brief Indicates whether all of the FileMetaData's RowGroups can be decompressed.
+  ///
+  /// This will return false if any page in any RowGroup is compressed with a
+  /// compression format that is not compiled into the current parquet library.
+ bool can_decompress() const;
+
+ bool is_encryption_algorithm_set() const;
+ EncryptionAlgorithm encryption_algorithm() const;
+ const std::string& footer_signing_key_metadata() const;
+
+ /// \brief Verify signature of FileMetaData when file is encrypted but footer
+ /// is not encrypted (plaintext footer).
+ bool VerifySignature(const void* signature);
+
+ void WriteTo(::arrow::io::OutputStream* dst,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;
+
+ /// \brief Return Thrift-serialized representation of the metadata as a
+ /// string
+ std::string SerializeToString() const;
+
+ // Return const-pointer to make it clear that this object is not to be copied
+ const SchemaDescriptor* schema() const;
+
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
+
+  /// \brief Set the file path for all ColumnChunks in all RowGroups.
+  ///
+  /// Commonly used by systems (Dask, Spark) that generate a metadata-only
+  /// parquet file. The path is usually relative to said index file.
+  ///
+  /// \param[in] path The file path to set.
+ void set_file_path(const std::string& path);
+
+ /// \brief Merge row groups from another metadata file into this one.
+ ///
+ /// The schema of the input FileMetaData must be equal to the
+ /// schema of this object.
+ ///
+  /// This is used by systems that create an aggregate metadata-only file by
+  /// concatenating the row groups of multiple files. The newly created
+  /// metadata file acts as an index of all available row groups.
+ ///
+ /// \param[in] other FileMetaData to merge the row groups from.
+ ///
+ /// \throws ParquetException if schemas are not equal.
+ void AppendRowGroups(const FileMetaData& other);
+
+ /// \brief Return a FileMetaData containing a subset of the row groups in this
+ /// FileMetaData.
+ std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
+
+ private:
+ friend FileMetaDataBuilder;
+ friend class SerializedFile;
+
+ explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
+
+ // PIMPL Idiom
+ FileMetaData();
+ class FileMetaDataImpl;
+ std::unique_ptr<FileMetaDataImpl> impl_;
+};
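
AppendRowGroups() and set_file_path() are the building blocks for metadata-only index files; a hedged sketch of folding one already-loaded footer into another (the function and path names are hypothetical):

    #include <string>
    #include "parquet/metadata.h"

    // Sketch: add `other`'s row groups to `index`, tagging each source with
    // its file path first. Throws ParquetException if the schemas differ.
    void AddToIndex(parquet::FileMetaData* index, parquet::FileMetaData* other,
                    const std::string& other_path) {
      other->set_file_path(other_path);  // usually relative to the index file
      index->AppendRowGroups(*other);
    }
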
+
+class PARQUET_EXPORT FileCryptoMetaData {
+ public:
+ // API convenience to get a MetaData accessor
+ static std::shared_ptr<FileCryptoMetaData> Make(const uint8_t* serialized_metadata,
+ uint32_t* metadata_len);
+ ~FileCryptoMetaData();
+
+ EncryptionAlgorithm encryption_algorithm() const;
+ const std::string& key_metadata() const;
+
+ void WriteTo(::arrow::io::OutputStream* dst) const;
+
+ private:
+ friend FileMetaDataBuilder;
+ FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len);
+
+ // PIMPL Idiom
+ FileCryptoMetaData();
+ class FileCryptoMetaDataImpl;
+ std::unique_ptr<FileCryptoMetaDataImpl> impl_;
+};
+
+// Builder API
+class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
+ public:
+  // API convenience to create a metadata builder
+ static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);
+
+ static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
+ void* contents);
+
+ ~ColumnChunkMetaDataBuilder();
+
+ // column chunk
+ // Used when a dataset is spread across multiple files
+ void set_file_path(const std::string& path);
+ // column metadata
+ void SetStatistics(const EncodedStatistics& stats);
+ // get the column descriptor
+ const ColumnDescriptor* descr() const;
+
+ int64_t total_compressed_size() const;
+ // commit the metadata
+
+ void Finish(int64_t num_values, int64_t dictionary_page_offset,
+ int64_t index_page_offset, int64_t data_page_offset,
+ int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
+ bool dictionary_fallback,
+ const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
+ const std::map<Encoding::type, int32_t>& data_encoding_stats_,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
+
+ // The metadata contents, suitable for passing to ColumnChunkMetaData::Make
+ const void* contents() const;
+
+ // For writing metadata at end of column chunk
+ void WriteTo(::arrow::io::OutputStream* sink);
+
+ private:
+ explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column);
+ explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column, void* contents);
+ // PIMPL Idiom
+ class ColumnChunkMetaDataBuilderImpl;
+ std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
+};
+
+class PARQUET_EXPORT RowGroupMetaDataBuilder {
+ public:
+  // API convenience to create a metadata builder
+ static std::unique_ptr<RowGroupMetaDataBuilder> Make(
+ std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
+ void* contents);
+
+ ~RowGroupMetaDataBuilder();
+
+ ColumnChunkMetaDataBuilder* NextColumnChunk();
+ int num_columns();
+ int64_t num_rows();
+ int current_column() const;
+
+ void set_num_rows(int64_t num_rows);
+
+ // commit the metadata
+ void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);
+
+ private:
+ explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const SchemaDescriptor* schema_, void* contents);
+ // PIMPL Idiom
+ class RowGroupMetaDataBuilderImpl;
+ std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
+};
+
+class PARQUET_EXPORT FileMetaDataBuilder {
+ public:
+  // API convenience to create a metadata builder
+ static std::unique_ptr<FileMetaDataBuilder> Make(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
+
+ ~FileMetaDataBuilder();
+
+ // The prior RowGroupMetaDataBuilder (if any) is destroyed
+ RowGroupMetaDataBuilder* AppendRowGroup();
+
+ // Complete the Thrift structure
+ std::unique_ptr<FileMetaData> Finish();
+
+ // crypto metadata
+ std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();
+
+ private:
+ explicit FileMetaDataBuilder(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
+ // PIMPL Idiom
+ class FileMetaDataBuilderImpl;
+ std::unique_ptr<FileMetaDataBuilderImpl> impl_;
+};
+
+PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
+
+} // namespace parquet
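
The builder classes above are normally driven internally by the Parquet file writer rather than by applications, but the intended flow can be sketched. Whether Finish() is valid with an empty row group, as done here, is an assumption of this sketch:

    #include <memory>
    #include "parquet/metadata.h"
    #include "parquet/properties.h"

    // Sketch of the metadata builder flow (normally performed by the file
    // writer itself): append a row group, finalize it, then finalize the file.
    std::unique_ptr<parquet::FileMetaData> SketchMetadata(
        const parquet::SchemaDescriptor* schema) {
      std::shared_ptr<parquet::WriterProperties> props =
          parquet::WriterProperties::Builder().build();
      auto file_builder = parquet::FileMetaDataBuilder::Make(schema, props);
      parquet::RowGroupMetaDataBuilder* rg = file_builder->AppendRowGroup();
      rg->set_num_rows(0);
      rg->Finish(/*total_bytes_written=*/0);
      return file_builder->Finish();
    }
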
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc
new file mode 100644
index 00000000000..07a936e0412
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc
@@ -0,0 +1,222 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "parquet/murmur3.h"
+
+namespace parquet {
+
+#if defined(_MSC_VER)
+
+#define FORCE_INLINE __forceinline
+#define ROTL64(x, y) _rotl64(x, y)
+
+#else // defined(_MSC_VER)
+
+#define FORCE_INLINE inline __attribute__((always_inline))
+inline uint64_t rotl64(uint64_t x, int8_t r) { return (x << r) | (x >> (64 - r)); }
+#define ROTL64(x, y) rotl64(x, y)
+
+#endif // !defined(_MSC_VER)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+FORCE_INLINE uint32_t getblock32(const uint32_t* p, int i) { return p[i]; }
+
+FORCE_INLINE uint64_t getblock64(const uint64_t* p, int i) { return p[i]; }
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint32_t fmix32(uint32_t h) {
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+
+ return h;
+}
+
+//----------
+
+FORCE_INLINE uint64_t fmix64(uint64_t k) {
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+ k ^= k >> 33;
+
+ return k;
+}
+
+//-----------------------------------------------------------------------------
+
+void Hash_x64_128(const void* key, const int len, const uint32_t seed, uint64_t out[2]) {
+ const uint8_t* data = (const uint8_t*)key;
+ const int nblocks = len / 16;
+
+ uint64_t h1 = seed;
+ uint64_t h2 = seed;
+
+ const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+ const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+ //----------
+ // body
+
+ const uint64_t* blocks = (const uint64_t*)(data);
+
+ for (int i = 0; i < nblocks; i++) {
+ uint64_t k1 = getblock64(blocks, i * 2 + 0);
+ uint64_t k2 = getblock64(blocks, i * 2 + 1);
+
+ k1 *= c1;
+ k1 = ROTL64(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+
+ h1 = ROTL64(h1, 27);
+ h1 += h2;
+ h1 = h1 * 5 + 0x52dce729;
+
+ k2 *= c2;
+ k2 = ROTL64(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ h2 = ROTL64(h2, 31);
+ h2 += h1;
+ h2 = h2 * 5 + 0x38495ab5;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
+
+ uint64_t k1 = 0;
+ uint64_t k2 = 0;
+
+ switch (len & 15) {
+ case 15:
+ k2 ^= ((uint64_t)tail[14]) << 48; // fall through
+ case 14:
+ k2 ^= ((uint64_t)tail[13]) << 40; // fall through
+ case 13:
+ k2 ^= ((uint64_t)tail[12]) << 32; // fall through
+ case 12:
+ k2 ^= ((uint64_t)tail[11]) << 24; // fall through
+ case 11:
+ k2 ^= ((uint64_t)tail[10]) << 16; // fall through
+ case 10:
+ k2 ^= ((uint64_t)tail[9]) << 8; // fall through
+ case 9:
+ k2 ^= ((uint64_t)tail[8]) << 0;
+ k2 *= c2;
+ k2 = ROTL64(k2, 33);
+ k2 *= c1;
+ h2 ^= k2; // fall through
+
+ case 8:
+ k1 ^= ((uint64_t)tail[7]) << 56; // fall through
+ case 7:
+ k1 ^= ((uint64_t)tail[6]) << 48; // fall through
+ case 6:
+ k1 ^= ((uint64_t)tail[5]) << 40; // fall through
+ case 5:
+ k1 ^= ((uint64_t)tail[4]) << 32; // fall through
+ case 4:
+ k1 ^= ((uint64_t)tail[3]) << 24; // fall through
+ case 3:
+ k1 ^= ((uint64_t)tail[2]) << 16; // fall through
+ case 2:
+ k1 ^= ((uint64_t)tail[1]) << 8; // fall through
+ case 1:
+ k1 ^= ((uint64_t)tail[0]) << 0;
+ k1 *= c1;
+ k1 = ROTL64(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+ }
+
+ //----------
+ // finalization
+
+ h1 ^= len;
+ h2 ^= len;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix64(h1);
+ h2 = fmix64(h2);
+
+ h1 += h2;
+ h2 += h1;
+
+ reinterpret_cast<uint64_t*>(out)[0] = h1;
+ reinterpret_cast<uint64_t*>(out)[1] = h2;
+}
+
+template <typename T>
+uint64_t HashHelper(T value, uint32_t seed) {
+ uint64_t output[2];
+ Hash_x64_128(reinterpret_cast<void*>(&value), sizeof(T), seed, output);
+ return output[0];
+}
+
+uint64_t MurmurHash3::Hash(int32_t value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(int64_t value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(float value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(double value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(const FLBA* value, uint32_t len) const {
+ uint64_t out[2];
+ Hash_x64_128(reinterpret_cast<const void*>(value->ptr), len, seed_, out);
+ return out[0];
+}
+
+uint64_t MurmurHash3::Hash(const Int96* value) const {
+ uint64_t out[2];
+ Hash_x64_128(reinterpret_cast<const void*>(value->value), sizeof(value->value), seed_,
+ out);
+ return out[0];
+}
+
+uint64_t MurmurHash3::Hash(const ByteArray* value) const {
+ uint64_t out[2];
+ Hash_x64_128(reinterpret_cast<const void*>(value->ptr), value->len, seed_, out);
+ return out[0];
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h
new file mode 100644
index 00000000000..acf7088e44b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#pragma once
+
+#include <cstdint>
+
+#include "parquet/hasher.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// Source:
+/// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
+/// (Modified to adapt to coding conventions and to inherit the Hasher abstract class)
+class PARQUET_EXPORT MurmurHash3 : public Hasher {
+ public:
+ MurmurHash3() : seed_(DEFAULT_SEED) {}
+ uint64_t Hash(int32_t value) const override;
+ uint64_t Hash(int64_t value) const override;
+ uint64_t Hash(float value) const override;
+ uint64_t Hash(double value) const override;
+ uint64_t Hash(const Int96* value) const override;
+ uint64_t Hash(const ByteArray* value) const override;
+ uint64_t Hash(const FLBA* val, uint32_t len) const override;
+
+ private:
+  // Default seed for the hash, taken from the Bloom filter implementation in
+  // parquet-mr, where it was generated with Java's System.nanoTime().
+ static constexpr int DEFAULT_SEED = 1361930890;
+
+ uint32_t seed_;
+};
+
+} // namespace parquet
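
A minimal usage sketch of the hasher (the seed is fixed by the class, so results are deterministic across runs; the wrapper function is hypothetical):

    #include <cstdint>
    #include "parquet/murmur3.h"

    // Sketch: hash a 64-bit value the way the Bloom filter machinery would.
    uint64_t HashValue(int64_t v) {
      parquet::MurmurHash3 hasher;
      return hasher.Hash(v);
    }
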
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc b/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc
new file mode 100644
index 00000000000..5c355c28be1
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/platform.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "arrow/io/memory.h"
+
+#include "parquet/exception.h"
+
+namespace parquet {
+
+std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(MemoryPool* pool) {
+ PARQUET_ASSIGN_OR_THROW(auto stream, ::arrow::io::BufferOutputStream::Create(
+ kDefaultOutputStreamSize, pool));
+ return stream;
+}
+
+std::shared_ptr<ResizableBuffer> AllocateBuffer(MemoryPool* pool, int64_t size) {
+ PARQUET_ASSIGN_OR_THROW(auto result, ::arrow::AllocateResizableBuffer(size, pool));
+ return std::move(result);
+}
+
+} // namespace parquet
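
Both helpers throw ParquetException on failure via the PARQUET_* macros from parquet/exception.h; a short usage sketch (the function name is hypothetical):

    #include <memory>
    #include "parquet/exception.h"
    #include "parquet/platform.h"

    // Sketch: write into an in-memory sink and detach the resulting buffer.
    std::shared_ptr<parquet::Buffer> RoundTrip() {
      std::shared_ptr<::arrow::io::BufferOutputStream> sink =
          parquet::CreateOutputStream();  // uses the default memory pool
      PARQUET_THROW_NOT_OK(sink->Write("parquet", 7));
      PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish());
      return buffer;
    }
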
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/platform.h b/contrib/libs/apache/arrow/cpp/src/parquet/platform.h
new file mode 100644
index 00000000000..00a193f144a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/platform.h
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/buffer.h" // IWYU pragma: export
+#include "arrow/io/interfaces.h" // IWYU pragma: export
+#include "arrow/status.h" // IWYU pragma: export
+#include "arrow/type_fwd.h" // IWYU pragma: export
+#include "arrow/util/macros.h" // IWYU pragma: export
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// Disable warning for STL types usage in DLL interface
+// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports
+#pragma warning(disable : 4275 4251)
+// Disable diamond inheritance warnings
+#pragma warning(disable : 4250)
+// Disable macro redefinition warnings
+#pragma warning(disable : 4005)
+// Disable extern before exported template warnings
+#pragma warning(disable : 4910)
+#else
+#pragma GCC diagnostic ignored "-Wattributes"
+#endif
+
+#ifdef PARQUET_STATIC
+#define PARQUET_EXPORT
+#elif defined(PARQUET_EXPORTING)
+#define PARQUET_EXPORT __declspec(dllexport)
+#else
+#define PARQUET_EXPORT __declspec(dllimport)
+#endif
+
+#define PARQUET_NO_EXPORT
+
+#else // Not Windows
+#ifndef PARQUET_EXPORT
+#define PARQUET_EXPORT __attribute__((visibility("default")))
+#endif
+#ifndef PARQUET_NO_EXPORT
+#define PARQUET_NO_EXPORT __attribute__((visibility("hidden")))
+#endif
+#endif // Non-Windows
+
+// This is a complicated topic, some reading on it:
+// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/
+#if defined(_MSC_VER) || defined(__clang__)
+#define PARQUET_TEMPLATE_CLASS_EXPORT
+#define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT
+#else
+#define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT
+#define PARQUET_TEMPLATE_EXPORT
+#endif
+
+#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN
+
+#define PARQUET_NORETURN ARROW_NORETURN
+#define PARQUET_DEPRECATED ARROW_DEPRECATED
+
+// If ARROW_VALGRIND set when compiling unit tests, also define
+// PARQUET_VALGRIND
+#ifdef ARROW_VALGRIND
+#define PARQUET_VALGRIND
+#endif
+
+namespace parquet {
+
+using Buffer = ::arrow::Buffer;
+using Codec = ::arrow::util::Codec;
+using Compression = ::arrow::Compression;
+using MemoryPool = ::arrow::MemoryPool;
+using MutableBuffer = ::arrow::MutableBuffer;
+using ResizableBuffer = ::arrow::ResizableBuffer;
+using ArrowInputFile = ::arrow::io::RandomAccessFile;
+using ArrowInputStream = ::arrow::io::InputStream;
+using ArrowOutputStream = ::arrow::io::OutputStream;
+
+constexpr int64_t kDefaultOutputStreamSize = 1024;
+
+constexpr int16_t kNonPageOrdinal = static_cast<int16_t>(-1);
+
+PARQUET_EXPORT
+std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+PARQUET_EXPORT
+std::shared_ptr<ResizableBuffer> AllocateBuffer(
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc
new file mode 100644
index 00000000000..dfd4bd802ee
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc
@@ -0,0 +1,297 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/printer.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/string.h"
+
+#include "parquet/column_scanner.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/schema.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class ColumnReader;
+
+// ----------------------------------------------------------------------
+// ParquetFilePrinter::DebugPrint
+
+// the fixed column width is just an example value
+#define COL_WIDTH 30
+
+void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
+ bool print_values, bool format_dump,
+ bool print_key_value_metadata, const char* filename) {
+ const FileMetaData* file_metadata = fileReader->metadata().get();
+
+ stream << "File Name: " << filename << "\n";
+ stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
+ stream << "Created By: " << file_metadata->created_by() << "\n";
+ stream << "Total rows: " << file_metadata->num_rows() << "\n";
+
+ if (print_key_value_metadata && file_metadata->key_value_metadata()) {
+ auto key_value_metadata = file_metadata->key_value_metadata();
+ int64_t size_of_key_value_metadata = key_value_metadata->size();
+ stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
+ for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
+ stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
+ << key_value_metadata->value(i) << "\n";
+ }
+ }
+
+ stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
+ stream << "Number of Real Columns: "
+ << file_metadata->schema()->group_node()->field_count() << "\n";
+
+ if (selected_columns.size() == 0) {
+ for (int i = 0; i < file_metadata->num_columns(); i++) {
+ selected_columns.push_back(i);
+ }
+ } else {
+ for (auto i : selected_columns) {
+ if (i < 0 || i >= file_metadata->num_columns()) {
+ throw ParquetException("Selected column is out of range");
+ }
+ }
+ }
+
+ stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
+ stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
+ for (auto i : selected_columns) {
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
+ << TypeToString(descr->physical_type());
+ const auto& logical_type = descr->logical_type();
+ if (!logical_type->is_none()) {
+ stream << " / " << logical_type->ToString();
+ }
+ if (descr->converted_type() != ConvertedType::NONE) {
+ stream << " / " << ConvertedTypeToString(descr->converted_type());
+ if (descr->converted_type() == ConvertedType::DECIMAL) {
+ stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
+ }
+ }
+ stream << ")" << std::endl;
+ }
+
+ for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
+ stream << "--- Row Group: " << r << " ---\n";
+
+ auto group_reader = fileReader->RowGroup(r);
+ std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
+
+ stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
+ stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size()
+ << " ---\n";
+ stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
+
+ // Print column metadata
+ for (auto i : selected_columns) {
+ auto column_chunk = group_metadata->ColumnChunk(i);
+ std::shared_ptr<Statistics> stats = column_chunk->statistics();
+
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values();
+ if (column_chunk->is_stats_set()) {
+ std::string min = stats->EncodeMin(), max = stats->EncodeMax();
+ stream << ", Null Values: " << stats->null_count()
+ << ", Distinct Values: " << stats->distinct_count() << std::endl
+ << " Max: " << FormatStatValue(descr->physical_type(), max)
+ << ", Min: " << FormatStatValue(descr->physical_type(), min);
+ } else {
+ stream << " Statistics Not Set";
+ }
+ stream << std::endl
+ << " Compression: "
+ << ::arrow::internal::AsciiToUpper(
+ Codec::GetCodecAsString(column_chunk->compression()))
+ << ", Encodings:";
+ for (auto encoding : column_chunk->encodings()) {
+ stream << " " << EncodingToString(encoding);
+ }
+ stream << std::endl
+ << " Uncompressed Size: " << column_chunk->total_uncompressed_size()
+ << ", Compressed Size: " << column_chunk->total_compressed_size()
+ << std::endl;
+ }
+
+ if (!print_values) {
+ continue;
+ }
+ stream << "--- Values ---\n";
+
+ static constexpr int bufsize = COL_WIDTH + 1;
+ char buffer[bufsize];
+
+ // Create readers for selected columns and print contents
+ std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
+ int j = 0;
+ for (auto i : selected_columns) {
+ std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
+ // This is OK in this method as long as the RowGroupReader does not get
+ // deleted
+ auto& scanner = scanners[j++] = Scanner::Make(col_reader);
+
+ if (format_dump) {
+ stream << "Column " << i << std::endl;
+ while (scanner->HasNext()) {
+ scanner->PrintNext(stream, 0, true);
+ stream << "\n";
+ }
+ continue;
+ }
+
+ snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
+ file_metadata->schema()->Column(i)->name().c_str());
+ stream << buffer << '|';
+ }
+ if (format_dump) {
+ continue;
+ }
+ stream << "\n";
+
+ bool hasRow;
+ do {
+ hasRow = false;
+ for (auto scanner : scanners) {
+ if (scanner->HasNext()) {
+ hasRow = true;
+ scanner->PrintNext(stream, COL_WIDTH);
+ stream << '|';
+ }
+ }
+ stream << "\n";
+ } while (hasRow);
+ }
+}
+
+void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
+ const char* filename) {
+ const FileMetaData* file_metadata = fileReader->metadata().get();
+ stream << "{\n";
+ stream << " \"FileName\": \"" << filename << "\",\n";
+ stream << " \"Version\": \"" << ParquetVersionToString(file_metadata->version())
+ << "\",\n";
+ stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
+ stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
+ stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
+ stream << " \"NumberOfRealColumns\": \""
+ << file_metadata->schema()->group_node()->field_count() << "\",\n";
+ stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
+
+ if (selected_columns.size() == 0) {
+ for (int i = 0; i < file_metadata->num_columns(); i++) {
+ selected_columns.push_back(i);
+ }
+ } else {
+ for (auto i : selected_columns) {
+ if (i < 0 || i >= file_metadata->num_columns()) {
+ throw ParquetException("Selected column is out of range");
+ }
+ }
+ }
+
+ stream << " \"Columns\": [\n";
+ int c = 0;
+ for (auto i : selected_columns) {
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << " { \"Id\": \"" << i << "\","
+ << " \"Name\": \"" << descr->path()->ToDotString() << "\","
+ << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
+ << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
+ << "\","
+ << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
+ c++;
+ if (c != static_cast<int>(selected_columns.size())) {
+ stream << ",\n";
+ }
+ }
+
+ stream << "\n ],\n \"RowGroups\": [\n";
+ for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
+ stream << " {\n \"Id\": \"" << r << "\", ";
+
+ auto group_reader = fileReader->RowGroup(r);
+ std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
+
+ stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
+ stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size()
+ << "\", ";
+ stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
+
+ // Print column metadata
+ stream << " \"ColumnChunks\": [\n";
+ int c1 = 0;
+ for (auto i : selected_columns) {
+ auto column_chunk = group_metadata->ColumnChunk(i);
+ std::shared_ptr<Statistics> stats = column_chunk->statistics();
+
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << " {\"Id\": \"" << i << "\", \"Values\": \""
+ << column_chunk->num_values() << "\", "
+ << "\"StatsSet\": ";
+ if (column_chunk->is_stats_set()) {
+ stream << "\"True\", \"Stats\": {";
+ std::string min = stats->EncodeMin(), max = stats->EncodeMax();
+ stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
+ << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
+ << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
+ << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
+ << "\" },";
+ } else {
+ stream << "\"False\",";
+ }
+ stream << "\n \"Compression\": \""
+ << ::arrow::internal::AsciiToUpper(
+ Codec::GetCodecAsString(column_chunk->compression()))
+ << "\", \"Encodings\": \"";
+ for (auto encoding : column_chunk->encodings()) {
+ stream << EncodingToString(encoding) << " ";
+ }
+ stream << "\", "
+ << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
+ << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
+
+ // end of a ColumnChunk
+ stream << "\" }";
+ c1++;
+ if (c1 != static_cast<int>(selected_columns.size())) {
+ stream << ",\n";
+ }
+ }
+
+ stream << "\n ]\n }";
+ if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
+ stream << ",\n";
+ }
+ }
+ stream << "\n ]\n}\n";
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/printer.h b/contrib/libs/apache/arrow/cpp/src/parquet/printer.h
new file mode 100644
index 00000000000..6bdf5b456fa
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/printer.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include <list>
+
+#include "parquet/platform.h"
+
+namespace parquet {
+
+class ParquetFileReader;
+
+class PARQUET_EXPORT ParquetFilePrinter {
+ private:
+ ParquetFileReader* fileReader;
+
+ public:
+ explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
+ ~ParquetFilePrinter() {}
+
+ void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
+ bool print_values = false, bool format_dump = false,
+ bool print_key_value_metadata = false,
+ const char* filename = "No Name");
+
+ void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
+ const char* filename = "No Name");
+};
+
+} // namespace parquet
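
A usage sketch for the printer, assuming ParquetFileReader::OpenFile from parquet/file_reader.h (included by printer.cc above, but not shown in this diff):

    #include <iostream>
    #include <list>
    #include <memory>
    #include <string>
    #include "parquet/file_reader.h"  // assumed: declares ParquetFileReader::OpenFile
    #include "parquet/printer.h"

    // Sketch: dump schema and row-group metadata for a file to stdout.
    void DumpFooter(const std::string& path) {
      std::unique_ptr<parquet::ParquetFileReader> reader =
          parquet::ParquetFileReader::OpenFile(path);
      parquet::ParquetFilePrinter printer(reader.get());
      printer.DebugPrint(std::cout, /*selected_columns=*/{},
                         /*print_values=*/false, /*format_dump=*/false,
                         /*print_key_value_metadata=*/true, path.c_str());
    }
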
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc b/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc
new file mode 100644
index 00000000000..93638dbe28a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+#include <utility>
+
+#include "parquet/properties.h"
+
+#include "arrow/io/buffered.h"
+#include "arrow/io/memory.h"
+#include "arrow/util/logging.h"
+
+namespace parquet {
+
+std::shared_ptr<ArrowInputStream> ReaderProperties::GetStream(
+ std::shared_ptr<ArrowInputFile> source, int64_t start, int64_t num_bytes) {
+ if (buffered_stream_enabled_) {
+ // ARROW-6180 / PARQUET-1636 Create isolated reader that references segment
+ // of source
+ std::shared_ptr<::arrow::io::InputStream> safe_stream =
+ ::arrow::io::RandomAccessFile::GetStream(source, start, num_bytes);
+ PARQUET_ASSIGN_OR_THROW(
+ auto stream, ::arrow::io::BufferedInputStream::Create(buffer_size_, pool_,
+ safe_stream, num_bytes));
+ return std::move(stream);
+ } else {
+ PARQUET_ASSIGN_OR_THROW(auto data, source->ReadAt(start, num_bytes));
+
+ if (data->size() != num_bytes) {
+ std::stringstream ss;
+ ss << "Tried reading " << num_bytes << " bytes starting at position " << start
+ << " from file but only got " << data->size();
+ throw ParquetException(ss.str());
+ }
+ return std::make_shared<::arrow::io::BufferReader>(data);
+ }
+}
+
+ArrowReaderProperties default_arrow_reader_properties() {
+ static ArrowReaderProperties default_reader_props;
+ return default_reader_props;
+}
+
+std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties() {
+ static std::shared_ptr<ArrowWriterProperties> default_writer_properties =
+ ArrowWriterProperties::Builder().build();
+ return default_writer_properties;
+}
+
+} // namespace parquet
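
default_arrow_writer_properties() above shows the builder pattern in miniature; the same fluent style applies to the WriterProperties::Builder declared in properties.h below. A usage sketch (the column path is a hypothetical example):

    #include <memory>
    #include "parquet/properties.h"

    // Sketch: typical writer configuration via the fluent builder. Each
    // setter returns Builder*, so chained calls after the first use ->.
    std::shared_ptr<parquet::WriterProperties> MakeWriterProps() {
      return parquet::WriterProperties::Builder()
          .compression(parquet::Compression::SNAPPY)
          ->disable_dictionary("debug.raw_column")
          ->build();
    }
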
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/properties.h b/contrib/libs/apache/arrow/cpp/src/parquet/properties.h
new file mode 100644
index 00000000000..d217b8efa52
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/properties.h
@@ -0,0 +1,813 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "arrow/io/caching.h"
+#include "arrow/type.h"
+#include "arrow/util/compression.h"
+#include "parquet/encryption/encryption.h"
+#include "parquet/exception.h"
+#include "parquet/parquet_version.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// Determines use of Parquet Format version >= 2.0.0 logical types. For
+/// example, when writing from Arrow data structures, PARQUET_2_0 will enable
+/// use of INT_* and UINT_* converted types as well as nanosecond timestamps
+/// stored physically as INT64. Since some Parquet implementations do not
+/// support the logical types added in the 2.0.0 format version, if you want to
+/// maximize compatibility of your files you may want to use PARQUET_1_0.
+///
+/// Note that the 2.x format version series also introduced new serialized
+/// data page metadata and on disk data page layout. To enable this, use
+/// ParquetDataPageVersion.
+struct ParquetVersion;
+
+/// Controls serialization format of data pages. parquet-format v2.0.0
+/// introduced a new data page metadata type DataPageV2 and serialized page
+/// structure (for example, encoded levels are no longer compressed). Prior to
+/// the completion of PARQUET-457 in 2020, this library did not implement
+/// DataPageV2 correctly, so if you use the V2 data page format, you may have
+/// forward compatibility issues (older versions of the library will be unable
+/// to read the files). Note that some Parquet implementations do not implement
+/// DataPageV2 at all.
+enum class ParquetDataPageVersion { V1, V2 };
+
+/// Align the default buffer size to a small multiple of a page size.
+constexpr int64_t kDefaultBufferSize = 4096 * 4;
+
+class PARQUET_EXPORT ReaderProperties {
+ public:
+ explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
+ : pool_(pool) {}
+
+ MemoryPool* memory_pool() const { return pool_; }
+
+ std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
+ int64_t start, int64_t num_bytes);
+
+  /// Buffered stream reading allows the user to control the memory usage of
+  /// parquet readers. It ensures that all `RandomAccessFile::ReadAt` calls are
+  /// wrapped in a buffered reader that uses a fixed-size buffer (of size
+  /// `buffer_size()`) instead of reading the full extent of the ReadAt at once.
+  ///
+  /// The primary purpose of this knob is resource control, not performance.
+ bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
+ void enable_buffered_stream() { buffered_stream_enabled_ = true; }
+ void disable_buffered_stream() { buffered_stream_enabled_ = false; }
+
+ int64_t buffer_size() const { return buffer_size_; }
+ void set_buffer_size(int64_t size) { buffer_size_ = size; }
+
+ void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
+ file_decryption_properties_ = std::move(decryption);
+ }
+
+ const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
+ return file_decryption_properties_;
+ }
+
+ private:
+ MemoryPool* pool_;
+ int64_t buffer_size_ = kDefaultBufferSize;
+ bool buffered_stream_enabled_ = false;
+ std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
+};
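
A sketch of the resource-control knob described above (the helper function is hypothetical):

    #include "parquet/properties.h"

    // Sketch: cap per-ReadAt buffering at 64 KiB instead of reading whole
    // column chunks into memory at once.
    parquet::ReaderProperties BufferedReaderProps() {
      parquet::ReaderProperties props;  // uses the default memory pool
      props.enable_buffered_stream();
      props.set_buffer_size(64 * 1024);
      return props;
    }
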
+
+ReaderProperties PARQUET_EXPORT default_reader_properties();
+
+static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
+static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
+static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
+static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
+static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
+static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
+static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
+static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
+static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
+static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
+
+class PARQUET_EXPORT ColumnProperties {
+ public:
+ ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
+ Compression::type codec = DEFAULT_COMPRESSION_TYPE,
+ bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
+ bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
+ size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE)
+ : encoding_(encoding),
+ codec_(codec),
+ dictionary_enabled_(dictionary_enabled),
+ statistics_enabled_(statistics_enabled),
+ max_stats_size_(max_stats_size),
+ compression_level_(Codec::UseDefaultCompressionLevel()) {}
+
+ void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
+
+ void set_compression(Compression::type codec) { codec_ = codec; }
+
+ void set_dictionary_enabled(bool dictionary_enabled) {
+ dictionary_enabled_ = dictionary_enabled;
+ }
+
+ void set_statistics_enabled(bool statistics_enabled) {
+ statistics_enabled_ = statistics_enabled;
+ }
+
+ void set_max_statistics_size(size_t max_stats_size) {
+ max_stats_size_ = max_stats_size;
+ }
+
+ void set_compression_level(int compression_level) {
+ compression_level_ = compression_level;
+ }
+
+ Encoding::type encoding() const { return encoding_; }
+
+ Compression::type compression() const { return codec_; }
+
+ bool dictionary_enabled() const { return dictionary_enabled_; }
+
+ bool statistics_enabled() const { return statistics_enabled_; }
+
+ size_t max_statistics_size() const { return max_stats_size_; }
+
+ int compression_level() const { return compression_level_; }
+
+ private:
+ Encoding::type encoding_;
+ Compression::type codec_;
+ bool dictionary_enabled_;
+ bool statistics_enabled_;
+ size_t max_stats_size_;
+ int compression_level_;
+};
+
+class PARQUET_EXPORT WriterProperties {
+ public:
+ class Builder {
+ public:
+ Builder()
+ : pool_(::arrow::default_memory_pool()),
+ dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
+ write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
+ max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
+ pagesize_(kDefaultDataPageSize),
+ version_(ParquetVersion::PARQUET_1_0),
+ data_page_version_(ParquetDataPageVersion::V1),
+ created_by_(DEFAULT_CREATED_BY) {}
+ virtual ~Builder() {}
+
+ Builder* memory_pool(MemoryPool* pool) {
+ pool_ = pool;
+ return this;
+ }
+
+ Builder* enable_dictionary() {
+ default_column_properties_.set_dictionary_enabled(true);
+ return this;
+ }
+
+ Builder* disable_dictionary() {
+ default_column_properties_.set_dictionary_enabled(false);
+ return this;
+ }
+
+ Builder* enable_dictionary(const std::string& path) {
+ dictionary_enabled_[path] = true;
+ return this;
+ }
+
+ Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->enable_dictionary(path->ToDotString());
+ }
+
+ Builder* disable_dictionary(const std::string& path) {
+ dictionary_enabled_[path] = false;
+ return this;
+ }
+
+ Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->disable_dictionary(path->ToDotString());
+ }
+
+ Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
+ dictionary_pagesize_limit_ = dictionary_psize_limit;
+ return this;
+ }
+
+ Builder* write_batch_size(int64_t write_batch_size) {
+ write_batch_size_ = write_batch_size;
+ return this;
+ }
+
+ Builder* max_row_group_length(int64_t max_row_group_length) {
+ max_row_group_length_ = max_row_group_length;
+ return this;
+ }
+
+ Builder* data_pagesize(int64_t pg_size) {
+ pagesize_ = pg_size;
+ return this;
+ }
+
+ Builder* data_page_version(ParquetDataPageVersion data_page_version) {
+ data_page_version_ = data_page_version;
+ return this;
+ }
+
+ Builder* version(ParquetVersion::type version) {
+ version_ = version;
+ return this;
+ }
+
+ Builder* created_by(const std::string& created_by) {
+ created_by_ = created_by;
+ return this;
+ }
+
+    /**
+     * Define the encoding used when dictionary encoding is not applied.
+     *
+     * This applies either when dictionary encoding is disabled or when we
+     * fall back because the dictionary grew too large.
+     */
+ Builder* encoding(Encoding::type encoding_type) {
+ if (encoding_type == Encoding::PLAIN_DICTIONARY ||
+ encoding_type == Encoding::RLE_DICTIONARY) {
+ throw ParquetException("Can't use dictionary encoding as fallback encoding");
+ }
+
+ default_column_properties_.set_encoding(encoding_type);
+ return this;
+ }
+
+    /**
+     * Define the encoding used when dictionary encoding is not applied.
+     *
+     * This applies either when dictionary encoding is disabled or when we
+     * fall back because the dictionary grew too large.
+     */
+ Builder* encoding(const std::string& path, Encoding::type encoding_type) {
+ if (encoding_type == Encoding::PLAIN_DICTIONARY ||
+ encoding_type == Encoding::RLE_DICTIONARY) {
+ throw ParquetException("Can't use dictionary encoding as fallback encoding");
+ }
+
+ encodings_[path] = encoding_type;
+ return this;
+ }
+
+    /**
+     * Define the encoding used when dictionary encoding is not applied.
+     *
+     * This applies either when dictionary encoding is disabled or when we
+     * fall back because the dictionary grew too large.
+     */
+ Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
+ Encoding::type encoding_type) {
+ return this->encoding(path->ToDotString(), encoding_type);
+ }
+
+ Builder* compression(Compression::type codec) {
+ default_column_properties_.set_compression(codec);
+ return this;
+ }
+
+ Builder* max_statistics_size(size_t max_stats_sz) {
+ default_column_properties_.set_max_statistics_size(max_stats_sz);
+ return this;
+ }
+
+ Builder* compression(const std::string& path, Compression::type codec) {
+ codecs_[path] = codec;
+ return this;
+ }
+
+ Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
+ Compression::type codec) {
+ return this->compression(path->ToDotString(), codec);
+ }
+
+    /// \brief Specify the default compression level for the compressor in
+    /// every column. If a column does not have an explicitly specified
+    /// compression level, this default is used.
+    ///
+    /// The provided compression level is compressor specific. Users should
+    /// familiarize themselves with the available levels for the selected
+    /// compressor. If the compressor does not allow selecting different
+    /// compression levels, calling this function has no effect.
+    /// Parquet and Arrow do not validate the passed compression level. If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.
+ Builder* compression_level(int compression_level) {
+ default_column_properties_.set_compression_level(compression_level);
+ return this;
+ }
+
+    /// \brief Specify a compression level for the compressor of the column
+    /// described by path.
+    ///
+    /// The provided compression level is compressor specific. Users should
+    /// familiarize themselves with the available levels for the selected
+    /// compressor. If the compressor does not allow selecting different
+    /// compression levels, calling this function has no effect.
+    /// Parquet and Arrow do not validate the passed compression level. If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.
+ Builder* compression_level(const std::string& path, int compression_level) {
+ codecs_compression_level_[path] = compression_level;
+ return this;
+ }
+
+    /// \brief Specify a compression level for the compressor of the column
+    /// described by path.
+    ///
+    /// The provided compression level is compressor specific. Users should
+    /// familiarize themselves with the available levels for the selected
+    /// compressor. If the compressor does not allow selecting different
+    /// compression levels, calling this function has no effect.
+    /// Parquet and Arrow do not validate the passed compression level. If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.
+ Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
+ int compression_level) {
+ return this->compression_level(path->ToDotString(), compression_level);
+ }
+
+ Builder* encryption(
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
+ file_encryption_properties_ = std::move(file_encryption_properties);
+ return this;
+ }
+
+ Builder* enable_statistics() {
+ default_column_properties_.set_statistics_enabled(true);
+ return this;
+ }
+
+ Builder* disable_statistics() {
+ default_column_properties_.set_statistics_enabled(false);
+ return this;
+ }
+
+ Builder* enable_statistics(const std::string& path) {
+ statistics_enabled_[path] = true;
+ return this;
+ }
+
+ Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->enable_statistics(path->ToDotString());
+ }
+
+ Builder* disable_statistics(const std::string& path) {
+ statistics_enabled_[path] = false;
+ return this;
+ }
+
+ Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->disable_statistics(path->ToDotString());
+ }
+
+ std::shared_ptr<WriterProperties> build() {
+ std::unordered_map<std::string, ColumnProperties> column_properties;
+ auto get = [&](const std::string& key) -> ColumnProperties& {
+ auto it = column_properties.find(key);
+ if (it == column_properties.end())
+ return column_properties[key] = default_column_properties_;
+ else
+ return it->second;
+ };
+
+ for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
+ for (const auto& item : codecs_) get(item.first).set_compression(item.second);
+ for (const auto& item : codecs_compression_level_)
+ get(item.first).set_compression_level(item.second);
+ for (const auto& item : dictionary_enabled_)
+ get(item.first).set_dictionary_enabled(item.second);
+ for (const auto& item : statistics_enabled_)
+ get(item.first).set_statistics_enabled(item.second);
+
+ return std::shared_ptr<WriterProperties>(new WriterProperties(
+ pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
+ pagesize_, version_, created_by_, std::move(file_encryption_properties_),
+ default_column_properties_, column_properties, data_page_version_));
+ }
+
+ private:
+ MemoryPool* pool_;
+ int64_t dictionary_pagesize_limit_;
+ int64_t write_batch_size_;
+ int64_t max_row_group_length_;
+ int64_t pagesize_;
+ ParquetVersion::type version_;
+ ParquetDataPageVersion data_page_version_;
+ std::string created_by_;
+
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
+
+ // Settings used for each column unless overridden in any of the maps below
+ ColumnProperties default_column_properties_;
+ std::unordered_map<std::string, Encoding::type> encodings_;
+ std::unordered_map<std::string, Compression::type> codecs_;
+ std::unordered_map<std::string, int32_t> codecs_compression_level_;
+ std::unordered_map<std::string, bool> dictionary_enabled_;
+ std::unordered_map<std::string, bool> statistics_enabled_;
+ };
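+
+  // A minimal usage sketch (added for illustration; it exercises only the
+  // Builder methods shown above, and "col_a"/"col_b" are hypothetical column
+  // paths). The fluent methods return Builder*, so calls after the first
+  // chain with operator->:
+  //
+  //   WriterProperties::Builder builder;
+  //   std::shared_ptr<WriterProperties> props =
+  //       builder.compression(Compression::SNAPPY)
+  //           ->compression_level(5)
+  //           ->encoding("col_a", Encoding::DELTA_BINARY_PACKED)
+  //           ->disable_statistics("col_b")
+  //           ->build();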
+
+ inline MemoryPool* memory_pool() const { return pool_; }
+
+ inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }
+
+ inline int64_t write_batch_size() const { return write_batch_size_; }
+
+ inline int64_t max_row_group_length() const { return max_row_group_length_; }
+
+ inline int64_t data_pagesize() const { return pagesize_; }
+
+ inline ParquetDataPageVersion data_page_version() const {
+ return parquet_data_page_version_;
+ }
+
+ inline ParquetVersion::type version() const { return parquet_version_; }
+
+ inline std::string created_by() const { return parquet_created_by_; }
+
+ inline Encoding::type dictionary_index_encoding() const {
+ if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
+ return Encoding::PLAIN_DICTIONARY;
+ } else {
+ return Encoding::RLE_DICTIONARY;
+ }
+ }
+
+ inline Encoding::type dictionary_page_encoding() const {
+ if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
+ return Encoding::PLAIN_DICTIONARY;
+ } else {
+ return Encoding::PLAIN;
+ }
+ }
+
+ const ColumnProperties& column_properties(
+ const std::shared_ptr<schema::ColumnPath>& path) const {
+ auto it = column_properties_.find(path->ToDotString());
+ if (it != column_properties_.end()) return it->second;
+ return default_column_properties_;
+ }
+
+ Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).encoding();
+ }
+
+ Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).compression();
+ }
+
+ int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).compression_level();
+ }
+
+ bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).dictionary_enabled();
+ }
+
+ bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).statistics_enabled();
+ }
+
+ size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).max_statistics_size();
+ }
+
+ inline FileEncryptionProperties* file_encryption_properties() const {
+ return file_encryption_properties_.get();
+ }
+
+ std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
+ const std::string& path) const {
+ if (file_encryption_properties_) {
+ return file_encryption_properties_->column_encryption_properties(path);
+ } else {
+ return NULLPTR;
+ }
+ }
+
+ private:
+ explicit WriterProperties(
+ MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
+ int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version,
+ const std::string& created_by,
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
+ const ColumnProperties& default_column_properties,
+ const std::unordered_map<std::string, ColumnProperties>& column_properties,
+ ParquetDataPageVersion data_page_version)
+ : pool_(pool),
+ dictionary_pagesize_limit_(dictionary_pagesize_limit),
+ write_batch_size_(write_batch_size),
+ max_row_group_length_(max_row_group_length),
+ pagesize_(pagesize),
+ parquet_data_page_version_(data_page_version),
+ parquet_version_(version),
+ parquet_created_by_(created_by),
+ file_encryption_properties_(file_encryption_properties),
+ default_column_properties_(default_column_properties),
+ column_properties_(column_properties) {}
+
+ MemoryPool* pool_;
+ int64_t dictionary_pagesize_limit_;
+ int64_t write_batch_size_;
+ int64_t max_row_group_length_;
+ int64_t pagesize_;
+ ParquetDataPageVersion parquet_data_page_version_;
+ ParquetVersion::type parquet_version_;
+ std::string parquet_created_by_;
+
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
+
+ ColumnProperties default_column_properties_;
+ std::unordered_map<std::string, ColumnProperties> column_properties_;
+};
+
+PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
+
+// ----------------------------------------------------------------------
+// Properties specific to Apache Arrow columnar read and write
+
+static constexpr bool kArrowDefaultUseThreads = false;
+
+// Default number of rows to read when using ::arrow::RecordBatchReader
+static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;
+
+/// EXPERIMENTAL: Properties for configuring FileReader behavior.
+class PARQUET_EXPORT ArrowReaderProperties {
+ public:
+ explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
+ : use_threads_(use_threads),
+ read_dict_indices_(),
+ batch_size_(kArrowDefaultBatchSize),
+ pre_buffer_(false),
+ cache_options_(::arrow::io::CacheOptions::Defaults()),
+ coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {}
+
+ void set_use_threads(bool use_threads) { use_threads_ = use_threads; }
+
+ bool use_threads() const { return use_threads_; }
+
+ void set_read_dictionary(int column_index, bool read_dict) {
+ if (read_dict) {
+ read_dict_indices_.insert(column_index);
+ } else {
+ read_dict_indices_.erase(column_index);
+ }
+ }
+  bool read_dictionary(int column_index) const {
+    return read_dict_indices_.find(column_index) != read_dict_indices_.end();
+  }
+
+ void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }
+
+ int64_t batch_size() const { return batch_size_; }
+
+ /// Enable read coalescing.
+ ///
+ /// When enabled, the Arrow reader will pre-buffer necessary regions
+ /// of the file in-memory. This is intended to improve performance on
+ /// high-latency filesystems (e.g. Amazon S3).
+ void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
+
+ bool pre_buffer() const { return pre_buffer_; }
+
+ /// Set options for read coalescing. This can be used to tune the
+ /// implementation for characteristics of different filesystems.
+ void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }
+
+ const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }
+
+ /// Set execution context for read coalescing.
+ void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }
+
+ const ::arrow::io::IOContext& io_context() const { return io_context_; }
+
+ /// Set timestamp unit to use for deprecated INT96-encoded timestamps
+ /// (default is NANO).
+ void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
+ coerce_int96_timestamp_unit_ = unit;
+ }
+
+ ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
+ return coerce_int96_timestamp_unit_;
+ }
+
+ private:
+ bool use_threads_;
+ std::unordered_set<int> read_dict_indices_;
+ int64_t batch_size_;
+ bool pre_buffer_;
+ ::arrow::io::IOContext io_context_;
+ ::arrow::io::CacheOptions cache_options_;
+ ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
+};
+
+/// EXPERIMENTAL: Constructs the default ArrowReaderProperties
+PARQUET_EXPORT
+ArrowReaderProperties default_arrow_reader_properties();
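+
+// A usage sketch (illustration only; uses the setters declared above):
+//
+//   ArrowReaderProperties props = default_arrow_reader_properties();
+//   props.set_use_threads(true);
+//   props.set_batch_size(32 * 1024);
+//   props.set_pre_buffer(true);  // helps on high-latency filesystems
+//   props.set_read_dictionary(/*column_index=*/0, true);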
+
+class PARQUET_EXPORT ArrowWriterProperties {
+ public:
+ enum EngineVersion {
+ V1, // Supports only nested lists.
+ V2 // Full support for all nesting combinations
+ };
+ class Builder {
+ public:
+ Builder()
+ : write_timestamps_as_int96_(false),
+ coerce_timestamps_enabled_(false),
+ coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
+ truncated_timestamps_allowed_(false),
+ store_schema_(false),
+ // TODO: At some point we should flip this.
+ compliant_nested_types_(false),
+ engine_version_(V2) {}
+ virtual ~Builder() = default;
+
+ Builder* disable_deprecated_int96_timestamps() {
+ write_timestamps_as_int96_ = false;
+ return this;
+ }
+
+ Builder* enable_deprecated_int96_timestamps() {
+ write_timestamps_as_int96_ = true;
+ return this;
+ }
+
+ Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
+ coerce_timestamps_enabled_ = true;
+ coerce_timestamps_unit_ = unit;
+ return this;
+ }
+
+ Builder* allow_truncated_timestamps() {
+ truncated_timestamps_allowed_ = true;
+ return this;
+ }
+
+ Builder* disallow_truncated_timestamps() {
+ truncated_timestamps_allowed_ = false;
+ return this;
+ }
+
+ /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
+ /// to enable certain read options (like "read_dictionary") to be set
+ /// automatically
+ Builder* store_schema() {
+ store_schema_ = true;
+ return this;
+ }
+
+ Builder* enable_compliant_nested_types() {
+ compliant_nested_types_ = true;
+ return this;
+ }
+
+ Builder* disable_compliant_nested_types() {
+ compliant_nested_types_ = false;
+ return this;
+ }
+
+ Builder* set_engine_version(EngineVersion version) {
+ engine_version_ = version;
+ return this;
+ }
+
+ std::shared_ptr<ArrowWriterProperties> build() {
+ return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
+ write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
+ truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
+ engine_version_));
+ }
+
+ private:
+ bool write_timestamps_as_int96_;
+
+ bool coerce_timestamps_enabled_;
+ ::arrow::TimeUnit::type coerce_timestamps_unit_;
+ bool truncated_timestamps_allowed_;
+
+ bool store_schema_;
+ bool compliant_nested_types_;
+ EngineVersion engine_version_;
+ };
+
+ bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }
+
+ bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
+ ::arrow::TimeUnit::type coerce_timestamps_unit() const {
+ return coerce_timestamps_unit_;
+ }
+
+ bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }
+
+ bool store_schema() const { return store_schema_; }
+
+  /// \brief Whether nested types are named according to the parquet
+  /// specification.
+  ///
+  /// Older versions of Arrow wrote out field names for nested lists based on
+  /// the name of the field. According to the parquet specification they
+  /// should always be "element".
+ bool compliant_nested_types() const { return compliant_nested_types_; }
+
+ /// \brief The underlying engine version to use when writing Arrow data.
+ ///
+  /// V2 is currently the latest. V1 is considered deprecated but is left in
+  /// place in case bugs are detected in V2.
+ EngineVersion engine_version() const { return engine_version_; }
+
+ private:
+ explicit ArrowWriterProperties(bool write_nanos_as_int96,
+ bool coerce_timestamps_enabled,
+ ::arrow::TimeUnit::type coerce_timestamps_unit,
+ bool truncated_timestamps_allowed, bool store_schema,
+ bool compliant_nested_types,
+ EngineVersion engine_version)
+ : write_timestamps_as_int96_(write_nanos_as_int96),
+ coerce_timestamps_enabled_(coerce_timestamps_enabled),
+ coerce_timestamps_unit_(coerce_timestamps_unit),
+ truncated_timestamps_allowed_(truncated_timestamps_allowed),
+ store_schema_(store_schema),
+ compliant_nested_types_(compliant_nested_types),
+ engine_version_(engine_version) {}
+
+ const bool write_timestamps_as_int96_;
+ const bool coerce_timestamps_enabled_;
+ const ::arrow::TimeUnit::type coerce_timestamps_unit_;
+ const bool truncated_timestamps_allowed_;
+ const bool store_schema_;
+ const bool compliant_nested_types_;
+ const EngineVersion engine_version_;
+};
+
+/// \brief State object used for writing Arrow data directly to a Parquet
+/// column chunk. API possibly not stable
+struct ArrowWriteContext {
+ ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
+ : memory_pool(memory_pool),
+ properties(properties),
+ data_buffer(AllocateBuffer(memory_pool)),
+ def_levels_buffer(AllocateBuffer(memory_pool)) {}
+
+ template <typename T>
+ ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
+ ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
+ *out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
+ return ::arrow::Status::OK();
+ }
+
+ MemoryPool* memory_pool;
+ const ArrowWriterProperties* properties;
+
+ // Buffer used for storing the data of an array converted to the physical type
+ // as expected by parquet-cpp.
+ std::shared_ptr<ResizableBuffer> data_buffer;
+
+  // We rely on shared ownership of this buffer
+ std::shared_ptr<ResizableBuffer> def_levels_buffer;
+};
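+
+// GetScratchData usage sketch (illustration only; `ctx` is an
+// ArrowWriteContext and `num_values` a hypothetical element count):
+//
+//   int32_t* scratch = nullptr;
+//   ARROW_RETURN_NOT_OK(ctx.GetScratchData<int32_t>(num_values, &scratch));
+//   // scratch now points at num_values * sizeof(int32_t) writable bytes
+//   // backed by ctx.data_buffer.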
+
+PARQUET_EXPORT
+std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
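+
+// A usage sketch (illustration only; chains the Builder methods declared
+// above):
+//
+//   std::shared_ptr<ArrowWriterProperties> arrow_props =
+//       ArrowWriterProperties::Builder()
+//           .coerce_timestamps(::arrow::TimeUnit::MICRO)
+//           ->allow_truncated_timestamps()
+//           ->store_schema()
+//           ->build();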
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc b/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc
new file mode 100644
index 00000000000..cfa6bdb2912
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc
@@ -0,0 +1,945 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/schema.h"
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/logging.h"
+#include "parquet/exception.h"
+#include "parquet/schema_internal.h"
+#include "parquet/thrift_internal.h"
+
+using parquet::format::SchemaElement;
+
+namespace parquet {
+
+namespace schema {
+
+namespace {
+
+void ThrowInvalidLogicalType(const LogicalType& logical_type) {
+ std::stringstream ss;
+ ss << "Invalid logical type: " << logical_type.ToString();
+ throw ParquetException(ss.str());
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// ColumnPath
+
+std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) {
+ std::stringstream ss(dotstring);
+ std::string item;
+ std::vector<std::string> path;
+ while (std::getline(ss, item, '.')) {
+ path.push_back(item);
+ }
+ return std::make_shared<ColumnPath>(std::move(path));
+}
+
+std::shared_ptr<ColumnPath> ColumnPath::FromNode(const Node& node) {
+  // Build the path in reverse order as we traverse the nodes to the top
+  std::vector<std::string> rpath;
+  const Node* cursor = &node;
+  // The schema node is not part of the ColumnPath
+  while (cursor->parent()) {
+    rpath.push_back(cursor->name());
+    cursor = cursor->parent();
+  }
+
+  // Build ColumnPath in correct order
+  std::vector<std::string> path(rpath.crbegin(), rpath.crend());
+ return std::make_shared<ColumnPath>(std::move(path));
+}
+
+std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const {
+  std::vector<std::string> path;
+  path.reserve(path_.size() + 1);
+  path.assign(path_.cbegin(), path_.cend());
+  path.push_back(node_name);
+
+ return std::make_shared<ColumnPath>(std::move(path));
+}
+
+std::string ColumnPath::ToDotString() const {
+ std::stringstream ss;
+ for (auto it = path_.cbegin(); it != path_.cend(); ++it) {
+ if (it != path_.cbegin()) {
+ ss << ".";
+ }
+ ss << *it;
+ }
+ return ss.str();
+}
+
+const std::vector<std::string>& ColumnPath::ToDotVector() const { return path_; }
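+
+// Round-trip sketch (illustration only):
+//
+//   auto path = ColumnPath::FromDotString("a.b.c");  // path_ == {"a", "b", "c"}
+//   auto extended = path->extend("d");
+//   // extended->ToDotString() == "a.b.c.d"; *path is unchanged.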
+
+// ----------------------------------------------------------------------
+// Base node
+
+const std::shared_ptr<ColumnPath> Node::path() const {
+ // TODO(itaiin): Cache the result, or more precisely, cache ->ToDotString()
+ // since it is being used to access the leaf nodes
+ return ColumnPath::FromNode(*this);
+}
+
+bool Node::EqualsInternal(const Node* other) const {
+ return type_ == other->type_ && name_ == other->name_ &&
+ repetition_ == other->repetition_ && converted_type_ == other->converted_type_ &&
+ field_id_ == other->field_id() &&
+ logical_type_->Equals(*(other->logical_type()));
+}
+
+void Node::SetParent(const Node* parent) { parent_ = parent; }
+
+// ----------------------------------------------------------------------
+// Primitive node
+
+PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
+ Type::type type, ConvertedType::type converted_type,
+ int length, int precision, int scale, int id)
+ : Node(Node::PRIMITIVE, name, repetition, converted_type, id),
+ physical_type_(type),
+ type_length_(length) {
+ std::stringstream ss;
+
+  // PARQUET-842: In an earlier revision, decimal_metadata_.isset was being
+  // set to true, but Impala raises an "incompatible metadata" error in such
+  // cases
+ memset(&decimal_metadata_, 0, sizeof(decimal_metadata_));
+
+ // Check if the physical and logical types match
+ // Mapping referred from Apache parquet-mr as on 2016-02-22
+ switch (converted_type) {
+ case ConvertedType::NONE:
+ // Logical type not set
+ break;
+ case ConvertedType::UTF8:
+ case ConvertedType::JSON:
+ case ConvertedType::BSON:
+ if (type != Type::BYTE_ARRAY) {
+ ss << ConvertedTypeToString(converted_type);
+ ss << " can only annotate BYTE_ARRAY fields";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::DECIMAL:
+ if ((type != Type::INT32) && (type != Type::INT64) && (type != Type::BYTE_ARRAY) &&
+ (type != Type::FIXED_LEN_BYTE_ARRAY)) {
+ ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED";
+ throw ParquetException(ss.str());
+ }
+ if (precision <= 0) {
+ ss << "Invalid DECIMAL precision: " << precision
+ << ". Precision must be a number between 1 and 38 inclusive";
+ throw ParquetException(ss.str());
+ }
+ if (scale < 0) {
+ ss << "Invalid DECIMAL scale: " << scale
+ << ". Scale must be a number between 0 and precision inclusive";
+ throw ParquetException(ss.str());
+ }
+ if (scale > precision) {
+ ss << "Invalid DECIMAL scale " << scale;
+ ss << " cannot be greater than precision " << precision;
+ throw ParquetException(ss.str());
+ }
+ decimal_metadata_.isset = true;
+ decimal_metadata_.precision = precision;
+ decimal_metadata_.scale = scale;
+ break;
+ case ConvertedType::DATE:
+ case ConvertedType::TIME_MILLIS:
+ case ConvertedType::UINT_8:
+ case ConvertedType::UINT_16:
+ case ConvertedType::UINT_32:
+ case ConvertedType::INT_8:
+ case ConvertedType::INT_16:
+ case ConvertedType::INT_32:
+ if (type != Type::INT32) {
+ ss << ConvertedTypeToString(converted_type);
+ ss << " can only annotate INT32";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::TIME_MICROS:
+ case ConvertedType::TIMESTAMP_MILLIS:
+ case ConvertedType::TIMESTAMP_MICROS:
+ case ConvertedType::UINT_64:
+ case ConvertedType::INT_64:
+ if (type != Type::INT64) {
+ ss << ConvertedTypeToString(converted_type);
+ ss << " can only annotate INT64";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::INTERVAL:
+ if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) {
+ ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::ENUM:
+ if (type != Type::BYTE_ARRAY) {
+ ss << "ENUM can only annotate BYTE_ARRAY fields";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::NA:
+ // NA can annotate any type
+ break;
+ default:
+ ss << ConvertedTypeToString(converted_type);
+ ss << " cannot be applied to a primitive type";
+ throw ParquetException(ss.str());
+ }
+ // For forward compatibility, create an equivalent logical type
+ logical_type_ = LogicalType::FromConvertedType(converted_type_, decimal_metadata_);
+ if (!(logical_type_ && !logical_type_->is_nested() &&
+ logical_type_->is_compatible(converted_type_, decimal_metadata_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ if (type == Type::FIXED_LEN_BYTE_ARRAY) {
+ if (length <= 0) {
+ ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length;
+ throw ParquetException(ss.str());
+ }
+ type_length_ = length;
+ }
+}
+
+PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type,
+ Type::type physical_type, int physical_length, int id)
+ : Node(Node::PRIMITIVE, name, repetition, std::move(logical_type), id),
+ physical_type_(physical_type),
+ type_length_(physical_length) {
+ std::stringstream error;
+ if (logical_type_) {
+ // Check for logical type <=> node type consistency
+ if (!logical_type_->is_nested()) {
+ // Check for logical type <=> physical type consistency
+ if (logical_type_->is_applicable(physical_type, physical_length)) {
+ // For backward compatibility, assign equivalent legacy
+ // converted type (if possible)
+ converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_);
+ } else {
+ error << logical_type_->ToString();
+ error << " can not be applied to primitive type ";
+ error << TypeToString(physical_type);
+ throw ParquetException(error.str());
+ }
+ } else {
+ error << "Nested logical type ";
+ error << logical_type_->ToString();
+ error << " can not be applied to non-group node";
+ throw ParquetException(error.str());
+ }
+ } else {
+ logical_type_ = NoLogicalType::Make();
+ converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_);
+ }
+ if (!(logical_type_ && !logical_type_->is_nested() &&
+ logical_type_->is_compatible(converted_type_, decimal_metadata_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ if (physical_type == Type::FIXED_LEN_BYTE_ARRAY) {
+ if (physical_length <= 0) {
+ error << "Invalid FIXED_LEN_BYTE_ARRAY length: " << physical_length;
+ throw ParquetException(error.str());
+ }
+ }
+}
+
+bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
+ bool is_equal = true;
+ if (physical_type_ != other->physical_type_) {
+ return false;
+ }
+ if (converted_type_ == ConvertedType::DECIMAL) {
+ is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision) &&
+ (decimal_metadata_.scale == other->decimal_metadata_.scale);
+ }
+ if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+ is_equal &= (type_length_ == other->type_length_);
+ }
+ return is_equal;
+}
+
+bool PrimitiveNode::Equals(const Node* other) const {
+ if (!Node::EqualsInternal(other)) {
+ return false;
+ }
+ return EqualsInternal(static_cast<const PrimitiveNode*>(other));
+}
+
+void PrimitiveNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
+
+void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const {
+ visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Group node
+
+GroupNode::GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields, ConvertedType::type converted_type, int id)
+ : Node(Node::GROUP, name, repetition, converted_type, id), fields_(fields) {
+ // For forward compatibility, create an equivalent logical type
+ logical_type_ = LogicalType::FromConvertedType(converted_type_);
+ if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) &&
+ logical_type_->is_compatible(converted_type_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ field_name_to_idx_.clear();
+ auto field_idx = 0;
+ for (NodePtr& field : fields_) {
+ field->SetParent(this);
+ field_name_to_idx_.emplace(field->name(), field_idx++);
+ }
+}
+
+GroupNode::GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ std::shared_ptr<const LogicalType> logical_type, int id)
+ : Node(Node::GROUP, name, repetition, std::move(logical_type), id), fields_(fields) {
+ if (logical_type_) {
+ // Check for logical type <=> node type consistency
+ if (logical_type_->is_nested()) {
+ // For backward compatibility, assign equivalent legacy converted type (if possible)
+ converted_type_ = logical_type_->ToConvertedType(nullptr);
+ } else {
+ std::stringstream error;
+ error << "Logical type ";
+ error << logical_type_->ToString();
+ error << " can not be applied to group node";
+ throw ParquetException(error.str());
+ }
+ } else {
+ logical_type_ = NoLogicalType::Make();
+ converted_type_ = logical_type_->ToConvertedType(nullptr);
+ }
+ if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) &&
+ logical_type_->is_compatible(converted_type_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ field_name_to_idx_.clear();
+ auto field_idx = 0;
+ for (NodePtr& field : fields_) {
+ field->SetParent(this);
+ field_name_to_idx_.emplace(field->name(), field_idx++);
+ }
+}
+
+bool GroupNode::EqualsInternal(const GroupNode* other) const {
+ if (this == other) {
+ return true;
+ }
+ if (this->field_count() != other->field_count()) {
+ return false;
+ }
+ for (int i = 0; i < this->field_count(); ++i) {
+ if (!this->field(i)->Equals(other->field(i).get())) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool GroupNode::Equals(const Node* other) const {
+ if (!Node::EqualsInternal(other)) {
+ return false;
+ }
+ return EqualsInternal(static_cast<const GroupNode*>(other));
+}
+
+int GroupNode::FieldIndex(const std::string& name) const {
+ auto search = field_name_to_idx_.find(name);
+ if (search == field_name_to_idx_.end()) {
+ // Not found
+ return -1;
+ }
+ return search->second;
+}
+
+int GroupNode::FieldIndex(const Node& node) const {
+ auto search = field_name_to_idx_.equal_range(node.name());
+ for (auto it = search.first; it != search.second; ++it) {
+ const int idx = it->second;
+ if (&node == field(idx).get()) {
+ return idx;
+ }
+ }
+ return -1;
+}
+
+void GroupNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
+
+void GroupNode::VisitConst(Node::ConstVisitor* visitor) const { visitor->Visit(this); }
+
+// ----------------------------------------------------------------------
+// Node construction from Parquet metadata
+
+std::unique_ptr<Node> GroupNode::FromParquet(const void* opaque_element,
+ NodeVector fields) {
+ const format::SchemaElement* element =
+ static_cast<const format::SchemaElement*>(opaque_element);
+
+ int field_id = -1;
+ if (element->__isset.field_id) {
+ field_id = element->field_id;
+ }
+
+ std::unique_ptr<GroupNode> group_node;
+ if (element->__isset.logicalType) {
+ // updated writer with logical type present
+ group_node = std::unique_ptr<GroupNode>(
+ new GroupNode(element->name, LoadEnumSafe(&element->repetition_type), fields,
+ LogicalType::FromThrift(element->logicalType), field_id));
+ } else {
+ group_node = std::unique_ptr<GroupNode>(new GroupNode(
+ element->name, LoadEnumSafe(&element->repetition_type), fields,
+ (element->__isset.converted_type ? LoadEnumSafe(&element->converted_type)
+ : ConvertedType::NONE),
+ field_id));
+ }
+
+ return std::unique_ptr<Node>(group_node.release());
+}
+
+std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element) {
+ const format::SchemaElement* element =
+ static_cast<const format::SchemaElement*>(opaque_element);
+
+ int field_id = -1;
+ if (element->__isset.field_id) {
+ field_id = element->field_id;
+ }
+
+ std::unique_ptr<PrimitiveNode> primitive_node;
+ if (element->__isset.logicalType) {
+ // updated writer with logical type present
+ primitive_node = std::unique_ptr<PrimitiveNode>(
+ new PrimitiveNode(element->name, LoadEnumSafe(&element->repetition_type),
+ LogicalType::FromThrift(element->logicalType),
+ LoadEnumSafe(&element->type), element->type_length, field_id));
+ } else if (element->__isset.converted_type) {
+ // legacy writer with converted type present
+ primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
+ element->name, LoadEnumSafe(&element->repetition_type),
+ LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type),
+ element->type_length, element->precision, element->scale, field_id));
+ } else {
+ // logical type not present
+ primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
+ element->name, LoadEnumSafe(&element->repetition_type), NoLogicalType::Make(),
+ LoadEnumSafe(&element->type), element->type_length, field_id));
+ }
+
+ // Return as unique_ptr to the base type
+ return std::unique_ptr<Node>(primitive_node.release());
+}
+
+bool GroupNode::HasRepeatedFields() const {
+ for (int i = 0; i < this->field_count(); ++i) {
+ auto field = this->field(i);
+ if (field->repetition() == Repetition::REPEATED) {
+ return true;
+ }
+    if (field->is_group()) {
+      const auto& group = static_cast<const GroupNode&>(*field);
+      // Keep scanning the remaining fields if this child group contains no
+      // repeated fields; returning its result directly would skip siblings.
+      if (group.HasRepeatedFields()) {
+        return true;
+      }
+    }
+ }
+ return false;
+}
+
+void GroupNode::ToParquet(void* opaque_element) const {
+ format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element);
+ element->__set_name(name_);
+ element->__set_num_children(field_count());
+ element->__set_repetition_type(ToThrift(repetition_));
+ if (converted_type_ != ConvertedType::NONE) {
+ element->__set_converted_type(ToThrift(converted_type_));
+ }
+ if (field_id_ >= 0) {
+ element->__set_field_id(field_id_);
+ }
+ if (logical_type_ && logical_type_->is_serialized()) {
+ element->__set_logicalType(logical_type_->ToThrift());
+ }
+ return;
+}
+
+void PrimitiveNode::ToParquet(void* opaque_element) const {
+ format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element);
+ element->__set_name(name_);
+ element->__set_repetition_type(ToThrift(repetition_));
+ if (converted_type_ != ConvertedType::NONE) {
+ if (converted_type_ != ConvertedType::NA) {
+ element->__set_converted_type(ToThrift(converted_type_));
+ } else {
+ // ConvertedType::NA is an unreleased, obsolete synonym for LogicalType::Null.
+ // Never emit it (see PARQUET-1990 for discussion).
+ if (!logical_type_ || !logical_type_->is_null()) {
+ throw ParquetException(
+ "ConvertedType::NA is obsolete, please use LogicalType::Null instead");
+ }
+ }
+ }
+ if (field_id_ >= 0) {
+ element->__set_field_id(field_id_);
+ }
+ if (logical_type_ && logical_type_->is_serialized() &&
+ // TODO(tpboudreau): remove the following conjunct to enable serialization
+ // of IntervalTypes after parquet.thrift recognizes them
+ !logical_type_->is_interval()) {
+ element->__set_logicalType(logical_type_->ToThrift());
+ }
+ element->__set_type(ToThrift(physical_type_));
+ if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+ element->__set_type_length(type_length_);
+ }
+ if (decimal_metadata_.isset) {
+ element->__set_precision(decimal_metadata_.precision);
+ element->__set_scale(decimal_metadata_.scale);
+ }
+ return;
+}
+
+// ----------------------------------------------------------------------
+// Schema converters
+
+std::unique_ptr<Node> Unflatten(const format::SchemaElement* elements, int length) {
+ if (elements[0].num_children == 0) {
+ if (length == 1) {
+ // Degenerate case of Parquet file with no columns
+ return GroupNode::FromParquet(elements, {});
+ } else {
+ throw ParquetException(
+ "Parquet schema had multiple nodes but root had no children");
+ }
+ }
+
+ // We don't check that the root node is repeated since this is not
+ // consistently set by implementations
+
+ int pos = 0;
+
+ std::function<std::unique_ptr<Node>()> NextNode = [&]() {
+ if (pos == length) {
+ throw ParquetException("Malformed schema: not enough elements");
+ }
+ const SchemaElement& element = elements[pos++];
+ const void* opaque_element = static_cast<const void*>(&element);
+
+ if (element.num_children == 0 && element.__isset.type) {
+ // Leaf (primitive) node: always has a type
+ return PrimitiveNode::FromParquet(opaque_element);
+ } else {
+ // Group node (may have 0 children, but cannot have a type)
+ NodeVector fields;
+ for (int i = 0; i < element.num_children; ++i) {
+ std::unique_ptr<Node> field = NextNode();
+ fields.push_back(NodePtr(field.release()));
+ }
+ return GroupNode::FromParquet(opaque_element, std::move(fields));
+ }
+ };
+ return NextNode();
+}
+
+std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) {
+ if (schema.empty()) {
+ throw ParquetException("Empty file schema (no root)");
+ }
+ std::unique_ptr<Node> root = Unflatten(&schema[0], static_cast<int>(schema.size()));
+ std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>();
+ descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release())));
+ return descr;
+}
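+
+// Illustration of the flattened, depth-first element order consumed by
+// Unflatten(). For the schema
+//
+//   required group schema {
+//     optional int32 id;
+//     optional group name {
+//       optional binary first;
+//     }
+//   }
+//
+// the SchemaElement vector is [schema (num_children=2), id,
+// name (num_children=1), first]; NextNode() rebuilds the tree by recursing
+// num_children times for each group element.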
+
+class SchemaVisitor : public Node::ConstVisitor {
+ public:
+ explicit SchemaVisitor(std::vector<format::SchemaElement>* elements)
+ : elements_(elements) {}
+
+ void Visit(const Node* node) override {
+ format::SchemaElement element;
+ node->ToParquet(&element);
+ elements_->push_back(element);
+
+ if (node->is_group()) {
+ const GroupNode* group_node = static_cast<const GroupNode*>(node);
+ for (int i = 0; i < group_node->field_count(); ++i) {
+ group_node->field(i)->VisitConst(this);
+ }
+ }
+ }
+
+ private:
+ std::vector<format::SchemaElement>* elements_;
+};
+
+void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) {
+ SchemaVisitor visitor(out);
+ schema->VisitConst(&visitor);
+}
+
+// ----------------------------------------------------------------------
+// Schema printing
+
+static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) {
+ switch (repetition) {
+ case Repetition::REQUIRED:
+ stream << "required";
+ break;
+ case Repetition::OPTIONAL:
+ stream << "optional";
+ break;
+ case Repetition::REPEATED:
+ stream << "repeated";
+ break;
+ default:
+ break;
+ }
+}
+
+static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
+ switch (node->physical_type()) {
+ case Type::BOOLEAN:
+ stream << "boolean";
+ break;
+ case Type::INT32:
+ stream << "int32";
+ break;
+ case Type::INT64:
+ stream << "int64";
+ break;
+ case Type::INT96:
+ stream << "int96";
+ break;
+ case Type::FLOAT:
+ stream << "float";
+ break;
+ case Type::DOUBLE:
+ stream << "double";
+ break;
+ case Type::BYTE_ARRAY:
+ stream << "binary";
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ stream << "fixed_len_byte_array(" << node->type_length() << ")";
+ break;
+ default:
+ break;
+ }
+}
+
+static void PrintConvertedType(const PrimitiveNode* node, std::ostream& stream) {
+ auto lt = node->converted_type();
+ auto la = node->logical_type();
+ if (la && la->is_valid() && !la->is_none()) {
+ stream << " (" << la->ToString() << ")";
+ } else if (lt == ConvertedType::DECIMAL) {
+ stream << " (" << ConvertedTypeToString(lt) << "("
+ << node->decimal_metadata().precision << "," << node->decimal_metadata().scale
+ << "))";
+ } else if (lt != ConvertedType::NONE) {
+ stream << " (" << ConvertedTypeToString(lt) << ")";
+ }
+}
+
+struct SchemaPrinter : public Node::ConstVisitor {
+  explicit SchemaPrinter(std::ostream& stream, int indent_width)
+      : stream_(stream), indent_(0), indent_width_(indent_width) {}
+
+ void Indent() {
+ if (indent_ > 0) {
+ std::string spaces(indent_, ' ');
+ stream_ << spaces;
+ }
+ }
+
+ void Visit(const Node* node) {
+ Indent();
+ if (node->is_group()) {
+ Visit(static_cast<const GroupNode*>(node));
+ } else {
+ // Primitive
+ Visit(static_cast<const PrimitiveNode*>(node));
+ }
+ }
+
+ void Visit(const PrimitiveNode* node) {
+ PrintRepLevel(node->repetition(), stream_);
+ stream_ << " ";
+ PrintType(node, stream_);
+ stream_ << " field_id=" << node->field_id() << " " << node->name();
+ PrintConvertedType(node, stream_);
+ stream_ << ";" << std::endl;
+ }
+
+ void Visit(const GroupNode* node) {
+ PrintRepLevel(node->repetition(), stream_);
+ stream_ << " group "
+ << "field_id=" << node->field_id() << " " << node->name();
+ auto lt = node->converted_type();
+ auto la = node->logical_type();
+ if (la && la->is_valid() && !la->is_none()) {
+ stream_ << " (" << la->ToString() << ")";
+ } else if (lt != ConvertedType::NONE) {
+ stream_ << " (" << ConvertedTypeToString(lt) << ")";
+ }
+ stream_ << " {" << std::endl;
+
+ indent_ += indent_width_;
+ for (int i = 0; i < node->field_count(); ++i) {
+ node->field(i)->VisitConst(this);
+ }
+ indent_ -= indent_width_;
+ Indent();
+ stream_ << "}" << std::endl;
+ }
+
+ std::ostream& stream_;
+ int indent_;
+ int indent_width_;
+};
+
+void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) {
+ SchemaPrinter printer(stream, indent_width);
+ printer.Visit(schema);
+}
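+
+// Example output (illustration; the exact shape follows the Visit methods
+// above, with field_id=-1 printed for nodes without an assigned field id):
+//
+//   required group field_id=-1 schema {
+//     optional int32 field_id=-1 id;
+//     optional binary field_id=-1 name (String);
+//   }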
+
+} // namespace schema
+
+using schema::ColumnPath;
+using schema::GroupNode;
+using schema::Node;
+using schema::NodePtr;
+using schema::PrimitiveNode;
+
+void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
+ Init(NodePtr(schema.release()));
+}
+
+class SchemaUpdater : public Node::Visitor {
+ public:
+ explicit SchemaUpdater(const std::vector<ColumnOrder>& column_orders)
+ : column_orders_(column_orders), leaf_count_(0) {}
+
+ void Visit(Node* node) override {
+ if (node->is_group()) {
+ GroupNode* group_node = static_cast<GroupNode*>(node);
+ for (int i = 0; i < group_node->field_count(); ++i) {
+ group_node->field(i)->Visit(this);
+ }
+ } else { // leaf node
+ PrimitiveNode* leaf_node = static_cast<PrimitiveNode*>(node);
+ leaf_node->SetColumnOrder(column_orders_[leaf_count_++]);
+ }
+ }
+
+ private:
+ const std::vector<ColumnOrder>& column_orders_;
+ int leaf_count_;
+};
+
+void SchemaDescriptor::updateColumnOrders(const std::vector<ColumnOrder>& column_orders) {
+ if (static_cast<int>(column_orders.size()) != num_columns()) {
+ throw ParquetException("Malformed schema: not enough ColumnOrder values");
+ }
+ SchemaUpdater visitor(column_orders);
+ const_cast<GroupNode*>(group_node_)->Visit(&visitor);
+}
+
+void SchemaDescriptor::Init(NodePtr schema) {
+ schema_ = std::move(schema);
+
+ if (!schema_->is_group()) {
+ throw ParquetException("Must initialize with a schema group");
+ }
+
+ group_node_ = static_cast<const GroupNode*>(schema_.get());
+ leaves_.clear();
+
+ for (int i = 0; i < group_node_->field_count(); ++i) {
+ BuildTree(group_node_->field(i), 0, 0, group_node_->field(i));
+ }
+}
+
+bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
+ if (this->num_columns() != other.num_columns()) {
+ return false;
+ }
+
+ for (int i = 0; i < this->num_columns(); ++i) {
+ if (!this->Column(i)->Equals(*other.Column(i))) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
+ int16_t max_rep_level, const NodePtr& base) {
+ if (node->is_optional()) {
+ ++max_def_level;
+ } else if (node->is_repeated()) {
+ // Repeated fields add a definition level. This is used to distinguish
+ // between an empty list and a list with an item in it.
+ ++max_rep_level;
+ ++max_def_level;
+ }
+
+ // Now, walk the schema and create a ColumnDescriptor for each leaf node
+ if (node->is_group()) {
+ const GroupNode* group = static_cast<const GroupNode*>(node.get());
+ for (int i = 0; i < group->field_count(); ++i) {
+ BuildTree(group->field(i), max_def_level, max_rep_level, base);
+ }
+ } else {
+ node_to_leaf_index_[static_cast<const PrimitiveNode*>(node.get())] =
+ static_cast<int>(leaves_.size());
+
+ // Primitive node, append to leaves
+ leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
+ leaf_to_base_.emplace(static_cast<int>(leaves_.size()) - 1, base);
+ leaf_to_idx_.emplace(node->path()->ToDotString(),
+ static_cast<int>(leaves_.size()) - 1);
+ }
+}
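+
+// Worked example (illustration): for
+//
+//   optional group a {
+//     repeated group list {
+//       optional int32 item;
+//     }
+//   }
+//
+// BuildTree() gives "item" max_definition_level = 3 (the optional group,
+// the repeated group, and the optional leaf each add one) and
+// max_repetition_level = 1 (only the repeated group adds one).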
+
+int SchemaDescriptor::GetColumnIndex(const PrimitiveNode& node) const {
+ auto it = node_to_leaf_index_.find(&node);
+ if (it == node_to_leaf_index_.end()) {
+ return -1;
+ }
+ return it->second;
+}
+
+ColumnDescriptor::ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
+ int16_t max_repetition_level,
+ const SchemaDescriptor* schema_descr)
+ : node_(std::move(node)),
+ max_definition_level_(max_definition_level),
+ max_repetition_level_(max_repetition_level) {
+ if (!node_->is_primitive()) {
+ throw ParquetException("Must be a primitive type");
+ }
+ primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
+}
+
+bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
+ return primitive_node_->Equals(other.primitive_node_) &&
+ max_repetition_level() == other.max_repetition_level() &&
+ max_definition_level() == other.max_definition_level();
+}
+
+const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
+ DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
+ return &leaves_[i];
+}
+
+int SchemaDescriptor::ColumnIndex(const std::string& node_path) const {
+ auto search = leaf_to_idx_.find(node_path);
+ if (search == leaf_to_idx_.end()) {
+ // Not found
+ return -1;
+ }
+ return search->second;
+}
+
+int SchemaDescriptor::ColumnIndex(const Node& node) const {
+ auto search = leaf_to_idx_.equal_range(node.path()->ToDotString());
+ for (auto it = search.first; it != search.second; ++it) {
+ const int idx = it->second;
+ if (&node == Column(idx)->schema_node().get()) {
+ return idx;
+ }
+ }
+ return -1;
+}
+
+const schema::Node* SchemaDescriptor::GetColumnRoot(int i) const {
+ DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
+ return leaf_to_base_.find(i)->second.get();
+}
+
+bool SchemaDescriptor::HasRepeatedFields() const {
+ return group_node_->HasRepeatedFields();
+}
+
+std::string SchemaDescriptor::ToString() const {
+ std::ostringstream ss;
+ PrintSchema(schema_.get(), ss);
+ return ss.str();
+}
+
+std::string ColumnDescriptor::ToString() const {
+ std::ostringstream ss;
+ ss << "column descriptor = {" << std::endl
+ << " name: " << name() << "," << std::endl
+ << " path: " << path()->ToDotString() << "," << std::endl
+ << " physical_type: " << TypeToString(physical_type()) << "," << std::endl
+ << " converted_type: " << ConvertedTypeToString(converted_type()) << ","
+ << std::endl
+ << " logical_type: " << logical_type()->ToString() << "," << std::endl
+ << " max_definition_level: " << max_definition_level() << "," << std::endl
+ << " max_repetition_level: " << max_repetition_level() << "," << std::endl;
+
+ if (physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY) {
+ ss << " length: " << type_length() << "," << std::endl;
+ }
+
+ if (converted_type() == parquet::ConvertedType::DECIMAL) {
+ ss << " precision: " << type_precision() << "," << std::endl
+ << " scale: " << type_scale() << "," << std::endl;
+ }
+
+ ss << "}";
+ return ss.str();
+}
+
+int ColumnDescriptor::type_scale() const {
+ return primitive_node_->decimal_metadata().scale;
+}
+
+int ColumnDescriptor::type_precision() const {
+ return primitive_node_->decimal_metadata().precision;
+}
+
+int ColumnDescriptor::type_length() const { return primitive_node_->type_length(); }
+
+const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
+ return primitive_node_->path();
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/schema.h b/contrib/libs/apache/arrow/cpp/src/parquet/schema.h
new file mode 100644
index 00000000000..7dcfa7d144e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/schema.h
@@ -0,0 +1,494 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module contains the logical parquet-cpp types (independent of Thrift
+// structures), schema nodes, and related type tools
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class SchemaDescriptor;
+
+namespace schema {
+
+class Node;
+
+// List encodings: using the terminology from Impala to define different styles
+// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
+// the converted type named in the Parquet metadata is ConvertedType::LIST we
+// use that terminology here. It also helps distinguish from the *_ARRAY
+// primitive types.
+//
+// One-level encoding: Only allows required lists with required cells
+// repeated value_type name
+//
+// Two-level encoding: Enables optional lists with only required cells
+// <required/optional> group list
+// repeated value_type item
+//
+// Three-level encoding: Enables optional lists with optional cells
+// <required/optional> group bag
+// repeated group list
+// <required/optional> value_type item
+//
+// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
+// the non-repeated nodes set to required.
+//
+// The "official" encoding recommended in the Parquet spec is the 3-level, and
+// we use that as the default when creating list types. For semantic completeness
+// we allow the other two. Since all types of encodings will occur "in the
+// wild" we need to be able to interpret the associated definition levels in
+// the context of the actual encoding used in the file.
+//
+// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
+// SchemaElement, which could make things challenging if we are trying to infer
+// that a sequence of nodes semantically represents an array according to one
+// of these encodings (versus a struct containing an array). We should refuse
+// the temptation to guess, as they say.
+struct ListEncoding {
+ enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL };
+};
+
+class PARQUET_EXPORT ColumnPath {
+ public:
+ ColumnPath() : path_() {}
+ explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
+ explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
+
+ static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
+ static std::shared_ptr<ColumnPath> FromNode(const Node& node);
+
+ std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
+ std::string ToDotString() const;
+ const std::vector<std::string>& ToDotVector() const;
+
+ protected:
+ std::vector<std::string> path_;
+};
+
+// Base class for logical schema types. A type has a name, repetition level,
+// and optionally a logical type (ConvertedType in Parquet metadata parlance)
+class PARQUET_EXPORT Node {
+ public:
+ enum type { PRIMITIVE, GROUP };
+
+ virtual ~Node() {}
+
+ bool is_primitive() const { return type_ == Node::PRIMITIVE; }
+
+ bool is_group() const { return type_ == Node::GROUP; }
+
+ bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
+
+ bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
+
+ bool is_required() const { return repetition_ == Repetition::REQUIRED; }
+
+ virtual bool Equals(const Node* other) const = 0;
+
+ const std::string& name() const { return name_; }
+
+ Node::type node_type() const { return type_; }
+
+ Repetition::type repetition() const { return repetition_; }
+
+ ConvertedType::type converted_type() const { return converted_type_; }
+
+ const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
+
+ /// \brief The field_id value for the serialized SchemaElement. If the
+ /// field_id is less than 0 (e.g. -1), it will not be set when serialized to
+ /// Thrift.
+ int field_id() const { return field_id_; }
+
+ PARQUET_DEPRECATED("id() is deprecated. Use field_id() instead")
+ int id() const { return field_id_; }
+
+ const Node* parent() const { return parent_; }
+
+ const std::shared_ptr<ColumnPath> path() const;
+
+ virtual void ToParquet(void* element) const = 0;
+
+ // Node::Visitor abstract class for walking schemas with the visitor pattern
+ class Visitor {
+ public:
+ virtual ~Visitor() {}
+
+ virtual void Visit(Node* node) = 0;
+ };
+ class ConstVisitor {
+ public:
+ virtual ~ConstVisitor() {}
+
+ virtual void Visit(const Node* node) = 0;
+ };
+
+ virtual void Visit(Visitor* visitor) = 0;
+ virtual void VisitConst(ConstVisitor* visitor) const = 0;
+
+ protected:
+ friend class GroupNode;
+
+ Node(Node::type type, const std::string& name, Repetition::type repetition,
+ ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
+ : type_(type),
+ name_(name),
+ repetition_(repetition),
+ converted_type_(converted_type),
+ field_id_(field_id),
+ parent_(NULLPTR) {}
+
+ Node(Node::type type, const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
+ : type_(type),
+ name_(name),
+ repetition_(repetition),
+ logical_type_(std::move(logical_type)),
+ field_id_(field_id),
+ parent_(NULLPTR) {}
+
+ Node::type type_;
+ std::string name_;
+ Repetition::type repetition_;
+ ConvertedType::type converted_type_;
+ std::shared_ptr<const LogicalType> logical_type_;
+ int field_id_;
+ // Nodes should not be shared, they have a single parent.
+ const Node* parent_;
+
+ bool EqualsInternal(const Node* other) const;
+ void SetParent(const Node* p_parent);
+
+ private:
+ PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
+};
+
+// Save our breath all over the place with these typedefs
+typedef std::shared_ptr<Node> NodePtr;
+typedef std::vector<NodePtr> NodeVector;
+
+// A type that is one of the primitive Parquet storage types. In addition to
+// the other type metadata (name, repetition level, logical type), it also
+// carries the physical storage type and its type-specific metadata (byte
+// width, decimal parameters)
+class PARQUET_EXPORT PrimitiveNode : public Node {
+ public:
+ static std::unique_ptr<Node> FromParquet(const void* opaque_element);
+
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ Type::type type,
+ ConvertedType::type converted_type = ConvertedType::NONE,
+ int length = -1, int precision = -1, int scale = -1,
+ int field_id = -1) {
+ return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length,
+ precision, scale, field_id));
+ }
+
+ // If no logical type, pass LogicalType::None() or nullptr
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type,
+ Type::type primitive_type, int primitive_length = -1,
+ int field_id = -1) {
+ return NodePtr(new PrimitiveNode(name, repetition, logical_type, primitive_type,
+ primitive_length, field_id));
+ }
+
+ bool Equals(const Node* other) const override;
+
+ Type::type physical_type() const { return physical_type_; }
+
+ ColumnOrder column_order() const { return column_order_; }
+
+ void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
+
+ int32_t type_length() const { return type_length_; }
+
+ const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
+
+ void ToParquet(void* element) const override;
+ void Visit(Visitor* visitor) override;
+ void VisitConst(ConstVisitor* visitor) const override;
+
+ private:
+ PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
+ ConvertedType::type converted_type = ConvertedType::NONE, int length = -1,
+ int precision = -1, int scale = -1, int field_id = -1);
+
+ PrimitiveNode(const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type,
+ Type::type primitive_type, int primitive_length = -1, int field_id = -1);
+
+ Type::type physical_type_;
+ int32_t type_length_;
+ DecimalMetadata decimal_metadata_;
+ ColumnOrder column_order_;
+
+ // For FIXED_LEN_BYTE_ARRAY
+ void SetTypeLength(int32_t length) { type_length_ = length; }
+
+ bool EqualsInternal(const PrimitiveNode* other) const;
+
+ FRIEND_TEST(TestPrimitiveNode, Attrs);
+ FRIEND_TEST(TestPrimitiveNode, Equals);
+ FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
+ FRIEND_TEST(TestPrimitiveNode, FromParquet);
+};
+
+class PARQUET_EXPORT GroupNode : public Node {
+ public:
+ static std::unique_ptr<Node> FromParquet(const void* opaque_element,
+ NodeVector fields = {});
+
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ ConvertedType::type converted_type = ConvertedType::NONE,
+ int field_id = -1) {
+ return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id));
+ }
+
+ // If no logical type, pass nullptr
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ std::shared_ptr<const LogicalType> logical_type,
+ int field_id = -1) {
+ return NodePtr(new GroupNode(name, repetition, fields, logical_type, field_id));
+ }
+
+ bool Equals(const Node* other) const override;
+
+ NodePtr field(int i) const { return fields_[i]; }
+ // Get the index of a field by its name, or negative value if not found.
+ // If several fields share the same name, it is unspecified which one
+ // is returned.
+ int FieldIndex(const std::string& name) const;
+ // Get the index of a field by its node, or negative value if not found.
+ int FieldIndex(const Node& node) const;
+
+ int field_count() const { return static_cast<int>(fields_.size()); }
+
+ void ToParquet(void* element) const override;
+ void Visit(Visitor* visitor) override;
+ void VisitConst(ConstVisitor* visitor) const override;
+
+ /// \brief Return true if this node or any child node has REPEATED repetition
+ /// type
+ bool HasRepeatedFields() const;
+
+ private:
+ GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1);
+
+ GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
+ int field_id = -1);
+
+ NodeVector fields_;
+ bool EqualsInternal(const GroupNode* other) const;
+
+ // Mapping from field name to field index (a multimap, since several
+ // fields may share the same name)
+ std::unordered_multimap<std::string, int> field_name_to_idx_;
+
+ FRIEND_TEST(TestGroupNode, Attrs);
+ FRIEND_TEST(TestGroupNode, Equals);
+ FRIEND_TEST(TestGroupNode, FieldIndex);
+ FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
+};
+
+// ----------------------------------------------------------------------
+// Convenience primitive type factory functions
+
+#define PRIMITIVE_FACTORY(FuncName, TYPE) \
+ static inline NodePtr FuncName(const std::string& name, \
+ Repetition::type repetition = Repetition::OPTIONAL, \
+ int field_id = -1) { \
+ return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE, \
+ /*length=*/-1, /*precision=*/-1, /*scale=*/-1, field_id); \
+ }
+
+PRIMITIVE_FACTORY(Boolean, BOOLEAN)
+PRIMITIVE_FACTORY(Int32, INT32)
+PRIMITIVE_FACTORY(Int64, INT64)
+PRIMITIVE_FACTORY(Int96, INT96)
+PRIMITIVE_FACTORY(Float, FLOAT)
+PRIMITIVE_FACTORY(Double, DOUBLE)
+PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)
+
+void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
+ int indent_width = 2);
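+
+// Usage sketch (illustrative; column names are arbitrary): assembling a
+// schema by hand with the factory functions above and printing it:
+//
+//   schema::NodeVector fields = {
+//       schema::Int64("id", Repetition::REQUIRED),
+//       schema::ByteArray("name", Repetition::OPTIONAL)};
+//   schema::NodePtr root = schema::GroupNode::Make(
+//       "schema", Repetition::REQUIRED, fields);
+//   schema::PrintSchema(root.get(), std::cout);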
+
+} // namespace schema
+
+// The ColumnDescriptor encapsulates information necessary to interpret
+// primitive column data in the context of a particular schema. We have to
+// examine the node structure of a column's path to the root in the schema tree
+// to be able to reassemble the nested structure from the repetition and
+// definition levels.
+class PARQUET_EXPORT ColumnDescriptor {
+ public:
+ ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
+ int16_t max_repetition_level,
+ const SchemaDescriptor* schema_descr = NULLPTR);
+
+ bool Equals(const ColumnDescriptor& other) const;
+
+ int16_t max_definition_level() const { return max_definition_level_; }
+
+ int16_t max_repetition_level() const { return max_repetition_level_; }
+
+ Type::type physical_type() const { return primitive_node_->physical_type(); }
+
+ ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }
+
+ const std::shared_ptr<const LogicalType>& logical_type() const {
+ return primitive_node_->logical_type();
+ }
+
+ ColumnOrder column_order() const { return primitive_node_->column_order(); }
+
+ SortOrder::type sort_order() const {
+ auto la = logical_type();
+ auto pt = physical_type();
+ return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
+ }
+
+ const std::string& name() const { return primitive_node_->name(); }
+
+ const std::shared_ptr<schema::ColumnPath> path() const;
+
+ const schema::NodePtr& schema_node() const { return node_; }
+
+ std::string ToString() const;
+
+ int type_length() const;
+
+ int type_precision() const;
+
+ int type_scale() const;
+
+ private:
+ schema::NodePtr node_;
+ const schema::PrimitiveNode* primitive_node_;
+
+ int16_t max_definition_level_;
+ int16_t max_repetition_level_;
+};
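+
+// Illustrative example (not part of the API): for a leaf "a.b" where group
+// "a" and column "b" are both OPTIONAL and neither is REPEATED,
+// max_definition_level() is 2 and max_repetition_level() is 0; a REPEATED
+// node on the path would raise both maximum levels by one.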
+
+// Container for the converted Parquet schema, along with information
+// computed from the schema analysis that is needed for file reading:
+//
+// * Column index to Node
+// * Max repetition / definition levels for each primitive node
+//
+// The ColumnDescriptor objects produced by this class can be used to assist in
+// the reconstruction of fully materialized data structures from the
+// repetition-definition level encoding of nested data
+//
+// TODO(wesm): this object can be recomputed from a Schema
+class PARQUET_EXPORT SchemaDescriptor {
+ public:
+ SchemaDescriptor() {}
+ ~SchemaDescriptor() {}
+
+ // Analyze the schema
+ void Init(std::unique_ptr<schema::Node> schema);
+ void Init(schema::NodePtr schema);
+
+ const ColumnDescriptor* Column(int i) const;
+
+ // Get the index of a column by its dotstring path, or negative value if not found.
+ // If several columns share the same dotstring path, it is unspecified which one
+ // is returned.
+ int ColumnIndex(const std::string& node_path) const;
+ // Get the index of a column by its node, or negative value if not found.
+ int ColumnIndex(const schema::Node& node) const;
+
+ bool Equals(const SchemaDescriptor& other) const;
+
+ // The number of physical columns appearing in the file
+ int num_columns() const { return static_cast<int>(leaves_.size()); }
+
+ const schema::NodePtr& schema_root() const { return schema_; }
+
+ const schema::GroupNode* group_node() const { return group_node_; }
+
+ // Returns the root node (a child of the schema root) of the leaf (column) node at index i
+ const schema::Node* GetColumnRoot(int i) const;
+
+ const std::string& name() const { return group_node_->name(); }
+
+ std::string ToString() const;
+
+ void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
+
+ /// \brief Return column index corresponding to a particular
+ /// PrimitiveNode. Returns -1 if not found
+ int GetColumnIndex(const schema::PrimitiveNode& node) const;
+
+ /// \brief Return true if any field or their children have REPEATED repetition
+ /// type
+ bool HasRepeatedFields() const;
+
+ private:
+ friend class ColumnDescriptor;
+
+ // Root Node
+ schema::NodePtr schema_;
+ // Root node, cast to the GroupNode class (equivalent to schema_.get())
+ const schema::GroupNode* group_node_;
+
+ void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
+ int16_t max_rep_level, const schema::NodePtr& base);
+
+ // Result of leaf node / tree analysis
+ std::vector<ColumnDescriptor> leaves_;
+
+ std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;
+
+ // Mapping from leaf node index to the root column node of that leaf (the
+ // first node below the schema's root group)
+ //
+ // For example, the leaf `a.b.c.d` would have a link back to `a`
+ //
+ // -- a <------
+ // -- -- b |
+ // -- -- -- c |
+ // -- -- -- -- d
+ std::unordered_map<int, schema::NodePtr> leaf_to_base_;
+
+ // Mapping from ColumnPath dotstring to leaf index
+ std::unordered_multimap<std::string, int> leaf_to_idx_;
+};
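+
+// Usage sketch (illustrative): analyzing a schema and walking its leaf
+// columns, where `root` is a schema::NodePtr such as one built with
+// schema::GroupNode::Make:
+//
+//   SchemaDescriptor descr;
+//   descr.Init(root);
+//   for (int i = 0; i < descr.num_columns(); ++i) {
+//     const ColumnDescriptor* col = descr.Column(i);
+//     // e.g. col->name(), col->physical_type(), col->max_definition_level()
+//   }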
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h
new file mode 100644
index 00000000000..c0cfffc87e2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Non-public Thrift schema serialization utilities
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+namespace format {
+class SchemaElement;
+}
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// Conversion from Parquet Thrift metadata
+
+PARQUET_EXPORT
+std::shared_ptr<SchemaDescriptor> FromParquet(
+ const std::vector<format::SchemaElement>& schema);
+
+PARQUET_EXPORT
+std::unique_ptr<Node> Unflatten(const format::SchemaElement* elements, int length);
+
+// ----------------------------------------------------------------------
+// Conversion to Parquet Thrift metadata
+
+PARQUET_EXPORT
+void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out);
+
+} // namespace schema
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc
new file mode 100644
index 00000000000..72341590e75
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc
@@ -0,0 +1,885 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/statistics.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/encoding.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+using arrow::default_memory_pool;
+using arrow::MemoryPool;
+using arrow::internal::checked_cast;
+using arrow::util::SafeCopy;
+
+namespace parquet {
+namespace {
+
+// ----------------------------------------------------------------------
+// Comparator implementations
+
+constexpr int value_length(int /*type_length*/, const ByteArray& value) { return value.len; }
+constexpr int value_length(int type_length, const FLBA& value) { return type_length; }
+
+template <typename DType, bool is_signed>
+struct CompareHelper {
+ using T = typename DType::c_type;
+
+ static_assert(!std::is_unsigned<T>::value || std::is_same<T, bool>::value,
+ "T is an unsigned numeric");
+
+ constexpr static T DefaultMin() { return std::numeric_limits<T>::max(); }
+ constexpr static T DefaultMax() { return std::numeric_limits<T>::lowest(); }
+
+ // MSVC 2017 workaround: std::isnan is not overloaded for integral types,
+ // even though the C++11 standard requires it.
+ template <typename T1 = T>
+ static ::arrow::enable_if_t<std::is_floating_point<T1>::value, T> Coalesce(T val,
+ T fallback) {
+ return std::isnan(val) ? fallback : val;
+ }
+
+ template <typename T1 = T>
+ static ::arrow::enable_if_t<!std::is_floating_point<T1>::value, T> Coalesce(
+ T val, T fallback) {
+ return val;
+ }
+
+ static inline bool Compare(int type_length, const T& a, const T& b) { return a < b; }
+
+ static T Min(int type_length, T a, T b) { return a < b ? a : b; }
+ static T Max(int type_length, T a, T b) { return a < b ? b : a; }
+};
+
+template <typename DType>
+struct UnsignedCompareHelperBase {
+ using T = typename DType::c_type;
+ using UCType = typename std::make_unsigned<T>::type;
+
+ static_assert(!std::is_same<T, UCType>::value, "T is unsigned");
+ static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size");
+
+ // NOTE: according to the C++ spec, unsigned-to-signed conversion is
+ // implementation-defined if the original value does not fit in the signed type
+ // (i.e., two's complement cannot be assumed even on mainstream machines,
+ // because the compiler may decide otherwise). Hence the use of `SafeCopy`
+ // below for deterministic bit-casting.
+ // (see "Integer conversions" in
+ // https://en.cppreference.com/w/cpp/language/implicit_conversion)
+
+ static const T DefaultMin() { return SafeCopy<T>(std::numeric_limits<UCType>::max()); }
+ static const T DefaultMax() { return 0; }
+
+ static T Coalesce(T val, T fallback) { return val; }
+
+ static bool Compare(int type_length, T a, T b) {
+ return SafeCopy<UCType>(a) < SafeCopy<UCType>(b);
+ }
+
+ static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; }
+ static T Max(int type_length, T a, T b) { return Compare(type_length, a, b) ? b : a; }
+};
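+
+// Illustrative: under unsigned ordering, int32_t{-1} is bit-cast to
+// 0xFFFFFFFF, so Compare(-1, 1) above is false, whereas the signed helper
+// returns true for the same operands.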
+
+template <>
+struct CompareHelper<Int32Type, false> : public UnsignedCompareHelperBase<Int32Type> {};
+
+template <>
+struct CompareHelper<Int64Type, false> : public UnsignedCompareHelperBase<Int64Type> {};
+
+template <bool is_signed>
+struct CompareHelper<Int96Type, is_signed> {
+ using T = typename Int96Type::c_type;
+ using msb_type = typename std::conditional<is_signed, int32_t, uint32_t>::type;
+
+ static T DefaultMin() {
+ uint32_t kMsbMax = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::max());
+ uint32_t kMax = std::numeric_limits<uint32_t>::max();
+ return {kMax, kMax, kMsbMax};
+ }
+ static T DefaultMax() {
+ uint32_t kMsbMin = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::min());
+ uint32_t kMin = std::numeric_limits<uint32_t>::min();
+ return {kMin, kMin, kMsbMin};
+ }
+ static T Coalesce(T val, T fallback) { return val; }
+
+ static inline bool Compare(int type_length, const T& a, const T& b) {
+ if (a.value[2] != b.value[2]) {
+ // Only the most significant of the three 32-bit words is compared as
+ // msb_type (signed or unsigned, per the sort order). For little-endian,
+ // this is the last word of the Int96 value.
+ return SafeCopy<msb_type>(a.value[2]) < SafeCopy<msb_type>(b.value[2]);
+ } else if (a.value[1] != b.value[1]) {
+ return (a.value[1] < b.value[1]);
+ }
+ return (a.value[0] < b.value[0]);
+ }
+
+ static T Min(int type_length, const T& a, const T& b) {
+ return Compare(0, a, b) ? a : b;
+ }
+ static T Max(int type_length, const T& a, const T& b) {
+ return Compare(0, a, b) ? b : a;
+ }
+};
+
+template <typename T, bool is_signed>
+struct BinaryLikeComparer {};
+
+template <typename T>
+struct BinaryLikeComparer<T, /*is_signed=*/false> {
+ static bool Compare(int type_length, const T& a, const T& b) {
+ int a_length = value_length(type_length, a);
+ int b_length = value_length(type_length, b);
+ // Unsigned comparison is used for non-numeric types, so a straight
+ // lexicographic comparison is correct (a.ptr is always unsigned bytes).
+ return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr + b_length);
+ }
+};
+
+template <typename T>
+struct BinaryLikeComparer<T, /*is_signed=*/true> {
+ static bool Compare(int type_length, const T& a, const T& b) {
+ // Signed comparison is used for integers encoded as big-endian two's
+ // complement (e.g. DECIMAL values).
+ int a_length = value_length(type_length, a);
+ int b_length = value_length(type_length, b);
+
+ // At least one of the lengths is zero: an empty value compares less
+ // than any non-empty value.
+ if (a_length == 0 || b_length == 0) {
+ return a_length == 0 && b_length > 0;
+ }
+
+ int8_t first_a = *a.ptr;
+ int8_t first_b = *b.ptr;
+ // We can short-circuit when the signs differ, or when the byte arrays
+ // have equal length but different first bytes. The equal-length
+ // requirement is necessary because of sign extension: 0xFF80 must
+ // compare equal to 0x80 (big-endian sign extension).
+ if ((0x80 & first_a) != (0x80 & first_b) ||
+ (a_length == b_length && first_a != first_b)) {
+ return first_a < first_b;
+ }
+ // When the lengths are unequal and the numbers are of the same
+ // sign we need to do comparison by sign extending the shorter
+ // value first, and once we get to equal sized arrays, lexicographical
+ // unsigned comparison of everything but the first byte is sufficient.
+ const uint8_t* a_start = a.ptr;
+ const uint8_t* b_start = b.ptr;
+ if (a_length != b_length) {
+ const uint8_t* lead_start = nullptr;
+ const uint8_t* lead_end = nullptr;
+ if (a_length > b_length) {
+ int lead_length = a_length - b_length;
+ lead_start = a.ptr;
+ lead_end = a.ptr + lead_length;
+ a_start += lead_length;
+ } else {
+ DCHECK_LT(a_length, b_length);
+ int lead_length = b_length - a_length;
+ lead_start = b.ptr;
+ lead_end = b.ptr + lead_length;
+ b_start += lead_length;
+ }
+ // Compare extra bytes to the sign extension of the first
+ // byte of the other number.
+ uint8_t extension = first_a < 0 ? 0xFF : 0;
+ bool not_equal = std::any_of(lead_start, lead_end,
+ [extension](uint8_t a) { return extension != a; });
+ if (not_equal) {
+ // Since sign-extension bytes are extrema values for unsigned bytes
+ // (0xFF or 0x00), four cases exist:
+ //
+ //   negative values:
+ //     if b is the longer value:
+ //       b must be the lesser value: return false
+ //     else:
+ //       a must be the lesser value: return true
+ //
+ //   positive values:
+ //     if b is the longer value:
+ //       values in b must be greater than a: return true
+ //     else:
+ //       values in a must be greater than b: return false
+ bool negative_values = first_a < 0;
+ bool b_longer = a_length < b_length;
+ return negative_values != b_longer;
+ }
+ } else {
+ a_start++;
+ b_start++;
+ }
+ return std::lexicographical_compare(a_start, a.ptr + a_length, b_start,
+ b.ptr + b_length);
+ }
+};
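+
+// Illustrative behavior of the signed comparer above, where values are
+// big-endian two's complement byte arrays:
+//   Compare({0x80}, {0xFF, 0x80}) -> false (both equal -128 after sign
+//                                           extension)
+//   Compare({0x10}, {0x01, 0x10}) -> true  (16 < 272)
+//   Compare({0xFF}, {0x01})       -> true  (-1 < 1)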
+
+template <typename DType, bool is_signed>
+struct BinaryLikeCompareHelperBase {
+ using T = typename DType::c_type;
+
+ static T DefaultMin() { return {}; }
+ static T DefaultMax() { return {}; }
+ static T Coalesce(T val, T fallback) { return val; }
+
+ static inline bool Compare(int type_length, const T& a, const T& b) {
+ return BinaryLikeComparer<T, is_signed>::Compare(type_length, a, b);
+ }
+ static T Min(int type_length, const T& a, const T& b) {
+ if (a.ptr == nullptr) return b;
+ if (b.ptr == nullptr) return a;
+ return Compare(type_length, a, b) ? a : b;
+ }
+
+ static T Max(int type_length, const T& a, const T& b) {
+ if (a.ptr == nullptr) return b;
+ if (b.ptr == nullptr) return a;
+ return Compare(type_length, a, b) ? b : a;
+ }
+};
+
+template <bool is_signed>
+struct CompareHelper<ByteArrayType, is_signed>
+ : public BinaryLikeCompareHelperBase<ByteArrayType, is_signed> {};
+
+template <bool is_signed>
+struct CompareHelper<FLBAType, is_signed>
+ : public BinaryLikeCompareHelperBase<FLBAType, is_signed> {};
+
+using ::arrow::util::optional;
+
+template <typename T>
+::arrow::enable_if_t<std::is_integral<T>::value, optional<std::pair<T, T>>>
+CleanStatistic(std::pair<T, T> min_max) {
+ return min_max;
+}
+
+// In case of floating point types, the following rules are applied (as per
+// upstream parquet-mr):
+// - If any of min/max is NaN, return nothing.
+// - If min is 0.0f, replace with -0.0f
+// - If max is -0.0f, replace with 0.0f
+template <typename T>
+::arrow::enable_if_t<std::is_floating_point<T>::value, optional<std::pair<T, T>>>
+CleanStatistic(std::pair<T, T> min_max) {
+ T min = min_max.first;
+ T max = min_max.second;
+
+ // Ignore if either value is NaN.
+ if (std::isnan(min) || std::isnan(max)) {
+ return ::arrow::util::nullopt;
+ }
+
+ // min/max still at their sentinel defaults means every value was NaN.
+ if (min == std::numeric_limits<T>::max() && max == std::numeric_limits<T>::lowest()) {
+ return ::arrow::util::nullopt;
+ }
+
+ T zero{};
+
+ if (min == zero && !std::signbit(min)) {
+ min = -min;
+ }
+
+ if (max == zero && std::signbit(max)) {
+ max = -max;
+ }
+
+ return {{min, max}};
+}
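+
+// Illustrative: for doubles, CleanStatistic({0.0, 1.0}) yields {-0.0, 1.0},
+// CleanStatistic({-1.0, -0.0}) yields {-1.0, 0.0}, and any pair containing
+// a NaN yields nullopt.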
+
+optional<std::pair<FLBA, FLBA>> CleanStatistic(std::pair<FLBA, FLBA> min_max) {
+ if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
+ return ::arrow::util::nullopt;
+ }
+ return min_max;
+}
+
+optional<std::pair<ByteArray, ByteArray>> CleanStatistic(
+ std::pair<ByteArray, ByteArray> min_max) {
+ if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
+ return ::arrow::util::nullopt;
+ }
+ return min_max;
+}
+
+template <bool is_signed, typename DType>
+class TypedComparatorImpl : virtual public TypedComparator<DType> {
+ public:
+ using T = typename DType::c_type;
+ using Helper = CompareHelper<DType, is_signed>;
+
+ explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {}
+
+ bool CompareInline(const T& a, const T& b) const {
+ return Helper::Compare(type_length_, a, b);
+ }
+
+ bool Compare(const T& a, const T& b) override { return CompareInline(a, b); }
+
+ std::pair<T, T> GetMinMax(const T* values, int64_t length) override {
+ DCHECK_GT(length, 0);
+
+ T min = Helper::DefaultMin();
+ T max = Helper::DefaultMax();
+
+ for (int64_t i = 0; i < length; i++) {
+ auto val = values[i];
+ min = Helper::Min(type_length_, min, Helper::Coalesce(val, Helper::DefaultMin()));
+ max = Helper::Max(type_length_, max, Helper::Coalesce(val, Helper::DefaultMax()));
+ }
+
+ return {min, max};
+ }
+
+ std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ DCHECK_GT(length, 0);
+
+ T min = Helper::DefaultMin();
+ T max = Helper::DefaultMax();
+
+ ::arrow::internal::VisitSetBitRunsVoid(
+ valid_bits, valid_bits_offset, length, [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; i++) {
+ const auto val = values[i + position];
+ min = Helper::Min(type_length_, min,
+ Helper::Coalesce(val, Helper::DefaultMin()));
+ max = Helper::Max(type_length_, max,
+ Helper::Coalesce(val, Helper::DefaultMax()));
+ }
+ });
+
+ return {min, max};
+ }
+
+ std::pair<T, T> GetMinMax(const ::arrow::Array& values) override;
+
+ private:
+ int type_length_;
+};
+
+// ARROW-11675: A hand-written version of GetMinMax(), to work around
+// what looks like an MSVC code generation bug.
+// This does not seem to be required for GetMinMaxSpaced().
+template <>
+std::pair<int32_t, int32_t>
+TypedComparatorImpl</*is_signed=*/false, Int32Type>::GetMinMax(const int32_t* values,
+ int64_t length) {
+ DCHECK_GT(length, 0);
+
+ const uint32_t* unsigned_values = reinterpret_cast<const uint32_t*>(values);
+ uint32_t min = std::numeric_limits<uint32_t>::max();
+ uint32_t max = std::numeric_limits<uint32_t>::lowest();
+
+ for (int64_t i = 0; i < length; i++) {
+ const auto val = unsigned_values[i];
+ min = std::min<uint32_t>(min, val);
+ max = std::max<uint32_t>(max, val);
+ }
+
+ return {SafeCopy<int32_t>(min), SafeCopy<int32_t>(max)};
+}
+
+template <bool is_signed, typename DType>
+std::pair<typename DType::c_type, typename DType::c_type>
+TypedComparatorImpl<is_signed, DType>::GetMinMax(const ::arrow::Array& values) {
+ ParquetException::NYI(values.type()->ToString());
+}
+
+template <bool is_signed>
+std::pair<ByteArray, ByteArray> GetMinMaxBinaryHelper(
+ const TypedComparatorImpl<is_signed, ByteArrayType>& comparator,
+ const ::arrow::Array& values) {
+ using Helper = CompareHelper<ByteArrayType, is_signed>;
+
+ ByteArray min = Helper::DefaultMin();
+ ByteArray max = Helper::DefaultMax();
+ constexpr int type_length = -1;
+
+ const auto valid_func = [&](ByteArray val) {
+ min = Helper::Min(type_length, val, min);
+ max = Helper::Max(type_length, val, max);
+ };
+ const auto null_func = [&]() {};
+
+ if (::arrow::is_binary_like(values.type_id())) {
+ ::arrow::VisitArrayDataInline<::arrow::BinaryType>(
+ *values.data(), std::move(valid_func), std::move(null_func));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ ::arrow::VisitArrayDataInline<::arrow::LargeBinaryType>(
+ *values.data(), std::move(valid_func), std::move(null_func));
+ }
+
+ return {min, max};
+}
+
+template <>
+std::pair<ByteArray, ByteArray> TypedComparatorImpl<true, ByteArrayType>::GetMinMax(
+ const ::arrow::Array& values) {
+ return GetMinMaxBinaryHelper<true>(*this, values);
+}
+
+template <>
+std::pair<ByteArray, ByteArray> TypedComparatorImpl<false, ByteArrayType>::GetMinMax(
+ const ::arrow::Array& values) {
+ return GetMinMaxBinaryHelper<false>(*this, values);
+}
+
+template <typename DType>
+class TypedStatisticsImpl : public TypedStatistics<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
+ : descr_(descr),
+ pool_(pool),
+ min_buffer_(AllocateBuffer(pool_, 0)),
+ max_buffer_(AllocateBuffer(pool_, 0)) {
+ auto comp = Comparator::Make(descr);
+ comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
+ Reset();
+ has_null_count_ = true;
+ has_distinct_count_ = true;
+ }
+
+ TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count)
+ : pool_(default_memory_pool()),
+ min_buffer_(AllocateBuffer(pool_, 0)),
+ max_buffer_(AllocateBuffer(pool_, 0)) {
+ IncrementNumValues(num_values);
+ IncrementNullCount(null_count);
+ IncrementDistinctCount(distinct_count);
+
+ Copy(min, &min_, min_buffer_.get());
+ Copy(max, &max_, max_buffer_.get());
+ has_min_max_ = true;
+ }
+
+ TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values,
+ int64_t null_count, int64_t distinct_count, bool has_min_max,
+ bool has_null_count, bool has_distinct_count, MemoryPool* pool)
+ : TypedStatisticsImpl(descr, pool) {
+ IncrementNumValues(num_values);
+ if (has_null_count) {
+ IncrementNullCount(null_count);
+ } else {
+ has_null_count_ = false;
+ }
+ if (has_distinct_count) {
+ IncrementDistinctCount(distinct_count);
+ } else {
+ has_distinct_count_ = false;
+ }
+
+ if (!encoded_min.empty()) {
+ PlainDecode(encoded_min, &min_);
+ }
+ if (!encoded_max.empty()) {
+ PlainDecode(encoded_max, &max_);
+ }
+ has_min_max_ = has_min_max;
+ }
+
+ bool HasDistinctCount() const override { return has_distinct_count_; }
+ bool HasMinMax() const override { return has_min_max_; }
+ bool HasNullCount() const override { return has_null_count_; }
+
+ bool Equals(const Statistics& raw_other) const override {
+ if (physical_type() != raw_other.physical_type()) return false;
+
+ const auto& other = checked_cast<const TypedStatisticsImpl&>(raw_other);
+
+ if (has_min_max_ != other.has_min_max_) return false;
+
+ return (!has_min_max_ || MinMaxEqual(other)) && null_count() == other.null_count() &&
+ distinct_count() == other.distinct_count() &&
+ num_values() == other.num_values();
+ }
+
+ bool MinMaxEqual(const TypedStatisticsImpl& other) const;
+
+ void Reset() override {
+ ResetCounts();
+ has_min_max_ = false;
+ has_distinct_count_ = false;
+ has_null_count_ = false;
+ }
+
+ void SetMinMax(const T& arg_min, const T& arg_max) override {
+ SetMinMaxPair({arg_min, arg_max});
+ }
+
+ void Merge(const TypedStatistics<DType>& other) override {
+ this->num_values_ += other.num_values();
+ if (other.HasNullCount()) {
+ this->statistics_.null_count += other.null_count();
+ }
+ if (other.HasDistinctCount()) {
+ this->statistics_.distinct_count += other.distinct_count();
+ }
+ if (other.HasMinMax()) {
+ SetMinMax(other.min(), other.max());
+ }
+ }
+
+ void Update(const T* values, int64_t num_not_null, int64_t num_null) override;
+ void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_spaced,
+ int64_t num_not_null, int64_t num_null) override;
+
+ void Update(const ::arrow::Array& values) override {
+ IncrementNullCount(values.null_count());
+ IncrementNumValues(values.length() - values.null_count());
+
+ if (values.null_count() == values.length()) {
+ return;
+ }
+
+ SetMinMaxPair(comparator_->GetMinMax(values));
+ }
+
+ const T& min() const override { return min_; }
+
+ const T& max() const override { return max_; }
+
+ Type::type physical_type() const override { return descr_->physical_type(); }
+
+ const ColumnDescriptor* descr() const override { return descr_; }
+
+ std::string EncodeMin() const override {
+ std::string s;
+ if (HasMinMax()) this->PlainEncode(min_, &s);
+ return s;
+ }
+
+ std::string EncodeMax() const override {
+ std::string s;
+ if (HasMinMax()) this->PlainEncode(max_, &s);
+ return s;
+ }
+
+ EncodedStatistics Encode() override {
+ EncodedStatistics s;
+ if (HasMinMax()) {
+ s.set_min(this->EncodeMin());
+ s.set_max(this->EncodeMax());
+ }
+ if (HasNullCount()) {
+ s.set_null_count(this->null_count());
+ }
+ return s;
+ }
+
+ int64_t null_count() const override { return statistics_.null_count; }
+ int64_t distinct_count() const override { return statistics_.distinct_count; }
+ int64_t num_values() const override { return num_values_; }
+
+ private:
+ const ColumnDescriptor* descr_;
+ bool has_min_max_ = false;
+ bool has_null_count_ = false;
+ bool has_distinct_count_ = false;
+ T min_;
+ T max_;
+ ::arrow::MemoryPool* pool_;
+ int64_t num_values_ = 0;
+ EncodedStatistics statistics_;
+ std::shared_ptr<TypedComparator<DType>> comparator_;
+ std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
+
+ void PlainEncode(const T& src, std::string* dst) const;
+ void PlainDecode(const std::string& src, T* dst) const;
+
+ void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }
+
+ void IncrementNullCount(int64_t n) {
+ statistics_.null_count += n;
+ has_null_count_ = true;
+ }
+
+ void IncrementNumValues(int64_t n) { num_values_ += n; }
+
+ void IncrementDistinctCount(int64_t n) {
+ statistics_.distinct_count += n;
+ has_distinct_count_ = true;
+ }
+
+ void ResetCounts() {
+ this->statistics_.null_count = 0;
+ this->statistics_.distinct_count = 0;
+ this->num_values_ = 0;
+ }
+
+ void SetMinMaxPair(std::pair<T, T> min_max) {
+ // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN
+ auto maybe_min_max = CleanStatistic(min_max);
+ if (!maybe_min_max) return;
+
+ auto min = maybe_min_max.value().first;
+ auto max = maybe_min_max.value().second;
+
+ if (!has_min_max_) {
+ has_min_max_ = true;
+ Copy(min, &min_, min_buffer_.get());
+ Copy(max, &max_, max_buffer_.get());
+ } else {
+ Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, min_buffer_.get());
+ Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get());
+ }
+ }
+};
+
+template <>
+inline bool TypedStatisticsImpl<FLBAType>::MinMaxEqual(
+ const TypedStatisticsImpl<FLBAType>& other) const {
+ uint32_t len = descr_->type_length();
+ return std::memcmp(min_.ptr, other.min_.ptr, len) == 0 &&
+ std::memcmp(max_.ptr, other.max_.ptr, len) == 0;
+}
+
+template <typename DType>
+bool TypedStatisticsImpl<DType>::MinMaxEqual(
+ const TypedStatisticsImpl<DType>& other) const {
+ return min_ == other.min_ && max_ == other.max_;
+}
+
+template <>
+inline void TypedStatisticsImpl<FLBAType>::Copy(const FLBA& src, FLBA* dst,
+ ResizableBuffer* buffer) {
+ if (dst->ptr == src.ptr) return;
+ uint32_t len = descr_->type_length();
+ PARQUET_THROW_NOT_OK(buffer->Resize(len, false));
+ std::memcpy(buffer->mutable_data(), src.ptr, len);
+ *dst = FLBA(buffer->data());
+}
+
+template <>
+inline void TypedStatisticsImpl<ByteArrayType>::Copy(const ByteArray& src, ByteArray* dst,
+ ResizableBuffer* buffer) {
+ if (dst->ptr == src.ptr) return;
+ PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false));
+ std::memcpy(buffer->mutable_data(), src.ptr, src.len);
+ *dst = ByteArray(src.len, buffer->data());
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_not_null,
+ int64_t num_null) {
+ DCHECK_GE(num_not_null, 0);
+ DCHECK_GE(num_null, 0);
+
+ IncrementNullCount(num_null);
+ IncrementNumValues(num_not_null);
+
+ if (num_not_null == 0) return;
+ SetMinMaxPair(comparator_->GetMinMax(values, num_not_null));
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::UpdateSpaced(const T* values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ int64_t num_not_null, int64_t num_null) {
+ DCHECK_GE(num_not_null, 0);
+ DCHECK_GE(num_null, 0);
+
+ IncrementNullCount(num_null);
+ IncrementNumValues(num_not_null);
+
+ if (num_not_null == 0) return;
+
+ int64_t length = num_null + num_not_null;
+ SetMinMaxPair(
+ comparator_->GetMinMaxSpaced(values, length, valid_bits, valid_bits_offset));
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) const {
+ auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, pool_);
+ encoder->Put(&src, 1);
+ auto buffer = encoder->FlushValues();
+ auto ptr = reinterpret_cast<const char*>(buffer->data());
+ dst->assign(ptr, buffer->size());
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::PlainDecode(const std::string& src, T* dst) const {
+ auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
+ decoder->SetData(1, reinterpret_cast<const uint8_t*>(src.c_str()),
+ static_cast<int>(src.size()));
+ decoder->Decode(dst, 1);
+}
+
+template <>
+void TypedStatisticsImpl<ByteArrayType>::PlainEncode(const T& src,
+ std::string* dst) const {
+ dst->assign(reinterpret_cast<const char*>(src.ptr), src.len);
+}
+
+template <>
+void TypedStatisticsImpl<ByteArrayType>::PlainDecode(const std::string& src,
+ T* dst) const {
+ dst->len = static_cast<uint32_t>(src.size());
+ dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str());
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// Public factory functions
+
+std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
+ SortOrder::type sort_order,
+ int type_length) {
+ if (SortOrder::SIGNED == sort_order) {
+ switch (physical_type) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedComparatorImpl<true, BooleanType>>();
+ case Type::INT32:
+ return std::make_shared<TypedComparatorImpl<true, Int32Type>>();
+ case Type::INT64:
+ return std::make_shared<TypedComparatorImpl<true, Int64Type>>();
+ case Type::INT96:
+ return std::make_shared<TypedComparatorImpl<true, Int96Type>>();
+ case Type::FLOAT:
+ return std::make_shared<TypedComparatorImpl<true, FloatType>>();
+ case Type::DOUBLE:
+ return std::make_shared<TypedComparatorImpl<true, DoubleType>>();
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<true, ByteArrayType>>();
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<true, FLBAType>>(type_length);
+ default:
+ ParquetException::NYI("Signed Compare not implemented");
+ }
+ } else if (SortOrder::UNSIGNED == sort_order) {
+ switch (physical_type) {
+ case Type::INT32:
+ return std::make_shared<TypedComparatorImpl<false, Int32Type>>();
+ case Type::INT64:
+ return std::make_shared<TypedComparatorImpl<false, Int64Type>>();
+ case Type::INT96:
+ return std::make_shared<TypedComparatorImpl<false, Int96Type>>();
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<false, ByteArrayType>>();
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<false, FLBAType>>(type_length);
+ default:
+ ParquetException::NYI("Unsigned Compare not implemented");
+ }
+ } else {
+ throw ParquetException("UNKNOWN Sort Order");
+ }
+ return nullptr;
+}
+
+std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
+ return Make(descr->physical_type(), descr->sort_order(), descr->type_length());
+}
+
+std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool) {
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedStatisticsImpl<BooleanType>>(descr, pool);
+ case Type::INT32:
+ return std::make_shared<TypedStatisticsImpl<Int32Type>>(descr, pool);
+ case Type::INT64:
+ return std::make_shared<TypedStatisticsImpl<Int64Type>>(descr, pool);
+ case Type::FLOAT:
+ return std::make_shared<TypedStatisticsImpl<FloatType>>(descr, pool);
+ case Type::DOUBLE:
+ return std::make_shared<TypedStatisticsImpl<DoubleType>>(descr, pool);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedStatisticsImpl<ByteArrayType>>(descr, pool);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedStatisticsImpl<FLBAType>>(descr, pool);
+ default:
+ ParquetException::NYI("Statistics not implemented");
+ }
+}
+
+std::shared_ptr<Statistics> Statistics::Make(Type::type physical_type, const void* min,
+ const void* max, int64_t num_values,
+ int64_t null_count, int64_t distinct_count) {
+#define MAKE_STATS(CAP_TYPE, KLASS) \
+ case Type::CAP_TYPE: \
+ return std::make_shared<TypedStatisticsImpl<KLASS>>( \
+ *reinterpret_cast<const typename KLASS::c_type*>(min), \
+ *reinterpret_cast<const typename KLASS::c_type*>(max), num_values, null_count, \
+ distinct_count)
+
+ switch (physical_type) {
+ MAKE_STATS(BOOLEAN, BooleanType);
+ MAKE_STATS(INT32, Int32Type);
+ MAKE_STATS(INT64, Int64Type);
+ MAKE_STATS(FLOAT, FloatType);
+ MAKE_STATS(DOUBLE, DoubleType);
+ MAKE_STATS(BYTE_ARRAY, ByteArrayType);
+ MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
+ default:
+ break;
+ }
+#undef MAKE_STATS
+ DCHECK(false) << "Cannot reach here";
+ return nullptr;
+}
+
+std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
+ const std::string& encoded_min,
+ const std::string& encoded_max,
+ int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max,
+ bool has_null_count, bool has_distinct_count,
+ ::arrow::MemoryPool* pool) {
+#define MAKE_STATS(CAP_TYPE, KLASS) \
+ case Type::CAP_TYPE: \
+ return std::make_shared<TypedStatisticsImpl<KLASS>>( \
+ descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \
+ has_min_max, has_null_count, has_distinct_count, pool)
+
+ switch (descr->physical_type()) {
+ MAKE_STATS(BOOLEAN, BooleanType);
+ MAKE_STATS(INT32, Int32Type);
+ MAKE_STATS(INT64, Int64Type);
+ MAKE_STATS(FLOAT, FloatType);
+ MAKE_STATS(DOUBLE, DoubleType);
+ MAKE_STATS(BYTE_ARRAY, ByteArrayType);
+ MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
+ default:
+ break;
+ }
+#undef MAKE_STATS
+ DCHECK(false) << "Cannot reach here";
+ return nullptr;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h
new file mode 100644
index 00000000000..18f68f21b87
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h
@@ -0,0 +1,342 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+class BinaryArray;
+
+} // namespace arrow
+
+namespace parquet {
+
+class ColumnDescriptor;
+
+// ----------------------------------------------------------------------
+// Value comparator interfaces
+
+/// \brief Base class for value comparators. Generally used with
+/// TypedComparator<T>
+class PARQUET_EXPORT Comparator {
+ public:
+ virtual ~Comparator() {}
+
+ /// \brief Create a comparator explicitly from physical type and
+ /// sort order
+ /// \param[in] physical_type the physical type for the typed
+ /// comparator
+ /// \param[in] sort_order either SortOrder::SIGNED or
+ /// SortOrder::UNSIGNED
+ /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
+ static std::shared_ptr<Comparator> Make(Type::type physical_type,
+ SortOrder::type sort_order,
+ int type_length = -1);
+
+ /// \brief Create typed comparator inferring default sort order from
+ /// ColumnDescriptor
+ /// \param[in] descr the Parquet column schema
+ static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
+};
+
+/// \brief Interface for comparison of physical types according to the
+/// semantics of a particular logical type.
+template <typename DType>
+class TypedComparator : public Comparator {
+ public:
+ using T = typename DType::c_type;
+
+ /// \brief Scalar comparison of two elements; returns true if the first
+ /// is strictly less than the second
+ virtual bool Compare(const T& a, const T& b) = 0;
+
+ /// \brief Compute maximum and minimum elements in a batch of
+ /// elements without any nulls
+ virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0;
+
+ /// \brief Compute minimum and maximum elements from an Arrow array. Only
+ /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
+ /// / arrow::BinaryArray
+ virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0;
+
+ /// \brief Compute maximum and minimum elements in a batch of
+ /// elements with accompanying bitmap indicating which elements are
+ /// included (bit set) and excluded (bit not set)
+ ///
+ /// \param[in] values the sequence of values
+ /// \param[in] length the length of the sequence
+ /// \param[in] valid_bits a bitmap indicating which elements are
+ /// included (1) or excluded (0)
+ /// \param[in] valid_bits_offset the bit offset into the bitmap of
+ /// the first element in the sequence
+ virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) = 0;
+};
+
+/// \brief Typed version of Comparator::Make
+template <typename DType>
+std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
+ SortOrder::type sort_order,
+ int type_length = -1) {
+ return std::static_pointer_cast<TypedComparator<DType>>(
+ Comparator::Make(physical_type, sort_order, type_length));
+}
+
+/// \brief Typed version of Comparator::Make
+template <typename DType>
+std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
+ return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
+}
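+
+// Usage sketch (illustrative): unsigned (lexicographic) comparison of two
+// BYTE_ARRAY values, assuming ptr_a and ptr_b point at valid buffers:
+//
+//   auto comparator = MakeComparator<ByteArrayType>(Type::BYTE_ARRAY,
+//                                                   SortOrder::UNSIGNED);
+//   ByteArray a(3, ptr_a), b(5, ptr_b);
+//   bool a_less_than_b = comparator->Compare(a, b);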
+
+// ----------------------------------------------------------------------
+
+/// \brief Structure representing encoded statistics to be written to
+/// and read from Parquet serialized metadata
+class PARQUET_EXPORT EncodedStatistics {
+ std::shared_ptr<std::string> max_, min_;
+ bool is_signed_ = false;
+
+ public:
+ EncodedStatistics()
+ : max_(std::make_shared<std::string>()), min_(std::make_shared<std::string>()) {}
+
+ const std::string& max() const { return *max_; }
+ const std::string& min() const { return *min_; }
+
+ int64_t null_count = 0;
+ int64_t distinct_count = 0;
+
+ bool has_min = false;
+ bool has_max = false;
+ bool has_null_count = false;
+ bool has_distinct_count = false;
+
+ // From parquet-mr: drop stats larger than the max size rather than
+ // truncating them. The rationale is that some engines may use the minimum
+ // value in the page as the true minimum for aggregations, and there is no
+ // way to mark that a value has been truncated and is only a lower bound
+ // not actually present in the page.
+ void ApplyStatSizeLimits(size_t length) {
+ if (max_->length() > length) {
+ has_max = false;
+ }
+ if (min_->length() > length) {
+ has_min = false;
+ }
+ }
+
+ bool is_set() const {
+ return has_min || has_max || has_null_count || has_distinct_count;
+ }
+
+ bool is_signed() const { return is_signed_; }
+
+ void set_is_signed(bool is_signed) { is_signed_ = is_signed; }
+
+ EncodedStatistics& set_max(const std::string& value) {
+ *max_ = value;
+ has_max = true;
+ return *this;
+ }
+
+ EncodedStatistics& set_min(const std::string& value) {
+ *min_ = value;
+ has_min = true;
+ return *this;
+ }
+
+ EncodedStatistics& set_null_count(int64_t value) {
+ null_count = value;
+ has_null_count = true;
+ return *this;
+ }
+
+ EncodedStatistics& set_distinct_count(int64_t value) {
+ distinct_count = value;
+ has_distinct_count = true;
+ return *this;
+ }
+};
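+
+// Illustrative: a writer might call ApplyStatSizeLimits(4096) on an
+// EncodedStatistics instance before serialization, so that an oversized min
+// or max is dropped (has_min / has_max cleared) rather than truncated.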
+
+/// \brief Base type for computing column statistics while writing a file
+class PARQUET_EXPORT Statistics {
+ public:
+ virtual ~Statistics() {}
+
+ /// \brief Create a new statistics instance given a column schema
+ /// definition
+ /// \param[in] descr the column schema
+ /// \param[in] pool a memory pool to use for any memory allocations, optional
+ static std::shared_ptr<Statistics> Make(
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ /// \brief Create a new statistics instance given a column schema
+ /// definition and pre-existing state
+ /// \param[in] descr the column schema
+ /// \param[in] encoded_min the encoded minimum value
+ /// \param[in] encoded_max the encoded maximum value
+ /// \param[in] num_values total number of values
+ /// \param[in] null_count number of null values
+ /// \param[in] distinct_count number of distinct values
+ /// \param[in] has_min_max whether the min/max statistics are set
+ /// \param[in] has_null_count whether the null_count statistics are set
+ /// \param[in] has_distinct_count whether the distinct_count statistics are set
+ /// \param[in] pool a memory pool to use for any memory allocations, optional
+ static std::shared_ptr<Statistics> Make(
+ const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max, bool has_null_count,
+ bool has_distinct_count,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ /// \brief Return true if the count of null values is set
+ virtual bool HasNullCount() const = 0;
+
+ /// \brief The number of null values, may not be set
+ virtual int64_t null_count() const = 0;
+
+ /// \brief Return true if the count of distinct values is set
+ virtual bool HasDistinctCount() const = 0;
+
+ /// \brief The number of distinct values, may not be set
+ virtual int64_t distinct_count() const = 0;
+
+ /// \brief The total number of values in the column
+ virtual int64_t num_values() const = 0;
+
+ /// \brief Return true if the min and max statistics are set. Obtain
+ /// with TypedStatistics<T>::min and max
+ virtual bool HasMinMax() const = 0;
+
+ /// \brief Reset state of object to initial (no data observed) state
+ virtual void Reset() = 0;
+
+ /// \brief Plain-encoded minimum value
+ virtual std::string EncodeMin() const = 0;
+
+ /// \brief Plain-encoded maximum value
+ virtual std::string EncodeMax() const = 0;
+
+ /// \brief The finalized encoded form of the statistics for transport
+ virtual EncodedStatistics Encode() = 0;
+
+ /// \brief The physical type of the column schema
+ virtual Type::type physical_type() const = 0;
+
+ /// \brief The full type descriptor from the column schema
+ virtual const ColumnDescriptor* descr() const = 0;
+
+ /// \brief Check two Statistics for equality
+ virtual bool Equals(const Statistics& other) const = 0;
+
+ protected:
+ static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
+ const void* max, int64_t num_values,
+ int64_t null_count, int64_t distinct_count);
+};
+
+/// \brief A typed implementation of Statistics
+template <typename DType>
+class TypedStatistics : public Statistics {
+ public:
+ using T = typename DType::c_type;
+
+ /// \brief The current minimum value
+ virtual const T& min() const = 0;
+
+ /// \brief The current maximum value
+ virtual const T& max() const = 0;
+
+ /// \brief Update state with state of another Statistics object
+ virtual void Merge(const TypedStatistics<DType>& other) = 0;
+
+ /// \brief Batch statistics update
+ virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;
+
+ /// \brief Batch statistics update with supplied validity bitmap
+ virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, int64_t num_not_null,
+ int64_t num_null) = 0;
+
+ /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
+ /// conversion to a primitive Parquet C type. Only implemented for certain
+ /// Parquet type / Arrow type combinations like BYTE_ARRAY /
+ /// arrow::BinaryArray
+ virtual void Update(const ::arrow::Array& values) = 0;
+
+ /// \brief Set min and max values to particular values
+ virtual void SetMinMax(const T& min, const T& max) = 0;
+};
+
+using BoolStatistics = TypedStatistics<BooleanType>;
+using Int32Statistics = TypedStatistics<Int32Type>;
+using Int64Statistics = TypedStatistics<Int64Type>;
+using FloatStatistics = TypedStatistics<FloatType>;
+using DoubleStatistics = TypedStatistics<DoubleType>;
+using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
+using FLBAStatistics = TypedStatistics<FLBAType>;
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
+}
+
+/// \brief Create Statistics initialized to a particular state
+/// \param[in] min the minimum value
+/// \param[in] max the maximum value
+/// \param[in] num_values number of values
+/// \param[in] null_count number of null values
+/// \param[in] distinct_count number of distinct values
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
+ const typename DType::c_type& max,
+ int64_t num_values,
+ int64_t null_count,
+ int64_t distinct_count) {
+ return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
+ DType::type_num, &min, &max, num_values, null_count, distinct_count));
+}
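+
+// Usage sketch (illustrative): creating Int64 statistics from known values
+// and obtaining the plain-encoded minimum:
+//
+//   int64_t min = 0, max = 42;
+//   auto stats = MakeStatistics<Int64Type>(min, max, /*num_values=*/100,
+//                                          /*null_count=*/0,
+//                                          /*distinct_count=*/0);
+//   std::string encoded_min = stats->EncodeMin();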
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+ const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max, bool has_null_count,
+ bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
+ descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
+ has_min_max, has_null_count, has_distinct_count, pool));
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc
new file mode 100644
index 00000000000..9a7cc8cdf86
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc
@@ -0,0 +1,521 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/stream_reader.h"
+
+#include <set>
+#include <utility>
+
+namespace parquet {
+
+constexpr int64_t StreamReader::kBatchSizeOne;
+
+// The converted type expected by the stream reader does not always
+// exactly match the schema in the Parquet file. The following is a list
+// of converted types which are allowed instead of the expected one.
+// Each pair given is:
+//   {<StreamReader expected type>, <Parquet file converted type>}
+// For example, {ConvertedType::INT_32, ConvertedType::NONE} means that
+// if the StreamReader expects the converted type INT_32, it will also
+// accept a Parquet file using the converted type NONE.
+static const std::set<std::pair<ConvertedType::type, ConvertedType::type> >
+ converted_type_exceptions = {{ConvertedType::INT_32, ConvertedType::NONE},
+ {ConvertedType::INT_64, ConvertedType::NONE},
+ {ConvertedType::INT_32, ConvertedType::DECIMAL},
+ {ConvertedType::INT_64, ConvertedType::DECIMAL},
+ {ConvertedType::UTF8, ConvertedType::NONE}};
+
+StreamReader::StreamReader(std::unique_ptr<ParquetFileReader> reader)
+ : file_reader_{std::move(reader)}, eof_{false} {
+ file_metadata_ = file_reader_->metadata();
+
+ auto schema = file_metadata_->schema();
+ auto group_node = schema->group_node();
+
+ nodes_.resize(schema->num_columns());
+
+ for (auto i = 0; i < schema->num_columns(); ++i) {
+ nodes_[i] = std::static_pointer_cast<schema::PrimitiveNode>(group_node->field(i));
+ }
+ NextRowGroup();
+}
+
+int StreamReader::num_columns() const {
+ // Check for file metadata, i.e. that the object was not default-constructed.
+ if (file_metadata_) {
+ return file_metadata_->num_columns();
+ }
+ return 0;
+}
+
+int64_t StreamReader::num_rows() const {
+ // Check for file metadata, i.e. that the object was not default-constructed.
+ if (file_metadata_) {
+ return file_metadata_->num_rows();
+ }
+ return 0;
+}
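+
+// Usage sketch (illustrative; assumes an already-opened `infile` and the
+// EndRow manipulator declared alongside the stream reader/writer):
+//
+//   StreamReader stream{ParquetFileReader::Open(infile)};
+//   int32_t id;
+//   std::string name;
+//   while (!stream.eof()) {
+//     stream >> id >> name >> EndRow;
+//   }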
+
+StreamReader& StreamReader::operator>>(bool& v) {
+ CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
+ Read<BoolReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int8_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_8);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint8_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_8);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int16_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_16);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint16_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_16);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int32_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_32);
+ Read<Int32Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint32_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_32);
+ Read<Int32Reader>(reinterpret_cast<int32_t*>(&v));
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int64_t& v) {
+ CheckColumn(Type::INT64, ConvertedType::INT_64);
+ Read<Int64Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint64_t& v) {
+ CheckColumn(Type::INT64, ConvertedType::UINT_64);
+ Read<Int64Reader>(reinterpret_cast<int64_t*>(&v));
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(std::chrono::milliseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
+ int64_t tmp;
+ Read<Int64Reader>(&tmp);
+ v = std::chrono::milliseconds{tmp};
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(std::chrono::microseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
+ int64_t tmp;
+ Read<Int64Reader>(&tmp);
+ v = std::chrono::microseconds{tmp};
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(float& v) {
+ CheckColumn(Type::FLOAT, ConvertedType::NONE);
+ Read<FloatReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(double& v) {
+ CheckColumn(Type::DOUBLE, ConvertedType::NONE);
+ Read<DoubleReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(char& v) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, 1);
+ FixedLenByteArray flba;
+
+ Read(&flba);
+ v = static_cast<char>(flba.ptr[0]);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(std::string& v) {
+ CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
+ ByteArray ba;
+
+ Read(&ba);
+ v = std::string(reinterpret_cast<const char*>(ba.ptr), ba.len);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<bool>& v) {
+ CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
+ ReadOptional<BoolReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int8_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_8);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint8_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_8);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int16_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_16);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint16_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_16);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int32_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_32);
+ ReadOptional<Int32Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint32_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_32);
+ ReadOptional<Int32Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int64_t>& v) {
+ CheckColumn(Type::INT64, ConvertedType::INT_64);
+ ReadOptional<Int64Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint64_t>& v) {
+ CheckColumn(Type::INT64, ConvertedType::UINT_64);
+ ReadOptional<Int64Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<float>& v) {
+ CheckColumn(Type::FLOAT, ConvertedType::NONE);
+ ReadOptional<FloatReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<double>& v) {
+ CheckColumn(Type::DOUBLE, ConvertedType::NONE);
+ ReadOptional<DoubleReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<std::chrono::milliseconds>& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
+ ReadOptional<Int64Reader, int64_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<std::chrono::microseconds>& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
+ ReadOptional<Int64Reader, int64_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<char>& v) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, 1);
+ FixedLenByteArray flba;
+
+ if (ReadOptional(&flba)) {
+ v = static_cast<char>(flba.ptr[0]);
+ } else {
+ v.reset();
+ }
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<std::string>& v) {
+ CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
+ ByteArray ba;
+
+ if (ReadOptional(&ba)) {
+ v = std::string(reinterpret_cast<const char*>(ba.ptr), ba.len);
+ } else {
+ v.reset();
+ }
+ return *this;
+}
+
+void StreamReader::ReadFixedLength(char* ptr, int len) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, len);
+ FixedLenByteArray flba;
+ Read(&flba);
+ std::memcpy(ptr, flba.ptr, len);
+}
+
+void StreamReader::Read(ByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read != 1) {
+ ThrowReadFailedException(node);
+ }
+}
+
+bool StreamReader::ReadOptional(ByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
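+  // A present value yields values_read == 1; a null value yields
+  // values_read == 0 with a definition level of zero. Anything else
+  // indicates a read failure.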
+ if (values_read == 1) {
+ return true;
+ } else if ((values_read == 0) && (def_level == 0)) {
+ return false;
+ }
+ ThrowReadFailedException(node);
+}
+
+void StreamReader::Read(FixedLenByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader =
+ static_cast<FixedLenByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read != 1) {
+ ThrowReadFailedException(node);
+ }
+}
+
+bool StreamReader::ReadOptional(FixedLenByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader =
+ static_cast<FixedLenByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read == 1) {
+ return true;
+ } else if ((values_read == 0) && (def_level == 0)) {
+ return false;
+ }
+ ThrowReadFailedException(node);
+}
+
+void StreamReader::EndRow() {
+ if (!file_reader_) {
+ throw ParquetException("StreamReader not initialized");
+ }
+ if (static_cast<std::size_t>(column_index_) < nodes_.size()) {
+ throw ParquetException("Cannot end row with " + std::to_string(column_index_) +
+ " of " + std::to_string(nodes_.size()) + " columns read");
+ }
+ column_index_ = 0;
+ ++current_row_;
+
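+  // All columns in a row group hold the same number of rows, so
+  // checking the first column reader is sufficient.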
+ if (!column_readers_[0]->HasNext()) {
+ NextRowGroup();
+ }
+}
+
+void StreamReader::NextRowGroup() {
+  // Find the next non-empty row group.
+ while (row_group_index_ < file_metadata_->num_row_groups()) {
+ row_group_reader_ = file_reader_->RowGroup(row_group_index_);
+ ++row_group_index_;
+
+ column_readers_.resize(file_metadata_->num_columns());
+
+ for (int i = 0; i < file_metadata_->num_columns(); ++i) {
+ column_readers_[i] = row_group_reader_->Column(i);
+ }
+ if (column_readers_[0]->HasNext()) {
+ row_group_row_offset_ = current_row_;
+ return;
+ }
+ }
+ // No more row groups found.
+ SetEof();
+}
+
+void StreamReader::SetEof() {
+ // Do not reset file_metadata_ to ensure queries on the number of
+ // rows/columns still function.
+ eof_ = true;
+ file_reader_.reset();
+ row_group_reader_.reset();
+ column_readers_.clear();
+ nodes_.clear();
+}
+
+int64_t StreamReader::SkipRows(int64_t num_rows_to_skip) {
+ if (0 != column_index_) {
+ throw ParquetException("Must finish reading current row before skipping rows.");
+ }
+ int64_t num_rows_remaining_to_skip = num_rows_to_skip;
+
+ while (!eof_ && (num_rows_remaining_to_skip > 0)) {
+ int64_t num_rows_in_row_group = row_group_reader_->metadata()->num_rows();
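+    // row_group_row_offset_ is the global index of the first row in
+    // the current row group, so the difference below gives the rows
+    // of this row group which have not yet been read.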
+    int64_t num_rows_remaining_in_row_group =
+        num_rows_in_row_group - (current_row_ - row_group_row_offset_);
+
+ if (num_rows_remaining_in_row_group > num_rows_remaining_to_skip) {
+ for (auto reader : column_readers_) {
+ SkipRowsInColumn(reader.get(), num_rows_remaining_to_skip);
+ }
+ current_row_ += num_rows_remaining_to_skip;
+ num_rows_remaining_to_skip = 0;
+ } else {
+ num_rows_remaining_to_skip -= num_rows_remaining_in_row_group;
+ current_row_ += num_rows_remaining_in_row_group;
+ NextRowGroup();
+ }
+ }
+ return num_rows_to_skip - num_rows_remaining_to_skip;
+}
+
+int64_t StreamReader::SkipColumns(int64_t num_columns_to_skip) {
+ int64_t num_columns_skipped = 0;
+
+ if (!eof_) {
+ for (; (num_columns_to_skip > num_columns_skipped) &&
+ static_cast<std::size_t>(column_index_) < nodes_.size();
+ ++column_index_) {
+ SkipRowsInColumn(column_readers_[column_index_].get(), 1);
+ ++num_columns_skipped;
+ }
+ }
+ return num_columns_skipped;
+}
+
+void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip) {
+ int64_t num_skipped = 0;
+
+ switch (reader->type()) {
+ case Type::BOOLEAN:
+ num_skipped = static_cast<BoolReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::INT32:
+ num_skipped = static_cast<Int32Reader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::INT64:
+ num_skipped = static_cast<Int64Reader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::BYTE_ARRAY:
+ num_skipped = static_cast<ByteArrayReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ num_skipped = static_cast<FixedLenByteArrayReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::FLOAT:
+ num_skipped = static_cast<FloatReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::DOUBLE:
+ num_skipped = static_cast<DoubleReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::INT96:
+ num_skipped = static_cast<Int96Reader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::UNDEFINED:
+ throw ParquetException("Unexpected type: " + TypeToString(reader->type()));
+ break;
+ }
+ if (num_rows_to_skip != num_skipped) {
+ throw ParquetException("Skipped " + std::to_string(num_skipped) + "/" +
+ std::to_string(num_rows_to_skip) + " rows in column " +
+ reader->descr()->name());
+ }
+}
+
+void StreamReader::CheckColumn(Type::type physical_type,
+ ConvertedType::type converted_type, int length) {
+ if (static_cast<std::size_t>(column_index_) >= nodes_.size()) {
+ if (eof_) {
+ ParquetException::EofException();
+ }
+ throw ParquetException("Column index out-of-bounds. Index " +
+ std::to_string(column_index_) + " is invalid for " +
+ std::to_string(nodes_.size()) + " columns");
+ }
+ const auto& node = nodes_[column_index_];
+
+ if (physical_type != node->physical_type()) {
+ throw ParquetException("Column physical type mismatch. Column '" + node->name() +
+ "' has physical type '" + TypeToString(node->physical_type()) +
+ "' not '" + TypeToString(physical_type) + "'");
+ }
+ if (converted_type != node->converted_type()) {
+ // The converted type does not always match with the value
+ // provided so check the set of exceptions.
+ if (converted_type_exceptions.find({converted_type, node->converted_type()}) ==
+ converted_type_exceptions.end()) {
+ throw ParquetException("Column converted type mismatch. Column '" + node->name() +
+ "' has converted type '" +
+ ConvertedTypeToString(node->converted_type()) + "' not '" +
+ ConvertedTypeToString(converted_type) + "'");
+ }
+ }
+ // Length must be exact.
+ if (length != node->type_length()) {
+ throw ParquetException("Column length mismatch. Column '" + node->name() +
+ "' has length " + std::to_string(node->type_length()) +
+ "] not " + std::to_string(length));
+ }
+}
+
+void StreamReader::ThrowReadFailedException(
+ const std::shared_ptr<schema::PrimitiveNode>& node) {
+ throw ParquetException("Failed to read value for column '" + node->name() +
+ "' on row " + std::to_string(current_row_));
+}
+
+StreamReader& operator>>(StreamReader& os, EndRowType) {
+ os.EndRow();
+ return os;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h
new file mode 100644
index 00000000000..806b0e8ad9a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h
@@ -0,0 +1,299 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/util/optional.h"
+#include "parquet/column_reader.h"
+#include "parquet/file_reader.h"
+#include "parquet/stream_writer.h"
+
+namespace parquet {
+
+/// \brief A class for reading Parquet files using an input stream type API.
+///
+/// The values read must be of the correct type, i.e. the type must
+/// match the file schema exactly, otherwise a ParquetException will
+/// be thrown.
+///
+/// The user must explicitly advance to the next row using the
+/// EndRow() function or EndRow input manipulator.
+///
+/// Required and optional fields are supported:
+/// - Required fields are read using operator>>(T)
+/// - Optional fields are read with
+/// operator>>(arrow::util::optional<T>)
+///
+/// Note that operator>>(arrow::util::optional<T>) can be used to read
+/// required fields.
+///
+/// Similarly, operator>>(T) can be used to read optional fields.
+/// However, if the value is not present then a ParquetException will
+/// be raised.
+///
+/// Currently there is no support for repeated fields.
+///
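+/// A minimal usage sketch is given below. The file name and the
+/// column types are illustrative assumptions only; the values read
+/// must match the actual file schema.
+///
+///   StreamReader stream{ParquetFileReader::OpenFile("data.parquet")};
+///   std::string name;
+///   StreamReader::optional<double> score;
+///
+///   while (!stream.eof()) {
+///     stream >> name >> score >> EndRow;
+///   }
+///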
+class PARQUET_EXPORT StreamReader {
+ public:
+ template <typename T>
+ using optional = ::arrow::util::optional<T>;
+
+  // N.B. Default-constructed objects are not usable. This
+  // constructor is provided so that the object may be
+  // move-assigned afterwards.
+ StreamReader() = default;
+
+ explicit StreamReader(std::unique_ptr<ParquetFileReader> reader);
+
+ ~StreamReader() = default;
+
+ bool eof() const { return eof_; }
+
+ int current_column() const { return column_index_; }
+
+ int64_t current_row() const { return current_row_; }
+
+ int num_columns() const;
+
+ int64_t num_rows() const;
+
+ // Moving is possible.
+ StreamReader(StreamReader&&) = default;
+ StreamReader& operator=(StreamReader&&) = default;
+
+ // Copying is not allowed.
+ StreamReader(const StreamReader&) = delete;
+ StreamReader& operator=(const StreamReader&) = delete;
+
+ StreamReader& operator>>(bool& v);
+
+ StreamReader& operator>>(int8_t& v);
+
+ StreamReader& operator>>(uint8_t& v);
+
+ StreamReader& operator>>(int16_t& v);
+
+ StreamReader& operator>>(uint16_t& v);
+
+ StreamReader& operator>>(int32_t& v);
+
+ StreamReader& operator>>(uint32_t& v);
+
+ StreamReader& operator>>(int64_t& v);
+
+ StreamReader& operator>>(uint64_t& v);
+
+ StreamReader& operator>>(std::chrono::milliseconds& v);
+
+ StreamReader& operator>>(std::chrono::microseconds& v);
+
+ StreamReader& operator>>(float& v);
+
+ StreamReader& operator>>(double& v);
+
+ StreamReader& operator>>(char& v);
+
+ template <int N>
+ StreamReader& operator>>(char (&v)[N]) {
+ ReadFixedLength(v, N);
+ return *this;
+ }
+
+ template <std::size_t N>
+ StreamReader& operator>>(std::array<char, N>& v) {
+ ReadFixedLength(v.data(), static_cast<int>(N));
+ return *this;
+ }
+
+  // N.B. Cannot allow reading into an arbitrary char pointer, as the
+  //      length cannot be verified. It would also overshadow the
+  //      char[N] input operator.
+ // StreamReader& operator>>(char * v);
+
+ StreamReader& operator>>(std::string& v);
+
+ // Input operators for optional fields.
+
+ StreamReader& operator>>(optional<bool>& v);
+
+ StreamReader& operator>>(optional<int8_t>& v);
+
+ StreamReader& operator>>(optional<uint8_t>& v);
+
+ StreamReader& operator>>(optional<int16_t>& v);
+
+ StreamReader& operator>>(optional<uint16_t>& v);
+
+ StreamReader& operator>>(optional<int32_t>& v);
+
+ StreamReader& operator>>(optional<uint32_t>& v);
+
+ StreamReader& operator>>(optional<int64_t>& v);
+
+ StreamReader& operator>>(optional<uint64_t>& v);
+
+ StreamReader& operator>>(optional<float>& v);
+
+ StreamReader& operator>>(optional<double>& v);
+
+ StreamReader& operator>>(optional<std::chrono::milliseconds>& v);
+
+ StreamReader& operator>>(optional<std::chrono::microseconds>& v);
+
+ StreamReader& operator>>(optional<char>& v);
+
+ StreamReader& operator>>(optional<std::string>& v);
+
+ template <std::size_t N>
+ StreamReader& operator>>(optional<std::array<char, N>>& v) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, N);
+ FixedLenByteArray flba;
+ if (ReadOptional(&flba)) {
+ v = std::array<char, N>{};
+ std::memcpy(v->data(), flba.ptr, N);
+ } else {
+ v.reset();
+ }
+ return *this;
+ }
+
+ /// \brief Terminate current row and advance to next one.
+ /// \throws ParquetException if all columns in the row were not
+ /// read or skipped.
+ void EndRow();
+
+ /// \brief Skip the data in the next columns.
+  /// If the number of columns exceeds the columns remaining on the
+  /// current row then skipping is terminated; it does _not_ continue
+  /// skipping columns on the next row.
+  /// Skipping columns still requires the use of 'EndRow', even if all
+  /// remaining columns were skipped.
+ /// \return Number of columns actually skipped.
+ int64_t SkipColumns(int64_t num_columns_to_skip);
+
+ /// \brief Skip the data in the next rows.
+ /// Skipping of rows is not allowed if reading of data for the
+ /// current row is not finished.
+ /// Skipping of rows will be terminated if the end of file is
+ /// reached.
+ /// \return Number of rows actually skipped.
+ int64_t SkipRows(int64_t num_rows_to_skip);
+
+ protected:
+ [[noreturn]] void ThrowReadFailedException(
+ const std::shared_ptr<schema::PrimitiveNode>& node);
+
+ template <typename ReaderType, typename T>
+ void Read(T* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read != 1) {
+ ThrowReadFailedException(node);
+ }
+ }
+
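+  // Reads a value which is stored in the file as a wider physical
+  // type (e.g. an int8_t value stored as INT32) by reading into a
+  // temporary of the physical type and then narrowing the result.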
+ template <typename ReaderType, typename ReadType, typename T>
+ void Read(T* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ ReadType tmp;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
+
+ if (values_read == 1) {
+ *v = tmp;
+ } else {
+ ThrowReadFailedException(node);
+ }
+ }
+
+ template <typename ReaderType, typename ReadType = typename ReaderType::T, typename T>
+ void ReadOptional(optional<T>* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ ReadType tmp;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
+
+ if (values_read == 1) {
+ *v = T(tmp);
+ } else if ((values_read == 0) && (def_level == 0)) {
+ v->reset();
+ } else {
+ ThrowReadFailedException(node);
+ }
+ }
+
+ void ReadFixedLength(char* ptr, int len);
+
+ void Read(ByteArray* v);
+
+ void Read(FixedLenByteArray* v);
+
+ bool ReadOptional(ByteArray* v);
+
+ bool ReadOptional(FixedLenByteArray* v);
+
+ void NextRowGroup();
+
+ void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
+ int length = 0);
+
+ void SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip);
+
+ void SetEof();
+
+ private:
+ std::unique_ptr<ParquetFileReader> file_reader_;
+ std::shared_ptr<FileMetaData> file_metadata_;
+ std::shared_ptr<RowGroupReader> row_group_reader_;
+ std::vector<std::shared_ptr<ColumnReader>> column_readers_;
+ std::vector<std::shared_ptr<schema::PrimitiveNode>> nodes_;
+
+ bool eof_{true};
+ int row_group_index_{0};
+ int column_index_{0};
+ int64_t current_row_{0};
+ int64_t row_group_row_offset_{0};
+
+ static constexpr int64_t kBatchSizeOne = 1;
+};
+
+PARQUET_EXPORT
+StreamReader& operator>>(StreamReader&, EndRowType);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc
new file mode 100644
index 00000000000..253ebf1bc91
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc
@@ -0,0 +1,324 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/stream_writer.h"
+
+#include <cstring>
+#include <utility>
+
+namespace parquet {
+
+int64_t StreamWriter::default_row_group_size_{512 * 1024 * 1024}; // 512MB
+
+constexpr int16_t StreamWriter::kDefLevelZero;
+constexpr int16_t StreamWriter::kDefLevelOne;
+constexpr int16_t StreamWriter::kRepLevelZero;
+constexpr int64_t StreamWriter::kBatchSizeOne;
+
+StreamWriter::FixedStringView::FixedStringView(const char* data_ptr)
+ : data{data_ptr}, size{std::strlen(data_ptr)} {}
+
+StreamWriter::FixedStringView::FixedStringView(const char* data_ptr, std::size_t data_len)
+ : data{data_ptr}, size{data_len} {}
+
+StreamWriter::StreamWriter(std::unique_ptr<ParquetFileWriter> writer)
+ : file_writer_{std::move(writer)},
+ row_group_writer_{file_writer_->AppendBufferedRowGroup()} {
+ auto schema = file_writer_->schema();
+ auto group_node = schema->group_node();
+
+ nodes_.resize(schema->num_columns());
+
+ for (auto i = 0; i < schema->num_columns(); ++i) {
+ nodes_[i] = std::static_pointer_cast<schema::PrimitiveNode>(group_node->field(i));
+ }
+}
+
+void StreamWriter::SetDefaultMaxRowGroupSize(int64_t max_size) {
+ default_row_group_size_ = max_size;
+}
+
+void StreamWriter::SetMaxRowGroupSize(int64_t max_size) {
+ max_row_group_size_ = max_size;
+}
+
+int StreamWriter::num_columns() const { return static_cast<int>(nodes_.size()); }
+
+StreamWriter& StreamWriter::operator<<(bool v) {
+ CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
+ return Write<BoolWriter>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(int8_t v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_8);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(uint8_t v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_8);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(int16_t v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_16);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(uint16_t v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_16);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(int32_t v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_32);
+ return Write<Int32Writer>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(uint32_t v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_32);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(int64_t v) {
+ CheckColumn(Type::INT64, ConvertedType::INT_64);
+ return Write<Int64Writer>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(uint64_t v) {
+ CheckColumn(Type::INT64, ConvertedType::UINT_64);
+ return Write<Int64Writer>(static_cast<int64_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(const std::chrono::milliseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
+ return Write<Int64Writer>(static_cast<int64_t>(v.count()));
+}
+
+StreamWriter& StreamWriter::operator<<(const std::chrono::microseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
+ return Write<Int64Writer>(static_cast<int64_t>(v.count()));
+}
+
+StreamWriter& StreamWriter::operator<<(float v) {
+ CheckColumn(Type::FLOAT, ConvertedType::NONE);
+ return Write<FloatWriter>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(double v) {
+ CheckColumn(Type::DOUBLE, ConvertedType::NONE);
+ return Write<DoubleWriter>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(char v) { return WriteFixedLength(&v, 1); }
+
+StreamWriter& StreamWriter::operator<<(FixedStringView v) {
+ return WriteFixedLength(v.data, v.size);
+}
+
+StreamWriter& StreamWriter::operator<<(const char* v) {
+ return WriteVariableLength(v, std::strlen(v));
+}
+
+StreamWriter& StreamWriter::operator<<(const std::string& v) {
+ return WriteVariableLength(v.data(), v.size());
+}
+
+StreamWriter& StreamWriter::operator<<(::arrow::util::string_view v) {
+ return WriteVariableLength(v.data(), v.size());
+}
+
+StreamWriter& StreamWriter::WriteVariableLength(const char* data_ptr,
+ std::size_t data_len) {
+ CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
+
+ auto writer = static_cast<ByteArrayWriter*>(row_group_writer_->column(column_index_++));
+
+ if (data_ptr != nullptr) {
+ ByteArray ba_value;
+
+ ba_value.ptr = reinterpret_cast<const uint8_t*>(data_ptr);
+ ba_value.len = static_cast<uint32_t>(data_len);
+
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &ba_value);
+ } else {
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
+ }
+ if (max_row_group_size_ > 0) {
+ row_group_size_ += writer->EstimatedBufferedValueBytes();
+ }
+ return *this;
+}
+
+StreamWriter& StreamWriter::WriteFixedLength(const char* data_ptr, std::size_t data_len) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE,
+ static_cast<int>(data_len));
+
+ auto writer =
+ static_cast<FixedLenByteArrayWriter*>(row_group_writer_->column(column_index_++));
+
+ if (data_ptr != nullptr) {
+ FixedLenByteArray flba_value;
+
+ flba_value.ptr = reinterpret_cast<const uint8_t*>(data_ptr);
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &flba_value);
+ } else {
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
+ }
+ if (max_row_group_size_ > 0) {
+ row_group_size_ += writer->EstimatedBufferedValueBytes();
+ }
+ return *this;
+}
+
+void StreamWriter::CheckColumn(Type::type physical_type,
+ ConvertedType::type converted_type, int length) {
+ if (static_cast<std::size_t>(column_index_) >= nodes_.size()) {
+ throw ParquetException("Column index out-of-bounds. Index " +
+ std::to_string(column_index_) + " is invalid for " +
+ std::to_string(nodes_.size()) + " columns");
+ }
+ const auto& node = nodes_[column_index_];
+
+ if (physical_type != node->physical_type()) {
+ throw ParquetException("Column physical type mismatch. Column '" + node->name() +
+ "' has physical type '" + TypeToString(node->physical_type()) +
+ "' not '" + TypeToString(physical_type) + "'");
+ }
+ if (converted_type != node->converted_type()) {
+ throw ParquetException("Column converted type mismatch. Column '" + node->name() +
+ "' has converted type[" +
+ ConvertedTypeToString(node->converted_type()) + "] not '" +
+ ConvertedTypeToString(converted_type) + "'");
+ }
+  // Length must be exact (a negative length means no check, which is
+  // used for non fixed-length columns).
+  // A shorter length fixed array is not acceptable as it would
+  // result in array bound read errors.
+  //
+  if ((length >= 0) && (length != node->type_length())) {
+ throw ParquetException("Column length mismatch. Column '" + node->name() +
+ "' has length " + std::to_string(node->type_length()) +
+ " not " + std::to_string(length));
+ }
+}
+
+int64_t StreamWriter::SkipColumns(int num_columns_to_skip) {
+ int num_columns_skipped = 0;
+
+ for (; (num_columns_to_skip > num_columns_skipped) &&
+ static_cast<std::size_t>(column_index_) < nodes_.size();
+ ++num_columns_skipped) {
+ const auto& node = nodes_[column_index_];
+
+ if (node->is_required()) {
+ throw ParquetException("Cannot skip column '" + node->name() +
+ "' as it is required.");
+ }
+ auto writer = row_group_writer_->column(column_index_++);
+
+ WriteNullValue(writer);
+ }
+ return num_columns_skipped;
+}
+
+void StreamWriter::WriteNullValue(ColumnWriter* writer) {
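+  // A null value is represented by a definition level of zero, with
+  // no value written to the batch.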
+ switch (writer->type()) {
+ case Type::BOOLEAN:
+ static_cast<BoolWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::INT32:
+ static_cast<Int32Writer*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::INT64:
+ static_cast<Int64Writer*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::BYTE_ARRAY:
+ static_cast<ByteArrayWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ static_cast<FixedLenByteArrayWriter*>(writer)->WriteBatch(
+ kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
+ break;
+ case Type::FLOAT:
+ static_cast<FloatWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::DOUBLE:
+ static_cast<DoubleWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::INT96:
+ case Type::UNDEFINED:
+ throw ParquetException("Unexpected type: " + TypeToString(writer->type()));
+ break;
+ }
+}
+
+void StreamWriter::SkipOptionalColumn() {
+ if (SkipColumns(1) != 1) {
+ throw ParquetException("Failed to skip optional column at column index " +
+ std::to_string(column_index_));
+ }
+}
+
+void StreamWriter::EndRow() {
+ if (!file_writer_) {
+ throw ParquetException("StreamWriter not initialized");
+ }
+ if (static_cast<std::size_t>(column_index_) < nodes_.size()) {
+ throw ParquetException("Cannot end row with " + std::to_string(column_index_) +
+ " of " + std::to_string(nodes_.size()) + " columns written");
+ }
+ column_index_ = 0;
+ ++current_row_;
+
+ if (max_row_group_size_ > 0) {
+ if (row_group_size_ > max_row_group_size_) {
+ EndRowGroup();
+ }
+ // Initialize for each row with size already written
+ // (compressed + uncompressed).
+ //
+ row_group_size_ = row_group_writer_->total_bytes_written() +
+ row_group_writer_->total_compressed_bytes();
+ }
+}
+
+void StreamWriter::EndRowGroup() {
+ if (!file_writer_) {
+ throw ParquetException("StreamWriter not initialized");
+ }
+ // Avoid creating empty row groups.
+ if (row_group_writer_->num_rows() > 0) {
+ row_group_writer_->Close();
+ row_group_writer_.reset(file_writer_->AppendBufferedRowGroup());
+ }
+}
+
+StreamWriter& operator<<(StreamWriter& os, EndRowType) {
+ os.EndRow();
+ return os;
+}
+
+StreamWriter& operator<<(StreamWriter& os, EndRowGroupType) {
+ os.EndRowGroup();
+ return os;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h
new file mode 100644
index 00000000000..d0db850c341
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h
@@ -0,0 +1,243 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/util/optional.h"
+#include "arrow/util/string_view.h"
+#include "parquet/column_writer.h"
+#include "parquet/file_writer.h"
+
+namespace parquet {
+
+/// \brief A class for writing Parquet files using an output stream type API.
+///
+/// The values given must be of the correct type, i.e. the type must
+/// match the file schema exactly, otherwise a ParquetException will
+/// be thrown.
+///
+/// The user must explicitly indicate the end of the row using the
+/// EndRow() function or EndRow output manipulator.
+///
+/// A maximum row group size can be configured; the default size is
+/// 512 MB. Alternatively, the row group size can be set to zero and
+/// the user can create new row groups by calling the EndRowGroup()
+/// function or using the EndRowGroup output manipulator.
+///
+/// Required and optional fields are supported:
+/// - Required fields are written using operator<<(T)
+/// - Optional fields are written using
+/// operator<<(arrow::util::optional<T>).
+///
+/// Note that operator<<(T) can be used to write optional fields.
+///
+/// Similarly, operator<<(arrow::util::optional<T>) can be used to
+/// write required fields. However, if the optional parameter does not
+/// have a value (i.e. it is nullopt) then a ParquetException will be
+/// raised.
+///
+/// Currently there is no support for repeated fields.
+///
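+/// A minimal usage sketch is given below. The 'outfile' sink, the
+/// 'schema' group node and the column types are illustrative
+/// assumptions only; the values written must match the actual file
+/// schema.
+///
+///   StreamWriter stream{ParquetFileWriter::Open(outfile, schema)};
+///
+///   stream << std::string{"alice"} << 42 << 3.14 << EndRow;
+///   stream << EndRowGroup;
+///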
+class PARQUET_EXPORT StreamWriter {
+ public:
+ template <typename T>
+ using optional = ::arrow::util::optional<T>;
+
+  // N.B. Default-constructed objects are not usable. This
+  // constructor is provided so that the object may be
+  // move-assigned afterwards.
+ StreamWriter() = default;
+
+ explicit StreamWriter(std::unique_ptr<ParquetFileWriter> writer);
+
+ ~StreamWriter() = default;
+
+ static void SetDefaultMaxRowGroupSize(int64_t max_size);
+
+ void SetMaxRowGroupSize(int64_t max_size);
+
+ int current_column() const { return column_index_; }
+
+ int64_t current_row() const { return current_row_; }
+
+ int num_columns() const;
+
+ // Moving is possible.
+ StreamWriter(StreamWriter&&) = default;
+ StreamWriter& operator=(StreamWriter&&) = default;
+
+ // Copying is not allowed.
+ StreamWriter(const StreamWriter&) = delete;
+ StreamWriter& operator=(const StreamWriter&) = delete;
+
+ /// \brief Output operators for required fields.
+ /// These can also be used for optional fields when a value must be set.
+ StreamWriter& operator<<(bool v);
+
+ StreamWriter& operator<<(int8_t v);
+
+ StreamWriter& operator<<(uint8_t v);
+
+ StreamWriter& operator<<(int16_t v);
+
+ StreamWriter& operator<<(uint16_t v);
+
+ StreamWriter& operator<<(int32_t v);
+
+ StreamWriter& operator<<(uint32_t v);
+
+ StreamWriter& operator<<(int64_t v);
+
+ StreamWriter& operator<<(uint64_t v);
+
+ StreamWriter& operator<<(const std::chrono::milliseconds& v);
+
+ StreamWriter& operator<<(const std::chrono::microseconds& v);
+
+ StreamWriter& operator<<(float v);
+
+ StreamWriter& operator<<(double v);
+
+ StreamWriter& operator<<(char v);
+
+ /// \brief Helper class to write fixed length strings.
+ /// This is useful as the standard string view (such as
+ /// arrow::util::string_view) is for variable length data.
+ struct PARQUET_EXPORT FixedStringView {
+ FixedStringView() = default;
+
+ explicit FixedStringView(const char* data_ptr);
+
+ FixedStringView(const char* data_ptr, std::size_t data_len);
+
+ const char* data{NULLPTR};
+ std::size_t size{0};
+ };
+
+ /// \brief Output operators for fixed length strings.
+ template <int N>
+ StreamWriter& operator<<(const char (&v)[N]) {
+ return WriteFixedLength(v, N);
+ }
+ template <std::size_t N>
+ StreamWriter& operator<<(const std::array<char, N>& v) {
+ return WriteFixedLength(v.data(), N);
+ }
+ StreamWriter& operator<<(FixedStringView v);
+
+ /// \brief Output operators for variable length strings.
+ StreamWriter& operator<<(const char* v);
+ StreamWriter& operator<<(const std::string& v);
+ StreamWriter& operator<<(::arrow::util::string_view v);
+
+ /// \brief Output operator for optional fields.
+ template <typename T>
+ StreamWriter& operator<<(const optional<T>& v) {
+ if (v) {
+ return operator<<(*v);
+ }
+ SkipOptionalColumn();
+ return *this;
+ }
+
+  /// \brief Skip the next N columns of optional data. If there are
+  /// fewer than N columns remaining then the excess columns are
+  /// ignored.
+ /// \throws ParquetException if there is an attempt to skip any
+ /// required column.
+ /// \return Number of columns actually skipped.
+ int64_t SkipColumns(int num_columns_to_skip);
+
+ /// \brief Terminate the current row and advance to next one.
+ /// \throws ParquetException if all columns in the row were not
+ /// written or skipped.
+ void EndRow();
+
+ /// \brief Terminate the current row group and create new one.
+ void EndRowGroup();
+
+ protected:
+ template <typename WriterType, typename T>
+ StreamWriter& Write(const T v) {
+ auto writer = static_cast<WriterType*>(row_group_writer_->column(column_index_++));
+
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v);
+
+ if (max_row_group_size_ > 0) {
+ row_group_size_ += writer->EstimatedBufferedValueBytes();
+ }
+ return *this;
+ }
+
+ StreamWriter& WriteVariableLength(const char* data_ptr, std::size_t data_len);
+
+ StreamWriter& WriteFixedLength(const char* data_ptr, std::size_t data_len);
+
+ void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
+ int length = -1);
+
+ /// \brief Skip the next column which must be optional.
+ /// \throws ParquetException if the next column does not exist or is
+ /// not optional.
+ void SkipOptionalColumn();
+
+ void WriteNullValue(ColumnWriter* writer);
+
+ private:
+ using node_ptr_type = std::shared_ptr<schema::PrimitiveNode>;
+
+ struct null_deleter {
+ void operator()(void*) {}
+ };
+
+ int32_t column_index_{0};
+ int64_t current_row_{0};
+ int64_t row_group_size_{0};
+ int64_t max_row_group_size_{default_row_group_size_};
+
+ std::unique_ptr<ParquetFileWriter> file_writer_;
+ std::unique_ptr<RowGroupWriter, null_deleter> row_group_writer_;
+ std::vector<node_ptr_type> nodes_;
+
+ static constexpr int16_t kDefLevelZero = 0;
+ static constexpr int16_t kDefLevelOne = 1;
+ static constexpr int16_t kRepLevelZero = 0;
+ static constexpr int64_t kBatchSizeOne = 1;
+
+ static int64_t default_row_group_size_;
+};
+
+struct PARQUET_EXPORT EndRowType {};
+constexpr EndRowType EndRow = {};
+
+struct PARQUET_EXPORT EndRowGroupType {};
+constexpr EndRowGroupType EndRowGroup = {};
+
+PARQUET_EXPORT
+StreamWriter& operator<<(StreamWriter&, EndRowType);
+
+PARQUET_EXPORT
+StreamWriter& operator<<(StreamWriter&, EndRowGroupType);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map b/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map
new file mode 100644
index 00000000000..4bf032dd584
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+{
+ # Symbols marked as 'local' are not exported by the DSO and thus may not
+ # be used by client applications.
+ local:
+ # devtoolset / static-libstdc++ symbols
+ __cxa_*;
+ __once_proxy;
+
+ extern "C++" {
+ # boost
+ boost::*;
+
+ # thrift
+ apache::thrift::*;
+
+ # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically
+ # links c++11 symbols into binaries so that the result may be executed on
+ # a system with an older libstdc++ which doesn't include the necessary
+ # c++11 symbols.
+ std::*;
+ *std::__once_call*;
+ };
+};
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h
new file mode 100644
index 00000000000..ea7df209621
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h
@@ -0,0 +1,494 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/windows_compatibility.h"
+
+#include <cstdint>
+#include <cstring>
+// Check if thrift version < 0.11.0
+// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp
+#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR)
+#include <boost/shared_ptr.hpp>
+#else
+#include <memory>
+#endif
+#include <string>
+#include <vector>
+
+// TCompactProtocol requires some #defines to work right.
+#define SIGNED_RIGHT_SHIFT_IS 1
+#define ARITHMETIC_RIGHT_SHIFT 1
+#include <thrift/TApplicationException.h>
+#include <thrift/protocol/TCompactProtocol.h>
+#include <thrift/protocol/TDebugProtocol.h>
+
+#include <thrift/protocol/TBinaryProtocol.h>
+#include <thrift/transport/TBufferTransports.h>
+#include <sstream>
+
+#include "arrow/util/logging.h"
+
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+
+#include "generated/parquet_types.h" // IYWU pragma: export
+
+namespace parquet {
+
+// Check if thrift version < 0.11.0
+// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp
+#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR)
+using ::boost::shared_ptr;
+#else
+using ::std::shared_ptr;
+#endif
+
+// ----------------------------------------------------------------------
+// Convert Thrift enums to Parquet enums
+
+// Unsafe enum converters (input is not checked for validity)
+
+static inline Type::type FromThriftUnsafe(format::Type::type type) {
+ return static_cast<Type::type>(type);
+}
+
+static inline ConvertedType::type FromThriftUnsafe(format::ConvertedType::type type) {
+  // parquet::ConvertedType has NONE as item 0, so shift the thrift value by one.
+ return static_cast<ConvertedType::type>(static_cast<int>(type) + 1);
+}
+
+static inline Repetition::type FromThriftUnsafe(format::FieldRepetitionType::type type) {
+ return static_cast<Repetition::type>(type);
+}
+
+static inline Encoding::type FromThriftUnsafe(format::Encoding::type type) {
+ return static_cast<Encoding::type>(type);
+}
+
+static inline PageType::type FromThriftUnsafe(format::PageType::type type) {
+ return static_cast<PageType::type>(type);
+}
+
+static inline Compression::type FromThriftUnsafe(format::CompressionCodec::type type) {
+ switch (type) {
+ case format::CompressionCodec::UNCOMPRESSED:
+ return Compression::UNCOMPRESSED;
+ case format::CompressionCodec::SNAPPY:
+ return Compression::SNAPPY;
+ case format::CompressionCodec::GZIP:
+ return Compression::GZIP;
+ case format::CompressionCodec::LZO:
+ return Compression::LZO;
+ case format::CompressionCodec::BROTLI:
+ return Compression::BROTLI;
+ case format::CompressionCodec::LZ4:
+ return Compression::LZ4_HADOOP;
+ case format::CompressionCodec::LZ4_RAW:
+ return Compression::LZ4;
+ case format::CompressionCodec::ZSTD:
+ return Compression::ZSTD;
+ default:
+ DCHECK(false) << "Cannot reach here";
+ return Compression::UNCOMPRESSED;
+ }
+}
+
+namespace internal {
+
+template <typename T>
+struct ThriftEnumTypeTraits {};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::Type::type> {
+ using ParquetEnum = Type;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::ConvertedType::type> {
+ using ParquetEnum = ConvertedType;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::FieldRepetitionType::type> {
+ using ParquetEnum = Repetition;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::Encoding::type> {
+ using ParquetEnum = Encoding;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::PageType::type> {
+ using ParquetEnum = PageType;
+};
+
+// If the parquet file is corrupted, the decoded enum value may fall
+// outside the range of defined values, which is undefined behaviour.
+// This facility guards against that by loading the value as the
+// underlying integer type and checking that it is in range.
+
+template <typename EnumType,
+ typename EnumTypeRaw = typename std::underlying_type<EnumType>::type>
+inline static EnumTypeRaw LoadEnumRaw(const EnumType* in) {
+ EnumTypeRaw raw_value;
+ // Use memcpy(), as a regular cast would be undefined behaviour on invalid values
+ memcpy(&raw_value, in, sizeof(EnumType));
+ return raw_value;
+}
+
+template <typename ApiType>
+struct SafeLoader {
+ using ApiTypeEnum = typename ApiType::type;
+ using ApiTypeRawEnum = typename std::underlying_type<ApiTypeEnum>::type;
+
+ template <typename ThriftType>
+ inline static ApiTypeRawEnum LoadRaw(const ThriftType* in) {
+ static_assert(sizeof(ApiTypeEnum) == sizeof(ThriftType),
+ "parquet type should always be the same size as thrift type");
+ return static_cast<ApiTypeRawEnum>(LoadEnumRaw(in));
+ }
+
+ template <typename ThriftType, bool IsUnsigned = true>
+ inline static ApiTypeEnum LoadChecked(
+ const typename std::enable_if<IsUnsigned, ThriftType>::type* in) {
+ auto raw_value = LoadRaw(in);
+ if (ARROW_PREDICT_FALSE(raw_value >=
+ static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED))) {
+ return ApiType::UNDEFINED;
+ }
+ return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
+ }
+
+ template <typename ThriftType, bool IsUnsigned = false>
+ inline static ApiTypeEnum LoadChecked(
+ const typename std::enable_if<!IsUnsigned, ThriftType>::type* in) {
+ auto raw_value = LoadRaw(in);
+ if (ARROW_PREDICT_FALSE(raw_value >=
+ static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED) ||
+ raw_value < 0)) {
+ return ApiType::UNDEFINED;
+ }
+ return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
+ }
+
+ template <typename ThriftType>
+ inline static ApiTypeEnum Load(const ThriftType* in) {
+ return LoadChecked<ThriftType, std::is_unsigned<ApiTypeRawEnum>::value>(in);
+ }
+};
+
+} // namespace internal
+
+// Safe enum loader: will check for invalid enum value before converting
+
+template <typename ThriftType,
+ typename ParquetEnum =
+ typename internal::ThriftEnumTypeTraits<ThriftType>::ParquetEnum>
+inline typename ParquetEnum::type LoadEnumSafe(const ThriftType* in) {
+ return internal::SafeLoader<ParquetEnum>::Load(in);
+}
+
+inline typename Compression::type LoadEnumSafe(const format::CompressionCodec::type* in) {
+ const auto raw_value = internal::LoadEnumRaw(in);
+ // Check bounds manually, as Compression::type doesn't have the same values
+ // as format::CompressionCodec.
+ const auto min_value =
+ static_cast<decltype(raw_value)>(format::CompressionCodec::UNCOMPRESSED);
+ const auto max_value =
+ static_cast<decltype(raw_value)>(format::CompressionCodec::LZ4_RAW);
+ if (raw_value < min_value || raw_value > max_value) {
+ return Compression::UNCOMPRESSED;
+ }
+ return FromThriftUnsafe(*in);
+}
+
+// Safe non-enum converters
+
+static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) {
+ return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique,
+ aesGcmV1.supply_aad_prefix};
+}
+
+static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) {
+ return AadMetadata{aesGcmCtrV1.aad_prefix, aesGcmCtrV1.aad_file_unique,
+ aesGcmCtrV1.supply_aad_prefix};
+}
+
+static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) {
+ EncryptionAlgorithm encryption_algorithm;
+
+ if (encryption.__isset.AES_GCM_V1) {
+ encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1;
+ encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1);
+ } else if (encryption.__isset.AES_GCM_CTR_V1) {
+ encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1;
+ encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1);
+ } else {
+ throw ParquetException("Unsupported algorithm");
+ }
+ return encryption_algorithm;
+}
+
+// ----------------------------------------------------------------------
+// Convert Thrift enums from Parquet enums
+
+static inline format::Type::type ToThrift(Type::type type) {
+ return static_cast<format::Type::type>(type);
+}
+
+static inline format::ConvertedType::type ToThrift(ConvertedType::type type) {
+  // parquet::ConvertedType has NONE as item 0, which has no thrift equivalent.
+ DCHECK_NE(type, ConvertedType::NONE);
+ // it is forbidden to emit "NA" (PARQUET-1990)
+ DCHECK_NE(type, ConvertedType::NA);
+ DCHECK_NE(type, ConvertedType::UNDEFINED);
+ return static_cast<format::ConvertedType::type>(static_cast<int>(type) - 1);
+}
+
+static inline format::FieldRepetitionType::type ToThrift(Repetition::type type) {
+ return static_cast<format::FieldRepetitionType::type>(type);
+}
+
+static inline format::Encoding::type ToThrift(Encoding::type type) {
+ return static_cast<format::Encoding::type>(type);
+}
+
+static inline format::CompressionCodec::type ToThrift(Compression::type type) {
+ switch (type) {
+ case Compression::UNCOMPRESSED:
+ return format::CompressionCodec::UNCOMPRESSED;
+ case Compression::SNAPPY:
+ return format::CompressionCodec::SNAPPY;
+ case Compression::GZIP:
+ return format::CompressionCodec::GZIP;
+ case Compression::LZO:
+ return format::CompressionCodec::LZO;
+ case Compression::BROTLI:
+ return format::CompressionCodec::BROTLI;
+ case Compression::LZ4:
+ return format::CompressionCodec::LZ4_RAW;
+ case Compression::LZ4_HADOOP:
+ // Deprecated "LZ4" Parquet compression has Hadoop-specific framing
+ return format::CompressionCodec::LZ4;
+ case Compression::ZSTD:
+ return format::CompressionCodec::ZSTD;
+ default:
+ DCHECK(false) << "Cannot reach here";
+ return format::CompressionCodec::UNCOMPRESSED;
+ }
+}
+
+static inline format::Statistics ToThrift(const EncodedStatistics& stats) {
+ format::Statistics statistics;
+ if (stats.has_min) {
+ statistics.__set_min_value(stats.min());
+    // If the order is SIGNED, then the old min value must be set too.
+    // This is for backward compatibility.
+ if (stats.is_signed()) {
+ statistics.__set_min(stats.min());
+ }
+ }
+ if (stats.has_max) {
+ statistics.__set_max_value(stats.max());
+    // If the order is SIGNED, then the old max value must be set too.
+    // This is for backward compatibility.
+ if (stats.is_signed()) {
+ statistics.__set_max(stats.max());
+ }
+ }
+ if (stats.has_null_count) {
+ statistics.__set_null_count(stats.null_count);
+ }
+ if (stats.has_distinct_count) {
+ statistics.__set_distinct_count(stats.distinct_count);
+ }
+
+ return statistics;
+}
+
+static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) {
+ format::AesGcmV1 aesGcmV1;
+ // aad_file_unique is always set
+ aesGcmV1.__set_aad_file_unique(aad.aad_file_unique);
+ aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix);
+ if (!aad.aad_prefix.empty()) {
+ aesGcmV1.__set_aad_prefix(aad.aad_prefix);
+ }
+ return aesGcmV1;
+}
+
+static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) {
+ format::AesGcmCtrV1 aesGcmCtrV1;
+ // aad_file_unique is always set
+ aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique);
+ aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix);
+ if (!aad.aad_prefix.empty()) {
+ aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix);
+ }
+ return aesGcmCtrV1;
+}
+
+static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) {
+ format::EncryptionAlgorithm encryption_algorithm;
+ if (encryption.algorithm == ParquetCipher::AES_GCM_V1) {
+ encryption_algorithm.__set_AES_GCM_V1(ToAesGcmV1Thrift(encryption.aad));
+ } else {
+ encryption_algorithm.__set_AES_GCM_CTR_V1(ToAesGcmCtrV1Thrift(encryption.aad));
+ }
+ return encryption_algorithm;
+}
+
+// ----------------------------------------------------------------------
+// Thrift struct serialization / deserialization utilities
+
+using ThriftBuffer = apache::thrift::transport::TMemoryBuffer;
+
+template <class T>
+inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len,
+ T* deserialized_msg) {
+ // Deserialize msg bytes into c++ thrift msg using memory transport.
+ shared_ptr<ThriftBuffer> tmem_transport(
+ new ThriftBuffer(const_cast<uint8_t*>(buf), *len));
+ apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> tproto_factory;
+ // Protect against CPU and memory bombs
+ tproto_factory.setStringSizeLimit(100 * 1000 * 1000);
+ // Structs in the thrift definition are relatively large (at least 300 bytes).
+ // This limits total memory to the same order of magnitude as stringSize.
+ tproto_factory.setContainerSizeLimit(1000 * 1000);
+ shared_ptr<apache::thrift::protocol::TProtocol> tproto = //
+ tproto_factory.getProtocol(tmem_transport);
+ try {
+ deserialized_msg->read(tproto.get());
+ } catch (std::exception& e) {
+ std::stringstream ss;
+ ss << "Couldn't deserialize thrift: " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ uint32_t bytes_left = tmem_transport->available_read();
+ *len = *len - bytes_left;
+}
+
+// Deserialize a thrift message from buf/len. buf/len must at least contain
+// all the bytes needed to store the thrift message. On return, len will be
+// set to the actual length of the header.
+template <class T>
+inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg,
+ const std::shared_ptr<Decryptor>& decryptor = NULLPTR) {
+ // thrift message is not encrypted
+ if (decryptor == NULLPTR) {
+ DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg);
+ } else { // thrift message is encrypted
+    uint32_t clen = *len;
+ // decrypt
+ std::shared_ptr<ResizableBuffer> decrypted_buffer =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(
+ decryptor->pool(),
+ static_cast<int64_t>(clen - decryptor->CiphertextSizeDelta())));
+ const uint8_t* cipher_buf = buf;
+ uint32_t decrypted_buffer_len =
+ decryptor->Decrypt(cipher_buf, 0, decrypted_buffer->mutable_data());
+ if (decrypted_buffer_len <= 0) {
+ throw ParquetException("Couldn't decrypt buffer\n");
+ }
+ *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta();
+ DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len,
+ deserialized_msg);
+ }
+}
+
+/// Utility class to serialize thrift objects to a binary format. This object
+/// should be reused where possible so that its underlying memory is reused too.
+/// Note: Thrift may encode NUL bytes into the serialized buffer, so it is not
+/// valid to treat it as a string.
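+///
+/// A brief usage sketch (the message type and the output stream are
+/// illustrative assumptions only):
+///
+///   ThriftSerializer serializer;
+///   format::FileMetaData metadata;
+///   // ...populate metadata...
+///   serializer.Serialize(&metadata, output_stream);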
+class ThriftSerializer {
+ public:
+ explicit ThriftSerializer(int initial_buffer_size = 1024)
+ : mem_buffer_(new ThriftBuffer(initial_buffer_size)) {
+ apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> factory;
+ protocol_ = factory.getProtocol(mem_buffer_);
+ }
+
+ /// Serialize obj into a memory buffer. The result is returned in buffer/len. The
+ /// memory returned is owned by this object and will be invalid when another object
+ /// is serialized.
+ template <class T>
+ void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) {
+ SerializeObject(obj);
+ mem_buffer_->getBuffer(buffer, len);
+ }
+
+ template <class T>
+ void SerializeToString(const T* obj, std::string* result) {
+ SerializeObject(obj);
+ *result = mem_buffer_->getBufferAsString();
+ }
+
+ template <class T>
+ int64_t Serialize(const T* obj, ArrowOutputStream* out,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR) {
+ uint8_t* out_buffer;
+ uint32_t out_length;
+ SerializeToBuffer(obj, &out_length, &out_buffer);
+
+ // obj is not encrypted
+ if (encryptor == NULLPTR) {
+ PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length));
+ return static_cast<int64_t>(out_length);
+ } else { // obj is encrypted
+ return SerializeEncryptedObj(out, out_buffer, out_length, encryptor);
+ }
+ }
+
+ private:
+ template <class T>
+ void SerializeObject(const T* obj) {
+ try {
+ mem_buffer_->resetBuffer();
+ obj->write(protocol_.get());
+ } catch (std::exception& e) {
+ std::stringstream ss;
+ ss << "Couldn't serialize thrift: " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ }
+
+ int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer,
+ uint32_t out_length,
+ const std::shared_ptr<Encryptor>& encryptor) {
+ std::shared_ptr<ResizableBuffer> cipher_buffer =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(
+ encryptor->pool(),
+ static_cast<int64_t>(encryptor->CiphertextSizeDelta() + out_length)));
+ int cipher_buffer_len =
+ encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data());
+
+ PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len));
+ return static_cast<int64_t>(cipher_buffer_len);
+ }
+
+ shared_ptr<ThriftBuffer> mem_buffer_;
+ shared_ptr<apache::thrift::protocol::TProtocol> protocol_;
+};
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h
new file mode 100644
index 00000000000..a427f5a9591
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace parquet {
+
+struct ParquetVersion {
+ enum type { PARQUET_1_0, PARQUET_2_0 };
+};
+
+class FileMetaData;
+class SchemaDescriptor;
+
+class ReaderProperties;
+class ArrowReaderProperties;
+
+class WriterProperties;
+class WriterPropertiesBuilder;
+class ArrowWriterProperties;
+class ArrowWriterPropertiesBuilder;
+
+namespace arrow {
+
+class FileWriter;
+class FileReader;
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/types.cc b/contrib/libs/apache/arrow/cpp/src/parquet/types.cc
new file mode 100644
index 00000000000..ef23c40662b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/types.cc
@@ -0,0 +1,1567 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cmath>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/logging.h"
+
+#include "parquet/exception.h"
+#include "parquet/types.h"
+
+#include "generated/parquet_types.h"
+
+using arrow::internal::checked_cast;
+using arrow::util::Codec;
+
+namespace parquet {
+
+bool IsCodecSupported(Compression::type codec) {
+ switch (codec) {
+ case Compression::UNCOMPRESSED:
+ case Compression::SNAPPY:
+ case Compression::GZIP:
+ case Compression::BROTLI:
+ case Compression::ZSTD:
+ case Compression::LZ4:
+ case Compression::LZ4_HADOOP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+std::unique_ptr<Codec> GetCodec(Compression::type codec) {
+ return GetCodec(codec, Codec::UseDefaultCompressionLevel());
+}
+
+std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) {
+ std::unique_ptr<Codec> result;
+ if (codec == Compression::LZO) {
+ throw ParquetException(
+ "While LZO compression is supported by the Parquet format in "
+ "general, it is currently not supported by the C++ implementation.");
+ }
+
+ if (!IsCodecSupported(codec)) {
+ std::stringstream ss;
+ ss << "Codec type " << Codec::GetCodecAsString(codec)
+ << " not supported in Parquet format";
+ throw ParquetException(ss.str());
+ }
+
+ PARQUET_ASSIGN_OR_THROW(result, Codec::Create(codec, compression_level));
+ return result;
+}
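+
+// Example (illustrative): check support before requesting a codec, since
+// GetCodec throws a ParquetException for LZO and other unsupported codecs:
+//
+//   if (IsCodecSupported(Compression::ZSTD)) {
+//     std::unique_ptr<Codec> codec = GetCodec(Compression::ZSTD);
+//   }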
+
+std::string FormatStatValue(Type::type parquet_type, ::arrow::util::string_view val) {
+ std::stringstream result;
+
+ const char* bytes = val.data();
+ switch (parquet_type) {
+ case Type::BOOLEAN:
+ result << reinterpret_cast<const bool*>(bytes)[0];
+ break;
+ case Type::INT32:
+ result << reinterpret_cast<const int32_t*>(bytes)[0];
+ break;
+ case Type::INT64:
+ result << reinterpret_cast<const int64_t*>(bytes)[0];
+ break;
+ case Type::DOUBLE:
+ result << reinterpret_cast<const double*>(bytes)[0];
+ break;
+ case Type::FLOAT:
+ result << reinterpret_cast<const float*>(bytes)[0];
+ break;
+ case Type::INT96: {
+ auto const i32_val = reinterpret_cast<const int32_t*>(bytes);
+ result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
+ break;
+ }
+ case Type::BYTE_ARRAY: {
+ return std::string(val);
+ }
+ case Type::FIXED_LEN_BYTE_ARRAY: {
+ return std::string(val);
+ }
+ case Type::UNDEFINED:
+ default:
+ break;
+ }
+ return result.str();
+}
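+
+// Example (illustrative): FormatStatValue reinterprets the raw statistic bytes
+// according to the physical type:
+//
+//   int32_t v = 42;
+//   std::string s = FormatStatValue(
+//       Type::INT32, ::arrow::util::string_view(
+//                        reinterpret_cast<const char*>(&v), sizeof(v)));
+//   // s == "42"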
+
+std::string EncodingToString(Encoding::type t) {
+ switch (t) {
+ case Encoding::PLAIN:
+ return "PLAIN";
+ case Encoding::PLAIN_DICTIONARY:
+ return "PLAIN_DICTIONARY";
+ case Encoding::RLE:
+ return "RLE";
+ case Encoding::BIT_PACKED:
+ return "BIT_PACKED";
+ case Encoding::DELTA_BINARY_PACKED:
+ return "DELTA_BINARY_PACKED";
+ case Encoding::DELTA_LENGTH_BYTE_ARRAY:
+ return "DELTA_LENGTH_BYTE_ARRAY";
+ case Encoding::DELTA_BYTE_ARRAY:
+ return "DELTA_BYTE_ARRAY";
+ case Encoding::RLE_DICTIONARY:
+ return "RLE_DICTIONARY";
+ case Encoding::BYTE_STREAM_SPLIT:
+ return "BYTE_STREAM_SPLIT";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+std::string TypeToString(Type::type t) {
+ switch (t) {
+ case Type::BOOLEAN:
+ return "BOOLEAN";
+ case Type::INT32:
+ return "INT32";
+ case Type::INT64:
+ return "INT64";
+ case Type::INT96:
+ return "INT96";
+ case Type::FLOAT:
+ return "FLOAT";
+ case Type::DOUBLE:
+ return "DOUBLE";
+ case Type::BYTE_ARRAY:
+ return "BYTE_ARRAY";
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return "FIXED_LEN_BYTE_ARRAY";
+ case Type::UNDEFINED:
+ default:
+ return "UNKNOWN";
+ }
+}
+
+std::string ConvertedTypeToString(ConvertedType::type t) {
+ switch (t) {
+ case ConvertedType::NONE:
+ return "NONE";
+ case ConvertedType::UTF8:
+ return "UTF8";
+ case ConvertedType::MAP:
+ return "MAP";
+ case ConvertedType::MAP_KEY_VALUE:
+ return "MAP_KEY_VALUE";
+ case ConvertedType::LIST:
+ return "LIST";
+ case ConvertedType::ENUM:
+ return "ENUM";
+ case ConvertedType::DECIMAL:
+ return "DECIMAL";
+ case ConvertedType::DATE:
+ return "DATE";
+ case ConvertedType::TIME_MILLIS:
+ return "TIME_MILLIS";
+ case ConvertedType::TIME_MICROS:
+ return "TIME_MICROS";
+ case ConvertedType::TIMESTAMP_MILLIS:
+ return "TIMESTAMP_MILLIS";
+ case ConvertedType::TIMESTAMP_MICROS:
+ return "TIMESTAMP_MICROS";
+ case ConvertedType::UINT_8:
+ return "UINT_8";
+ case ConvertedType::UINT_16:
+ return "UINT_16";
+ case ConvertedType::UINT_32:
+ return "UINT_32";
+ case ConvertedType::UINT_64:
+ return "UINT_64";
+ case ConvertedType::INT_8:
+ return "INT_8";
+ case ConvertedType::INT_16:
+ return "INT_16";
+ case ConvertedType::INT_32:
+ return "INT_32";
+ case ConvertedType::INT_64:
+ return "INT_64";
+ case ConvertedType::JSON:
+ return "JSON";
+ case ConvertedType::BSON:
+ return "BSON";
+ case ConvertedType::INTERVAL:
+ return "INTERVAL";
+ case ConvertedType::UNDEFINED:
+ default:
+ return "UNKNOWN";
+ }
+}
+
+int GetTypeByteSize(Type::type parquet_type) {
+ switch (parquet_type) {
+ case Type::BOOLEAN:
+ return type_traits<BooleanType::type_num>::value_byte_size;
+ case Type::INT32:
+ return type_traits<Int32Type::type_num>::value_byte_size;
+ case Type::INT64:
+ return type_traits<Int64Type::type_num>::value_byte_size;
+ case Type::INT96:
+ return type_traits<Int96Type::type_num>::value_byte_size;
+ case Type::DOUBLE:
+ return type_traits<DoubleType::type_num>::value_byte_size;
+ case Type::FLOAT:
+ return type_traits<FloatType::type_num>::value_byte_size;
+ case Type::BYTE_ARRAY:
+ return type_traits<ByteArrayType::type_num>::value_byte_size;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return type_traits<FLBAType::type_num>::value_byte_size;
+ case Type::UNDEFINED:
+ default:
+ return 0;
+ }
+ return 0;
+}
+
+// Return the default sort order for a Parquet physical type
+SortOrder::type DefaultSortOrder(Type::type primitive) {
+ switch (primitive) {
+ case Type::BOOLEAN:
+ case Type::INT32:
+ case Type::INT64:
+ case Type::FLOAT:
+ case Type::DOUBLE:
+ return SortOrder::SIGNED;
+ case Type::BYTE_ARRAY:
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return SortOrder::UNSIGNED;
+ case Type::INT96:
+ case Type::UNDEFINED:
+ return SortOrder::UNKNOWN;
+ }
+ return SortOrder::UNKNOWN;
+}
+
+// Return the sort order for a Parquet type: use the converted type's order if
+// one is set, otherwise fall back to the physical type's default
+SortOrder::type GetSortOrder(ConvertedType::type converted, Type::type primitive) {
+ if (converted == ConvertedType::NONE) return DefaultSortOrder(primitive);
+ switch (converted) {
+ case ConvertedType::INT_8:
+ case ConvertedType::INT_16:
+ case ConvertedType::INT_32:
+ case ConvertedType::INT_64:
+ case ConvertedType::DATE:
+ case ConvertedType::TIME_MICROS:
+ case ConvertedType::TIME_MILLIS:
+ case ConvertedType::TIMESTAMP_MICROS:
+ case ConvertedType::TIMESTAMP_MILLIS:
+ return SortOrder::SIGNED;
+ case ConvertedType::UINT_8:
+ case ConvertedType::UINT_16:
+ case ConvertedType::UINT_32:
+ case ConvertedType::UINT_64:
+ case ConvertedType::ENUM:
+ case ConvertedType::UTF8:
+ case ConvertedType::BSON:
+ case ConvertedType::JSON:
+ return SortOrder::UNSIGNED;
+ case ConvertedType::DECIMAL:
+ case ConvertedType::LIST:
+ case ConvertedType::MAP:
+ case ConvertedType::MAP_KEY_VALUE:
+ case ConvertedType::INTERVAL:
+ case ConvertedType::NONE: // required instead of default
+ case ConvertedType::NA: // required instead of default
+ case ConvertedType::UNDEFINED:
+ return SortOrder::UNKNOWN;
+ }
+ return SortOrder::UNKNOWN;
+}
+
+SortOrder::type GetSortOrder(const std::shared_ptr<const LogicalType>& logical_type,
+ Type::type primitive) {
+ SortOrder::type o = SortOrder::UNKNOWN;
+ if (logical_type && logical_type->is_valid()) {
+ o = (logical_type->is_none() ? DefaultSortOrder(primitive)
+ : logical_type->sort_order());
+ }
+ return o;
+}
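+
+// For example (illustrative): a UTF8-annotated column compares unsigned, while
+// a plain INT32 column uses its signed default:
+//
+//   GetSortOrder(ConvertedType::UTF8, Type::BYTE_ARRAY);  // SortOrder::UNSIGNED
+//   DefaultSortOrder(Type::INT32);                        // SortOrder::SIGNED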
+
+ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED);
+ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER);
+
+// Static methods for LogicalType class
+
+std::shared_ptr<const LogicalType> LogicalType::FromConvertedType(
+ const ConvertedType::type converted_type,
+ const schema::DecimalMetadata converted_decimal_metadata) {
+ switch (converted_type) {
+ case ConvertedType::UTF8:
+ return StringLogicalType::Make();
+ case ConvertedType::MAP_KEY_VALUE:
+ case ConvertedType::MAP:
+ return MapLogicalType::Make();
+ case ConvertedType::LIST:
+ return ListLogicalType::Make();
+ case ConvertedType::ENUM:
+ return EnumLogicalType::Make();
+ case ConvertedType::DECIMAL:
+ return DecimalLogicalType::Make(converted_decimal_metadata.precision,
+ converted_decimal_metadata.scale);
+ case ConvertedType::DATE:
+ return DateLogicalType::Make();
+ case ConvertedType::TIME_MILLIS:
+ return TimeLogicalType::Make(true, LogicalType::TimeUnit::MILLIS);
+ case ConvertedType::TIME_MICROS:
+ return TimeLogicalType::Make(true, LogicalType::TimeUnit::MICROS);
+ case ConvertedType::TIMESTAMP_MILLIS:
+ return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MILLIS,
+ /*is_from_converted_type=*/true,
+ /*force_set_converted_type=*/false);
+ case ConvertedType::TIMESTAMP_MICROS:
+ return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MICROS,
+ /*is_from_converted_type=*/true,
+ /*force_set_converted_type=*/false);
+ case ConvertedType::INTERVAL:
+ return IntervalLogicalType::Make();
+ case ConvertedType::INT_8:
+ return IntLogicalType::Make(8, true);
+ case ConvertedType::INT_16:
+ return IntLogicalType::Make(16, true);
+ case ConvertedType::INT_32:
+ return IntLogicalType::Make(32, true);
+ case ConvertedType::INT_64:
+ return IntLogicalType::Make(64, true);
+ case ConvertedType::UINT_8:
+ return IntLogicalType::Make(8, false);
+ case ConvertedType::UINT_16:
+ return IntLogicalType::Make(16, false);
+ case ConvertedType::UINT_32:
+ return IntLogicalType::Make(32, false);
+ case ConvertedType::UINT_64:
+ return IntLogicalType::Make(64, false);
+ case ConvertedType::JSON:
+ return JSONLogicalType::Make();
+ case ConvertedType::BSON:
+ return BSONLogicalType::Make();
+ case ConvertedType::NA:
+ return NullLogicalType::Make();
+ case ConvertedType::NONE:
+ return NoLogicalType::Make();
+ case ConvertedType::UNDEFINED:
+ return UndefinedLogicalType::Make();
+ }
+ return UndefinedLogicalType::Make();
+}
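+
+// Example (illustrative): the legacy TIMESTAMP_MICROS converted type maps to a
+// UTC-adjusted, microsecond-resolution Timestamp logical type:
+//
+//   auto lt = LogicalType::FromConvertedType(ConvertedType::TIMESTAMP_MICROS);
+//   // lt->is_timestamp() -> true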
+
+std::shared_ptr<const LogicalType> LogicalType::FromThrift(
+ const format::LogicalType& type) {
+ if (type.__isset.STRING) {
+ return StringLogicalType::Make();
+ } else if (type.__isset.MAP) {
+ return MapLogicalType::Make();
+ } else if (type.__isset.LIST) {
+ return ListLogicalType::Make();
+ } else if (type.__isset.ENUM) {
+ return EnumLogicalType::Make();
+ } else if (type.__isset.DECIMAL) {
+ return DecimalLogicalType::Make(type.DECIMAL.precision, type.DECIMAL.scale);
+ } else if (type.__isset.DATE) {
+ return DateLogicalType::Make();
+ } else if (type.__isset.TIME) {
+ LogicalType::TimeUnit::unit unit;
+ if (type.TIME.unit.__isset.MILLIS) {
+ unit = LogicalType::TimeUnit::MILLIS;
+ } else if (type.TIME.unit.__isset.MICROS) {
+ unit = LogicalType::TimeUnit::MICROS;
+ } else if (type.TIME.unit.__isset.NANOS) {
+ unit = LogicalType::TimeUnit::NANOS;
+ } else {
+ unit = LogicalType::TimeUnit::UNKNOWN;
+ }
+ return TimeLogicalType::Make(type.TIME.isAdjustedToUTC, unit);
+ } else if (type.__isset.TIMESTAMP) {
+ LogicalType::TimeUnit::unit unit;
+ if (type.TIMESTAMP.unit.__isset.MILLIS) {
+ unit = LogicalType::TimeUnit::MILLIS;
+ } else if (type.TIMESTAMP.unit.__isset.MICROS) {
+ unit = LogicalType::TimeUnit::MICROS;
+ } else if (type.TIMESTAMP.unit.__isset.NANOS) {
+ unit = LogicalType::TimeUnit::NANOS;
+ } else {
+ unit = LogicalType::TimeUnit::UNKNOWN;
+ }
+ return TimestampLogicalType::Make(type.TIMESTAMP.isAdjustedToUTC, unit);
+ // TODO(tpboudreau): activate the commented code after parquet.thrift
+ // recognizes IntervalType as a LogicalType
+ //} else if (type.__isset.INTERVAL) {
+ // return IntervalLogicalType::Make();
+ } else if (type.__isset.INTEGER) {
+ return IntLogicalType::Make(static_cast<int>(type.INTEGER.bitWidth),
+ type.INTEGER.isSigned);
+ } else if (type.__isset.UNKNOWN) {
+ return NullLogicalType::Make();
+ } else if (type.__isset.JSON) {
+ return JSONLogicalType::Make();
+ } else if (type.__isset.BSON) {
+ return BSONLogicalType::Make();
+ } else if (type.__isset.UUID) {
+ return UUIDLogicalType::Make();
+ } else {
+ throw ParquetException("Metadata contains Thrift LogicalType that is not recognized");
+ }
+}
+
+std::shared_ptr<const LogicalType> LogicalType::String() {
+ return StringLogicalType::Make();
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Map() { return MapLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::List() { return ListLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::Enum() { return EnumLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::Decimal(int32_t precision,
+ int32_t scale) {
+ return DecimalLogicalType::Make(precision, scale);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Date() { return DateLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::Time(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
+ DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
+ return TimeLogicalType::Make(is_adjusted_to_utc, time_unit);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Timestamp(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type, bool force_set_converted_type) {
+ DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
+ return TimestampLogicalType::Make(is_adjusted_to_utc, time_unit, is_from_converted_type,
+ force_set_converted_type);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Interval() {
+ return IntervalLogicalType::Make();
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Int(int bit_width, bool is_signed) {
+ DCHECK(bit_width == 64 || bit_width == 32 || bit_width == 16 || bit_width == 8);
+ return IntLogicalType::Make(bit_width, is_signed);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Null() { return NullLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::JSON() { return JSONLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::BSON() { return BSONLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::UUID() { return UUIDLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::None() { return NoLogicalType::Make(); }
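+
+// Example (illustrative): constructing logical types through these factories:
+//
+//   auto dec = LogicalType::Decimal(/*precision=*/10, /*scale=*/2);
+//   auto ts = LogicalType::Timestamp(/*is_adjusted_to_utc=*/true,
+//                                    LogicalType::TimeUnit::MICROS);
+//   // dec->is_applicable(Type::INT64) -> true (precision 10 fits in INT64)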
+
+/*
+ * The logical type implementation classes are built in four layers: (1) the base
+ * layer, which establishes the interface and provides generally reusable implementations
+ * for the ToJSON() and Equals() methods; (2) an intermediate derived layer for the
+ * "compatibility" methods, which provides implementations for is_compatible() and
+ * ToConvertedType(); (3) another intermediate layer for the "applicability" methods
+ * that provides several implementations for the is_applicable() method; and (4) the
+ * final derived classes, one for each logical type, which supply implementations
+ * for those methods that remain virtual (usually just ToString() and ToThrift()) or
+ * otherwise need to be overridden.
+ */
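+
+// For instance (anticipating the code below): LogicalType::Impl::String
+// combines SimpleCompatible (layer 2) and SimpleApplicable (layer 3) and
+// overrides only ToString() and ToThrift(), while a parameterized type such as
+// Decimal derives from the bare Compatible/Applicable layers and reimplements
+// all of the virtual methods itself.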
+
+// LogicalTypeImpl base class
+
+class LogicalType::Impl {
+ public:
+ virtual bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const = 0;
+
+ virtual bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata = {
+ false, -1, -1}) const = 0;
+
+ virtual ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const = 0;
+
+ virtual std::string ToString() const = 0;
+
+ virtual bool is_serialized() const {
+ return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNDEFINED);
+ }
+
+ virtual std::string ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": ")" << ToString() << R"("})";
+ return json.str();
+ }
+
+ virtual format::LogicalType ToThrift() const {
+ // logical types inheriting this method should never be serialized
+ std::stringstream ss;
+ ss << "Logical type " << ToString() << " should not be serialized";
+ throw ParquetException(ss.str());
+ }
+
+ virtual bool Equals(const LogicalType& other) const { return other.type() == type_; }
+
+ LogicalType::Type::type type() const { return type_; }
+
+ SortOrder::type sort_order() const { return order_; }
+
+ Impl(const Impl&) = delete;
+ Impl& operator=(const Impl&) = delete;
+ virtual ~Impl() noexcept {}
+
+ class Compatible;
+ class SimpleCompatible;
+ class Incompatible;
+
+ class Applicable;
+ class SimpleApplicable;
+ class TypeLengthApplicable;
+ class UniversalApplicable;
+ class Inapplicable;
+
+ class String;
+ class Map;
+ class List;
+ class Enum;
+ class Decimal;
+ class Date;
+ class Time;
+ class Timestamp;
+ class Interval;
+ class Int;
+ class Null;
+ class JSON;
+ class BSON;
+ class UUID;
+ class No;
+ class Undefined;
+
+ protected:
+ Impl(LogicalType::Type::type t, SortOrder::type o) : type_(t), order_(o) {}
+ Impl() = default;
+
+ private:
+ LogicalType::Type::type type_ = LogicalType::Type::UNDEFINED;
+ SortOrder::type order_ = SortOrder::UNKNOWN;
+};
+
+// Special methods for public LogicalType class
+
+LogicalType::LogicalType() = default;
+LogicalType::~LogicalType() noexcept = default;
+
+// Delegating methods for public LogicalType class
+
+bool LogicalType::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ return impl_->is_applicable(primitive_type, primitive_length);
+}
+
+bool LogicalType::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ return impl_->is_compatible(converted_type, converted_decimal_metadata);
+}
+
+ConvertedType::type LogicalType::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ return impl_->ToConvertedType(out_decimal_metadata);
+}
+
+std::string LogicalType::ToString() const { return impl_->ToString(); }
+
+std::string LogicalType::ToJSON() const { return impl_->ToJSON(); }
+
+format::LogicalType LogicalType::ToThrift() const { return impl_->ToThrift(); }
+
+bool LogicalType::Equals(const LogicalType& other) const { return impl_->Equals(other); }
+
+LogicalType::Type::type LogicalType::type() const { return impl_->type(); }
+
+SortOrder::type LogicalType::sort_order() const { return impl_->sort_order(); }
+
+// Type checks for public LogicalType class
+
+bool LogicalType::is_string() const { return impl_->type() == LogicalType::Type::STRING; }
+bool LogicalType::is_map() const { return impl_->type() == LogicalType::Type::MAP; }
+bool LogicalType::is_list() const { return impl_->type() == LogicalType::Type::LIST; }
+bool LogicalType::is_enum() const { return impl_->type() == LogicalType::Type::ENUM; }
+bool LogicalType::is_decimal() const {
+ return impl_->type() == LogicalType::Type::DECIMAL;
+}
+bool LogicalType::is_date() const { return impl_->type() == LogicalType::Type::DATE; }
+bool LogicalType::is_time() const { return impl_->type() == LogicalType::Type::TIME; }
+bool LogicalType::is_timestamp() const {
+ return impl_->type() == LogicalType::Type::TIMESTAMP;
+}
+bool LogicalType::is_interval() const {
+ return impl_->type() == LogicalType::Type::INTERVAL;
+}
+bool LogicalType::is_int() const { return impl_->type() == LogicalType::Type::INT; }
+bool LogicalType::is_null() const { return impl_->type() == LogicalType::Type::NIL; }
+bool LogicalType::is_JSON() const { return impl_->type() == LogicalType::Type::JSON; }
+bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; }
+bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; }
+bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; }
+bool LogicalType::is_valid() const {
+ return impl_->type() != LogicalType::Type::UNDEFINED;
+}
+bool LogicalType::is_invalid() const { return !is_valid(); }
+bool LogicalType::is_nested() const {
+ return (impl_->type() == LogicalType::Type::LIST) ||
+ (impl_->type() == LogicalType::Type::MAP);
+}
+bool LogicalType::is_nonnested() const { return !is_nested(); }
+bool LogicalType::is_serialized() const { return impl_->is_serialized(); }
+
+// LogicalTypeImpl intermediate "compatibility" classes
+
+class LogicalType::Impl::Compatible : public virtual LogicalType::Impl {
+ protected:
+ Compatible() = default;
+};
+
+#define set_decimal_metadata(m___, i___, p___, s___) \
+ { \
+ if (m___) { \
+ (m___)->isset = (i___); \
+ (m___)->scale = (s___); \
+ (m___)->precision = (p___); \
+ } \
+ }
+
+#define reset_decimal_metadata(m___) \
+ { set_decimal_metadata(m___, false, -1, -1); }
+
+// For logical types that always translate to the same converted type
+class LogicalType::Impl::SimpleCompatible : public virtual LogicalType::Impl::Compatible {
+ public:
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override {
+ return (converted_type == converted_type_) && !converted_decimal_metadata.isset;
+ }
+
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override {
+ reset_decimal_metadata(out_decimal_metadata);
+ return converted_type_;
+ }
+
+ protected:
+ explicit SimpleCompatible(ConvertedType::type c) : converted_type_(c) {}
+
+ private:
+ ConvertedType::type converted_type_ = ConvertedType::NA;
+};
+
+// For logical types that have no corresponding converted type
+class LogicalType::Impl::Incompatible : public virtual LogicalType::Impl {
+ public:
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override {
+ return (converted_type == ConvertedType::NONE ||
+ converted_type == ConvertedType::NA) &&
+ !converted_decimal_metadata.isset;
+ }
+
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override {
+ reset_decimal_metadata(out_decimal_metadata);
+ return ConvertedType::NONE;
+ }
+
+ protected:
+ Incompatible() = default;
+};
+
+// LogicalTypeImpl intermediate "applicability" classes
+
+class LogicalType::Impl::Applicable : public virtual LogicalType::Impl {
+ protected:
+ Applicable() = default;
+};
+
+// For logical types that can apply only to a single
+// physical type
+class LogicalType::Impl::SimpleApplicable : public virtual LogicalType::Impl::Applicable {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return primitive_type == type_;
+ }
+
+ protected:
+ explicit SimpleApplicable(parquet::Type::type t) : type_(t) {}
+
+ private:
+ parquet::Type::type type_;
+};
+
+// For logical types that can apply only to a particular
+// physical type and physical length combination
+class LogicalType::Impl::TypeLengthApplicable
+ : public virtual LogicalType::Impl::Applicable {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return primitive_type == type_ && primitive_length == length_;
+ }
+
+ protected:
+ TypeLengthApplicable(parquet::Type::type t, int32_t l) : type_(t), length_(l) {}
+
+ private:
+ parquet::Type::type type_;
+ int32_t length_;
+};
+
+// For logical types that can apply to any physical type
+class LogicalType::Impl::UniversalApplicable
+ : public virtual LogicalType::Impl::Applicable {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return true;
+ }
+
+ protected:
+ UniversalApplicable() = default;
+};
+
+// For logical types that can never apply to any primitive
+// physical type
+class LogicalType::Impl::Inapplicable : public virtual LogicalType::Impl {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return false;
+ }
+
+ protected:
+ Inapplicable() = default;
+};
+
+// LogicalType implementation final classes
+
+#define OVERRIDE_TOSTRING(n___) \
+ std::string ToString() const override { return #n___; }
+
+#define OVERRIDE_TOTHRIFT(t___, s___) \
+ format::LogicalType ToThrift() const override { \
+ format::LogicalType type; \
+ format::t___ subtype; \
+ type.__set_##s___(subtype); \
+ return type; \
+ }
+
+class LogicalType::Impl::String final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class StringLogicalType;
+
+ OVERRIDE_TOSTRING(String)
+ OVERRIDE_TOTHRIFT(StringType, STRING)
+
+ private:
+ String()
+ : LogicalType::Impl(LogicalType::Type::STRING, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::UTF8),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+// Each public logical type class's Make() creation method instantiates a corresponding
+// LogicalType::Impl::* object and installs that implementation in the logical type
+// it returns.
+
+#define GENERATE_MAKE(a___) \
+ std::shared_ptr<const LogicalType> a___##LogicalType::Make() { \
+ auto* logical_type = new a___##LogicalType(); \
+ logical_type->impl_.reset(new LogicalType::Impl::a___()); \
+ return std::shared_ptr<const LogicalType>(logical_type); \
+ }
+
+GENERATE_MAKE(String)
+
+class LogicalType::Impl::Map final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::Inapplicable {
+ public:
+ friend class MapLogicalType;
+
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override {
+ return (converted_type == ConvertedType::MAP ||
+ converted_type == ConvertedType::MAP_KEY_VALUE) &&
+ !converted_decimal_metadata.isset;
+ }
+
+ OVERRIDE_TOSTRING(Map)
+ OVERRIDE_TOTHRIFT(MapType, MAP)
+
+ private:
+ Map()
+ : LogicalType::Impl(LogicalType::Type::MAP, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::MAP) {}
+};
+
+GENERATE_MAKE(Map)
+
+class LogicalType::Impl::List final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::Inapplicable {
+ public:
+ friend class ListLogicalType;
+
+ OVERRIDE_TOSTRING(List)
+ OVERRIDE_TOTHRIFT(ListType, LIST)
+
+ private:
+ List()
+ : LogicalType::Impl(LogicalType::Type::LIST, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::LIST) {}
+};
+
+GENERATE_MAKE(List)
+
+class LogicalType::Impl::Enum final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class EnumLogicalType;
+
+ OVERRIDE_TOSTRING(Enum)
+ OVERRIDE_TOTHRIFT(EnumType, ENUM)
+
+ private:
+ Enum()
+ : LogicalType::Impl(LogicalType::Type::ENUM, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::ENUM),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+GENERATE_MAKE(Enum)
+
+// The parameterized logical types (currently Decimal, Time, Timestamp, and Int)
+// generally can't reuse the simple method implementations available in the base and
+// intermediate classes and must (re)implement them all
+
+class LogicalType::Impl::Decimal final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::Applicable {
+ public:
+ friend class DecimalLogicalType;
+
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ int32_t precision() const { return precision_; }
+ int32_t scale() const { return scale_; }
+
+ private:
+ Decimal(int32_t p, int32_t s)
+ : LogicalType::Impl(LogicalType::Type::DECIMAL, SortOrder::SIGNED),
+ precision_(p),
+ scale_(s) {}
+ int32_t precision_ = -1;
+ int32_t scale_ = -1;
+};
+
+bool LogicalType::Impl::Decimal::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ bool ok = false;
+ switch (primitive_type) {
+ case parquet::Type::INT32: {
+ ok = (1 <= precision_) && (precision_ <= 9);
+ } break;
+ case parquet::Type::INT64: {
+ ok = (1 <= precision_) && (precision_ <= 18);
+ if (precision_ < 10) {
+ // FIXME(tpb): warn that INT32 could be used
+ }
+ } break;
+ case parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ ok = precision_ <= static_cast<int32_t>(std::floor(
+ std::log10(std::pow(2.0, (8.0 * primitive_length) - 1.0))));
+ } break;
+ case parquet::Type::BYTE_ARRAY: {
+ ok = true;
+ } break;
+ default: {
+ } break;
+ }
+ return ok;
+}
+
+bool LogicalType::Impl::Decimal::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ return converted_type == ConvertedType::DECIMAL &&
+ (converted_decimal_metadata.isset &&
+ converted_decimal_metadata.scale == scale_ &&
+ converted_decimal_metadata.precision == precision_);
+}
+
+ConvertedType::type LogicalType::Impl::Decimal::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ set_decimal_metadata(out_decimal_metadata, true, precision_, scale_);
+ return ConvertedType::DECIMAL;
+}
+
+std::string LogicalType::Impl::Decimal::ToString() const {
+ std::stringstream type;
+ type << "Decimal(precision=" << precision_ << ", scale=" << scale_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Decimal::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Decimal", "precision": )" << precision_ << R"(, "scale": )"
+ << scale_ << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Decimal::ToThrift() const {
+ format::LogicalType type;
+ format::DecimalType decimal_type;
+ decimal_type.__set_precision(precision_);
+ decimal_type.__set_scale(scale_);
+ type.__set_DECIMAL(decimal_type);
+ return type;
+}
+
+bool LogicalType::Impl::Decimal::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_decimal()) {
+ const auto& other_decimal = checked_cast<const DecimalLogicalType&>(other);
+ eq = (precision_ == other_decimal.precision() && scale_ == other_decimal.scale());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> DecimalLogicalType::Make(int32_t precision,
+ int32_t scale) {
+ if (precision < 1) {
+ throw ParquetException(
+ "Precision must be greater than or equal to 1 for Decimal logical type");
+ }
+ if (scale < 0 || scale > precision) {
+ throw ParquetException(
+ "Scale must be a non-negative integer that does not exceed precision for "
+ "Decimal logical type");
+ }
+ auto* logical_type = new DecimalLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Decimal(precision, scale));
+ return std::shared_ptr<const LogicalType>(logical_type);
+}
+
+int32_t DecimalLogicalType::precision() const {
+ return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).precision();
+}
+
+int32_t DecimalLogicalType::scale() const {
+ return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).scale();
+}
+
+class LogicalType::Impl::Date final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class DateLogicalType;
+
+ OVERRIDE_TOSTRING(Date)
+ OVERRIDE_TOTHRIFT(DateType, DATE)
+
+ private:
+ Date()
+ : LogicalType::Impl(LogicalType::Type::DATE, SortOrder::SIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::DATE),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::INT32) {}
+};
+
+GENERATE_MAKE(Date)
+
+#define time_unit_string(u___) \
+ ((u___) == LogicalType::TimeUnit::MILLIS \
+ ? "milliseconds" \
+ : ((u___) == LogicalType::TimeUnit::MICROS \
+ ? "microseconds" \
+ : ((u___) == LogicalType::TimeUnit::NANOS ? "nanoseconds" : "unknown")))
+
+class LogicalType::Impl::Time final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::Applicable {
+ public:
+ friend class TimeLogicalType;
+
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ bool is_adjusted_to_utc() const { return adjusted_; }
+ LogicalType::TimeUnit::unit time_unit() const { return unit_; }
+
+ private:
+ Time(bool a, LogicalType::TimeUnit::unit u)
+ : LogicalType::Impl(LogicalType::Type::TIME, SortOrder::SIGNED),
+ adjusted_(a),
+ unit_(u) {}
+ bool adjusted_ = false;
+ LogicalType::TimeUnit::unit unit_;
+};
+
+bool LogicalType::Impl::Time::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ return (primitive_type == parquet::Type::INT32 &&
+ unit_ == LogicalType::TimeUnit::MILLIS) ||
+ (primitive_type == parquet::Type::INT64 &&
+ (unit_ == LogicalType::TimeUnit::MICROS ||
+ unit_ == LogicalType::TimeUnit::NANOS));
+}
+
+bool LogicalType::Impl::Time::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ if (converted_decimal_metadata.isset) {
+ return false;
+ } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MILLIS) {
+ return converted_type == ConvertedType::TIME_MILLIS;
+ } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MICROS) {
+ return converted_type == ConvertedType::TIME_MICROS;
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+}
+
+ConvertedType::type LogicalType::Impl::Time::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ reset_decimal_metadata(out_decimal_metadata);
+ if (adjusted_) {
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ return ConvertedType::TIME_MILLIS;
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ return ConvertedType::TIME_MICROS;
+ }
+ }
+ return ConvertedType::NONE;
+}
+
+std::string LogicalType::Impl::Time::ToString() const {
+ std::stringstream type;
+ type << "Time(isAdjustedToUTC=" << std::boolalpha << adjusted_
+ << ", timeUnit=" << time_unit_string(unit_) << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Time::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Time", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
+ << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"("})";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Time::ToThrift() const {
+ format::LogicalType type;
+ format::TimeType time_type;
+ format::TimeUnit time_unit;
+ DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ format::MilliSeconds millis;
+ time_unit.__set_MILLIS(millis);
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ format::MicroSeconds micros;
+ time_unit.__set_MICROS(micros);
+ } else if (unit_ == LogicalType::TimeUnit::NANOS) {
+ format::NanoSeconds nanos;
+ time_unit.__set_NANOS(nanos);
+ }
+ time_type.__set_isAdjustedToUTC(adjusted_);
+ time_type.__set_unit(time_unit);
+ type.__set_TIME(time_type);
+ return type;
+}
+
+bool LogicalType::Impl::Time::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_time()) {
+ const auto& other_time = checked_cast<const TimeLogicalType&>(other);
+ eq =
+ (adjusted_ == other_time.is_adjusted_to_utc() && unit_ == other_time.time_unit());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> TimeLogicalType::Make(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
+ if (time_unit == LogicalType::TimeUnit::MILLIS ||
+ time_unit == LogicalType::TimeUnit::MICROS ||
+ time_unit == LogicalType::TimeUnit::NANOS) {
+ auto* logical_type = new TimeLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Time(is_adjusted_to_utc, time_unit));
+ return std::shared_ptr<const LogicalType>(logical_type);
+ } else {
+ throw ParquetException(
+ "TimeUnit must be one of MILLIS, MICROS, or NANOS for Time logical type");
+ }
+}
+
+bool TimeLogicalType::is_adjusted_to_utc() const {
+ return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).is_adjusted_to_utc();
+}
+
+LogicalType::TimeUnit::unit TimeLogicalType::time_unit() const {
+ return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).time_unit();
+}
+
+class LogicalType::Impl::Timestamp final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class TimestampLogicalType;
+
+ bool is_serialized() const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ bool is_adjusted_to_utc() const { return adjusted_; }
+ LogicalType::TimeUnit::unit time_unit() const { return unit_; }
+
+ bool is_from_converted_type() const { return is_from_converted_type_; }
+ bool force_set_converted_type() const { return force_set_converted_type_; }
+
+ private:
+ Timestamp(bool adjusted, LogicalType::TimeUnit::unit unit, bool is_from_converted_type,
+ bool force_set_converted_type)
+ : LogicalType::Impl(LogicalType::Type::TIMESTAMP, SortOrder::SIGNED),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::INT64),
+ adjusted_(adjusted),
+ unit_(unit),
+ is_from_converted_type_(is_from_converted_type),
+ force_set_converted_type_(force_set_converted_type) {}
+ bool adjusted_ = false;
+ LogicalType::TimeUnit::unit unit_;
+ bool is_from_converted_type_ = false;
+ bool force_set_converted_type_ = false;
+};
+
+bool LogicalType::Impl::Timestamp::is_serialized() const {
+ return !is_from_converted_type_;
+}
+
+bool LogicalType::Impl::Timestamp::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ if (converted_decimal_metadata.isset) {
+ return false;
+ } else if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ if (adjusted_ || force_set_converted_type_) {
+ return converted_type == ConvertedType::TIMESTAMP_MILLIS;
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ if (adjusted_ || force_set_converted_type_) {
+ return converted_type == ConvertedType::TIMESTAMP_MICROS;
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+}
+
+ConvertedType::type LogicalType::Impl::Timestamp::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ reset_decimal_metadata(out_decimal_metadata);
+ if (adjusted_ || force_set_converted_type_) {
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ return ConvertedType::TIMESTAMP_MILLIS;
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ return ConvertedType::TIMESTAMP_MICROS;
+ }
+ }
+ return ConvertedType::NONE;
+}
+
+std::string LogicalType::Impl::Timestamp::ToString() const {
+ std::stringstream type;
+ type << "Timestamp(isAdjustedToUTC=" << std::boolalpha << adjusted_
+ << ", timeUnit=" << time_unit_string(unit_)
+ << ", is_from_converted_type=" << is_from_converted_type_
+ << ", force_set_converted_type=" << force_set_converted_type_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Timestamp::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Timestamp", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
+ << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"(")"
+ << R"(, "is_from_converted_type": )" << is_from_converted_type_
+ << R"(, "force_set_converted_type": )" << force_set_converted_type_ << R"(})";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Timestamp::ToThrift() const {
+ format::LogicalType type;
+ format::TimestampType timestamp_type;
+ format::TimeUnit time_unit;
+ DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ format::MilliSeconds millis;
+ time_unit.__set_MILLIS(millis);
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ format::MicroSeconds micros;
+ time_unit.__set_MICROS(micros);
+ } else if (unit_ == LogicalType::TimeUnit::NANOS) {
+ format::NanoSeconds nanos;
+ time_unit.__set_NANOS(nanos);
+ }
+ timestamp_type.__set_isAdjustedToUTC(adjusted_);
+ timestamp_type.__set_unit(time_unit);
+ type.__set_TIMESTAMP(timestamp_type);
+ return type;
+}
+
+bool LogicalType::Impl::Timestamp::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_timestamp()) {
+ const auto& other_timestamp = checked_cast<const TimestampLogicalType&>(other);
+ eq = (adjusted_ == other_timestamp.is_adjusted_to_utc() &&
+ unit_ == other_timestamp.time_unit());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> TimestampLogicalType::Make(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type, bool force_set_converted_type) {
+ if (time_unit == LogicalType::TimeUnit::MILLIS ||
+ time_unit == LogicalType::TimeUnit::MICROS ||
+ time_unit == LogicalType::TimeUnit::NANOS) {
+ auto* logical_type = new TimestampLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Timestamp(
+ is_adjusted_to_utc, time_unit, is_from_converted_type, force_set_converted_type));
+ return std::shared_ptr<const LogicalType>(logical_type);
+ } else {
+ throw ParquetException(
+ "TimeUnit must be one of MILLIS, MICROS, or NANOS for Timestamp logical type");
+ }
+}
+
+bool TimestampLogicalType::is_adjusted_to_utc() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).is_adjusted_to_utc();
+}
+
+LogicalType::TimeUnit::unit TimestampLogicalType::time_unit() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).time_unit();
+}
+
+bool TimestampLogicalType::is_from_converted_type() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
+ .is_from_converted_type();
+}
+
+bool TimestampLogicalType::force_set_converted_type() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
+ .force_set_converted_type();
+}
+
+class LogicalType::Impl::Interval final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::TypeLengthApplicable {
+ public:
+ friend class IntervalLogicalType;
+
+ OVERRIDE_TOSTRING(Interval)
+ // TODO(tpboudreau): uncomment the following line to enable serialization after
+ // parquet.thrift recognizes IntervalType as a ConvertedType
+ // OVERRIDE_TOTHRIFT(IntervalType, INTERVAL)
+
+ private:
+ Interval()
+ : LogicalType::Impl(LogicalType::Type::INTERVAL, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::INTERVAL),
+ LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 12) {
+ }
+};
+
+GENERATE_MAKE(Interval)
+
+class LogicalType::Impl::Int final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::Applicable {
+ public:
+ friend class IntLogicalType;
+
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ int bit_width() const { return width_; }
+ bool is_signed() const { return signed_; }
+
+ private:
+ Int(int w, bool s)
+ : LogicalType::Impl(LogicalType::Type::INT,
+ (s ? SortOrder::SIGNED : SortOrder::UNSIGNED)),
+ width_(w),
+ signed_(s) {}
+ int width_ = 0;
+ bool signed_ = false;
+};
+
+bool LogicalType::Impl::Int::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ return (primitive_type == parquet::Type::INT32 && width_ <= 32) ||
+ (primitive_type == parquet::Type::INT64 && width_ == 64);
+}
+
+bool LogicalType::Impl::Int::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ if (converted_decimal_metadata.isset) {
+ return false;
+ } else if (signed_ && width_ == 8) {
+ return converted_type == ConvertedType::INT_8;
+ } else if (signed_ && width_ == 16) {
+ return converted_type == ConvertedType::INT_16;
+ } else if (signed_ && width_ == 32) {
+ return converted_type == ConvertedType::INT_32;
+ } else if (signed_ && width_ == 64) {
+ return converted_type == ConvertedType::INT_64;
+ } else if (!signed_ && width_ == 8) {
+ return converted_type == ConvertedType::UINT_8;
+ } else if (!signed_ && width_ == 16) {
+ return converted_type == ConvertedType::UINT_16;
+ } else if (!signed_ && width_ == 32) {
+ return converted_type == ConvertedType::UINT_32;
+ } else if (!signed_ && width_ == 64) {
+ return converted_type == ConvertedType::UINT_64;
+ } else {
+ return false;
+ }
+}
+
+ConvertedType::type LogicalType::Impl::Int::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ reset_decimal_metadata(out_decimal_metadata);
+ if (signed_) {
+ switch (width_) {
+ case 8:
+ return ConvertedType::INT_8;
+ case 16:
+ return ConvertedType::INT_16;
+ case 32:
+ return ConvertedType::INT_32;
+ case 64:
+ return ConvertedType::INT_64;
+ }
+ } else { // unsigned
+ switch (width_) {
+ case 8:
+ return ConvertedType::UINT_8;
+ case 16:
+ return ConvertedType::UINT_16;
+ case 32:
+ return ConvertedType::UINT_32;
+ case 64:
+ return ConvertedType::UINT_64;
+ }
+ }
+ return ConvertedType::NONE;
+}
+
+std::string LogicalType::Impl::Int::ToString() const {
+ std::stringstream type;
+ type << "Int(bitWidth=" << width_ << ", isSigned=" << std::boolalpha << signed_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Int::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Int", "bitWidth": )" << width_ << R"(, "isSigned": )"
+ << std::boolalpha << signed_ << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Int::ToThrift() const {
+ format::LogicalType type;
+ format::IntType int_type;
+ DCHECK(width_ == 64 || width_ == 32 || width_ == 16 || width_ == 8);
+ int_type.__set_bitWidth(static_cast<int8_t>(width_));
+ int_type.__set_isSigned(signed_);
+ type.__set_INTEGER(int_type);
+ return type;
+}
+
+bool LogicalType::Impl::Int::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_int()) {
+ const auto& other_int = checked_cast<const IntLogicalType&>(other);
+ eq = (width_ == other_int.bit_width() && signed_ == other_int.is_signed());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> IntLogicalType::Make(int bit_width, bool is_signed) {
+ if (bit_width == 8 || bit_width == 16 || bit_width == 32 || bit_width == 64) {
+ auto* logical_type = new IntLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Int(bit_width, is_signed));
+ return std::shared_ptr<const LogicalType>(logical_type);
+ } else {
+ throw ParquetException(
+ "Bit width must be exactly 8, 16, 32, or 64 for Int logical type");
+ }
+}
+
+int IntLogicalType::bit_width() const {
+ return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).bit_width();
+}
+
+bool IntLogicalType::is_signed() const {
+ return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).is_signed();
+}
+
+class LogicalType::Impl::Null final : public LogicalType::Impl::Incompatible,
+ public LogicalType::Impl::UniversalApplicable {
+ public:
+ friend class NullLogicalType;
+
+ OVERRIDE_TOSTRING(Null)
+ OVERRIDE_TOTHRIFT(NullType, UNKNOWN)
+
+ private:
+ Null() : LogicalType::Impl(LogicalType::Type::NIL, SortOrder::UNKNOWN) {}
+};
+
+GENERATE_MAKE(Null)
+
+class LogicalType::Impl::JSON final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class JSONLogicalType;
+
+ OVERRIDE_TOSTRING(JSON)
+ OVERRIDE_TOTHRIFT(JsonType, JSON)
+
+ private:
+ JSON()
+ : LogicalType::Impl(LogicalType::Type::JSON, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::JSON),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+GENERATE_MAKE(JSON)
+
+class LogicalType::Impl::BSON final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class BSONLogicalType;
+
+ OVERRIDE_TOSTRING(BSON)
+ OVERRIDE_TOTHRIFT(BsonType, BSON)
+
+ private:
+ BSON()
+ : LogicalType::Impl(LogicalType::Type::BSON, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::BSON),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+GENERATE_MAKE(BSON)
+
+class LogicalType::Impl::UUID final : public LogicalType::Impl::Incompatible,
+ public LogicalType::Impl::TypeLengthApplicable {
+ public:
+ friend class UUIDLogicalType;
+
+ OVERRIDE_TOSTRING(UUID)
+ OVERRIDE_TOTHRIFT(UUIDType, UUID)
+
+ private:
+ UUID()
+ : LogicalType::Impl(LogicalType::Type::UUID, SortOrder::UNSIGNED),
+ LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 16) {
+ }
+};
+
+GENERATE_MAKE(UUID)
+
+class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::UniversalApplicable {
+ public:
+ friend class NoLogicalType;
+
+ OVERRIDE_TOSTRING(None)
+
+ private:
+ No()
+ : LogicalType::Impl(LogicalType::Type::NONE, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::NONE) {}
+};
+
+GENERATE_MAKE(No)
+
+class LogicalType::Impl::Undefined final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::UniversalApplicable {
+ public:
+ friend class UndefinedLogicalType;
+
+ OVERRIDE_TOSTRING(Undefined)
+
+ private:
+ Undefined()
+ : LogicalType::Impl(LogicalType::Type::UNDEFINED, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::UNDEFINED) {}
+};
+
+GENERATE_MAKE(Undefined)
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/types.h b/contrib/libs/apache/arrow/cpp/src/parquet/types.h
new file mode 100644
index 00000000000..c25719830ec
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/types.h
@@ -0,0 +1,765 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/util/string_view.h"
+
+#include "parquet/platform.h"
+#include "parquet/type_fwd.h"
+
+#ifdef _WIN32
+
+// Repetition::OPTIONAL conflicts with a #define, so we undefine it
+#ifdef OPTIONAL
+#undef OPTIONAL
+#endif
+
+#endif // _WIN32
+
+namespace arrow {
+namespace util {
+
+class Codec;
+
+} // namespace util
+} // namespace arrow
+
+namespace parquet {
+
+// ----------------------------------------------------------------------
+// Metadata enums to match Thrift metadata
+//
+// The reason we maintain our own enums is to avoid transitive dependency on
+// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
+// public API. After building parquet-cpp, you should not need to include
+// Thrift headers in your application. The cost is some boilerplate to convert
+// between our types and Parquet's Thrift types.
+//
+// We can also add special values like NONE to distinguish between metadata
+// values being set and not set. As examples, consider ConvertedType and
+// CompressionCodec.
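+//
+// An illustrative (hypothetical) sketch of such a conversion helper; the real
+// boilerplate lives alongside the Thrift (de)serialization code:
+//
+//   format::Type::type ToThriftType(Type::type t) {
+//     return static_cast<format::Type::type>(t);
+//   }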
+
+// Mirrors parquet::Type
+struct Type {
+ enum type {
+ BOOLEAN = 0,
+ INT32 = 1,
+ INT64 = 2,
+ INT96 = 3,
+ FLOAT = 4,
+ DOUBLE = 5,
+ BYTE_ARRAY = 6,
+ FIXED_LEN_BYTE_ARRAY = 7,
+ // Should always be last element.
+ UNDEFINED = 8
+ };
+};
+
+// Mirrors parquet::ConvertedType
+struct ConvertedType {
+ enum type {
+ NONE, // Not a real converted type, but means no converted type is specified
+ UTF8,
+ MAP,
+ MAP_KEY_VALUE,
+ LIST,
+ ENUM,
+ DECIMAL,
+ DATE,
+ TIME_MILLIS,
+ TIME_MICROS,
+ TIMESTAMP_MILLIS,
+ TIMESTAMP_MICROS,
+ UINT_8,
+ UINT_16,
+ UINT_32,
+ UINT_64,
+ INT_8,
+ INT_16,
+ INT_32,
+ INT_64,
+ JSON,
+ BSON,
+ INTERVAL,
+ // DEPRECATED INVALID ConvertedType for all-null data.
+ // Only useful for reading legacy files written out by interim Parquet C++ releases.
+ // For writing, always emit LogicalType::Null instead.
+ // See PARQUET-1990.
+ NA = 25,
+ UNDEFINED = 26 // Not a real converted type; should always be last element
+ };
+};
+
+// forward declaration
+namespace format {
+
+class LogicalType;
+
+}
+
+// Mirrors parquet::FieldRepetitionType
+struct Repetition {
+ enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 };
+};
+
+// Reference:
+// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
+// format/converter/ParquetMetadataConverter.java
+// Sort order for page and column statistics. Types are associated with sort
+// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
+// aggregated using a sort order. As of parquet-format version 2.3.1, the
+// order used to aggregate stats is always SIGNED and is not stored in the
+// Parquet file. These stats are therefore discarded for types whose correct
+// sort order is UNSIGNED. See PARQUET-686.
+struct SortOrder {
+ enum type { SIGNED, UNSIGNED, UNKNOWN };
+};
+
+namespace schema {
+
+struct DecimalMetadata {
+ bool isset;
+ int32_t scale;
+ int32_t precision;
+};
+
+} // namespace schema
+
+/// \brief Implementation of parquet.thrift LogicalType types.
+class PARQUET_EXPORT LogicalType {
+ public:
+ struct Type {
+ enum type {
+ UNDEFINED = 0, // Not a real logical type
+ STRING = 1,
+ MAP,
+ LIST,
+ ENUM,
+ DECIMAL,
+ DATE,
+ TIME,
+ TIMESTAMP,
+ INTERVAL,
+ INT,
+ NIL, // Thrift NullType: annotates data that is always null
+ JSON,
+ BSON,
+ UUID,
+ NONE // Not a real logical type; should always be last element
+ };
+ };
+
+ struct TimeUnit {
+ enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
+ };
+
+ /// \brief If possible, return a logical type equivalent to the given legacy
+ /// converted type (and decimal metadata if applicable).
+ static std::shared_ptr<const LogicalType> FromConvertedType(
+ const parquet::ConvertedType::type converted_type,
+ const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
+ -1});
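+  // For example (illustrative): FromConvertedType(ConvertedType::UTF8) is
+  // expected to be equivalent to LogicalType::String(), and
+  // FromConvertedType(ConvertedType::NONE) to LogicalType::None().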
+
+ /// \brief Return the logical type represented by the Thrift intermediary object.
+ static std::shared_ptr<const LogicalType> FromThrift(
+ const parquet::format::LogicalType& thrift_logical_type);
+
+ /// \brief Return the explicitly requested logical type.
+ static std::shared_ptr<const LogicalType> String();
+ static std::shared_ptr<const LogicalType> Map();
+ static std::shared_ptr<const LogicalType> List();
+ static std::shared_ptr<const LogicalType> Enum();
+ static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
+ static std::shared_ptr<const LogicalType> Date();
+ static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
+ LogicalType::TimeUnit::unit time_unit);
+
+ /// \brief Create a Timestamp logical type
+ /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
+ /// \param[in] time_unit the resolution of the timestamp
+ /// \param[in] is_from_converted_type if true, the timestamp was generated
+ /// by translating a legacy converted type of TIMESTAMP_MILLIS or
+ /// TIMESTAMP_MICROS. Default is false.
+ /// \param[in] force_set_converted_type if true, always set the
+ /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
+  /// metadata. Default is false.
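+  ///
+  /// A minimal usage sketch (illustrative only):
+  /// \code
+  ///   auto ts = LogicalType::Timestamp(/*is_adjusted_to_utc=*/true,
+  ///                                    LogicalType::TimeUnit::MICROS);
+  ///   // expected: ts->is_timestamp() && ts->is_applicable(Type::INT64)
+  /// \endcode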
+ static std::shared_ptr<const LogicalType> Timestamp(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type = false, bool force_set_converted_type = false);
+
+ static std::shared_ptr<const LogicalType> Interval();
+ static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
+
+ /// \brief Create a logical type for data that's always null
+ ///
+ /// Any physical type can be annotated with this logical type.
+ static std::shared_ptr<const LogicalType> Null();
+
+ static std::shared_ptr<const LogicalType> JSON();
+ static std::shared_ptr<const LogicalType> BSON();
+ static std::shared_ptr<const LogicalType> UUID();
+
+ /// \brief Create a placeholder for when no logical type is specified
+ static std::shared_ptr<const LogicalType> None();
+
+ /// \brief Return true if this logical type is consistent with the given underlying
+ /// physical type.
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const;
+
+ /// \brief Return true if this logical type is equivalent to the given legacy converted
+ /// type (and decimal metadata if applicable).
+ bool is_compatible(parquet::ConvertedType::type converted_type,
+ parquet::schema::DecimalMetadata converted_decimal_metadata = {
+ false, -1, -1}) const;
+
+ /// \brief If possible, return the legacy converted type (and decimal metadata if
+ /// applicable) equivalent to this logical type.
+ parquet::ConvertedType::type ToConvertedType(
+ parquet::schema::DecimalMetadata* out_decimal_metadata) const;
+
+ /// \brief Return a printable representation of this logical type.
+ std::string ToString() const;
+
+ /// \brief Return a JSON representation of this logical type.
+ std::string ToJSON() const;
+
+ /// \brief Return a serializable Thrift object for this logical type.
+ parquet::format::LogicalType ToThrift() const;
+
+ /// \brief Return true if the given logical type is equivalent to this logical type.
+ bool Equals(const LogicalType& other) const;
+
+ /// \brief Return the enumerated type of this logical type.
+ LogicalType::Type::type type() const;
+
+ /// \brief Return the appropriate sort order for this logical type.
+ SortOrder::type sort_order() const;
+
+ // Type checks ...
+ bool is_string() const;
+ bool is_map() const;
+ bool is_list() const;
+ bool is_enum() const;
+ bool is_decimal() const;
+ bool is_date() const;
+ bool is_time() const;
+ bool is_timestamp() const;
+ bool is_interval() const;
+ bool is_int() const;
+ bool is_null() const;
+ bool is_JSON() const;
+ bool is_BSON() const;
+ bool is_UUID() const;
+ bool is_none() const;
+ /// \brief Return true if this logical type is of a known type.
+ bool is_valid() const;
+ bool is_invalid() const;
+ /// \brief Return true if this logical type is suitable for a schema GroupNode.
+ bool is_nested() const;
+ bool is_nonnested() const;
+ /// \brief Return true if this logical type is included in the Thrift output for its
+ /// node.
+ bool is_serialized() const;
+
+ LogicalType(const LogicalType&) = delete;
+ LogicalType& operator=(const LogicalType&) = delete;
+ virtual ~LogicalType() noexcept;
+
+ protected:
+ LogicalType();
+
+ class Impl;
+ std::unique_ptr<const Impl> impl_;
+};
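+
+// A minimal round-trip sketch (illustrative only), tying the factories above
+// to the legacy ConvertedType API:
+//
+//   auto t = LogicalType::Decimal(/*precision=*/10, /*scale=*/2);
+//   parquet::schema::DecimalMetadata md;
+//   ConvertedType::type ct = t->ToConvertedType(&md);
+//   // expected: ct == ConvertedType::DECIMAL, md.precision == 10, md.scale == 2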
+
+/// \brief Allowed for physical type BYTE_ARRAY; must be encoded as UTF-8.
+class PARQUET_EXPORT StringLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ StringLogicalType() = default;
+};
+
+/// \brief Allowed for group nodes only.
+class PARQUET_EXPORT MapLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ MapLogicalType() = default;
+};
+
+/// \brief Allowed for group nodes only.
+class PARQUET_EXPORT ListLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ ListLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY; must be encoded as UTF-8.
+class PARQUET_EXPORT EnumLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ EnumLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
+/// depending on the precision.
+class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
+ int32_t precision() const;
+ int32_t scale() const;
+
+ private:
+ DecimalLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32.
+class PARQUET_EXPORT DateLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ DateLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
+class PARQUET_EXPORT TimeLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
+ LogicalType::TimeUnit::unit time_unit);
+ bool is_adjusted_to_utc() const;
+ LogicalType::TimeUnit::unit time_unit() const;
+
+ private:
+ TimeLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT64.
+class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
+ LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type = false,
+ bool force_set_converted_type = false);
+ bool is_adjusted_to_utc() const;
+ LogicalType::TimeUnit::unit time_unit() const;
+
+ /// \brief If true, will not set LogicalType in Thrift metadata
+ bool is_from_converted_type() const;
+
+ /// \brief If true, will set ConvertedType for micros and millis
+ /// resolution in legacy ConvertedType Thrift metadata
+ bool force_set_converted_type() const;
+
+ private:
+ TimestampLogicalType() = default;
+};
+
+/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12.
+class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ IntervalLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
+/// (for bit width 64).
+class PARQUET_EXPORT IntLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
+ int bit_width() const;
+ bool is_signed() const;
+
+ private:
+ IntLogicalType() = default;
+};
+
+/// \brief Allowed for any physical type.
+class PARQUET_EXPORT NullLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ NullLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY.
+class PARQUET_EXPORT JSONLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ JSONLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY.
+class PARQUET_EXPORT BSONLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ BSONLogicalType() = default;
+};
+
+/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16;
+/// must encode raw UUID bytes.
+class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ UUIDLogicalType() = default;
+};
+
+/// \brief Allowed for any physical type.
+class PARQUET_EXPORT NoLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ NoLogicalType() = default;
+};
+
+// Internal API, for unrecognized logical types
+class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ UndefinedLogicalType() = default;
+};
+
+// Data encodings. Mirrors parquet::Encoding
+struct Encoding {
+ enum type {
+ PLAIN = 0,
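+    // Value 1 is reserved: GROUP_VAR_INT was deprecated in parquet.thrift
+    // and never used.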
+ PLAIN_DICTIONARY = 2,
+ RLE = 3,
+ BIT_PACKED = 4,
+ DELTA_BINARY_PACKED = 5,
+ DELTA_LENGTH_BYTE_ARRAY = 6,
+ DELTA_BYTE_ARRAY = 7,
+ RLE_DICTIONARY = 8,
+ BYTE_STREAM_SPLIT = 9,
+ // Should always be last element (except UNKNOWN)
+ UNDEFINED = 10,
+ UNKNOWN = 999
+ };
+};
+
+// Exposed data encodings. These describe the encoding of the data handed to
+// the reader, not the encoding of the data in the file. E.g., data stored as
+// RLE_DICTIONARY in the file can be exposed as raw dictionary indices after
+// RLE decoding, in which case the exposed encoding is DICTIONARY.
+enum class ExposedEncoding {
+ NO_ENCODING = 0, // data is not encoded, i.e. already decoded during reading
+ DICTIONARY = 1
+};
+
+/// \brief Return true if Parquet supports the indicated compression type.
+PARQUET_EXPORT
+bool IsCodecSupported(Compression::type codec);
+
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec);
+
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);
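+
+// A minimal usage sketch (illustrative only):
+//
+//   if (IsCodecSupported(Compression::SNAPPY)) {
+//     std::unique_ptr<Codec> codec = GetCodec(Compression::SNAPPY);
+//   }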
+
+struct ParquetCipher {
+ enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
+};
+
+struct AadMetadata {
+ std::string aad_prefix;
+ std::string aad_file_unique;
+ bool supply_aad_prefix;
+};
+
+struct EncryptionAlgorithm {
+ ParquetCipher::type algorithm;
+ AadMetadata aad;
+};
+
+// Mirrors parquet::PageType
+struct PageType {
+ enum type {
+ DATA_PAGE,
+ INDEX_PAGE,
+ DICTIONARY_PAGE,
+ DATA_PAGE_V2,
+ // Should always be last element
+ UNDEFINED
+ };
+};
+
+class ColumnOrder {
+ public:
+ enum type { UNDEFINED, TYPE_DEFINED_ORDER };
+ explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
+ // Default to Type Defined Order
+ ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}
+ ColumnOrder::type get_order() { return column_order_; }
+
+ static ColumnOrder undefined_;
+ static ColumnOrder type_defined_;
+
+ private:
+ ColumnOrder::type column_order_;
+};
+
+// ----------------------------------------------------------------------
+
+struct ByteArray {
+ ByteArray() : len(0), ptr(NULLPTR) {}
+ ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
+
+ ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion
+ : ByteArray(static_cast<uint32_t>(view.size()),
+ reinterpret_cast<const uint8_t*>(view.data())) {}
+ uint32_t len;
+ const uint8_t* ptr;
+};
+
+inline bool operator==(const ByteArray& left, const ByteArray& right) {
+ return left.len == right.len &&
+ (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
+}
+
+inline bool operator!=(const ByteArray& left, const ByteArray& right) {
+ return !(left == right);
+}
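+
+// A minimal usage sketch (illustrative only):
+//
+//   const uint8_t* bytes = reinterpret_cast<const uint8_t*>("hello");
+//   ByteArray a(5, bytes);
+//   ByteArray b(::arrow::util::string_view("hello"));  // implicit conversion
+//   // a == b: equality compares lengths, then the bytes via std::memcmp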
+
+struct FixedLenByteArray {
+ FixedLenByteArray() : ptr(NULLPTR) {}
+ explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
+ const uint8_t* ptr;
+};
+
+using FLBA = FixedLenByteArray;
+
+// Julian day at the Unix epoch.
+//
+// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
+// the Julian day count starting from noon Universal Time, with Julian day
+// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
+// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
+// calendar).
+constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);
+constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24);
+constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
+constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
+constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
+
+MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
+STRUCT_END(Int96, 12);
+
+inline bool operator==(const Int96& left, const Int96& right) {
+ return std::equal(left.value, left.value + 3, right.value);
+}
+
+inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }
+
+static inline std::string ByteArrayToString(const ByteArray& a) {
+ return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
+}
+
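+// Note (descriptive): this copies only the 8-byte nanosecond count into
+// value[0] and value[1]; the Julian-day word value[2] is left untouched.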
+static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
+ std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
+}
+
+struct DecodedInt96 {
+ uint64_t days_since_epoch;
+ uint64_t nanoseconds;
+};
+
+static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) {
+  // We do the computations in the unsigned domain to avoid undefined
+  // behaviour on overflow.
+ DecodedInt96 result;
+ result.days_since_epoch = i96.value[2] - static_cast<uint64_t>(kJulianToUnixEpochDays);
+ result.nanoseconds = 0;
+
+  std::memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t));
+ return result;
+}
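+
+// Worked example (illustrative, assuming a little-endian platform): for an
+// Int96 with value[2] == 2440589 (one Julian day after the Unix epoch) and a
+// nanosecond count of 1000 stored in value[0..1], DecodeInt96Timestamp()
+// yields days_since_epoch == 1 and nanoseconds == 1000, so
+// Int96GetNanoSeconds() below returns 1 * kNanosecondsPerDay + 1000.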
+
+static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ return static_cast<int64_t>(decoded.days_since_epoch * kNanosecondsPerDay +
+ decoded.nanoseconds);
+}
+
+static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ uint64_t microseconds = decoded.nanoseconds / static_cast<uint64_t>(1000);
+ return static_cast<int64_t>(decoded.days_since_epoch * kMicrosecondsPerDay +
+ microseconds);
+}
+
+static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ uint64_t milliseconds = decoded.nanoseconds / static_cast<uint64_t>(1000000);
+ return static_cast<int64_t>(decoded.days_since_epoch * kMillisecondsPerDay +
+ milliseconds);
+}
+
+static inline int64_t Int96GetSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ uint64_t seconds = decoded.nanoseconds / static_cast<uint64_t>(1000000000);
+ return static_cast<int64_t>(decoded.days_since_epoch * kSecondsPerDay + seconds);
+}
+
+static inline std::string Int96ToString(const Int96& a) {
+ std::ostringstream result;
+ std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
+ return result.str();
+}
+
+static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
+ std::ostringstream result;
+ std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
+ return result.str();
+}
+
+template <Type::type TYPE>
+struct type_traits {};
+
+template <>
+struct type_traits<Type::BOOLEAN> {
+ using value_type = bool;
+
+ static constexpr int value_byte_size = 1;
+ static constexpr const char* printf_code = "d";
+};
+
+template <>
+struct type_traits<Type::INT32> {
+ using value_type = int32_t;
+
+ static constexpr int value_byte_size = 4;
+ static constexpr const char* printf_code = "d";
+};
+
+template <>
+struct type_traits<Type::INT64> {
+ using value_type = int64_t;
+
+ static constexpr int value_byte_size = 8;
+ static constexpr const char* printf_code = "ld";
+};
+
+template <>
+struct type_traits<Type::INT96> {
+ using value_type = Int96;
+
+ static constexpr int value_byte_size = 12;
+ static constexpr const char* printf_code = "s";
+};
+
+template <>
+struct type_traits<Type::FLOAT> {
+ using value_type = float;
+
+ static constexpr int value_byte_size = 4;
+ static constexpr const char* printf_code = "f";
+};
+
+template <>
+struct type_traits<Type::DOUBLE> {
+ using value_type = double;
+
+ static constexpr int value_byte_size = 8;
+ static constexpr const char* printf_code = "lf";
+};
+
+template <>
+struct type_traits<Type::BYTE_ARRAY> {
+ using value_type = ByteArray;
+
+ static constexpr int value_byte_size = sizeof(ByteArray);
+ static constexpr const char* printf_code = "s";
+};
+
+template <>
+struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
+ using value_type = FixedLenByteArray;
+
+ static constexpr int value_byte_size = sizeof(FixedLenByteArray);
+ static constexpr const char* printf_code = "s";
+};
+
+template <Type::type TYPE>
+struct PhysicalType {
+ using c_type = typename type_traits<TYPE>::value_type;
+ static constexpr Type::type type_num = TYPE;
+};
+
+using BooleanType = PhysicalType<Type::BOOLEAN>;
+using Int32Type = PhysicalType<Type::INT32>;
+using Int64Type = PhysicalType<Type::INT64>;
+using Int96Type = PhysicalType<Type::INT96>;
+using FloatType = PhysicalType<Type::FLOAT>;
+using DoubleType = PhysicalType<Type::DOUBLE>;
+using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
+using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;
+
+template <typename Type>
+inline std::string format_fwf(int width) {
+ std::stringstream ss;
+ ss << "%-" << width << type_traits<Type::type_num>::printf_code;
+ return ss.str();
+}
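+
+// A minimal usage sketch (illustrative only):
+//
+//   static_assert(type_traits<Type::INT32>::value_byte_size == 4, "");
+//   std::string fmt = format_fwf<Int32Type>(10);  // yields "%-10d"
+//   // printf(fmt.c_str(), v) then left-justifies an int32 in a width of 10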
+
+PARQUET_EXPORT std::string EncodingToString(Encoding::type t);
+
+PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);
+
+PARQUET_EXPORT std::string TypeToString(Type::type t);
+
+PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
+ ::arrow::util::string_view val);
+
+PARQUET_EXPORT int GetTypeByteSize(Type::type t);
+
+PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);
+
+PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
+ Type::type primitive);
+
+PARQUET_EXPORT SortOrder::type GetSortOrder(
+ const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h b/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h
new file mode 100644
index 00000000000..31ca04c8b66
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/windows_compatibility.h"
+
+#ifdef _WIN32
+
+// parquet.thrift's OPTIONAL RepetitionType conflicts with a #define from
+// above, so we undefine it
+#ifdef OPTIONAL
+#undef OPTIONAL
+#endif
+
+#endif  // _WIN32