path: root/contrib/libs/apache
diff options
author: shmel1k <shmel1k@ydb.tech> 2022-09-02 12:44:59 +0300
committer: shmel1k <shmel1k@ydb.tech> 2022-09-02 12:44:59 +0300
commit: 90d450f74722da7859d6f510a869f6c6908fd12f (patch)
tree: 538c718dedc76cdfe37ad6d01ff250dd930d9278 /contrib/libs/apache
parent: 01f64c1ecd0d4ffa9e3a74478335f1745f26cc75 (diff)
[] add metering mode to CLI
Diffstat (limited to 'contrib/libs/apache')
45 files changed, 12732 insertions, 0 deletions
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/arrow_to_pandas.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/arrow_to_pandas.cc
new file mode 100644
index 0000000000..cc386f589a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -0,0 +1,2299 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Functions for pandas conversion via NumPy
+#include "arrow/python/arrow_to_pandas.h"
+#include "arrow/python/numpy_interop.h" // IWYU pragma: expand
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/datum.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/int_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/parallel.h"
+#include "arrow/util/string_view.h"
+#include "arrow/visitor_inline.h"
+#include "arrow/compute/api.h"
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+#include "arrow/python/decimal.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/numpy_internal.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/python_to_arrow.h"
+#include "arrow/python/type_traits.h"
+namespace arrow {
+class MemoryPool;
+using internal::checked_cast;
+using internal::CheckIndexBounds;
+using internal::GetByteWidth;
+using internal::OptionalParallelFor;
+namespace py {
+namespace {
+// Fix options for conversion of an inner (child) array.
+//
+// Takes the caller's options by value and returns the adjusted copy; the
+// caller's own PandasOptions object is not modified.
+PandasOptions MakeInnerOptions(PandasOptions options) {
+  // Make sure conversion of inner dictionary arrays always returns an array,
+  // not a dict {'indices': array, 'dictionary': array, 'ordered': bool}
+  options.decode_dictionaries = true;
+  options.categorical_columns.clear();
+  options.strings_to_categorical = false;
+
+  // In ARROW-7723, we found as a result of ARROW-3789 that second
+  // through microsecond resolution tz-aware timestamps were being promoted to
+  // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy
+  // array in this function. PyArray_GETITEM returns datetime.datetime for
+  // units second through microsecond but PyLong for nanosecond (because
+  // datetime.datetime does not support nanoseconds).
+  // We force the object conversion to preserve the value of the timezone.
+  // Nanoseconds are returned as integers.
+  options.coerce_temporal_nanoseconds = false;
+
+  return options;
+}
+// ----------------------------------------------------------------------
+// PyCapsule code for setting ndarray base to reference C++ object
+// Holder that keeps an Array alive through a shared_ptr. Instances are
+// heap-allocated and handed to a PyCapsule named "arrow::Array".
+struct ArrayCapsule {
+  std::shared_ptr<Array> array;
+};
+
+// Holder that keeps a Buffer alive through a shared_ptr. Instances are
+// heap-allocated and handed to a PyCapsule named "arrow::Buffer".
+struct BufferCapsule {
+  std::shared_ptr<Buffer> buffer;
+};
+// PyCapsule destructor: deletes the ArrayCapsule stored in the capsule under
+// the name "arrow::Array", dropping its reference to the Array.
+void ArrayCapsule_Destructor(PyObject* capsule) {
+  delete reinterpret_cast<ArrayCapsule*>(PyCapsule_GetPointer(capsule, "arrow::Array"));
+}
+
+// PyCapsule destructor: deletes the BufferCapsule stored in the capsule under
+// the name "arrow::Buffer", dropping its reference to the Buffer.
+void BufferCapsule_Destructor(PyObject* capsule) {
+  delete reinterpret_cast<BufferCapsule*>(PyCapsule_GetPointer(capsule, "arrow::Buffer"));
+}
+// ----------------------------------------------------------------------
+// pandas 0.x DataFrame conversion internals
+using internal::arrow_traits;
+using internal::npy_traits;
+// Maps an Arrow binary-like type to the Python object used to wrap one value:
+// string types produce `str` (PyUnicode_FromStringAndSize), binary types
+// produce `bytes` (PyBytes_FromStringAndSize). Wrap() returns whatever the
+// underlying CPython call returns, including nullptr on failure.
+template <typename T>
+struct WrapBytes {};
+
+template <>
+struct WrapBytes<StringType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyUnicode_FromStringAndSize(data, length);
+  }
+};
+
+template <>
+struct WrapBytes<LargeStringType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyUnicode_FromStringAndSize(data, length);
+  }
+};
+
+template <>
+struct WrapBytes<BinaryType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyBytes_FromStringAndSize(data, length);
+  }
+};
+
+template <>
+struct WrapBytes<LargeBinaryType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyBytes_FromStringAndSize(data, length);
+  }
+};
+
+template <>
+struct WrapBytes<FixedSizeBinaryType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyBytes_FromStringAndSize(data, length);
+  }
+};
+// Returns true if `type` is accepted as the value type of a list column.
+// Nested list types (fixed-size, regular, large) are supported exactly when
+// their own value type is supported; any type not listed yields false.
+static inline bool ListTypeSupported(const DataType& type) {
+  switch (type.id()) {
+    case Type::BOOL:
+    case Type::UINT8:
+    case Type::INT8:
+    case Type::UINT16:
+    case Type::INT16:
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::INT64:
+    case Type::UINT64:
+    case Type::FLOAT:
+    case Type::DOUBLE:
+    case Type::DECIMAL128:
+    case Type::DECIMAL256:
+    case Type::BINARY:
+    case Type::LARGE_BINARY:
+    case Type::STRING:
+    case Type::LARGE_STRING:
+    case Type::DATE32:
+    case Type::DATE64:
+    case Type::STRUCT:
+    case Type::TIME32:
+    case Type::TIME64:
+    case Type::TIMESTAMP:
+    case Type::DURATION:
+    case Type::DICTIONARY:
+    case Type::NA:  // empty list
+      // The above types are all supported.
+      return true;
+    case Type::FIXED_SIZE_LIST:
+    case Type::LIST:
+    case Type::LARGE_LIST: {
+      // Recurse into the element type of the nested list.
+      const auto& list_type = checked_cast<const BaseListType&>(type);
+      return ListTypeSupported(*list_type.value_type());
+    }
+    default:
+      break;
+  }
+  return false;
+}
+// Wrap `arr` in a new PyCapsule named "arrow::Array" and store it in `*out`.
+// The capsule owns a heap-allocated ArrayCapsule that is deleted by
+// ArrayCapsule_Destructor. If the capsule cannot be created, the holder is
+// freed here and the pending Python error is returned as a Status.
+Status CapsulizeArray(const std::shared_ptr<Array>& arr, PyObject** out) {
+  auto capsule = new ArrayCapsule{{arr}};
+  *out = PyCapsule_New(reinterpret_cast<void*>(capsule), "arrow::Array",
+                       &ArrayCapsule_Destructor);
+  if (*out == nullptr) {
+    // No capsule took ownership, so the destructor will never run.
+    delete capsule;
+    RETURN_IF_PYERROR();
+  }
+  return Status::OK();
+}
+// Wrap `buffer` in a new PyCapsule named "arrow::Buffer" and store it in
+// `*out`. The capsule owns a heap-allocated BufferCapsule that is deleted by
+// BufferCapsule_Destructor. If the capsule cannot be created, the holder is
+// freed here and the pending Python error is returned as a Status.
+Status CapsulizeBuffer(const std::shared_ptr<Buffer>& buffer, PyObject** out) {
+  auto capsule = new BufferCapsule{{buffer}};
+  *out = PyCapsule_New(reinterpret_cast<void*>(capsule), "arrow::Buffer",
+                       &BufferCapsule_Destructor);
+  if (*out == nullptr) {
+    // No capsule took ownership, so the destructor will never run.
+    delete capsule;
+    RETURN_IF_PYERROR();
+  }
+  return Status::OK();
+}
+// Attach `base` as the base object of `arr` via PyArray_SetBaseObject.
+// On failure the reference to `base` is released and the pending Python error
+// is returned as a Status instead of being reported as success.
+Status SetNdarrayBase(PyArrayObject* arr, PyObject* base) {
+  if (PyArray_SetBaseObject(arr, base) == -1) {
+    // Error occurred, trust that SetBaseObject sets the error state
+    Py_XDECREF(base);
+    RETURN_IF_PYERROR();
+  }
+  return Status::OK();
+}
+// Make `arr` keep `buffer` alive: wrap the buffer in a capsule
+// (CapsulizeBuffer) and install that capsule as the ndarray's base object.
+Status SetBufferBase(PyArrayObject* arr, const std::shared_ptr<Buffer>& buffer) {
+  PyObject* base;
+  RETURN_NOT_OK(CapsulizeBuffer(buffer, &base));
+  return SetNdarrayBase(arr, base);
+}
+// Set the datetime/timedelta unit stored in `out`'s c_metadata from the Arrow
+// type. Only NPY_DATETIME (Arrow TIMESTAMP) and NPY_TIMEDELTA (Arrow DURATION)
+// are acted upon; for any other `type` the descriptor is left untouched.
+inline void set_numpy_metadata(int type, const DataType* datatype, PyArray_Descr* out) {
+  auto metadata = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(out->c_metadata);
+  if (type == NPY_DATETIME) {
+    if (datatype->id() == Type::TIMESTAMP) {
+      const auto& timestamp_type = checked_cast<const TimestampType&>(*datatype);
+      metadata->meta.base = internal::NumPyFrequency(timestamp_type.unit());
+    } else {
+      DCHECK(false) << "NPY_DATETIME views only supported for Arrow TIMESTAMP types";
+    }
+  } else if (type == NPY_TIMEDELTA) {
+    DCHECK_EQ(datatype->id(), Type::DURATION);
+    const auto& duration_type = checked_cast<const DurationType&>(*datatype);
+    metadata->meta.base = internal::NumPyFrequency(duration_type.unit());
+  }
+}
+// Create an ndarray of shape `dims` (length `nd`) and dtype `descr` whose data
+// lives in a buffer allocated from `pool`. The buffer is attached as the
+// array's base object so it is released together with the array.
+//
+// On success `*out` holds the new array. If NumPy fails to create the array,
+// the pending Python error is returned as a Status.
+Status PyArray_NewFromPool(int nd, npy_intp* dims, PyArray_Descr* descr, MemoryPool* pool,
+                           PyObject** out) {
+  // ARROW-6570: Allocate memory from MemoryPool for a couple reasons
+  //
+  // * Track allocations
+  // * Get better performance through custom allocators
+  int64_t total_size = descr->elsize;
+  for (int i = 0; i < nd; ++i) {
+    total_size *= dims[i];
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(total_size, pool));
+  // PyArray_NewFromDescr takes (subtype, descr, nd, dims, strides, data,
+  // flags, obj); the flags argument must be supplied when data is non-null.
+  *out = PyArray_NewFromDescr(&PyArray_Type, descr, nd, dims,
+                              /*strides=*/nullptr,
+                              /*data=*/buffer->mutable_data(),
+                              /*flags=*/NPY_ARRAY_CARRAY | NPY_ARRAY_WRITEABLE,
+                              /*obj=*/nullptr);
+  if (*out == nullptr) {
+    // Trust that error set if NULL returned; do not go on to dereference it.
+    RETURN_IF_PYERROR();
+  }
+  return SetBufferBase(reinterpret_cast<PyArrayObject*>(*out), std::move(buffer));
+}
+// Return a pointer to the first logical value of a primitive array, i.e. the
+// values buffer advanced by `arr.offset()` elements of the type's byte width.
+// Returns nullptr for zero-length arrays, so callers must not dereference (or
+// memcpy from) the result when the array is empty.
+template <typename T = void>
+inline const T* GetPrimitiveValues(const Array& arr) {
+  if (arr.length() == 0) {
+    return nullptr;
+  }
+  const int elsize = GetByteWidth(*arr.type());
+  const auto& prim_arr = checked_cast<const PrimitiveArray&>(arr);
+  return reinterpret_cast<const T*>(prim_arr.values()->data() + arr.offset() * elsize);
+}
+// Create a zero-copy ndarray view over the values of primitive array `arr`.
+//
+// The view's base object keeps the underlying memory alive: `py_ref` if one is
+// given (a new reference is taken), otherwise a capsule holding `arr`.
+// `*out` receives a new reference to the view and is only written on success.
+Status MakeNumPyView(std::shared_ptr<Array> arr, PyObject* py_ref, int npy_type, int ndim,
+                     npy_intp* dims, PyObject** out) {
+  PyAcquireGIL lock;
+
+  PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type);
+  set_numpy_metadata(npy_type, arr->type().get(), descr);
+  // Hold the new array in an OwnedRef so that it is released on every early
+  // return below (the GIL is still held when `view` is destroyed).
+  OwnedRef view(PyArray_NewFromDescr(
+      &PyArray_Type, descr, ndim, dims, /*strides=*/nullptr,
+      const_cast<void*>(GetPrimitiveValues(*arr)), /*flags=*/0, nullptr));
+  PyArrayObject* np_arr = reinterpret_cast<PyArrayObject*>(view.obj());
+  if (np_arr == nullptr) {
+    // Error occurred, trust that error set; never report success with *out
+    // left unset.
+    RETURN_IF_PYERROR();
+    return Status::UnknownError("Failed to create NumPy view");
+  }
+
+  PyObject* base;
+  if (py_ref == nullptr) {
+    // Capsule will be owned by the ndarray, no incref necessary. See
+    // ARROW-1973
+    RETURN_NOT_OK(CapsulizeArray(arr, &base));
+  } else {
+    Py_INCREF(py_ref);
+    base = py_ref;
+  }
+  RETURN_NOT_OK(SetNdarrayBase(np_arr, base));
+
+  // Do not allow Arrow data to be mutated
+  PyArray_CLEARFLAGS(np_arr, NPY_ARRAY_WRITEABLE);
+  *out = view.detach();
+  return Status::OK();
+}
+// Base class for writers that materialize Arrow ChunkedArrays into a NumPy
+// "block" array (block_arr_) plus an int64 "placement" array (placement_arr_)
+// recording, for each column of the block, its absolute position in the
+// resulting DataFrame. Subclasses implement TransferSingle / CopyInto and
+// override Allocate to create the block.
+class PandasWriter {
+ public:
+  enum type {
+    UINT8,
+    INT8,
+    UINT16,
+    INT16,
+    UINT32,
+    INT32,
+    UINT64,
+    INT64,
+  };
+
+  PandasWriter(const PandasOptions& options, int64_t num_rows, int num_columns)
+      : options_(options), num_rows_(num_rows), num_columns_(num_columns) {}
+  virtual ~PandasWriter() {}
+
+  // Take ownership of `arr` as the block array and cache its data pointer.
+  void SetBlockData(PyObject* arr) {
+    block_arr_.reset(arr);
+    block_data_ =
+        reinterpret_cast<uint8_t*>(PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)));
+  }
+
+  /// \brief Either copy or wrap single array to create pandas-compatible array
+  /// for Series or DataFrame. num_columns_ can only be 1. Will try to zero
+  /// copy if possible (or error if not possible and zero_copy_only=True)
+  virtual Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) = 0;
+
+  /// \brief Copy ChunkedArray into a multi-column block
+  virtual Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) = 0;
+
+  // Lazily create the int64 placement array with one slot per column.
+  // Serialized by allocation_lock_; a no-op once the array exists.
+  Status EnsurePlacementAllocated() {
+    std::lock_guard<std::mutex> guard(allocation_lock_);
+    if (placement_data_ != nullptr) {
+      return Status::OK();
+    }
+    PyAcquireGIL lock;
+
+    npy_intp placement_dims[1] = {num_columns_};
+    PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
+    // Bail out before PyArray_DATA dereferences a null array.
+    RETURN_IF_PYERROR();
+    placement_arr_.reset(placement_arr);
+    placement_data_ = reinterpret_cast<int64_t*>(
+        PyArray_DATA(reinterpret_cast<PyArrayObject*>(placement_arr)));
+    return Status::OK();
+  }
+
+  // Lazily allocate the block array through Allocate(). Serialized by
+  // allocation_lock_; a no-op once block_data_ is set.
+  Status EnsureAllocated() {
+    std::lock_guard<std::mutex> guard(allocation_lock_);
+    if (block_data_ != nullptr) {
+      return Status::OK();
+    }
+    RETURN_NOT_OK(Allocate());
+    return Status::OK();
+  }
+
+  virtual bool CanZeroCopy(const ChunkedArray& data) const { return false; }
+
+  // Write `data` as column `rel_placement` of this block and record
+  // `abs_placement` for it in the placement array. Single-column blocks with
+  // allow_zero_copy_blocks go through TransferSingle; everything else is
+  // copied, which is rejected when zero_copy_only is set.
+  virtual Status Write(std::shared_ptr<ChunkedArray> data, int64_t abs_placement,
+                       int64_t rel_placement) {
+    RETURN_NOT_OK(EnsurePlacementAllocated());
+    if (num_columns_ == 1 && options_.allow_zero_copy_blocks) {
+      RETURN_NOT_OK(TransferSingle(data, /*py_ref=*/nullptr));
+    } else {
+      RETURN_NOT_OK(
+          CheckNoZeroCopy("Cannot do zero copy conversion into "
+                          "multi-column DataFrame block"));
+      RETURN_NOT_OK(EnsureAllocated());
+      RETURN_NOT_OK(CopyInto(data, rel_placement));
+    }
+    placement_data_[rel_placement] = abs_placement;
+    return Status::OK();
+  }
+
+  // Build the result dict {"block": ..., "placement": ...} plus any metadata
+  // added by AddResultMetadata, and store it in `*out`.
+  virtual Status GetDataFrameResult(PyObject** out) {
+    PyObject* result = PyDict_New();
+    RETURN_IF_PYERROR();
+
+    PyObject* block;
+    RETURN_NOT_OK(GetResultBlock(&block));
+
+    // PyDict_SetItemString takes its own references to the values.
+    PyDict_SetItemString(result, "block", block);
+    PyDict_SetItemString(result, "placement", placement_arr_.obj());
+
+    RETURN_NOT_OK(AddResultMetadata(result));
+
+    *out = result;
+    return Status::OK();
+  }
+
+  // Caller steals the reference to this object
+  virtual Status GetSeriesResult(PyObject** out) {
+    RETURN_NOT_OK(MakeBlock1D());
+    // Caller owns the object now
+    *out = block_arr_.detach();
+    return Status::OK();
+  }
+
+ protected:
+  virtual Status AddResultMetadata(PyObject* result) { return Status::OK(); }
+
+  Status MakeBlock1D() {
+    // For Series or for certain DataFrame block types, we need to shape to a
+    // 1D array when there is only one column
+    PyAcquireGIL lock;
+
+    DCHECK_EQ(1, num_columns_);
+
+    npy_intp new_dims[1] = {static_cast<npy_intp>(num_rows_)};
+    PyArray_Dims dims;
+    dims.ptr = new_dims;
+    dims.len = 1;
+
+    PyObject* reshaped = PyArray_Newshape(
+        reinterpret_cast<PyArrayObject*>(block_arr_.obj()), &dims, NPY_ANYORDER);
+    // Keep the existing block if the reshape failed.
+    RETURN_IF_PYERROR();
+
+    // ARROW-8801: Here a PyArrayObject is created that is not being managed by
+    // any OwnedRef object. This object is then put in the resulting object
+    // with PyDict_SetItemString, which increments the reference count, so a
+    // memory leak ensues. There are several ways to fix the memory leak but a
+    // simple one is to put the reshaped 1D block array in this OwnedRefNoGIL
+    // so it will be correctly decref'd when this class is destructed.
+    block_arr_.reset(reshaped);
+    return Status::OK();
+  }
+
+  // Returns a borrowed pointer to the block array.
+  virtual Status GetResultBlock(PyObject** out) {
+    *out = block_arr_.obj();
+    return Status::OK();
+  }
+
+  // Fails with Invalid(message) when zero_copy_only is set.
+  Status CheckNoZeroCopy(const std::string& message) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid(message);
+    }
+    return Status::OK();
+  }
+
+  // Fails with a descriptive Invalid status when a copy of `data` would be
+  // needed but zero_copy_only is set.
+  Status CheckNotZeroCopyOnly(const ChunkedArray& data) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("Needed to copy ", data.num_chunks(), " chunks with ",
+                             data.null_count(), " nulls, but zero_copy_only was True");
+    }
+    return Status::OK();
+  }
+
+  virtual Status Allocate() {
+    return Status::NotImplemented("Override Allocate in subclasses");
+  }
+
+  // Allocate the block as an ndarray of dtype `npy_type`: shape
+  // (num_columns_, num_rows_) when ndim == 2, otherwise (num_rows_,).
+  Status AllocateNDArray(int npy_type, int ndim = 2) {
+    PyAcquireGIL lock;
+
+    PyObject* block_arr;
+    npy_intp block_dims[2] = {0, 0};
+    if (ndim == 2) {
+      block_dims[0] = num_columns_;
+      block_dims[1] = num_rows_;
+    } else {
+      block_dims[0] = num_rows_;
+    }
+    PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type);
+    if (PyDataType_REFCHK(descr)) {
+      // ARROW-6876: if the array has refcounted items, let Numpy
+      // own the array memory so as to decref elements on array destruction
+      block_arr = PyArray_SimpleNewFromDescr(ndim, block_dims, descr);
+      RETURN_IF_PYERROR();
+    } else {
+      RETURN_NOT_OK(
+          PyArray_NewFromPool(ndim, block_dims, descr, options_.pool, &block_arr));
+    }
+
+    SetBlockData(block_arr);
+    return Status::OK();
+  }
+
+  // Set the datetime unit in the c_metadata of the block array's dtype.
+  void SetDatetimeUnit(NPY_DATETIMEUNIT unit) {
+    PyAcquireGIL lock;
+    auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+        PyArray_DESCR(reinterpret_cast<PyArrayObject*>(block_arr_.obj()))->c_metadata);
+    date_dtype->meta.base = unit;
+  }
+
+  PandasOptions options_;
+
+  // Guards the lazy allocation of the block and placement arrays.
+  std::mutex allocation_lock_;
+
+  int64_t num_rows_;
+  int num_columns_;
+
+  OwnedRefNoGIL block_arr_;
+  uint8_t* block_data_ = nullptr;
+
+  // ndarray<int64>
+  OwnedRefNoGIL placement_arr_;
+  int64_t* placement_data_ = nullptr;
+
+ private:
+};
+// Copy every chunk of `data` into `out_values`, casting each InType value to
+// OutType and writing NaN (cast to OutType) for null slots. `out_values` must
+// have room for the total length of `data`.
+template <typename InType, typename OutType>
+inline void ConvertIntegerWithNulls(const PandasOptions& options,
+                                    const ChunkedArray& data, OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const InType* in_values = GetPrimitiveValues<InType>(arr);
+    // Upcast to double, set NaN as appropriate
+
+    // Array lengths are 64-bit; a 32-bit index would overflow on large chunks.
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      *out_values++ =
+          arr.IsNull(i) ? static_cast<OutType>(NAN) : static_cast<OutType>(in_values[i]);
+    }
+  }
+}
+// Concatenate the raw values of every chunk of `data` into `out_values` with
+// memcpy. Null slots are not treated specially. Empty chunks are skipped
+// because GetPrimitiveValues returns nullptr for them.
+template <typename T>
+inline void ConvertIntegerNoNullsSameType(const PandasOptions& options,
+                                          const ChunkedArray& data, T* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    if (arr.length() > 0) {
+      const T* in_values = GetPrimitiveValues<T>(arr);
+      memcpy(out_values, in_values, sizeof(T) * arr.length());
+      out_values += arr.length();
+    }
+  }
+}
+// Copy every chunk of `data` into `out_values`, converting each InType value
+// to OutType. Null slots are not treated specially. `out_values` must have
+// room for the total length of `data`.
+template <typename InType, typename OutType>
+inline void ConvertIntegerNoNullsCast(const PandasOptions& options,
+                                      const ChunkedArray& data, OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const InType* in_values = GetPrimitiveValues<InType>(arr);
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      // Advance the output cursor; without this every value would overwrite
+      // the first slot and the rest of the output would stay uninitialized.
+      *out_values++ = in_values[i];
+    }
+  }
+}
+// Generic Array -> PyObject** converter that handles object deduplication, if
+// requested
+//
+// For each slot of `arr`, writes a new reference into consecutive entries of
+// `out_values`: None for nulls, otherwise whatever `write_func(view, out)`
+// stores. Stops and returns the first non-OK status from `write_func`.
+template <typename ArrayType, typename WriteValue>
+inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func,
+                                PyObject** out_values) {
+  const bool has_nulls = arr.null_count() > 0;
+  for (int64_t i = 0; i < arr.length(); ++i) {
+    if (has_nulls && arr.IsNull(i)) {
+      Py_INCREF(Py_None);
+      *out_values = Py_None;
+    } else {
+      RETURN_NOT_OK(write_func(arr.GetView(i), out_values));
+    }
+    ++out_values;
+  }
+  return Status::OK();
+}
+// Selects the scalar type used as the memo-table key when deduplicating
+// converted values: the C type of T by default.
+template <typename T, typename Enable = void>
+struct MemoizationTraits {
+  using Scalar = typename T::c_type;
+};
+
+template <typename T>
+struct MemoizationTraits<T, enable_if_has_string_view<T>> {
+  // For binary, we memoize string_view as a scalar value to avoid having to
+  // unnecessarily copy the memory into the memo table data structure
+  using Scalar = util::string_view;
+};
+// Convert every value of `data` to a Python object via `wrap_func` and write
+// the results into `out_values` (None for nulls, see WriteArrayObjects).
+//
+// When options.deduplicate_objects is set, equal values share one Python
+// object: the first occurrence is wrapped and remembered, later occurrences
+// receive a new reference to that same object.
+template <typename Type, typename WrapFunction>
+inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data,
+                                 WrapFunction&& wrap_func, PyObject** out_values) {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using Scalar = typename MemoizationTraits<Type>::Scalar;
+
+  // TODO(fsaintjacques): propagate memory pool.
+  ::arrow::internal::ScalarMemoTable<Scalar> memo_table(default_memory_pool());
+  // unique_values[k] is the object created for memo index k (borrowed: the
+  // owning reference lives in the output array).
+  std::vector<PyObject*> unique_values;
+  int32_t memo_size = 0;
+
+  auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) {
+    int32_t memo_index;
+    RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index));
+    if (memo_index == memo_size) {
+      // New entry
+      RETURN_NOT_OK(wrap_func(value, out_values));
+      unique_values.push_back(*out_values);
+      ++memo_size;
+    } else {
+      // Duplicate entry
+      Py_INCREF(unique_values[memo_index]);
+      *out_values = unique_values[memo_index];
+    }
+    return Status::OK();
+  };
+
+  auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) {
+    return wrap_func(value, out_values);
+  };
+
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = checked_cast<const ArrayType&>(*data.chunk(c));
+    if (options.deduplicate_objects) {
+      RETURN_NOT_OK(WriteArrayObjects(arr, WrapMemoized, out_values));
+    } else {
+      RETURN_NOT_OK(WriteArrayObjects(arr, WrapUnmemoized, out_values));
+    }
+    out_values += arr.length();
+  }
+  return Status::OK();
+}
+// Convert a struct column to Python objects: each non-null row becomes a dict
+// mapping field name to the field's value (None for null fields), each null
+// row becomes None. One new reference per row is written to `out_values`.
+//
+// Python failures while building a row are returned as a Status.
+Status ConvertStruct(PandasOptions options, const ChunkedArray& data,
+                     PyObject** out_values) {
+  if (data.num_chunks() == 0) {
+    return Status::OK();
+  }
+  // ChunkedArray has at least one chunk
+  auto arr = checked_cast<const StructArray*>(data.chunk(0).get());
+  // Use it to cache the struct type and number of fields for all chunks
+  int32_t num_fields = arr->num_fields();
+  auto array_type = arr->type();
+  // One converted ndarray per (chunk, field), indexed as c * num_fields + i.
+  std::vector<OwnedRef> fields_data(num_fields * data.num_chunks());
+  OwnedRef dict_item;
+
+  // See notes in MakeInnerOptions.
+  options = MakeInnerOptions(std::move(options));
+  // Don't blindly convert because timestamps in lists are handled differently.
+  options.timestamp_as_object = true;
+
+  for (int c = 0; c < data.num_chunks(); c++) {
+    auto fields_data_offset = c * num_fields;
+    auto arr = checked_cast<const StructArray*>(data.chunk(c).get());
+    // Convert the struct arrays first
+    for (int32_t i = 0; i < num_fields; i++) {
+      const auto field = arr->field(static_cast<int>(i));
+      RETURN_NOT_OK(ConvertArrayToPandas(options, field, nullptr,
+                                         fields_data[i + fields_data_offset].ref()));
+      DCHECK(PyArray_Check(fields_data[i + fields_data_offset].obj()));
+    }
+
+    // Construct a dictionary for each row
+    const bool has_nulls = data.null_count() > 0;
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        // Build the new dict object for the row
+        dict_item.reset(PyDict_New());
+        RETURN_IF_PYERROR();
+        for (int32_t field_idx = 0; field_idx < num_fields; ++field_idx) {
+          OwnedRef field_value;
+          auto name = array_type->field(static_cast<int>(field_idx))->name();
+          if (!arr->field(static_cast<int>(field_idx))->IsNull(i)) {
+            // Value exists in child array, obtain it
+            auto array = reinterpret_cast<PyArrayObject*>(
+                fields_data[field_idx + fields_data_offset].obj());
+            auto ptr = reinterpret_cast<const char*>(PyArray_GETPTR1(array, i));
+            field_value.reset(PyArray_GETITEM(array, ptr));
+            RETURN_IF_PYERROR();
+          } else {
+            // Translate the Null to a None
+            Py_INCREF(Py_None);
+            field_value.reset(Py_None);
+          }
+          // PyDict_SetItemString increments reference count
+          auto setitem_result =
+              PyDict_SetItemString(dict_item.obj(), name.c_str(), field_value.obj());
+          RETURN_IF_PYERROR();
+          DCHECK_EQ(setitem_result, 0);
+        }
+        *out_values = dict_item.obj();
+        // Grant ownership to the resulting array
+        Py_INCREF(*out_values);
+      }
+      ++out_values;
+    }
+  }
+  return Status::OK();
+}
+// Replace every array in `arrays`, in place, with the result of casting it to
+// `dense_type`, using `pool` for the cast's execution context. Stops at the
+// first cast that fails and returns its status.
+Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr<DataType>& dense_type,
+                          ArrayVector* arrays) {
+  compute::ExecContext ctx(pool);
+  compute::CastOptions options;
+  for (size_t i = 0; i < arrays->size(); ++i) {
+    ARROW_ASSIGN_OR_RAISE((*arrays)[i],
+                          compute::Cast(*(*arrays)[i], dense_type, options, &ctx));
+  }
+  return Status::OK();
+}
+// ChunkedArray overload: casts every chunk of `*array` to `dense_type` and
+// replaces `*array` with a new ChunkedArray built from the converted chunks.
+// `*array` is left unchanged if any cast fails.
+Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr<DataType>& dense_type,
+                          std::shared_ptr<ChunkedArray>* array) {
+  auto chunks = (*array)->chunks();
+  RETURN_NOT_OK(DecodeDictionaries(pool, dense_type, &chunks));
+  *array = std::make_shared<ChunkedArray>(std::move(chunks), dense_type);
+  return Status::OK();
+}
+// Convert a list-like column to Python objects. The child values of all chunks
+// are converted once into a single flat ndarray; each non-null row then
+// receives the slice [value_offset(i), value_offset(i + 1)) of that array
+// (shifted by the preceding chunks' value counts), and each null row receives
+// None.
+//
+// Python failures while slicing are returned as a Status.
+template <typename ListArrayT>
+Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
+                        PyObject** out_values) {
+  // Get column of underlying value arrays
+  ArrayVector value_arrays;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+    value_arrays.emplace_back(arr.values());
+  }
+  using ListArrayType = typename ListArrayT::TypeClass;
+  const auto& list_type = checked_cast<const ListArrayType&>(*data.type());
+  auto value_type = list_type.value_type();
+
+  auto flat_column = std::make_shared<ChunkedArray>(value_arrays, value_type);
+
+  options = MakeInnerOptions(std::move(options));
+
+  OwnedRefNoGIL owned_numpy_array;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_column, nullptr,
+                                            owned_numpy_array.ref()));
+
+  PyObject* numpy_array = owned_numpy_array.obj();
+  DCHECK(PyArray_Check(numpy_array));
+
+  // Position of the current chunk's first child value in the flat array.
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+
+    const bool has_nulls = data.null_count() > 0;
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      if (has_nulls && arr.IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset));
+        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset));
+        OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr));
+
+        if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) {
+          // Fall out of loop, will return from RETURN_IF_PYERROR
+          break;
+        }
+        *out_values = PyObject_GetItem(numpy_array, slice.obj());
+
+        if (*out_values == nullptr) {
+          // Fall out of loop, will return from RETURN_IF_PYERROR
+          break;
+        }
+      }
+      ++out_values;
+    }
+    // Surface any Python error raised inside the row loop instead of carrying
+    // on with the next chunk.
+    RETURN_IF_PYERROR();
+
+    chunk_offset += arr.values()->length();
+  }
+
+  return Status::OK();
+}
+// Convert a map column to Python objects: each non-null row becomes a list of
+// (key, item) tuples (item is None where the item is null), each null row
+// becomes None. Keys and items of all chunks are first converted into two flat
+// ndarrays; dictionary-encoded keys/items are decoded to dense arrays first.
+//
+// Conversion failures and Python errors are returned as a Status.
+Status ConvertMap(PandasOptions options, const ChunkedArray& data,
+                  PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // See notes in MakeInnerOptions.
+  options = MakeInnerOptions(std::move(options));
+  // Don't blindly convert because timestamps in lists are handled differently.
+  options.timestamp_as_object = true;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRef list_item;
+  OwnedRef key_value;
+  OwnedRef item_value;
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  // Position of the current chunk's first entry in the flat key/item arrays.
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& arr = checked_cast<const MapArray&>(*data.chunk(c));
+    const bool has_nulls = data.null_count() > 0;
+
+    // Make a list of key/item pairs for each row in array
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      if (has_nulls && arr.IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        int64_t entry_offset = arr.value_offset(i);
+        int64_t num_maps = arr.value_offset(i + 1) - entry_offset;
+
+        // Build the new list object for the row of maps
+        list_item.reset(PyList_New(num_maps));
+        RETURN_IF_PYERROR();
+
+        // Add each key/item pair in the row
+        for (int64_t j = 0; j < num_maps; ++j) {
+          // Get key value, key is non-nullable for a valid row
+          auto ptr_key = reinterpret_cast<const char*>(
+              PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
+          key_value.reset(PyArray_GETITEM(py_keys, ptr_key));
+          RETURN_IF_PYERROR();
+
+          if (item_arrays[c]->IsNull(entry_offset + j)) {
+            // Translate the Null to a None
+            Py_INCREF(Py_None);
+            item_value.reset(Py_None);
+          } else {
+            // Get valid value from item array
+            auto ptr_item = reinterpret_cast<const char*>(
+                PyArray_GETPTR1(py_items, chunk_offset + entry_offset + j));
+            item_value.reset(PyArray_GETITEM(py_items, ptr_item));
+            RETURN_IF_PYERROR();
+          }
+
+          // Add the key/item pair to the list for the row
+          PyList_SET_ITEM(list_item.obj(), j,
+                          PyTuple_Pack(2, key_value.obj(), item_value.obj()));
+          RETURN_IF_PYERROR();
+        }
+
+        // Pass ownership to the resulting array
+        *out_values = list_item.detach();
+      }
+      ++out_values;
+    }
+    RETURN_IF_PYERROR();
+
+    chunk_offset += arr.values()->length();
+  }
+
+  return Status::OK();
+}
+// Copy every chunk of `data` into `out_values`, writing `na_value` for null
+// slots. Chunks without nulls are copied with memcpy using sizeof(InType), so
+// this path assumes InType and OutType have the same size.
+// NOTE(review): that size assumption is not enforced here; confirm callers.
+template <typename InType, typename OutType>
+inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value,
+                                   OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const InType* in_values = GetPrimitiveValues<InType>(arr);
+
+    if (arr.null_count() > 0) {
+      for (int64_t i = 0; i < arr.length(); ++i) {
+        *out_values++ = arr.IsNull(i) ? na_value : in_values[i];
+      }
+    } else if (arr.length() > 0) {
+      // GetPrimitiveValues returns nullptr for empty chunks; passing a null
+      // source to memcpy is undefined even for a zero byte count.
+      memcpy(out_values, in_values, sizeof(InType) * arr.length());
+      out_values += arr.length();
+    }
+  }
+}
+// Copy every chunk of `data` into `out_values`, casting each InType value to
+// OutType and writing `na_value` (cast to OutType) for null slots.
+template <typename InType, typename OutType>
+inline void ConvertNumericNullableCast(const ChunkedArray& data, InType na_value,
+                                       OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const InType* in_values = GetPrimitiveValues<InType>(arr);
+
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      *out_values++ = arr.IsNull(i) ? static_cast<OutType>(na_value)
+                                    : static_cast<OutType>(in_values[i]);
+    }
+  }
+}
+// PandasWriter whose block has the NumPy dtype NPY_TYPE; T is the matching C
+// value type from npy_traits.
+template <int NPY_TYPE>
+class TypedPandasWriter : public PandasWriter {
+ public:
+  using T = typename npy_traits<NPY_TYPE>::value_type;
+
+  using PandasWriter::PandasWriter;
+
+  // Wrap the first chunk of `data` as a zero-copy view when CanZeroCopy allows
+  // it; otherwise (unless zero_copy_only forbids it) allocate the block and
+  // copy the data into column 0.
+  Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) override {
+    if (CanZeroCopy(*data)) {
+      PyObject* wrapped;
+      npy_intp dims[2] = {static_cast<npy_intp>(num_columns_),
+                          static_cast<npy_intp>(num_rows_)};
+      RETURN_NOT_OK(
+          MakeNumPyView(data->chunk(0), py_ref, NPY_TYPE, /*ndim=*/2, dims, &wrapped));
+      SetBlockData(wrapped);
+      return Status::OK();
+    } else {
+      RETURN_NOT_OK(CheckNotZeroCopyOnly(*data));
+      RETURN_NOT_OK(EnsureAllocated());
+      return CopyInto(data, /*rel_placement=*/0);
+    }
+  }
+
+  // Returns NotImplemented unless `type` has exactly the `expected` type id.
+  Status CheckTypeExact(const DataType& type, Type::type expected) {
+    if (type.id() != expected) {
+      // TODO(wesm): stringify NumPy / pandas type
+      return Status::NotImplemented("Cannot write Arrow data of type ", type.ToString());
+    }
+    return Status::OK();
+  }
+
+  // Pointer to the first element of column `rel_placement` within the block,
+  // which stores num_rows_ elements per column.
+  T* GetBlockColumnStart(int64_t rel_placement) {
+    return reinterpret_cast<T*>(block_data_) + rel_placement * num_rows_;
+  }
+
+ protected:
+  Status Allocate() override { return AllocateNDArray(NPY_TYPE); }
+};
+struct ObjectWriterVisitor {
+ const PandasOptions& options;
+ const ChunkedArray& data;
+ PyObject** out_values;
+ Status Visit(const NullType& type) {
+   // A null-typed column has no values: every output slot receives its own
+   // new reference to None.
+   const int chunk_count = data.num_chunks();
+   for (int chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+     std::shared_ptr<Array> chunk = data.chunk(chunk_idx);
+     const int64_t row_count = chunk->length();
+     for (int64_t row = 0; row < row_count; ++row) {
+       Py_INCREF(Py_None);
+       *out_values++ = Py_None;
+     }
+   }
+   return Status::OK();
+ }
+ Status Visit(const BooleanType& type) {
+   // Each slot becomes a new reference to one of the singletons None (null),
+   // True or False.
+   const int chunk_count = data.num_chunks();
+   for (int chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+     const auto& bools = checked_cast<const BooleanArray&>(*data.chunk(chunk_idx));
+     for (int64_t row = 0; row < bools.length(); ++row) {
+       PyObject* singleton = Py_None;
+       if (!bools.IsNull(row)) {
+         singleton = bools.Value(row) ? Py_True : Py_False;
+       }
+       Py_INCREF(singleton);
+       *out_values++ = singleton;
+     }
+   }
+   return Status::OK();
+ }
+ // Integer columns: each value becomes a Python int, created with the signed
+ // or unsigned CPython constructor according to the C type. A failed
+ // construction is returned as a Status instead of leaving nullptr in the
+ // output.
+ template <typename Type>
+ enable_if_integer<Type, Status> Visit(const Type& type) {
+   using T = typename Type::c_type;
+   auto WrapValue = [](T value, PyObject** out) {
+     *out = std::is_signed<T>::value ? PyLong_FromLongLong(value)
+                                     : PyLong_FromUnsignedLongLong(value);
+     RETURN_IF_PYERROR();
+     return Status::OK();
+   };
+   return ConvertAsPyObjects<Type>(options, data, WrapValue, out_values);
+ }
+ // Binary-like columns (variable-size and fixed-size): each value is wrapped
+ // through WrapBytes<Type>. If wrapping fails, the Python error is cleared
+ // and an UnknownError naming the offending value is returned.
+ template <typename Type>
+ enable_if_t<is_base_binary_type<Type>::value || is_fixed_size_binary_type<Type>::value,
+             Status>
+ Visit(const Type& type) {
+   auto wrap_view = [](const util::string_view& bytes, PyObject** slot) {
+     PyObject* wrapped = WrapBytes<Type>::Wrap(bytes.data(), bytes.length());
+     *slot = wrapped;
+     if (wrapped != nullptr) {
+       return Status::OK();
+     }
+     PyErr_Clear();
+     return Status::UnknownError("Wrapping ", bytes, " failed");
+   };
+   return ConvertAsPyObjects<Type>(options, data, wrap_view, out_values);
+ }
+ // Date input: each value is converted with internal::PyDate_from_int using
+ // the type's compile-time UNIT.
+ template <typename Type>
+ enable_if_date<Type, Status> Visit(const Type& type) {
+   using CType = typename Type::c_type;
+   auto MakeDate = [](CType raw, PyObject** out) {
+     RETURN_NOT_OK(internal::PyDate_from_int(raw, Type::UNIT, out));
+     return Status::OK();
+   };
+   return ConvertAsPyObjects<Type>(options, data, MakeDate, out_values);
+ }
+ // Time input: each value is converted with internal::PyTime_from_int using
+ // the unit carried by the runtime type.
+ template <typename Type>
+ enable_if_time<Type, Status> Visit(const Type& type) {
+   using CType = typename Type::c_type;
+   const TimeUnit::type time_unit = type.unit();
+   auto MakeTime = [time_unit](CType raw, PyObject** out) {
+     RETURN_NOT_OK(internal::PyTime_from_int(raw, time_unit, out));
+     return Status::OK();
+   };
+   return ConvertAsPyObjects<Type>(options, data, MakeTime, out_values);
+ }
+ // Timestamp input: values become Python datetime objects. When the type has
+ // a timezone and `options.ignore_timezone` is false, each naive datetime is
+ // additionally passed through tzinfo.fromutc() to make it timezone-aware.
+ // Any failing conversion is propagated as a non-OK Status.
+ template <typename Type>
+ enable_if_timestamp<Type, Status> Visit(const Type& type) {
+   const TimeUnit::type unit = type.unit();
+   OwnedRef tzinfo;
+   auto ConvertTimezoneNaive = [&](typename Type::c_type value, PyObject** out) {
+     RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out));
+     return Status::OK();
+   };
+   auto ConvertTimezoneAware = [&](typename Type::c_type value, PyObject** out) {
+     PyObject* naive_datetime;
+     RETURN_NOT_OK(ConvertTimezoneNaive(value, &naive_datetime));
+     // convert the timezone naive datetime object to timezone aware
+     *out = PyObject_CallMethod(tzinfo.obj(), "fromutc", "O", naive_datetime);
+     // the timezone naive object is no longer required
+     Py_DECREF(naive_datetime);
+     if (*out == nullptr) {
+       // The call failed: do not leave a null entry in the output array.
+       PyErr_Clear();
+       return Status::UnknownError(
+           "Converting a timestamp to a timezone-aware datetime failed");
+     }
+     return Status::OK();
+   };
+   if (!type.timezone().empty() && !options.ignore_timezone) {
+     // convert timezone aware
+     PyObject* tzobj;
+     ARROW_ASSIGN_OR_RAISE(tzobj, internal::StringToTzinfo(type.timezone()));
+     tzinfo.reset(tzobj);
+     RETURN_NOT_OK(
+         ConvertAsPyObjects<Type>(options, data, ConvertTimezoneAware, out_values));
+   } else {
+     // convert timezone naive
+     RETURN_NOT_OK(
+         ConvertAsPyObjects<Type>(options, data, ConvertTimezoneNaive, out_values));
+   }
+   return Status::OK();
+ }
+ // Decimal128 input: nulls become Py_None, other values are formatted to a
+ // string and passed to Python's decimal.Decimal constructor. A failed
+ // construction is reported as an error Status rather than stored as null.
+ Status Visit(const Decimal128Type& type) {
+   OwnedRef decimal;
+   OwnedRef Decimal;
+   RETURN_NOT_OK(internal::ImportModule("decimal", &decimal));
+   RETURN_NOT_OK(internal::ImportFromModule(decimal.obj(), "Decimal", &Decimal));
+   PyObject* decimal_constructor = Decimal.obj();
+   for (int c = 0; c < data.num_chunks(); c++) {
+     const auto& arr = checked_cast<const arrow::Decimal128Array&>(*data.chunk(c));
+     for (int64_t i = 0; i < arr.length(); ++i) {
+       if (arr.IsNull(i)) {
+         Py_INCREF(Py_None);
+         *out_values++ = Py_None;
+         continue;
+       }
+       PyObject* value =
+           internal::DecimalFromString(decimal_constructor, arr.FormatValue(i));
+       if (value == nullptr) {
+         PyErr_Clear();
+         return Status::UnknownError("Constructing decimal.Decimal from ",
+                                     arr.FormatValue(i), " failed");
+       }
+       *out_values++ = value;
+     }
+   }
+   return Status::OK();
+ }
+ // Decimal256 input: nulls become Py_None, other values are formatted to a
+ // string and passed to Python's decimal.Decimal constructor. A failed
+ // construction is reported as an error Status rather than stored as null.
+ Status Visit(const Decimal256Type& type) {
+   OwnedRef decimal;
+   OwnedRef Decimal;
+   RETURN_NOT_OK(internal::ImportModule("decimal", &decimal));
+   RETURN_NOT_OK(internal::ImportFromModule(decimal.obj(), "Decimal", &Decimal));
+   PyObject* decimal_constructor = Decimal.obj();
+   for (int c = 0; c < data.num_chunks(); c++) {
+     const auto& arr = checked_cast<const arrow::Decimal256Array&>(*data.chunk(c));
+     for (int64_t i = 0; i < arr.length(); ++i) {
+       if (arr.IsNull(i)) {
+         Py_INCREF(Py_None);
+         *out_values++ = Py_None;
+         continue;
+       }
+       PyObject* value =
+           internal::DecimalFromString(decimal_constructor, arr.FormatValue(i));
+       if (value == nullptr) {
+         PyErr_Clear();
+         return Status::UnknownError("Constructing decimal.Decimal from ",
+                                     arr.FormatValue(i), " failed");
+       }
+       *out_values++ = value;
+     }
+   }
+   return Status::OK();
+ }
+ // List-like input (fixed-size and variable-length lists): converted through
+ // ConvertListsLike when ListTypeSupported accepts the value type, otherwise
+ // NotImplemented.
+ template <typename T>
+ enable_if_t<is_fixed_size_list_type<T>::value || is_var_length_list_type<T>::value,
+             Status>
+ Visit(const T& type) {
+   if (ListTypeSupported(*type.value_type())) {
+     using ListArrayType = typename TypeTraits<T>::ArrayType;
+     return ConvertListsLike<ListArrayType>(options, data, out_values);
+   }
+   return Status::NotImplemented(
+       "Not implemented type for conversion from List to Pandas: ",
+       type.value_type()->ToString());
+ }
+ // Map input is handled entirely by ConvertMap.
+ Status Visit(const MapType& type) {
+   return ConvertMap(options, data, out_values);
+ }
+ // Struct input is handled entirely by ConvertStruct.
+ Status Visit(const StructType& type) { return ConvertStruct(options, data, out_values); }
+ // Catch-all for the types listed in the enable_if condition (floating point,
+ // dictionary, duration, extension, interval, union): no object conversion
+ // exists, so NotImplemented is returned with the type's string form.
+ template <typename Type>
+ enable_if_t<is_floating_type<Type>::value ||
+                 std::is_same<DictionaryType, Type>::value ||
+                 std::is_same<DurationType, Type>::value ||
+                 std::is_same<ExtensionType, Type>::value ||
+                 std::is_base_of<IntervalType, Type>::value ||
+                 std::is_base_of<UnionType, Type>::value,
+             Status>
+ Visit(const Type& type) {
+   const std::string type_name = type.ToString();
+   return Status::NotImplemented("No implemented conversion to object dtype: ",
+                                 type_name);
+ }
+// Writer for NPY_OBJECT blocks: every value of the column is converted to a
+// Python object by ObjectWriterVisitor while the GIL is held.
+class ObjectWriter : public TypedPandasWriter<NPY_OBJECT> {
+ public:
+  using TypedPandasWriter<NPY_OBJECT>::TypedPandasWriter;
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    PyAcquireGIL gil;
+    auto column_start = this->GetBlockColumnStart(rel_placement);
+    ObjectWriterVisitor object_visitor{this->options_, *data, column_start};
+    return VisitTypeInline(*data->type(), &object_visitor);
+  }
+// True when `data` consists of exactly one chunk and contains no nulls.
+static inline bool IsNonNullContiguous(const ChunkedArray& data) {
+  if (data.num_chunks() != 1) {
+    return false;
+  }
+  return data.null_count() == 0;
+// Writer for integer blocks of NumPy type NPY_TYPE. Input must have exactly
+// the matching Arrow integer type; values are copied with
+// ConvertIntegerNoNullsSameType.
+template <int NPY_TYPE>
+class IntWriter : public TypedPandasWriter<NPY_TYPE> {
+ public:
+  using ArrowType = typename npy_traits<NPY_TYPE>::TypeClass;
+  using TypedPandasWriter<NPY_TYPE>::TypedPandasWriter;
+  // Zero-copy is possible for a single chunk without nulls.
+  bool CanZeroCopy(const ChunkedArray& data) const override {
+    const bool zero_copy_ok = IsNonNullContiguous(data);
+    return zero_copy_ok;
+  }
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    using CType = typename ArrowType::c_type;
+    RETURN_NOT_OK(this->CheckTypeExact(*data->type(), ArrowType::type_id));
+    auto column_start = this->GetBlockColumnStart(rel_placement);
+    ConvertIntegerNoNullsSameType<CType>(this->options_, *data, column_start);
+    return Status::OK();
+  }
+template <int NPY_TYPE>
+class FloatWriter : public TypedPandasWriter<NPY_TYPE> {
+ public:
+ using ArrowType = typename npy_traits<NPY_TYPE>::TypeClass;
+ using TypedPandasWriter<NPY_TYPE>::TypedPandasWriter;
+ using T = typename ArrowType::c_type;
+ bool CanZeroCopy(const ChunkedArray& data) const override {
+ return IsNonNullContiguous(data) && data.type()->id() == ArrowType::type_id;
+ }
+ Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+ Type::type in_type = data->type()->id();
+ auto out_values = this->GetBlockColumnStart(rel_placement);
+ ConvertIntegerWithNulls<IN_TYPE, T>(this->options_, *data, out_values); \
+ break;
+ switch (in_type) {
+ case Type::UINT8:
+ INTEGER_CASE(uint8_t);
+ case Type::INT8:
+ INTEGER_CASE(int8_t);
+ case Type::UINT16:
+ INTEGER_CASE(uint16_t);
+ case Type::INT16:
+ INTEGER_CASE(int16_t);
+ case Type::UINT32:
+ INTEGER_CASE(uint32_t);
+ case Type::INT32:
+ INTEGER_CASE(int32_t);
+ case Type::UINT64:
+ INTEGER_CASE(uint64_t);
+ case Type::INT64:
+ INTEGER_CASE(int64_t);
+ case Type::HALF_FLOAT:
+ ConvertNumericNullableCast(*data, npy_traits<NPY_TYPE>::na_sentinel, out_values);
+ case Type::FLOAT:
+ ConvertNumericNullableCast(*data, npy_traits<NPY_TYPE>::na_sentinel, out_values);
+ break;
+ case Type::DOUBLE:
+ ConvertNumericNullableCast(*data, npy_traits<NPY_TYPE>::na_sentinel, out_values);
+ break;
+ default:
+ return Status::NotImplemented("Cannot write Arrow data of type ",
+ data->type()->ToString(),
+ " to a Pandas floating point block");
+ }
+ return Status::OK();
+ }
+using UInt8Writer = IntWriter<NPY_UINT8>;  // IntWriter instantiations, one per NumPy integer dtype
+using Int8Writer = IntWriter<NPY_INT8>;
+using UInt16Writer = IntWriter<NPY_UINT16>;
+using Int16Writer = IntWriter<NPY_INT16>;
+using UInt32Writer = IntWriter<NPY_UINT32>;
+using Int32Writer = IntWriter<NPY_INT32>;
+using UInt64Writer = IntWriter<NPY_UINT64>;
+using Int64Writer = IntWriter<NPY_INT64>;
+using Float16Writer = FloatWriter<NPY_FLOAT16>;  // FloatWriter instantiations, one per NumPy float dtype
+using Float32Writer = FloatWriter<NPY_FLOAT32>;
+using Float64Writer = FloatWriter<NPY_FLOAT64>;
+// Writer for NPY_BOOL blocks. Data is always copied; each value is stored as
+// one uint8_t per row.
+class BoolWriter : public TypedPandasWriter<NPY_BOOL> {
+ public:
+  using TypedPandasWriter<NPY_BOOL>::TypedPandasWriter;
+  // Single-column transfer: checks that copying is allowed, allocates the
+  // block, then copies into relative placement 0.
+  Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) override {
+    // The Status of the check must be propagated, not discarded.
+    RETURN_NOT_OK(
+        CheckNoZeroCopy("Zero copy conversions not possible with "
+                        "boolean types"));
+    RETURN_NOT_OK(EnsureAllocated());
+    return CopyInto(data, /*rel_placement=*/0);
+  }
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    RETURN_NOT_OK(this->CheckTypeExact(*data->type(), Type::BOOL));
+    auto out_values = this->GetBlockColumnStart(rel_placement);
+    for (int c = 0; c < data->num_chunks(); c++) {
+      const auto& arr = checked_cast<const BooleanArray&>(*data->chunk(c));
+      for (int64_t i = 0; i < arr.length(); ++i) {
+        *out_values++ = static_cast<uint8_t>(arr.Value(i));
+      }
+    }
+    return Status::OK();
+  }
+// ----------------------------------------------------------------------
+// Date / timestamp types
+// Write every value of `data` into `out_values` as int64: nulls become
+// kPandasTimestampNull, valid values are multiplied by SHIFT.
+template <typename T, int64_t SHIFT>
+inline void ConvertDatetimeLikeNanos(const ChunkedArray& data, int64_t* out_values) {
+  for (int chunk_idx = 0; chunk_idx < data.num_chunks(); ++chunk_idx) {
+    const auto& chunk = *data.chunk(chunk_idx);
+    const T* raw = GetPrimitiveValues<T>(chunk);
+    for (int64_t row = 0; row < chunk.length(); ++row, ++out_values) {
+      if (chunk.IsNull(row)) {
+        *out_values = kPandasTimestampNull;
+      } else {
+        *out_values = static_cast<int64_t>(raw[row]) * SHIFT;
+      }
+    }
+  }
+// Write every value of `data` into `out_values` as int64: nulls become
+// kPandasTimestampNull, valid values are divided by SHIFT.
+template <typename T, int SHIFT>
+void ConvertDatesShift(const ChunkedArray& data, int64_t* out_values) {
+  for (int chunk_idx = 0; chunk_idx < data.num_chunks(); ++chunk_idx) {
+    const auto& chunk = *data.chunk(chunk_idx);
+    const T* raw = GetPrimitiveValues<T>(chunk);
+    for (int64_t row = 0; row < chunk.length(); ++row, ++out_values) {
+      if (chunk.IsNull(row)) {
+        *out_values = kPandasTimestampNull;
+      } else {
+        *out_values = static_cast<int64_t>(raw[row]) / SHIFT;
+      }
+    }
+  }
+// Writer for NPY_DATETIME blocks whose datetime unit is set to NPY_FR_D.
+// DAY-unit dates are copied as-is (shift 1); MILLI-unit dates are divided by
+// 86400000.
+class DatetimeDayWriter : public TypedPandasWriter<NPY_DATETIME> {
+ public:
+  using TypedPandasWriter<NPY_DATETIME>::TypedPandasWriter;
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    int64_t* out_values = this->GetBlockColumnStart(rel_placement);
+    const auto& date_type = checked_cast<const DateType&>(*data->type());
+    const auto date_unit = date_type.unit();
+    if (date_unit == DateUnit::DAY) {
+      ConvertDatesShift<int32_t, 1LL>(*data, out_values);
+    } else if (date_unit == DateUnit::MILLI) {
+      ConvertDatesShift<int64_t, 86400000LL>(*data, out_values);
+    }
+    return Status::OK();
+  }
+ protected:
+  Status Allocate() override {
+    RETURN_NOT_OK(this->AllocateNDArray(NPY_DATETIME));
+    SetDatetimeUnit(NPY_FR_D);
+    return Status::OK();
+  }
+// Writer for NPY_DATETIME blocks holding timestamps of a fixed unit UNIT.
+template <TimeUnit::type UNIT>
+class DatetimeWriter : public TypedPandasWriter<NPY_DATETIME> {
+ public:
+  using TypedPandasWriter<NPY_DATETIME>::TypedPandasWriter;
+  // Zero-copy only for a single null-free chunk of timestamps already in UNIT.
+  bool CanZeroCopy(const ChunkedArray& data) const override {
+    if (data.type()->id() != Type::TIMESTAMP) {
+      return false;
+    }
+    const auto& ts_type = checked_cast<const TimestampType&>(*data.type());
+    return IsNonNullContiguous(data) && ts_type.unit() == UNIT;
+  }
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
+    DCHECK_EQ(UNIT, ts_type.unit()) << "Should only call instances of this writer "
+                                    << "with arrays of the correct unit";
+    auto column_start = this->GetBlockColumnStart(rel_placement);
+    ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, column_start);
+    return Status::OK();
+  }
+ protected:
+  // Allocate the block and tag it with the NumPy frequency matching UNIT.
+  Status Allocate() override {
+    RETURN_NOT_OK(this->AllocateNDArray(NPY_DATETIME));
+    SetDatetimeUnit(internal::NumPyFrequency(UNIT));
+    return Status::OK();
+  }
+using DatetimeSecondWriter = DatetimeWriter<TimeUnit::SECOND>;  // DatetimeWriter instantiations per time unit
+using DatetimeMilliWriter = DatetimeWriter<TimeUnit::MILLI>;
+using DatetimeMicroWriter = DatetimeWriter<TimeUnit::MICRO>;
+// Nanosecond datetime writer. Accepts DATE32, DATE64 and TIMESTAMP input and
+// writes int64 nanosecond values; non-nanosecond timestamps are first cast to
+// timestamp(NANO) using safe or unsafe cast options per `options_.safe_cast`.
+class DatetimeNanoWriter : public DatetimeWriter<TimeUnit::NANO> {
+ public:
+  using DatetimeWriter<TimeUnit::NANO>::DatetimeWriter;
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    const Type::type type_id = data->type()->id();
+    int64_t* out_values = this->GetBlockColumnStart(rel_placement);
+    compute::ExecContext ctx(options_.pool);
+    compute::CastOptions options = options_.safe_cast ? compute::CastOptions::Safe()
+                                                      : compute::CastOptions::Unsafe();
+    Datum out;
+    auto target_type = timestamp(TimeUnit::NANO);
+    if (type_id == Type::DATE32) {
+      // Convert from days since epoch to datetime64[ns]
+      ConvertDatetimeLikeNanos<int32_t, kNanosecondsInDay>(*data, out_values);
+      return Status::OK();
+    }
+    if (type_id == Type::DATE64) {
+      // Date64Type is millisecond timestamp stored as int64_t
+      // TODO(wesm): Do we want to make sure to zero out the milliseconds?
+      ConvertDatetimeLikeNanos<int64_t, 1000000L>(*data, out_values);
+      return Status::OK();
+    }
+    if (type_id != Type::TIMESTAMP) {
+      return Status::NotImplemented("Cannot write Arrow data of type ",
+                                    data->type()->ToString(),
+                                    " to a Pandas datetime block.");
+    }
+    const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
+    switch (ts_type.unit()) {
+      case TimeUnit::NANO:
+        ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
+        break;
+      case TimeUnit::MICRO:
+      case TimeUnit::MILLI:
+      case TimeUnit::SECOND: {
+        ARROW_ASSIGN_OR_RAISE(out, compute::Cast(data, target_type, options, &ctx));
+        ConvertNumericNullable<int64_t>(*out.chunked_array(), kPandasTimestampNull,
+                                        out_values);
+      } break;
+      default:
+        return Status::NotImplemented("Unsupported time unit");
+    }
+    return Status::OK();
+  }
+// Nanosecond datetime writer for timezone-aware timestamps. Always a
+// single-column block; the timezone name is attached to the result dict under
+// the "timezone" key.
+class DatetimeTZWriter : public DatetimeNanoWriter {
+ public:
+  DatetimeTZWriter(const PandasOptions& options, const std::string& timezone,
+                   int64_t num_rows)
+      : DatetimeNanoWriter(options, num_rows, 1), timezone_(timezone) {}
+ protected:
+  Status GetResultBlock(PyObject** out) override {
+    RETURN_NOT_OK(MakeBlock1D());
+    *out = block_arr_.obj();
+    return Status::OK();
+  }
+  // Store the timezone string in `result`. Fails with an error Status if the
+  // Python string cannot be created or inserted.
+  Status AddResultMetadata(PyObject* result) override {
+    PyObject* py_tz = PyUnicode_FromStringAndSize(
+        timezone_.c_str(), static_cast<Py_ssize_t>(timezone_.size()));
+    if (py_tz == nullptr) {
+      // Py_DECREF must not be called on a null pointer.
+      PyErr_Clear();
+      return Status::UnknownError("Creating the timezone string object failed");
+    }
+    const int rc = PyDict_SetItemString(result, "timezone", py_tz);
+    // The dict holds its own reference on success; ours is released either way.
+    Py_DECREF(py_tz);
+    if (rc < 0) {
+      PyErr_Clear();
+      return Status::UnknownError("Storing the timezone in the result dict failed");
+    }
+    return Status::OK();
+  }
+ private:
+  std::string timezone_;
+// Writer for NPY_TIMEDELTA blocks holding durations of a fixed unit UNIT.
+template <TimeUnit::type UNIT>
+class TimedeltaWriter : public TypedPandasWriter<NPY_TIMEDELTA> {
+ public:
+  using TypedPandasWriter<NPY_TIMEDELTA>::TypedPandasWriter;
+  // Allocate an `ndim`-dimensional timedelta ndarray and tag it with the NumPy
+  // frequency matching UNIT.
+  Status AllocateTimedelta(int ndim) {
+    RETURN_NOT_OK(this->AllocateNDArray(NPY_TIMEDELTA, ndim));
+    SetDatetimeUnit(internal::NumPyFrequency(UNIT));
+    return Status::OK();
+  }
+  // Zero-copy only for a single null-free chunk of durations already in UNIT.
+  bool CanZeroCopy(const ChunkedArray& data) const override {
+    const auto& duration_type = checked_cast<const DurationType&>(*data.type());
+    const bool contiguous = IsNonNullContiguous(data);
+    return contiguous && duration_type.unit() == UNIT;
+  }
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    const auto& duration_type = checked_cast<const DurationType&>(*data->type());
+    DCHECK_EQ(UNIT, duration_type.unit()) << "Should only call instances of this writer "
+                                          << "with arrays of the correct unit";
+    auto column_start = this->GetBlockColumnStart(rel_placement);
+    ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, column_start);
+    return Status::OK();
+  }
+ protected:
+  Status Allocate() override {
+    return AllocateTimedelta(2);
+  }
+using TimedeltaSecondWriter = TimedeltaWriter<TimeUnit::SECOND>;  // TimedeltaWriter instantiations per time unit
+using TimedeltaMilliWriter = TimedeltaWriter<TimeUnit::MILLI>;
+using TimedeltaMicroWriter = TimedeltaWriter<TimeUnit::MICRO>;
+// Nanosecond timedelta writer. Accepts DURATION input of any unit and scales
+// coarser units to nanoseconds with ConvertDatetimeLikeNanos.
+class TimedeltaNanoWriter : public TimedeltaWriter<TimeUnit::NANO> {
+ public:
+  using TimedeltaWriter<TimeUnit::NANO>::TimedeltaWriter;
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    int64_t* out_values = this->GetBlockColumnStart(rel_placement);
+    if (data->type()->id() != Type::DURATION) {
+      return Status::NotImplemented("Cannot write Arrow data of type ",
+                                    data->type()->ToString(),
+                                    " to a Pandas timedelta block.");
+    }
+    const auto& duration_type = checked_cast<const DurationType&>(*data->type());
+    switch (duration_type.unit()) {
+      case TimeUnit::NANO:
+        ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
+        break;
+      case TimeUnit::MICRO:
+        ConvertDatetimeLikeNanos<int64_t, 1000L>(*data, out_values);
+        break;
+      case TimeUnit::MILLI:
+        ConvertDatetimeLikeNanos<int64_t, 1000000L>(*data, out_values);
+        break;
+      case TimeUnit::SECOND:
+        ConvertDatetimeLikeNanos<int64_t, 1000000000L>(*data, out_values);
+        break;
+      default:
+        return Status::NotImplemented("Unsupported time unit");
+    }
+    return Status::OK();
+  }
+// Build an empty array of `type` into `out`, using a builder from the default
+// memory pool that is resized to zero and finished immediately.
+Status MakeZeroLengthArray(const std::shared_ptr<DataType>& type,
+                           std::shared_ptr<Array>* out) {
+  std::unique_ptr<ArrayBuilder> empty_builder;
+  RETURN_NOT_OK(MakeBuilder(default_memory_pool(), type, &empty_builder));
+  RETURN_NOT_OK(empty_builder->Resize(0));
+  return empty_builder->Finish(out);
+// True when `data` has at least two chunks and some chunk's dictionary differs
+// from the dictionary of the first chunk.
+bool NeedDictionaryUnification(const ChunkedArray& data) {
+  const int num_chunks = data.num_chunks();
+  if (num_chunks < 2) {
+    return false;
+  }
+  const auto& first_chunk = checked_cast<const DictionaryArray&>(*data.chunk(0));
+  int chunk_idx = 1;
+  while (chunk_idx < num_chunks) {
+    const auto& chunk = checked_cast<const DictionaryArray&>(*data.chunk(chunk_idx));
+    const bool same_dictionary = first_chunk.dictionary()->Equals(chunk.dictionary());
+    if (!same_dictionary) {
+      return true;
+    }
+    ++chunk_idx;
+  }
+  return false;
+template <typename IndexType>
+class CategoricalWriter
+ : public TypedPandasWriter<arrow_traits<IndexType::type_id>::npy_type> {
+ public:
+ using TRAITS = arrow_traits<IndexType::type_id>;
+ using ArrayType = typename TypeTraits<IndexType>::ArrayType;
+ using T = typename TRAITS::T;
+ explicit CategoricalWriter(const PandasOptions& options, int64_t num_rows)  // Single-column writer: the base is constructed with a column count of 1
+ : TypedPandasWriter<TRAITS::npy_type>(options, num_rows, 1),
+ ordered_(false),  // overwritten from the dictionary type in TransferSingle
+ needs_copy_(false) {}
+ Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {  // Always returns NotImplemented; Write() goes through TransferSingle instead
+ return Status::NotImplemented("categorical type");
+ }
+ // Convert one dictionary-typed column: write the indices block (an empty
+ // ndarray when there are no chunks), convert the dictionary values through
+ // ConvertArrayToPandas, and remember whether the dictionary type is ordered.
+ Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) override {
+   const auto& dict_type = checked_cast<const DictionaryType&>(*data->type());
+   std::shared_ptr<Array> dict;
+   if (data->num_chunks() != 0) {
+     DCHECK_EQ(IndexType::type_id, dict_type.index_type()->id());
+     RETURN_NOT_OK(WriteIndices(*data, &dict));
+   } else {
+     // no dictionary values => create empty array
+     RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1));
+     RETURN_NOT_OK(MakeZeroLengthArray(dict_type.value_type(), &dict));
+   }
+   PyObject* py_dictionary;
+   RETURN_NOT_OK(ConvertArrayToPandas(this->options_, dict, nullptr, &py_dictionary));
+   dictionary_.reset(py_dictionary);
+   ordered_ = dict_type.ordered();
+   return Status::OK();
+ }
+ Status Write(std::shared_ptr<ChunkedArray> data, int64_t abs_placement,  // Converts the column via TransferSingle and records its absolute placement
+ int64_t rel_placement) override {
+ RETURN_NOT_OK(this->EnsurePlacementAllocated());
+ RETURN_NOT_OK(TransferSingle(data, /*py_ref=*/nullptr));
+ this->placement_data_[rel_placement] = abs_placement;  // placement slot for this column
+ return Status::OK();
+ }
+ // Build the result dict for a single series: "indices" plus the entries
+ // added by AddResultMetadata. On any failure the partially built dict is
+ // released and an error Status is returned; `*out` is only set on success.
+ Status GetSeriesResult(PyObject** out) override {
+   PyAcquireGIL lock;
+   PyObject* result = PyDict_New();
+   if (result == nullptr) {
+     PyErr_Clear();
+     return Status::UnknownError("Allocating the result dict failed");
+   }
+   // Expected single array dictionary layout
+   if (PyDict_SetItemString(result, "indices", this->block_arr_.obj()) < 0) {
+     Py_DECREF(result);
+     PyErr_Clear();
+     return Status::UnknownError("Storing the indices in the result dict failed");
+   }
+   Status metadata_status = AddResultMetadata(result);
+   if (!metadata_status.ok()) {
+     Py_DECREF(result);
+     return metadata_status;
+   }
+   *out = result;
+   return Status::OK();
+ }
+ protected:
+ // Add the "dictionary" and "ordered" entries to `result`.
+ // PyDict_SetItemString takes its own reference to the value, so no extra
+ // Py_INCREF is needed (the previous one leaked a reference on every call).
+ Status AddResultMetadata(PyObject* result) override {
+   PyObject* py_ordered = ordered_ ? Py_True : Py_False;
+   if (PyDict_SetItemString(result, "dictionary", dictionary_.obj()) < 0 ||
+       PyDict_SetItemString(result, "ordered", py_ordered) < 0) {
+     PyErr_Clear();
+     return Status::UnknownError("Storing categorical metadata in the result dict failed");
+   }
+   return Status::OK();
+ }
+ // Copy the dictionary indices of every chunk into a freshly allocated 1-D
+ // block of the index type. Used when the chunks need no dictionary
+ // unification. Indices are bounds-checked against the chunk's dictionary
+ // length; null slots are written as -1.
+ Status WriteIndicesUniform(const ChunkedArray& data) {
+   RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1));
+   T* out_values = reinterpret_cast<T*>(this->block_data_);
+   for (int c = 0; c < data.num_chunks(); c++) {
+     const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
+     const auto& indices = checked_cast<const ArrayType&>(*arr.indices());
+     auto values = reinterpret_cast<const T*>(indices.raw_values());
+     RETURN_NOT_OK(CheckIndexBounds(*indices.data(), arr.dictionary()->length()));
+     // Null is -1 in CategoricalBlock
+     // 64-bit row index, as in the other row loops of this file, so it cannot
+     // overflow before reaching the array length.
+     const int64_t length = arr.length();
+     for (int64_t i = 0; i < length; ++i) {
+       if (indices.IsValid(i)) {
+         *out_values++ = values[i];
+       } else {
+         *out_values++ = -1;
+       }
+     }
+   }
+   return Status::OK();
+ }
+ // Write indices for chunks whose dictionaries differ. Each chunk's
+ // dictionary is fed to a DictionaryUnifier, and its indices are remapped
+ // through the transpose buffer the unifier returns. The unified dictionary is
+ // returned through `out_dict`. Null slots are written as -1.
+ Status WriteIndicesVarying(const ChunkedArray& data, std::shared_ptr<Array>* out_dict) {
+   // Yield int32 indices to allow for dictionary outgrowing the current index
+   // type
+   RETURN_NOT_OK(this->AllocateNDArray(NPY_INT32, 1));
+   auto out_values = reinterpret_cast<int32_t*>(this->block_data_);
+   const auto& dict_type = checked_cast<const DictionaryType&>(*data.type());
+   ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(dict_type.value_type(),
+                                                               this->options_.pool));
+   for (int c = 0; c < data.num_chunks(); c++) {
+     const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
+     const auto& indices = checked_cast<const ArrayType&>(*arr.indices());
+     auto values = reinterpret_cast<const T*>(indices.raw_values());
+     std::shared_ptr<Buffer> transpose_buffer;
+     RETURN_NOT_OK(unifier->Unify(*arr.dictionary(), &transpose_buffer));
+     auto transpose = reinterpret_cast<const int32_t*>(transpose_buffer->data());
+     int64_t dict_length = arr.dictionary()->length();
+     RETURN_NOT_OK(CheckIndexBounds(*indices.data(), dict_length));
+     // Null is -1 in CategoricalBlock
+     // 64-bit row index, as in the other row loops of this file, so it cannot
+     // overflow before reaching the array length.
+     const int64_t length = arr.length();
+     for (int64_t i = 0; i < length; ++i) {
+       if (indices.IsValid(i)) {
+         *out_values++ = transpose[values[i]];
+       } else {
+         *out_values++ = -1;
+       }
+     }
+   }
+   std::shared_ptr<DataType> unused_type;
+   return unifier->GetResult(&unused_type, out_dict);
+ }
+ // Produce the indices block for `data` (which must have at least one chunk)
+ // and return the matching dictionary through `out_dict`.
+ //  * one chunk without null indices: wrap the indices as a NumPy view
+ //  * otherwise: copy, unifying the dictionaries when they differ
+ Status WriteIndices(const ChunkedArray& data, std::shared_ptr<Array>* out_dict) {
+   DCHECK_GT(data.num_chunks(), 0);
+   // Sniff the first chunk
+   const auto& arr_first = checked_cast<const DictionaryArray&>(*data.chunk(0));
+   const auto indices_first = std::static_pointer_cast<ArrayType>(arr_first.indices());
+   if (data.num_chunks() == 1 && indices_first->null_count() == 0) {
+     // An out-of-bounds index must abort the conversion, so the Status of the
+     // check is propagated.
+     RETURN_NOT_OK(
+         CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length()));
+     PyObject* wrapped;
+     npy_intp dims[1] = {static_cast<npy_intp>(this->num_rows_)};
+     RETURN_NOT_OK(MakeNumPyView(indices_first, /*py_ref=*/nullptr, TRAITS::npy_type,
+                                 /*ndim=*/1, dims, &wrapped));
+     this->SetBlockData(wrapped);
+     *out_dict = arr_first.dictionary();
+   } else {
+     RETURN_NOT_OK(this->CheckNotZeroCopyOnly(data));
+     if (NeedDictionaryUnification(data)) {
+       RETURN_NOT_OK(WriteIndicesVarying(data, out_dict));
+     } else {
+       RETURN_NOT_OK(WriteIndicesUniform(data));
+       *out_dict = arr_first.dictionary();
+     }
+   }
+   return Status::OK();
+ }
+ OwnedRefNoGIL dictionary_;
+ bool ordered_;
+ bool needs_copy_;
+// Writer for extension columns: instead of filling a NumPy block it keeps the
+// chunked array wrapped as a Python object and hands that object back in the
+// result.
+class ExtensionWriter : public PandasWriter {
+ public:
+  using PandasWriter::PandasWriter;
+  Status Allocate() override {
+    // no-op
+    return Status::OK();
+  }
+  // Wrap `data` as a Python object and keep it in `py_array_`.
+  Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) override {
+    PyAcquireGIL lock;
+    PyObject* py_array = wrap_chunked_array(data);
+    if (py_array == nullptr) {
+      PyErr_Clear();
+      return Status::UnknownError("Wrapping the chunked array as a Python object failed");
+    }
+    py_array_.reset(py_array);
+    return Status::OK();
+  }
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    return TransferSingle(data, nullptr);
+  }
+  // Build a dict with the "py_array" and "placement" entries. The dict is
+  // released again if it cannot be filled; `*out` is only set on success.
+  Status GetDataFrameResult(PyObject** out) override {
+    PyAcquireGIL lock;
+    PyObject* result = PyDict_New();
+    if (result == nullptr) {
+      PyErr_Clear();
+      return Status::UnknownError("Allocating the result dict failed");
+    }
+    if (PyDict_SetItemString(result, "py_array", py_array_.obj()) < 0 ||
+        PyDict_SetItemString(result, "placement", placement_arr_.obj()) < 0) {
+      Py_DECREF(result);
+      PyErr_Clear();
+      return Status::UnknownError("Filling the extension result dict failed");
+    }
+    *out = result;
+    return Status::OK();
+  }
+  // Hand ownership of the wrapped array to the caller.
+  Status GetSeriesResult(PyObject** out) override {
+    *out = py_array_.detach();
+    return Status::OK();
+  }
+ protected:
+  OwnedRefNoGIL py_array_;
+// Create the PandasWriter implementation for `writer_type` and store it in
+// `writer`.
+//  * CATEGORICAL picks a CategoricalWriter by the dictionary's index type
+//    (`type` must be a DictionaryType); unsigned index types yield TypeError.
+//  * DATETIME_NANO_TZ reads the timezone from `type` (must be a TimestampType).
+//  * All other writers are built from (options, num_rows, num_columns).
+// Unknown writer types return NotImplemented.
+Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
+                  const DataType& type, int64_t num_rows, int num_columns,
+                  std::shared_ptr<PandasWriter>* writer) {
+// One `case` per block writer type.
+#define BLOCK_CASE(NAME, TYPE)                                        \
+  case PandasWriter::NAME:                                            \
+    *writer = std::make_shared<TYPE>(options, num_rows, num_columns); \
+    break;
+// One `case` per supported dictionary index type.
+#define CATEGORICAL_CASE(TYPE)                                              \
+  case TYPE::type_id:                                                       \
+    *writer = std::make_shared<CategoricalWriter<TYPE>>(options, num_rows); \
+    break;
+  switch (writer_type) {
+    case PandasWriter::CATEGORICAL: {
+      const auto& index_type = *checked_cast<const DictionaryType&>(type).index_type();
+      switch (index_type.id()) {
+        CATEGORICAL_CASE(Int8Type);
+        CATEGORICAL_CASE(Int16Type);
+        CATEGORICAL_CASE(Int32Type);
+        CATEGORICAL_CASE(Int64Type);
+        case Type::UINT8:
+        case Type::UINT16:
+        case Type::UINT32:
+        case Type::UINT64:
+          return Status::TypeError(
+              "Converting unsigned dictionary indices to pandas",
+              " not yet supported, index type: ", index_type.ToString());
+        default:
+          // Unreachable
+          DCHECK(false);
+          break;
+      }
+    } break;
+    case PandasWriter::EXTENSION:
+      *writer = std::make_shared<ExtensionWriter>(options, num_rows, num_columns);
+      break;
+      BLOCK_CASE(OBJECT, ObjectWriter);
+      BLOCK_CASE(UINT8, UInt8Writer);
+      BLOCK_CASE(INT8, Int8Writer);
+      BLOCK_CASE(UINT16, UInt16Writer);
+      BLOCK_CASE(INT16, Int16Writer);
+      BLOCK_CASE(UINT32, UInt32Writer);
+      BLOCK_CASE(INT32, Int32Writer);
+      BLOCK_CASE(UINT64, UInt64Writer);
+      BLOCK_CASE(INT64, Int64Writer);
+      BLOCK_CASE(HALF_FLOAT, Float16Writer);
+      BLOCK_CASE(FLOAT, Float32Writer);
+      BLOCK_CASE(DOUBLE, Float64Writer);
+      BLOCK_CASE(BOOL, BoolWriter);
+      BLOCK_CASE(DATETIME_DAY, DatetimeDayWriter);
+      BLOCK_CASE(DATETIME_SECOND, DatetimeSecondWriter);
+      BLOCK_CASE(DATETIME_MILLI, DatetimeMilliWriter);
+      BLOCK_CASE(DATETIME_MICRO, DatetimeMicroWriter);
+      BLOCK_CASE(DATETIME_NANO, DatetimeNanoWriter);
+      BLOCK_CASE(TIMEDELTA_SECOND, TimedeltaSecondWriter);
+      BLOCK_CASE(TIMEDELTA_MILLI, TimedeltaMilliWriter);
+      BLOCK_CASE(TIMEDELTA_MICRO, TimedeltaMicroWriter);
+      BLOCK_CASE(TIMEDELTA_NANO, TimedeltaNanoWriter);
+    case PandasWriter::DATETIME_NANO_TZ: {
+      const auto& ts_type = checked_cast<const TimestampType&>(type);
+      *writer = std::make_shared<DatetimeTZWriter>(options, ts_type.timezone(), num_rows);
+    } break;
+    default:
+      return Status::NotImplemented("Unsupported block type");
+  }
+#undef BLOCK_CASE
+#undef CATEGORICAL_CASE
+  return Status::OK();
+static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& options,
+ PandasWriter::type* output_type) {
+ *output_type = \
+ data.null_count() > 0 \
+ ? options.integer_object_nulls ? PandasWriter::OBJECT : PandasWriter::DOUBLE \
+ : PandasWriter::NAME; \
+ break;
+ switch (data.type()->id()) {
+ case Type::BOOL:
+ *output_type = data.null_count() > 0 ? PandasWriter::OBJECT : PandasWriter::BOOL;
+ break;
+ case Type::UINT8:
+ case Type::INT8:
+ case Type::UINT16:
+ case Type::INT16:
+ case Type::UINT32:
+ case Type::INT32:
+ case Type::UINT64:
+ case Type::INT64:
+ case Type::HALF_FLOAT:
+ *output_type = PandasWriter::HALF_FLOAT;
+ break;
+ case Type::FLOAT:
+ *output_type = PandasWriter::FLOAT;
+ break;
+ case Type::DOUBLE:
+ *output_type = PandasWriter::DOUBLE;
+ break;
+ case Type::STRING: // fall through
+ case Type::LARGE_STRING: // fall through
+ case Type::BINARY: // fall through
+ case Type::LARGE_BINARY:
+ case Type::NA: // fall through
+ case Type::FIXED_SIZE_BINARY: // fall through
+ case Type::STRUCT: // fall through
+ case Type::TIME32: // fall through
+ case Type::TIME64: // fall through
+ case Type::DECIMAL128: // fall through
+ case Type::DECIMAL256: // fall through
+ *output_type = PandasWriter::OBJECT;
+ break;
+ case Type::DATE32: // fall through
+ case Type::DATE64:
+ if (options.date_as_object) {
+ *output_type = PandasWriter::OBJECT;
+ } else {
+ *output_type = options.coerce_temporal_nanoseconds ? PandasWriter::DATETIME_NANO
+ : PandasWriter::DATETIME_DAY;
+ }
+ break;
+ case Type::TIMESTAMP: {
+ const auto& ts_type = checked_cast<const TimestampType&>(*data.type());
+ if (options.timestamp_as_object && ts_type.unit() != TimeUnit::NANO) {
+ // Nanoseconds are never out of bounds for pandas, so in that case
+ // we don't convert to object
+ *output_type = PandasWriter::OBJECT;
+ } else if (!ts_type.timezone().empty()) {
+ *output_type = PandasWriter::DATETIME_NANO_TZ;
+ } else if (options.coerce_temporal_nanoseconds) {
+ *output_type = PandasWriter::DATETIME_NANO;
+ } else {
+ switch (ts_type.unit()) {
+ case TimeUnit::SECOND:
+ *output_type = PandasWriter::DATETIME_SECOND;
+ break;
+ case TimeUnit::MILLI:
+ *output_type = PandasWriter::DATETIME_MILLI;
+ break;
+ case TimeUnit::MICRO:
+ *output_type = PandasWriter::DATETIME_MICRO;
+ break;
+ case TimeUnit::NANO:
+ *output_type = PandasWriter::DATETIME_NANO;
+ break;
+ }
+ }
+ } break;
+ case Type::DURATION: {
+ const auto& dur_type = checked_cast<const DurationType&>(*data.type());
+ if (options.coerce_temporal_nanoseconds) {
+ *output_type = PandasWriter::TIMEDELTA_NANO;
+ } else {
+ switch (dur_type.unit()) {
+ case TimeUnit::SECOND:
+ *output_type = PandasWriter::TIMEDELTA_SECOND;
+ break;
+ case TimeUnit::MILLI:
+ *output_type = PandasWriter::TIMEDELTA_MILLI;
+ break;
+ case TimeUnit::MICRO:
+ *output_type = PandasWriter::TIMEDELTA_MICRO;
+ break;
+ case TimeUnit::NANO:
+ *output_type = PandasWriter::TIMEDELTA_NANO;
+ break;
+ }
+ }
+ } break;
+ case Type::FIXED_SIZE_LIST:
+ case Type::LIST:
+ case Type::LARGE_LIST:
+ case Type::MAP: {
+ auto list_type = std::static_pointer_cast<BaseListType>(data.type());
+ if (!ListTypeSupported(*list_type->value_type())) {
+ return Status::NotImplemented("Not implemented type for Arrow list to pandas: ",
+ list_type->value_type()->ToString());
+ }
+ *output_type = PandasWriter::OBJECT;
+ } break;
+ case Type::DICTIONARY:
+ *output_type = PandasWriter::CATEGORICAL;
+ break;
+ case Type::EXTENSION:
+ *output_type = PandasWriter::EXTENSION;
+ break;
+ default:
+ return Status::NotImplemented(
+ "No known equivalent Pandas block for Arrow data of type ",
+ data.type()->ToString(), " is known.");
+ }
+ return Status::OK();
+// Construct the exact pandas "BlockManager" memory layout
+// * For each column determine the correct output pandas type
+// * Allocate 2D blocks (ncols x nrows) for each distinct data type in output
+// * Allocate block placement arrays
+// * Write Arrow columns out into each slice of memory; populate block
+//   placement arrays as we go
+// Base class for strategies that turn a set of Arrow columns into the list of
+// block dicts consumed on the Python side. Holds the conversion options, the
+// fields and columns, and each column's relative placement inside its block.
+class PandasBlockCreator {
+ public:
+  using WriterMap = std::unordered_map<int, std::shared_ptr<PandasWriter>>;
+  // The row count is taken from the first column; it stays 0 when there are
+  // no columns.
+  explicit PandasBlockCreator(const PandasOptions& options, FieldVector fields,
+                              ChunkedArrayVector arrays)
+      : options_(options), fields_(std::move(fields)), arrays_(std::move(arrays)) {
+    num_columns_ = static_cast<int>(arrays_.size());
+    if (num_columns_ > 0) {
+      num_rows_ = arrays_[0]->length();
+    }
+    column_block_placement_.resize(num_columns_);
+  }
+  // Subclasses override virtual functions, so destruction through a base
+  // pointer must be well defined.
+  virtual ~PandasBlockCreator() = default;
+  // Run the conversion and return the resulting Python object through `out`.
+  virtual Status Convert(PyObject** out) = 0;
+  // Append the DataFrame result of every writer in `blocks` to the Python
+  // list `list`. Stops at the first failure and returns an error Status.
+  Status AppendBlocks(const WriterMap& blocks, PyObject* list) {
+    for (const auto& it : blocks) {
+      PyObject* item;
+      RETURN_NOT_OK(it.second->GetDataFrameResult(&item));
+      const int rc = PyList_Append(list, item);
+      // ARROW-1017; PyList_Append increments object refcount, so our own
+      // reference is released whether or not the append succeeded.
+      Py_DECREF(item);
+      if (rc < 0) {
+        PyErr_Clear();
+        return Status::UnknownError("Appending a block to the result list failed");
+      }
+    }
+    return Status::OK();
+  }
+ protected:
+  PandasOptions options_;
+  FieldVector fields_;
+  ChunkedArrayVector arrays_;
+  int num_columns_;
+  int64_t num_rows_ = 0;
+  // column num -> relative placement within internal block
+  std::vector<int> column_block_placement_;
+class ConsolidatedBlockCreator : public PandasBlockCreator {
+ public:
+ using PandasBlockCreator::PandasBlockCreator;
+ // Create the blocks, write every column into them, then return a Python
+ // list holding the consolidated blocks followed by the singleton blocks.
+ // The list is released again if it cannot be filled; `*out` is only set on
+ // success.
+ Status Convert(PyObject** out) override {
+   column_types_.resize(num_columns_);
+   RETURN_NOT_OK(CreateBlocks());
+   RETURN_NOT_OK(WriteTableToBlocks());
+   PyAcquireGIL lock;
+   PyObject* result = PyList_New(0);
+   if (result == nullptr) {
+     PyErr_Clear();
+     return Status::UnknownError("Allocating the result list failed");
+   }
+   Status append_status = AppendBlocks(blocks_, result);
+   if (append_status.ok()) {
+     append_status = AppendBlocks(singleton_blocks_, result);
+   }
+   if (!append_status.ok()) {
+     Py_DECREF(result);
+     return append_status;
+   }
+   *out = result;
+   return Status::OK();
+ }
+ // Writer type for one column: EXTENSION when the field's name is listed in
+ // `options_.extension_columns`, otherwise whatever GetPandasWriterType
+ // selects for the column's data.
+ Status GetBlockType(int column_index, PandasWriter::type* out) {
+   const auto& column_name = fields_[column_index]->name();
+   if (options_.extension_columns.count(column_name) == 0) {
+     return GetPandasWriterType(*arrays_[column_index], options_, out);
+   }
+   *out = PandasWriter::EXTENSION;
+   return Status::OK();
+ }
+ // First pass: determine every column's writer type. CATEGORICAL,
+ // DATETIME_NANO_TZ and EXTENSION columns each get their own single-column
+ // writer; all other columns are counted per writer type and assigned the
+ // next free relative placement within that type's block.
+ // Second pass: create one writer per counted type, sized to its column count.
+ Status CreateBlocks() {
+   for (int col = 0; col < num_columns_; ++col) {
+     const DataType& type = *arrays_[col]->type();
+     PandasWriter::type output_type;
+     RETURN_NOT_OK(GetBlockType(col, &output_type));
+     const bool is_singleton = output_type == PandasWriter::CATEGORICAL ||
+                               output_type == PandasWriter::DATETIME_NANO_TZ ||
+                               output_type == PandasWriter::EXTENSION;
+     int block_placement = 0;
+     if (is_singleton) {
+       std::shared_ptr<PandasWriter> writer;
+       RETURN_NOT_OK(MakeWriter(options_, output_type, type, num_rows_,
+                                /*num_columns=*/1, &writer));
+       singleton_blocks_[col] = writer;
+     } else {
+       // A missing key is inserted with count 0, so the first column of a
+       // type gets placement 0 and leaves the count at 1.
+       int& column_count = block_sizes_[output_type];
+       block_placement = column_count;
+       ++column_count;
+     }
+     column_types_[col] = output_type;
+     column_block_placement_[col] = block_placement;
+   }
+   // Create normal non-categorical blocks
+   for (const auto& entry : this->block_sizes_) {
+     const auto output_type = static_cast<PandasWriter::type>(entry.first);
+     std::shared_ptr<PandasWriter> block;
+     RETURN_NOT_OK(MakeWriter(this->options_, output_type, /*unused*/ *null(), num_rows_,
+                              entry.second, &block));
+     this->blocks_[output_type] = block;
+   }
+   return Status::OK();
+ }
+ Status GetWriter(int i, std::shared_ptr<PandasWriter>* block) {
+ PandasWriter::type output_type = this->column_types_[i];
+ switch (output_type) {
+ case PandasWriter::CATEGORICAL:
+ case PandasWriter::DATETIME_NANO_TZ:
+ case PandasWriter::EXTENSION: {
+ auto it = this->singleton_blocks_.find(i);
+ if (it == this->singleton_blocks_.end()) {
+ return Status::KeyError("No block allocated");
+ }
+ *block = it->second;
+ } break;
+ default:
+ auto it = this->blocks_.find(output_type);
+ if (it == this->blocks_.end()) {
+ return Status::KeyError("No block allocated");
+ }
+ *block = it->second;
+ break;
+ }
+ return Status::OK();
+ }
+ Status WriteTableToBlocks() {
+ auto WriteColumn = [this](int i) {
+ std::shared_ptr<PandasWriter> block;
+ RETURN_NOT_OK(this->GetWriter(i, &block));
+ // ARROW-3789 Use std::move on the array to permit self-destructing
+ return block->Write(std::move(arrays_[i]), i, this->column_block_placement_[i]);
+ };
+ return OptionalParallelFor(options_.use_threads, num_columns_, WriteColumn);
+ }
+ private:
+ // column num -> block type id
+ std::vector<PandasWriter::type> column_types_;
+ // block type -> type count
+ std::unordered_map<int, int> block_sizes_;
+ std::unordered_map<int, const DataType*> block_types_;
+ // block type -> block
+ WriterMap blocks_;
+ WriterMap singleton_blocks_;
+/// \brief Create blocks for pandas.DataFrame block manager using one block per
+/// column strategy. This permits some zero-copy optimizations as well as the
+/// ability for the table to "self-destruct" if selected by the user.
+class SplitBlockCreator : public PandasBlockCreator {
+ public:
+  using PandasBlockCreator::PandasBlockCreator;
+  // Create a single-column writer for column `i`. Columns named in
+  // options_.extension_columns use an EXTENSION writer; otherwise the type is
+  // chosen by GetPandasWriterType.
+  Status GetWriter(int i, std::shared_ptr<PandasWriter>* writer) {
+    PandasWriter::type output_type = PandasWriter::OBJECT;
+    const DataType& type = *arrays_[i]->type();
+    if (options_.extension_columns.count(fields_[i]->name())) {
+      output_type = PandasWriter::EXTENSION;
+    } else {
+      // Null count needed to determine output type
+      RETURN_NOT_OK(GetPandasWriterType(*arrays_[i], options_, &output_type));
+    }
+    return MakeWriter(this->options_, output_type, type, num_rows_, 1, writer);
+  }
+  /// \brief Convert each column into its own block, collected in a new list.
+  /// \param[out] out receives a new reference to the list of block results
+  /// \return the first error hit while writing or collecting a column
+  Status Convert(PyObject** out) override {
+    PyAcquireGIL lock;
+    // Owned here so the list is released if a column fails to convert
+    OwnedRef result(PyList_New(0));
+    // PyList_New leaves a Python error set when it fails
+    RETURN_NOT_OK(CheckPyError());
+    for (int i = 0; i < num_columns_; ++i) {
+      std::shared_ptr<PandasWriter> writer;
+      RETURN_NOT_OK(GetWriter(i, &writer));
+      // ARROW-3789 Use std::move on the array to permit self-destructing
+      RETURN_NOT_OK(writer->Write(std::move(arrays_[i]), i, /*rel_placement=*/0));
+      PyObject* item;
+      RETURN_NOT_OK(writer->GetDataFrameResult(&item));
+      // PyList_Append increments object refcount, so our own reference is
+      // released at the end of the iteration whether or not the append worked
+      OwnedRef item_ref(item);
+      if (PyList_Append(result.obj(), item) < 0) {
+        RETURN_NOT_OK(CheckPyError());
+      }
+    }
+    *out = result.detach();
+    return Status::OK();
+  }
+ private:
+  // NOTE(review): not referenced by any method of this class
+  std::vector<std::shared_ptr<PandasWriter>> writers_;
+};
+// Dictionary-encode, in place, every column that has to become a pandas
+// Categorical: non-dictionary columns named in options.categorical_columns and,
+// when options.strings_to_categorical is set, all binary-like columns. The
+// matching entries of `fields` are updated to the encoded type.
+// Returns Status::Invalid if a column needs encoding while
+// options.zero_copy_only is set.
+Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arrays,
+                           FieldVector* fields) {
+  std::vector<int> columns_to_encode;
+  // For Categorical conversions
+  auto EncodeColumn = [&](int j) {
+    int i = columns_to_encode[j];
+    if (options.zero_copy_only) {
+      return Status::Invalid("Need to dictionary encode a column, but ",
+                             "only zero-copy conversions allowed");
+    }
+    compute::ExecContext ctx(options.pool);
+    ARROW_ASSIGN_OR_RAISE(
+        Datum out, DictionaryEncode((*arrays)[i],
+                                    compute::DictionaryEncodeOptions::Defaults(), &ctx));
+    (*arrays)[i] = out.chunked_array();
+    (*fields)[i] = (*fields)[i]->WithType((*arrays)[i]->type());
+    return Status::OK();
+  };
+  // Queue each column at most once, even when it matches both criteria:
+  // encoding the same column twice would re-encode an already encoded array
+  // and, with use_threads, let two tasks write the same slot concurrently.
+  const bool has_categorical_columns = !options.categorical_columns.empty();
+  for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
+    const auto type_id = (*arrays)[i]->type()->id();
+    const bool requested = has_categorical_columns && type_id != Type::DICTIONARY &&
+                           options.categorical_columns.count((*fields)[i]->name()) > 0;
+    const bool is_string =
+        options.strings_to_categorical && is_base_binary_like(type_id);
+    if (requested || is_string) {
+      columns_to_encode.push_back(i);
+    }
+  }
+  return OptionalParallelFor(options.use_threads,
+                             static_cast<int>(columns_to_encode.size()), EncodeColumn);
+}
+} // namespace
+// Convert a single Array by wrapping it in a one-chunk ChunkedArray and
+// delegating to ConvertChunkedArrayToPandas.
+Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr,
+                            PyObject* py_ref, PyObject** out) {
+  return ConvertChunkedArrayToPandas(
+      options, std::make_shared<ChunkedArray>(std::move(arr)), py_ref, out);
+}
+// Convert one ChunkedArray to a pandas Series-compatible result.
+// Dictionary arrays are decoded first when options.decode_dictionaries is set,
+// and binary-like arrays are dictionary-encoded when
+// options.strings_to_categorical is set (Status::Invalid if zero_copy_only).
+// `py_ref` is handed to the writer's TransferSingle; `out` receives the result
+// of GetSeriesResult.
+Status ConvertChunkedArrayToPandas(const PandasOptions& options,
+                                   std::shared_ptr<ChunkedArray> arr, PyObject* py_ref,
+                                   PyObject** out) {
+  if (options.decode_dictionaries && arr->type()->id() == Type::DICTIONARY) {
+    const auto& dense_type =
+        checked_cast<const DictionaryType&>(*arr->type()).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &arr));
+    DCHECK_NE(arr->type()->id(), Type::DICTIONARY);
+    // The original Python DictionaryArray won't own the memory anymore
+    // as we actually built a new array when we decoded the DictionaryArray
+    // thus let the final resulting numpy array own the memory through a Capsule
+    py_ref = nullptr;
+  }
+  if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) {
+    if (options.zero_copy_only) {
+      return Status::Invalid("Need to dictionary encode a column, but ",
+                             "only zero-copy conversions allowed");
+    }
+    compute::ExecContext ctx(options.pool);
+    ARROW_ASSIGN_OR_RAISE(
+        Datum out,
+        DictionaryEncode(arr, compute::DictionaryEncodeOptions::Defaults(), &ctx));
+    arr = out.chunked_array();
+  }
+  PandasOptions modified_options = options;
+  // Any required encoding has already been applied above
+  modified_options.strings_to_categorical = false;
+  // ARROW-7596: We permit the hybrid Series/DataFrame code path to do zero copy
+  // optimizations that we do not allow in the default case when converting
+  // Table->DataFrame
+  modified_options.allow_zero_copy_blocks = true;
+  PandasWriter::type output_type;
+  RETURN_NOT_OK(GetPandasWriterType(*arr, modified_options, &output_type));
+  if (options.decode_dictionaries) {
+    DCHECK_NE(output_type, PandasWriter::CATEGORICAL);
+  }
+  std::shared_ptr<PandasWriter> writer;
+  RETURN_NOT_OK(MakeWriter(modified_options, output_type, *arr->type(), arr->length(),
+                           /*num_columns=*/1, &writer));
+  RETURN_NOT_OK(writer->TransferSingle(std::move(arr), py_ref));
+  return writer->GetSeriesResult(out);
+}
+// Convert a whole table into the list of blocks used to build a
+// pandas.DataFrame. Columns that must be categorical are dictionary-encoded
+// first; the blocks are then produced by SplitBlockCreator when
+// options.split_blocks is set and by ConsolidatedBlockCreator otherwise.
+Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
+                            PyObject** out) {
+  ChunkedArrayVector arrays = table->columns();
+  FieldVector fields = table->fields();
+  // ARROW-3789: allow "self-destructing" by releasing references to columns as
+  // we convert them to pandas
+  table = nullptr;
+  RETURN_NOT_OK(ConvertCategoricals(options, &arrays, &fields));
+  PandasOptions modified_options = options;
+  // The categorical conversions were applied by ConvertCategoricals above
+  modified_options.strings_to_categorical = false;
+  modified_options.categorical_columns.clear();
+  if (options.split_blocks) {
+    modified_options.allow_zero_copy_blocks = true;
+    SplitBlockCreator helper(modified_options, std::move(fields), std::move(arrays));
+    return helper.Convert(out);
+  } else {
+    ConsolidatedBlockCreator helper(modified_options, std::move(fields),
+                                    std::move(arrays));
+    return helper.Convert(out);
+  }
+}
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/arrow_to_pandas.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/arrow_to_pandas.h
new file mode 100644
index 0000000000..6570364b8d
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/arrow_to_pandas.h
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Functions for converting between pandas's NumPy-based data representation
+// and Arrow data structures
+#pragma once
+#include "arrow/python/platform.h"
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include "arrow/memory_pool.h"
+#include "arrow/python/visibility.h"
+namespace arrow {
+class Array;
+class ChunkedArray;
+class Column;
+class DataType;
+class MemoryPool;
+class Status;
+class Table;
+namespace py {
+/// \brief Options controlling the conversion of Arrow data to pandas objects
+struct PandasOptions {
+  /// arrow::MemoryPool to use for memory allocations
+  MemoryPool* pool = default_memory_pool();
+  /// If true, we will convert all string columns to categoricals
+  bool strings_to_categorical = false;
+  /// If true, a conversion that needs to dictionary-encode a column fails
+  /// with Status::Invalid.
+  /// NOTE(review): presumably other copying conversions are rejected too
+  bool zero_copy_only = false;
+  /// NOTE(review): presumably converts integer columns with nulls to Python
+  /// objects -- confirm against the writers
+  bool integer_object_nulls = false;
+  /// NOTE(review): presumably returns dates as Python objects -- confirm
+  bool date_as_object = false;
+  /// NOTE(review): presumably returns timestamps as Python objects -- confirm
+  bool timestamp_as_object = false;
+  /// If true, per-column work (dictionary encoding, block writes) may run in
+  /// parallel
+  bool use_threads = false;
+  /// Coerce all date and timestamp to datetime64[ns]
+  bool coerce_temporal_nanoseconds = false;
+  /// Used to maintain backwards compatibility for
+  /// timezone bugs (see ARROW-9528). Should be removed
+  /// after Arrow 2.0 release.
+  bool ignore_timezone = false;
+  /// \brief If true, do not create duplicate PyObject versions of equal
+  /// objects. This only applies to immutable objects like strings or datetime
+  /// objects
+  bool deduplicate_objects = false;
+  /// \brief For certain data types, a cast is needed in order to store the
+  /// data in a pandas DataFrame or Series (e.g. timestamps are always stored
+  /// as nanoseconds in pandas). This option controls whether it is a safe
+  /// cast or not.
+  bool safe_cast = true;
+  /// \brief If true, create one block per column rather than consolidated
+  /// blocks (1 per data type). Do zero-copy wrapping when there are no
+  /// nulls. pandas currently will consolidate the blocks on its own, causing
+  /// increased memory use, so keep this in mind if you are working on a
+  /// memory-constrained situation.
+  bool split_blocks = false;
+  /// \brief If true, allow non-writable zero-copy views to be created for
+  /// single column blocks. This option is also used to provide zero copy for
+  /// Series data
+  bool allow_zero_copy_blocks = false;
+  /// \brief If true, attempt to deallocate buffers in passed Arrow object if
+  /// it is the only remaining shared_ptr copy of it. See ARROW-3789 for
+  /// original context for this feature. Only currently implemented for Table
+  /// conversions
+  bool self_destruct = false;
+  // Used internally for nested arrays.
+  bool decode_dictionaries = false;
+  // Columns that should be cast to categorical
+  std::unordered_set<std::string> categorical_columns;
+  // Columns that should be passed through to be converted to
+  // ExtensionArray/Block
+  std::unordered_set<std::string> extension_columns;
+};
+Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr,
+ PyObject* py_ref, PyObject** out);
+Status ConvertChunkedArrayToPandas(const PandasOptions& options,
+ std::shared_ptr<ChunkedArray> col, PyObject* py_ref,
+ PyObject** out);
+// Convert a whole table as efficiently as possible to a pandas.DataFrame.
+// The returned Python object is a list of tuples consisting of the exact 2D
+// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x.
+// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
+Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
+ PyObject** out);
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/benchmark.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/benchmark.cc
new file mode 100644
index 0000000000..2d29f69d25
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/benchmark.cc
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include <arrow/python/benchmark.h>
+#include <arrow/python/helpers.h>
+namespace arrow {
+namespace py {
+namespace benchmark {
+// Call internal::PandasObjectIsNull() once on every item of `list`, discarding
+// the results. Sets a Python TypeError and returns early if `list` is not
+// exactly a list.
+void Benchmark_PandasObjectIsNull(PyObject* list) {
+  if (!PyList_CheckExact(list)) {
+    PyErr_SetString(PyExc_TypeError, "expected a list");
+    return;
+  }
+  const Py_ssize_t n = PyList_GET_SIZE(list);
+  for (Py_ssize_t i = 0; i < n; i++) {
+    internal::PandasObjectIsNull(PyList_GET_ITEM(list, i));
+  }
+}
+} // namespace benchmark
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/benchmark.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/benchmark.h
new file mode 100644
index 0000000000..8060dd3372
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/benchmark.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include "arrow/python/platform.h"
+#include "arrow/python/visibility.h"
+namespace arrow {
+namespace py {
+namespace benchmark {
+// Micro-benchmark routines for use from ASV
+// Run PandasObjectIsNull() once over every object in *list*
+void Benchmark_PandasObjectIsNull(PyObject* list);
+} // namespace benchmark
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/common.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/common.cc
new file mode 100644
index 0000000000..6fe2ed4dae
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/common.cc
@@ -0,0 +1,203 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/common.h"
+#include <cstdlib>
+#include <mutex>
+#include <string>
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/python/helpers.h"
+namespace arrow {
+using internal::checked_cast;
+namespace py {
+static std::mutex memory_pool_mutex;
+static MemoryPool* default_python_pool = nullptr;
+// Install `pool` as the pool returned by get_memory_pool(); passing nullptr
+// makes get_memory_pool() fall back to default_memory_pool() again.
+void set_default_memory_pool(MemoryPool* pool) {
+  std::lock_guard<std::mutex> guard(memory_pool_mutex);
+  default_python_pool = pool;
+}
+// Return the pool installed with set_default_memory_pool(), or
+// default_memory_pool() when none is installed.
+MemoryPool* get_memory_pool() {
+  std::lock_guard<std::mutex> guard(memory_pool_mutex);
+  if (default_python_pool != nullptr) {
+    return default_python_pool;
+  }
+  return default_memory_pool();
+}
+// ----------------------------------------------------------------------
+// PythonErrorDetail
+namespace {
+const char kErrorDetailTypeId[] = "arrow::py::PythonErrorDetail";
+// Try to match the Python exception type with an appropriate Status code.
+// The checks run in order and the first match wins; exception types with no
+// match map to StatusCode::UnknownError.
+StatusCode MapPyError(PyObject* exc_type) {
+  if (PyErr_GivenExceptionMatches(exc_type, PyExc_MemoryError)) {
+    return StatusCode::OutOfMemory;
+  }
+  if (PyErr_GivenExceptionMatches(exc_type, PyExc_IndexError)) {
+    return StatusCode::IndexError;
+  }
+  if (PyErr_GivenExceptionMatches(exc_type, PyExc_KeyError)) {
+    return StatusCode::KeyError;
+  }
+  if (PyErr_GivenExceptionMatches(exc_type, PyExc_TypeError)) {
+    return StatusCode::TypeError;
+  }
+  if (PyErr_GivenExceptionMatches(exc_type, PyExc_ValueError) ||
+      PyErr_GivenExceptionMatches(exc_type, PyExc_OverflowError)) {
+    return StatusCode::Invalid;
+  }
+  if (PyErr_GivenExceptionMatches(exc_type, PyExc_EnvironmentError)) {
+    return StatusCode::IOError;
+  }
+  if (PyErr_GivenExceptionMatches(exc_type, PyExc_NotImplementedError)) {
+    return StatusCode::NotImplemented;
+  }
+  return StatusCode::UnknownError;
+}
+// PythonErrorDetail indicates a Python exception was raised. It keeps the
+// exception type, value and traceback so the error can be restored later.
+class PythonErrorDetail : public StatusDetail {
+ public:
+  const char* type_id() const override { return kErrorDetailTypeId; }
+  // Describe the error using the name of the exception type
+  std::string ToString() const override {
+    // This is simple enough not to need the GIL
+    const auto ty = reinterpret_cast<const PyTypeObject*>(exc_type_.obj());
+    // XXX Should we also print traceback?
+    return std::string("Python exception: ") + ty->tp_name;
+  }
+  // Set the stored exception as the current Python error. A new reference to
+  // each object is taken first, so this detail keeps its own references.
+  void RestorePyError() const {
+    Py_INCREF(exc_type_.obj());
+    Py_INCREF(exc_value_.obj());
+    Py_INCREF(exc_traceback_.obj());
+    PyErr_Restore(exc_type_.obj(), exc_value_.obj(), exc_traceback_.obj());
+  }
+  PyObject* exc_type() const { return exc_type_.obj(); }
+  PyObject* exc_value() const { return exc_value_.obj(); }
+  // Capture the current Python error (fetching it from the interpreter) into
+  // a new detail. Aborts via ARROW_CHECK if no Python error is set.
+  static std::shared_ptr<PythonErrorDetail> FromPyError() {
+    PyObject* exc_type = nullptr;
+    PyObject* exc_value = nullptr;
+    PyObject* exc_traceback = nullptr;
+    PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+    PyErr_NormalizeException(&exc_type, &exc_value, &exc_traceback);
+    ARROW_CHECK(exc_type)
+        << "PythonErrorDetail::FromPyError called without a Python error set";
+    DCHECK(PyType_Check(exc_type));
+    DCHECK(exc_value);  // Ensured by PyErr_NormalizeException, double-check
+    if (exc_traceback == nullptr) {
+      // Needed by PyErr_Restore()
+      Py_INCREF(Py_None);
+      exc_traceback = Py_None;
+    }
+    std::shared_ptr<PythonErrorDetail> detail(new PythonErrorDetail);
+    detail->exc_type_.reset(exc_type);
+    detail->exc_value_.reset(exc_value);
+    detail->exc_traceback_.reset(exc_traceback);
+    return detail;
+  }
+ protected:
+  // Instances are only created through FromPyError()
+  PythonErrorDetail() = default;
+  OwnedRefNoGIL exc_type_, exc_value_, exc_traceback_;
+};
+} // namespace
+// ----------------------------------------------------------------------
+// Python exception <-> Status
+Status ConvertPyError(StatusCode code) {
+ auto detail = PythonErrorDetail::FromPyError();
+ if (code == StatusCode::UnknownError) {
+ code = MapPyError(detail->exc_type());
+ }
+ std::string message;
+ RETURN_NOT_OK(internal::PyObject_StdStringStr(detail->exc_value(), &message));
+ return Status(code, message, detail);
+bool IsPyError(const Status& status) {
+ if (status.ok()) {
+ return false;
+ }
+ auto detail = status.detail();
+ bool result = detail != nullptr && detail->type_id() == kErrorDetailTypeId;
+ return result;
+void RestorePyError(const Status& status) {
+ ARROW_CHECK(IsPyError(status));
+ const auto& detail = checked_cast<const PythonErrorDetail&>(*status.detail());
+ detail.RestorePyError();
+// ----------------------------------------------------------------------
+// PyBuffer
+PyBuffer::PyBuffer() : Buffer(nullptr, 0) {}
+// Acquire a contiguous buffer view of `obj` and expose it through the Buffer
+// fields. Returns the Python error as an Invalid status when `obj` does not
+// provide such a buffer.
+Status PyBuffer::Init(PyObject* obj) {
+  if (PyObject_GetBuffer(obj, &py_buf_, PyBUF_ANY_CONTIGUOUS) != 0) {
+    return ConvertPyError(StatusCode::Invalid);
+  }
+  data_ = reinterpret_cast<const uint8_t*>(py_buf_.buf);
+  ARROW_CHECK_NE(data_, nullptr) << "Null pointer in Py_buffer";
+  size_ = py_buf_.len;
+  capacity_ = py_buf_.len;
+  is_mutable_ = !py_buf_.readonly;
+  return Status::OK();
+}
+Result<std::shared_ptr<Buffer>> PyBuffer::FromPyObject(PyObject* obj) {
+ PyBuffer* buf = new PyBuffer();
+ std::shared_ptr<Buffer> res(buf);
+ RETURN_NOT_OK(buf->Init(obj));
+ return res;
+// Release the Python buffer view, taking the GIL to do so. Nothing is
+// released when no view was acquired (data_ is still null).
+PyBuffer::~PyBuffer() {
+  if (data_ != nullptr) {
+    PyAcquireGIL lock;
+    PyBuffer_Release(&py_buf_);
+  }
+}
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/common.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/common.h
new file mode 100644
index 0000000000..24dcb130a2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/common.h
@@ -0,0 +1,360 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <memory>
+#include <utility>
+#include "arrow/buffer.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/visibility.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+namespace arrow {
+class MemoryPool;
+template <class T>
+class Result;
+namespace py {
+// Convert current Python error to a Status. The Python error state is cleared
+// and can be restored with RestorePyError().
+ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError);
+// Query whether the given Status is a Python error (as wrapped by ConvertPyError()).
+ARROW_PYTHON_EXPORT bool IsPyError(const Status& status);
+// Restore a Python error wrapped in a Status.
+ARROW_PYTHON_EXPORT void RestorePyError(const Status& status);
+// Catch a pending Python exception and return the corresponding Status.
+// If no exception is pending, Status::OK() is returned.
+inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) {
+  if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+    return Status::OK();
+  }
+  return ConvertPyError(code);
+}
+// For Cython, as you can't define template C++ functions in Cython, only use them.
+// This function can set a Python exception. It assumes that T has a (cheap)
+// default constructor.
+// Returns the value held by `result`, or a default-constructed T after handing
+// the error status to internal::check_status.
+template <class T>
+T GetResultValue(Result<T> result) {
+  if (ARROW_PREDICT_TRUE(result.ok())) {
+    return *std::move(result);
+  }
+  const int r = internal::check_status(result.status());  // takes the GIL
+  assert(r == -1);  // should have errored out
+  // Keep `r` used when NDEBUG compiles the assert away
+  static_cast<void>(r);
+  return {};
+}
+// A RAII-style helper that ensures the GIL is acquired inside a lexical block.
+class ARROW_PYTHON_EXPORT PyAcquireGIL {
+ public:
+  PyAcquireGIL() : acquired_gil_(false) { acquire(); }
+  ~PyAcquireGIL() { release(); }
+  // Not copyable: a copy would release the same GIL state a second time
+  PyAcquireGIL(const PyAcquireGIL&) = delete;
+  PyAcquireGIL& operator=(const PyAcquireGIL&) = delete;
+  // Acquire the GIL unless this object already holds it
+  void acquire() {
+    if (!acquired_gil_) {
+      state_ = PyGILState_Ensure();
+      acquired_gil_ = true;
+    }
+  }
+  // idempotent
+  void release() {
+    if (acquired_gil_) {
+      PyGILState_Release(state_);
+      acquired_gil_ = false;
+    }
+  }
+ private:
+  bool acquired_gil_;
+  PyGILState_STATE state_;
+};
+// A RAII-style helper that releases the GIL until the end of a lexical block
+class ARROW_PYTHON_EXPORT PyReleaseGIL {
+ public:
+  PyReleaseGIL() { saved_state_ = PyEval_SaveThread(); }
+  ~PyReleaseGIL() { PyEval_RestoreThread(saved_state_); }
+  // Not copyable: a copy would restore the same thread state a second time
+  PyReleaseGIL(const PyReleaseGIL&) = delete;
+  PyReleaseGIL& operator=(const PyReleaseGIL&) = delete;
+ private:
+  PyThreadState* saved_state_;
+};
+// A helper to call safely into the Python interpreter from arbitrary C++ code.
+// The GIL is acquired, and the current thread's error status is preserved.
+// Returns whatever `func` returns.
+template <typename Function>
+auto SafeCallIntoPython(Function&& func) -> decltype(func()) {
+  PyAcquireGIL lock;
+  PyObject* exc_type;
+  PyObject* exc_value;
+  PyObject* exc_traceback;
+  // Set the pending error (if any) aside; this hands us owned references
+  PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+  auto maybe_status = std::forward<Function>(func)();
+  // If the return Status is a "Python error", the current Python error status
+  // describes the error and shouldn't be clobbered.
+  if (!IsPyError(::arrow::internal::GenericToStatus(maybe_status)) &&
+      exc_type != NULLPTR) {
+    PyErr_Restore(exc_type, exc_value, exc_traceback);
+  } else {
+    // The saved error is not restored, so drop the references we were given
+    Py_XDECREF(exc_type);
+    Py_XDECREF(exc_value);
+    Py_XDECREF(exc_traceback);
+  }
+  return maybe_status;
+}
+// A RAII primitive that DECREFs the underlying PyObject* when it
+// goes out of scope.
+class ARROW_PYTHON_EXPORT OwnedRef {
+ public:
+  OwnedRef() : obj_(NULLPTR) {}
+  OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {}
+  explicit OwnedRef(PyObject* obj) : obj_(obj) {}
+  OwnedRef& operator=(OwnedRef&& other) {
+    // Go through reset() so the reference currently held is released instead
+    // of being overwritten (and leaked)
+    reset(other.detach());
+    return *this;
+  }
+  ~OwnedRef() { reset(); }
+  // Release the held reference (if any) and take ownership of `obj`
+  void reset(PyObject* obj) {
+    Py_XDECREF(obj_);
+    obj_ = obj;
+  }
+  void reset() { reset(NULLPTR); }
+  // Give up ownership: returns the held pointer without DECREF'ing it
+  PyObject* detach() {
+    PyObject* result = obj_;
+    obj_ = NULLPTR;
+    return result;
+  }
+  PyObject* obj() const { return obj_; }
+  PyObject** ref() { return &obj_; }
+  operator bool() const { return obj_ != NULLPTR; }
+ private:
+  PyObject* obj_;
+};
+// Same as OwnedRef, but ensures the GIL is taken when it goes out of scope.
+// This is for situations where the GIL is not always known to be held
+// (e.g. if it is released in the middle of a function for performance reasons)
+class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef {
+ public:
+  OwnedRefNoGIL() : OwnedRef() {}
+  OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {}
+  explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {}
+  ~OwnedRefNoGIL() {
+    // Release the reference here, under the GIL; the base destructor then
+    // finds nothing left to release
+    PyAcquireGIL lock;
+    reset();
+  }
+};
+template <typename Fn>
+struct BoundFunction;
+template <typename... Args>
+struct BoundFunction<void(PyObject*, Args...)> {
+  // We bind `cdef void fn(object, ...)` to get a `Status(...)`
+  // where the Status contains any Python error raised by `fn`
+  using Unbound = void(PyObject*, Args...);
+  using Bound = Status(Args...);
+  // Takes ownership of the reference to `bound_arg`
+  BoundFunction(Unbound* unbound, PyObject* bound_arg)
+      : unbound_(unbound), bound_arg_(bound_arg) {}
+  // Call the bound function with the GIL held
+  Status Invoke(Args... args) const {
+    PyAcquireGIL lock;
+    unbound_(bound_arg_.obj(), std::forward<Args>(args)...);
+    // Report a Python exception raised by the callee instead of dropping it
+    return CheckPyError();
+  }
+  Unbound* unbound_;
+  OwnedRefNoGIL bound_arg_;
+};
+template <typename Return, typename... Args>
+struct BoundFunction<Return(PyObject*, Args...)> {
+  // We bind `cdef Return fn(object, ...)` to get a `Result<Return>(...)`
+  // where the Result contains any Python error raised by `fn` or the
+  // return value from `fn`.
+  using Unbound = Return(PyObject*, Args...);
+  using Bound = Result<Return>(Args...);
+  // Takes ownership of the reference to `bound_arg`
+  BoundFunction(Unbound* unbound, PyObject* bound_arg)
+      : unbound_(unbound), bound_arg_(bound_arg) {}
+  // Call the bound function with the GIL held
+  Result<Return> Invoke(Args... args) const {
+    PyAcquireGIL lock;
+    Return ret = unbound_(bound_arg_.obj(), std::forward<Args>(args)...);
+    // Report a Python exception raised by the callee instead of dropping it
+    ARROW_RETURN_NOT_OK(CheckPyError());
+    return ret;
+  }
+  Unbound* unbound_;
+  OwnedRefNoGIL bound_arg_;
+};
+// Bind `bound_arg` as the first argument of `unbound` and return the result
+// as a std::function of type OutFn, which must equal the Bound type of the
+// matching BoundFunction specialization. A new reference to `bound_arg` is
+// taken and owned by the returned function.
+template <typename OutFn, typename Return, typename... Args>
+std::function<OutFn> BindFunction(Return (*unbound)(PyObject*, Args...),
+                                  PyObject* bound_arg) {
+  using Fn = BoundFunction<Return(PyObject*, Args...)>;
+  static_assert(std::is_same<typename Fn::Bound, OutFn>::value,
+                "requested bound function of unsupported type");
+  Py_XINCREF(bound_arg);
+  // Shared so that copies of the returned std::function share one binding
+  auto bound_fn = std::make_shared<Fn>(unbound, bound_arg);
+  return
+      [bound_fn](Args... args) { return bound_fn->Invoke(std::forward<Args>(args)...); };
+}
+// A temporary conversion of a Python object to a bytes area.
+// `bytes`/`size` point into memory owned by the parsed Python object (or by
+// the contiguous memoryview held in `ref`); they do not own it.
+struct PyBytesView {
+  const char* bytes = nullptr;
+  Py_ssize_t size = 0;
+  bool is_utf8 = false;
+  // Build a view from a str or bytes-like object, see ParseString()
+  static Result<PyBytesView> FromString(PyObject* obj, bool check_utf8 = false) {
+    PyBytesView self;
+    ARROW_RETURN_NOT_OK(self.ParseString(obj, check_utf8));
+    return std::move(self);
+  }
+  // Build a view from a str object, see ParseUnicode()
+  static Result<PyBytesView> FromUnicode(PyObject* obj) {
+    PyBytesView self;
+    ARROW_RETURN_NOT_OK(self.ParseUnicode(obj));
+    return std::move(self);
+  }
+  // Build a view from a bytes-like object, see ParseBinary()
+  static Result<PyBytesView> FromBinary(PyObject* obj) {
+    PyBytesView self;
+    ARROW_RETURN_NOT_OK(self.ParseBinary(obj));
+    return std::move(self);
+  }
+  // View the given Python object as string-like, i.e. str or (utf8) bytes.
+  // For bytes-like input, `is_utf8` is only computed when `check_utf8` is set.
+  Status ParseString(PyObject* obj, bool check_utf8 = false) {
+    if (PyUnicode_Check(obj)) {
+      return ParseUnicode(obj);
+    }
+    ARROW_RETURN_NOT_OK(ParseBinary(obj));
+    if (check_utf8) {
+      // Check the bytes are valid utf-8 by trying to decode them
+      OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size));
+      if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+        is_utf8 = true;
+      } else {
+        // A decoding failure only means "not utf-8", it is not an error here
+        PyErr_Clear();
+        is_utf8 = false;
+      }
+    }
+    return Status::OK();
+  }
+  // View the given Python object as unicode string
+  Status ParseUnicode(PyObject* obj) {
+    // The utf-8 representation is cached on the unicode object
+    bytes = PyUnicode_AsUTF8AndSize(obj, &size);
+    // A failed conversion returns NULL with a Python error set
+    ARROW_RETURN_NOT_OK(CheckPyError());
+    is_utf8 = true;
+    return Status::OK();
+  }
+  // View the given Python object as binary-like, i.e. bytes, bytearray or
+  // memoryview. Returns TypeError for any other object.
+  Status ParseBinary(PyObject* obj) {
+    if (PyBytes_Check(obj)) {
+      bytes = PyBytes_AS_STRING(obj);
+      size = PyBytes_GET_SIZE(obj);
+      is_utf8 = false;
+    } else if (PyByteArray_Check(obj)) {
+      bytes = PyByteArray_AS_STRING(obj);
+      size = PyByteArray_GET_SIZE(obj);
+      is_utf8 = false;
+    } else if (PyMemoryView_Check(obj)) {
+      PyObject* contiguous = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C');
+      // A failed call returns NULL with a Python error set
+      ARROW_RETURN_NOT_OK(CheckPyError());
+      // Own the contiguous memoryview: it backs `bytes` and would otherwise
+      // be leaked. It is released when this view is re-parsed or destroyed.
+      ref.reset(contiguous);
+      Py_buffer* buffer = PyMemoryView_GET_BUFFER(contiguous);
+      bytes = reinterpret_cast<const char*>(buffer->buf);
+      size = buffer->len;
+      is_utf8 = false;
+    } else {
+      return Status::TypeError("Expected bytes, got a '", Py_TYPE(obj)->tp_name,
+                               "' object");
+    }
+    return Status::OK();
+  }
+ protected:
+  // Keeps alive the contiguous memoryview created by ParseBinary(), if any
+  OwnedRef ref;
+};
+// A Buffer viewing the memory of a Python object that exposes a buffer.
+class ARROW_PYTHON_EXPORT PyBuffer : public Buffer {
+ public:
+  /// While memoryview objects support multi-dimensional buffers, PyBuffer only supports
+  /// one-dimensional byte buffers.
+  ~PyBuffer();
+  /// Create a Buffer viewing the memory exposed by `obj`
+  static Result<std::shared_ptr<Buffer>> FromPyObject(PyObject* obj);
+ private:
+  // Instances are only created through FromPyObject()
+  PyBuffer();
+  Status Init(PyObject*);
+  Py_buffer py_buf_;
+};
+// Return the common PyArrow memory pool
+ARROW_PYTHON_EXPORT void set_default_memory_pool(MemoryPool* pool);
+ARROW_PYTHON_EXPORT MemoryPool* get_memory_pool();
+// This is annoying: because C++11 does not allow implicit conversion of string
+// literals to non-const char*, we need to go through some gymnastics to use
+// PyObject_CallMethod without a lot of pain (its arguments are non-const
+// char*)
+// Forwards to PyObject_CallMethod and returns its result unchanged.
+template <typename... ArgTypes>
+static inline PyObject* cpp_PyObject_CallMethod(PyObject* obj, const char* method_name,
+                                                const char* argspec, ArgTypes... args) {
+  return PyObject_CallMethod(obj, const_cast<char*>(method_name),
+                             const_cast<char*>(argspec), args...);
+}
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/datetime.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/datetime.cc
new file mode 100644
index 0000000000..4b18918cbc
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/datetime.cc
@@ -0,0 +1,455 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/datetime.h"
+#include <algorithm>
+#include <chrono>
+#include <iomanip>
+#include "arrow/python/common.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/platform.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/value_parsing.h"
+namespace arrow {
+namespace py {
+namespace internal {
+namespace {
+// Same as Regex '([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$'.
+// GCC 4.9 doesn't support regex, so handcode until support for it
+// is dropped. On success *hour and *minute are two-character views into tz's storage.
+bool MatchFixedOffset(const std::string& tz, util::string_view* sign,
+ util::string_view* hour, util::string_view* minute) {
+ if (tz.size() < 5) {  // Shortest accepted form is "HH:MM".
+ return false;
+ }
+ const char* iter = tz.data();
+ if (*iter == '+' || *iter == '-') {  // NOTE(review): unlike the regex above, the sign is optional; *sign is left untouched when absent.
+ *sign = util::string_view(iter, 1);
+ iter++;
+ if (tz.size() < 6) {
+ return false;
+ }
+ }
+ if ((((*iter == '0' || *iter == '1') && *(iter + 1) >= '0' && *(iter + 1) <= '9') ||
+ (*iter == '2' && *(iter + 1) >= '0' && *(iter + 1) <= '3'))) {  // Hours 00-23.
+ *hour = util::string_view(iter, 2);
+ iter += 2;
+ } else {
+ return false;
+ }
+ if (*iter != ':') {
+ return false;
+ }
+ iter++;
+ if (*iter >= '0' && *iter <= '5' && *(iter + 1) >= '0' && *(iter + 1) <= '9') {  // Minutes 00-59.
+ *minute = util::string_view(iter, 2);
+ iter += 2;
+ } else {
+ return false;
+ }
+ return iter == (tz.data() + tz.size());  // Reject trailing characters after the minutes.
+} // namespace
+PyDateTime_CAPI* datetime_api = nullptr;  // Assigned by InitDatetime().
+void InitDatetime() {  // Imports the datetime C API capsule into datetime_api; calls Py_FatalError if that fails.
+ PyAcquireGIL lock;
+ datetime_api =
+ reinterpret_cast<PyDateTime_CAPI*>(PyCapsule_Import(PyDateTime_CAPSULE_NAME, 0));
+ if (datetime_api == nullptr) {
+ Py_FatalError("Could not import datetime C API");
+ }
+// The following code is adapted from
+// https://github.com/numpy/numpy/blob/master/numpy/core/src/multiarray/datetime.c
+// Days per month, regular year and leap year (row 0 = regular, row 1 = leap; column = month - 1)
+static int64_t _days_per_month_table[2][12] = {
+ {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
+ {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
+// Returns true when `year` is a leap year: divisible by 4 and either not
+// divisible by 100 or divisible by 400.
+static bool is_leapyear(int64_t year) {
+  const bool divisible_by_4 = (year & 0x3) == 0;
+  if (!divisible_by_4) {
+    return false;
+  }
+  return (year % 100) != 0 || (year % 400) == 0;
+}
+// Calculates the days offset from the 1970 epoch. date_month and date_day are 1-based (1 is subtracted from each below).
+static int64_t get_days_from_date(int64_t date_year, int64_t date_month,
+ int64_t date_day) {
+ int64_t i, month;
+ int64_t year, days = 0;
+ int64_t* month_lengths;
+ year = date_year - 1970;
+ days = year * 365;  // Whole years as 365 days each; leap days are corrected for below.
+ // Adjust for leap years
+ if (days >= 0) {
+ // 1968 is the closest leap year before 1970.
+ // Exclude the current year, so add 1.
+ year += 1;
+ // Add one day for each 4 years
+ days += year / 4;
+ // 1900 is the closest previous year divisible by 100
+ year += 68;
+ // Subtract one day for each 100 years
+ days -= year / 100;
+ // 1600 is the closest previous year divisible by 400
+ year += 300;
+ // Add one day for each 400 years
+ days += year / 400;
+ } else {
+ // 1972 is the closest later year after 1970.
+ // Include the current year, so subtract 2.
+ year -= 2;
+ // Subtract one day for each 4 years
+ days += year / 4;
+ // 2000 is the closest later year divisible by 100
+ year -= 28;
+ // Add one day for each 100 years
+ days -= year / 100;
+ // 2000 is also the closest later year divisible by 400
+ // Subtract one day for each 400 years
+ days += year / 400;
+ }
+ month_lengths = _days_per_month_table[is_leapyear(date_year)];  // Row chosen by whether date_year is a leap year.
+ month = date_month - 1;
+ // Add the months
+ for (i = 0; i < month; ++i) {
+ days += month_lengths[i];
+ }
+ // Add the days
+ days += date_day - 1;
+ return days;
+// Modifies '*days_' to be the day offset within the year (zero-based),
+// and returns the year. On entry '*days_' is a day count relative to 1970.
+static int64_t days_to_yearsdays(int64_t* days_) {
+ const int64_t days_per_400years = (400 * 365 + 100 - 4 + 1);
+ // Adjust so it's relative to the year 2000 (divisible by 400)
+ int64_t days = (*days_) - (365 * 30 + 7);  // 30 years of 365 days plus 7 extra days between 1970 and 2000.
+ int64_t year;
+ // Break down the 400 year cycle to get the year and day within the year
+ if (days >= 0) {
+ year = 400 * (days / days_per_400years);
+ days = days % days_per_400years;
+ } else {
+ year = 400 * ((days - (days_per_400years - 1)) / days_per_400years);  // Floor division for negative day counts.
+ days = days % days_per_400years;
+ if (days < 0) {
+ days += days_per_400years;
+ }
+ }
+ // Work out the year/day within the 400 year cycle
+ if (days >= 366) {  // Past the first year of the cycle, which has 366 days.
+ year += 100 * ((days - 1) / (100 * 365 + 25 - 1));
+ days = (days - 1) % (100 * 365 + 25 - 1);
+ if (days >= 365) {
+ year += 4 * ((days + 1) / (4 * 365 + 1));
+ days = (days + 1) % (4 * 365 + 1);
+ if (days >= 366) {
+ year += (days - 1) / 365;
+ days = (days - 1) % 365;
+ }
+ }
+ }
+ *days_ = days;
+ return year + 2000;
+// Converts a day count into a calendar date: *date_year receives the year, and
+// *date_month / *date_day receive the 1-based month and day of the month.
+static void get_date_from_days(int64_t days, int64_t* date_year, int64_t* date_month,
+                               int64_t* date_day) {
+  // days_to_yearsdays() reduces `days` to the offset within the returned year.
+  *date_year = days_to_yearsdays(&days);
+  const int64_t* lengths = _days_per_month_table[is_leapyear(*date_year)];
+  int64_t month_index = 0;
+  // Skip whole months until the remaining offset falls inside one.
+  while (month_index < 12 && days >= lengths[month_index]) {
+    days -= lengths[month_index];
+    ++month_index;
+  }
+  if (month_index < 12) {
+    *date_month = month_index + 1;
+    *date_day = days + 1;
+  }
+  // month_index == 12 should never happen; the month and day outputs are then
+  // left untouched.
+}
+// Splits a time quantity into a higher-order part and a remainder, for example
+// total seconds into minutes and remaining seconds. After
+//   int64_t remaining = split_time(total, quotient, &next)
+// the identity
+//   total = next * quotient + remaining
+// holds. For a negative total, next is rounded down, so the returned remainder
+// is never negative.
+static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next) {
+  int64_t whole = total / quotient;
+  int64_t remainder = total % quotient;
+  if (remainder < 0) {
+    // Built-in division truncates toward zero; shift to floor semantics.
+    whole -= 1;
+    remainder += quotient;
+  }
+  *next = whole;
+  return remainder;
+}
+static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit,  // Splits val (in `unit`) into hour/minute/second/microsecond.
+ int64_t* hour, int64_t* minute, int64_t* second,
+ int64_t* microsecond) {
+ switch (unit) {
+ case TimeUnit::NANO:
+ if (val % 1000 != 0) {  // Microsecond is the finest output field, so sub-microsecond values are rejected.
+ return Status::Invalid("Value ", val, " has non-zero nanoseconds");
+ }
+ val /= 1000;
+ // fall through
+ case TimeUnit::MICRO:
+ *microsecond = split_time(val, 1000000LL, &val);
+ *second = split_time(val, 60, &val);
+ *minute = split_time(val, 60, hour);  // *hour receives everything above minutes and is not reduced modulo 24 here.
+ break;
+ case TimeUnit::MILLI:
+ *microsecond = split_time(val, 1000, &val) * 1000;
+ // fall through
+ case TimeUnit::SECOND:
+ *second = split_time(val, 60, &val);  // *microsecond is not written when unit is SECOND.
+ *minute = split_time(val, 60, hour);
+ break;
+ default:
+ break;
+ }
+ return Status::OK();
+// Converts a date value, counted in days or milliseconds relative to the 1970
+// epoch according to `unit`, into a calendar year, 1-based month and 1-based
+// day. The outputs are left untouched for any other unit. Always returns
+// Status::OK().
+static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year,
+                                        int64_t* month, int64_t* day) {
+  switch (unit) {
+    case DateUnit::MILLI: {
+      // Round toward negative infinity rather than toward zero, so that a
+      // pre-epoch value that is not a whole number of days resolves to the
+      // day containing it (e.g. -1 ms is 1969-12-31, not 1970-01-01).
+      constexpr int64_t kMillisPerDay = 86400000LL;
+      int64_t whole_days = val / kMillisPerDay;
+      if (val % kMillisPerDay < 0) {
+        --whole_days;
+      }
+      get_date_from_days(whole_days, year, month, day);
+      break;
+    }
+    case DateUnit::DAY:
+      get_date_from_days(val, year, month, day);
+      break;
+    default:
+      break;
+  }
+  return Status::OK();
+}
+// Breaks `val` (expressed in `unit`) into clock fields and stores the result of
+// PyTime_FromTime() for those fields in *out. Returns the conversion error if
+// the value cannot be split.
+// NOTE(review): *out is not checked for NULL after PyTime_FromTime().
+Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) {
+  int64_t hour = 0;
+  int64_t minute = 0;
+  int64_t second = 0;
+  int64_t microsecond = 0;
+  RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond));
+  const auto hour32 = static_cast<int32_t>(hour);
+  const auto minute32 = static_cast<int32_t>(minute);
+  const auto second32 = static_cast<int32_t>(second);
+  const auto microsecond32 = static_cast<int32_t>(microsecond);
+  *out = PyTime_FromTime(hour32, minute32, second32, microsecond32);
+  return Status::OK();
+}
+// Converts `val` (expressed in `unit`) into year / month / day and stores the
+// result of PyDate_FromDate() for those fields in *out.
+// NOTE(review): *out is not checked for NULL after PyDate_FromDate().
+Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) {
+  int64_t year = 0;
+  int64_t month = 0;
+  int64_t day = 0;
+  RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day));
+  const auto year32 = static_cast<int32_t>(year);
+  const auto month32 = static_cast<int32_t>(month);
+  const auto day32 = static_cast<int32_t>(day);
+  *out = PyDate_FromDate(year32, month32, day32);
+  return Status::OK();
+}
+Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) {  // Stores the result of PyDateTime_FromDateAndTime() for val (in `unit`) in *out.
+ int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
+ RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond));
+ int64_t total_days = 0;
+ hour = split_time(hour, 24, &total_days);  // PyTime_convert_int leaves whole days inside `hour`; split them off here.
+ int64_t year = 0, month = 0, day = 0;
+ get_date_from_days(total_days, &year, &month, &day);
+ *out = PyDateTime_FromDateAndTime(
+ static_cast<int32_t>(year), static_cast<int32_t>(month), static_cast<int32_t>(day),
+ static_cast<int32_t>(hour), static_cast<int32_t>(minute),
+ static_cast<int32_t>(second), static_cast<int32_t>(microsecond));  // NOTE(review): *out is not checked for NULL.
+ return Status::OK();
+int64_t PyDate_to_days(PyDateTime_Date* pydate) {  // Day offset from the 1970 epoch for pydate's year/month/day fields.
+ return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate),
+ PyDateTime_GET_DAY(pydate));
+Result<int64_t> PyDateTime_utcoffset_s(PyObject* obj) {
+  // Calculates the offset from UTC in seconds by calling obj.utcoffset().
+  // Supports only PyDateTime_DateTime and PyDateTime_Time objects.
+  OwnedRef pyoffset(PyObject_CallMethod(obj, "utcoffset", NULL));
+  PyObject* offset = pyoffset.obj();
+  // A NULL result or a None result both yield an offset of zero.
+  if (offset == nullptr || offset == Py_None) {
+    return 0;
+  }
+  return internal::PyDelta_to_s(reinterpret_cast<PyDateTime_Delta*>(offset));
+}
+// Formats the result of pytzinfo.utcoffset(None) as "+HH:MM" or "-HH:MM".
+// Returns Status::Invalid when the call does not yield a datetime.timedelta or
+// when the offset is not a whole number of minutes.
+Result<std::string> PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo) {
+  // attempt to convert timezone offset objects to "+/-{hh}:{mm}" format
+  OwnedRef pydelta_object(PyObject_CallMethod(pytzinfo, "utcoffset", "O", Py_None));
+  // Guard against a NULL result before PyDelta_Check(), which inspects the
+  // object's type and must not be handed a null pointer.
+  if (pydelta_object.obj() == nullptr || !PyDelta_Check(pydelta_object.obj())) {
+    return Status::Invalid(
+        "Object returned by tzinfo.utcoffset(None) is not an instance of "
+        "datetime.timedelta");
+  }
+  auto pydelta = reinterpret_cast<PyDateTime_Delta*>(pydelta_object.obj());
+
+  // retrieve the offset as seconds
+  auto total_seconds = internal::PyDelta_to_s(pydelta);
+
+  // determine whether the offset is positive or negative
+  auto sign = (total_seconds < 0) ? "-" : "+";
+  // Negate by hand: an unqualified abs() can bind to the int overload and
+  // narrow the 64-bit value.
+  if (total_seconds < 0) {
+    total_seconds = -total_seconds;
+  }
+
+  // calculate offset components
+  int64_t hours, minutes, seconds;
+  seconds = split_time(total_seconds, 60, &minutes);
+  minutes = split_time(minutes, 60, &hours);
+  if (seconds > 0) {
+    // check there are no remaining seconds
+    return Status::Invalid("Offset must represent whole number of minutes");
+  }
+
+  // construct the timezone string
+  std::stringstream stream;
+  stream << sign << std::setfill('0') << std::setw(2) << hours << ":" << std::setfill('0')
+         << std::setw(2) << minutes;
+  return stream.str();
+}
+// Converted from python. See https://github.com/apache/arrow/pull/7604
+// for details. Builds pytz.FixedOffset(minutes) for "HH:MM"-style strings and pytz.timezone(tz) otherwise.
+Result<PyObject*> StringToTzinfo(const std::string& tz) {
+ util::string_view sign_str, hour_str, minute_str;
+ OwnedRef pytz;
+ RETURN_NOT_OK(internal::ImportModule("pytz", &pytz));
+ if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
+ int sign = -1;  // NOTE(review): any sign_str other than "+" (including an empty one) yields a negative offset.
+ if (sign_str == "+") {
+ sign = 1;
+ }
+ OwnedRef fixed_offset;
+ RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset));
+ uint32_t minutes, hours;
+ if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) ||
+ !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(),
+ &minutes)) {
+ return Status::Invalid("Invalid timezone: ", tz);
+ }
+ OwnedRef total_minutes(PyLong_FromLong(
+ sign * ((static_cast<int>(hours) * 60) + static_cast<int>(minutes))));
+ auto tzinfo =
+ PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL);  // NOTE(review): returned without a NULL check.
+ return tzinfo;
+ }
+ OwnedRef timezone;
+ RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone));
+ OwnedRef py_tz_string(
+ PyUnicode_FromStringAndSize(tz.c_str(), static_cast<Py_ssize_t>(tz.size())));
+ auto tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL);  // NOTE(review): returned without a NULL check.
+ return tzinfo;
+// Produces a string for a tzinfo object, trying in order:
+//   1. for datetime.timezone / pytz._FixedOffset instances: "UTC" when
+//      tzname(None) says so, otherwise the "+HH:MM" / "-HH:MM" offset;
+//   2. the object's `zone` attribute, when present;
+//   3. the result of tzname(None), when it is a str;
+//   4. the offset string derived from utcoffset(None).
+// Returns a TypeError status when the object fails PyTZInfo_Check, and
+// propagates import and string-conversion failures.
+Result<std::string> TzinfoToString(PyObject* tzinfo) {
+  OwnedRef module_pytz;        // import pytz
+  OwnedRef module_datetime;    // import datetime
+  OwnedRef class_timezone;     // from datetime import timezone
+  OwnedRef class_fixedoffset;  // from pytz import _FixedOffset
+
+  // import necessary modules
+  RETURN_NOT_OK(internal::ImportModule("pytz", &module_pytz));
+  RETURN_NOT_OK(internal::ImportModule("datetime", &module_datetime));
+  // import necessary classes; the results must be checked because both
+  // classes are passed to PyObject_IsInstance() below
+  RETURN_NOT_OK(
+      internal::ImportFromModule(module_pytz.obj(), "_FixedOffset", &class_fixedoffset));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(module_datetime.obj(), "timezone", &class_timezone));
+
+  // check that it's a valid tzinfo object
+  if (!PyTZInfo_Check(tzinfo)) {
+    return Status::TypeError("Not an instance of datetime.tzinfo");
+  }
+
+  // if tzinfo is an instance of pytz._FixedOffset or datetime.timezone return the
+  // HH:MM offset string representation
+  if (PyObject_IsInstance(tzinfo, class_timezone.obj()) ||
+      PyObject_IsInstance(tzinfo, class_fixedoffset.obj())) {
+    // still recognize datetime.timezone.utc as UTC (instead of +00:00)
+    OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None));
+    if (PyUnicode_Check(tzname_object.obj())) {
+      std::string result;
+      RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result));
+      if (result == "UTC") {
+        return result;
+      }
+    }
+    return PyTZInfo_utcoffset_hhmm(tzinfo);
+  }
+
+  // try to look up zone attribute
+  if (PyObject_HasAttrString(tzinfo, "zone")) {
+    OwnedRef zone(PyObject_GetAttrString(tzinfo, "zone"));
+    std::string result;
+    RETURN_NOT_OK(internal::PyUnicode_AsStdString(zone.obj(), &result));
+    return result;
+  }
+
+  // attempt to call tzinfo.tzname(None)
+  OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None));
+  if (PyUnicode_Check(tzname_object.obj())) {
+    std::string result;
+    RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result));
+    return result;
+  }
+
+  // fall back to HH:MM offset string representation based on tzinfo.utcoffset(None)
+  return PyTZInfo_utcoffset_hhmm(tzinfo);
+}
+} // namespace internal
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/datetime.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/datetime.h
new file mode 100644
index 0000000000..0072cdda4c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/datetime.h
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <algorithm>
+#include <chrono>
+#include "arrow/python/platform.h"
+#include "arrow/python/visibility.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+// By default, PyDateTimeAPI is a *static* variable. This forces
+// PyDateTime_IMPORT to be called in every C/C++ module using the
+// C datetime API. This is error-prone and potentially costly.
+// Instead, we redefine PyDateTimeAPI to point to a global variable,
+// which is initialized once by calling InitDatetime().
+#define PyDateTimeAPI ::arrow::py::internal::datetime_api
+namespace arrow {
+namespace py {
+namespace internal {
+extern PyDateTime_CAPI* datetime_api;
+void InitDatetime();
+inline int64_t PyTime_to_us(PyObject* pytime) {  // Hour/minute/second/microsecond fields of pytime combined into microseconds.
+ return (PyDateTime_TIME_GET_HOUR(pytime) * 3600000000LL +
+ PyDateTime_TIME_GET_MINUTE(pytime) * 60000000LL +
+ PyDateTime_TIME_GET_SECOND(pytime) * 1000000LL +
+ PyDateTime_TIME_GET_MICROSECOND(pytime));
+inline int64_t PyTime_to_s(PyObject* pytime) { return PyTime_to_us(pytime) / 1000000; }  // Integer division drops sub-second digits.
+inline int64_t PyTime_to_ms(PyObject* pytime) { return PyTime_to_us(pytime) / 1000; }  // Integer division drops sub-millisecond digits.
+inline int64_t PyTime_to_ns(PyObject* pytime) { return PyTime_to_us(pytime) * 1000; }
+Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out);
+Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out);
+// WARNING: This function returns a naive datetime.
+Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out);
+// This declaration must be the same as in filesystem/filesystem.h
+using TimePoint =
+ std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
+int64_t PyDate_to_days(PyDateTime_Date* pydate);
+inline int64_t PyDate_to_s(PyDateTime_Date* pydate) {  // PyDate_to_days() scaled to seconds.
+ return PyDate_to_days(pydate) * 86400LL;
+inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {  // PyDate_to_days() scaled to milliseconds.
+ return PyDate_to_days(pydate) * 86400000LL;
+inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) {  // Date part via PyDate_to_s() plus hour/minute/second fields; no tzinfo is consulted here.
+ return (PyDate_to_s(reinterpret_cast<PyDateTime_Date*>(pydatetime)) +
+ PyDateTime_DATE_GET_HOUR(pydatetime) * 3600LL +
+ PyDateTime_DATE_GET_MINUTE(pydatetime) * 60LL +
+ PyDateTime_DATE_GET_SECOND(pydatetime));
+inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) {
+ return (PyDateTime_to_s(pydatetime) * 1000LL +
+ PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000);  // Sub-millisecond digits are dropped.
+inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) {
+ return (PyDateTime_to_s(pydatetime) * 1000000LL +
+ PyDateTime_DATE_GET_MICROSECOND(pydatetime));
+inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) {
+ return PyDateTime_to_us(pydatetime) * 1000LL;
+inline TimePoint PyDateTime_to_TimePoint(PyDateTime_DateTime* pydatetime) {  // TimePoint counts nanoseconds (see the alias above).
+ return TimePoint(TimePoint::duration(PyDateTime_to_ns(pydatetime)));
+inline int64_t TimePoint_to_ns(TimePoint val) { return val.time_since_epoch().count(); }
+inline TimePoint TimePoint_from_s(double val) {  // Scales seconds to nanoseconds; the cast truncates the fractional nanosecond.
+ return TimePoint(TimePoint::duration(static_cast<int64_t>(1e9 * val)));
+inline TimePoint TimePoint_from_ns(int64_t val) {
+ return TimePoint(TimePoint::duration(val));
+inline int64_t PyDelta_to_s(PyDateTime_Delta* pytimedelta) {  // Days and seconds fields combined; the microseconds field is ignored.
+ return (PyDateTime_DELTA_GET_DAYS(pytimedelta) * 86400LL +
+ PyDateTime_DELTA_GET_SECONDS(pytimedelta));
+inline int64_t PyDelta_to_ms(PyDateTime_Delta* pytimedelta) {
+ return (PyDelta_to_s(pytimedelta) * 1000LL +
+ PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta) / 1000);  // Sub-millisecond digits are dropped.
+inline int64_t PyDelta_to_us(PyDateTime_Delta* pytimedelta) {
+ return (PyDelta_to_s(pytimedelta) * 1000000LL +
+ PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta));
+inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) {
+ return PyDelta_to_us(pytimedelta) * 1000LL;
+Result<int64_t> PyDateTime_utcoffset_s(PyObject* pydatetime);
+/// \brief Convert a time zone name into a time zone object.
+/// Supported input strings are:
+/// * As used in the Olson time zone database (the "tz database" or
+/// "tzdata"), such as "America/New_York"
+/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+/// GIL must be held when calling this method.
+Result<PyObject*> StringToTzinfo(const std::string& tz);
+/// \brief Convert a time zone object to a string representation.
+/// The output strings are:
+/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+/// if the input object is either an instance of pytz._FixedOffset or
+/// datetime.timezone
+/// * The timezone's name if the input object's tzname() method returns with a
+/// non-empty timezone name such as "UTC" or "America/New_York"
+/// GIL must be held when calling this method.
+Result<std::string> TzinfoToString(PyObject* pytzinfo);
+} // namespace internal
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/decimal.cc
new file mode 100644
index 0000000000..0c00fcfaa8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/decimal.cc
@@ -0,0 +1,246 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include <algorithm>
+#include <limits>
+#include "arrow/python/common.h"
+#include "arrow/python/decimal.h"
+#include "arrow/python/helpers.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+namespace arrow {
+namespace py {
+namespace internal {
+// Imports the `decimal` module and stores its `Decimal` attribute in
+// *decimal_type. Any import failure is returned to the caller.
+Status ImportDecimalType(OwnedRef* decimal_type) {
+  OwnedRef decimal_module;
+  RETURN_NOT_OK(ImportModule("decimal", &decimal_module));
+  PyObject* module_object = decimal_module.obj();
+  RETURN_NOT_OK(ImportFromModule(module_object, "Decimal", decimal_type));
+  return Status::OK();
+}
+Status PythonDecimalToString(PyObject* python_decimal, std::string* out) {  // Thin wrapper over PyObject_StdStringStr().
+ // Call Python's str(decimal_object)
+ return PyObject_StdStringStr(python_decimal, out);
+// \brief Infer the precision and scale of a Python decimal.Decimal instance
+// \param[in] python_decimal An instance of decimal.Decimal
+// \param[out] precision The value of the inferred precision
+// \param[out] scale The value of the inferred scale (never negative)
+// \return The status of the operation
+static Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision,
+ int32_t* scale) {
+ DCHECK_NE(python_decimal, NULLPTR);
+ DCHECK_NE(precision, NULLPTR);
+ // TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a DCHECK
+ OwnedRef as_tuple(PyObject_CallMethod(python_decimal, const_cast<char*>("as_tuple"),
+ const_cast<char*>("")));
+ DCHECK(PyTuple_Check(as_tuple.obj()));
+ OwnedRef digits(PyObject_GetAttrString(as_tuple.obj(), "digits"));
+ DCHECK(PyTuple_Check(digits.obj()));
+ const auto num_digits = static_cast<int32_t>(PyTuple_Size(digits.obj()));
+ OwnedRef py_exponent(PyObject_GetAttrString(as_tuple.obj(), "exponent"));
+ DCHECK(IsPyInteger(py_exponent.obj()));
+ const auto exponent = static_cast<int32_t>(PyLong_AsLong(py_exponent.obj()));
+ if (exponent < 0) {
+ // If -exponent > num_digits, we have a number with leading zeros
+ // such as 0.01234. Ensure we have enough precision for leading zeros
+ // (which are not included in num_digits).
+ *precision = std::max(num_digits, -exponent);
+ *scale = -exponent;
+ } else {
+ // Trailing zeros are not included in num_digits, need to add to precision.
+ // Note we don't generate negative scales as they are poorly supported
+ // in non-Arrow systems.
+ *precision = num_digits + exponent;
+ *scale = 0;
+ }
+ return Status::OK();
+PyObject* DecimalFromString(PyObject* decimal_constructor,  // Calls decimal_constructor with decimal_string as its single argument.
+ const std::string& decimal_string) {
+ DCHECK_NE(decimal_constructor, nullptr);
+ auto string_size = decimal_string.size();
+ DCHECK_GT(string_size, 0);
+ auto string_bytes = decimal_string.c_str();
+ DCHECK_NE(string_bytes, nullptr);
+ return PyObject_CallFunction(decimal_constructor, const_cast<char*>("s#"), string_bytes,
+ static_cast<Py_ssize_t>(string_size));  // "s#" passes the bytes together with an explicit length.
+namespace {
+template <typename ArrowDecimal>
+Status DecimalFromStdString(const std::string& decimal_string,  // Parses decimal_string into *out, rescaled to arrow_type's scale.
+ const DecimalType& arrow_type, ArrowDecimal* out) {
+ int32_t inferred_precision;
+ int32_t inferred_scale;
+ RETURN_NOT_OK(ArrowDecimal::FromString(decimal_string, out, &inferred_precision,
+ &inferred_scale));
+ const int32_t precision = arrow_type.precision();
+ const int32_t scale = arrow_type.scale();
+ if (scale != inferred_scale) {
+ ARROW_ASSIGN_OR_RAISE(*out, out->Rescale(inferred_scale, scale));
+ }
+ auto inferred_scale_delta = inferred_scale - scale;
+ if (ARROW_PREDICT_FALSE((inferred_precision - inferred_scale_delta) > precision)) {  // Digits needed after rescaling exceed the target precision.
+ return Status::Invalid(
+ "Decimal type with precision ", inferred_precision,
+ " does not fit into precision inferred from first array element: ", precision);
+ }
+ return Status::OK();
+template <typename ArrowDecimal>
+Status InternalDecimalFromPythonDecimal(PyObject* python_decimal,  // Converts via the object's string form: PythonDecimalToString() then DecimalFromStdString().
+ const DecimalType& arrow_type,
+ ArrowDecimal* out) {
+ DCHECK_NE(python_decimal, NULLPTR);
+ std::string string;
+ RETURN_NOT_OK(PythonDecimalToString(python_decimal, &string));
+ return DecimalFromStdString(string, arrow_type, out);
+// Converts a Python int or decimal.Decimal into *out using arrow_type's
+// precision and scale. Any other object type yields a TypeError status.
+template <typename ArrowDecimal>
+Status InternalDecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type,
+                                   ArrowDecimal* out) {
+  if (IsPyInteger(obj)) {
+    // TODO: add a fast path for small-ish ints
+    std::string int_repr;
+    RETURN_NOT_OK(PyObject_StdStringStr(obj, &int_repr));
+    return DecimalFromStdString(int_repr, arrow_type, out);
+  }
+  if (PyDecimal_Check(obj)) {
+    return InternalDecimalFromPythonDecimal<ArrowDecimal>(obj, arrow_type, out);
+  }
+  return Status::TypeError("int or Decimal object expected, got ",
+                           Py_TYPE(obj)->tp_name);
+}
+} // namespace
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,  // Decimal128 entry point; forwards to the templated helper.
+ Decimal128* out) {
+ return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out);
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type,  // Decimal128 entry point; forwards to the templated helper.
+ Decimal128* out) {
+ return InternalDecimalFromPyObject(obj, arrow_type, out);
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,  // Decimal256 entry point; forwards to the templated helper.
+ Decimal256* out) {
+ return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out);
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type,  // Decimal256 entry point; forwards to the templated helper.
+ Decimal256* out) {
+ return InternalDecimalFromPyObject(obj, arrow_type, out);
+bool PyDecimal_Check(PyObject* obj) {  // True when obj's type is decimal.Decimal or a subtype of it.
+ static OwnedRef decimal_type;  // Imported on first use and kept for later calls.
+ if (!decimal_type.obj()) {
+ ARROW_CHECK_OK(ImportDecimalType(&decimal_type));
+ DCHECK(PyType_Check(decimal_type.obj()));
+ }
+ // PyObject_IsInstance() is slower as it has to check for virtual subclasses
+ const int result =
+ PyType_IsSubtype(Py_TYPE(obj), reinterpret_cast<PyTypeObject*>(decimal_type.obj()));
+ ARROW_CHECK_NE(result, -1) << " error during PyType_IsSubtype check";
+ return result == 1;
+bool PyDecimal_ISNAN(PyObject* obj) {  // True when obj.is_nan() returns a truthy value.
+ DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal";
+ OwnedRef is_nan(
+ PyObject_CallMethod(obj, const_cast<char*>("is_nan"), const_cast<char*>("")));
+ return PyObject_IsTrue(is_nan.obj()) == 1;  // NOTE(review): is_nan is not checked for NULL before this call.
+// Default constructor: delegates to the (precision, scale) constructor with
+// both fields set to the int32_t minimum. Update() compares the precision
+// against that value to detect that no precision has been recorded yet.
+DecimalMetadata::DecimalMetadata()
+    : DecimalMetadata(std::numeric_limits<int32_t>::min(),
+                      std::numeric_limits<int32_t>::min()) {}
+DecimalMetadata::DecimalMetadata(int32_t precision, int32_t scale)  // Stores the given precision and scale as-is.
+ : precision_(precision), scale_(scale) {}
+Status DecimalMetadata::Update(int32_t suggested_precision, int32_t suggested_scale) {  // Widens the stored scale/precision to also cover the suggested pair.
+ const int32_t current_scale = scale_;
+ scale_ = std::max(current_scale, suggested_scale);
+ const int32_t current_precision = precision_;
+ if (current_precision == std::numeric_limits<int32_t>::min()) {  // int32 minimum marks "no precision recorded yet".
+ precision_ = suggested_precision;
+ } else {
+ auto num_digits = std::max(current_precision - current_scale,
+ suggested_precision - suggested_scale);  // Larger count of digits left of the decimal point.
+ precision_ = std::max(num_digits + scale_, current_precision);
+ }
+ return Status::OK();
+Status DecimalMetadata::Update(PyObject* object) {  // Infers precision/scale from a Python Decimal and folds them in.
+ bool is_decimal = PyDecimal_Check(object);
+ if (ARROW_PREDICT_FALSE(!is_decimal || PyDecimal_ISNAN(object))) {  // Non-Decimal objects and NaN leave the metadata unchanged.
+ return Status::OK();
+ }
+ int32_t precision = 0;
+ int32_t scale = 0;
+ RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale));
+ return Update(precision, scale);
+} // namespace internal
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/decimal.h
new file mode 100644
index 0000000000..1187037aed
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/decimal.h
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <string>
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+namespace arrow {
+class Decimal128;
+class Decimal256;
+namespace py {
+class OwnedRef;
+// Python Decimal support
+namespace internal {
+// \brief Import the Python Decimal type
+Status ImportDecimalType(OwnedRef* decimal_type);
+// \brief Convert a Python Decimal object to a C++ string
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[out] out The string representation of the Python Decimal instance
+// \return The status of the operation
+Status PythonDecimalToString(PyObject* python_decimal, std::string* out);
+// \brief Convert a C++ std::string to a Python Decimal instance
+// \param[in] decimal_constructor The decimal type object
+// \param[in] decimal_string A decimal string
+// \return An instance of decimal.Decimal
+PyObject* DecimalFromString(PyObject* decimal_constructor,
+ const std::string& decimal_string);
+// \brief Convert a Python decimal to an Arrow Decimal128 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+ Decimal128* out);
+// \brief Convert a Python object to an Arrow Decimal128 object
+// \param[in] obj A Python int or decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal128* out);
+// \brief Convert a Python decimal to an Arrow Decimal256 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal256
+// \return The status of the operation
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+ Decimal256* out);
+// \brief Convert a Python object to an Arrow Decimal256 object
+// \param[in] obj A Python int or decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal256
+// \return The status of the operation
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal256* out);
+// \brief Check whether obj is an instance of Decimal
+bool PyDecimal_Check(PyObject* obj);
+// \brief Check whether obj is nan. This function will abort the program if the argument
+// is not a Decimal instance
+bool PyDecimal_ISNAN(PyObject* obj);
+// \brief Helper class to track and update the precision and scale of a decimal
+class ARROW_PYTHON_EXPORT DecimalMetadata {
+ public:
+ DecimalMetadata();
+ DecimalMetadata(int32_t precision, int32_t scale);
+ // \brief Adjust the precision and scale of a decimal type given a new precision and a
+ // new scale \param[in] suggested_precision A candidate precision \param[in]
+ // suggested_scale A candidate scale \return The status of the operation
+ Status Update(int32_t suggested_precision, int32_t suggested_scale);
+ // \brief A convenient interface for updating the precision and scale based on a Python
+ // Decimal object \param object A Python Decimal object \return The status of the
+ // operation
+ Status Update(PyObject* object);
+ int32_t precision() const { return precision_; }  // Precision tracked so far.
+ int32_t scale() const { return scale_; }  // Scale tracked so far.
+ private:
+ int32_t precision_;
+ int32_t scale_;
+} // namespace internal
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/deserialize.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/deserialize.cc
new file mode 100644
index 0000000000..961a1686e0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/deserialize.cc
@@ -0,0 +1,495 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/deserialize.h"
+#include "arrow/python/numpy_interop.h"
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include <numpy/arrayobject.h>
+#include <numpy/arrayscalars.h>
+#include "arrow/array.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/options.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/util.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/table.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/value_parsing.h"
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/serialize.h"
+namespace arrow {
+using internal::checked_cast;
+using internal::ParseValue;
+namespace py {
+Status CallDeserializeCallback(PyObject* context, PyObject* value,
+ PyObject** deserialized_object);  // forward declaration; DeserializeDict calls it for dicts holding a "_pytype_" key
+Status DeserializeTuple(PyObject* context, const Array& array, int64_t start_idx,
+ int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs,
+ PyObject** out);  // forward declaration; defined later in this file
+Status DeserializeList(PyObject* context, const Array& array, int64_t start_idx,
+ int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs,
+ PyObject** out);  // forward declaration; defined later in this file
+Status DeserializeSet(PyObject* context, const Array& array, int64_t start_idx,
+ int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs,
+ PyObject** out);  // forward declaration; defined later in this file
+// Rebuild a Python dict from a two-field struct array: field 0 holds the keys and
+// field 1 the values, each deserialized as a list over [start_idx, stop_idx).
+// If the resulting dict contains the key "_pytype_", it is handed to
+// CallDeserializeCallback and the callback's result is returned instead.
+// \param[out] out Receives the dict (or the callback result)
+// \return Status::OK() on success, otherwise the converted Python error
+Status DeserializeDict(PyObject* context, const Array& array, int64_t start_idx,
+                       int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs,
+                       PyObject** out) {
+  const auto& data = checked_cast<const StructArray&>(array);
+  OwnedRef keys, vals;
+  OwnedRef result(PyDict_New());
+  // PyDict_New() returns NULL with a Python error set on failure.
+  RETURN_NOT_OK(CheckPyError());
+  DCHECK_EQ(2, data.num_fields());
+  RETURN_NOT_OK(DeserializeList(context, *data.field(0), start_idx, stop_idx, base, blobs,
+                                keys.ref()));
+  RETURN_NOT_OK(DeserializeList(context, *data.field(1), start_idx, stop_idx, base, blobs,
+                                vals.ref()));
+  for (int64_t i = start_idx; i < stop_idx; ++i) {
+    // PyDict_SetItem behaves differently from PyList_SetItem and PyTuple_SetItem.
+    // The latter two steal references whereas PyDict_SetItem does not. So we need
+    // to make sure the reference count is decremented by letting the OwnedRef
+    // go out of scope at the end.
+    int ret = PyDict_SetItem(result.obj(), PyList_GET_ITEM(keys.obj(), i - start_idx),
+                             PyList_GET_ITEM(vals.obj(), i - start_idx));
+    if (ret != 0) {
+      return ConvertPyError();
+    }
+  }
+  static PyObject* py_type = PyUnicode_FromString("_pytype_");
+  // PyDict_Contains returns 1 (found), 0 (absent) or -1 (error, e.g. unhashable key);
+  // the error case must not be mistaken for "found".
+  const int has_py_type = PyDict_Contains(result.obj(), py_type);
+  if (has_py_type < 0) {
+    return ConvertPyError();
+  }
+  if (has_py_type) {
+    RETURN_NOT_OK(CallDeserializeCallback(context, result.obj(), out));
+  } else {
+    *out = result.detach();
+  }
+  return Status::OK();
+}
+// Convert the tensor stored at `blobs.ndarrays[index]` to a NumPy array via
+// TensorToNdarray and mark it read-only by setting `flags.writeable` to False.
+// \param[in] index Position of the ndarray inside `blobs.ndarrays`
+// \param[in] base Forwarded to TensorToNdarray
+// \param[out] out Receives the array object
+// \return Invalid if `index` is out of range, otherwise the conversion status
+Status DeserializeArray(int32_t index, PyObject* base, const SerializedPyObject& blobs,
+                        PyObject** out) {
+  // `index` comes from serialized data; reject values that would read out of bounds.
+  if (index < 0 || static_cast<size_t>(index) >= blobs.ndarrays.size()) {
+    return Status::Invalid("Ndarray index out of range in serialized object");
+  }
+  RETURN_NOT_OK(py::TensorToNdarray(blobs.ndarrays[index], base, out));
+  // Mark the array as immutable
+  OwnedRef flags(PyObject_GetAttrString(*out, "flags"));
+  if (flags.obj() == nullptr) {
+    return ConvertPyError();
+  }
+  if (PyObject_SetAttrString(flags.obj(), "writeable", Py_False) < 0) {
+    return ConvertPyError();
+  }
+  return Status::OK();
+}
+// Convert the element at `index` of `arr` into a Python object, dispatching on the
+// serialized Python type tag `type`.
+// \param[in] context Forwarded to the nested container deserializers
+// \param[in] arr Array holding the value; its concrete type must match `type`
+// \param[in] index Position of the value inside `arr`
+// \param[in] type One of the PythonType tags
+// \param[in] base Forwarded to ndarray and nested container deserialization
+// \param[in] blobs Holds the tensors, sparse tensors, ndarrays and buffers that
+// reference-typed values index into
+// \param[out] result Receives the converted object
+// \return The conversion status; an unrecognized tag fails an ARROW_CHECK
+Status GetValue(PyObject* context, const Array& arr, int64_t index, int8_t type,
+                PyObject* base, const SerializedPyObject& blobs, PyObject** result) {
+  switch (type) {
+    case PythonType::NONE:
+      Py_INCREF(Py_None);
+      *result = Py_None;
+      return Status::OK();
+    case PythonType::BOOL:
+      *result = PyBool_FromLong(checked_cast<const BooleanArray&>(arr).Value(index));
+      return Status::OK();
+    case PythonType::PY2INT:
+    case PythonType::INT: {
+      // The stored value is a 64-bit integer; PyLong_FromLongLong keeps all 64 bits
+      // even on platforms where Py_ssize_t is only 32 bits wide.
+      *result = PyLong_FromLongLong(checked_cast<const Int64Array&>(arr).Value(index));
+      return Status::OK();
+    }
+    case PythonType::BYTES: {
+      auto view = checked_cast<const BinaryArray&>(arr).GetView(index);
+      *result = PyBytes_FromStringAndSize(view.data(), view.length());
+      return CheckPyError();
+    }
+    case PythonType::STRING: {
+      auto view = checked_cast<const StringArray&>(arr).GetView(index);
+      *result = PyUnicode_FromStringAndSize(view.data(), view.length());
+      return CheckPyError();
+    }
+    case PythonType::HALF_FLOAT: {
+      *result = PyHalf_FromHalf(checked_cast<const HalfFloatArray&>(arr).Value(index));
+      return Status::OK();
+    }
+    case PythonType::FLOAT:
+      *result = PyFloat_FromDouble(checked_cast<const FloatArray&>(arr).Value(index));
+      return Status::OK();
+    case PythonType::DOUBLE:
+      *result = PyFloat_FromDouble(checked_cast<const DoubleArray&>(arr).Value(index));
+      return Status::OK();
+    case PythonType::DATE64: {
+      // NOTE(review): the Date64 slot is interpreted with TimeUnit::MICRO here;
+      // presumably the serializer stores microseconds -- confirm against serialize.cc.
+      RETURN_NOT_OK(internal::PyDateTime_from_int(
+          checked_cast<const Date64Array&>(arr).Value(index), TimeUnit::MICRO, result));
+      return Status::OK();
+    }
+    case PythonType::LIST: {
+      const auto& l = checked_cast<const ListArray&>(arr);
+      return DeserializeList(context, *l.values(), l.value_offset(index),
+                             l.value_offset(index + 1), base, blobs, result);
+    }
+    case PythonType::DICT: {
+      const auto& l = checked_cast<const ListArray&>(arr);
+      return DeserializeDict(context, *l.values(), l.value_offset(index),
+                             l.value_offset(index + 1), base, blobs, result);
+    }
+    case PythonType::TUPLE: {
+      const auto& l = checked_cast<const ListArray&>(arr);
+      return DeserializeTuple(context, *l.values(), l.value_offset(index),
+                              l.value_offset(index + 1), base, blobs, result);
+    }
+    case PythonType::SET: {
+      const auto& l = checked_cast<const ListArray&>(arr);
+      return DeserializeSet(context, *l.values(), l.value_offset(index),
+                            l.value_offset(index + 1), base, blobs, result);
+    }
+    // The remaining cases store an int32 reference into one of the `blobs` vectors.
+    case PythonType::TENSOR: {
+      int32_t ref = checked_cast<const Int32Array&>(arr).Value(index);
+      *result = wrap_tensor(blobs.tensors[ref]);
+      return Status::OK();
+    }
+    case PythonType::SPARSECOOTENSOR: {
+      int32_t ref = checked_cast<const Int32Array&>(arr).Value(index);
+      const std::shared_ptr<SparseCOOTensor>& sparse_coo_tensor =
+          arrow::internal::checked_pointer_cast<SparseCOOTensor>(
+              blobs.sparse_tensors[ref]);
+      *result = wrap_sparse_coo_tensor(sparse_coo_tensor);
+      return Status::OK();
+    }
+    case PythonType::SPARSECSRMATRIX: {
+      int32_t ref = checked_cast<const Int32Array&>(arr).Value(index);
+      const std::shared_ptr<SparseCSRMatrix>& sparse_csr_matrix =
+          arrow::internal::checked_pointer_cast<SparseCSRMatrix>(
+              blobs.sparse_tensors[ref]);
+      *result = wrap_sparse_csr_matrix(sparse_csr_matrix);
+      return Status::OK();
+    }
+    case PythonType::SPARSECSCMATRIX: {
+      int32_t ref = checked_cast<const Int32Array&>(arr).Value(index);
+      const std::shared_ptr<SparseCSCMatrix>& sparse_csc_matrix =
+          arrow::internal::checked_pointer_cast<SparseCSCMatrix>(
+              blobs.sparse_tensors[ref]);
+      *result = wrap_sparse_csc_matrix(sparse_csc_matrix);
+      return Status::OK();
+    }
+    case PythonType::SPARSECSFTENSOR: {
+      int32_t ref = checked_cast<const Int32Array&>(arr).Value(index);
+      const std::shared_ptr<SparseCSFTensor>& sparse_csf_tensor =
+          arrow::internal::checked_pointer_cast<SparseCSFTensor>(
+              blobs.sparse_tensors[ref]);
+      *result = wrap_sparse_csf_tensor(sparse_csf_tensor);
+      return Status::OK();
+    }
+    case PythonType::NDARRAY: {
+      int32_t ref = checked_cast<const Int32Array&>(arr).Value(index);
+      return DeserializeArray(ref, base, blobs, result);
+    }
+    case PythonType::BUFFER: {
+      int32_t ref = checked_cast<const Int32Array&>(arr).Value(index);
+      *result = wrap_buffer(blobs.buffers[ref]);
+      return Status::OK();
+    }
+    default: {
+      // Stream the tag as a number: an int8_t would otherwise print as a character.
+      ARROW_CHECK(false) << "union tag '" << static_cast<int>(type)
+                         << "' not recognized";
+    }
+  }
+  return Status::OK();
+}
+// Parse every field name of the union's type as an int8 tag and append the tags,
+// in field order, to `result`.
+// \param[in] data Union array whose field names encode the tags
+// \param[out] result Vector the parsed tags are appended to; must not be null
+// \return SerializationError if a field name is not a valid int8 value
+Status GetPythonTypes(const UnionArray& data, std::vector<int8_t>* result) {
+  ARROW_CHECK(result != nullptr);
+  auto type = data.type();
+  for (int i = 0; i < type->num_fields(); ++i) {
+    int8_t tag = 0;
+    // Named `field_name` so it does not shadow the `data` parameter.
+    const std::string& field_name = type->field(i)->name();
+    if (!ParseValue<Int8Type>(field_name.c_str(), field_name.size(), &tag)) {
+      return Status::SerializationError("Cannot convert string: \"", field_name,
+                                        "\" to int8_t");
+    }
+    result->push_back(tag);
+  }
+  return Status::OK();
+}
+// Shared implementation for list / tuple / set deserialization.
+// Walks the dense union `array` over [start_idx, stop_idx), converts each element with
+// GetValue and stores it through `set_item` into the container made by
+// `create_sequence`.
+// \param[in] create_sequence Callable (int64_t size) -> PyObject* creating the container
+// \param[in] set_item Callable (PyObject* seq, int64_t index, PyObject* item) -> Status
+// that receives the reference to `item`
+// \param[out] out Receives the filled container
+template <typename CreateSequenceFn, typename SetItemFn>
+Status DeserializeSequence(PyObject* context, const Array& array, int64_t start_idx,
+                           int64_t stop_idx, PyObject* base,
+                           const SerializedPyObject& blobs,
+                           CreateSequenceFn&& create_sequence, SetItemFn&& set_item,
+                           PyObject** out) {
+  const auto& data = checked_cast<const DenseUnionArray&>(array);
+  OwnedRef result(create_sequence(stop_idx - start_idx));
+  // Container creation reports failure through the Python error indicator.
+  RETURN_NOT_OK(CheckPyError());
+  const int8_t* type_codes = data.raw_type_codes();
+  const int32_t* value_offsets = data.raw_value_offsets();
+  std::vector<int8_t> python_types;
+  RETURN_NOT_OK(GetPythonTypes(data, &python_types));
+  for (int64_t i = start_idx; i < stop_idx; ++i) {
+    const int64_t offset = value_offsets[i];
+    const uint8_t type = type_codes[i];
+    PyObject* value = nullptr;
+    RETURN_NOT_OK(GetValue(context, *data.field(type), offset, python_types[type], base,
+                           blobs, &value));
+    RETURN_NOT_OK(set_item(result.obj(), i - start_idx, value));
+  }
+  *out = result.detach();
+  return Status::OK();
+}
+// Deserialize the union elements in [start_idx, stop_idx) into a new Python list.
+// \param[out] out Receives the list
+Status DeserializeList(PyObject* context, const Array& array, int64_t start_idx,
+                       int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs,
+                       PyObject** out) {
+  return DeserializeSequence(
+      context, array, start_idx, stop_idx, base, blobs,
+      [](int64_t size) { return PyList_New(size); },
+      [](PyObject* seq, int64_t index, PyObject* item) {
+        // PyList_SET_ITEM steals the reference to `item`.
+        PyList_SET_ITEM(seq, index, item);
+        return Status::OK();
+      },
+      out);
+}
+// Deserialize the union elements in [start_idx, stop_idx) into a new Python tuple.
+// \param[out] out Receives the tuple
+Status DeserializeTuple(PyObject* context, const Array& array, int64_t start_idx,
+                        int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs,
+                        PyObject** out) {
+  return DeserializeSequence(
+      context, array, start_idx, stop_idx, base, blobs,
+      [](int64_t size) { return PyTuple_New(size); },
+      [](PyObject* seq, int64_t index, PyObject* item) {
+        // PyTuple_SET_ITEM steals the reference to `item`.
+        PyTuple_SET_ITEM(seq, index, item);
+        return Status::OK();
+      },
+      out);
+}
+// Deserialize the union elements in [start_idx, stop_idx) into a new Python set.
+// \param[out] out Receives the set
+// \return The converted Python error if an element cannot be added (e.g. unhashable)
+Status DeserializeSet(PyObject* context, const Array& array, int64_t start_idx,
+                      int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs,
+                      PyObject** out) {
+  return DeserializeSequence(
+      context, array, start_idx, stop_idx, base, blobs,
+      // A set starts empty whatever the requested size; elements are added one by one.
+      [](int64_t /*size*/) { return PySet_New(nullptr); },
+      [](PyObject* seq, int64_t /*index*/, PyObject* item) {
+        int err = PySet_Add(seq, item);
+        // PySet_Add does not steal the reference, so drop ours in every case.
+        Py_DECREF(item);
+        if (err < 0) {
+          // Propagate the failure instead of silently leaving the Python error set.
+          return ConvertPyError();
+        }
+        return Status::OK();
+      },
+      out);
+}
+// Read a serialized Python object from `src`, starting at its current position.
+// Layout consumed here: four int32 counts (tensors, sparse tensors, ndarrays,
+// buffers), an IPC stream holding one record batch, then the tensors, sparse tensors
+// and ndarrays, and finally the buffers, each prefixed by an int64 size.
+// \param[in] src File to read from
+// \param[out] out Receives the batch and the out-of-band objects
+// \return The first failing read/parse status, or Status::OK()
+Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out) {
+  int32_t num_tensors = 0;
+  int32_t num_sparse_tensors = 0;
+  int32_t num_ndarrays = 0;
+  int32_t num_buffers = 0;
+  // Read the number of tensors, sparse tensors, ndarrays and buffers (in that order)
+  RETURN_NOT_OK(src->Read(sizeof(int32_t), reinterpret_cast<uint8_t*>(&num_tensors)));
+  RETURN_NOT_OK(
+      src->Read(sizeof(int32_t), reinterpret_cast<uint8_t*>(&num_sparse_tensors)));
+  RETURN_NOT_OK(src->Read(sizeof(int32_t), reinterpret_cast<uint8_t*>(&num_ndarrays)));
+  RETURN_NOT_OK(src->Read(sizeof(int32_t), reinterpret_cast<uint8_t*>(&num_buffers)));
+  // Align stream to the IPC alignment
+  RETURN_NOT_OK(ipc::AlignStream(src, ipc::kArrowIpcAlignment));
+  std::shared_ptr<RecordBatchReader> reader;
+  ARROW_ASSIGN_OR_RAISE(reader, ipc::RecordBatchStreamReader::Open(src));
+  RETURN_NOT_OK(reader->ReadNext(&out->batch));
+  /// Skip EOS marker
+  RETURN_NOT_OK(src->Advance(4));
+  /// Align stream so tensor bodies are 64-byte aligned
+  RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment));
+  for (int i = 0; i < num_tensors; ++i) {
+    std::shared_ptr<Tensor> tensor;
+    ARROW_ASSIGN_OR_RAISE(tensor, ipc::ReadTensor(src));
+    RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment));
+    out->tensors.push_back(tensor);
+  }
+  for (int i = 0; i < num_sparse_tensors; ++i) {
+    std::shared_ptr<SparseTensor> sparse_tensor;
+    ARROW_ASSIGN_OR_RAISE(sparse_tensor, ipc::ReadSparseTensor(src));
+    RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment));
+    out->sparse_tensors.push_back(sparse_tensor);
+  }
+  for (int i = 0; i < num_ndarrays; ++i) {
+    std::shared_ptr<Tensor> ndarray;
+    ARROW_ASSIGN_OR_RAISE(ndarray, ipc::ReadTensor(src));
+    RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment));
+    out->ndarrays.push_back(ndarray);
+  }
+  ARROW_ASSIGN_OR_RAISE(int64_t offset, src->Tell());
+  for (int i = 0; i < num_buffers; ++i) {
+    int64_t size = 0;
+    RETURN_NOT_OK(src->ReadAt(offset, sizeof(int64_t), &size));
+    // The size comes from the stream itself; a negative value means corrupt input.
+    if (size < 0) {
+      return Status::Invalid("Negative buffer size in serialized object");
+    }
+    offset += sizeof(int64_t);
+    ARROW_ASSIGN_OR_RAISE(auto buffer, src->ReadAt(offset, size));
+    out->buffers.push_back(buffer);
+    offset += size;
+  }
+  return Status::OK();
+}
+// Reconstruct the Python object described by `obj`: column 0 of its batch is
+// deserialized as a list spanning all rows. Acquires the GIL for the duration.
+// \param[out] out Receives the reconstructed list
+Status DeserializeObject(PyObject* context, const SerializedPyObject& obj, PyObject* base,
+                         PyObject** out) {
+  PyAcquireGIL lock;
+  return DeserializeList(context, *obj.batch->column(0), 0, obj.batch->num_rows(), base,
+                         obj, out);
+}
+// Rebuild a SerializedPyObject from a Python list of buffers.
+// `data` is consumed in order: one buffer with the IPC stream of the union batch,
+// (metadata, body) pairs for each tensor, metadata plus body buffers for each sparse
+// tensor, (metadata, body) pairs for each ndarray, then the plain buffers.
+// \param[in] data Python list of buffer objects; its length must equal
+// 1 + num_tensors * 2 + num_sparse_tensors.num_total_buffers() + num_ndarrays * 2 +
+// num_buffers
+// \param[out] out Receives the reconstructed object
+// \return Invalid if the list length does not match, otherwise the first failure
+Status GetSerializedFromComponents(int num_tensors,
+                                   const SparseTensorCounts& num_sparse_tensors,
+                                   int num_ndarrays, int num_buffers, PyObject* data,
+                                   SerializedPyObject* out) {
+  PyAcquireGIL gil;
+  const Py_ssize_t data_length = PyList_Size(data);
+  const Py_ssize_t expected_data_length = 1 + num_tensors * 2 +
+                                          num_sparse_tensors.num_total_buffers() +
+                                          num_ndarrays * 2 + num_buffers;
+  if (data_length != expected_data_length) {
+    return Status::Invalid("Invalid number of buffers in data");
+  }
+  auto GetBuffer = [&data](Py_ssize_t index, std::shared_ptr<Buffer>* out) {
+    // Valid indices are [0, size): index == size would already read past the list.
+    ARROW_CHECK_LT(index, PyList_Size(data));
+    PyObject* py_buf = PyList_GET_ITEM(data, index);
+    return unwrap_buffer(py_buf).Value(out);
+  };
+  Py_ssize_t buffer_index = 0;
+  // Read the union batch describing object structure
+  {
+    std::shared_ptr<Buffer> data_buffer;
+    RETURN_NOT_OK(GetBuffer(buffer_index++, &data_buffer));
+    // The IPC read touches no Python objects, so the GIL is dropped around it.
+    gil.release();
+    io::BufferReader buf_reader(data_buffer);
+    std::shared_ptr<RecordBatchReader> reader;
+    ARROW_ASSIGN_OR_RAISE(reader, ipc::RecordBatchStreamReader::Open(&buf_reader));
+    RETURN_NOT_OK(reader->ReadNext(&out->batch));
+    gil.acquire();
+  }
+  // Zero-copy reconstruct tensors
+  for (int i = 0; i < num_tensors; ++i) {
+    std::shared_ptr<Buffer> metadata;
+    std::shared_ptr<Buffer> body;
+    std::shared_ptr<Tensor> tensor;
+    RETURN_NOT_OK(GetBuffer(buffer_index++, &metadata));
+    RETURN_NOT_OK(GetBuffer(buffer_index++, &body));
+    ipc::Message message(metadata, body);
+    ARROW_ASSIGN_OR_RAISE(tensor, ipc::ReadTensor(message));
+    out->tensors.emplace_back(std::move(tensor));
+  }
+  // Zero-copy reconstruct sparse tensors
+  for (int i = 0, n = num_sparse_tensors.num_total_tensors(); i < n; ++i) {
+    ipc::IpcPayload payload;
+    RETURN_NOT_OK(GetBuffer(buffer_index++, &payload.metadata));
+    ARROW_ASSIGN_OR_RAISE(
+        size_t num_bodies,
+        ipc::internal::ReadSparseTensorBodyBufferCount(*payload.metadata));
+    payload.body_buffers.reserve(num_bodies);
+    // `j` rather than `i`, so the outer loop counter is not shadowed.
+    for (size_t j = 0; j < num_bodies; ++j) {
+      std::shared_ptr<Buffer> body;
+      RETURN_NOT_OK(GetBuffer(buffer_index++, &body));
+      payload.body_buffers.emplace_back(body);
+    }
+    std::shared_ptr<SparseTensor> sparse_tensor;
+    ARROW_ASSIGN_OR_RAISE(sparse_tensor, ipc::internal::ReadSparseTensorPayload(payload));
+    out->sparse_tensors.emplace_back(std::move(sparse_tensor));
+  }
+  // Zero-copy reconstruct tensors for numpy ndarrays
+  for (int i = 0; i < num_ndarrays; ++i) {
+    std::shared_ptr<Buffer> metadata;
+    std::shared_ptr<Buffer> body;
+    std::shared_ptr<Tensor> tensor;
+    RETURN_NOT_OK(GetBuffer(buffer_index++, &metadata));
+    RETURN_NOT_OK(GetBuffer(buffer_index++, &body));
+    ipc::Message message(metadata, body);
+    ARROW_ASSIGN_OR_RAISE(tensor, ipc::ReadTensor(message));
+    out->ndarrays.emplace_back(std::move(tensor));
+  }
+  // Unwrap and append buffers
+  for (int i = 0; i < num_buffers; ++i) {
+    std::shared_ptr<Buffer> buffer;
+    RETURN_NOT_OK(GetBuffer(buffer_index++, &buffer));
+    out->buffers.emplace_back(std::move(buffer));
+  }
+  return Status::OK();
+}
+// Extract the single ndarray held by `object`.
+// \param[out] out Receives the tensor stored in `object.ndarrays[0]`
+// \return Invalid unless `object` holds exactly one ndarray
+Status DeserializeNdarray(const SerializedPyObject& object,
+                          std::shared_ptr<Tensor>* out) {
+  if (object.ndarrays.size() != 1) {
+    return Status::Invalid("Object is not an Ndarray");
+  }
+  *out = object.ndarrays[0];
+  return Status::OK();
+}
+// Read a serialized object from `src` and return the single ndarray it holds.
+// \param[in] src Buffer holding the serialized object
+// \param[out] out Receives the tensor
+// \return The read status, or Invalid if the object is not a single ndarray
+Status NdarrayFromBuffer(std::shared_ptr<Buffer> src, std::shared_ptr<Tensor>* out) {
+  io::BufferReader reader(src);
+  SerializedPyObject object;
+  RETURN_NOT_OK(ReadSerializedObject(&reader, &object));
+  return DeserializeNdarray(object, out);
+}
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/deserialize.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/deserialize.h
new file mode 100644
index 0000000000..41b6a13a38
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/deserialize.h
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <cstdint>
+#include <memory>
+#include <vector>
+#include "arrow/python/serialize.h"
+#include "arrow/python/visibility.h"
+#include "arrow/status.h"
+namespace arrow {
+class RecordBatch;
+class Tensor;
+namespace io {
+class RandomAccessFile;
+} // namespace io
+namespace py {
+/// \brief Per-format counts of the sparse tensors contained in a serialized object
+struct ARROW_PYTHON_EXPORT SparseTensorCounts {
+  int coo;
+  int csr;
+  int csc;
+  int csf;
+  // NOTE(review): presumably the summed number of dimensions over all CSF tensors --
+  // confirm against the serializer.
+  int ndim_csf;
+  /// \brief Total number of sparse tensors across all formats
+  int num_total_tensors() const { return coo + csr + csc + csf; }
+  /// \brief Total number of buffers the counted sparse tensors occupy: 3 per COO,
+  /// 4 per CSR, 4 per CSC, plus 2 * ndim_csf + csf for the CSF tensors
+  int num_total_buffers() const {
+    return coo * 3 + csr * 4 + csc * 4 + 2 * ndim_csf + csf;
+  }
+};
+/// \brief Read serialized Python sequence from file interface using Arrow IPC
+/// \param[in] src a RandomAccessFile; reading starts at its current position
+/// \param[out] out the reconstructed data
+/// \return Status
+Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out);
+/// \brief Reconstruct SerializedPyObject from representation produced by
+/// SerializedPyObject::GetComponents.
+/// \param[in] num_tensors number of tensors in the object
+/// \param[in] num_sparse_tensors number of sparse tensors in the object
+/// \param[in] num_ndarrays number of numpy Ndarrays in the object
+/// \param[in] num_buffers number of buffers in the object
+/// \param[in] data a list containing pyarrow.Buffer instances. It must be 1 +
+/// num_tensors * 2 + num_sparse_tensors.num_total_buffers() + num_ndarrays * 2 +
+/// num_buffers in length
+/// \param[out] out the reconstructed object
+/// \return Status
+Status GetSerializedFromComponents(int num_tensors,
+ const SparseTensorCounts& num_sparse_tensors,
+ int num_ndarrays, int num_buffers, PyObject* data,
+ SerializedPyObject* out);
+/// \brief Reconstruct Python object from Arrow-serialized representation
+/// \param[in] context Serialization context which contains custom serialization
+/// and deserialization callbacks. Can be any Python object with a
+/// _serialize_callback method for serialization and a _deserialize_callback
+/// method for deserialization. If context is None, no custom serialization
+/// will be attempted.
+/// \param[in] object Serialized object to deserialize
+/// \param[in] base a Python object holding the underlying data that any NumPy
+/// arrays will reference, to avoid premature deallocation
+/// \param[out] out The returned object
+/// \return Status
+/// This acquires the GIL
+Status DeserializeObject(PyObject* context, const SerializedPyObject& object,
+ PyObject* base, PyObject** out);
+/// \brief Reconstruct Ndarray from Arrow-serialized representation
+/// \param[in] object Object to deserialize; must hold exactly one ndarray
+/// \param[out] out The deserialized tensor
+/// \return Status (Invalid if object does not hold exactly one ndarray)
+Status DeserializeNdarray(const SerializedPyObject& object, std::shared_ptr<Tensor>* out);
+Status NdarrayFromBuffer(std::shared_ptr<Buffer> src, std::shared_ptr<Tensor>* out);  // reads a serialized object from src, then extracts its single ndarray
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/extension_type.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/extension_type.cc
new file mode 100644
index 0000000000..3ccc171c87
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/extension_type.cc
@@ -0,0 +1,217 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include <memory>
+#include <sstream>
+#include <utility>
+#include "arrow/python/extension_type.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+namespace arrow {
+using internal::checked_cast;
+namespace py {
+namespace {
+// Serialize a Python ExtensionType instance by calling its
+// `__arrow_ext_serialize__` method, which must return a bytes object.
+// \param[in] type_instance The Python extension type instance
+// \param[out] out Receives the returned bytes
+// \return The converted Python error if the call fails, TypeError if the result is
+// not a bytes object
+Status SerializeExtInstance(PyObject* type_instance, std::string* out) {
+  OwnedRef res(
+      cpp_PyObject_CallMethod(type_instance, "__arrow_ext_serialize__", nullptr));
+  if (!res) {
+    return ConvertPyError();
+  }
+  if (!PyBytes_Check(res.obj())) {
+    return Status::TypeError(
+        "__arrow_ext_serialize__ should return bytes object, "
+        "got ",
+        internal::PyObject_StdStringRepr(res.obj()));
+  }
+  *out = internal::PyBytes_AsStdString(res.obj());
+  return Status::OK();
+}
+// Deserialize a Python ExtensionType instance by calling
+// `type_class.__arrow_ext_deserialize__(storage_type, serialized_data)`.
+// \return The result of the call, or nullptr if wrapping an argument or the call
+// itself fails
+PyObject* DeserializeExtInstance(PyObject* type_class,
+                                 std::shared_ptr<DataType> storage_type,
+                                 const std::string& serialized_data) {
+  OwnedRef storage_ref(wrap_data_type(storage_type));
+  if (!storage_ref) {
+    return nullptr;
+  }
+  OwnedRef data_ref(PyBytes_FromStringAndSize(
+      serialized_data.data(), static_cast<Py_ssize_t>(serialized_data.size())));
+  if (!data_ref) {
+    return nullptr;
+  }
+  return cpp_PyObject_CallMethod(type_class, "__arrow_ext_deserialize__", "OO",
+                                 storage_ref.obj(), data_ref.obj());
+}
+} // namespace
+static const char* kExtensionName = "arrow.py_extension_type";
+// Render the type as "extension<NAME<PYTHON_CLASS>>", where PYTHON_CLASS is the
+// tp_name of the Python instance. If the instance cannot be obtained, the pending
+// Python error is reported as unraisable and "extension<NAME>" is returned.
+std::string PyExtensionType::ToString() const {
+  PyAcquireGIL lock;
+  std::stringstream ss;
+  OwnedRef instance(GetInstance());
+  ss << "extension<" << this->extension_name();
+  if (!instance) {
+    // GetInstance() failed with a Python error set. It cannot be propagated from
+    // here, and dereferencing the null instance would crash.
+    PyErr_WriteUnraisable(nullptr);
+  } else {
+    ss << "<" << Py_TYPE(instance.obj())->tp_name << ">";
+  }
+  ss << ">";
+  return ss.str();
+}
+PyExtensionType::PyExtensionType(std::shared_ptr<DataType> storage_type, PyObject* typ,
+ PyObject* inst)
+ : ExtensionType(storage_type),
+ extension_name_(kExtensionName),  // default name "arrow.py_extension_type"
+ type_class_(typ),
+ type_instance_(inst) {}
+PyExtensionType::PyExtensionType(std::shared_ptr<DataType> storage_type,
+ std::string extension_name, PyObject* typ,
+ PyObject* inst)
+ : ExtensionType(storage_type),
+ extension_name_(std::move(extension_name)),  // caller-supplied extension name
+ type_class_(typ),
+ type_instance_(inst) {}
+// Compare this type with `other`.
+// Types with different extension names are unequal. Otherwise, if neither side has
+// an instance the Python classes are compared with ==; if both have one the Python
+// instances are compared with ==; a mix of the two is unequal.
+// A Python error during comparison is reported as unraisable and yields false.
+bool PyExtensionType::ExtensionEquals(const ExtensionType& other) const {
+  PyAcquireGIL lock;
+  if (other.extension_name() != extension_name()) {
+    return false;
+  }
+  const auto& other_ext = checked_cast<const PyExtensionType&>(other);
+  int res = -1;
+  if (!type_instance_) {
+    if (other_ext.type_instance_) {
+      return false;
+    }
+    // Compare Python types
+    res = PyObject_RichCompareBool(type_class_.obj(), other_ext.type_class_.obj(), Py_EQ);
+  } else {
+    if (!other_ext.type_instance_) {
+      return false;
+    }
+    // Compare Python instances
+    OwnedRef left(GetInstance());
+    OwnedRef right(other_ext.GetInstance());
+    if (!left || !right) {
+      goto error;
+    }
+    res = PyObject_RichCompareBool(left.obj(), right.obj(), Py_EQ);
+  }
+  if (res == -1) {
+    goto error;
+  }
+  return res == 1;
+error:
+  // Cannot propagate error
+  PyErr_WriteUnraisable(nullptr);
+  return false;
+}
+// Wrap `data` (whose type must be an extension type) in an ExtensionArray.
+std::shared_ptr<Array> PyExtensionType::MakeArray(std::shared_ptr<ArrayData> data) const {
+  DCHECK_EQ(data->type->id(), Type::EXTENSION);
+  return std::make_shared<ExtensionArray>(data);
+}
+// Return the serialized form cached by SetInstance(); an instance must have been set.
+std::string PyExtensionType::Serialize() const {
+  DCHECK(type_instance_);
+  return serialized_;
+}
+// Rebuild a data type from its storage type and serialized bytes by calling the
+// Python class' `__arrow_ext_deserialize__` (via DeserializeExtInstance) and
+// unwrapping the resulting Python object. Acquires the GIL.
+// \return The unwrapped data type, or the converted Python error
+Result<std::shared_ptr<DataType>> PyExtensionType::Deserialize(
+    std::shared_ptr<DataType> storage_type, const std::string& serialized_data) const {
+  PyAcquireGIL lock;
+  if (import_pyarrow()) {
+    return ConvertPyError();
+  }
+  OwnedRef res(DeserializeExtInstance(type_class_.obj(), storage_type, serialized_data));
+  if (!res) {
+    return ConvertPyError();
+  }
+  return unwrap_data_type(res.obj());
+}
+// Return a new reference to the Python extension type instance.
+// If no instance was set, a Python TypeError is raised and nullptr returned.
+// If the weakly-referenced instance is gone, it is rebuilt from `serialized_`.
+PyObject* PyExtensionType::GetInstance() const {
+  if (!type_instance_) {
+    PyErr_SetString(PyExc_TypeError, "Not an instance");
+    return nullptr;
+  }
+  DCHECK(PyWeakref_CheckRef(type_instance_.obj()));
+  PyObject* inst = PyWeakref_GET_OBJECT(type_instance_.obj());
+  if (inst != Py_None) {
+    // Cached instance still alive
+    Py_INCREF(inst);
+    return inst;
+  } else {
+    // Must reconstruct from serialized form
+    // XXX cache again?
+    return DeserializeExtInstance(type_class_.obj(), storage_type_, serialized_);
+  }
+}
+// Store a weak reference to `inst` and cache its serialized form.
+// \param[in] inst Python instance whose exact type must be `type_class_`
+// \return TypeError on a class mismatch, the converted Python error if the weak
+// reference cannot be created, otherwise the status of serializing `inst`
+Status PyExtensionType::SetInstance(PyObject* inst) const {
+  // Check we have the right type
+  PyObject* typ = reinterpret_cast<PyObject*>(Py_TYPE(inst));
+  if (typ != type_class_.obj()) {
+    return Status::TypeError("Unexpected Python ExtensionType class ",
+                             internal::PyObject_StdStringRepr(typ), " expected ",
+                             internal::PyObject_StdStringRepr(type_class_.obj()));
+  }
+  PyObject* wr = PyWeakref_NewRef(inst, nullptr);
+  if (wr == nullptr) {
+    return ConvertPyError();
+  }
+  type_instance_.reset(wr);
+  return SerializeExtInstance(inst, &serialized_);
+}
+// Create a PyExtensionType for the Python class `typ` without an instance.
+// `typ` is borrowed from the caller: a reference is taken here for the new object.
+// \param[out] out Receives the newly created extension type
+Status PyExtensionType::FromClass(const std::shared_ptr<DataType> storage_type,
+                                  const std::string extension_name, PyObject* typ,
+                                  std::shared_ptr<ExtensionType>* out) {
+  Py_INCREF(typ);
+  // `extension_name` is const, so std::move would silently copy; pass it as-is.
+  out->reset(new PyExtensionType(storage_type, extension_name, typ));
+  return Status::OK();
+}
+// Register `type`, which must be an extension type, with the extension type registry.
+// \return TypeError if `type` is not an ExtensionType, otherwise the registry status
+Status RegisterPyExtensionType(const std::shared_ptr<DataType>& type) {
+  DCHECK_EQ(type->id(), Type::EXTENSION);
+  auto ext_type = std::dynamic_pointer_cast<ExtensionType>(type);
+  // The DCHECK vanishes in release builds; never forward a null pointer.
+  if (ext_type == nullptr) {
+    return Status::TypeError("Expected an ExtensionType instance");
+  }
+  return RegisterExtensionType(ext_type);
+}
+// Remove the extension type registered under `type_name` from the registry.
+Status UnregisterPyExtensionType(const std::string& type_name) {
+  return UnregisterExtensionType(type_name);
+}
+std::string PyExtensionName() { return kExtensionName; }  // the default name, "arrow.py_extension_type"
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/extension_type.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/extension_type.h
new file mode 100644
index 0000000000..e433d9aca7
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/extension_type.h
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include "arrow/extension_type.h"
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/util/macros.h"
+namespace arrow {
+namespace py {
+// \brief ExtensionType implementation backed by a Python class and, optionally, a
+// Python instance of that class.
+class ARROW_PYTHON_EXPORT PyExtensionType : public ExtensionType {
+ public:
+  // Implement extensionType API
+  std::string extension_name() const override { return extension_name_; }
+  std::string ToString() const override;
+  bool ExtensionEquals(const ExtensionType& other) const override;
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+  std::string Serialize() const override;
+  // For use from Cython
+  // Assumes that `typ` is borrowed
+  static Status FromClass(const std::shared_ptr<DataType> storage_type,
+                          const std::string extension_name, PyObject* typ,
+                          std::shared_ptr<ExtensionType>* out);
+  // Return new ref
+  PyObject* GetInstance() const;
+  // Store (a weak reference to) the Python instance; see type_instance_ below
+  Status SetInstance(PyObject*) const;
+ protected:
+  PyExtensionType(std::shared_ptr<DataType> storage_type, PyObject* typ,
+                  PyObject* inst = NULLPTR);
+  PyExtensionType(std::shared_ptr<DataType> storage_type, std::string extension_name,
+                  PyObject* typ, PyObject* inst = NULLPTR);
+  std::string extension_name_;
+  // These fields are mutable because of two-step initialization.
+  mutable OwnedRefNoGIL type_class_;
+  // A weakref or null. Storing a strong reference to the Python extension type
+  // instance would create an unreclaimable reference cycle between Python and C++
+  // (the Python instance has to keep a strong reference to the C++ ExtensionType
+  // in other direction). Instead, we store a weakref to the instance.
+  // If the weakref is dead, we reconstruct the instance from its serialized form.
+  mutable OwnedRefNoGIL type_instance_;
+  // Empty if type_instance_ is null
+  mutable std::string serialized_;
+};
+ARROW_PYTHON_EXPORT std::string PyExtensionName();  // name given to extension types constructed without an explicit name
+ARROW_PYTHON_EXPORT Status RegisterPyExtensionType(const std::shared_ptr<DataType>&);  // argument must be an extension type
+ARROW_PYTHON_EXPORT Status UnregisterPyExtensionType(const std::string& type_name);  // removes the type registered as type_name
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/filesystem.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/filesystem.cc
new file mode 100644
index 0000000000..8c12f05a0f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/filesystem.cc
@@ -0,0 +1,206 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/filesystem.h"
+#include "arrow/util/logging.h"
+namespace arrow {
+using fs::FileInfo;
+using fs::FileSelector;
+namespace py {
+namespace fs {
+PyFileSystem::PyFileSystem(PyObject* handler, PyFileSystemVtable vtable)
+ : handler_(handler), vtable_(std::move(vtable)) {  // Stores the Python handler object and the callback table used to reach it.
+ Py_INCREF(handler);  // handler_ now holds this pointer, so add a reference for it
+PyFileSystem::~PyFileSystem() {}  // Empty body: no explicit cleanup is done here (handler_ is an OwnedRefNoGIL member).
+std::shared_ptr<PyFileSystem> PyFileSystem::Make(PyObject* handler,
+ PyFileSystemVtable vtable) {  // Factory: builds a shared_ptr-owned PyFileSystem from a handler and its vtable.
+ return std::make_shared<PyFileSystem>(handler, std::move(vtable));
+std::string PyFileSystem::type_name() const {  // Returns the type name written by the handler's get_type_name callback.
+ std::string result;  // stays empty if the callback does not fill it
+ auto st = SafeCallIntoPython([&]() -> Status {  // NOTE(review): `st` is not inspected in the visible code; the lambda always returns OK
+ vtable_.get_type_name(handler_.obj(), &result);
+ if (PyErr_Occurred()) {
+ PyErr_WriteUnraisable(handler_.obj());  // this method has no Status return channel, so the Python error is reported as unraisable
+ }
+ return Status::OK();
+ });
+ return result;
+// Delegates the equality check to the Python handler's `equals` callback.
+// A Python exception raised by the callback is reported through
+// PyErr_WriteUnraisable because this method cannot return a Status.
+bool PyFileSystem::Equals(const FileSystem& other) const {
+  // Start from "not equal" so the returned value is well-defined even if the
+  // callback is never reached.
+  bool result = false;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    result = vtable_.equals(handler_.obj(), other);
+    if (PyErr_Occurred()) {
+      PyErr_WriteUnraisable(handler_.obj());
+    }
+    return Status::OK();
+  });
+  return result;
+// Fetches metadata for a single path through the handler's get_file_info
+// callback. Returns the filled FileInfo on success, or the error Status
+// produced while calling into Python.
+Result<FileInfo> PyFileSystem::GetFileInfo(const std::string& path) {
+  FileInfo info;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.get_file_info(handler_.obj(), path, &info);
+    return CheckPyError();
+  });
+  // Propagate the failure rather than returning a default-constructed FileInfo.
+  RETURN_NOT_OK(st);
+  return info;
+// Fetches metadata for several paths through the handler's
+// get_file_info_vector callback. Returns the collected FileInfo entries on
+// success, or the error Status produced while calling into Python.
+Result<std::vector<FileInfo>> PyFileSystem::GetFileInfo(
+    const std::vector<std::string>& paths) {
+  std::vector<FileInfo> infos;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.get_file_info_vector(handler_.obj(), paths, &infos);
+    return CheckPyError();
+  });
+  // Propagate the failure rather than returning a possibly empty vector.
+  RETURN_NOT_OK(st);
+  return infos;
+// Fetches metadata for the entries matched by `select` through the handler's
+// get_file_info_selector callback. Returns the collected FileInfo entries on
+// success, or the error Status produced while calling into Python.
+Result<std::vector<FileInfo>> PyFileSystem::GetFileInfo(const FileSelector& select) {
+  std::vector<FileInfo> infos;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.get_file_info_selector(handler_.obj(), select, &infos);
+    return CheckPyError();
+  });
+  // Propagate the failure rather than returning a possibly empty vector.
+  RETURN_NOT_OK(st);
+  return infos;
+Status PyFileSystem::CreateDir(const std::string& path, bool recursive) {  // Forwards to the handler's create_dir callback.
+ return SafeCallIntoPython([&]() -> Status {
+ vtable_.create_dir(handler_.obj(), path, recursive);
+ return CheckPyError();  // the callback returns void, so the outcome is reported through CheckPyError()
+ });
+Status PyFileSystem::DeleteDir(const std::string& path) {  // Forwards to the handler's delete_dir callback.
+ return SafeCallIntoPython([&]() -> Status {
+ vtable_.delete_dir(handler_.obj(), path);
+ return CheckPyError();  // the callback returns void, so the outcome is reported through CheckPyError()
+ });
+Status PyFileSystem::DeleteDirContents(const std::string& path) {  // Forwards to the handler's delete_dir_contents callback.
+ return SafeCallIntoPython([&]() -> Status {
+ vtable_.delete_dir_contents(handler_.obj(), path);
+ return CheckPyError();  // the callback returns void, so the outcome is reported through CheckPyError()
+ });
+Status PyFileSystem::DeleteRootDirContents() {  // Forwards to the handler's delete_root_dir_contents callback.
+ return SafeCallIntoPython([&]() -> Status {
+ vtable_.delete_root_dir_contents(handler_.obj());
+ return CheckPyError();  // the callback returns void, so the outcome is reported through CheckPyError()
+ });
+Status PyFileSystem::DeleteFile(const std::string& path) {  // Forwards to the handler's delete_file callback.
+ return SafeCallIntoPython([&]() -> Status {
+ vtable_.delete_file(handler_.obj(), path);
+ return CheckPyError();  // the callback returns void, so the outcome is reported through CheckPyError()
+ });
+Status PyFileSystem::Move(const std::string& src, const std::string& dest) {  // Forwards to the handler's move callback.
+ return SafeCallIntoPython([&]() -> Status {
+ vtable_.move(handler_.obj(), src, dest);
+ return CheckPyError();  // the callback returns void, so the outcome is reported through CheckPyError()
+ });
+Status PyFileSystem::CopyFile(const std::string& src, const std::string& dest) {  // Forwards to the handler's copy_file callback.
+ return SafeCallIntoPython([&]() -> Status {
+ vtable_.copy_file(handler_.obj(), src, dest);
+ return CheckPyError();  // the callback returns void, so the outcome is reported through CheckPyError()
+ });
+// Opens `path` for sequential reading through the handler's
+// open_input_stream callback. Returns the stream on success, or the error
+// Status produced while calling into Python.
+Result<std::shared_ptr<io::InputStream>> PyFileSystem::OpenInputStream(
+    const std::string& path) {
+  std::shared_ptr<io::InputStream> stream;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.open_input_stream(handler_.obj(), path, &stream);
+    return CheckPyError();
+  });
+  // Propagate the failure rather than handing back a null stream as success.
+  RETURN_NOT_OK(st);
+  return stream;
+// Opens `path` for random-access reading through the handler's
+// open_input_file callback. Returns the file on success, or the error
+// Status produced while calling into Python.
+Result<std::shared_ptr<io::RandomAccessFile>> PyFileSystem::OpenInputFile(
+    const std::string& path) {
+  std::shared_ptr<io::RandomAccessFile> stream;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.open_input_file(handler_.obj(), path, &stream);
+    return CheckPyError();
+  });
+  // Propagate the failure rather than handing back a null file as success.
+  RETURN_NOT_OK(st);
+  return stream;
+// Opens `path` for writing through the handler's open_output_stream
+// callback, passing `metadata` along. Returns the stream on success, or the
+// error Status produced while calling into Python.
+Result<std::shared_ptr<io::OutputStream>> PyFileSystem::OpenOutputStream(
+    const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) {
+  std::shared_ptr<io::OutputStream> stream;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.open_output_stream(handler_.obj(), path, metadata, &stream);
+    return CheckPyError();
+  });
+  // Propagate the failure rather than handing back a null stream as success.
+  RETURN_NOT_OK(st);
+  return stream;
+// Opens `path` for appending through the handler's open_append_stream
+// callback, passing `metadata` along. Returns the stream on success, or the
+// error Status produced while calling into Python.
+Result<std::shared_ptr<io::OutputStream>> PyFileSystem::OpenAppendStream(
+    const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) {
+  std::shared_ptr<io::OutputStream> stream;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.open_append_stream(handler_.obj(), path, metadata, &stream);
+    return CheckPyError();
+  });
+  // Propagate the failure rather than handing back a null stream as success.
+  RETURN_NOT_OK(st);
+  return stream;
+// Normalizes `path` through the handler's normalize_path callback. Returns
+// the normalized path on success, or the error Status produced while calling
+// into Python.
+Result<std::string> PyFileSystem::NormalizePath(std::string path) {
+  std::string normalized;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.normalize_path(handler_.obj(), path, &normalized);
+    return CheckPyError();
+  });
+  // Propagate the failure rather than returning an empty path as success.
+  RETURN_NOT_OK(st);
+  return normalized;
+} // namespace fs
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/filesystem.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/filesystem.h
new file mode 100644
index 0000000000..e1235f8de5
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/filesystem.h
@@ -0,0 +1,126 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/util/macros.h"
+namespace arrow {
+namespace py {
+namespace fs {
+class ARROW_PYTHON_EXPORT PyFileSystemVtable {  // Table of std::function callbacks, one per filesystem operation.
+ public:
+ std::function<void(PyObject*, std::string* out)> get_type_name;  // every callback takes a PyObject* first; results come back through `out` pointers
+ std::function<bool(PyObject*, const arrow::fs::FileSystem& other)> equals;  // the only callback with a non-void return
+ std::function<void(PyObject*, const std::string& path, arrow::fs::FileInfo* out)>
+ get_file_info;
+ std::function<void(PyObject*, const std::vector<std::string>& paths,
+ std::vector<arrow::fs::FileInfo>* out)>
+ get_file_info_vector;
+ std::function<void(PyObject*, const arrow::fs::FileSelector&,
+ std::vector<arrow::fs::FileInfo>* out)>
+ get_file_info_selector;
+ std::function<void(PyObject*, const std::string& path, bool)> create_dir;  // the bool is passed the `recursive` flag by PyFileSystem::CreateDir
+ std::function<void(PyObject*, const std::string& path)> delete_dir;
+ std::function<void(PyObject*, const std::string& path)> delete_dir_contents;
+ std::function<void(PyObject*)> delete_root_dir_contents;
+ std::function<void(PyObject*, const std::string& path)> delete_file;
+ std::function<void(PyObject*, const std::string& src, const std::string& dest)> move;
+ std::function<void(PyObject*, const std::string& src, const std::string& dest)>
+ copy_file;
+ std::function<void(PyObject*, const std::string& path,
+ std::shared_ptr<io::InputStream>* out)>
+ open_input_stream;
+ std::function<void(PyObject*, const std::string& path,
+ std::shared_ptr<io::RandomAccessFile>* out)>
+ open_input_file;
+ std::function<void(PyObject*, const std::string& path,
+ const std::shared_ptr<const KeyValueMetadata>&,
+ std::shared_ptr<io::OutputStream>* out)>
+ open_output_stream;
+ std::function<void(PyObject*, const std::string& path,
+ const std::shared_ptr<const KeyValueMetadata>&,
+ std::shared_ptr<io::OutputStream>* out)>
+ open_append_stream;
+ std::function<void(PyObject*, const std::string& path, std::string* out)>
+ normalize_path;
+class ARROW_PYTHON_EXPORT PyFileSystem : public arrow::fs::FileSystem {  // FileSystem implementation that forwards each operation to a PyFileSystemVtable callback.
+ public:
+ PyFileSystem(PyObject* handler, PyFileSystemVtable vtable);  // the constructor increments the handler's reference count
+ ~PyFileSystem() override;
+ static std::shared_ptr<PyFileSystem> Make(PyObject* handler, PyFileSystemVtable vtable);  // shared_ptr factory around the constructor
+ std::string type_name() const override;
+ bool Equals(const FileSystem& other) const override;
+ Result<arrow::fs::FileInfo> GetFileInfo(const std::string& path) override;
+ Result<std::vector<arrow::fs::FileInfo>> GetFileInfo(
+ const std::vector<std::string>& paths) override;
+ Result<std::vector<arrow::fs::FileInfo>> GetFileInfo(
+ const arrow::fs::FileSelector& select) override;
+ Status CreateDir(const std::string& path, bool recursive = true) override;
+ Status DeleteDir(const std::string& path) override;
+ Status DeleteDirContents(const std::string& path) override;
+ Status DeleteRootDirContents() override;
+ Status DeleteFile(const std::string& path) override;
+ Status Move(const std::string& src, const std::string& dest) override;
+ Status CopyFile(const std::string& src, const std::string& dest) override;
+ Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+ const std::string& path) override;
+ Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+ const std::string& path) override;
+ Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+ const std::string& path,
+ const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
+ Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+ const std::string& path,
+ const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
+ Result<std::string> NormalizePath(std::string path) override;
+ PyObject* handler() const { return handler_.obj(); }  // returns the stored pointer as-is; no reference count change happens here
+ private:
+ OwnedRefNoGIL handler_;  // the Python object passed as first argument to every vtable callback
+ PyFileSystemVtable vtable_;  // callbacks implementing the operations
+} // namespace fs
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/helpers.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/helpers.cc
new file mode 100644
index 0000000000..75a77c640b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/helpers.cc
@@ -0,0 +1,436 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// helpers.h includes a NumPy header, so we include this first
+#include "arrow/python/numpy_interop.h"
+#include "arrow/python/helpers.h"
+#include <cmath>
+#include <limits>
+#include <sstream>
+#include <type_traits>
+#include "arrow/python/common.h"
+#include "arrow/python/decimal.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+namespace arrow {
+using internal::checked_cast;
+namespace py {
+ case Type::NAME: \
+ return FACTORY()
+std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {  // Maps a Type::type enum value to a DataType instance, or nullptr when unmapped.
+ switch (type) {
+ case Type::NA:
+ return null();
+ default:  // NOTE(review): only Type::NA has a visible case here; every other value yields nullptr — confirm no cases are missing
+ return nullptr;
+ }
+// Wraps a raw npy_half value in a newly created NumPy Half scalar object.
+// Returns a null pointer when the scalar could not be created.
+PyObject* PyHalf_FromHalf(npy_half value) {
+  PyObject* scalar = PyArrayScalar_New(Half);
+  if (scalar == NULL) {
+    return scalar;
+  }
+  PyArrayScalar_ASSIGN(scalar, Half, value);
+  return scalar;
+// Extracts the npy_half payload of a NumPy Half scalar into `*out`.
+// Any other kind of object is rejected with a TypeError status.
+Status PyFloat_AsHalf(PyObject* obj, npy_half* out) {
+  if (!PyArray_IsScalar(obj, Half)) {
+    // XXX: cannot use npy_double_to_half() without linking with Numpy
+    return Status::TypeError("Expected np.float16 instance");
+  }
+  *out = PyArrayScalar_VAL(obj, Half);
+  return Status::OK();
+namespace internal {
+std::string PyBytes_AsStdString(PyObject* obj) {  // Copies the contents of a Python bytes object into a std::string.
+ DCHECK(PyBytes_Check(obj));  // the bytes-type precondition is only asserted via DCHECK
+ return std::string(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj));  // explicit length, so embedded NUL bytes are kept
+// Copies the UTF-8 encoding of a Python str object into `*out`.
+// `obj` must be a str (asserted via DCHECK only). Returns an error Status
+// when the UTF-8 representation cannot be obtained.
+Status PyUnicode_AsStdString(PyObject* obj, std::string* out) {
+  DCHECK(PyUnicode_Check(obj));
+  Py_ssize_t size;
+  // The utf-8 representation is cached on the unicode object
+  const char* data = PyUnicode_AsUTF8AndSize(obj, &size);
+  if (data == nullptr) {
+    // A null result signals a Python exception; surface it instead of
+    // building a std::string from a null pointer.
+    RETURN_NOT_OK(CheckPyError());
+  }
+  *out = std::string(data, size);
+  return Status::OK();
+std::string PyObject_StdStringRepr(PyObject* obj) {  // Returns repr(obj) as UTF-8 bytes, or a placeholder string if that fails.
+ OwnedRef unicode_ref(PyObject_Repr(obj));
+ OwnedRef bytes_ref;
+ if (unicode_ref) {
+ bytes_ref.reset(
+ PyUnicode_AsEncodedString(unicode_ref.obj(), "utf8", "backslashreplace"));  // "backslashreplace" error handler is requested for the utf8 encoding step
+ }
+ if (!bytes_ref) {  // reached when either repr() or the encoding step produced no object
+ PyErr_Clear();  // drop the pending Python error; this function reports failure through the placeholder text instead
+ std::stringstream ss;
+ ss << "<object of type '" << Py_TYPE(obj)->tp_name << "' repr() failed>";
+ return ss.str();
+ }
+ return PyBytes_AsStdString(bytes_ref.obj());
+// Calls str() on `obj` and stores the result in `*out`.
+// Returns an error Status when str() itself raises.
+Status PyObject_StdStringStr(PyObject* obj, std::string* out) {
+  OwnedRef string_ref(PyObject_Str(obj));
+  if (!string_ref) {
+    // str() failed: report the Python error instead of passing a null
+    // object on to PyUnicode_AsStdString().
+    RETURN_NOT_OK(CheckPyError());
+  }
+  return PyUnicode_AsStdString(string_ref.obj(), out);
+// Reports whether `module_name` is a key of the interpreter's module dict.
+// Returns an error Status when the key cannot be built or the lookup fails.
+Result<bool> IsModuleImported(const std::string& module_name) {
+  // PyImport_GetModuleDict returns with a borrowed reference
+  OwnedRef key(PyUnicode_FromString(module_name.c_str()));
+  if (!key) {
+    RETURN_NOT_OK(CheckPyError());
+  }
+  const int is_imported = PyDict_Contains(PyImport_GetModuleDict(), key.obj());
+  if (is_imported < 0) {
+    // -1 signals a Python error; without this check it would be converted
+    // to `true`.
+    RETURN_NOT_OK(CheckPyError());
+  }
+  return is_imported == 1;
+// Imports the Python module `module_name` and stores it in `*ref`.
+// Returns an error Status when the import raises.
+Status ImportModule(const std::string& module_name, OwnedRef* ref) {
+  PyObject* module = PyImport_ImportModule(module_name.c_str());
+  if (module == nullptr) {
+    // Import failed: report the Python error rather than returning OK with
+    // an empty reference.
+    RETURN_NOT_OK(CheckPyError());
+  }
+  ref->reset(module);
+  return Status::OK();
+// Looks up attribute `name` on `module` and stores it in `*ref`.
+// Returns an error Status when the attribute lookup raises.
+Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref) {
+  PyObject* attr = PyObject_GetAttrString(module, name.c_str());
+  if (attr == nullptr) {
+    // Lookup failed: report the Python error rather than returning OK with
+    // an empty reference.
+    RETURN_NOT_OK(CheckPyError());
+  }
+  ref->reset(attr);
+  return Status::OK();
+namespace {
+// Builds the Invalid status reported when a Python integer does not fit the
+// requested C integer type. A non-empty `overflow_message` is used verbatim;
+// otherwise the message embeds str(obj).
+Status IntegerOverflowStatus(PyObject* obj, const std::string& overflow_message) {
+  if (!overflow_message.empty()) {
+    return Status::Invalid(overflow_message);
+  }
+  std::string value_text;
+  RETURN_NOT_OK(PyObject_StdStringStr(obj, &value_text));
+  return Status::Invalid("Value ", value_text, " too large to fit in C integer type");
+// Extract C signed int from Python object
+// Extract C signed int from Python object.
+// Converts through PyLong_AsLongLong or PyLong_AsLong depending on the width
+// of `Int`, then range-checks against `Int`. Returns an error Status when the
+// conversion raises, or the overflow status when the value does not fit.
+template <typename Int, enable_if_t<std::is_signed<Int>::value, Int> = 0>
+Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
+  static_assert(sizeof(Int) <= sizeof(long long),  // NOLINT
+                "integer type larger than long long");
+  if (sizeof(Int) > sizeof(long)) {  // NOLINT
+    const auto value = PyLong_AsLongLong(obj);
+    if (ARROW_PREDICT_FALSE(value == -1)) {
+      // -1 is both a valid result and the error sentinel, so consult the
+      // Python error state to tell them apart.
+      RETURN_NOT_OK(CheckPyError());
+    }
+    if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
+                            value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  } else {
+    const auto value = PyLong_AsLong(obj);
+    if (ARROW_PREDICT_FALSE(value == -1)) {
+      // Same sentinel ambiguity as above.
+      RETURN_NOT_OK(CheckPyError());
+    }
+    if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
+                            value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  }
+  return Status::OK();
+// Extract C unsigned int from Python object
+// Extract C unsigned int from Python object.
+// Non-int objects are first converted with PyNumber_Index(). The value is
+// then read through PyLong_AsUnsignedLongLong or PyLong_AsUnsignedLong
+// depending on the width of `Int` and range-checked against `Int`. Returns
+// an error Status when a conversion raises, or the overflow status when the
+// value does not fit.
+template <typename Int, enable_if_t<std::is_unsigned<Int>::value, Int> = 0>
+Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
+  static_assert(sizeof(Int) <= sizeof(unsigned long long),  // NOLINT
+                "integer type larger than unsigned long long");
+  OwnedRef ref;
+  // PyLong_AsUnsignedLong() and PyLong_AsUnsignedLongLong() don't handle
+  // conversion from non-ints (e.g. np.uint64), so do it ourselves
+  if (!PyLong_Check(obj)) {
+    ref.reset(PyNumber_Index(obj));
+    if (!ref) {
+      // The object has no usable integer form: report the Python error
+      // instead of continuing with a null object.
+      RETURN_NOT_OK(CheckPyError());
+    }
+    obj = ref.obj();
+  }
+  if (sizeof(Int) > sizeof(unsigned long)) {  // NOLINT
+    const auto value = PyLong_AsUnsignedLongLong(obj);
+    if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
+      // The all-ones value is both a valid result and the error sentinel, so
+      // consult the Python error state to tell them apart.
+      RETURN_NOT_OK(CheckPyError());
+    }
+    if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  } else {
+    const auto value = PyLong_AsUnsignedLong(obj);
+    if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
+      // Same sentinel ambiguity as above.
+      RETURN_NOT_OK(CheckPyError());
+    }
+    if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  }
+  return Status::OK();
+} // namespace
+template <typename Int>
+Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message) {  // Public entry point: rejects bool, then dispatches to the signed/unsigned implementation.
+ if (PyBool_Check(obj)) {  // bools are refused explicitly rather than being converted as integers
+ return Status::TypeError("Expected integer, got bool");
+ }
+ return CIntFromPythonImpl(obj, out, overflow_message);
+template Status CIntFromPython(PyObject*, int8_t*, const std::string&);  // explicit instantiations for the fixed-width integer types
+template Status CIntFromPython(PyObject*, int16_t*, const std::string&);
+template Status CIntFromPython(PyObject*, int32_t*, const std::string&);
+template Status CIntFromPython(PyObject*, int64_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint8_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint16_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint32_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint64_t*, const std::string&);
+// Cheap pre-filter for null detection: returns false when the object's type
+// carries one of the core-type subclass flags below, true otherwise.
+inline bool MayHaveNaN(PyObject* obj) {
+  // Some core types can be very quickly type-checked and do not allow NaN values
+  const int64_t non_nan_tpflags = Py_TPFLAGS_LONG_SUBCLASS | Py_TPFLAGS_LIST_SUBCLASS |
+                                  Py_TPFLAGS_TUPLE_SUBCLASS | Py_TPFLAGS_BYTES_SUBCLASS |
+                                  Py_TPFLAGS_UNICODE_SUBCLASS | Py_TPFLAGS_DICT_SUBCLASS |
+                                  Py_TPFLAGS_BASE_EXC_SUBCLASS | Py_TPFLAGS_TYPE_SUBCLASS;
+  return !PyType_HasFeature(Py_TYPE(obj), non_nan_tpflags);
+// True only for Python float objects whose value is NaN.
+bool PyFloat_IsNaN(PyObject* obj) {
+  if (!PyFloat_Check(obj)) {
+    return false;
+  }
+  return std::isnan(PyFloat_AsDouble(obj));
+namespace {
+static bool pandas_static_initialized = false;  // set to true at the end of InitPandasStaticData()
+// Once initialized, these variables hold borrowed references to Pandas static data.
+// We should not use OwnedRef here because Python destructors would be
+// called on a finalized interpreter.
+static PyObject* pandas_NA = nullptr;  // pandas.NA, left null when the attribute lookup fails
+static PyObject* pandas_NaT = nullptr;  // pandas.NaT
+static PyObject* pandas_Timedelta = nullptr;  // pandas.Timedelta
+static PyObject* pandas_Timestamp = nullptr;  // pandas.Timestamp
+static PyTypeObject* pandas_NaTType = nullptr;  // Py_TYPE of pandas.NaT
+} // namespace
+void InitPandasStaticData() {  // Caches pointers to pandas.NaT, Timedelta, Timestamp and NA in the file-level statics; does nothing if pandas cannot be imported.
+ // NOTE: This is called with the GIL held. We needn't (and shouldn't,
+ // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
+ if (pandas_static_initialized) {
+ return;
+ }
+ OwnedRef pandas;
+ // Import pandas
+ Status s = ImportModule("pandas", &pandas);
+ if (!s.ok()) {
+ return;  // leaves the statics null and the initialized flag false, so a later call tries again
+ }
+ // Since ImportModule can release the GIL, another thread could have
+ // already initialized the static data.
+ if (pandas_static_initialized) {
+ return;
+ }
+ OwnedRef ref;  // reused for every lookup below; only the raw pointers are kept in the statics
+ // set NaT sentinel and its type
+ if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
+ pandas_NaT = ref.obj();
+ // Py_TYPE yields a borrowed pointer; we trust that pandas.NaT (and its type)
+ // will outlive our use of this pointer
+ pandas_NaTType = Py_TYPE(ref.obj());
+ }
+ // retain a reference to Timedelta
+ if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) {
+ pandas_Timedelta = ref.obj();
+ }
+ // retain a reference to Timestamp
+ if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) {
+ pandas_Timestamp = ref.obj();
+ }
+ // if pandas.NA exists, retain a reference to it
+ if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
+ pandas_NA = ref.obj();
+ }
+ pandas_static_initialized = true;  // set even if some individual lookups failed; those statics stay null
+// Decides whether `obj` counts as a missing value: None, a float NaN,
+// pandas.NA, an instance of the NaT type, or a NaN Decimal.
+bool PandasObjectIsNull(PyObject* obj) {
+  // Fast path for types that MayHaveNaN() rules out.
+  if (!MayHaveNaN(obj)) {
+    return false;
+  }
+  if (obj == Py_None) {
+    return true;
+  }
+  // The individual checks short-circuit on the first match.
+  const bool is_missing =
+      PyFloat_IsNaN(obj) || (pandas_NA && obj == pandas_NA) ||
+      (pandas_NaTType && PyObject_TypeCheck(obj, pandas_NaTType)) ||
+      (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj));
+  return is_missing;
+bool IsPandasTimedelta(PyObject* obj) {  // False when the pandas.Timedelta pointer has not been cached.
+ return pandas_Timedelta && PyObject_IsInstance(obj, pandas_Timedelta);  // NOTE(review): any non-zero result converts to true here, including an error return — confirm intended
+bool IsPandasTimestamp(PyObject* obj) {  // False when the pandas.Timestamp pointer has not been cached.
+ return pandas_Timestamp && PyObject_IsInstance(obj, pandas_Timestamp);  // NOTE(review): any non-zero result converts to true here, including an error return — confirm intended
+Status InvalidValue(PyObject* obj, const std::string& why) {  // Builds an Invalid status naming the object's repr(), its type name and `why`.
+ auto obj_as_str = PyObject_StdStringRepr(obj);
+ return Status::Invalid("Could not convert ", std::move(obj_as_str), " with type ",
+ Py_TYPE(obj)->tp_name, ": ", why);
+Status InvalidType(PyObject* obj, const std::string& why) {  // Same message as InvalidValue(), but returned as a TypeError status.
+ auto obj_as_str = PyObject_StdStringRepr(obj);
+ return Status::TypeError("Could not convert ", std::move(obj_as_str), " with type ",
+ Py_TYPE(obj)->tp_name, ": ", why);
+Status UnboxIntegerAsInt64(PyObject* obj, int64_t* out) {  // Reads a Python int or a NumPy integer scalar into *out; other objects yield an Invalid status.
+ if (PyLong_Check(obj)) {
+ int overflow = 0;
+ *out = PyLong_AsLongLongAndOverflow(obj, &overflow);
+ if (overflow) {  // non-zero when the Python int does not fit in a long long
+ return Status::Invalid("PyLong is too large to fit int64");
+ }
+ } else if (PyArray_IsScalar(obj, Byte)) {  // NumPy scalars: read the stored value directly from the scalar struct
+ *out = reinterpret_cast<PyByteScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, UByte)) {
+ *out = reinterpret_cast<PyUByteScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, Short)) {
+ *out = reinterpret_cast<PyShortScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, UShort)) {
+ *out = reinterpret_cast<PyUShortScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, Int)) {
+ *out = reinterpret_cast<PyIntScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, UInt)) {
+ *out = reinterpret_cast<PyUIntScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, Long)) {
+ *out = reinterpret_cast<PyLongScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, ULong)) {
+ *out = reinterpret_cast<PyULongScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, LongLong)) {
+ *out = reinterpret_cast<PyLongLongScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, Int64)) {
+ *out = reinterpret_cast<PyInt64ScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, ULongLong)) {  // NOTE(review): unsigned 64-bit values are assigned to int64_t without a range check — confirm callers tolerate this
+ *out = reinterpret_cast<PyULongLongScalarObject*>(obj)->obval;
+ } else if (PyArray_IsScalar(obj, UInt64)) {
+ *out = reinterpret_cast<PyUInt64ScalarObject*>(obj)->obval;
+ } else {
+ return Status::Invalid("Integer scalar type not recognized");
+ }
+ return Status::OK();
+Status IntegerScalarToDoubleSafe(PyObject* obj, double* out) {  // Converts an integer object to double, refusing magnitudes above 2^53.
+ int64_t value = 0;
+ RETURN_NOT_OK(UnboxIntegerAsInt64(obj, &value));
+ constexpr int64_t kDoubleMax = 1LL << 53;  // 2^53
+ constexpr int64_t kDoubleMin = -(1LL << 53);  // -2^53
+ if (value < kDoubleMin || value > kDoubleMax) {  // both bounds are inclusive: +/-2^53 themselves are accepted
+ return Status::Invalid("Integer value ", value, " is outside of the range exactly",
+ " representable by a IEEE 754 double precision value");
+ }
+ *out = static_cast<double>(value);
+ return Status::OK();
+Status IntegerScalarToFloat32Safe(PyObject* obj, float* out) {  // Converts an integer object to float, refusing magnitudes above 2^24.
+ int64_t value = 0;
+ RETURN_NOT_OK(UnboxIntegerAsInt64(obj, &value));
+ constexpr int64_t kFloatMax = 1LL << 24;  // 2^24
+ constexpr int64_t kFloatMin = -(1LL << 24);  // -2^24
+ if (value < kFloatMin || value > kFloatMax) {  // both bounds are inclusive: +/-2^24 themselves are accepted
+ return Status::Invalid("Integer value ", value, " is outside of the range exactly",
+ " representable by a IEEE 754 single precision value");
+ }
+ *out = static_cast<float>(value);
+ return Status::OK();
+void DebugPrint(PyObject* obj) {  // Writes repr(obj) followed by a newline to Python's stderr.
+ std::string repr = PyObject_StdStringRepr(obj);
+ PySys_WriteStderr("%s\n", repr.c_str());  // NOTE(review): PySys_WriteStderr may truncate long output — confirm against the CPython docs if full reprs matter
+} // namespace internal
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/helpers.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/helpers.h
new file mode 100644
index 0000000000..19288756c0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/helpers.h
@@ -0,0 +1,156 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include "arrow/python/platform.h"
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include "arrow/python/numpy_interop.h"
+#include <numpy/halffloat.h>
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+namespace arrow {
+namespace py {
+class OwnedRef;
+// \brief Get an arrow DataType instance from Arrow's Type::type enum
+// \param[in] type One of the values of Arrow's Type::type enum
+// \return A shared pointer to DataType
+ARROW_PYTHON_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
+// \brief Construct a np.float16 object from a npy_half value.
+ARROW_PYTHON_EXPORT PyObject* PyHalf_FromHalf(npy_half value);
+// \brief Convert a Python object to a npy_half value.
+ARROW_PYTHON_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out);
+namespace internal {
+// \brief Check that a Python module has been already imported
+// \param[in] module_name The name of the module
+Result<bool> IsModuleImported(const std::string& module_name);
+// \brief Import a Python module
+// \param[in] module_name The name of the module
+// \param[out] ref The OwnedRef containing the module PyObject*
+Status ImportModule(const std::string& module_name, OwnedRef* ref);
+// \brief Import an object from a Python module
+// \param[in] module A Python module
+// \param[in] name The name of the object to import
+// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c
+// module
+Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref);
+// \brief Check whether obj is an integer, independent of Python versions.
+inline bool IsPyInteger(PyObject* obj) { return PyLong_Check(obj); }  // thin wrapper over PyLong_Check
+// \brief Import symbols from pandas that we need for various type-checking,
+// like pandas.NaT or pandas.NA
+void InitPandasStaticData();
+// \brief Use pandas missing value semantics to check if a value is null
+bool PandasObjectIsNull(PyObject* obj);
+// \brief Check that obj is a pandas.Timedelta instance
+bool IsPandasTimedelta(PyObject* obj);
+// \brief Check that obj is a pandas.Timestamp instance
+bool IsPandasTimestamp(PyObject* obj);
+// \brief Check whether obj is a floating-point NaN
+bool PyFloat_IsNaN(PyObject* obj);
+inline bool IsPyBinary(PyObject* obj) {  // True for bytes, bytearray and memoryview objects.
+ return PyBytes_Check(obj) || PyByteArray_Check(obj) || PyMemoryView_Check(obj);
+// \brief Convert a Python integer into a C integer
+// \param[in] obj A Python integer
+// \param[out] out A pointer to a C integer to hold the result of the conversion
+// \return The status of the operation
+template <typename Int>
+Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = "");
+// \brief Convert a Python unicode string to a std::string
+Status PyUnicode_AsStdString(PyObject* obj, std::string* out);
+// \brief Convert a Python bytes object to a std::string
+std::string PyBytes_AsStdString(PyObject* obj);
+// \brief Call str() on the given object and return the result as a std::string
+Status PyObject_StdStringStr(PyObject* obj, std::string* out);
+// \brief Return the repr() of the given object (always succeeds)
+std::string PyObject_StdStringRepr(PyObject* obj);
+// \brief Cast the given size to int32_t, with error checking
+inline Status CastSize(Py_ssize_t size, int32_t* out,
+ const char* error_msg = "Maximum size exceeded (2GB)") {  // error_msg becomes the text of the Invalid status on overflow
+ // size is assumed to be positive
+ if (size > std::numeric_limits<int32_t>::max()) {  // only the upper bound is checked, per the assumption above
+ return Status::Invalid(error_msg);
+ }
+ *out = static_cast<int32_t>(size);
+ return Status::OK();
+inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NULLPTR) {  // int64_t overload: always returns OK; error_msg is unused in this body
+ // size is assumed to be positive
+ *out = static_cast<int64_t>(size);
+ return Status::OK();
+// \brief Build an error Status whose message contains the Python object's
+// repr(), its type name, and the passed error message
+Status InvalidValue(PyObject* obj, const std::string& why);
+Status InvalidType(PyObject* obj, const std::string& why);
+Status IntegerScalarToDoubleSafe(PyObject* obj, double* result);
+Status IntegerScalarToFloat32Safe(PyObject* obj, float* result);
+// \brief Print Python object __repr__
+void DebugPrint(PyObject* obj);
+} // namespace internal
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/inference.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/inference.cc
new file mode 100644
index 0000000000..5086815f84
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/inference.cc
@@ -0,0 +1,660 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/inference.h"
+#include "arrow/python/numpy_interop.h"
+#include <datetime.h>
+#include <algorithm>
+#include <limits>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+#include "arrow/status.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+#include "arrow/python/datetime.h"
+#include "arrow/python/decimal.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/iterators.h"
+#include "arrow/python/numpy_convert.h"
+namespace arrow {
+namespace py {
+ case NPY_##DTYPE: \
+ return OK;
+ case NPY_##DTYPE: \
+ current_type_num_ = dtype; \
+ current_dtype_ = descr; \
+ return OK;
+ case NPY_##DTYPE: \
+ current_type_num_ = NPY_##NEW_TYPE; \
+ current_dtype_ = PyArray_DescrFromType(current_type_num_); \
+ return OK;
+// Form a consensus NumPy dtype to use for Arrow conversion for a
+// collection of dtype objects observed one at a time
+class NumPyDtypeUnifier {
+ public:
+ // Outcome of comparing a newly observed dtype with the one seen so far
+ enum Action { OK, INVALID };
+ // Starts with no dtype observed: type number -1 and a null descriptor
+ NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(nullptr) {}
+ // Build the Invalid status naming both the dtype observed so far and the
+ // incoming new_dtype that could not be combined with it
+ Status InvalidMix(int new_dtype) {
+ return Status::Invalid("Cannot mix NumPy dtypes ",
+ GetNumPyTypeName(current_type_num_), " and ",
+ GetNumPyTypeName(new_dtype));
+ }
+ // The Observe_<TYPE> helpers below each return an Action for an incoming
+ // dtype. They appear to be selected by the *current* type number (see the
+ // switch in Observe()) -- TODO confirm.
+ // NOTE(review): in this view every switch only has its default branch, so
+ // each helper returns INVALID for any input; check whether case labels
+ // were lost from these bodies.
+ int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; }
+ int Observe_INT8(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ int Observe_INT16(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ int Observe_INT32(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ int Observe_INT64(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ int Observe_UINT8(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ int Observe_UINT16(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ int Observe_UINT32(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ int Observe_UINT64(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ int Observe_FLOAT16(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ int Observe_FLOAT32(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ int Observe_FLOAT64(PyArray_Descr* descr, int dtype) {
+ switch (dtype) {
+ default:
+ return INVALID;
+ }
+ }
+ // Always accepts the incoming descriptor; the argument is not inspected
+ int Observe_DATETIME(PyArray_Descr* dtype_obj) {
+ // TODO: check that units are all the same
+ return OK;
+ }
+ // Fold one more dtype descriptor into the running consensus.
+ // Returns OK when the descriptor is the first one seen or has the same
+ // (fixed-up) type number as the current one; otherwise the result depends
+ // on the per-type helper chosen by the switch below.
+ Status Observe(PyArray_Descr* descr) {
+ int dtype = fix_numpy_type_num(descr->type_num);
+ if (current_type_num_ == -1) {
+ // First observation: adopt this descriptor as the current dtype
+ current_dtype_ = descr;
+ current_type_num_ = dtype;
+ return Status::OK();
+ } else if (current_type_num_ == dtype) {
+ // Same type number as before: nothing to reconcile
+ return Status::OK();
+ }
+ // NOTE(review): the three lines below look like the body of a helper macro
+ // whose #define line is not visible in this view -- confirm.
+ case NPY_##DTYPE: \
+ action = Observe_##DTYPE(descr, dtype); \
+ break;
+ int action = OK;
+ // Dispatch on the dtype observed so far, not on the incoming one
+ switch (current_type_num_) {
+ // NOTE(review): no case label is visible before this statement
+ action = Observe_DATETIME(descr);
+ break;
+ default:
+ return Status::NotImplemented("Unsupported numpy type ", GetNumPyTypeName(dtype));
+ }
+ if (action == INVALID) {
+ return InvalidMix(dtype);
+ }
+ return Status::OK();
+ }
+ // True once Observe() has recorded at least one dtype
+ bool dtype_was_observed() const { return current_type_num_ != -1; }
+ // Descriptor currently held as the consensus (null before any observation)
+ PyArray_Descr* current_dtype() const { return current_dtype_; }
+ // Type number currently held as the consensus (-1 before any observation)
+ int current_type_num() const { return current_type_num_; }
+ private:
+ int current_type_num_;
+ // Raw pointer; no reference counting on it is visible in this class
+ PyArray_Descr* current_dtype_;
+class TypeInferrer {
+ // A type inference visitor for Python values
+ public:
+ // \param validate_interval the number of elements to observe before checking
+ // whether the data is mixed type or has other problems. This helps avoid
+ // excess computation for each element while also making sure we "bail out"
+ // early with long sequences that may have problems up front
+ // \param make_unions permit mixed-type data by creating union types (not yet
+ // implemented)
+ explicit TypeInferrer(bool pandas_null_sentinels = false,
+ int64_t validate_interval = 100, bool make_unions = false)
+ : pandas_null_sentinels_(pandas_null_sentinels),
+ validate_interval_(validate_interval),
+ make_unions_(make_unions),
+ total_count_(0),
+ none_count_(0),
+ bool_count_(0),
+ int_count_(0),
+ date_count_(0),
+ time_count_(0),
+ timestamp_micro_count_(0),
+ duration_count_(0),
+ float_count_(0),
+ binary_count_(0),
+ unicode_count_(0),
+ decimal_count_(0),
+ list_count_(0),
+ struct_count_(0),
+ numpy_dtype_count_(0),
+ max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
+ std::numeric_limits<int32_t>::min()),
+ decimal_type_() {
+ ARROW_CHECK_OK(internal::ImportDecimalType(&decimal_type_));
+ }
+ /// \param[in] obj a Python object in the sequence
+ /// \param[out] keep_going if sufficient information has been gathered to
+ /// attempt to begin converting the sequence, *keep_going will be set to false
+ /// to signal to the calling visitor loop to terminate
+ Status Visit(PyObject* obj, bool* keep_going) {
+ // Classify one value and bump the matching counter. The order of the
+ // checks below matters; do not reorder them without care.
+ ++total_count_;
+ if (obj == Py_None || (pandas_null_sentinels_ && internal::PandasObjectIsNull(obj))) {
+ // Nulls are counted but never decide the type, so *keep_going is untouched
+ ++none_count_;
+ } else if (PyBool_Check(obj)) {
+ // Presumably tested ahead of the integer branch because Python bools are
+ // also ints -- confirm against IsPyInteger
+ ++bool_count_;
+ // make_unions_ == false means one value of this kind is enough to stop
+ *keep_going = make_unions_;
+ } else if (PyFloat_Check(obj)) {
+ ++float_count_;
+ *keep_going = make_unions_;
+ } else if (internal::IsPyInteger(obj)) {
+ // Integers do not touch *keep_going
+ ++int_count_;
+ } else if (PyDateTime_Check(obj)) {
+ // datetime.datetime is a subclass of datetime.date in Python, so this
+ // branch has to come before the PyDate_Check one
+ // infer timezone from the first encountered datetime object
+ if (!timestamp_micro_count_) {
+ OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo"));
+ if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None) {
+ ARROW_ASSIGN_OR_RAISE(timezone_, internal::TzinfoToString(tzinfo.obj()));
+ }
+ }
+ ++timestamp_micro_count_;
+ *keep_going = make_unions_;
+ } else if (PyDelta_Check(obj)) {
+ ++duration_count_;
+ *keep_going = make_unions_;
+ } else if (PyDate_Check(obj)) {
+ ++date_count_;
+ *keep_going = make_unions_;
+ } else if (PyTime_Check(obj)) {
+ ++time_count_;
+ *keep_going = make_unions_;
+ } else if (internal::IsPyBinary(obj)) {
+ ++binary_count_;
+ *keep_going = make_unions_;
+ } else if (PyUnicode_Check(obj)) {
+ ++unicode_count_;
+ *keep_going = make_unions_;
+ } else if (PyArray_CheckAnyScalarExact(obj)) {
+ // NumPy scalar: hand its dtype descriptor to the dtype unifier
+ RETURN_NOT_OK(VisitDType(PyArray_DescrFromScalar(obj), keep_going));
+ } else if (PyList_Check(obj)) {
+ RETURN_NOT_OK(VisitList(obj, keep_going));
+ } else if (PyArray_Check(obj)) {
+ RETURN_NOT_OK(VisitNdarray(obj, keep_going));
+ } else if (PyDict_Check(obj)) {
+ RETURN_NOT_OK(VisitDict(obj));
+ } else if (PyObject_IsInstance(obj, decimal_type_.obj())) {
+ // Record this decimal in the running precision/scale metadata
+ RETURN_NOT_OK(max_decimal_metadata_.Update(obj));
+ ++decimal_count_;
+ } else {
+ return internal::InvalidValue(obj,
+ "did not recognize Python value type when inferring "
+ "an Arrow data type");
+ }
+ // Only run the mixed-type checks every validate_interval_ values
+ if (total_count_ % validate_interval_ == 0) {
+ RETURN_NOT_OK(Validate());
+ }
+ return Status::OK();
+ }
+ // Infer value type from a sequence of values
+ Status VisitSequence(PyObject* obj, PyObject* mask = nullptr) {
+   // A null pointer and Python None both mean "no mask"
+   const bool has_mask = (mask != nullptr) && (mask != Py_None);
+   if (has_mask) {
+     // Masked entries are skipped: Visit() is not called for them
+     return internal::VisitSequenceMasked(
+         obj, mask, /*offset=*/0,
+         [this](PyObject* value, uint8_t masked, bool* keep_going) {
+           return masked ? Status::OK() : Visit(value, keep_going);
+         });
+   }
+   return internal::VisitSequence(
+       obj, /*offset=*/0,
+       [this](PyObject* value, bool* keep_going) { return Visit(value, keep_going); });
+ }
+ // Turn the counters gathered by Visit() into a single Arrow type, written
+ // to *out. Runs Validate() first and returns its error, if any.
+ Status GetType(std::shared_ptr<DataType>* out) {
+ // TODO(wesm): handling forming unions
+ if (make_unions_) {
+ return Status::NotImplemented("Creating union types not yet supported");
+ }
+ RETURN_NOT_OK(Validate());
+ if (numpy_dtype_count_ > 0) {
+ // All NumPy scalars and Nones/nulls
+ if (numpy_dtype_count_ + none_count_ == total_count_) {
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), &type));
+ *out = type;
+ return Status::OK();
+ }
+ // The "bad path": data contains a mix of NumPy scalars and
+ // other kinds of scalars. Note this can happen innocuously
+ // because numpy.nan is not a NumPy scalar (it's a built-in
+ // PyFloat)
+ // TODO(ARROW-5564): Merge together type unification so this
+ // hack is not necessary
+ // Fold the NumPy scalar count into the matching plain-Python counter so
+ // the priority chain further down can pick the type
+ switch (numpy_unifier_.current_type_num()) {
+ case NPY_BOOL:
+ bool_count_ += numpy_dtype_count_;
+ break;
+ case NPY_INT8:
+ case NPY_INT16:
+ case NPY_INT32:
+ case NPY_INT64:
+ case NPY_UINT8:
+ case NPY_UINT16:
+ case NPY_UINT32:
+ case NPY_UINT64:
+ int_count_ += numpy_dtype_count_;
+ break;
+ case NPY_FLOAT32:
+ case NPY_FLOAT64:
+ float_count_ += numpy_dtype_count_;
+ break;
+ // NOTE(review): no case label is visible in front of this return; as
+ // shown it is unreachable after the break above -- confirm
+ return Status::Invalid(
+ "numpy.datetime64 scalars cannot be mixed "
+ "with other Python scalar values currently");
+ }
+ }
+ // First non-zero counter in this chain wins
+ if (list_count_) {
+ std::shared_ptr<DataType> value_type;
+ RETURN_NOT_OK(list_inferrer_->GetType(&value_type));
+ *out = list(value_type);
+ } else if (struct_count_) {
+ RETURN_NOT_OK(GetStructType(out));
+ } else if (decimal_count_) {
+ // Precision beyond what Decimal128 supports selects Decimal256
+ // NOTE(review): the opening of the statements that wrap the two Make(...)
+ // calls below is not visible here (note the unbalanced parentheses)
+ if (max_decimal_metadata_.precision() > Decimal128Type::kMaxPrecision) {
+ // the default constructor does not validate the precision and scale
+ Decimal256Type::Make(max_decimal_metadata_.precision(),
+ max_decimal_metadata_.scale()));
+ } else {
+ Decimal128Type::Make(max_decimal_metadata_.precision(),
+ max_decimal_metadata_.scale()));
+ }
+ } else if (float_count_) {
+ // Prioritize floats before integers
+ *out = float64();
+ } else if (int_count_) {
+ *out = int64();
+ } else if (date_count_) {
+ *out = date32();
+ } else if (time_count_) {
+ *out = time64(TimeUnit::MICRO);
+ } else if (timestamp_micro_count_) {
+ // timezone_ was captured from the first datetime seen by Visit()
+ *out = timestamp(TimeUnit::MICRO, timezone_);
+ } else if (duration_count_) {
+ *out = duration(TimeUnit::MICRO);
+ } else if (bool_count_) {
+ *out = boolean();
+ } else if (binary_count_) {
+ *out = binary();
+ } else if (unicode_count_) {
+ *out = utf8();
+ } else {
+ // No typed value was counted at all
+ *out = null();
+ }
+ return Status::OK();
+ }
+ // Number of values counted so far (Visit() increments it once per call)
+ int64_t total_count() const { return total_count_; }
+ protected:
+ Status Validate() const {
+   // Lists and structs may only be mixed with nulls, so their count has to
+   // equal the number of non-null values seen
+   const int64_t non_null_count = total_count_ - none_count_;
+   const bool saw_lists = list_count_ > 0;
+   const bool saw_structs = struct_count_ > 0;
+   if (saw_lists) {
+     if (list_count_ != non_null_count) {
+       return Status::Invalid("cannot mix list and non-list, non-null values");
+     }
+     RETURN_NOT_OK(list_inferrer_->Validate());
+     return Status::OK();
+   }
+   if (saw_structs) {
+     if (struct_count_ != non_null_count) {
+       return Status::Invalid("cannot mix struct and non-struct, non-null values");
+     }
+     // Each field's inferrer has to be consistent as well
+     for (const auto& entry : struct_inferrers_) {
+       RETURN_NOT_OK(entry.second.Validate());
+     }
+   }
+   return Status::OK();
+ }
+ Status VisitDType(PyArray_Descr* dtype, bool* keep_going) {
+   // NumPy dtypes never end the scan early: always ask the caller to continue.
+   // TODO(wesm): devise approach for unions
+   *keep_going = true;
+   numpy_dtype_count_ += 1;
+   // The unifier decides whether this dtype is compatible with earlier ones
+   return numpy_unifier_.Observe(dtype);
+ }
+ Status VisitList(PyObject* obj, bool* keep_going /* unused */) {
+   // One child inferrer is shared by every list value; create it on first use
+   // with the same settings as this inferrer
+   if (list_inferrer_ == nullptr) {
+     list_inferrer_.reset(
+         new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
+   }
+   list_count_ += 1;
+   // The elements of obj feed the child inferrer
+   return list_inferrer_->VisitSequence(obj);
+ }
+ Status VisitNdarray(PyObject* obj, bool* keep_going) {
+   PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(obj);
+   PyArray_Descr* descr = PyArray_DESCR(ndarray);
+   const bool is_object_array = (descr->type_num == NPY_OBJECT);
+   if (is_object_array) {
+     // Object arrays are inspected element by element, like a list
+     return VisitList(obj, keep_going);
+   }
+   // Not an object array: infer child Arrow type from dtype
+   if (list_inferrer_ == nullptr) {
+     list_inferrer_.reset(
+         new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
+   }
+   TypeInferrer& child = *list_inferrer_;
+   list_count_ += 1;
+   // XXX(wesm): In ARROW-4324 I added accounting to check whether
+   // all of the non-null values have NumPy dtypes, but the
+   // total_count was not being properly incremented here
+   ++child.total_count_;
+   return child.VisitDType(descr, keep_going);
+ }
+ Status VisitDict(PyObject* obj) {
+   PyObject* dict_key;
+   PyObject* dict_value;
+   Py_ssize_t cursor = 0;
+   while (PyDict_Next(obj, &cursor, &dict_key, &dict_value)) {
+     // Field names must be str or bytes; anything else is a type error
+     std::string field_name;
+     if (PyUnicode_Check(dict_key)) {
+       RETURN_NOT_OK(internal::PyUnicode_AsStdString(dict_key, &field_name));
+     } else if (PyBytes_Check(dict_key)) {
+       field_name = internal::PyBytes_AsStdString(dict_key);
+     } else {
+       return Status::TypeError("Expected dict key of type str or bytes, got '",
+                                Py_TYPE(dict_key)->tp_name, "'");
+     }
+     // Look up the inferrer for this field, creating it on first sight
+     auto found = struct_inferrers_.find(field_name);
+     if (found == struct_inferrers_.end()) {
+       TypeInferrer child(pandas_null_sentinels_, validate_interval_, make_unions_);
+       found =
+           struct_inferrers_.insert(std::make_pair(field_name, std::move(child))).first;
+     }
+     // We ignore termination signals from child visitors for now
+     //
+     // TODO(wesm): keep track of whether type inference has terminated for
+     // the child visitors to avoid doing unneeded work
+     bool child_keep_going = true;
+     RETURN_NOT_OK(found->second.Visit(dict_value, &child_keep_going));
+   }
+   // We do not terminate visiting dicts since we want the union of all
+   // observed keys
+   struct_count_ += 1;
+   return Status::OK();
+ }
+ Status GetStructType(std::shared_ptr<DataType>* out) {
+   // One field per observed dict key, in the map's iteration order
+   std::vector<std::shared_ptr<Field>> struct_fields;
+   for (auto& entry : struct_inferrers_) {
+     std::shared_ptr<DataType> inferred;
+     RETURN_NOT_OK(entry.second.GetType(&inferred));
+     struct_fields.push_back(field(entry.first, inferred));
+   }
+   *out = struct_(struct_fields);
+   return Status::OK();
+ }
+ private:
+ bool pandas_null_sentinels_;
+ int64_t validate_interval_;
+ bool make_unions_;
+ int64_t total_count_;
+ int64_t none_count_;
+ int64_t bool_count_;
+ int64_t int_count_;
+ int64_t date_count_;
+ int64_t time_count_;
+ int64_t timestamp_micro_count_;
+ std::string timezone_;
+ int64_t duration_count_;
+ int64_t float_count_;
+ int64_t binary_count_;
+ int64_t unicode_count_;
+ int64_t decimal_count_;
+ int64_t list_count_;
+ int64_t struct_count_;
+ int64_t numpy_dtype_count_;
+ std::unique_ptr<TypeInferrer> list_inferrer_;
+ std::map<std::string, TypeInferrer> struct_inferrers_;
+ // If we observe a strongly-typed value in e.g. a NumPy array, we can store
+ // it here to skip the type counting logic above
+ NumPyDtypeUnifier numpy_unifier_;
+ internal::DecimalMetadata max_decimal_metadata_;
+ // Place to accumulate errors
+ // std::vector<Status> errors_;
+ OwnedRefNoGIL decimal_type_;
+// Non-exhaustive type inference
+Result<std::shared_ptr<DataType>> InferArrowType(PyObject* obj, PyObject* mask,
+                                                 bool pandas_null_sentinels) {
+  if (pandas_null_sentinels) {
+    // ARROW-842: If pandas is not installed then null checks will be less
+    // comprehensive, but that is okay.
+    internal::InitPandasStaticData();
+  }
+  // Scan the (optionally masked) sequence, then ask for the inferred type
+  TypeInferrer inferrer(pandas_null_sentinels);
+  RETURN_NOT_OK(inferrer.VisitSequence(obj, mask));
+  std::shared_ptr<DataType> inferred_type;
+  RETURN_NOT_OK(inferrer.GetType(&inferred_type));
+  if (inferred_type != nullptr) {
+    return std::move(inferred_type);
+  }
+  return Status::TypeError("Unable to determine data type");
+// Forwards to internal::PyBoolScalar_Check and returns its result
+bool IsPyBool(PyObject* obj) { return internal::PyBoolScalar_Check(obj); }
+// Forwards to internal::PyIntScalar_Check and returns its result
+bool IsPyInt(PyObject* obj) { return internal::PyIntScalar_Check(obj); }
+// Forwards to internal::PyFloatScalar_Check and returns its result
+bool IsPyFloat(PyObject* obj) { return internal::PyFloatScalar_Check(obj); }
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/inference.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/inference.h
new file mode 100644
index 0000000000..eff1836293
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/inference.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Functions for converting between CPython built-in data structures and Arrow
+// data structures
+#pragma once
+#include "arrow/python/platform.h"
+#include <memory>
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/python/common.h"
+namespace arrow {
+class Array;
+class Status;
+namespace py {
+// These functions take a sequence input, not arbitrary iterables
+/// \brief Infer Arrow type from a Python sequence
+/// \param[in] obj the sequence of values
+/// \param[in] mask an optional mask where True values are null. May
+/// be nullptr
+/// \param[in] pandas_null_sentinels use pandas's null value markers
+Result<std::shared_ptr<arrow::DataType>> InferArrowType(PyObject* obj, PyObject* mask,
+ bool pandas_null_sentinels);
+/// Checks whether the passed Python object is a boolean scalar
+bool IsPyBool(PyObject* obj);
+/// Checks whether the passed Python object is an integer scalar
+bool IsPyInt(PyObject* obj);
+/// Checks whether the passed Python object is a float scalar
+bool IsPyFloat(PyObject* obj);
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/init.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/init.cc
new file mode 100644
index 0000000000..dba293bbe2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/init.cc
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Trigger the array import (inversion of NO_IMPORT_ARRAY)
+#include "arrow/python/init.h"
+#include "arrow/python/numpy_interop.h"
+// Entry point that forwards to arrow::py::import_numpy() and returns its
+// result unchanged
+int arrow_init_numpy() { return arrow::py::import_numpy(); }
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/init.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/init.h
new file mode 100644
index 0000000000..2e6c954862
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/init.h
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include "arrow/python/platform.h"
+#include "arrow/python/visibility.h"
+extern "C" {
+int arrow_init_numpy();
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/io.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/io.cc
new file mode 100644
index 0000000000..73525feed3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/io.cc
@@ -0,0 +1,374 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/io.h"
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <mutex>
+#include <string>
+#include "arrow/io/memory.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+#include "arrow/python/common.h"
+#include "arrow/python/pyarrow.h"
+namespace arrow {
+using arrow::io::TransformInputStream;
+namespace py {
+// ----------------------------------------------------------------------
+// Python file
+// A common interface to a Python file-like object. Must acquire GIL before
+// calling any methods
+class PythonFile {
+ public:
+ // Wrap a Python file-like object.
+ // \param[in] file the Python object to wrap; a new reference to it is taken
+ explicit PythonFile(PyObject* file)
+     : file_(file), has_read_buffer_(false), checked_read_buffer_(false) {
+   // has_read_buffer_ is given a defined value even though HasReadBuffer()
+   // only trusts it once checked_read_buffer_ is set.
+   // Take our own reference for as long as file_ holds the pointer
+   // (presumably released when file_ is reset or destroyed -- TODO confirm)
+   Py_INCREF(file);
+ }
+ Status CheckClosed() const {
+   // file_ is emptied by Close() and Abort(); a live reference means "open"
+   if (file_) {
+     return Status::OK();
+   }
+   return Status::Invalid("operation on closed Python file");
+ }
+ Status Close() {
+   // Closing an already closed (or aborted) file is a no-op
+   if (!file_) {
+     return Status::OK();
+   }
+   PyObject* close_result = cpp_PyObject_CallMethod(file_.obj(), "close", "()");
+   Py_XDECREF(close_result);
+   // Drop our reference whether or not close() raised
+   file_.reset();
+   // Any Python error still pending is reported as an IOError
+   PY_RETURN_IF_ERROR(StatusCode::IOError);
+   return Status::OK();
+ }
+ // Drop the reference to the Python object without calling its close()
+ // method. Always returns OK.
+ Status Abort() {
+ file_.reset();
+ return Status::OK();
+ }
+ bool closed() const {
+   // No Python object left: Close() or Abort() already ran
+   if (!file_) {
+     return true;
+   }
+   PyObject* closed_attr = PyObject_GetAttrString(file_.obj(), "closed");
+   if (closed_attr == nullptr) {
+     // Can't propagate the error, so write it out and return an arbitrary value
+     PyErr_WriteUnraisable(nullptr);
+     return true;
+   }
+   const int truth = PyObject_IsTrue(closed_attr);
+   Py_XDECREF(closed_attr);
+   if (truth >= 0) {
+     return truth != 0;
+   }
+   // Truth test itself failed: same treatment as a failed attribute lookup
+   PyErr_WriteUnraisable(nullptr);
+   return true;
+ }
+ Status Seek(int64_t position, int whence) {
+   RETURN_NOT_OK(CheckClosed());
+   // whence: 0 for relative to start of file, 2 for end of file
+   const Py_ssize_t offset = static_cast<Py_ssize_t>(position);
+   PyObject* seek_result =
+       cpp_PyObject_CallMethod(file_.obj(), "seek", "(ni)", offset, whence);
+   // The value returned by seek() is not needed
+   Py_XDECREF(seek_result);
+   PY_RETURN_IF_ERROR(StatusCode::IOError);
+   return Status::OK();
+ }
+ Status Read(int64_t nbytes, PyObject** out) {
+   RETURN_NOT_OK(CheckClosed());
+   const Py_ssize_t count = static_cast<Py_ssize_t>(nbytes);
+   PyObject* data = cpp_PyObject_CallMethod(file_.obj(), "read", "(n)", count);
+   PY_RETURN_IF_ERROR(StatusCode::IOError);
+   // Whatever read() returned is handed to the caller through *out
+   *out = data;
+   return Status::OK();
+ }
+ // Call the Python object's read_buffer(nbytes) method.
+ // \param[in] nbytes number of bytes to request
+ // \param[out] out receives the object returned by read_buffer()
+ // \return Invalid if the file was closed, IOError if the call raised
+ Status ReadBuffer(int64_t nbytes, PyObject** out) {
+   // Same guard as Read(), Seek(), Write() and Tell(): once Close() or
+   // Abort() has reset file_, file_.obj() must not be passed to the call
+   RETURN_NOT_OK(CheckClosed());
+   PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(n)",
+                                              static_cast<Py_ssize_t>(nbytes));
+   PY_RETURN_IF_ERROR(StatusCode::IOError);
+   *out = result;
+   return Status::OK();
+ }
+ Status Write(const void* data, int64_t nbytes) {
+   RETURN_NOT_OK(CheckClosed());
+   // Since the data isn't owned, we have to make a copy
+   const char* raw = reinterpret_cast<const char*>(data);
+   PyObject* payload = PyBytes_FromStringAndSize(raw, nbytes);
+   PY_RETURN_IF_ERROR(StatusCode::IOError);
+   PyObject* write_result =
+       cpp_PyObject_CallMethod(file_.obj(), "write", "(O)", payload);
+   // Release both temporaries before looking at the Python error state
+   Py_XDECREF(payload);
+   Py_XDECREF(write_result);
+   PY_RETURN_IF_ERROR(StatusCode::IOError);
+   return Status::OK();
+ }
+ Status Write(const std::shared_ptr<Buffer>& buffer) {
+   RETURN_NOT_OK(CheckClosed());
+   // Pass the buffer to Python as a wrapped object
+   PyObject* wrapped = wrap_buffer(buffer);
+   PY_RETURN_IF_ERROR(StatusCode::IOError);
+   PyObject* write_result =
+       cpp_PyObject_CallMethod(file_.obj(), "write", "(O)", wrapped);
+   // Release both temporaries before looking at the Python error state
+   Py_XDECREF(wrapped);
+   Py_XDECREF(write_result);
+   PY_RETURN_IF_ERROR(StatusCode::IOError);
+   return Status::OK();
+ }
+ Result<int64_t> Tell() {
+   RETURN_NOT_OK(CheckClosed());
+   PyObject* tell_result = cpp_PyObject_CallMethod(file_.obj(), "tell", "()");
+   PY_RETURN_IF_ERROR(StatusCode::IOError);
+   int64_t offset = PyLong_AsLongLong(tell_result);
+   Py_DECREF(tell_result);
+   // PyLong_AsLongLong can raise OverflowError
+   PY_RETURN_IF_ERROR(StatusCode::IOError);
+   return offset;
+ }
+ // Mutex handed out so callers can serialize multi-step operations on this
+ // file; nothing inside this class locks it
+ std::mutex& lock() { return lock_; }
+ // Whether the wrapped Python object has a "read_buffer" attribute.
+ // The attribute lookup is done once and the answer is cached.
+ // \return false when the file has already been closed or aborted
+ bool HasReadBuffer() {
+   if (!file_) {
+     // Close()/Abort() reset file_, so there is no object to probe. Answer
+     // false without caching so the caller takes the Read() path, whose
+     // CheckClosed() reports the closed file as an error.
+     return false;
+   }
+   if (!checked_read_buffer_) {  // we don't want to check this each time
+     has_read_buffer_ = PyObject_HasAttrString(file_.obj(), "read_buffer") == 1;
+     checked_read_buffer_ = true;
+   }
+   return has_read_buffer_;
+ }
+ private:
+ std::mutex lock_;
+ OwnedRefNoGIL file_;
+ bool has_read_buffer_;
+ bool checked_read_buffer_;
+// ----------------------------------------------------------------------
+// Seekable input stream
+// Wrap `file` in a PythonFile, which is owned by this object
+PyReadableFile::PyReadableFile(PyObject* file) { file_.reset(new PythonFile(file)); }
+// The destructor does not close the underlying Python file object, as
+// there may be multiple references to it. Instead let the Python
+// destructor do its job. The body is intentionally empty.
+PyReadableFile::~PyReadableFile() {}
+// Forward to PythonFile::Abort() from inside SafeCallIntoPython
+Status PyReadableFile::Abort() {
+ return SafeCallIntoPython([this]() { return file_->Abort(); });
+// Forward to PythonFile::Close() from inside SafeCallIntoPython
+Status PyReadableFile::Close() {
+ return SafeCallIntoPython([this]() { return file_->Close(); });
+bool PyReadableFile::closed() const {
+  // The lambda stores PythonFile::closed() into is_closed and always reports
+  // OK, so the Status returned by SafeCallIntoPython is not inspected
+  bool is_closed;
+  Status ignored = SafeCallIntoPython([this, &is_closed]() {
+    is_closed = file_->closed();
+    return Status::OK();
+  });
+  return is_closed;
+// Seek to an absolute position: whence 0 means relative to the start of the
+// file
+Status PyReadableFile::Seek(int64_t position) {
+ return SafeCallIntoPython([=] { return file_->Seek(position, 0); });
+// Return the position reported by the Python object's tell() method
+Result<int64_t> PyReadableFile::Tell() const {
+ return SafeCallIntoPython([=]() -> Result<int64_t> { return file_->Tell(); });
+// Read up to nbytes from the Python file into the caller's buffer.
+// \param[in] nbytes number of bytes requested; `out` must have room for them
+// \param[out] out destination buffer
+// \return the number of bytes copied; TypeError if read() returned an object
+// without buffer support; IOError if read() returned more than nbytes
+Result<int64_t> PyReadableFile::Read(int64_t nbytes, void* out) {
+  return SafeCallIntoPython([=]() -> Result<int64_t> {
+    OwnedRef bytes;
+    RETURN_NOT_OK(file_->Read(nbytes, bytes.ref()));
+    PyObject* bytes_obj = bytes.obj();
+    DCHECK(bytes_obj != NULL);
+    Py_buffer py_buf;
+    if (PyObject_GetBuffer(bytes_obj, &py_buf, PyBUF_ANY_CONTIGUOUS) != 0) {
+      return Status::TypeError(
+          "Python file read() should have returned a bytes object or an object "
+          "supporting the buffer protocol, got '",
+          Py_TYPE(bytes_obj)->tp_name, "' (did you open the file in binary mode?)");
+    }
+    int64_t len = py_buf.len;
+    if (len > nbytes) {
+      // `out` is only sized for nbytes; a file object returning more than was
+      // asked for must not be copied past the end of the caller's buffer
+      PyBuffer_Release(&py_buf);
+      return Status::IOError("Python file read() returned ", len,
+                             " bytes but only ", nbytes, " were requested");
+    }
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(py_buf.buf);
+    std::memcpy(out, data, py_buf.len);
+    PyBuffer_Release(&py_buf);
+    return len;
+  });
+Result<std::shared_ptr<Buffer>> PyReadableFile::Read(int64_t nbytes) {
+  return SafeCallIntoPython([=]() -> Result<std::shared_ptr<Buffer>> {
+    OwnedRef py_result;
+    // Use read_buffer() when the Python object offers it, read() otherwise
+    const Status read_status = file_->HasReadBuffer()
+                                   ? file_->ReadBuffer(nbytes, py_result.ref())
+                                   : file_->Read(nbytes, py_result.ref());
+    RETURN_NOT_OK(read_status);
+    DCHECK(py_result.obj() != NULL);
+    // Expose the returned Python object as an Arrow buffer
+    return PyBuffer::FromPyObject(py_result.obj());
+  });
+// Seek to `position`, then read up to nbytes into `out`.
+// The file's mutex is held for the whole call so the seek and the read are
+// not interleaved with another ReadAt() on the same file.
+Result<int64_t> PyReadableFile::ReadAt(int64_t position, int64_t nbytes, void* out) {
+ // The mutex is taken before entering SafeCallIntoPython
+ std::lock_guard<std::mutex> guard(file_->lock());
+ return SafeCallIntoPython([=]() -> Result<int64_t> {
+ // Seek() and Read() are this class's own methods, which each go through
+ // SafeCallIntoPython again
+ RETURN_NOT_OK(Seek(position));
+ return Read(nbytes, out);
+ });
+// Seek to `position`, then read up to nbytes and return them as a Buffer.
+// The file's mutex is held for the whole call so the seek and the read are
+// not interleaved with another ReadAt() on the same file.
+Result<std::shared_ptr<Buffer>> PyReadableFile::ReadAt(int64_t position, int64_t nbytes) {
+ // The mutex is taken before entering SafeCallIntoPython
+ std::lock_guard<std::mutex> guard(file_->lock());
+ return SafeCallIntoPython([=]() -> Result<std::shared_ptr<Buffer>> {
+ RETURN_NOT_OK(Seek(position));
+ return Read(nbytes);
+ });
+// Determine the file size by seeking to the end and asking for the position,
+// then put the file position back where it was
+Result<int64_t> PyReadableFile::GetSize() {
+ return SafeCallIntoPython([=]() -> Result<int64_t> {
+ // Remember where we are so the position can be restored afterwards
+ ARROW_ASSIGN_OR_RAISE(int64_t current_position, file_->Tell());
+ // whence 2: seek relative to the end of the file
+ RETURN_NOT_OK(file_->Seek(0, 2));
+ ARROW_ASSIGN_OR_RAISE(int64_t file_size, file_->Tell());
+ // Restore previous file position
+ RETURN_NOT_OK(file_->Seek(current_position, 0));
+ return file_size;
+ });
+// ----------------------------------------------------------------------
+// Output stream
+// Wrap `file` in a PythonFile owned by this stream; the byte counter
+// reported by Tell() starts at 0
+PyOutputStream::PyOutputStream(PyObject* file) : position_(0) {
+ file_.reset(new PythonFile(file));
+// The destructor does not close the underlying Python file object, as
+// there may be multiple references to it. Instead let the Python
+// destructor do its job. The body is intentionally empty.
+PyOutputStream::~PyOutputStream() {}
+// Forward to PythonFile::Abort() from inside SafeCallIntoPython
+Status PyOutputStream::Abort() {
+ return SafeCallIntoPython([=]() { return file_->Abort(); });
+// Forward to PythonFile::Close() from inside SafeCallIntoPython
+Status PyOutputStream::Close() {
+ return SafeCallIntoPython([=]() { return file_->Close(); });
+bool PyOutputStream::closed() const {
+  // The lambda stores PythonFile::closed() into is_closed and always reports
+  // OK, so the Status returned by SafeCallIntoPython is not inspected
+  bool is_closed;
+  Status ignored = SafeCallIntoPython([this, &is_closed]() {
+    is_closed = file_->closed();
+    return Status::OK();
+  });
+  return is_closed;
+// Report the byte count accumulated by Write(); the Python object's tell()
+// is not consulted
+Result<int64_t> PyOutputStream::Tell() const { return position_; }
+// Write nbytes starting at `data` to the Python file.
+// NOTE(review): position_ is advanced before the write is attempted, so it
+// also moves forward when the write fails -- confirm this is intended.
+Status PyOutputStream::Write(const void* data, int64_t nbytes) {
+ return SafeCallIntoPython([=]() {
+ position_ += nbytes;
+ return file_->Write(data, nbytes);
+ });
+// Write the contents of `buffer` to the Python file.
+// NOTE(review): position_ is advanced by buffer->size() before the write is
+// attempted, so it also moves forward when the write fails -- confirm this
+// is intended.
+Status PyOutputStream::Write(const std::shared_ptr<Buffer>& buffer) {
+ return SafeCallIntoPython([=]() {
+ position_ += buffer->size();
+ return file_->Write(buffer);
+ });
+// ----------------------------------------------------------------------
+// Foreign buffer
+// Create a PyForeignBuffer for the given memory and hand it out as a
+// shared Buffer.
+// \param[in] data start of the memory region
+// \param[in] size size of the region
+// \param[in] base Python object passed through to the PyForeignBuffer
+// constructor
+// \param[out] out receives the new buffer
+// \return always OK
+Status PyForeignBuffer::Make(const uint8_t* data, int64_t size, PyObject* base,
+                             std::shared_ptr<Buffer>* out) {
+  // A plain new-expression never yields a null pointer (allocation failure
+  // is reported by throwing std::bad_alloc), so the former null check and
+  // its OutOfMemory branch could not be reached.
+  out->reset(new PyForeignBuffer(data, size, base));
+  return Status::OK();
+// ----------------------------------------------------------------------
+// TransformInputStream::TransformFunc wrapper
+struct TransformFunctionWrapper {
+  // Keeps the callback together with the Python object it is called with
+  TransformFunctionWrapper(TransformCallback cb, PyObject* arg)
+      : cb_(std::move(cb)), arg_(std::make_shared<OwnedRefNoGIL>(arg)) {
+    // Take a reference of our own to the Python object held by arg_
+    Py_INCREF(arg);
+  }
+  // Run the callback on `src` inside SafeCallIntoPython and return the
+  // buffer it produced, or the pending Python error as a Status
+  Result<std::shared_ptr<Buffer>> operator()(const std::shared_ptr<Buffer>& src) {
+    auto invoke = [=]() -> Result<std::shared_ptr<Buffer>> {
+      std::shared_ptr<Buffer> transformed;
+      cb_(arg_->obj(), src, &transformed);
+      RETURN_NOT_OK(CheckPyError());
+      return transformed;
+    };
+    return SafeCallIntoPython(std::move(invoke));
+  }
+ protected:
+  // Need to wrap OwnedRefNoGIL because std::function needs the callable
+  // to be copy-constructible...
+  TransformCallback cb_;
+  std::shared_ptr<OwnedRefNoGIL> arg_;
+std::shared_ptr<::arrow::io::InputStream> MakeTransformInputStream(
+    std::shared_ptr<::arrow::io::InputStream> wrapped, TransformInputStreamVTable vtable,
+    PyObject* handler) {
+  // Pair the vtable's transform callback with the Python handler, then wrap
+  // the pair as the transform function of a TransformInputStream
+  TransformFunctionWrapper wrapper{std::move(vtable.transform), handler};
+  TransformInputStream::TransformFunc transform_fn(std::move(wrapper));
+  return std::make_shared<TransformInputStream>(std::move(wrapped),
+                                                std::move(transform_fn));
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/io.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/io.h
new file mode 100644
index 0000000000..a38d0ca332
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/io.h
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <memory>
+#include "arrow/io/interfaces.h"
+#include "arrow/io/transform.h"
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+namespace arrow {
+namespace py {
+class ARROW_NO_EXPORT PythonFile;
+class ARROW_PYTHON_EXPORT PyReadableFile : public io::RandomAccessFile {  // io::RandomAccessFile implementation constructed from a Python file object.
+ public:
+ explicit PyReadableFile(PyObject* file);
+ ~PyReadableFile() override;
+ Status Close() override;
+ Status Abort() override;
+ bool closed() const override;
+ Result<int64_t> Read(int64_t nbytes, void* out) override;  // presumably returns the number of bytes read into `out` — confirm in io.cc
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+ // Thread-safe version
+ Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
+ // Thread-safe version
+ Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) override;
+ Result<int64_t> GetSize() override;
+ Status Seek(int64_t position) override;
+ Result<int64_t> Tell() const override;
+ private:
+ std::unique_ptr<PythonFile> file_;  // PythonFile is only forward-declared in this header.
+class ARROW_PYTHON_EXPORT PyOutputStream : public io::OutputStream {  // io::OutputStream implementation constructed from a Python file object.
+ public:
+ explicit PyOutputStream(PyObject* file);
+ ~PyOutputStream() override;
+ Status Close() override;
+ Status Abort() override;
+ bool closed() const override;
+ Result<int64_t> Tell() const override;  // presumably reports position_ — confirm in io.cc
+ Status Write(const void* data, int64_t nbytes) override;
+ Status Write(const std::shared_ptr<Buffer>& buffer) override;  // Advances position_ by buffer->size() and forwards the buffer to file_.
+ private:
+ std::unique_ptr<PythonFile> file_;  // PythonFile is only forward-declared in this header.
+ int64_t position_;  // Running byte count, advanced by Write().
+// TODO(wesm): seekable output files
+// A Buffer subclass that keeps a PyObject reference throughout its
+// lifetime, such that the Python object is kept alive as long as the
+// C++ buffer is still needed.
+// Keeping the reference in a Python wrapper would be incorrect as
+// the Python wrapper can get destroyed even though the wrapped C++
+// buffer is still alive (ARROW-2270).
+class ARROW_PYTHON_EXPORT PyForeignBuffer : public Buffer {
+ public:
+ static Status Make(const uint8_t* data, int64_t size, PyObject* base,  // Factory; the constructor below is private, so this is the public way to build one.
+ std::shared_ptr<Buffer>* out);
+ private:
+ PyForeignBuffer(const uint8_t* data, int64_t size, PyObject* base)
+ : Buffer(data, size) {
+ Py_INCREF(base);  // Take a new reference to `base` before handing it to base_.
+ base_.reset(base);
+ }
+ OwnedRefNoGIL base_;  // Keeps the Python object referenced while the buffer exists.
+// All this rigamarole because Cython is really poor with std::function<>
+using TransformCallback = std::function<void(  // Callback taking (Python argument, source buffer, output buffer slot); it returns nothing.
+ PyObject*, const std::shared_ptr<Buffer>& src, std::shared_ptr<Buffer>* out)>;
+struct TransformInputStreamVTable {  // Bundle of callbacks passed to MakeTransformInputStream.
+ TransformCallback transform;  // Invoked with the Python argument, the source buffer and an output slot.
+std::shared_ptr<::arrow::io::InputStream> MakeTransformInputStream(  // Wraps `wrapped` in a stream that applies vtable.transform, with `arg` as the callback's Python argument.
+ std::shared_ptr<::arrow::io::InputStream> wrapped, TransformInputStreamVTable vtable,
+ PyObject* arg);
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/ipc.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/ipc.cc
new file mode 100644
index 0000000000..2e6c9d9127
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/ipc.cc
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/ipc.h"
+#include <memory>
+#include "arrow/python/pyarrow.h"
+namespace arrow {
+namespace py {
+// Default construction only; the real setup happens in Init().
+PyRecordBatchReader::PyRecordBatchReader() = default;
+// Stores `schema` and asks Python for an iterator over `iterable`.
+// The returned Status is whatever CheckPyError() reports afterwards.
+Status PyRecordBatchReader::Init(std::shared_ptr<Schema> schema, PyObject* iterable) {
+  PyObject* batches_iter = PyObject_GetIter(iterable);
+  iterator_.reset(batches_iter);
+  schema_ = std::move(schema);
+  return CheckPyError();
+// Returns the schema stored by Init().
+std::shared_ptr<Schema> PyRecordBatchReader::schema() const {
+  return schema_;
+}
+// Fetches the next record batch from the wrapped Python iterator.
+// On exhaustion `*batch` is reset to null, the iterator is released and
+// Status::OK() is returned; later calls keep reporting end of stream.
+// A Python exception raised by the iterator is returned as an error status
+// instead of being mistaken for end of stream.
+Status PyRecordBatchReader::ReadNext(std::shared_ptr<RecordBatch>* batch) {
+  PyAcquireGIL lock;
+  if (!iterator_) {
+    // End of stream
+    batch->reset();
+    return Status::OK();
+  }
+  OwnedRef py_batch(PyIter_Next(iterator_.obj()));
+  if (!py_batch) {
+    // PyIter_Next() returns NULL both on exhaustion and on error; only the
+    // latter leaves a Python exception set.
+    RETURN_NOT_OK(CheckPyError());
+    // End of stream
+    batch->reset();
+    iterator_.reset();
+    return Status::OK();
+  }
+  return unwrap_batch(py_batch.obj()).Value(batch);
+// Factory: allocates a reader, runs Init() with `schema` and `iterable`, and
+// returns the reader unless Init() failed.
+Result<std::shared_ptr<RecordBatchReader>> PyRecordBatchReader::Make(
+    std::shared_ptr<Schema> schema, PyObject* iterable) {
+  std::shared_ptr<PyRecordBatchReader> reader(new PyRecordBatchReader());
+  RETURN_NOT_OK(reader->Init(std::move(schema), iterable));
+  return reader;
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/ipc.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/ipc.h
new file mode 100644
index 0000000000..92232ed830
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/ipc.h
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <memory>
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+namespace arrow {
+namespace py {
+class ARROW_PYTHON_EXPORT PyRecordBatchReader : public RecordBatchReader {  // RecordBatchReader that pulls its batches from a Python iterable.
+ public:
+ std::shared_ptr<Schema> schema() const override;
+ Status ReadNext(std::shared_ptr<RecordBatch>* batch) override;  // Sets *batch to null once the Python iterator is exhausted.
+ // For use from Cython
+ // Assumes that `iterable` is borrowed
+ static Result<std::shared_ptr<RecordBatchReader>> Make(std::shared_ptr<Schema>,
+ PyObject* iterable);
+ protected:
+ PyRecordBatchReader();
+ Status Init(std::shared_ptr<Schema>, PyObject* iterable);  // Stores the schema and obtains an iterator from `iterable`.
+ std::shared_ptr<Schema> schema_;
+ OwnedRefNoGIL iterator_;  // Python iterator; reset to empty after the last batch has been read.
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/iterators.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/iterators.h
new file mode 100644
index 0000000000..58213ee2db
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/iterators.h
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <utility>
+#include "arrow/python/common.h"
+#include "arrow/python/numpy_internal.h"
+namespace arrow {
+namespace py {
+namespace internal {
+// Visit the Python sequence, calling the given callable on each element
+// starting at index `offset`. If the callable returns a non-OK status,
+// iteration stops and the status is returned.
+//
+// The call signature for Visitor must be
+//   Visit(PyObject* obj, int64_t index, bool* keep_going)
+// If keep_going is set to false, the iteration terminates.
+//
+// Returns Invalid for NumPy arrays that are not 1-D, TypeError if `obj` is not
+// a sequence, and the pending Python error if querying the length or fetching
+// an item of a generic sequence fails.
+template <class VisitorFunc>
+inline Status VisitSequenceGeneric(PyObject* obj, int64_t offset, VisitorFunc&& func) {
+  // VisitorFunc may set to false to terminate iteration
+  bool keep_going = true;
+  if (PyArray_Check(obj)) {
+    PyArrayObject* arr_obj = reinterpret_cast<PyArrayObject*>(obj);
+    if (PyArray_NDIM(arr_obj) != 1) {
+      return Status::Invalid("Only 1D arrays accepted");
+    }
+    if (PyArray_DESCR(arr_obj)->type_num == NPY_OBJECT) {
+      // It's an array object, we can fetch object pointers directly
+      const Ndarray1DIndexer<PyObject*> objects(arr_obj);
+      for (int64_t i = offset; keep_going && i < objects.size(); ++i) {
+        RETURN_NOT_OK(func(objects[i], i, &keep_going));
+      }
+      return Status::OK();
+    }
+    // It's a non-object array, fall back on regular sequence access.
+    // (note PyArray_GETITEM() is slightly different: it returns standard
+    // Python types, not Numpy scalar types)
+    // This code path is inefficient: callers should implement dedicated
+    // logic for non-object arrays.
+  }
+  if (PySequence_Check(obj)) {
+    if (PyList_Check(obj) || PyTuple_Check(obj)) {
+      // Use fast item access
+      const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj);
+      for (Py_ssize_t i = offset; keep_going && i < size; ++i) {
+        PyObject* value = PySequence_Fast_GET_ITEM(obj, i);
+        RETURN_NOT_OK(func(value, static_cast<int64_t>(i), &keep_going));
+      }
+    } else {
+      // Regular sequence: avoid making a potentially large copy
+      const Py_ssize_t size = PySequence_Size(obj);
+      // PySequence_Size() returns -1 with an exception set on failure.
+      RETURN_IF_PYERROR();
+      for (Py_ssize_t i = offset; keep_going && i < size; ++i) {
+        OwnedRef value_ref(PySequence_ITEM(obj, i));
+        // __getitem__() might have raised, leaving a null item.
+        RETURN_IF_PYERROR();
+        RETURN_NOT_OK(func(value_ref.obj(), static_cast<int64_t>(i), &keep_going));
+      }
+    }
+  } else {
+    return Status::TypeError("Object is not a sequence");
+  }
+  return Status::OK();
+// Visit a sequence that has no null mask: `func` is called as
+// func(value, keep_going) for each element from `offset` onwards.
+template <class VisitorFunc>
+inline Status VisitSequence(PyObject* obj, int64_t offset, VisitorFunc&& func) {
+  // Adapter that drops the element index, which `func` does not take.
+  auto without_index = [&func](PyObject* item, int64_t /* index */, bool* proceed) {
+    return func(item, proceed);
+  };
+  return VisitSequenceGeneric(obj, offset, without_index);
+/// Visit sequence with null mask
+///
+/// `mo` must be a 1-D boolean NumPy array with the same length as `obj`.
+/// `func` is called as func(value, mask_value, keep_going) for each element
+/// from `offset` onwards, where mask_value is the mask byte at the same index.
+/// Returns Invalid if the mask is missing, not an ndarray, not 1-D, not boolean
+/// or of a different length than `obj`, and the pending Python error if the
+/// length of `obj` cannot be determined.
+template <class VisitorFunc>
+inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset,
+                                  VisitorFunc&& func) {
+  if (mo == nullptr || !PyArray_Check(mo)) {
+    return Status::Invalid("Null mask must be NumPy array");
+  }
+  PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
+  if (PyArray_NDIM(mask) != 1) {
+    return Status::Invalid("Mask must be 1D array");
+  }
+  const Py_ssize_t obj_size = PySequence_Size(obj);
+  // PySequence_Size() returns -1 with an exception set on failure; report that
+  // error rather than a misleading length mismatch.
+  RETURN_IF_PYERROR();
+  if (PyArray_SIZE(mask) != static_cast<int64_t>(obj_size)) {
+    return Status::Invalid("Mask was a different length from sequence being converted");
+  }
+  const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num);
+  if (dtype == NPY_BOOL) {
+    Ndarray1DIndexer<uint8_t> mask_values(mask);
+    return VisitSequenceGeneric(
+        obj, offset, [&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) {
+          return func(value, mask_values[i], keep_going);
+        });
+  } else {
+    return Status::Invalid("Mask must be boolean dtype");
+  }
+// Like VisitSequence, but accepts any generic iterable (including
+// non-restartable iterators, e.g. generators).
+// The call signature for VisitorFunc must be Visit(PyObject*, bool*
+// keep_going). If keep_going is set to false, the iteration terminates.
+// Returns the pending Python error if `obj` is not iterable or if its
+// iterator raises.
+template <class VisitorFunc>
+inline Status VisitIterable(PyObject* obj, VisitorFunc&& func) {
+  if (PySequence_Check(obj)) {
+    // Numpy arrays fall here as well
+    return VisitSequence(obj, /*offset=*/0, std::forward<VisitorFunc>(func));
+  }
+  // Fall back on the iterator protocol
+  OwnedRef iter_ref(PyObject_GetIter(obj));
+  // PyObject_GetIter() returns NULL with an exception set for non-iterables;
+  // bail out before handing a null iterator to PyIter_Next().
+  RETURN_IF_PYERROR();
+  PyObject* iter = iter_ref.obj();
+  PyObject* value;
+  bool keep_going = true;
+  while (keep_going && (value = PyIter_Next(iter))) {
+    OwnedRef value_ref(value);
+    RETURN_NOT_OK(func(value_ref.obj(), &keep_going));
+  }
+  RETURN_IF_PYERROR();  // __next__() might have raised
+  return Status::OK();
+} // namespace internal
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_convert.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_convert.cc
new file mode 100644
index 0000000000..bf4afb2a0a
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_convert.cc
@@ -0,0 +1,559 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/numpy_interop.h"
+#include "arrow/python/numpy_convert.h"
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+#include "arrow/buffer.h"
+#include "arrow/sparse_tensor.h"
+#include "arrow/tensor.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/python/common.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/type_traits.h"
+namespace arrow {
+namespace py {
+// Wraps the memory of a NumPy array as an Arrow Buffer. A reference to `ao`
+// is taken with the GIL held; if `ao` is not an ndarray the buffer keeps the
+// null data / zero size set by the base-class initializer.
+NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) {
+  PyAcquireGIL gil;
+  Py_INCREF(ao);
+  arr_ = ao;
+  if (PyArray_Check(ao)) {
+    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(ao);
+    // Element count times element size gives the byte length.
+    size_ = PyArray_SIZE(array) * PyArray_DESCR(array)->elsize;
+    capacity_ = size_;
+    data_ = const_cast<const uint8_t*>(reinterpret_cast<uint8_t*>(PyArray_DATA(array)));
+    // Mutability mirrors the array's WRITEABLE flag.
+    is_mutable_ = (PyArray_FLAGS(array) & NPY_ARRAY_WRITEABLE) != 0;
+  }
+// Releases the array reference taken by the constructor, with the GIL held.
+NumPyBuffer::~NumPyBuffer() {
+  PyAcquireGIL gil;
+  Py_XDECREF(arr_);
+ case NPY_##NPY_NAME: \
+ *out = FACTORY(); \
+ break;
+namespace {
+Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out) {  // Maps a numpy.dtype object to an Arrow DataType written to *out.
+ if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
+ return Status::TypeError("Did not pass numpy.dtype object");
+ }
+ PyArray_Descr* descr = reinterpret_cast<PyArray_Descr*>(dtype);
+ int type_num = fix_numpy_type_num(descr->type_num);
+ switch (type_num) {  // NOTE(review): only a default branch is visible, so every dtype is rejected here; the per-type cases look to be missing from this copy — confirm against upstream.
+ default: {
+ return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
+ }
+ }
+ return Status::OK();
+Status GetNumPyType(const DataType& type, int* type_num) {  // Maps an Arrow DataType to a NumPy type number written to *type_num.
+ case Type::ARROW_NAME: \
+ *type_num = NPY_##NPY_NAME; \
+ break;  // NOTE(review): the three lines above read like the body of a helper macro whose #define line is absent from this copy — confirm against upstream.
+ switch (type.id()) {  // NOTE(review): only a default branch is visible, so every type is rejected here — confirm against upstream.
+ default: {
+ return Status::NotImplemented("Unsupported tensor type: ", type.ToString());
+ }
+ }
+ return Status::OK();
+} // namespace
+// Converts a numpy.dtype Python object to an Arrow DataType by delegating to
+// the PyArray_Descr overload; anything that is not a dtype is a TypeError.
+Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out) {
+  const bool is_dtype = PyObject_TypeCheck(dtype, &PyArrayDescr_Type);
+  if (is_dtype) {
+    return NumPyDtypeToArrow(reinterpret_cast<PyArray_Descr*>(dtype), out);
+  }
+  return Status::TypeError("Did not pass numpy.dtype object");
+// Converts a NumPy type descriptor to an Arrow DataType written to `*out`.
+//
+// datetime64 maps to timestamp (s/ms/us/ns) or date32 (D); timedelta64 maps to
+// duration (s/ms/us/ns). Generic (unit-less) and any other time units, as well
+// as any other dtype handled by the default branch, yield NotImplemented.
+//
+// NOTE(review): no cases for plain scalar dtypes (bool, ints, floats, strings)
+// are present in this copy of the function — confirm against upstream whether
+// they were lost together with their helper macro.
+Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {
+  int type_num = fix_numpy_type_num(descr->type_num);
+  switch (type_num) {
+    case NPY_DATETIME: {
+      auto date_dtype =
+          reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+      switch (date_dtype->meta.base) {
+        case NPY_FR_s:
+          *out = timestamp(TimeUnit::SECOND);
+          break;
+        case NPY_FR_ms:
+          *out = timestamp(TimeUnit::MILLI);
+          break;
+        case NPY_FR_us:
+          *out = timestamp(TimeUnit::MICRO);
+          break;
+        case NPY_FR_ns:
+          *out = timestamp(TimeUnit::NANO);
+          break;
+        case NPY_FR_D:
+          *out = date32();
+          break;
+        // Without this label the return below could never be reached.
+        case NPY_FR_GENERIC:
+          return Status::NotImplemented("Unbound or generic datetime64 time unit");
+        default:
+          return Status::NotImplemented("Unsupported datetime64 time unit");
+      }
+    } break;
+    // timedelta64 shares the datetime metadata layout, hence the same cast.
+    case NPY_TIMEDELTA: {
+      auto timedelta_dtype =
+          reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+      switch (timedelta_dtype->meta.base) {
+        case NPY_FR_s:
+          *out = duration(TimeUnit::SECOND);
+          break;
+        case NPY_FR_ms:
+          *out = duration(TimeUnit::MILLI);
+          break;
+        case NPY_FR_us:
+          *out = duration(TimeUnit::MICRO);
+          break;
+        case NPY_FR_ns:
+          *out = duration(TimeUnit::NANO);
+          break;
+        case NPY_FR_GENERIC:
+          return Status::NotImplemented("Unbound or generic timedelta64 time unit");
+        default:
+          return Status::NotImplemented("Unsupported timedelta64 time unit");
+      }
+    } break;
+    default: {
+      return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
+    }
+  }
+  return Status::OK();
+// Wraps a NumPy ndarray as an Arrow Tensor without copying: the tensor's data
+// is a NumPyBuffer over `ao`, and shape/strides are taken from the array.
+// `pool` is not used by this function.
+// Returns TypeError if `ao` is not an ndarray, Invalid if any stride is
+// negative, and the error from GetTensorType() if the dtype is unsupported.
+Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                       const std::vector<std::string>& dim_names,
+                       std::shared_ptr<Tensor>* out) {
+  if (!PyArray_Check(ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+  PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
+  // TODO(wesm): What do we want to do with non-contiguous memory and negative strides?
+  int ndim = PyArray_NDIM(ndarray);
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
+  std::vector<int64_t> shape(ndim);
+  std::vector<int64_t> strides(ndim);
+  npy_intp* array_strides = PyArray_STRIDES(ndarray);
+  npy_intp* array_shape = PyArray_SHAPE(ndarray);
+  for (int i = 0; i < ndim; ++i) {
+    if (array_strides[i] < 0) {
+      return Status::Invalid("Negative ndarray strides not supported");
+    }
+    shape[i] = array_shape[i];
+    strides[i] = array_strides[i];
+  }
+  std::shared_ptr<DataType> type;
+  // Propagate an unsupported-dtype error instead of building a tensor with a
+  // null type.
+  RETURN_NOT_OK(
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
+  *out = std::make_shared<Tensor>(type, data, shape, strides, dim_names);
+  return Status::OK();
+// Exposes an Arrow Tensor as a NumPy ndarray that views the tensor's memory.
+// The array's base object is `base` when one is given (a new reference is
+// taken), otherwise a freshly wrapped Python tensor object.
+// Returns the error from GetNumPyType() for unsupported element types and the
+// pending Python error if NumPy fails to create the array.
+Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
+                       PyObject** out) {
+  int type_num = 0;
+  RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num));
+  PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num);
+  const int ndim = tensor->ndim();
+  std::vector<npy_intp> npy_shape(ndim);
+  std::vector<npy_intp> npy_strides(ndim);
+  for (int i = 0; i < ndim; ++i) {
+    npy_shape[i] = tensor->shape()[i];
+    npy_strides[i] = tensor->strides()[i];
+  }
+  const void* immutable_data = nullptr;
+  if (tensor->data()) {
+    immutable_data = tensor->data()->data();
+  }
+  // Remove const =(
+  void* mutable_data = const_cast<void*>(immutable_data);
+  int array_flags = 0;
+  if (tensor->is_row_major()) {
+    array_flags |= NPY_ARRAY_C_CONTIGUOUS;
+  }
+  if (tensor->is_column_major()) {
+    array_flags |= NPY_ARRAY_F_CONTIGUOUS;
+  }
+  if (tensor->is_mutable()) {
+    array_flags |= NPY_ARRAY_WRITEABLE;
+  }
+  PyObject* result =
+      PyArray_NewFromDescr(&PyArray_Type, dtype, ndim, npy_shape.data(),
+                           npy_strides.data(), mutable_data, array_flags, nullptr);
+  // PyArray_NewFromDescr() returns NULL with an exception set on failure;
+  // stop here instead of passing a null array to PyArray_SetBaseObject().
+  RETURN_IF_PYERROR();
+  if (base == Py_None || base == nullptr) {
+    base = py::wrap_tensor(tensor);
+  } else {
+    Py_XINCREF(base);
+  }
+  // PyArray_SetBaseObject() steals the reference to `base`.
+  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result), base);
+  *out = result;
+  return Status::OK();
+// Wrap the dense data of a sparse tensor in a ndarray
+static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor,  // Result goes to *out_data; `base` becomes the array's base object.
+ std::vector<npy_intp> data_shape, PyObject* base,
+ PyObject** out_data) {
+ int type_num_data = 0;
+ RETURN_NOT_OK(GetNumPyType(*sparse_tensor.type(), &type_num_data));
+ PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
+ const void* immutable_data = sparse_tensor.data()->data();
+ // Remove const =(
+ void* mutable_data = const_cast<void*>(immutable_data);
+ if (sparse_tensor.is_mutable()) {  // NOTE(review): no declaration of array_flags is visible in this copy of the function — confirm its initial value against upstream.
+ array_flags |= NPY_ARRAY_WRITEABLE;
+ }
+ *out_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data,
+ static_cast<int>(data_shape.size()), data_shape.data(),
+ nullptr, mutable_data, array_flags, nullptr);  // Strides are passed as nullptr here.
+ Py_XINCREF(base);  // New reference for the base-object slot set on the next line.
+ PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(*out_data), base);
+ return Status::OK();
+Status SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_tensor,  // Exposes a COO sparse tensor as two ndarrays: values (*out_data) and coordinates (*out_coords).
+ PyObject* base, PyObject** out_data,
+ PyObject** out_coords) {
+ const auto& sparse_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
+ *sparse_tensor->sparse_index());
+ // Wrap tensor data
+ OwnedRef result_data;  // presumably releases the data array if wrapping the indices fails below — confirm OwnedRef semantics
+ RETURN_NOT_OK(SparseTensorDataToNdarray(
+ *sparse_tensor, {sparse_tensor->non_zero_length(), 1}, base, result_data.ref()));  // Values are shaped (non_zero_length, 1).
+ // Wrap indices
+ PyObject* result_coords;
+ RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, &result_coords));
+ *out_data = result_data.detach();  // Outputs are assigned only after both conversions succeeded.
+ *out_coords = result_coords;
+ return Status::OK();
+Status SparseCSXMatrixToNdarray(const std::shared_ptr<SparseTensor>& sparse_tensor,  // Shared CSR/CSC implementation: outputs values, indptr and indices as ndarrays.
+ PyObject* base, PyObject** out_data,
+ PyObject** out_indptr, PyObject** out_indices) {
+ // Wrap indices
+ OwnedRef result_indptr;
+ OwnedRef result_indices;
+ switch (sparse_tensor->format_id()) {  // The index class to cast to depends on the tensor's format.
+ case SparseTensorFormat::CSR: {
+ const auto& sparse_index = arrow::internal::checked_cast<const SparseCSRIndex&>(
+ *sparse_tensor->sparse_index());
+ RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref()));
+ RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref()));
+ break;
+ }
+ case SparseTensorFormat::CSC: {
+ const auto& sparse_index = arrow::internal::checked_cast<const SparseCSCIndex&>(
+ *sparse_tensor->sparse_index());
+ RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref()));
+ RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref()));
+ break;
+ }
+ default:  // Any format other than CSR or CSC is rejected.
+ return Status::NotImplemented("Invalid SparseTensor type.");
+ }
+ // Wrap tensor data
+ OwnedRef result_data;
+ RETURN_NOT_OK(SparseTensorDataToNdarray(
+ *sparse_tensor, {sparse_tensor->non_zero_length(), 1}, base, result_data.ref()));  // Values are shaped (non_zero_length, 1).
+ *out_data = result_data.detach();  // Outputs are assigned only after every conversion succeeded.
+ *out_indptr = result_indptr.detach();
+ *out_indices = result_indices.detach();
+ return Status::OK();
+Status SparseCSRMatrixToNdarray(const std::shared_ptr<SparseCSRMatrix>& sparse_tensor,  // CSR entry point; forwards unchanged to SparseCSXMatrixToNdarray.
+ PyObject* base, PyObject** out_data,
+ PyObject** out_indptr, PyObject** out_indices) {
+ return SparseCSXMatrixToNdarray(sparse_tensor, base, out_data, out_indptr, out_indices);
+Status SparseCSCMatrixToNdarray(const std::shared_ptr<SparseCSCMatrix>& sparse_tensor,  // CSC entry point; forwards unchanged to SparseCSXMatrixToNdarray.
+ PyObject* base, PyObject** out_data,
+ PyObject** out_indptr, PyObject** out_indices) {
+ return SparseCSXMatrixToNdarray(sparse_tensor, base, out_data, out_indptr, out_indices);
+// Exposes a CSF sparse tensor as Python objects: `*out_data` is an ndarray of
+// the values, `*out_indptr` a list of ndim - 1 ndarrays and `*out_indices` a
+// list of ndim ndarrays, where ndim is the number of index tensors.
+// Returns the first conversion error, or the pending Python error if creating
+// or filling either list fails. Outputs are assigned only on success.
+Status SparseCSFTensorToNdarray(const std::shared_ptr<SparseCSFTensor>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCSFIndex&>(
+      *sparse_tensor->sparse_index());
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_tensor->non_zero_length(), 1}, base, result_data.ref()));
+  // Wrap indices
+  int ndim = static_cast<int>(sparse_index.indices().size());
+  OwnedRef indptr(PyList_New(ndim - 1));
+  // PyList_New() returns NULL with an exception set on failure (including a
+  // negative size when there are no index tensors).
+  RETURN_IF_PYERROR();
+  OwnedRef indices(PyList_New(ndim));
+  RETURN_IF_PYERROR();
+  for (int i = 0; i < ndim - 1; ++i) {
+    PyObject* item = nullptr;
+    RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr()[i], base, &item));
+    // PyList_SetItem() steals the reference to `item` even when it fails, so
+    // the item must not be decref'ed again here; just report the error.
+    if (PyList_SetItem(indptr.obj(), i, item) < 0) {
+      RETURN_IF_PYERROR();
+    }
+  }
+  for (int i = 0; i < ndim; ++i) {
+    PyObject* item = nullptr;
+    RETURN_NOT_OK(TensorToNdarray(sparse_index.indices()[i], base, &item));
+    if (PyList_SetItem(indices.obj(), i, item) < 0) {
+      RETURN_IF_PYERROR();
+    }
+  }
+  *out_indptr = indptr.detach();
+  *out_indices = indices.detach();
+  *out_data = result_data.detach();
+  return Status::OK();
+Status NdarraysToSparseCOOTensor(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,  // Builds a SparseCOOTensor from a values ndarray and a coordinates ndarray.
+ const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseCOOTensor>* out) {
+ if (!PyArray_Check(data_ao) || !PyArray_Check(coords_ao)) {
+ return Status::TypeError("Did not pass ndarray object");
+ }
+ PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+ std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);  // The values are wrapped in a NumPyBuffer over data_ao.
+ std::shared_ptr<DataType> type_data;
+ RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
+ &type_data));
+ std::shared_ptr<Tensor> coords;
+ RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords));
+ ARROW_CHECK_EQ(coords->type_id(), Type::INT64); // Should be ensured by caller
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<SparseCOOIndex> sparse_index,
+ SparseCOOIndex::Make(coords));
+ *out = std::make_shared<SparseTensorImpl<SparseCOOIndex>>(sparse_index, type_data, data,
+ shape, dim_names);
+ return Status::OK();
+template <class IndexType>
+Status NdarraysToSparseCSXMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,  // Shared CSR/CSC builder; IndexType selects the sparse index class.
+ PyObject* indices_ao, const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseTensorImpl<IndexType>>* out) {
+ if (!PyArray_Check(data_ao) || !PyArray_Check(indptr_ao) ||
+ !PyArray_Check(indices_ao)) {
+ return Status::TypeError("Did not pass ndarray object");
+ }
+ PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+ std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);  // The values are wrapped in a NumPyBuffer over data_ao.
+ std::shared_ptr<DataType> type_data;
+ RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
+ &type_data));
+ std::shared_ptr<Tensor> indptr, indices;
+ RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr));
+ RETURN_NOT_OK(NdarrayToTensor(pool, indices_ao, {}, &indices));
+ ARROW_CHECK_EQ(indptr->type_id(), Type::INT64); // Should be ensured by caller
+ ARROW_CHECK_EQ(indices->type_id(), Type::INT64); // Should be ensured by caller
+ auto sparse_index = std::make_shared<IndexType>(
+ std::static_pointer_cast<NumericTensor<Int64Type>>(indptr),  // Unchecked cast; the INT64 type is only asserted by the ARROW_CHECK_EQ lines above.
+ std::static_pointer_cast<NumericTensor<Int64Type>>(indices));
+ *out = std::make_shared<SparseTensorImpl<IndexType>>(sparse_index, type_data, data,
+ shape, dim_names);
+ return Status::OK();
+Status NdarraysToSparseCSFTensor(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,  // Builds a SparseCSFTensor from a values ndarray plus sequences of indptr and indices ndarrays.
+ PyObject* indices_ao, const std::vector<int64_t>& shape,
+ const std::vector<int64_t>& axis_order,
+ const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseCSFTensor>* out) {
+ if (!PyArray_Check(data_ao)) {
+ return Status::TypeError("Did not pass ndarray object for data");
+ }
+ const int ndim = static_cast<const int>(shape.size());  // The number of dimensions comes from `shape`, not from the index sequences.
+ PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+ std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+ std::shared_ptr<DataType> type_data;
+ RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
+ &type_data));
+ std::vector<std::shared_ptr<Tensor>> indptr(ndim - 1);
+ std::vector<std::shared_ptr<Tensor>> indices(ndim);
+ for (int i = 0; i < ndim - 1; ++i) {
+ PyObject* item = PySequence_Fast_GET_ITEM(indptr_ao, i);  // NOTE(review): assumes indptr_ao is a list/tuple with at least ndim - 1 items; not checked here — confirm the caller guarantees it.
+ if (!PyArray_Check(item)) {
+ return Status::TypeError("Did not pass ndarray object for indptr");
+ }
+ RETURN_NOT_OK(NdarrayToTensor(pool, item, {}, &indptr[i]));
+ ARROW_CHECK_EQ(indptr[i]->type_id(), Type::INT64); // Should be ensured by caller
+ }
+ for (int i = 0; i < ndim; ++i) {
+ PyObject* item = PySequence_Fast_GET_ITEM(indices_ao, i);  // NOTE(review): assumes indices_ao is a list/tuple with at least ndim items; not checked here — confirm the caller guarantees it.
+ if (!PyArray_Check(item)) {
+ return Status::TypeError("Did not pass ndarray object for indices");
+ }
+ RETURN_NOT_OK(NdarrayToTensor(pool, item, {}, &indices[i]));
+ ARROW_CHECK_EQ(indices[i]->type_id(), Type::INT64); // Should be ensured by caller
+ }
+ auto sparse_index = std::make_shared<SparseCSFIndex>(indptr, indices, axis_order);
+ *out = std::make_shared<SparseTensorImpl<SparseCSFIndex>>(sparse_index, type_data, data,
+ shape, dim_names);
+ return Status::OK();
+Status NdarraysToSparseCSRMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,  // CSR entry point; forwards to NdarraysToSparseCSXMatrix<SparseCSRIndex>.
+ PyObject* indices_ao, const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseCSRMatrix>* out) {
+ return NdarraysToSparseCSXMatrix<SparseCSRIndex>(pool, data_ao, indptr_ao, indices_ao,
+ shape, dim_names, out);
+Status NdarraysToSparseCSCMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,  // CSC entry point; forwards to NdarraysToSparseCSXMatrix<SparseCSCIndex>.
+ PyObject* indices_ao, const std::vector<int64_t>& shape,
+ const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseCSCMatrix>* out) {
+ return NdarraysToSparseCSXMatrix<SparseCSCIndex>(pool, data_ao, indptr_ao, indices_ao,
+ shape, dim_names, out);
+Status TensorToSparseCOOTensor(const std::shared_ptr<Tensor>& tensor,  // Converts a dense tensor via SparseCOOTensor::Make, storing the result in *out.
+ std::shared_ptr<SparseCOOTensor>* out) {
+ return SparseCOOTensor::Make(*tensor).Value(out);
+Status TensorToSparseCSRMatrix(const std::shared_ptr<Tensor>& tensor,  // Converts a dense tensor via SparseCSRMatrix::Make, storing the result in *out.
+ std::shared_ptr<SparseCSRMatrix>* out) {
+ return SparseCSRMatrix::Make(*tensor).Value(out);
+Status TensorToSparseCSCMatrix(const std::shared_ptr<Tensor>& tensor,  // Converts a dense tensor via SparseCSCMatrix::Make, storing the result in *out.
+ std::shared_ptr<SparseCSCMatrix>* out) {
+ return SparseCSCMatrix::Make(*tensor).Value(out);
+Status TensorToSparseCSFTensor(const std::shared_ptr<Tensor>& tensor,  // Converts a dense tensor via SparseCSFTensor::Make, storing the result in *out.
+ std::shared_ptr<SparseCSFTensor>* out) {
+ return SparseCSFTensor::Make(*tensor).Value(out);
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_convert.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_convert.h
new file mode 100644
index 0000000000..10451077a2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_convert.h
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Functions for converting between pandas's NumPy-based data representation
+// and Arrow data structures
+#pragma once
+#include "arrow/python/platform.h"
+#include <memory>
+#include <string>
+#include <vector>
+#include "arrow/buffer.h"
+#include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"
+namespace arrow {
+class DataType;
+class MemoryPool;
+class Status;
+class Tensor;
+namespace py {
+// Buffer subclass constructed from a Python object; it keeps that PyObject*
+// in arr_.
+// NOTE(review): presumably exposes a NumPy array's memory without copying and
+// holds a reference for the buffer's lifetime -- confirm in the .cc file.
+class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer {
+ public:
+ explicit NumPyBuffer(PyObject* arr);
+ virtual ~NumPyBuffer();
+ private:
+ // The Python object this buffer was constructed from.
+ PyObject* arr_;
+// Maps a NumPy dtype, given as a generic Python object, to an Arrow DataType
+// returned through `out`.
+Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out);
+// Overload taking the NumPy descriptor struct directly.
+Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out);
+// Converts the Python object `ao` into an Arrow Tensor returned through `out`,
+// allocating from `pool` and attaching `dim_names`.
+ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+ const std::vector<std::string>& dim_names,
+ std::shared_ptr<Tensor>* out);
+// Converts an Arrow Tensor into a Python object returned through `out`.
+// NOTE(review): `base` is presumably the object kept alive as the ndarray's
+// base -- confirm against the implementation.
+ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
+ PyObject* base, PyObject** out);
+// Converts a SparseCOOTensor into two Python objects returned through
+// `out_data` and `out_coords`. Declared with the same export macro and Status
+// return type as the sibling sparse conversion functions in this header.
+ARROW_PYTHON_EXPORT Status
+SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_tensor,
+ PyObject* base, PyObject** out_data, PyObject** out_coords);
+// Converts a sparse tensor, passed as the generic SparseTensor type, into
+// three Python objects returned through out_data, out_indptr and out_indices.
+Status SparseCSXMatrixToNdarray(const std::shared_ptr<SparseTensor>& sparse_tensor,
+ PyObject* base, PyObject** out_data,
+ PyObject** out_indptr, PyObject** out_indices);
+// SparseCSRMatrix variant with the same three outputs.
+ARROW_PYTHON_EXPORT Status SparseCSRMatrixToNdarray(
+ const std::shared_ptr<SparseCSRMatrix>& sparse_tensor, PyObject* base,
+ PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+// SparseCSCMatrix variant with the same three outputs.
+ARROW_PYTHON_EXPORT Status SparseCSCMatrixToNdarray(
+ const std::shared_ptr<SparseCSCMatrix>& sparse_tensor, PyObject* base,
+ PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+// SparseCSFTensor variant with the same three outputs.
+ARROW_PYTHON_EXPORT Status SparseCSFTensorToNdarray(
+ const std::shared_ptr<SparseCSFTensor>& sparse_tensor, PyObject* base,
+ PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+// Builds a SparseCOOTensor from a data object and a coordinates object, with
+// the given logical shape and dimension names.
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCOOTensor(
+ MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+ const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseCOOTensor>* out);
+// Builds a SparseCSRMatrix from data, indptr and indices objects.
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCSRMatrix(
+ MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+ const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseCSRMatrix>* out);
+// Builds a SparseCSCMatrix from data, indptr and indices objects.
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCSCMatrix(
+ MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+ const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+ std::shared_ptr<SparseCSCMatrix>* out);
+// Builds a SparseCSFTensor; unlike the CSR/CSC variants it also takes an
+// explicit axis_order.
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCSFTensor(
+ MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+ const std::vector<int64_t>& shape, const std::vector<int64_t>& axis_order,
+ const std::vector<std::string>& dim_names, std::shared_ptr<SparseCSFTensor>* out);
+// Dense Tensor -> sparse representation conversions. Each one writes the
+// converted object through `csparse_tensor` and reports failure via Status.
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCOOTensor(const std::shared_ptr<Tensor>& tensor,
+ std::shared_ptr<SparseCOOTensor>* csparse_tensor);
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCSRMatrix(const std::shared_ptr<Tensor>& tensor,
+ std::shared_ptr<SparseCSRMatrix>* csparse_tensor);
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCSCMatrix(const std::shared_ptr<Tensor>& tensor,
+ std::shared_ptr<SparseCSCMatrix>* csparse_tensor);
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCSFTensor(const std::shared_ptr<Tensor>& tensor,
+ std::shared_ptr<SparseCSFTensor>* csparse_tensor);
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_internal.h
new file mode 100644
index 0000000000..973f577cb1
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_internal.h
@@ -0,0 +1,182 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Internal utilities for dealing with NumPy
+#pragma once
+#include "arrow/python/numpy_interop.h"
+#include "arrow/status.h"
+#include "arrow/python/platform.h"
+#include <cstdint>
+#include <sstream>
+#include <string>
+namespace arrow {
+namespace py {
+/// Indexing convenience for interacting with strided 1-dim ndarray objects
+///
+/// Holds a reference to the wrapped array (Py_INCREF in the constructor,
+/// Py_XDECREF in the destructor). Element `i` is addressed at
+/// data_ + i * stride_, where stride_ is the byte stride read from
+/// PyArray_STRIDES(arr)[0].
+template <typename T>
+class Ndarray1DIndexer {
+ public:
+ typedef int64_t size_type;
+ // stride_ is zero-initialized so that is_strided() never reads an
+ // indeterminate value on a default-constructed indexer.
+ Ndarray1DIndexer() : arr_(NULLPTR), data_(NULLPTR), stride_(0) {}
+ /// Wraps `arr`, which must be 1-dimensional (checked with DCHECK_EQ).
+ explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() {
+ arr_ = arr;
+ DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays";
+ Py_INCREF(arr);
+ data_ = reinterpret_cast<uint8_t*>(PyArray_DATA(arr));
+ stride_ = PyArray_STRIDES(arr)[0];
+ }
+ ~Ndarray1DIndexer() { Py_XDECREF(arr_); }
+ /// Number of elements, as reported by PyArray_SIZE.
+ int64_t size() const { return PyArray_SIZE(arr_); }
+ /// Pointer to the first element.
+ const T* data() const { return reinterpret_cast<const T*>(data_); }
+ /// True when consecutive elements are not exactly sizeof(T) bytes apart.
+ bool is_strided() const { return stride_ != sizeof(T); }
+ T& operator[](size_type index) {
+ return *reinterpret_cast<T*>(data_ + index * stride_);
+ }
+ const T& operator[](size_type index) const {
+ return *reinterpret_cast<const T*>(data_ + index * stride_);
+ }
+ private:
+ PyArrayObject* arr_;
+ uint8_t* data_;
+ int64_t stride_;
+};
+// Handling of Numpy Types by their static numbers
+// (the NPY_TYPES enum and related defines)
+/// Returns a readable name for a NumPy type number, or the diagnostic string
+/// "unrecognized type (<n>) in GetNumPyTypeName" when no case matches.
+static inline std::string GetNumPyTypeName(int npy_type) {
+#define TYPE_CASE(TYPE, NAME) \
+ case NPY_##TYPE: \
+ return NAME;
+ switch (npy_type) {
+ TYPE_CASE(BOOL, "bool")
+ TYPE_CASE(INT8, "int8")
+ TYPE_CASE(INT16, "int16")
+ TYPE_CASE(INT32, "int32")
+ TYPE_CASE(INT64, "int64")
+ // The canonical int / long long type numbers only get their own labels when
+ // they are not aliases of the fixed-width constants above; otherwise the
+ // switch would contain duplicate case values.
+#if !NPY_INT32_IS_INT
+ TYPE_CASE(INT, "intc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+ TYPE_CASE(LONGLONG, "longlong")
+#endif
+ TYPE_CASE(UINT8, "uint8")
+ TYPE_CASE(UINT16, "uint16")
+ TYPE_CASE(UINT32, "uint32")
+ TYPE_CASE(UINT64, "uint64")
+#if !NPY_INT32_IS_INT
+ TYPE_CASE(UINT, "uintc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+ TYPE_CASE(ULONGLONG, "ulonglong")
+#endif
+ TYPE_CASE(FLOAT16, "float16")
+ TYPE_CASE(FLOAT32, "float32")
+ TYPE_CASE(FLOAT64, "float64")
+ TYPE_CASE(DATETIME, "datetime64")
+ TYPE_CASE(TIMEDELTA, "timedelta64")
+ TYPE_CASE(OBJECT, "object")
+ TYPE_CASE(VOID, "void")
+ default:
+ break;
+ }
+#undef TYPE_CASE
+ std::stringstream ss;
+ ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName";
+ return ss.str();
+}
+// NOTE(review): the next two lines look like the body of a case-generating
+// macro whose #define line is not visible in this excerpt -- confirm upstream.
+ case NPY_##TYPE: \
+ return visitor->template Visit<NPY_##TYPE>(arr);
+// Dispatches on PyArray_TYPE(arr). When no case matches, returns
+// Status::NotImplemented naming the type via GetNumPyTypeName.
+// NOTE(review): the per-type case list is not visible in this excerpt.
+template <typename VISITOR>
+inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) {
+ switch (PyArray_TYPE(arr)) {
+#if !NPY_INT32_IS_INT
+ }
+ return Status::NotImplemented("NumPy type not implemented: ",
+ GetNumPyTypeName(PyArray_TYPE(arr)));
+namespace internal {
+// True if obj is a Python float or a NumPy floating-point scalar.
+inline bool PyFloatScalar_Check(PyObject* obj) {
+ return PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating);
+// True if obj is a Python int or a NumPy integer scalar.
+inline bool PyIntScalar_Check(PyObject* obj) {
+ return PyLong_Check(obj) || PyArray_IsScalar(obj, Integer);
+// True if obj is a Python bool or a NumPy bool scalar.
+inline bool PyBoolScalar_Check(PyObject* obj) {
+ return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool);
+// Returns a NumPy descriptor for `type`. For NPY_DATETIME a new descriptor is
+// created with PyArray_DescrNewFromType, because the result of
+// PyArray_DescrFromType must not be mutated; every other type uses
+// PyArray_DescrFromType.
+static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
+ if (type == NPY_DATETIME) {
+ // It is not safe to mutate the result of DescrFromType
+ return PyArray_DescrNewFromType(type);
+ } else {
+ return PyArray_DescrFromType(type);
+ }
+} // namespace internal
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_interop.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_interop.h
new file mode 100644
index 0000000000..ce7baed259
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_interop.h
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include "arrow/python/platform.h" // IWYU pragma: export
+#include <numpy/numpyconfig.h> // IWYU pragma: export
+// Don't use the deprecated Numpy functions
+#ifdef NPY_1_7_API_VERSION
+// This is required to be able to access the NumPy C API properly in C++ files
+// other than init.cc.
+#include <numpy/arrayobject.h> // IWYU pragma: export
+#include <numpy/arrayscalars.h> // IWYU pragma: export
+#include <numpy/ufuncobject.h> // IWYU pragma: export
+// A bit subtle. Numpy has 5 canonical integer types:
+// (or, rather, type pairs: signed and unsigned)
+// It also has 4 fixed-width integer aliases.
+// When mapping Arrow integer types to these 4 fixed-width aliases,
+// we always miss one of the canonical types (even though it may
+// have the same width as one of the aliases).
+// Which one depends on the platform...
+// On a LP64 system, NPY_INT64 maps to NPY_LONG and
+// NPY_LONGLONG needs to be handled separately.
+// On a LLP64 system, NPY_INT32 maps to NPY_LONG and
+// NPY_INT needs to be handled separately.
+// NPY_INT64_IS_LONG_LONG is 1 when the 64-bit alias is `long long` (32-bit
+// long), in which case NPY_LONGLONG needs no separate handling.
+#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64
+#define NPY_INT64_IS_LONG_LONG 1
+#else
+#define NPY_INT64_IS_LONG_LONG 0
+#endif
+// NPY_INT32_IS_INT is 1 when the 32-bit alias is `int` (64-bit long), in
+// which case NPY_INT needs no separate handling.
+#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64
+#define NPY_INT32_IS_INT 1
+#else
+#define NPY_INT32_IS_INT 0
+#endif
+namespace arrow {
+namespace py {
+// Initializes the NumPy array and ufunc C APIs. import_array1 / import_umath1
+// are NumPy macros that return their argument (-1 here) from this function on
+// failure; 0 is returned once both imports succeed.
+inline int import_numpy() {
+ import_array1(-1);
+ import_umath1(-1);
+ return 0;
+// See above about the missing Numpy integer type numbers
+// Maps the canonical NumPy type number that has no fixed-width alias on this
+// platform onto the fixed-width alias of the same width; every other type
+// number is returned unchanged.
+inline int fix_numpy_type_num(int type_num) {
+#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32
+ if (type_num == NPY_INT) return NPY_INT32;
+ if (type_num == NPY_UINT) return NPY_UINT32;
+#endif
+#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64
+ if (type_num == NPY_LONGLONG) return NPY_INT64;
+ if (type_num == NPY_ULONGLONG) return NPY_UINT64;
+#endif
+ return type_num;
+}
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_to_arrow.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_to_arrow.cc
new file mode 100644
index 0000000000..a382f76633
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -0,0 +1,865 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Functions for pandas conversion via NumPy
+#include "arrow/python/numpy_to_arrow.h"
+#include "arrow/python/numpy_interop.h"
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_generate.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string.h"
+#include "arrow/util/utf8.h"
+#include "arrow/visitor_inline.h"
+#include "arrow/compute/api_scalar.h"
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/iterators.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/numpy_internal.h"
+#include "arrow/python/python_to_arrow.h"
+#include "arrow/python/type_traits.h"
+namespace arrow {
+using internal::checked_cast;
+using internal::CopyBitmap;
+using internal::GenerateBitsUnrolled;
+namespace py {
+using internal::NumPyTypeSize;
+// ----------------------------------------------------------------------
+// Conversion utilities
+namespace {
+// Allocates a resizable buffer large enough to hold `length` validity bits
+// from `pool`, clears every byte to zero, and returns it through `out`.
+Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
+ std::shared_ptr<ResizableBuffer>* out) {
+ int64_t null_bytes = BitUtil::BytesForBits(length);
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap, AllocateResizableBuffer(null_bytes, pool));
+ // Padding zeroed by AllocateResizableBuffer
+ memset(null_bitmap->mutable_data(), 0, static_cast<size_t>(null_bytes));
+ *out = std::move(null_bitmap);
+ return Status::OK();
+// ----------------------------------------------------------------------
+// Conversion from NumPy-in-Pandas to Arrow null bitmap
+// Scans a 1-D NumPy array and records validity in `bitmap`: the bit for each
+// element that is not null according to npy_traits<TYPE>::isnull is set;
+// bits of null elements are left untouched, so the caller must pass a
+// zero-initialized bitmap. Returns the number of null elements.
+template <int TYPE>
+inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
+ typedef internal::npy_traits<TYPE> traits;
+ typedef typename traits::value_type T;
+ int64_t null_count = 0;
+ Ndarray1DIndexer<T> values(arr);
+ // Use a 64-bit index: size() is int64_t and a plain int would overflow on
+ // arrays with more than INT_MAX elements.
+ const int64_t length = values.size();
+ for (int64_t i = 0; i < length; ++i) {
+ if (traits::isnull(values[i])) {
+ ++null_count;
+ } else {
+ BitUtil::SetBit(bitmap, i);
+ }
+ }
+ return null_count;
+}
+// Visitor (used with VisitNumpyArrayInline) that derives an Arrow validity
+// bitmap and null count from the values of a NumPy array.
+class NumPyNullsConverter {
+ public:
+ /// Convert the given array's null values to a null bitmap.
+ /// The null bitmap is only allocated if null values are ever possible.
+ /// On success *out_null_bitmap_ may therefore be null, with *out_null_count 0.
+ static Status Convert(MemoryPool* pool, PyArrayObject* arr, bool from_pandas,
+ std::shared_ptr<ResizableBuffer>* out_null_bitmap_,
+ int64_t* out_null_count) {
+ NumPyNullsConverter converter(pool, arr, from_pandas);
+ RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
+ *out_null_bitmap_ = converter.null_bitmap_;
+ *out_null_count = converter.null_count_;
+ return Status::OK();
+ }
+ /// Per-dtype entry point invoked by VisitNumpyArrayInline.
+ template <int TYPE>
+ Status Visit(PyArrayObject* arr) {
+ typedef internal::npy_traits<TYPE> traits;
+ const bool null_sentinels_possible =
+ // Always treat Numpy's NaT as null
+ TYPE == NPY_DATETIME || TYPE == NPY_TIMEDELTA ||
+ // Observing pandas's null sentinels
+ (from_pandas_ && traits::supports_nulls);
+ if (null_sentinels_possible) {
+ RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_));
+ null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_->mutable_data());
+ }
+ return Status::OK();
+ }
+ protected:
+ NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr, bool from_pandas)
+ : pool_(pool),
+ arr_(arr),
+ from_pandas_(from_pandas),
+ null_bitmap_data_(nullptr),
+ null_count_(0) {}
+ MemoryPool* pool_;
+ PyArrayObject* arr_;
+ bool from_pandas_;
+ std::shared_ptr<ResizableBuffer> null_bitmap_;
+ uint8_t* null_bitmap_data_;
+ int64_t null_count_;
+};
+// Translates a boolean mask array into an Arrow validity bitmap: a non-zero
+// mask entry marks the element as null (bit cleared), a zero entry marks it
+// as valid (bit set). Processes the first `length` mask entries.
+// Returns null count
+int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
+ int64_t null_count = 0;
+ Ndarray1DIndexer<uint8_t> mask_values(mask);
+ // 64-bit index: `length` is int64_t and a plain int would overflow on very
+ // large arrays.
+ for (int64_t i = 0; i < length; ++i) {
+ if (mask_values[i]) {
+ ++null_count;
+ BitUtil::ClearBit(bitmap, i);
+ } else {
+ BitUtil::SetBit(bitmap, i);
+ }
+ }
+ return null_count;
+}
+} // namespace
+// ----------------------------------------------------------------------
+// Conversion from NumPy arrays (possibly originating from pandas) to Arrow
+// format. Does not handle NPY_OBJECT dtype arrays; use ConvertPySequence for
+// that
+// Type visitor that converts one NumPy array (plus an optional mask) into one
+// or more Arrow arrays, collected in out_arrays_ and exposed via result().
+class NumPyConverter {
+ public:
+ // `arr` is reinterpreted as a PyArrayObject*. `mo` is an optional mask:
+ // nullptr and Py_None both mean "no mask". `type` is the requested Arrow type.
+ // NOTE(review): the first stride is read here unconditionally, while the
+ // 1-dimension check only happens later in Convert() -- confirm callers never
+ // pass a 0-dimensional array.
+ NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
+ const std::shared_ptr<DataType>& type, bool from_pandas,
+ const compute::CastOptions& cast_options = compute::CastOptions())
+ : pool_(pool),
+ type_(type),
+ arr_(reinterpret_cast<PyArrayObject*>(arr)),
+ dtype_(PyArray_DESCR(arr_)),
+ mask_(nullptr),
+ from_pandas_(from_pandas),
+ cast_options_(cast_options),
+ null_bitmap_data_(nullptr),
+ null_count_(0) {
+ if (mo != nullptr && mo != Py_None) {
+ mask_ = reinterpret_cast<PyArrayObject*>(mo);
+ }
+ length_ = static_cast<int64_t>(PyArray_SIZE(arr_));
+ itemsize_ = static_cast<int>(PyArray_DESCR(arr_)->elsize);
+ stride_ = static_cast<int64_t>(PyArray_STRIDES(arr_)[0]);
+ }
+ // True when the byte stride differs from the element size.
+ bool is_strided() const { return itemsize_ != stride_; }
+ // Runs the conversion; results are then available from result().
+ Status Convert();
+ const ArrayVector& result() const { return out_arrays_; }
+ // Visit overloads: one per Arrow type family. Most forward to VisitNative
+ // with the Arrow type used for the underlying storage.
+ template <typename T>
+ enable_if_primitive_ctype<T, Status> Visit(const T& type) {
+ return VisitNative<T>();
+ }
+ Status Visit(const HalfFloatType& type) { return VisitNative<UInt16Type>(); }
+ Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
+ Status Visit(const Date64Type& type) { return VisitNative<Date64Type>(); }
+ Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
+ Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); }
+ Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); }
+ Status Visit(const DurationType& type) { return VisitNative<DurationType>(); }
+ Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }
+ // NumPy ascii string arrays
+ Status Visit(const BinaryType& type);
+ // NumPy unicode arrays
+ Status Visit(const StringType& type);
+ Status Visit(const StructType& type);
+ Status Visit(const FixedSizeBinaryType& type);
+ // Default case
+ Status Visit(const DataType& type) { return TypeNotImplemented(type.ToString()); }
+ protected:
+ // Allocates a zeroed validity bitmap for length_ elements and caches its
+ // data pointer in null_bitmap_data_.
+ Status InitNullBitmap() {
+ RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_));
+ null_bitmap_data_ = null_bitmap_->mutable_data();
+ return Status::OK();
+ }
+ // Called before ConvertData to ensure Numpy input buffer is in expected
+ // Arrow layout
+ template <typename ArrowType>
+ Status PrepareInputData(std::shared_ptr<Buffer>* data);
+ // ----------------------------------------------------------------------
+ // Traditional visitor conversion for non-object arrays
+ template <typename ArrowType>
+ Status ConvertData(std::shared_ptr<Buffer>* data);
+ // Finishes `builder` and appends the resulting array to out_arrays_.
+ template <typename T>
+ Status PushBuilderResult(T* builder) {
+ std::shared_ptr<Array> out;
+ RETURN_NOT_OK(builder->Finish(&out));
+ out_arrays_.emplace_back(out);
+ return Status::OK();
+ }
+ // Wraps `data` in an Array via MakeArray and appends it to out_arrays_.
+ Status PushArray(const std::shared_ptr<ArrayData>& data) {
+ out_arrays_.emplace_back(MakeArray(data));
+ return Status::OK();
+ }
+ // Shared path for fixed-width types: compute the validity bitmap (from the
+ // mask when one was given, otherwise from the values), convert the data
+ // buffer, then push a single array built from {null_bitmap_, data}.
+ template <typename ArrowType>
+ Status VisitNative() {
+ if (mask_ != nullptr) {
+ RETURN_NOT_OK(InitNullBitmap());
+ null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_);
+ } else {
+ RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_,
+ &null_count_));
+ }
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(ConvertData<ArrowType>(&data));
+ auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0);
+ return PushArray(arr_data);
+ }
+ Status TypeNotImplemented(std::string type_name) {
+ return Status::NotImplemented("NumPyConverter doesn't implement <", type_name,
+ "> conversion. ");
+ }
+ MemoryPool* pool_;
+ // Requested Arrow output type.
+ std::shared_ptr<DataType> type_;
+ PyArrayObject* arr_;
+ PyArray_Descr* dtype_;
+ // Optional mask array; nullptr when no mask was supplied.
+ PyArrayObject* mask_;
+ // Element count (PyArray_SIZE), first-dimension stride and element size.
+ int64_t length_;
+ int64_t stride_;
+ int itemsize_;
+ bool from_pandas_;
+ compute::CastOptions cast_options_;
+ // Used in visitor pattern
+ ArrayVector out_arrays_;
+ std::shared_ptr<ResizableBuffer> null_bitmap_;
+ uint8_t* null_bitmap_data_;
+ int64_t null_count_;
+// Entry point of the conversion. Rejects arrays that are not 1-dimensional.
+// Object-dtype arrays are delegated to ConvertPySequence and its chunks become
+// the result; every other dtype requires an explicit target type and is
+// dispatched through VisitTypeInline to the matching Visit overload.
+Status NumPyConverter::Convert() {
+ if (PyArray_NDIM(arr_) != 1) {
+ return Status::Invalid("only handle 1-dimensional arrays");
+ }
+ if (dtype_->type_num == NPY_OBJECT) {
+ // If an object array, convert it like a normal Python sequence
+ PyConversionOptions py_options;
+ py_options.type = type_;
+ py_options.from_pandas = from_pandas_;
+ ARROW_ASSIGN_OR_RAISE(
+ auto chunked_array,
+ ConvertPySequence(reinterpret_cast<PyObject*>(arr_),
+ reinterpret_cast<PyObject*>(mask_), py_options, pool_));
+ out_arrays_ = chunked_array->chunks();
+ return Status::OK();
+ }
+ if (type_ == nullptr) {
+ return Status::Invalid("Must pass data type for non-object arrays");
+ }
+ // Visit the type to perform conversion
+ return VisitTypeInline(*type_, this);
+}
+namespace {
+// Casts a values buffer from `in_type` to `out_type`. The input buffer and
+// validity bitmap are wrapped in a temporary array, compute::Cast is run with
+// `cast_options` on `pool`, and the values buffer (buffers[1]) of the result
+// is returned through `out`.
+Status CastBuffer(const std::shared_ptr<DataType>& in_type,
+ const std::shared_ptr<Buffer>& input, const int64_t length,
+ const std::shared_ptr<Buffer>& valid_bitmap, const int64_t null_count,
+ const std::shared_ptr<DataType>& out_type,
+ const compute::CastOptions& cast_options, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out) {
+ // Must cast
+ auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count);
+ compute::ExecContext context(pool);
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Array> casted_array,
+ compute::Cast(*MakeArray(tmp_data), out_type, cast_options, &context));
+ *out = casted_array->data()->buffers[1];
+ return Status::OK();
+}
+// Allocates a new buffer of `length` ToType elements from `pool` and fills it
+// by static_cast-ing each FromType element of `input`. No range checking is
+// performed. The new buffer is returned through `out`.
+template <typename FromType, typename ToType>
+Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out) {
+ ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(ToType) * length, pool));
+ auto in_values = reinterpret_cast<const FromType*>(input.data());
+ auto out_values = reinterpret_cast<ToType*>(result->mutable_data());
+ for (int64_t i = 0; i < length; ++i) {
+ *out_values++ = static_cast<ToType>(*in_values++);
+ }
+ *out = std::move(result);
+ return Status::OK();
+// Gathers `length` elements of type T into the contiguous `output_data`,
+// reading each one with memcpy from `input_data` and advancing the source by
+// `stride` bytes per element.
+template <typename T>
+void CopyStridedBytewise(int8_t* input_data, int64_t length, int64_t stride,
+ T* output_data) {
+ // Passing input_data as non-const is a concession to PyObject*
+ for (int64_t i = 0; i < length; ++i) {
+ memcpy(output_data + i, input_data, sizeof(T));
+ input_data += stride;
+ }
+// Gathers `length` elements into the contiguous `output_data` by indexing
+// `input_data` directly; here `stride` is counted in elements of T, not bytes.
+template <typename T>
+void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output_data) {
+ // Passing input_data as non-const is a concession to PyObject*
+ int64_t j = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ output_data[i] = input_data[j];
+ j += stride;
+ }
+// Visitor (used with VisitNumpyArrayInline) that copies a strided NumPy array
+// into a freshly allocated contiguous buffer.
+class NumPyStridedConverter {
+ public:
+ // Copies `length` elements of `arr` into a new buffer from `pool`, returned
+ // through `out`.
+ static Status Convert(PyArrayObject* arr, int64_t length, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out) {
+ NumPyStridedConverter converter(arr, length, pool);
+ RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
+ *out = converter.buffer_;
+ return Status::OK();
+ }
+ // Per-dtype copy. When the byte stride is a whole multiple of sizeof(T) the
+ // elements are read by index; otherwise they are copied with memcpy.
+ template <int TYPE>
+ Status Visit(PyArrayObject* arr) {
+ using traits = internal::npy_traits<TYPE>;
+ using T = typename traits::value_type;
+ ARROW_ASSIGN_OR_RAISE(buffer_, AllocateBuffer(sizeof(T) * length_, pool_));
+ const int64_t stride = PyArray_STRIDES(arr)[0];
+ if (stride % sizeof(T) == 0) {
+ const int64_t stride_elements = stride / sizeof(T);
+ CopyStridedNatural(reinterpret_cast<T*>(PyArray_DATA(arr)), length_,
+ stride_elements, reinterpret_cast<T*>(buffer_->mutable_data()));
+ } else {
+ CopyStridedBytewise(reinterpret_cast<int8_t*>(PyArray_DATA(arr)), length_, stride,
+ reinterpret_cast<T*>(buffer_->mutable_data()));
+ }
+ return Status::OK();
+ }
+ protected:
+ NumPyStridedConverter(PyArrayObject* arr, int64_t length, MemoryPool* pool)
+ : arr_(arr), length_(length), pool_(pool), buffer_(nullptr) {}
+ PyArrayObject* arr_;
+ int64_t length_;
+ MemoryPool* pool_;
+ // Destination buffer, allocated in Visit().
+ std::shared_ptr<Buffer> buffer_;
+} // namespace
+// Produces a values buffer for the input array, returned through `data`:
+// - byte-swapped arrays are rejected with NotImplemented;
+// - NPY_BOOL arrays are packed into a bitmap (a bit is set when the source
+// byte is > 0);
+// - strided arrays are copied into a contiguous buffer;
+// - otherwise the array is wrapped in a NumPyBuffer without copying.
+template <typename ArrowType>
+inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
+ if (PyArray_ISBYTESWAPPED(arr_)) {
+ // TODO
+ return Status::NotImplemented("Byte-swapped arrays not supported");
+ }
+ if (dtype_->type_num == NPY_BOOL) {
+ int64_t nbytes = BitUtil::BytesForBits(length_);
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(nbytes, pool_));
+ Ndarray1DIndexer<uint8_t> values(arr_);
+ int64_t i = 0;
+ // Generator yields one bit per call, consuming the source bytes in order.
+ const auto generate = [&values, &i]() -> bool { return values[i++] > 0; };
+ GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate);
+ *data = std::move(buffer);
+ } else if (is_strided()) {
+ RETURN_NOT_OK(NumPyStridedConverter::Convert(arr_, length_, pool_, data));
+ } else {
+ // Can zero-copy
+ *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
+ }
+ return Status::OK();
+// Generic data conversion: prepare the input buffer, map the NumPy dtype to an
+// Arrow type, and cast the buffer with CastBuffer only when that type differs
+// from the requested type_.
+template <typename ArrowType>
+inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
+ RETURN_NOT_OK(PrepareInputData<ArrowType>(data));
+ std::shared_ptr<DataType> input_type;
+ RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
+ if (!input_type->Equals(*type_)) {
+ RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_,
+ cast_options_, pool_, data));
+ }
+ return Status::OK();
+// Date32 specialization. datetime64[D] input is narrowed from int64_t to
+// int32_t directly, since that conversion is not handled by compute::Cast.
+// Any other input follows the generic path: cast with CastBuffer only when
+// the NumPy-derived Arrow type differs from type_.
+template <>
+inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
+ std::shared_ptr<DataType> input_type;
+ RETURN_NOT_OK(PrepareInputData<Date32Type>(data));
+ // Only dereferenced below when the dtype is NPY_DATETIME.
+ auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
+ if (dtype_->type_num == NPY_DATETIME) {
+ // If we have inbound datetime64[D] data, this needs to be downcasted
+ // separately here from int64_t to int32_t, because this data is not
+ // supported in compute::Cast
+ if (date_dtype->meta.base == NPY_FR_D) {
+ // TODO(wesm): How pedantic do we really want to be about checking for int32
+ // overflow here?
+ // Propagate allocation failures instead of silently dropping the Status.
+ RETURN_NOT_OK((StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data)));
+ } else {
+ RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
+ if (!input_type->Equals(*type_)) {
+ // The null bitmap was already computed in VisitNative()
+ RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
+ type_, cast_options_, pool_, data));
+ }
+ }
+ } else {
+ RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
+ if (!input_type->Equals(*type_)) {
+ RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
+ type_, cast_options_, pool_, data));
+ }
+ }
+ return Status::OK();
+}
+// Date64 specialization. datetime64[D] input (days) is rescaled to
+// milliseconds into a newly allocated int64 buffer. Any other input follows
+// the generic path: cast with CastBuffer only when the NumPy-derived Arrow
+// type differs from type_.
+template <>
+inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* data) {
+ constexpr int64_t kMillisecondsInDay = 86400000;
+ std::shared_ptr<DataType> input_type;
+ RETURN_NOT_OK(PrepareInputData<Date64Type>(data));
+ // Only dereferenced below when the dtype is NPY_DATETIME.
+ auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
+ if (dtype_->type_num == NPY_DATETIME) {
+ // If we have inbound datetime64[D] data, the day counts are converted to
+ // milliseconds separately here, because this data is not
+ // supported in compute::Cast
+ if (date_dtype->meta.base == NPY_FR_D) {
+ ARROW_ASSIGN_OR_RAISE(auto result,
+ AllocateBuffer(sizeof(int64_t) * length_, pool_));
+ auto in_values = reinterpret_cast<const int64_t*>((*data)->data());
+ auto out_values = reinterpret_cast<int64_t*>(result->mutable_data());
+ for (int64_t i = 0; i < length_; ++i) {
+ *out_values++ = kMillisecondsInDay * (*in_values++);
+ }
+ *data = std::move(result);
+ } else {
+ RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
+ if (!input_type->Equals(*type_)) {
+ // The null bitmap was already computed in VisitNative()
+ RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
+ type_, cast_options_, pool_, data));
+ }
+ }
+ } else {
+ RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
+ if (!input_type->Equals(*type_)) {
+ RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
+ type_, cast_options_, pool_, data));
+ }
+ }
+ return Status::OK();
+// Create 16MB chunks for binary data
+constexpr int32_t kBinaryChunksize = 1 << 24;
+// Converts a fixed-width NumPy bytes array into chunked Arrow binary arrays.
+// Each item is truncated at its first NUL byte (at most itemsize_ bytes are
+// examined); masked entries become nulls. Every finished chunk is pushed as a
+// separate output array.
+Status NumPyConverter::Visit(const BinaryType& type) {
+ ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);
+ auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+ auto AppendNotNull = [&builder, this](const uint8_t* data) {
+ // This is annoying. NumPy allows strings to have nul-terminators, so
+ // we must check for them here
+ const size_t item_size =
+ strnlen(reinterpret_cast<const char*>(data), static_cast<size_t>(itemsize_));
+ return builder.Append(data, static_cast<int32_t>(item_size));
+ };
+ if (mask_ != nullptr) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask_);
+ for (int64_t i = 0; i < length_; ++i) {
+ if (mask_values[i]) {
+ RETURN_NOT_OK(builder.AppendNull());
+ } else {
+ RETURN_NOT_OK(AppendNotNull(data));
+ }
+ // Advance by the array's byte stride, which may exceed itemsize_.
+ data += stride_;
+ }
+ } else {
+ for (int64_t i = 0; i < length_; ++i) {
+ RETURN_NOT_OK(AppendNotNull(data));
+ data += stride_;
+ }
+ }
+ ArrayVector result;
+ RETURN_NOT_OK(builder.Finish(&result));
+ for (auto arr : result) {
+ RETURN_NOT_OK(PushArray(arr->data()));
+ }
+ return Status::OK();
+// Converts a fixed-width NumPy bytes array into a single Arrow
+// FixedSizeBinary array. The NumPy item size must equal the type's byte
+// width, otherwise Status::Invalid is returned. Masked entries become nulls.
+Status NumPyConverter::Visit(const FixedSizeBinaryType& type) {
+ auto byte_width = type.byte_width();
+ if (itemsize_ != byte_width) {
+ return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ",
+ byte_width, ")");
+ }
+ FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_);
+ auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+ if (mask_ != nullptr) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask_);
+ RETURN_NOT_OK(builder.Reserve(length_));
+ for (int64_t i = 0; i < length_; ++i) {
+ if (mask_values[i]) {
+ RETURN_NOT_OK(builder.AppendNull());
+ } else {
+ RETURN_NOT_OK(builder.Append(data));
+ }
+ // Advance by the array's byte stride.
+ data += stride_;
+ }
+ } else {
+ for (int64_t i = 0; i < length_; ++i) {
+ RETURN_NOT_OK(builder.Append(data));
+ data += stride_;
+ }
+ }
+ std::shared_ptr<Array> result;
+ RETURN_NOT_OK(builder.Finish(&result));
+ return PushArray(result->data());
+namespace {
+// NumPy unicode is UCS4/UTF32 always
+constexpr int kNumPyUnicodeSize = 4;
+// Appends one NumPy unicode item to `builder` as UTF-8.
+//
+// `data` points at `itemsize` bytes of UTF-32 code units. `byteorder` uses
+// the Python C API convention (-1 little-endian, 1 big-endian). The item is
+// truncated at the first all-zero code point. Returns Status::Invalid when
+// either the UTF-32 decode or the UTF-8 encode fails.
+Status AppendUTF32(const char* data, int itemsize, int byteorder,
+ ::arrow::internal::ChunkedStringBuilder* builder) {
+ // The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode,
+ // so we need to detect that here to truncate if necessary. Yep.
+ int actual_length = 0;
+ for (; actual_length < itemsize / kNumPyUnicodeSize; ++actual_length) {
+ const char* code_point = data + actual_length * kNumPyUnicodeSize;
+ if ((*code_point == '\0') && (*(code_point + 1) == '\0') &&
+ (*(code_point + 2) == '\0') && (*(code_point + 3) == '\0')) {
+ break;
+ }
+ }
+ OwnedRef unicode_obj(PyUnicode_DecodeUTF32(data, actual_length * kNumPyUnicodeSize,
+ nullptr, &byteorder));
+ if (unicode_obj.obj() == NULL) {
+ // Decoding failed: stop before a null object reaches the UTF-8 encoder.
+ PyErr_Clear();
+ return Status::Invalid("failed converting UTF32 to UTF8");
+ }
+ OwnedRef utf8_obj(PyUnicode_AsUTF8String(unicode_obj.obj()));
+ if (utf8_obj.obj() == NULL) {
+ PyErr_Clear();
+ return Status::Invalid("failed converting UTF32 to UTF8");
+ }
+ const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(utf8_obj.obj()));
+ return builder->Append(
+ reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(utf8_obj.obj())), length);
+}
+} // namespace
+// Convert a NumPy bytes (NPY_STRING) or unicode (NPY_UNICODE) array into Arrow
+// UTF-8 string chunks.
+//
+// - Bytes cells are validated as UTF-8 and appended using the full itemsize_;
+//   invalid data yields Status::Invalid with a hex dump of the cell.
+// - Unicode cells are converted through AppendUTF32().
+// - With from_pandas_ set, a float array in which every value is null is
+//   accepted and produces an all-null string array (NullArray cast to utf8).
+// - Any other dtype yields a TypeError naming the dtype.
+// When a mask is present, entries whose mask byte is non-zero become nulls.
+// The builder may emit several chunks; each one is passed to PushArray().
+Status NumPyConverter::Visit(const StringType& type) {
+  util::InitializeUTF8();
+  ::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);
+  auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+  char numpy_byteorder = dtype_->byteorder;
+  // For Python C API, -1 is little-endian, 1 is big-endian
+  int byteorder = numpy_byteorder == '>' ? 1 : -1;
+  PyAcquireGIL gil_lock;
+  const bool is_binary_type = dtype_->type_num == NPY_STRING;
+  const bool is_unicode_type = dtype_->type_num == NPY_UNICODE;
+  if (!is_binary_type && !is_unicode_type) {
+    const bool is_float_type = dtype_->kind == 'f';
+    if (from_pandas_ && is_float_type) {
+      // in case of from_pandas=True, accept an all-NaN float array as input
+      RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_,
+                                                 &null_count_));
+      if (null_count_ == length_) {
+        auto arr = std::make_shared<NullArray>(length_);
+        compute::ExecContext context(pool_);
+        // Cast returns a Result; unwrap it into `out` or propagate the error.
+        ARROW_ASSIGN_OR_RAISE(
+            std::shared_ptr<Array> out,
+            compute::Cast(*arr, arrow::utf8(), cast_options_, &context));
+        out_arrays_.emplace_back(out);
+        return Status::OK();
+      }
+    }
+    std::string dtype_string;
+    RETURN_NOT_OK(internal::PyObject_StdStringStr(reinterpret_cast<PyObject*>(dtype_),
+                                                  &dtype_string));
+    return Status::TypeError("Expected a string or bytes dtype, got ", dtype_string);
+  }
+  // Appends one non-null cell, dispatching on the NumPy dtype.
+  auto AppendNonNullValue = [&](const uint8_t* value) {
+    if (is_binary_type) {
+      if (ARROW_PREDICT_TRUE(util::ValidateUTF8(value, itemsize_))) {
+        return builder.Append(value, itemsize_);
+      } else {
+        return Status::Invalid("Encountered non-UTF8 binary value: ",
+                               HexEncode(value, itemsize_));
+      }
+    } else {
+      // is_unicode_type case
+      return AppendUTF32(reinterpret_cast<const char*>(value), itemsize_, byteorder,
+                         &builder);
+    }
+  };
+  if (mask_ != nullptr) {
+    Ndarray1DIndexer<uint8_t> mask_values(mask_);
+    for (int64_t i = 0; i < length_; ++i) {
+      if (mask_values[i]) {
+        RETURN_NOT_OK(builder.AppendNull());
+      } else {
+        RETURN_NOT_OK(AppendNonNullValue(data));
+      }
+      data += stride_;
+    }
+  } else {
+    for (int64_t i = 0; i < length_; ++i) {
+      RETURN_NOT_OK(AppendNonNullValue(data));
+      data += stride_;
+    }
+  }
+  ArrayVector result;
+  RETURN_NOT_OK(builder.Finish(&result));
+  // Bind by reference: copying each shared_ptr would only add refcount traffic.
+  for (const auto& arr : result) {
+    RETURN_NOT_OK(PushArray(arr->data()));
+  }
+  return Status::OK();
+}
+// Convert a NumPy structured array into Arrow struct array chunks.
+//
+// One child NumPyConverter is created per field of the Arrow struct type, each
+// reading the field view returned by PyArray_GetField(). The optional mask is
+// turned into the struct-level validity bitmap and carried along as a
+// BooleanArray so that it is rechunked together with the children. One struct
+// ArrayData is then assembled per chunk and passed to PushArray().
+//
+// Returns TypeError if the dtype has no fields dictionary, and Invalid if a
+// field named by the Arrow type is missing from the dtype or its view cannot
+// be extracted.
+Status NumPyConverter::Visit(const StructType& type) {
+  std::vector<NumPyConverter> sub_converters;
+  std::vector<OwnedRefNoGIL> sub_arrays;
+  {
+    PyAcquireGIL gil_lock;
+    // Create converters for each struct type field
+    if (dtype_->fields == NULL || !PyDict_Check(dtype_->fields)) {
+      return Status::TypeError("Expected struct array");
+    }
+    for (const auto& field : type.fields()) {
+      PyObject* tup = PyDict_GetItemString(dtype_->fields, field->name().c_str());
+      if (tup == NULL) {
+        return Status::Invalid("Missing field '", field->name(), "' in struct array");
+      }
+      PyArray_Descr* sub_dtype =
+          reinterpret_cast<PyArray_Descr*>(PyTuple_GET_ITEM(tup, 0));
+      DCHECK(PyObject_TypeCheck(sub_dtype, &PyArrayDescr_Type));
+      int offset = static_cast<int>(PyLong_AsLong(PyTuple_GET_ITEM(tup, 1)));
+      Py_INCREF(sub_dtype); /* PyArray_GetField() steals ref */
+      PyObject* sub_array = PyArray_GetField(arr_, sub_dtype, offset);
+      if (sub_array == NULL) {
+        // Never build a child converter around a null array object.
+        PyErr_Clear();
+        return Status::Invalid("Failed to extract field '", field->name(),
+                               "' from struct array");
+      }
+      sub_arrays.emplace_back(sub_array);
+      sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(),
+                                  from_pandas_);
+    }
+  }
+  std::vector<ArrayVector> groups;
+  int64_t null_count = 0;
+  // Compute null bitmap and store it as a Boolean Array to include it
+  // in the rechunking below
+  {
+    if (mask_ != nullptr) {
+      RETURN_NOT_OK(InitNullBitmap());
+      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
+    }
+    groups.push_back({std::make_shared<BooleanArray>(length_, null_bitmap_)});
+  }
+  // Convert child data
+  for (auto& converter : sub_converters) {
+    RETURN_NOT_OK(converter.Convert());
+    groups.push_back(converter.result());
+  }
+  // Ensure the different array groups are chunked consistently
+  groups = ::arrow::internal::RechunkArraysConsistently(groups);
+  // Make struct array chunks by combining groups
+  size_t ngroups = groups.size();
+  size_t nchunks = groups[0].size();
+  for (size_t chunk = 0; chunk < nchunks; chunk++) {
+    // First group has the null bitmaps as Boolean Arrays
+    const auto& null_data = groups[0][chunk]->data();
+    DCHECK_EQ(null_data->type->id(), Type::BOOL);
+    DCHECK_EQ(null_data->buffers.size(), 2);
+    const auto& null_buffer = null_data->buffers[1];
+    // Careful: the rechunked null bitmap may have a non-zero offset
+    // to its buffer, and it may not even start on a byte boundary
+    int64_t null_offset = null_data->offset;
+    std::shared_ptr<Buffer> fixed_null_buffer;
+    if (!null_buffer) {
+      fixed_null_buffer = null_buffer;
+    } else if (null_offset % 8 == 0) {
+      // Byte-aligned: a slice of the existing buffer is enough.
+      fixed_null_buffer =
+          std::make_shared<Buffer>(null_buffer,
+                                   // byte offset
+                                   null_offset / 8,
+                                   // byte size
+                                   BitUtil::BytesForBits(null_data->length));
+    } else {
+      // Not byte-aligned: copy the bits into a fresh buffer.
+      ARROW_ASSIGN_OR_RAISE(
+          fixed_null_buffer,
+          CopyBitmap(pool_, null_buffer->data(), null_offset, null_data->length));
+    }
+    // Create struct array chunk and populate it
+    auto arr_data =
+        ArrayData::Make(type_, null_data->length, null_count ? kUnknownNullCount : 0, 0);
+    arr_data->buffers.push_back(fixed_null_buffer);
+    // Append child chunks
+    for (size_t i = 1; i < ngroups; i++) {
+      arr_data->child_data.push_back(groups[i][chunk]->data());
+    }
+    RETURN_NOT_OK(PushArray(arr_data));
+  }
+  return Status::OK();
+}
+// Convert a 1-dimensional NumPy array `ao` (with optional null mask `mo`) into
+// a ChunkedArray using the given cast options.
+//
+// Returns TypeError if `ao` is not a NumPy array and Invalid if it is not
+// 1-dimensional; otherwise propagates the status of NumPyConverter::Convert().
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      const compute::CastOptions& cast_options,
+                      std::shared_ptr<ChunkedArray>* out) {
+  if (!PyArray_Check(ao)) {
+    // This code path cannot be reached by Python unit tests currently so this
+    // is only a sanity check.
+    return Status::TypeError("Input object was not a NumPy array");
+  }
+  if (PyArray_NDIM(reinterpret_cast<PyArrayObject*>(ao)) != 1) {
+    return Status::Invalid("only handle 1-dimensional arrays");
+  }
+  NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
+  RETURN_NOT_OK(converter.Convert());
+  const auto& output_arrays = converter.result();
+  DCHECK_GT(output_arrays.size(), 0);
+  *out = std::make_shared<ChunkedArray>(output_arrays);
+  return Status::OK();
+}
+// Convenience overload: same conversion with default-constructed
+// compute::CastOptions.
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out) {
+  return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
+}
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_to_arrow.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_to_arrow.h
new file mode 100644
index 0000000000..b6cd093e55
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/numpy_to_arrow.h
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Converting from pandas memory representation to Arrow data structures
+#pragma once
+#include "arrow/python/platform.h"
+#include <memory>
+#include "arrow/compute/api.h"
+#include "arrow/python/visibility.h"
+namespace arrow {
+class Array;
+class ChunkedArray;
+class DataType;
+class MemoryPool;
+class Status;
+namespace py {
+/// Convert a 1-dimensional NumPy array to Arrow. If the target data type is
+/// not known, pass a null `type` pointer.
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data (must be 1-dimensional)
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] from_pandas If true, use pandas's null sentinels to determine
+/// whether values are null
+/// \param[in] type a specific type to cast to, may be null
+/// \param[in] cast_options casting options
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+ const std::shared_ptr<DataType>& type,
+ const compute::CastOptions& cast_options,
+ std::shared_ptr<ChunkedArray>* out);
+/// Convert a 1-dimensional NumPy array to Arrow with default (safe) cast
+/// options. If the target data type is not known, pass a null `type` pointer.
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data (must be 1-dimensional)
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] from_pandas If true, use pandas's null sentinels to determine
+/// whether values are null
+/// \param[in] type a specific type to cast to, may be null
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+ const std::shared_ptr<DataType>& type,
+ std::shared_ptr<ChunkedArray>* out);
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/platform.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/platform.h
new file mode 100644
index 0000000000..80f7e60813
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/platform.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Functions for converting between pandas's NumPy-based data representation
+// and Arrow data structures
+#pragma once
+// If PY_SSIZE_T_CLEAN is defined, argument parsing functions treat #-specifier
+// to mean Py_ssize_t (defining this to suppress deprecation warning)
+#include <Python.h> // IWYU pragma: export
+#include <datetime.h>
+// Work around C2528 error
+#ifdef _MSC_VER
+#if _MSC_VER >= 1900
+#undef timezone
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow.cc
new file mode 100644
index 0000000000..bea35ff3b6
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow.cc
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/pyarrow.h"
+#include <memory>
+#include <utility>
+#include "arrow/array.h"
+#include "arrow/table.h"
+#include "arrow/tensor.h"
+#include "arrow/type.h"
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+namespace {
+#include "arrow/python/pyarrow_api.h"
+namespace arrow {
+namespace py {
+// Build the TypeError reported when `obj` could not be unwrapped as
+// `expected_type`; the message includes the Python type name of `obj`.
+static Status UnwrapError(PyObject* obj, const char* expected_type) {
+  return Status::TypeError("Could not unwrap ", expected_type,
+                           " from Python object of type '", Py_TYPE(obj)->tp_name, "'");
+}
+// Calls internal::InitDatetime() and then imports the pyarrow.lib C API table.
+// Returns the result of ::import_pyarrow__lib().
+int import_pyarrow() {
+  internal::InitDatetime();
+  return ::import_pyarrow__lib();
+}
+// Defines the is_/wrap_/unwrap_ helpers for one wrapped type by forwarding to
+// the matching ::pyarrow_* entry points. FUNC_SUFFIX is the function name
+// suffix, TYPE_NAME the C++ type held in the shared_ptr. The Result-returning
+// unwrap_ yields UnwrapError() when ::pyarrow_unwrap_* returns an empty pointer;
+// the Status-returning overload forwards to it.
+#define DEFINE_WRAP_FUNCTIONS(FUNC_SUFFIX, TYPE_NAME) \
+  bool is_##FUNC_SUFFIX(PyObject* obj) { return ::pyarrow_is_##FUNC_SUFFIX(obj) != 0; } \
+  \
+  PyObject* wrap_##FUNC_SUFFIX(const std::shared_ptr<TYPE_NAME>& src) { \
+    return ::pyarrow_wrap_##FUNC_SUFFIX(src); \
+  } \
+  Result<std::shared_ptr<TYPE_NAME>> unwrap_##FUNC_SUFFIX(PyObject* obj) { \
+    auto out = ::pyarrow_unwrap_##FUNC_SUFFIX(obj); \
+    if (out) { \
+      return std::move(out); \
+    } else { \
+      return UnwrapError(obj, #TYPE_NAME); \
+    } \
+  } \
+  Status unwrap_##FUNC_SUFFIX(PyObject* obj, std::shared_ptr<TYPE_NAME>* out) { \
+    return unwrap_##FUNC_SUFFIX(obj).Value(out); \
+  }
+DEFINE_WRAP_FUNCTIONS(data_type, DataType)
+DEFINE_WRAP_FUNCTIONS(chunked_array, ChunkedArray)
+DEFINE_WRAP_FUNCTIONS(sparse_coo_tensor, SparseCOOTensor)
+DEFINE_WRAP_FUNCTIONS(sparse_csc_matrix, SparseCSCMatrix)
+DEFINE_WRAP_FUNCTIONS(sparse_csf_tensor, SparseCSFTensor)
+DEFINE_WRAP_FUNCTIONS(sparse_csr_matrix, SparseCSRMatrix)
+DEFINE_WRAP_FUNCTIONS(batch, RecordBatch)
+namespace internal {
+int check_status(const Status& status) { return ::pyarrow_internal_check_status(status); }  // Forwards to the pyarrow_internal_check_status entry point imported from pyarrow.lib.
+} // namespace internal
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow.h
new file mode 100644
index 0000000000..8056e700a0
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow.h
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include "arrow/python/platform.h"
+#include <memory>
+#include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"
+// Work around ARROW-2317 (C linkage warning from Cython)
+extern "C++" {
+namespace arrow {
+class Array;
+class Buffer;
+class DataType;
+class Field;
+class RecordBatch;
+class Schema;
+class Status;
+class Table;
+class Tensor;
+namespace py {
+// Returns 0 on success, -1 on error.
+ARROW_PYTHON_EXPORT int import_pyarrow();
+ ARROW_PYTHON_EXPORT Result<std::shared_ptr<TYPE_NAME>> unwrap_##FUNC_SUFFIX( \
+ PyObject*); \
+ ARROW_PYTHON_EXPORT PyObject* wrap_##FUNC_SUFFIX(const std::shared_ptr<TYPE_NAME>&); \
+ ARROW_DEPRECATED("Use Result-returning version") \
+ ARROW_PYTHON_EXPORT Status unwrap_##FUNC_SUFFIX(PyObject*, \
+ std::shared_ptr<TYPE_NAME>* out);
+DECLARE_WRAP_FUNCTIONS(data_type, DataType)
+DECLARE_WRAP_FUNCTIONS(chunked_array, ChunkedArray)
+DECLARE_WRAP_FUNCTIONS(sparse_coo_tensor, SparseCOOTensor)
+DECLARE_WRAP_FUNCTIONS(sparse_csc_matrix, SparseCSCMatrix)
+DECLARE_WRAP_FUNCTIONS(sparse_csf_tensor, SparseCSFTensor)
+DECLARE_WRAP_FUNCTIONS(sparse_csr_matrix, SparseCSRMatrix)
+namespace internal {
+ARROW_PYTHON_EXPORT int check_status(const Status& status);  // Defined in pyarrow.cc, where it forwards to ::pyarrow_internal_check_status.
+} // namespace internal
+} // namespace py
+} // namespace arrow
+} // extern "C++"
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow_api.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow_api.h
new file mode 100644
index 0000000000..9474312002
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow_api.h
@@ -0,0 +1,239 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// DO NOT EDIT THIS FILE. Update from pyarrow/lib_api.h after pyarrow build
+// This is used to be able to call back into Cython code from C++.
+/* Generated by Cython 0.29.15 */
+#ifndef __PYX_HAVE_API__pyarrow__lib
+#define __PYX_HAVE_API__pyarrow__lib
+#ifdef __MINGW64__
+#define MS_WIN64
+#include "Python.h"
+#include "pyarrow_lib.h"
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar)(std::shared_ptr< arrow::Scalar> const &) = 0;
+#define pyarrow_wrap_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array)(std::shared_ptr< arrow::Array> const &) = 0;
+#define pyarrow_wrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array)(std::shared_ptr< arrow::ChunkedArray> const &) = 0;
+#define pyarrow_wrap_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch)(std::shared_ptr< arrow::RecordBatch> const &) = 0;
+#define pyarrow_wrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer)(std::shared_ptr< arrow::Buffer> const &) = 0;
+#define pyarrow_wrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type)(std::shared_ptr< arrow::DataType> const &) = 0;
+#define pyarrow_wrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field)(std::shared_ptr< arrow::Field> const &) = 0;
+#define pyarrow_wrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer)(std::shared_ptr< arrow::ResizableBuffer> const &) = 0;
+#define pyarrow_wrap_resizable_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema)(std::shared_ptr< arrow::Schema> const &) = 0;
+#define pyarrow_wrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table)(std::shared_ptr< arrow::Table> const &) = 0;
+#define pyarrow_wrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor)(std::shared_ptr< arrow::Tensor> const &) = 0;
+#define pyarrow_wrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor)(std::shared_ptr< arrow::SparseCOOTensor> const &) = 0;
+#define pyarrow_wrap_sparse_coo_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix)(std::shared_ptr< arrow::SparseCSRMatrix> const &) = 0;
+#define pyarrow_wrap_sparse_csr_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix)(std::shared_ptr< arrow::SparseCSCMatrix> const &) = 0;
+#define pyarrow_wrap_sparse_csc_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor)(std::shared_ptr< arrow::SparseCSFTensor> const &) = 0;
+#define pyarrow_wrap_sparse_csf_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor
+static std::shared_ptr< arrow::Scalar> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_scalar)(PyObject *) = 0;
+#define pyarrow_unwrap_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_scalar
+static std::shared_ptr< arrow::Array> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array)(PyObject *) = 0;
+#define pyarrow_unwrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array
+static std::shared_ptr< arrow::ChunkedArray> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array)(PyObject *) = 0;
+#define pyarrow_unwrap_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array
+static std::shared_ptr< arrow::RecordBatch> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch)(PyObject *) = 0;
+#define pyarrow_unwrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch
+static std::shared_ptr< arrow::Buffer> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer)(PyObject *) = 0;
+#define pyarrow_unwrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer
+static std::shared_ptr< arrow::DataType> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type)(PyObject *) = 0;
+#define pyarrow_unwrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type
+static std::shared_ptr< arrow::Field> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field)(PyObject *) = 0;
+#define pyarrow_unwrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field
+static std::shared_ptr< arrow::Schema> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema)(PyObject *) = 0;
+#define pyarrow_unwrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema
+static std::shared_ptr< arrow::Table> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table)(PyObject *) = 0;
+#define pyarrow_unwrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table
+static std::shared_ptr< arrow::Tensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor)(PyObject *) = 0;
+#define pyarrow_unwrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor
+static std::shared_ptr< arrow::SparseCOOTensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_coo_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor
+static std::shared_ptr< arrow::SparseCSRMatrix> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_csr_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix
+static std::shared_ptr< arrow::SparseCSCMatrix> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_csc_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix
+static std::shared_ptr< arrow::SparseCSFTensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_csf_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status)(arrow::Status const &) = 0;
+#define pyarrow_internal_check_status __pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer)(PyObject *) = 0;
+#define pyarrow_is_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type)(PyObject *) = 0;
+#define pyarrow_is_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_metadata)(PyObject *) = 0;
+#define pyarrow_is_metadata __pyx_api_f_7pyarrow_3lib_pyarrow_is_metadata
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_field)(PyObject *) = 0;
+#define pyarrow_is_field __pyx_api_f_7pyarrow_3lib_pyarrow_is_field
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema)(PyObject *) = 0;
+#define pyarrow_is_schema __pyx_api_f_7pyarrow_3lib_pyarrow_is_schema
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_array)(PyObject *) = 0;
+#define pyarrow_is_array __pyx_api_f_7pyarrow_3lib_pyarrow_is_array
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_chunked_array)(PyObject *) = 0;
+#define pyarrow_is_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_is_chunked_array
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_scalar)(PyObject *) = 0;
+#define pyarrow_is_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_is_scalar
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor)(PyObject *) = 0;
+#define pyarrow_is_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_coo_tensor)(PyObject *) = 0;
+#define pyarrow_is_sparse_coo_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_coo_tensor
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csr_matrix)(PyObject *) = 0;
+#define pyarrow_is_sparse_csr_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csr_matrix
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csc_matrix)(PyObject *) = 0;
+#define pyarrow_is_sparse_csc_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csc_matrix
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csf_tensor)(PyObject *) = 0;
+#define pyarrow_is_sparse_csf_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csf_tensor
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_table)(PyObject *) = 0;
+#define pyarrow_is_table __pyx_api_f_7pyarrow_3lib_pyarrow_is_table
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch)(PyObject *) = 0;
+#define pyarrow_is_batch __pyx_api_f_7pyarrow_3lib_pyarrow_is_batch
+#if !defined(__Pyx_PyIdentifier_FromString)
+ #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s)
+ #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s)
+#ifndef __PYX_HAVE_RT_ImportFunction
+#define __PYX_HAVE_RT_ImportFunction
+static int __Pyx_ImportFunction(PyObject *module, const char *funcname, void (**f)(void), const char *sig) { /* Looks up `funcname` in the module's __pyx_capi__ dict, checks it against `sig`, and stores the function pointer in *f. */
+ PyObject *d = 0;
+ PyObject *cobj = 0;
+ union { /* used to convert the object pointer read below into a function pointer */
+ void (*fp)(void);
+ void *p;
+ } tmp;
+ d = PyObject_GetAttrString(module, (char *)"__pyx_capi__");
+ if (!d)
+ goto bad; /* NOTE(review): no `bad:` label is visible in this rendering; confirm against the generated header */
+ cobj = PyDict_GetItemString(d, funcname);
+ if (!cobj) {
+ PyErr_Format(PyExc_ImportError,
+ "%.200s does not export expected C function %.200s",
+ PyModule_GetName(module), funcname);
+ goto bad;
+ }
+#if PY_VERSION_HEX >= 0x02070000
+ if (!PyCapsule_IsValid(cobj, sig)) { /* capsule path: the capsule must be valid for `sig` */
+ PyErr_Format(PyExc_TypeError,
+ "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)",
+ PyModule_GetName(module), funcname, sig, PyCapsule_GetName(cobj));
+ goto bad;
+ }
+ tmp.p = PyCapsule_GetPointer(cobj, sig);
+ {const char *desc, *s1, *s2; /* NOTE(review): PyCObject path; presumably the #else branch of the #if above, whose #else/#endif are not visible here */
+ desc = (const char *)PyCObject_GetDesc(cobj);
+ if (!desc)
+ goto bad;
+ s1 = desc; s2 = sig;
+ while (*s1 != '\0' && *s1 == *s2) { s1++; s2++; } /* walk both strings while they match */
+ if (*s1 != *s2) { /* descriptor and expected signature differ */
+ PyErr_Format(PyExc_TypeError,
+ "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)",
+ PyModule_GetName(module), funcname, sig, desc);
+ goto bad;
+ }
+ tmp.p = PyCObject_AsVoidPtr(cobj);}
+ *f = tmp.fp;
+ if (!(*f))
+ goto bad;
+ Py_DECREF(d);
+ return 0; /* success */
+ Py_XDECREF(d); /* NOTE(review): presumably the target of `goto bad`; as rendered these two lines follow `return 0` with no label */
+ return -1; /* failure */
+static int import_pyarrow__lib(void) {
+ PyObject *module = 0;
+ module = PyImport_ImportModule("pyarrow.lib");
+ if (!module) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar, "PyObject *(std::shared_ptr< arrow::Scalar> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array, "PyObject *(std::shared_ptr< arrow::Array> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array, "PyObject *(std::shared_ptr< arrow::ChunkedArray> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch, "PyObject *(std::shared_ptr< arrow::RecordBatch> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer, "PyObject *(std::shared_ptr< arrow::Buffer> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type, "PyObject *(std::shared_ptr< arrow::DataType> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field, "PyObject *(std::shared_ptr< arrow::Field> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_resizable_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer, "PyObject *(std::shared_ptr< arrow::ResizableBuffer> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema, "PyObject *(std::shared_ptr< arrow::Schema> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table, "PyObject *(std::shared_ptr< arrow::Table> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor, "PyObject *(std::shared_ptr< arrow::Tensor> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_sparse_coo_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor, "PyObject *(std::shared_ptr< arrow::SparseCOOTensor> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_sparse_csr_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix, "PyObject *(std::shared_ptr< arrow::SparseCSRMatrix> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_sparse_csc_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix, "PyObject *(std::shared_ptr< arrow::SparseCSCMatrix> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_wrap_sparse_csf_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor, "PyObject *(std::shared_ptr< arrow::SparseCSFTensor> const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_scalar, "std::shared_ptr< arrow::Scalar> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array, "std::shared_ptr< arrow::Array> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array, "std::shared_ptr< arrow::ChunkedArray> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch, "std::shared_ptr< arrow::RecordBatch> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer, "std::shared_ptr< arrow::Buffer> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type, "std::shared_ptr< arrow::DataType> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field, "std::shared_ptr< arrow::Field> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema, "std::shared_ptr< arrow::Schema> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table, "std::shared_ptr< arrow::Table> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor, "std::shared_ptr< arrow::Tensor> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_sparse_coo_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor, "std::shared_ptr< arrow::SparseCOOTensor> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_sparse_csr_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix, "std::shared_ptr< arrow::SparseCSRMatrix> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_sparse_csc_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix, "std::shared_ptr< arrow::SparseCSCMatrix> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_unwrap_sparse_csf_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor, "std::shared_ptr< arrow::SparseCSFTensor> (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_internal_check_status", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status, "int (arrow::Status const &)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_metadata", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_metadata, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_field, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_array, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_chunked_array, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_scalar, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_coo_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_coo_tensor, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_csr_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csr_matrix, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_csc_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csc_matrix, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_csf_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csf_tensor, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_table, "int (PyObject *)") < 0) goto bad;
+ if (__Pyx_ImportFunction(module, "pyarrow_is_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch, "int (PyObject *)") < 0) goto bad;
+ Py_DECREF(module); module = 0;
+ return 0;
+ bad:
+ Py_XDECREF(module);
+ return -1;
+#endif /* !__PYX_HAVE_API__pyarrow__lib */
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow_lib.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow_lib.h
new file mode 100644
index 0000000000..fa59414474
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/pyarrow_lib.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// DO NOT EDIT THIS FILE. Update from pyarrow/lib.h after pyarrow build
+/* Generated by Cython 0.29.15 */
+#ifndef __PYX_HAVE__pyarrow__lib
+#define __PYX_HAVE__pyarrow__lib
+#include "Python.h"
+#ifndef __PYX_HAVE_API__pyarrow__lib
+/* Linkage specifier for the exported functions: extern "C" when compiled as C++. */
+#ifndef __PYX_EXTERN_C
+  #ifdef __cplusplus
+    #define __PYX_EXTERN_C extern "C"
+  #else
+    #define __PYX_EXTERN_C extern
+  #endif
+#endif
+#ifndef DL_IMPORT
+  #define DL_IMPORT(_T) _T
+#endif
+/* Wrappers: build a Python object from a shared_ptr to an Arrow C++ object. */
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_scalar(std::shared_ptr< arrow::Scalar> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_array(std::shared_ptr< arrow::Array> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_chunked_array(std::shared_ptr< arrow::ChunkedArray> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_batch(std::shared_ptr< arrow::RecordBatch> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_buffer(std::shared_ptr< arrow::Buffer> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_data_type(std::shared_ptr< arrow::DataType> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_field(std::shared_ptr< arrow::Field> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer(std::shared_ptr< arrow::ResizableBuffer> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_schema(std::shared_ptr< arrow::Schema> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_table(std::shared_ptr< arrow::Table> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_tensor(std::shared_ptr< arrow::Tensor> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor(std::shared_ptr< arrow::SparseCOOTensor> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix(std::shared_ptr< arrow::SparseCSRMatrix> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix(std::shared_ptr< arrow::SparseCSCMatrix> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor(std::shared_ptr< arrow::SparseCSFTensor> const &);
+/* Unwrappers: recover the shared_ptr to the Arrow C++ object from a Python object. */
+__PYX_EXTERN_C std::shared_ptr< arrow::Scalar> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_scalar(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Array> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_array(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::ChunkedArray> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::RecordBatch> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_batch(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Buffer> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_buffer(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::DataType> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_data_type(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Field> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_field(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Schema> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_schema(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Table> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_table(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Tensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_tensor(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseCOOTensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseCSRMatrix> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseCSCMatrix> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseCSFTensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor(PyObject *);
+#endif /* !__PYX_HAVE_API__pyarrow__lib */
+/* WARNING: the interface of the module init function changed in CPython 3.5. */
+/* It now returns a PyModuleDef instance instead of a PyModule instance. */
+/* Only the entry point matching the targeted Python major version is declared. */
+#if PY_MAJOR_VERSION < 3
+PyMODINIT_FUNC initlib(void);
+#else
+PyMODINIT_FUNC PyInit_lib(void);
+#endif
+#endif /* !__PYX_HAVE__pyarrow__lib */
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/python_to_arrow.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/python_to_arrow.cc
new file mode 100644
index 0000000000..521249fd54
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/python_to_arrow.cc
@@ -0,0 +1,1041 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/python_to_arrow.h"
+#include "arrow/python/numpy_interop.h"
+#include <datetime.h>
+#include <algorithm>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_decimal.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/converter.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/python/datetime.h"
+#include "arrow/python/decimal.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/inference.h"
+#include "arrow/python/iterators.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/type_traits.h"
+#include "arrow/visitor_inline.h"
+namespace arrow {
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+using internal::Converter;
+using internal::DictionaryConverter;
+using internal::ListConverter;
+using internal::PrimitiveConverter;
+using internal::StructConverter;
+using internal::MakeChunker;
+using internal::MakeConverter;
+namespace py {
+// Utility for converting single python objects to their intermediate C representations
+// which can be fed to the typed builders
+class PyValue {
+ public:
+ // Type aliases for shorter signature definitions
+ using I = PyObject*;
+ using O = PyConversionOptions;
+ // Null test applied to a Python object before any conversion is attempted.
+ // With options.from_pandas set the decision is delegated to
+ // internal::PandasObjectIsNull; otherwise only Py_None is treated as null.
+ static bool IsNull(const O& options, I obj) {
+   return options.from_pandas ? internal::PandasObjectIsNull(obj) : (obj == Py_None);
+ }
+ // Post-conversion check: reports whether an already converted timestamp value
+ // is the null sentinel according to the NPY_DATETIME numpy traits.
+ static bool IsNaT(const TimestampType*, int64_t value) {
+   using DatetimeTraits = internal::npy_traits<NPY_DATETIME>;
+   return DatetimeTraits::isnull(value);
+ }
+ // Post-conversion check: reports whether an already converted duration value
+ // is the null sentinel according to the NPY_TIMEDELTA numpy traits.
+ static bool IsNaT(const DurationType*, int64_t value) {
+   using TimedeltaTraits = internal::npy_traits<NPY_TIMEDELTA>;
+   return TimedeltaTraits::isnull(value);
+ }
+ // Null type conversion: Py_None yields nullptr, any other object is rejected
+ // with Status::Invalid.
+ static Result<std::nullptr_t> Convert(const NullType*, const O&, I obj) {
+   if (obj != Py_None) {
+     return Status::Invalid("Invalid null value");
+   }
+   return nullptr;
+ }
+ // Boolean conversion. Accepts the Py_True / Py_False singletons and numpy
+ // bool scalars; every other object produces an InvalidValue error.
+ static Result<bool> Convert(const BooleanType*, const O&, I obj) {
+   if (obj == Py_True) {
+     return true;
+   }
+   if (obj == Py_False) {
+     return false;
+   }
+   if (PyArray_IsScalar(obj, Bool)) {
+     const auto* np_bool = reinterpret_cast<PyBoolScalarObject*>(obj);
+     return np_bool->obval == NPY_TRUE;
+   }
+   return internal::InvalidValue(obj, "tried to convert to boolean");
+ }
+ // Integer conversion for every Arrow integer type T. The value is parsed with
+ // internal::CIntFromPython. On failure, a non-integer input is reported as an
+ // InvalidValue, while an integer input propagates the original status.
+ template <typename T>
+ static enable_if_integer<T, Result<typename T::c_type>> Convert(const T*, const O&,
+                                                                 I obj) {
+   typename T::c_type converted;
+   auto conversion = internal::CIntFromPython(obj, &converted);
+   if (ARROW_PREDICT_FALSE(!conversion.ok())) {
+     if (!internal::PyIntScalar_Check(obj)) {
+       return internal::InvalidValue(obj, "tried to convert to int");
+     }
+     return conversion;
+   }
+   return converted;
+ }
+ // Half-float conversion: the value is produced by PyFloat_AsHalf as a uint16_t;
+ // a non-OK status from that helper is returned to the caller.
+ static Result<uint16_t> Convert(const HalfFloatType*, const O&, I obj) {
+ uint16_t value;
+ RETURN_NOT_OK(PyFloat_AsHalf(obj, &value));
+ return value;
+ }
+ // float32 conversion. Float-like scalars go through PyFloat_AsDouble and are
+ // narrowed to float; integer scalars use the lossless-checked
+ // IntegerScalarToFloat32Safe helper; anything else is an InvalidValue.
+ static Result<float> Convert(const FloatType*, const O&, I obj) {
+   float value;
+   if (internal::PyFloatScalar_Check(obj)) {
+     value = static_cast<float>(PyFloat_AsDouble(obj));
+     // PyFloat_AsDouble signals failure through the Python error indicator,
+     // so surface it instead of returning the -1.0 placeholder.
+     RETURN_IF_PYERROR();
+   } else if (internal::PyIntScalar_Check(obj)) {
+     RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &value));
+   } else {
+     return internal::InvalidValue(obj, "tried to convert to float32");
+   }
+   return value;
+ }
+ // float64 conversion. Python floats are read directly, other float-like
+ // scalars go through PyFloat_AsDouble, integer scalars use
+ // IntegerScalarToDoubleSafe; anything else is an InvalidValue.
+ static Result<double> Convert(const DoubleType*, const O&, I obj) {
+   double value;
+   if (PyFloat_Check(obj)) {
+     value = PyFloat_AS_DOUBLE(obj);
+   } else if (internal::PyFloatScalar_Check(obj)) {
+     // Other kinds of float-y things
+     value = PyFloat_AsDouble(obj);
+     // PyFloat_AsDouble signals failure through the Python error indicator,
+     // so surface it instead of returning the -1.0 placeholder.
+     RETURN_IF_PYERROR();
+   } else if (internal::PyIntScalar_Check(obj)) {
+     RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &value));
+   } else {
+     return internal::InvalidValue(obj, "tried to convert to double");
+   }
+   return value;
+ }
+ // Decimal128 conversion: delegates to internal::DecimalFromPyObject, passing the
+ // target decimal type; a non-OK status is returned to the caller.
+ static Result<Decimal128> Convert(const Decimal128Type* type, const O&, I obj) {
+ Decimal128 value;
+ RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value));
+ return value;
+ }
+ // Decimal256 conversion: delegates to internal::DecimalFromPyObject, passing the
+ // target decimal type; a non-OK status is returned to the caller.
+ static Result<Decimal256> Convert(const Decimal256Type* type, const O&, I obj) {
+ Decimal256 value;
+ RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value));
+ return value;
+ }
+ // date32 conversion. A Python date object is converted with
+ // internal::PyDate_to_days; any other object is parsed as an integer, and a
+ // failed parse is returned to the caller instead of being discarded.
+ static Result<int32_t> Convert(const Date32Type*, const O&, I obj) {
+   int32_t value;
+   if (PyDate_Check(obj)) {
+     auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
+     value = static_cast<int32_t>(internal::PyDate_to_days(pydate));
+   } else {
+     RETURN_NOT_OK(
+         internal::CIntFromPython(obj, &value, "Integer too large for date32"));
+   }
+   return value;
+ }
+ // date64 conversion. A datetime is converted to milliseconds and truncated to
+ // a whole day, a plain date is converted with internal::PyDate_to_ms, and any
+ // other object is parsed as an integer (a failed parse is returned, not
+ // discarded).
+ static Result<int64_t> Convert(const Date64Type*, const O&, I obj) {
+   int64_t value;
+   if (PyDateTime_Check(obj)) {
+     auto pydate = reinterpret_cast<PyDateTime_DateTime*>(obj);
+     value = internal::PyDateTime_to_ms(pydate);
+     // Truncate any intraday milliseconds
+     // TODO: introduce an option for this
+     value -= value % 86400000LL;
+   } else if (PyDate_Check(obj)) {
+     auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
+     value = internal::PyDate_to_ms(pydate);
+   } else {
+     RETURN_NOT_OK(
+         internal::CIntFromPython(obj, &value, "Integer too large for date64"));
+   }
+   return value;
+ }
+ // time32 conversion. Non-time objects are parsed as plain integers; time
+ // objects are converted according to the unit of the target type, which must
+ // be SECOND or MILLI.
+ static Result<int32_t> Convert(const Time32Type* type, const O&, I obj) {
+   int32_t result;
+   if (!PyTime_Check(obj)) {
+     RETURN_NOT_OK(internal::CIntFromPython(obj, &result, "Integer too large for int32"));
+     return result;
+   }
+   const auto unit = type->unit();
+   if (unit == TimeUnit::SECOND) {
+     result = static_cast<int32_t>(internal::PyTime_to_s(obj));
+   } else if (unit == TimeUnit::MILLI) {
+     result = static_cast<int32_t>(internal::PyTime_to_ms(obj));
+   } else {
+     return Status::UnknownError("Invalid time unit");
+   }
+   return result;
+ }
+ // time64 conversion. Non-time objects are parsed as plain integers; time
+ // objects are converted according to the unit of the target type, which must
+ // be MICRO or NANO.
+ static Result<int64_t> Convert(const Time64Type* type, const O&, I obj) {
+   int64_t result;
+   if (!PyTime_Check(obj)) {
+     RETURN_NOT_OK(internal::CIntFromPython(obj, &result, "Integer too large for int64"));
+     return result;
+   }
+   const auto unit = type->unit();
+   if (unit == TimeUnit::MICRO) {
+     result = internal::PyTime_to_us(obj);
+   } else if (unit == TimeUnit::NANO) {
+     result = internal::PyTime_to_ns(obj);
+   } else {
+     return Status::UnknownError("Invalid time unit");
+   }
+   return result;
+ }
+ // Timestamp conversion. Three kinds of input are handled:
+ //  * datetime objects: converted in the unit of the target type and shifted by
+ //    the UTC offset, unless options.ignore_timezone is set;
+ //  * numpy scalars: the dtype must map to exactly the target type, then the
+ //    raw stored value is returned;
+ //  * anything else: parsed as a plain integer.
+ static Result<int64_t> Convert(const TimestampType* type, const O& options, I obj) {
+   int64_t value;
+   int64_t offset;
+   if (PyDateTime_Check(obj)) {
+     if (ARROW_PREDICT_FALSE(options.ignore_timezone)) {
+       offset = 0;
+     } else {
+       ARROW_ASSIGN_OR_RAISE(offset, internal::PyDateTime_utcoffset_s(obj));
+     }
+     auto dt = reinterpret_cast<PyDateTime_DateTime*>(obj);
+     switch (type->unit()) {
+       case TimeUnit::SECOND:
+         value = internal::PyDateTime_to_s(dt) - offset;
+         break;
+       case TimeUnit::MILLI:
+         value = internal::PyDateTime_to_ms(dt) - offset * 1000LL;
+         break;
+       case TimeUnit::MICRO:
+         value = internal::PyDateTime_to_us(dt) - offset * 1000000LL;
+         break;
+       case TimeUnit::NANO:
+         if (internal::IsPandasTimestamp(obj)) {
+           // pd.Timestamp value attribute contains the offset from unix epoch
+           // so no adjustment for timezone is needed.
+           OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
+           // The attribute lookup yields NULL on failure; report the Python
+           // error rather than handing a null object to CIntFromPython.
+           RETURN_IF_PYERROR();
+           RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
+         } else {
+           // Conversion to nanoseconds can overflow -> check multiply of microseconds
+           value = internal::PyDateTime_to_us(dt);
+           if (arrow::internal::MultiplyWithOverflow(value, 1000LL, &value)) {
+             return internal::InvalidValue(obj,
+                                           "out of bounds for nanosecond resolution");
+           }
+           // Adjust with offset and check for overflow
+           if (arrow::internal::SubtractWithOverflow(value, offset * 1000000000LL,
+                                                     &value)) {
+             return internal::InvalidValue(obj,
+                                           "out of bounds for nanosecond resolution");
+           }
+         }
+         break;
+       default:
+         return Status::UnknownError("Invalid time unit");
+     }
+   } else if (PyArray_CheckAnyScalarExact(obj)) {
+     // validate that the numpy scalar has np.datetime64 dtype
+     std::shared_ptr<DataType> numpy_type;
+     RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
+     if (!numpy_type->Equals(*type)) {
+       return Status::NotImplemented("Expected np.datetime64 but got: ",
+                                     numpy_type->ToString());
+     }
+     return reinterpret_cast<PyDatetimeScalarObject*>(obj)->obval;
+   } else {
+     RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+   }
+   return value;
+ }
+ // Duration conversion. Three kinds of input are handled:
+ //  * timedelta objects: converted in the unit of the target type;
+ //  * numpy scalars: the dtype must map to exactly the target type, then the
+ //    raw stored value is returned;
+ //  * anything else: parsed as a plain integer.
+ static Result<int64_t> Convert(const DurationType* type, const O&, I obj) {
+   int64_t value;
+   if (PyDelta_Check(obj)) {
+     auto dt = reinterpret_cast<PyDateTime_Delta*>(obj);
+     switch (type->unit()) {
+       case TimeUnit::SECOND:
+         value = internal::PyDelta_to_s(dt);
+         break;
+       case TimeUnit::MILLI:
+         value = internal::PyDelta_to_ms(dt);
+         break;
+       case TimeUnit::MICRO:
+         value = internal::PyDelta_to_us(dt);
+         break;
+       case TimeUnit::NANO:
+         if (internal::IsPandasTimedelta(obj)) {
+           OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
+           // The attribute lookup yields NULL on failure; report the Python
+           // error rather than handing a null object to CIntFromPython.
+           RETURN_IF_PYERROR();
+           RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
+         } else {
+           value = internal::PyDelta_to_ns(dt);
+         }
+         break;
+       default:
+         return Status::UnknownError("Invalid time unit");
+     }
+   } else if (PyArray_CheckAnyScalarExact(obj)) {
+     // validate that the numpy scalar has np.timedelta64 dtype
+     std::shared_ptr<DataType> numpy_type;
+     RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
+     if (!numpy_type->Equals(*type)) {
+       return Status::NotImplemented("Expected np.timedelta64 but got: ",
+                                     numpy_type->ToString());
+     }
+     return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
+   } else {
+     RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+   }
+   return value;
+ }
+ // The binary-like intermediate representation is PyBytesView because it keeps temporary
+ // python objects alive (non-contiguous memoryview) and stores whether the original
+ // object was unicode encoded or not, which is used for unicode -> bytes coercion if
+ // there is a non-unicode object observed.
+ // This overload only parses obj into the caller-provided view and returns the
+ // status of that parse.
+ static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) {
+ return view.ParseString(obj);
+ }
+ // Fixed-size binary conversion: parses obj into view and then requires the
+ // parsed length to equal the byte width of the target type.
+ static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
+                       PyBytesView& view) {
+   ARROW_RETURN_NOT_OK(view.ParseString(obj));
+   if (view.size == type->byte_width()) {
+     return Status::OK();
+   }
+   std::stringstream message;
+   message << "expected to be length " << type->byte_width() << " was " << view.size;
+   return internal::InvalidValue(obj, message.str());
+ }
+ // String conversion for Arrow string types.
+ //  * Non-strict: parse obj as-is; the view records whether it was unicode or bytes.
+ //  * Strict: parse with the extra `true` argument and reject anything whose
+ //    view is not flagged as utf8.
+ template <typename T>
+ static enable_if_string<T, Status> Convert(const T*, const O& options, I obj,
+                                            PyBytesView& view) {
+   if (!options.strict) {
+     return view.ParseString(obj);
+   }
+   ARROW_RETURN_NOT_OK(view.ParseString(obj, true));
+   if (view.is_utf8) {
+     return Status::OK();
+   }
+   return internal::InvalidValue(obj, "was not a utf8 string");
+ }
+ static Result<bool> Convert(const DataType* type, const O&, I obj) {
+ return Status::NotImplemented("PyValue::Convert is not implemented for type ", type);
+ }
+// The base Converter class is a mixin with predefined behavior and constructors.
+// It supplies the sequence-level entry points; the per-item work is left to the
+// Append / AppendNull overrides of the concrete converters.
+class PyConverter : public Converter<PyObject*, PyConversionOptions> {
+ public:
+  // Iterate over the input values and defer the conversion to the Append method.
+  // Reserves room for (size - offset) items before visiting the sequence.
+  Status Extend(PyObject* values, int64_t size, int64_t offset = 0) override {
+    DCHECK_GE(size, offset);
+    /// Ensure we've allocated enough space
+    RETURN_NOT_OK(this->Reserve(size - offset));
+    // Iterate over the items adding each one
+    return internal::VisitSequence(
+        values, offset,
+        [this](PyObject* item, bool* /* unused */) { return this->Append(item); });
+  }
+
+  // Convert and append a sequence of values masked with a numpy array.
+  // Masked items are appended as nulls; unmasked items go through Append.
+  Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size,
+                      int64_t offset = 0) override {
+    DCHECK_GE(size, offset);
+    /// Ensure we've allocated enough space
+    RETURN_NOT_OK(this->Reserve(size - offset));
+    // Iterate over the items adding each one
+    return internal::VisitSequenceMasked(
+        values, mask, offset, [this](PyObject* item, bool is_masked, bool* /* unused */) {
+          if (is_masked) {
+            return this->AppendNull();
+          } else {
+            // This will also apply the null-checking convention in the event
+            // that the value is not masked
+            return this->Append(item);  // perhaps use AppendValue instead?
+          }
+        });
+  }
+};
+// Forward declarations of the converter templates and the trait that selects
+// between them; the definitions and specializations follow below.
+template <typename T, typename Enable = void>
+class PyPrimitiveConverter;
+template <typename T>
+class PyListConverter;
+template <typename U, typename Enable = void>
+class PyDictionaryConverter;
+class PyStructConverter;
+// Maps an Arrow type to its converter class (see the specializations).
+template <typename T, typename Enable = void>
+struct PyConverterTrait;
+// Types that are not nested, interval or extension types use the primitive converter.
+template <typename T>
+struct PyConverterTrait<
+    T, enable_if_t<!is_nested_type<T>::value && !is_interval_type<T>::value &&
+                   !is_extension_type<T>::value>> {
+  using type = PyPrimitiveConverter<T>;
+};
+// List-like types use the list converter.
+template <typename T>
+struct PyConverterTrait<T, enable_if_list_like<T>> {
+  using type = PyListConverter<T>;
+};
+// Struct types use the struct converter.
+template <>
+struct PyConverterTrait<StructType> {
+  using type = PyStructConverter;
+};
+// Dictionary types expose a converter template parameterized on the value type.
+template <>
+struct PyConverterTrait<DictionaryType> {
+  template <typename T>
+  using dictionary_type = PyDictionaryConverter<T>;
+};
+// Primitive converter for the null type.
+template <typename T>
+class PyPrimitiveConverter<T, enable_if_null<T>>
+    : public PrimitiveConverter<T, PyConverter> {
+ public:
+  // Appends a null when the value is null under the configured convention,
+  // otherwise the result of PyValue::Convert; conversion errors are returned.
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      return this->primitive_builder_->AppendNull();
+    } else {
+      ARROW_ASSIGN_OR_RAISE(
+          auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
+      return this->primitive_builder_->Append(converted);
+    }
+  }
+};
+// Primitive converter for boolean, number, decimal, date and time types.
+template <typename T>
+class PyPrimitiveConverter<
+    T, enable_if_t<is_boolean_type<T>::value || is_number_type<T>::value ||
+                   is_decimal_type<T>::value || is_date_type<T>::value ||
+                   is_time_type<T>::value>> : public PrimitiveConverter<T, PyConverter> {
+ public:
+  // Appends one value (or a null); conversion errors are returned to the caller.
+  Status Append(PyObject* value) override {
+    // Since the required space has been already allocated in the Extend functions we can
+    // rely on the Unsafe builder API which improves the performance.
+    if (PyValue::IsNull(this->options_, value)) {
+      this->primitive_builder_->UnsafeAppendNull();
+    } else {
+      ARROW_ASSIGN_OR_RAISE(
+          auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
+      this->primitive_builder_->UnsafeAppend(converted);
+    }
+    return Status::OK();
+  }
+};
+// Primitive converter for timestamp and duration types.
+template <typename T>
+class PyPrimitiveConverter<
+    T, enable_if_t<is_timestamp_type<T>::value || is_duration_type<T>::value>>
+    : public PrimitiveConverter<T, PyConverter> {
+ public:
+  // Appends one value (or a null). A numpy scalar whose converted value is the
+  // NaT sentinel is appended as null; conversion errors are returned.
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      this->primitive_builder_->UnsafeAppendNull();
+    } else {
+      ARROW_ASSIGN_OR_RAISE(
+          auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
+      // Numpy NaT sentinels can be checked after the conversion
+      if (PyArray_CheckAnyScalarExact(value) &&
+          PyValue::IsNaT(this->primitive_type_, converted)) {
+        this->primitive_builder_->UnsafeAppendNull();
+      } else {
+        this->primitive_builder_->UnsafeAppend(converted);
+      }
+    }
+    return Status::OK();
+  }
+};
+// Primitive converter for fixed-size binary values.
+template <typename T>
+class PyPrimitiveConverter<T, enable_if_t<std::is_same<T, FixedSizeBinaryType>::value>>
+    : public PrimitiveConverter<T, PyConverter> {
+ public:
+  // Appends one value (or a null). The value is parsed into view_ first; a
+  // failed parse or width check is returned instead of being ignored.
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      this->primitive_builder_->UnsafeAppendNull();
+    } else {
+      ARROW_RETURN_NOT_OK(
+          PyValue::Convert(this->primitive_type_, this->options_, value, view_));
+      ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size));
+      this->primitive_builder_->UnsafeAppend(view_.bytes);
+    }
+    return Status::OK();
+  }
+
+ protected:
+  // Scratch view reused across Append calls.
+  PyBytesView view_;
+};
+// Primitive converter for variable-length binary and string types.
+template <typename T>
+class PyPrimitiveConverter<T, enable_if_base_binary<T>>
+    : public PrimitiveConverter<T, PyConverter> {
+ public:
+  using OffsetType = typename T::offset_type;
+
+  // Appends one value (or a null). The value is parsed into view_ first; a
+  // failed parse is returned instead of being ignored.
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      this->primitive_builder_->UnsafeAppendNull();
+    } else {
+      ARROW_RETURN_NOT_OK(
+          PyValue::Convert(this->primitive_type_, this->options_, value, view_));
+      if (!view_.is_utf8) {
+        // observed binary value
+        observed_binary_ = true;
+      }
+      // Since we don't know the varying length input size in advance, we need to
+      // reserve space in the value builder one by one. ReserveData raises CapacityError
+      // if the value would not fit into the array.
+      ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size));
+      this->primitive_builder_->UnsafeAppend(view_.bytes,
+                                             static_cast<OffsetType>(view_.size));
+    }
+    return Status::OK();
+  }
+
+  // Finishes the array; if any non-utf8 value was appended, the result is
+  // returned as a view with the physical (binary) type.
+  Result<std::shared_ptr<Array>> ToArray() override {
+    ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter<T, PyConverter>::ToArray()));
+    if (observed_binary_) {
+      // if we saw any non-unicode, cast results to BinaryArray
+      auto binary_type = TypeTraits<typename T::PhysicalType>::type_singleton();
+      return array->View(binary_type);
+    } else {
+      return array;
+    }
+  }
+
+ protected:
+  // Scratch view reused across Append calls.
+  PyBytesView view_;
+  // Set once a value that is not flagged utf8 has been appended.
+  bool observed_binary_ = false;
+};
+// Dictionary converter for value types that have a C type.
+template <typename U>
+class PyDictionaryConverter<U, enable_if_has_c_type<U>>
+    : public DictionaryConverter<U, PyConverter> {
+ public:
+  // Appends a null or the converted value to the dictionary value builder;
+  // conversion errors are returned to the caller.
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      return this->value_builder_->AppendNull();
+    } else {
+      ARROW_ASSIGN_OR_RAISE(auto converted,
+                            PyValue::Convert(this->value_type_, this->options_, value));
+      return this->value_builder_->Append(converted);
+    }
+  }
+};
+// Dictionary converter for value types exposed as string views.
+template <typename U>
+class PyDictionaryConverter<U, enable_if_has_string_view<U>>
+    : public DictionaryConverter<U, PyConverter> {
+ public:
+  // Appends a null or the parsed bytes to the dictionary value builder; a
+  // failed parse is returned instead of being ignored.
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      return this->value_builder_->AppendNull();
+    } else {
+      ARROW_RETURN_NOT_OK(
+          PyValue::Convert(this->value_type_, this->options_, value, view_));
+      return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size));
+    }
+  }
+
+ protected:
+  // Scratch view reused across Append calls.
+  PyBytesView view_;
+};
+template <typename T>
+class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
+ public:
+ Status Append(PyObject* value) override {
+ if (PyValue::IsNull(this->options_, value)) {
+ return this->list_builder_->AppendNull();
+ }
+ RETURN_NOT_OK(this->list_builder_->Append());
+ if (PyArray_Check(value)) {
+ RETURN_NOT_OK(AppendNdarray(value));
+ } else if (PySequence_Check(value)) {
+ RETURN_NOT_OK(AppendSequence(value));
+ } else {
+ return internal::InvalidType(
+ value, "was not a sequence or recognized null for conversion to list type");
+ }
+ return ValidateBuilder(this->list_type_);
+ }
+ protected:
+ Status ValidateBuilder(const MapType*) {
+ if (this->list_builder_->key_builder()->null_count() > 0) {
+ return Status::Invalid("Invalid Map: key field can not contain null values");
+ } else {
+ return Status::OK();
+ }
+ }
+ Status ValidateBuilder(const BaseListType*) { return Status::OK(); }
+ Status AppendSequence(PyObject* value) {
+ int64_t size = static_cast<int64_t>(PySequence_Size(value));
+ RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));
+ return this->value_converter_->Extend(value, size);
+ }
+ Status AppendNdarray(PyObject* value) {
+ PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(value);
+ if (PyArray_NDIM(ndarray) != 1) {
+ return Status::Invalid("Can only convert 1-dimensional array values");
+ }
+ const int64_t size = PyArray_SIZE(ndarray);
+ RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));
+ const auto value_type = this->value_converter_->builder()->type();
+ switch (value_type->id()) {
+// If the value type does not match the expected NumPy dtype, then fall through
+// to a slower PySequence-based path
+ case Type::TYPE_ID: { \
+ if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \
+ return this->value_converter_->Extend(value, size); \
+ } \
+ return AppendNdarrayTyped<TYPE, NUMPY_TYPE>(ndarray); \
+ }
+ default: {
+ return this->value_converter_->Extend(value, size);
+ }
+ }
+ }
+ template <typename ArrowType, int NUMPY_TYPE>
+ Status AppendNdarrayTyped(PyArrayObject* ndarray) {
+ // no need to go through the conversion
+ using NumpyTrait = internal::npy_traits<NUMPY_TYPE>;
+ using NumpyType = typename NumpyTrait::value_type;
+ using ValueBuilderType = typename TypeTraits<ArrowType>::BuilderType;
+ const bool null_sentinels_possible =
+ // Always treat Numpy's NaT as null
+ // Observing pandas's null sentinels
+ (this->options_.from_pandas && NumpyTrait::supports_nulls);
+ auto value_builder =
+ checked_cast<ValueBuilderType*>(this->value_converter_->builder().get());
+ Ndarray1DIndexer<NumpyType> values(ndarray);
+ if (null_sentinels_possible) {
+ for (int64_t i = 0; i < values.size(); ++i) {
+ if (NumpyTrait::isnull(values[i])) {
+ RETURN_NOT_OK(value_builder->AppendNull());
+ } else {
+ RETURN_NOT_OK(value_builder->Append(values[i]));
+ }
+ }
+ } else if (!values.is_strided()) {
+ RETURN_NOT_OK(value_builder->AppendValues(values.data(), values.size()));
+ } else {
+ for (int64_t i = 0; i < values.size(); ++i) {
+ RETURN_NOT_OK(value_builder->Append(values[i]));
+ }
+ }
+ return Status::OK();
+ }
+class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait> {
+ public:
+ // Appends one struct value. Nulls are appended directly. Otherwise the value
+ // is dispatched on the previously inferred input kind; when no kind has been
+ // determined yet it is inferred from this value and the append is retried.
+ Status Append(PyObject* value) override {
+   if (PyValue::IsNull(this->options_, value)) {
+     return this->struct_builder_->AppendNull();
+   }
+   if (input_kind_ == InputKind::DICT) {
+     RETURN_NOT_OK(this->struct_builder_->Append());
+     return AppendDict(value);
+   }
+   if (input_kind_ == InputKind::TUPLE) {
+     RETURN_NOT_OK(this->struct_builder_->Append());
+     return AppendTuple(value);
+   }
+   if (input_kind_ == InputKind::ITEMS) {
+     RETURN_NOT_OK(this->struct_builder_->Append());
+     return AppendItems(value);
+   }
+   // Input kind not settled yet: infer it, then go through Append again.
+   RETURN_NOT_OK(InferInputKind(value));
+   return Append(value);
+ }
+ protected:
+  // Initialise the base converter and cache the struct field names as Python
+  // bytes and unicode objects, so dict keys of either kind can be matched.
+  //
+  // Returns an error if the base initialisation fails or if any of the Python
+  // objects cannot be created.
+  Status Init(MemoryPool* pool) override {
+    RETURN_NOT_OK((StructConverter<PyConverter, PyConverterTrait>::Init(pool)));
+    // Store the field names as PyObjects for dict matching
+    num_fields_ = this->struct_type_->num_fields();
+    bytes_field_names_.reset(PyList_New(num_fields_));
+    unicode_field_names_.reset(PyList_New(num_fields_));
+    // PyList_New returns NULL on failure; stop before touching the lists
+    RETURN_NOT_OK(CheckPyError());
+    for (int i = 0; i < num_fields_; i++) {
+      const auto& field_name = this->struct_type_->field(i)->name();
+      PyObject* bytes = PyBytes_FromStringAndSize(field_name.c_str(), field_name.size());
+      PyObject* unicode =
+          PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size());
+      // Hand both objects to the lists first (the lists take over the
+      // references), then report any creation failure
+      PyList_SET_ITEM(bytes_field_names_.obj(), i, bytes);
+      PyList_SET_ITEM(unicode_field_names_.obj(), i, unicode);
+      RETURN_NOT_OK(CheckPyError());
+    }
+    return Status::OK();
+  }
+  // Decide from `value` which kind of Python object represents a struct row
+  // and remember it in input_kind_. Heterogeneous sequences are not allowed,
+  // so the decision is taken once. Anything that is not a dict, tuple or
+  // sequence yields an invalid-type error.
+  Status InferInputKind(PyObject* value) {
+    InputKind detected;
+    if (PyDict_Check(value)) {
+      detected = InputKind::DICT;
+    } else if (PyTuple_Check(value)) {
+      detected = InputKind::TUPLE;
+    } else if (PySequence_Check(value)) {
+      detected = InputKind::ITEMS;
+    } else {
+      return internal::InvalidType(value,
+                                   "was not a dict, tuple, or recognized null value "
+                                   "for conversion to struct type");
+    }
+    input_kind_ = detected;
+    return Status::OK();
+  }
+  // Determine whether the keys in `items` (a sequence of (key, value) pairs)
+  // match the struct field names as unicode or as bytes objects.
+  //
+  // key_kind_ is set from the first key found among the field names and is
+  // left untouched when no key matches. Python errors raised while reading
+  // the sequence or comparing keys are returned as a non-OK status.
+  Status InferKeyKind(PyObject* items) {
+    for (int i = 0; i < PySequence_Length(items); i++) {
+      // retrieve the key from the passed key-value pairs
+      ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i));
+      // check key exists between the unicode field names
+      // (PySequence_Contains returns -1 on error, which must not count as a match)
+      int found = PySequence_Contains(unicode_field_names_.obj(), pair.first);
+      if (found < 0) {
+        RETURN_NOT_OK(CheckPyError());
+      }
+      if (found > 0) {
+        key_kind_ = KeyKind::UNICODE;
+        return Status::OK();
+      }
+      // check key exists between the bytes field names
+      found = PySequence_Contains(bytes_field_names_.obj(), pair.first);
+      if (found < 0) {
+        RETURN_NOT_OK(CheckPyError());
+      }
+      if (found > 0) {
+        key_kind_ = KeyKind::BYTES;
+        return Status::OK();
+      }
+    }
+    // PySequence_Length reports failure as -1, which simply ends the loop
+    return CheckPyError();
+  }
+  // Append Py_None to every child converter, i.e. a row without any field set.
+  Status AppendEmpty() {
+    int field = 0;
+    while (field < num_fields_) {
+      RETURN_NOT_OK(this->children_[field]->Append(Py_None));
+      ++field;
+    }
+    return Status::OK();
+  }
+  // Append one struct row from a tuple holding exactly one element per field,
+  // in field order. Non-tuples and tuples of the wrong length are rejected.
+  Status AppendTuple(PyObject* tuple) {
+    if (!PyTuple_Check(tuple)) {
+      return internal::InvalidType(tuple, "was expecting a tuple");
+    }
+    const bool size_matches = PyTuple_GET_SIZE(tuple) == num_fields_;
+    if (!size_matches) {
+      return Status::Invalid("Tuple size must be equal to number of struct fields");
+    }
+    for (int field = 0; field < num_fields_; ++field) {
+      // the element is a borrowed reference owned by the tuple
+      RETURN_NOT_OK(this->children_[field]->Append(PyTuple_GET_ITEM(tuple, field)));
+    }
+    return Status::OK();
+  }
+  // Append one struct row from a Python dict.
+  //
+  // Once the key kind (unicode or bytes) is known, the dict is looked up with
+  // the matching cached field names. While it is still unknown it is inferred
+  // from this dict's keys; if none of them is a field name, a row with every
+  // field set from None is appended.
+  Status AppendDict(PyObject* dict) {
+    if (!PyDict_Check(dict)) {
+      return internal::InvalidType(dict, "was expecting a dict");
+    }
+    switch (key_kind_) {
+      case KeyKind::UNICODE:
+        return AppendDict(dict, unicode_field_names_.obj());
+      case KeyKind::BYTES:
+        return AppendDict(dict, bytes_field_names_.obj());
+      default: {
+        // PyDict_Items returns a new reference: own it so that it is released
+        // as soon as the key kind has been inferred
+        OwnedRef items(PyDict_Items(dict));
+        RETURN_NOT_OK(CheckPyError());
+        RETURN_NOT_OK(InferKeyKind(items.obj()));
+        if (key_kind_ == KeyKind::UNKNOWN) {
+          // was unable to infer the type which means that all keys are absent
+          return AppendEmpty();
+        }
+        return AppendDict(dict);
+      }
+    }
+  }
+  // Append one struct row from a sequence of (key, value) pairs.
+  //
+  // The key kind is inferred from the pairs while it is still unknown; if no
+  // key matches a field name, a row with every field set from None is
+  // appended instead.
+  Status AppendItems(PyObject* items) {
+    if (!PySequence_Check(items)) {
+      return internal::InvalidType(items, "was expecting a sequence of key-value items");
+    }
+    const bool kind_known =
+        key_kind_ == KeyKind::UNICODE || key_kind_ == KeyKind::BYTES;
+    if (!kind_known) {
+      RETURN_NOT_OK(InferKeyKind(items));
+      if (key_kind_ == KeyKind::UNKNOWN) {
+        // was unable to infer the type which means that all keys are absent
+        return AppendEmpty();
+      }
+      // dispatch again now that the key kind has been settled
+      return AppendItems(items);
+    }
+    if (key_kind_ == KeyKind::UNICODE) {
+      return AppendItems(items, unicode_field_names_.obj());
+    }
+    return AppendItems(items, bytes_field_names_.obj());
+  }
+  // Append one struct row by looking up every field name of `field_names`
+  // (a list with one entry per struct field) in `dict`.
+  //
+  // Missing keys are appended as None; extra dict entries are ignored.
+  // Errors raised by the dict lookup itself are returned as a non-OK status.
+  Status AppendDict(PyObject* dict, PyObject* field_names) {
+    // NOTE we're ignoring any extraneous dict items
+    for (int i = 0; i < num_fields_; i++) {
+      PyObject* name = PyList_GET_ITEM(field_names, i);  // borrowed
+      // Unlike PyDict_GetItem, this variant does not swallow exceptions
+      PyObject* value = PyDict_GetItemWithError(dict, name);  // borrowed
+      if (value == NULL) {
+        // NULL means either "key absent" (no exception) or a lookup failure
+        RETURN_NOT_OK(CheckPyError());
+        value = Py_None;
+      }
+      RETURN_NOT_OK(this->children_[i]->Append(value));
+    }
+    return Status::OK();
+  }
+  // Fetch element `index` of `seq` and split it into its (key, value) parts.
+  //
+  // The element must be a 2-tuple, otherwise an invalid-type error is
+  // returned. The returned pointers are borrowed from that tuple; the
+  // temporary reference taken on the tuple is released before returning, so
+  // they stay valid only while `seq` itself keeps the element alive.
+  Result<std::pair<PyObject*, PyObject*>> GetKeyValuePair(PyObject* seq, int index) {
+    // PySequence_GetItem returns a new reference, or NULL with an exception set
+    OwnedRef pair(PySequence_GetItem(seq, index));
+    if (pair.obj() == NULL) {
+      RETURN_NOT_OK(CheckPyError());
+      return Status::UnknownError("failed to fetch key-value pair from sequence");
+    }
+    if (!PyTuple_Check(pair.obj()) || PyTuple_Size(pair.obj()) != 2) {
+      return internal::InvalidType(pair.obj(),
+                                   "was expecting tuple of (key, value) pair");
+    }
+    PyObject* key = PyTuple_GetItem(pair.obj(), 0);
+    PyObject* value = PyTuple_GetItem(pair.obj(), 1);
+    return std::make_pair(key, value);
+  }
+  // Append one struct row from a sequence of (key, value) pairs whose keys
+  // must follow the order of `field_names`.
+  //
+  // The i-th key has to equal the i-th field name, otherwise an Invalid
+  // status naming both is returned. Fields beyond the end of `items` are
+  // appended as null; pairs beyond the number of fields are ignored.
+  Status AppendItems(PyObject* items, PyObject* field_names) {
+    auto length = static_cast<int>(PySequence_Size(items));
+    if (length < 0) {
+      // -1 signals a Python error; without this check the padding loop below
+      // would start at child index -1
+      RETURN_NOT_OK(CheckPyError());
+      return Status::UnknownError("failed to get the size of the key-value sequence");
+    }
+    // append the values for the defined fields
+    const int num_given = std::min(num_fields_, length);
+    for (int i = 0; i < num_given; i++) {
+      // retrieve the key-value pair
+      ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i));
+      // validate that the key and the field name are equal
+      PyObject* name = PyList_GET_ITEM(field_names, i);
+      const int are_equal = PyObject_RichCompareBool(pair.first, name, Py_EQ);
+      if (are_equal < 0) {
+        // the comparison itself raised
+        RETURN_NOT_OK(CheckPyError());
+      }
+      if (are_equal == 0) {
+        ARROW_ASSIGN_OR_RAISE(auto key_view, PyBytesView::FromString(pair.first));
+        ARROW_ASSIGN_OR_RAISE(auto name_view, PyBytesView::FromString(name));
+        return Status::Invalid("The expected field name is `", name_view.bytes, "` but `",
+                               key_view.bytes, "` was given");
+      }
+      // finally append to the respective child builder
+      RETURN_NOT_OK(this->children_[i]->Append(pair.second));
+    }
+    // insert null values for missing fields
+    for (int i = length; i < num_fields_; i++) {
+      RETURN_NOT_OK(this->children_[i]->AppendNull());
+    }
+    return Status::OK();
+  }
+ // Whether we're converting from a sequence of dicts or tuples or list of pairs
+ enum class InputKind { UNKNOWN, DICT, TUPLE, ITEMS } input_kind_ = InputKind::UNKNOWN;
+ // Whether the input dictionary keys' type is python bytes or unicode
+ enum class KeyKind { UNKNOWN, BYTES, UNICODE } key_kind_ = KeyKind::UNKNOWN;
+ // Store the field names as a PyObjects for dict matching
+ OwnedRef bytes_field_names_;
+ OwnedRef unicode_field_names_;
+ // Store the number of fields for later reuse
+ int num_fields_;
+// Convert *obj* to a sequence if necessary.
+// Fill *size* with its length. If >= 0 on entry, *size* is an upper size
+// bound that may lead to truncation.
+//
+// On success *seq* holds a new reference owned by the caller. On failure a
+// non-OK status is returned and *seq* must not be used.
+Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* size) {
+  if (PySequence_Check(obj)) {
+    // obj is already a sequence
+    const int64_t real_size = static_cast<int64_t>(PySequence_Size(obj));
+    RETURN_NOT_OK(CheckPyError());
+    *size = (*size < 0) ? real_size : std::min(real_size, *size);
+    Py_INCREF(obj);
+    *seq = obj;
+    return Status::OK();
+  }
+  if (*size < 0) {
+    // unknown size, exhaust iterator
+    PyObject* lst = PySequence_List(obj);
+    // NULL here means obj was not iterable or the iteration raised
+    RETURN_NOT_OK(CheckPyError());
+    *seq = lst;
+    *size = static_cast<int64_t>(PyList_GET_SIZE(lst));
+    return Status::OK();
+  }
+  // size is known but iterator could be infinite
+  Py_ssize_t i, n = *size;
+  PyObject* iter = PyObject_GetIter(obj);
+  OwnedRef iter_ref(iter);
+  RETURN_NOT_OK(CheckPyError());
+  PyObject* lst = PyList_New(n);
+  RETURN_NOT_OK(CheckPyError());
+  for (i = 0; i < n; i++) {
+    PyObject* item = PyIter_Next(iter);
+    if (!item) break;
+    PyList_SET_ITEM(lst, i, item);
+  }
+  // PyIter_Next returns NULL both on exhaustion and on error; only the latter
+  // leaves an exception set
+  Status iter_status = CheckPyError();
+  if (!iter_status.ok()) {
+    Py_DECREF(lst);
+    return iter_status;
+  }
+  // Shrink list if len(iterator) < size
+  if (i < n && PyList_SetSlice(lst, i, n, NULL)) {
+    Py_DECREF(lst);
+    return Status::UnknownError("failed to resize list");
+  }
+  *seq = lst;
+  *size = std::min<int64_t>(i, *size);
+  return Status::OK();
+}
+// Convert the Python sequence or iterable `obj` into a ChunkedArray.
+//
+// `mask`, when neither null nor None, is passed to the converter alongside
+// the values. The target type is inferred from the data when `options.type`
+// is unset, in which case conversion is non-strict. All allocations use `pool`.
+Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject* mask,
+                                                        PyConversionOptions options,
+                                                        MemoryPool* pool) {
+  PyAcquireGIL lock;
+  PyObject* seq = nullptr;
+  OwnedRef tmp_seq_nanny;
+  ARROW_ASSIGN_OR_RAISE(auto is_pandas_imported, internal::IsModuleImported("pandas"));
+  if (is_pandas_imported) {
+    // If pandas has been already imported initialize the static pandas objects to
+    // support converting from pd.Timedelta and pd.Timestamp objects
+    internal::InitPandasStaticData();
+  }
+  int64_t size = options.size;
+  RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size));
+  // from here on the sequence reference is released automatically
+  tmp_seq_nanny.reset(seq);
+  // In some cases, type inference may be "loose", like strings. If the user
+  // passed pa.string(), then we will error if we encounter any non-UTF8
+  // value. If not, then we will allow the result to be a BinaryArray
+  if (options.type == nullptr) {
+    ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas));
+    options.strict = false;
+  } else {
+    options.strict = true;
+  }
+  DCHECK_GE(size, 0);
+  const bool has_mask = mask != nullptr && mask != Py_None;
+  ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter, PyConverterTrait>(
+                                            options.type, options, pool)));
+  if (converter->may_overflow()) {
+    // The converter hierarchy contains binary- or list-like builders which can overflow
+    // depending on the input values. Wrap the converter with a chunker which detects
+    // the overflow and automatically creates new chunks.
+    ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(std::move(converter)));
+    if (has_mask) {
+      RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size));
+    } else {
+      RETURN_NOT_OK(chunked_converter->Extend(seq, size));
+    }
+    return chunked_converter->ToChunkedArray();
+  } else {
+    // If the converter can't overflow spare the capacity error checking on the hot-path,
+    // this improves the performance roughly by ~10% for primitive types.
+    if (has_mask) {
+      RETURN_NOT_OK(converter->ExtendMasked(seq, mask, size));
+    } else {
+      RETURN_NOT_OK(converter->Extend(seq, size));
+    }
+    return converter->ToChunkedArray();
+  }
+}
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/python_to_arrow.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/python_to_arrow.h
new file mode 100644
index 0000000000..d167996ba8
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/python_to_arrow.h
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Functions for converting between CPython built-in data structures and Arrow
+// data structures
+#pragma once
+#include "arrow/python/platform.h"
+#include <cstdint>
+#include <memory>
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/python/common.h"
+namespace arrow {
+class Array;
+class Status;
+namespace py {
+/// \brief Options controlling the conversion of Python objects to Arrow data.
+struct PyConversionOptions {
+  PyConversionOptions() = default;
+  /// \param[in] type target Arrow type, or null to have it inferred
+  /// \param[in] size upper bound on the number of values, or -1 for no bound
+  /// \param[in] pool accepted for API compatibility; not stored in the options
+  /// \param[in] from_pandas whether pandas's null sentinels are observed
+  PyConversionOptions(const std::shared_ptr<DataType>& type, int64_t size,
+                      MemoryPool* pool, bool from_pandas)
+      : type(type), size(size), from_pandas(from_pandas) {}
+  // Set to null if to be inferred
+  std::shared_ptr<DataType> type;
+  // Default is -1, which indicates the size should be the same as the input sequence
+  int64_t size = -1;
+  bool from_pandas = false;
+  /// Used to maintain backwards compatibility for
+  /// timezone bugs (see ARROW-9528). Should be removed
+  /// after Arrow 2.0 release.
+  bool ignore_timezone = false;
+  // Set by ConvertPySequence: true when the caller supplied `type`, false
+  // when the type was inferred from the data
+  bool strict = false;
+};
+/// \brief Convert a sequence (list, generator, NumPy array with dtype object) of
+/// Python objects to an Arrow ChunkedArray.
+/// \param[in] obj the sequence to convert
+/// \param[in] mask a NumPy array of true/false values indicating whether
+/// values in the sequence are null (true) or not null (false). This parameter
+/// may be null
+/// \param[in] options conversion options (target type, size bound, pandas semantics)
+/// \param[in] pool MemoryPool to use for allocations
+/// \return the converted ChunkedArray, or an error Status wrapped in the Result
+Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(
+ PyObject* obj, PyObject* mask, PyConversionOptions options,
+ MemoryPool* pool = default_memory_pool());
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/serialize.cc b/contrib/libs/apache/arrow/cpp/src/arrow/python/serialize.cc
new file mode 100644
index 0000000000..ad079cbd9c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/serialize.cc
@@ -0,0 +1,798 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "arrow/python/serialize.h"
+#include "arrow/python/numpy_interop.h"
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <numpy/arrayobject.h>
+#include <numpy/arrayscalars.h>
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/array/builder_union.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/util.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/tensor.h"
+#include "arrow/util/logging.h"
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/iterators.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/platform.h"
+#include "arrow/python/pyarrow.h"
+constexpr int32_t kMaxRecursionDepth = 100;
+namespace arrow {
+using internal::checked_cast;
+namespace py {
+class SequenceBuilder;
+class DictBuilder;
+Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder,
+ int32_t recursion_depth, SerializedPyObject* blobs_out);
+// A Sequence is a heterogeneous collections of elements. It can contain
+// scalar Python types, lists, tuples, dictionaries, tensors and sparse tensors.
+class SequenceBuilder {
+ public:
+  // Create an empty builder that allocates from `pool`.
+  explicit SequenceBuilder(MemoryPool* pool = default_memory_pool())
+      : pool_(pool),
+        types_(::arrow::int8(), pool),
+        offsets_(::arrow::int32(), pool),
+        type_map_(PythonType::NUM_PYTHON_TYPES, -1) {
+    // The dense union starts out with a single null child registered as field "0"
+    builder_.reset(new DenseUnionBuilder(pool, {std::make_shared<NullBuilder>(pool)},
+                                         dense_union({field("0", null())})));
+  }
+  // Append a None (null) entry to the sequence
+  Status AppendNone() {
+    return builder_->AppendNull();
+  }
+  // Make sure the child builder for `tag` exists, then append `tag`'s child
+  // index to the union builder.
+  //
+  // On first use the child is created with `make_builder`, registered with the
+  // union under the decimal representation of `tag`, and its child index is
+  // recorded in type_map_.
+  template <typename BuilderType, typename MakeBuilderFn>
+  Status CreateAndUpdate(std::shared_ptr<BuilderType>* child_builder, int8_t tag,
+                         MakeBuilderFn make_builder) {
+    const bool first_use = !*child_builder;
+    if (first_use) {
+      child_builder->reset(make_builder());
+      // locale-independent decimal name, e.g. "3"
+      const std::string field_name = std::to_string(static_cast<int>(tag));
+      type_map_[tag] = builder_->AppendChild(*child_builder, field_name);
+    }
+    return builder_->Append(type_map_[tag]);
+  }
+  // Append `val` to the primitive child builder registered under `tag`,
+  // creating that child on first use. Any failure from registering the union
+  // entry is propagated before the value is appended.
+  template <typename BuilderType, typename T>
+  Status AppendPrimitive(std::shared_ptr<BuilderType>* child_builder, const T val,
+                         int8_t tag) {
+    RETURN_NOT_OK(
+        CreateAndUpdate(child_builder, tag, [this]() { return new BuilderType(pool_); }));
+    return (*child_builder)->Append(val);
+  }
+  // Append a boolean value
+  Status AppendBool(const bool data) {
+    constexpr int8_t kTag = PythonType::BOOL;
+    return AppendPrimitive(&bools_, data, kTag);
+  }
+  // Append a 64-bit signed integer value
+  Status AppendInt64(const int64_t data) {
+    constexpr int8_t kTag = PythonType::INT;
+    return AppendPrimitive(&ints_, data, kTag);
+  }
+  // Append `length` raw bytes starting at `data`
+  Status AppendBytes(const uint8_t* data, int32_t length) {
+    auto make_builder = [this]() { return new BinaryBuilder(pool_); };
+    RETURN_NOT_OK(CreateAndUpdate(&bytes_, PythonType::BYTES, make_builder));
+    return bytes_->Append(data, length);
+  }
+  // Append a string of `length` chars starting at `data`
+  Status AppendString(const char* data, int32_t length) {
+    auto make_builder = [this]() { return new StringBuilder(pool_); };
+    RETURN_NOT_OK(CreateAndUpdate(&strings_, PythonType::STRING, make_builder));
+    return strings_->Append(data, length);
+  }
+  // Append a half-precision float value
+  Status AppendHalfFloat(const npy_half data) {
+    constexpr int8_t kTag = PythonType::HALF_FLOAT;
+    return AppendPrimitive(&half_floats_, data, kTag);
+  }
+  // Append a single-precision float value
+  Status AppendFloat(const float data) {
+    constexpr int8_t kTag = PythonType::FLOAT;
+    return AppendPrimitive(&floats_, data, kTag);
+  }
+  // Append a double-precision float value
+  Status AppendDouble(const double data) {
+    constexpr int8_t kTag = PythonType::DOUBLE;
+    return AppendPrimitive(&doubles_, data, kTag);
+  }
+  // Append a Date64 timestamp value
+  Status AppendDate64(const int64_t timestamp) {
+    constexpr int8_t kTag = PythonType::DATE64;
+    return AppendPrimitive(&date64s_, timestamp, kTag);
+  }
+  // Append a reference to a tensor
+  //
+  // \param tensor_index Index of the tensor in the object.
+  Status AppendTensor(const int32_t tensor_index) {
+    auto make_builder = [this]() { return new Int32Builder(pool_); };
+    RETURN_NOT_OK(CreateAndUpdate(&tensor_indices_, PythonType::TENSOR, make_builder));
+    return tensor_indices_->Append(tensor_index);
+  }
+  // Appending a sparse coo tensor to the sequence
+  //
+  // \param sparse_coo_tensor_index Index of the sparse coo tensor in the object.
+  Status AppendSparseCOOTensor(const int32_t sparse_coo_tensor_index) {
+    // CreateAndUpdate needs the type tag to register the child builder
+    RETURN_NOT_OK(CreateAndUpdate(&sparse_coo_tensor_indices_,
+                                  PythonType::SPARSECOOTENSOR,
+                                  [this]() { return new Int32Builder(pool_); }));
+    return sparse_coo_tensor_indices_->Append(sparse_coo_tensor_index);
+  }
+  // Appending a sparse csr matrix to the sequence
+  //
+  // \param sparse_csr_matrix_index Index of the sparse csr matrix in the object.
+  Status AppendSparseCSRMatrix(const int32_t sparse_csr_matrix_index) {
+    RETURN_NOT_OK(CreateAndUpdate(&sparse_csr_matrix_indices_,
+                                  PythonType::SPARSECSRMATRIX,
+                                  [this]() { return new Int32Builder(pool_); }));
+    return sparse_csr_matrix_indices_->Append(sparse_csr_matrix_index);
+  }
+  // Appending a sparse csc matrix to the sequence
+  //
+  // \param sparse_csc_matrix_index Index of the sparse csc matrix in the object.
+  Status AppendSparseCSCMatrix(const int32_t sparse_csc_matrix_index) {
+    RETURN_NOT_OK(CreateAndUpdate(&sparse_csc_matrix_indices_,
+                                  PythonType::SPARSECSCMATRIX,
+                                  [this]() { return new Int32Builder(pool_); }));
+    return sparse_csc_matrix_indices_->Append(sparse_csc_matrix_index);
+  }
+  // Appending a sparse csf tensor to the sequence
+  //
+  // \param sparse_csf_tensor_index Index of the sparse csf tensor in the object.
+  Status AppendSparseCSFTensor(const int32_t sparse_csf_tensor_index) {
+    RETURN_NOT_OK(CreateAndUpdate(&sparse_csf_tensor_indices_,
+                                  PythonType::SPARSECSFTENSOR,
+                                  [this]() { return new Int32Builder(pool_); }));
+    return sparse_csf_tensor_indices_->Append(sparse_csf_tensor_index);
+  }
+  // Append a reference to a numpy ndarray
+  //
+  // \param ndarray_index Index of the ndarray in the object.
+  Status AppendNdarray(const int32_t ndarray_index) {
+    auto make_builder = [this]() { return new Int32Builder(pool_); };
+    RETURN_NOT_OK(CreateAndUpdate(&ndarray_indices_, PythonType::NDARRAY, make_builder));
+    return ndarray_indices_->Append(ndarray_index);
+  }
+  // Append a reference to a buffer
+  //
+  // \param buffer_index Index of the buffer in the object.
+  Status AppendBuffer(const int32_t buffer_index) {
+    auto make_builder = [this]() { return new Int32Builder(pool_); };
+    RETURN_NOT_OK(CreateAndUpdate(&buffer_indices_, PythonType::BUFFER, make_builder));
+    return buffer_indices_->Append(buffer_index);
+  }
+  // Append the iterable `sequence` as one list entry of the child registered
+  // under `tag`.
+  //
+  // `target_sequence` and `values` are the list builder and the nested
+  // SequenceBuilder for that child; both are created on first use. Every
+  // element is serialized into `values` at the given recursion depth. Fails
+  // with NotImplemented once the depth reaches kMaxRecursionDepth.
+  Status AppendSequence(PyObject* context, PyObject* sequence, int8_t tag,
+                        std::shared_ptr<ListBuilder>& target_sequence,
+                        std::unique_ptr<SequenceBuilder>& values, int32_t recursion_depth,
+                        SerializedPyObject* blobs_out) {
+    const bool too_deep = recursion_depth >= kMaxRecursionDepth;
+    if (too_deep) {
+      return Status::NotImplemented(
+          "This object exceeds the maximum recursion depth. It may contain itself "
+          "recursively.");
+    }
+    auto make_list_builder = [this, &values]() {
+      values.reset(new SequenceBuilder(pool_));
+      return new ListBuilder(pool_, values->builder());
+    };
+    RETURN_NOT_OK(CreateAndUpdate(&target_sequence, tag, make_list_builder));
+    RETURN_NOT_OK(target_sequence->Append());
+    auto append_element = [&](PyObject* obj, bool* /* keep_going, unused */) {
+      return Append(context, obj, values.get(), recursion_depth, blobs_out);
+    };
+    return internal::VisitIterable(sequence, append_element);
+  }
+  // Append a Python list; its elements are serialized one recursion level deeper
+  Status AppendList(PyObject* context, PyObject* list, int32_t recursion_depth,
+                    SerializedPyObject* blobs_out) {
+    const int32_t child_depth = recursion_depth + 1;
+    return AppendSequence(context, list, PythonType::LIST, lists_, list_values_,
+                          child_depth, blobs_out);
+  }
+  // Append a Python tuple; its elements are serialized one recursion level deeper
+  Status AppendTuple(PyObject* context, PyObject* tuple, int32_t recursion_depth,
+                     SerializedPyObject* blobs_out) {
+    const int32_t child_depth = recursion_depth + 1;
+    return AppendSequence(context, tuple, PythonType::TUPLE, tuples_, tuple_values_,
+                          child_depth, blobs_out);
+  }
+  // Append a Python set; its elements are serialized one recursion level deeper
+  Status AppendSet(PyObject* context, PyObject* set, int32_t recursion_depth,
+                   SerializedPyObject* blobs_out) {
+    const int32_t child_depth = recursion_depth + 1;
+    return AppendSequence(context, set, PythonType::SET, sets_, set_values_,
+                          child_depth, blobs_out);
+  }
+  // Append a Python dict (defined out of line)
+  Status AppendDict(PyObject* context, PyObject* dict, int32_t recursion_depth,
+                    SerializedPyObject* blobs_out);
+ // Finish building the sequence and return the result.
+ // Input arrays may be nullptr
+ Status Finish(std::shared_ptr<Array>* out) { return builder_->Finish(out); }
+ std::shared_ptr<DenseUnionBuilder> builder() { return builder_; }
+ private:
+ MemoryPool* pool_;
+ Int8Builder types_;
+ Int32Builder offsets_;
+ /// Mapping from PythonType to child index
+ std::vector<int8_t> type_map_;
+ std::shared_ptr<BooleanBuilder> bools_;
+ std::shared_ptr<Int64Builder> ints_;
+ std::shared_ptr<BinaryBuilder> bytes_;
+ std::shared_ptr<StringBuilder> strings_;
+ std::shared_ptr<HalfFloatBuilder> half_floats_;
+ std::shared_ptr<FloatBuilder> floats_;
+ std::shared_ptr<DoubleBuilder> doubles_;
+ std::shared_ptr<Date64Builder> date64s_;
+ std::unique_ptr<SequenceBuilder> list_values_;
+ std::shared_ptr<ListBuilder> lists_;
+ std::unique_ptr<DictBuilder> dict_values_;
+ std::shared_ptr<ListBuilder> dicts_;
+ std::unique_ptr<SequenceBuilder> tuple_values_;
+ std::shared_ptr<ListBuilder> tuples_;
+ std::unique_ptr<SequenceBuilder> set_values_;
+ std::shared_ptr<ListBuilder> sets_;
+ std::shared_ptr<Int32Builder> tensor_indices_;
+ std::shared_ptr<Int32Builder> sparse_coo_tensor_indices_;
+ std::shared_ptr<Int32Builder> sparse_csr_matrix_indices_;
+ std::shared_ptr<Int32Builder> sparse_csc_matrix_indices_;
+ std::shared_ptr<Int32Builder> sparse_csf_tensor_indices_;
+ std::shared_ptr<Int32Builder> ndarray_indices_;
+ std::shared_ptr<Int32Builder> buffer_indices_;
+ std::shared_ptr<DenseUnionBuilder> builder_;
+// Constructing dictionaries of key/value pairs. Sequences of
+// keys and values are built separately using a pair of
+// SequenceBuilders. The resulting Arrow representation
+// can be obtained via the Finish method.
+class DictBuilder {
+ public:
+  // Defaults to the default memory pool, like SequenceBuilder, so that a
+  // default-constructed DictBuilder never hands a null pool to its builders.
+  explicit DictBuilder(MemoryPool* pool = default_memory_pool())
+      : keys_(pool), vals_(pool) {
+    builder_.reset(new StructBuilder(struct_({field("keys", dense_union(FieldVector{})),
+                                              field("vals", dense_union(FieldVector{}))}),
+                                     pool, {keys_.builder(), vals_.builder()}));
+  }
+  // Builder for the keys of the dictionary
+  SequenceBuilder& keys() { return keys_; }
+  // Builder for the values of the dictionary
+  SequenceBuilder& vals() { return vals_; }
+  // Construct an Arrow StructArray representing the dictionary.
+  // Contains a field "keys" for the keys and "vals" for the values.
+  Status Finish(std::shared_ptr<Array>* out) { return builder_->Finish(out); }
+  // The struct builder combining the key and value builders
+  std::shared_ptr<StructBuilder> builder() { return builder_; }
+
+ private:
+  SequenceBuilder keys_;
+  SequenceBuilder vals_;
+  std::shared_ptr<StructBuilder> builder_;
+};
+// Append the Python dict `dict` as one list entry of (key, value) structs.
+//
+// Keys and values are serialized one recursion level deeper. Fails with
+// NotImplemented once the depth reaches kMaxRecursionDepth, and with Invalid
+// when the dict carries a "_pytype_" key but no callback context is set.
+// A dict carrying "_pytype_" has one reference released here (see below).
+Status SequenceBuilder::AppendDict(PyObject* context, PyObject* dict,
+                                   int32_t recursion_depth,
+                                   SerializedPyObject* blobs_out) {
+  if (recursion_depth >= kMaxRecursionDepth) {
+    return Status::NotImplemented(
+        "This object exceeds the maximum recursion depth. It may contain itself "
+        "recursively.");
+  }
+  RETURN_NOT_OK(CreateAndUpdate(&dicts_, PythonType::DICT, [this]() {
+    dict_values_.reset(new DictBuilder(pool_));
+    return new ListBuilder(pool_, dict_values_->builder());
+  }));
+  RETURN_NOT_OK(dicts_->Append());
+  PyObject* key;
+  PyObject* value;
+  Py_ssize_t pos = 0;
+  while (PyDict_Next(dict, &pos, &key, &value)) {
+    RETURN_NOT_OK(dict_values_->builder()->Append());
+    RETURN_NOT_OK(
+        Append(context, key, &dict_values_->keys(), recursion_depth + 1, blobs_out));
+    RETURN_NOT_OK(
+        Append(context, value, &dict_values_->vals(), recursion_depth + 1, blobs_out));
+  }
+  // This block is used to decrement the reference counts of the results
+  // returned by the serialization callback, which is called in AppendArray,
+  // in DeserializeDict and in Append
+  static PyObject* py_type = PyUnicode_FromString("_pytype_");
+  // PyDict_Contains returns -1 on error, which must not be read as "present"
+  const int has_pytype = PyDict_Contains(dict, py_type);
+  if (has_pytype < 0) {
+    RETURN_NOT_OK(CheckPyError());
+  }
+  if (has_pytype > 0) {
+    // If the dictionary contains the key "_pytype_", then the user has to
+    // have registered a callback.
+    if (context == Py_None) {
+      return Status::Invalid("No serialization callback set");
+    }
+    Py_XDECREF(dict);
+  }
+  return Status::OK();
+}
+// Call `context.<method_name>(elem)` and store the returned object in *result.
+//
+// When no context is registered (Py_None), *result is set to NULL and a
+// SerializationError is returned. Otherwise the returned status reflects any
+// Python exception raised by the call.
+Status CallCustomCallback(PyObject* context, PyObject* method_name, PyObject* elem,
+                          PyObject** result) {
+  if (context == Py_None) {
+    *result = NULL;
+    return Status::SerializationError("error while calling callback on ",
+                                      internal::PyObject_StdStringRepr(elem),
+                                      ": handler not registered");
+  }
+  *result = PyObject_CallMethodObjArgs(context, method_name, elem, NULL);
+  return CheckPyError();
+}
+// Serialize `value` through the context's "_serialize_callback" method.
+//
+// On success *serialized_object holds the dict returned by the callback (a
+// reference the caller is responsible for). If the callback returns anything
+// other than a dict, that object is released, *serialized_object is reset to
+// NULL and a TypeError is returned.
+Status CallSerializeCallback(PyObject* context, PyObject* value,
+                             PyObject** serialized_object) {
+  OwnedRef method_name(PyUnicode_FromString("_serialize_callback"));
+  RETURN_NOT_OK(CallCustomCallback(context, method_name.obj(), value, serialized_object));
+  if (!PyDict_Check(*serialized_object)) {
+    // The object is not handed on to anyone, so drop it here instead of leaking it
+    Py_CLEAR(*serialized_object);
+    return Status::TypeError("serialization callback must return a valid dictionary");
+  }
+  return Status::OK();
+}
+// Deserialize `value` through the context's "_deserialize_callback" method and
+// store the result in *deserialized_object.
+Status CallDeserializeCallback(PyObject* context, PyObject* value,
+                               PyObject** deserialized_object) {
+  OwnedRef method_name(PyUnicode_FromString("_deserialize_callback"));
+  return CallCustomCallback(context, method_name.obj(), value, deserialized_object);
+}
+Status AppendArray(PyObject* context, PyArrayObject* array, SequenceBuilder* builder,
+ int32_t recursion_depth, SerializedPyObject* blobs_out);
+// Append a NumPy integer scalar, read through the `obval` member of
+// NumpyScalarObject and widened to int64.
+template <typename NumpyScalarObject>
+Status AppendIntegerScalar(PyObject* obj, SequenceBuilder* builder) {
+  int64_t value = reinterpret_cast<NumpyScalarObject*>(obj)->obval;
+  return builder->AppendInt64(value);
+}
+// Append a potentially 64-bit wide unsigned Numpy scalar.
+// Must check for overflow as we reinterpret it as signed int64.
+// Values above the int64 maximum are rejected with an Invalid status.
+template <typename NumpyScalarObject>
+Status AppendLargeUnsignedScalar(PyObject* obj, SequenceBuilder* builder) {
+  constexpr uint64_t max_value = std::numeric_limits<int64_t>::max();
+  uint64_t value = reinterpret_cast<NumpyScalarObject*>(obj)->obval;
+  if (value > max_value) {
+    return Status::Invalid("cannot serialize Numpy uint64 scalar >= 2**63");
+  }
+  return builder->AppendInt64(static_cast<int64_t>(value));
+}
+// Append the NumPy scalar `obj` to `builder`, dispatching on its scalar type.
+//
+// Booleans and floating-point scalars keep their own representation; all
+// integer scalars are stored as int64, with the potentially 64-bit unsigned
+// ones range-checked first. Unrecognized scalar types yield NotImplemented.
+Status AppendScalar(PyObject* obj, SequenceBuilder* builder) {
+  if (PyArray_IsScalar(obj, Bool)) {
+    return builder->AppendBool(reinterpret_cast<PyBoolScalarObject*>(obj)->obval != 0);
+  } else if (PyArray_IsScalar(obj, Half)) {
+    return builder->AppendHalfFloat(reinterpret_cast<PyHalfScalarObject*>(obj)->obval);
+  } else if (PyArray_IsScalar(obj, Float)) {
+    return builder->AppendFloat(reinterpret_cast<PyFloatScalarObject*>(obj)->obval);
+  } else if (PyArray_IsScalar(obj, Double)) {
+    return builder->AppendDouble(reinterpret_cast<PyDoubleScalarObject*>(obj)->obval);
+  }
+  if (PyArray_IsScalar(obj, Byte)) {
+    return AppendIntegerScalar<PyByteScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, Short)) {
+    return AppendIntegerScalar<PyShortScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, Int)) {
+    return AppendIntegerScalar<PyIntScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, Long)) {
+    return AppendIntegerScalar<PyLongScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, LongLong)) {
+    return AppendIntegerScalar<PyLongLongScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, Int64)) {
+    return AppendIntegerScalar<PyInt64ScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, UByte)) {
+    return AppendIntegerScalar<PyUByteScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, UShort)) {
+    return AppendIntegerScalar<PyUShortScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, UInt)) {
+    return AppendIntegerScalar<PyUIntScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, ULong)) {
+    // may not fit into int64, hence the range-checked variant
+    return AppendLargeUnsignedScalar<PyULongScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, ULongLong)) {
+    return AppendLargeUnsignedScalar<PyULongLongScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, UInt64)) {
+    return AppendLargeUnsignedScalar<PyUInt64ScalarObject>(obj, builder);
+  }
+  return Status::NotImplemented("Numpy scalar type not recognized");
+}
+Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder,
+ int32_t recursion_depth, SerializedPyObject* blobs_out) {
+ // The bool case must precede the int case (PyInt_Check passes for bools)
+ if (PyBool_Check(elem)) {
+ RETURN_NOT_OK(builder->AppendBool(elem == Py_True));
+ } else if (PyArray_DescrFromScalar(elem)->type_num == NPY_HALF) {
+ npy_half halffloat = reinterpret_cast<PyHalfScalarObject*>(elem)->obval;
+ RETURN_NOT_OK(builder->AppendHalfFloat(halffloat));
+ } else if (PyFloat_Check(elem)) {
+ RETURN_NOT_OK(builder->AppendDouble(PyFloat_AS_DOUBLE(elem)));
+ } else if (PyLong_Check(elem)) {
+ int overflow = 0;
+ int64_t data = PyLong_AsLongLongAndOverflow(elem, &overflow);
+ if (!overflow) {
+ RETURN_NOT_OK(builder->AppendInt64(data));
+ } else {
+ // Attempt to serialize the object using the custom callback.
+ PyObject* serialized_object;
+ // The reference count of serialized_object will be decremented in SerializeDict
+ RETURN_NOT_OK(CallSerializeCallback(context, elem, &serialized_object));
+ builder->AppendDict(context, serialized_object, recursion_depth, blobs_out));
+ }
+ } else if (PyBytes_Check(elem)) {
+ auto data = reinterpret_cast<uint8_t*>(PyBytes_AS_STRING(elem));
+ int32_t size = -1;
+ RETURN_NOT_OK(internal::CastSize(PyBytes_GET_SIZE(elem), &size));
+ RETURN_NOT_OK(builder->AppendBytes(data, size));
+ } else if (PyUnicode_Check(elem)) {
+ ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromUnicode(elem));
+ int32_t size = -1;
+ RETURN_NOT_OK(internal::CastSize(view.size, &size));
+ RETURN_NOT_OK(builder->AppendString(view.bytes, size));
+ } else if (PyList_CheckExact(elem)) {
+ RETURN_NOT_OK(builder->AppendList(context, elem, recursion_depth, blobs_out));
+ } else if (PyDict_CheckExact(elem)) {
+ RETURN_NOT_OK(builder->AppendDict(context, elem, recursion_depth, blobs_out));
+ } else if (PyTuple_CheckExact(elem)) {
+ RETURN_NOT_OK(builder->AppendTuple(context, elem, recursion_depth, blobs_out));
+ } else if (PySet_Check(elem)) {
+ RETURN_NOT_OK(builder->AppendSet(context, elem, recursion_depth, blobs_out));
+ } else if (PyArray_IsScalar(elem, Generic)) {
+ RETURN_NOT_OK(AppendScalar(elem, builder));
+ } else if (PyArray_CheckExact(elem)) {
+ RETURN_NOT_OK(AppendArray(context, reinterpret_cast<PyArrayObject*>(elem), builder,
+ recursion_depth, blobs_out));
+ } else if (elem == Py_None) {
+ RETURN_NOT_OK(builder->AppendNone());
+ } else if (PyDateTime_Check(elem)) {
+ PyDateTime_DateTime* datetime = reinterpret_cast<PyDateTime_DateTime*>(elem);
+ RETURN_NOT_OK(builder->AppendDate64(internal::PyDateTime_to_us(datetime)));
+ } else if (is_buffer(elem)) {
+ RETURN_NOT_OK(builder->AppendBuffer(static_cast<int32_t>(blobs_out->buffers.size())));
+ ARROW_ASSIGN_OR_RAISE(auto buffer, unwrap_buffer(elem));
+ blobs_out->buffers.push_back(buffer);
+ } else if (is_tensor(elem)) {
+ RETURN_NOT_OK(builder->AppendTensor(static_cast<int32_t>(blobs_out->tensors.size())));
+ ARROW_ASSIGN_OR_RAISE(auto tensor, unwrap_tensor(elem));
+ blobs_out->tensors.push_back(tensor);
+ } else if (is_sparse_coo_tensor(elem)) {
+ RETURN_NOT_OK(builder->AppendSparseCOOTensor(
+ static_cast<int32_t>(blobs_out->sparse_tensors.size())));
+ ARROW_ASSIGN_OR_RAISE(auto tensor, unwrap_sparse_coo_tensor(elem));
+ blobs_out->sparse_tensors.push_back(tensor);
+ } else if (is_sparse_csr_matrix(elem)) {
+ RETURN_NOT_OK(builder->AppendSparseCSRMatrix(
+ static_cast<int32_t>(blobs_out->sparse_tensors.size())));
+ ARROW_ASSIGN_OR_RAISE(auto matrix, unwrap_sparse_csr_matrix(elem));
+ blobs_out->sparse_tensors.push_back(matrix);
+ } else if (is_sparse_csc_matrix(elem)) {
+ RETURN_NOT_OK(builder->AppendSparseCSCMatrix(
+ static_cast<int32_t>(blobs_out->sparse_tensors.size())));
+ ARROW_ASSIGN_OR_RAISE(auto matrix, unwrap_sparse_csc_matrix(elem));
+ blobs_out->sparse_tensors.push_back(matrix);
+ } else if (is_sparse_csf_tensor(elem)) {
+ RETURN_NOT_OK(builder->AppendSparseCSFTensor(
+ static_cast<int32_t>(blobs_out->sparse_tensors.size())));
+ ARROW_ASSIGN_OR_RAISE(auto tensor, unwrap_sparse_csf_tensor(elem));
+ blobs_out->sparse_tensors.push_back(tensor);
+ } else {
+ // Attempt to serialize the object using the custom callback.
+ PyObject* serialized_object;
+ // The reference count of serialized_object will be decremented in SerializeDict
+ RETURN_NOT_OK(CallSerializeCallback(context, elem, &serialized_object));
+ builder->AppendDict(context, serialized_object, recursion_depth, blobs_out));
+ }
+ return Status::OK();
+Status AppendArray(PyObject* context, PyArrayObject* array, SequenceBuilder* builder,  // Append one NumPy array to `builder`, choosing the encoding from the array's dtype.
+ int32_t recursion_depth, SerializedPyObject* blobs_out) {  // `blobs_out` receives the tensor for numeric dtypes; `recursion_depth` is forwarded (+1) on the callback path.
+ int dtype = PyArray_TYPE(array);
+ switch (dtype) {
+ case NPY_UINT8:  // All integer and floating dtypes listed here fall through to the shared tensor path.
+ case NPY_INT8:
+ case NPY_UINT16:
+ case NPY_INT16:
+ case NPY_UINT32:
+ case NPY_INT32:
+ case NPY_UINT64:
+ case NPY_INT64:
+ case NPY_HALF:
+ case NPY_FLOAT:
+ case NPY_DOUBLE: {
+ builder->AppendNdarray(static_cast<int32_t>(blobs_out->ndarrays.size())));  // Records the index the tensor is about to occupy in blobs_out->ndarrays. NOTE(review): the extra ')' suggests an enclosing status-check wrapper is missing from this rendering -- confirm against the real file.
+ std::shared_ptr<Tensor> tensor;
+ RETURN_NOT_OK(NdarrayToTensor(default_memory_pool(),
+ reinterpret_cast<PyObject*>(array), {}, &tensor));
+ blobs_out->ndarrays.push_back(tensor);  // Lands at the index recorded above.
+ } break;
+ default: {  // Every other dtype goes through CallSerializeCallback.
+ PyObject* serialized_object;
+ // The reference count of serialized_object will be decremented in SerializeDict
+ RETURN_NOT_OK(CallSerializeCallback(context, reinterpret_cast<PyObject*>(array),
+ &serialized_object));
+ RETURN_NOT_OK(builder->AppendDict(context, serialized_object, recursion_depth + 1,  // The callback result is appended as a dict, one recursion level deeper.
+ blobs_out));
+ }
+ }
+ return Status::OK();
+std::shared_ptr<RecordBatch> MakeBatch(std::shared_ptr<Array> data) {  // Wrap `data` in a single-column RecordBatch.
+ auto field = std::make_shared<Field>("list", data->type());  // The only column is named "list" and takes its type from `data`.
+ auto schema = ::arrow::schema({field});
+ return RecordBatch::Make(schema, data->length(), {data});  // Row count is the array length.
+Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out) {  // Serialize each element of `sequence` into `out`; returns the first non-OK status encountered.
+ PyAcquireGIL lock;  // Scope-bound guard; by its name it acquires the Python GIL for the whole function.
+ SequenceBuilder builder;
+ RETURN_NOT_OK(internal::VisitIterable(
+ sequence, [&](PyObject* obj, bool* keep_going /* unused */) {
+ return Append(context, obj, &builder, 0, out);  // Top-level elements start at recursion depth 0; `out` doubles as the blob collector.
+ }));
+ std::shared_ptr<Array> array;
+ RETURN_NOT_OK(builder.Finish(&array));
+ out->batch = MakeBatch(array);  // The finished array becomes the single column of out->batch.
+ return Status::OK();
+Status SerializeNdarray(std::shared_ptr<Tensor> tensor, SerializedPyObject* out) {  // Build a one-element sequence that refers to `tensor` and store both in `out`.
+ std::shared_ptr<Array> array;
+ SequenceBuilder builder;
+ RETURN_NOT_OK(builder.AppendNdarray(static_cast<int32_t>(out->ndarrays.size())));  // Index the tensor will have once pushed on the next line.
+ out->ndarrays.push_back(tensor);
+ RETURN_NOT_OK(builder.Finish(&array));
+ out->batch = MakeBatch(array);  // Replaces any batch previously held by `out`.
+ return Status::OK();
+Status WriteNdarrayHeader(std::shared_ptr<DataType> dtype,  // Serialize a placeholder tensor with the given dtype/shape/size and write it to `dst`.
+ const std::vector<int64_t>& shape, int64_t tensor_num_bytes,
+ io::OutputStream* dst) {
+ auto empty_tensor = std::make_shared<Tensor>(
+ dtype, std::make_shared<Buffer>(nullptr, tensor_num_bytes), shape);  // The buffer has a null data pointer but reports tensor_num_bytes as its size.
+ SerializedPyObject serialized_tensor;
+ RETURN_NOT_OK(SerializeNdarray(empty_tensor, &serialized_tensor));
+ return serialized_tensor.WriteTo(dst);  // NOTE(review): WriteTo passes every ndarray to ipc::WriteTensor; how that treats the null-data buffer is not visible here -- confirm.
+ : ipc_options(ipc::IpcWriteOptions::Defaults()) {}  // Member-initializer list of a constructor whose signature line is not shown in this rendering; ipc_options starts from the library defaults.
+Status SerializedPyObject::WriteTo(io::OutputStream* dst) {  // Stream layout: four int32 counts, padding, record batch stream, padding, tensors, sparse tensors, ndarrays, then raw buffers.
+ int32_t num_tensors = static_cast<int32_t>(this->tensors.size());  // Counts are narrowed to int32 without a range check.
+ int32_t num_sparse_tensors = static_cast<int32_t>(this->sparse_tensors.size());
+ int32_t num_ndarrays = static_cast<int32_t>(this->ndarrays.size());
+ int32_t num_buffers = static_cast<int32_t>(this->buffers.size());
+ dst->Write(reinterpret_cast<const uint8_t*>(&num_tensors), sizeof(int32_t)));  // Each count is written as its raw in-memory bytes. NOTE(review): the extra ')' on these four lines suggests a status-check wrapper is missing from this rendering -- confirm.
+ dst->Write(reinterpret_cast<const uint8_t*>(&num_sparse_tensors), sizeof(int32_t)));
+ dst->Write(reinterpret_cast<const uint8_t*>(&num_ndarrays), sizeof(int32_t)));
+ dst->Write(reinterpret_cast<const uint8_t*>(&num_buffers), sizeof(int32_t)));
+ // Align stream to 8-byte offset
+ RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kArrowIpcAlignment));
+ RETURN_NOT_OK(ipc::WriteRecordBatchStream({this->batch}, this->ipc_options, dst));
+ // Align stream to 64-byte offset so tensor bodies are 64-byte aligned
+ RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment));
+ int32_t metadata_length;  // Out-parameters required by the ipc writers; the values are not used afterwards.
+ int64_t body_length;
+ for (const auto& tensor : this->tensors) {
+ RETURN_NOT_OK(ipc::WriteTensor(*tensor, dst, &metadata_length, &body_length));
+ RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment));  // Re-align after every tensor so the next one starts aligned.
+ }
+ for (const auto& sparse_tensor : this->sparse_tensors) {
+ ipc::WriteSparseTensor(*sparse_tensor, dst, &metadata_length, &body_length));  // NOTE(review): same unbalanced ')' as above -- a wrapper line is probably missing here too.
+ RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment));
+ }
+ for (const auto& tensor : this->ndarrays) {  // ndarrays are stored as Tensor objects and written the same way as `tensors`.
+ RETURN_NOT_OK(ipc::WriteTensor(*tensor, dst, &metadata_length, &body_length));
+ RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment));
+ }
+ for (const auto& buffer : this->buffers) {  // Each buffer is an int64 byte length followed by the bytes; no alignment is inserted between buffers.
+ int64_t size = buffer->size();
+ RETURN_NOT_OK(dst->Write(reinterpret_cast<const uint8_t*>(&size), sizeof(int64_t)));
+ RETURN_NOT_OK(dst->Write(buffer->data(), size));
+ }
+ return Status::OK();
+namespace {
+Status CountSparseTensors(  // Tally `sparse_tensors` by format and return the tallies as a new Python dict through `*out`.
+ const std::vector<std::shared_ptr<SparseTensor>>& sparse_tensors, PyObject** out) {
+ OwnedRef num_sparse_tensors(PyDict_New());  // NOTE(review): the result of PyDict_New() is used below without a null check.
+ size_t num_coo = 0;
+ size_t num_csr = 0;
+ size_t num_csc = 0;
+ size_t num_csf = 0;
+ size_t ndim_csf = 0;  // Sum of ndim() over all CSF tensors, not a count.
+ for (const auto& sparse_tensor : sparse_tensors) {
+ switch (sparse_tensor->format_id()) {  // No default case: a format not listed here is not counted.
+ case SparseTensorFormat::COO:
+ ++num_coo;
+ break;
+ case SparseTensorFormat::CSR:
+ ++num_csr;
+ break;
+ case SparseTensorFormat::CSC:
+ ++num_csc;
+ break;
+ case SparseTensorFormat::CSF:
+ ++num_csf;
+ ndim_csf += sparse_tensor->ndim();
+ break;
+ }
+ }
+ PyDict_SetItemString(num_sparse_tensors.obj(), "coo", PyLong_FromSize_t(num_coo));  // NOTE(review): PyLong_FromSize_t returns a new reference and PyDict_SetItemString does not steal it, so these five values look like they are never released -- confirm. Return codes are also ignored.
+ PyDict_SetItemString(num_sparse_tensors.obj(), "csr", PyLong_FromSize_t(num_csr));
+ PyDict_SetItemString(num_sparse_tensors.obj(), "csc", PyLong_FromSize_t(num_csc));
+ PyDict_SetItemString(num_sparse_tensors.obj(), "csf", PyLong_FromSize_t(num_csf));
+ PyDict_SetItemString(num_sparse_tensors.obj(), "ndim_csf", PyLong_FromSize_t(ndim_csf));
+ *out = num_sparse_tensors.detach();  // Ownership of the dict passes to the caller.
+ return Status::OK();
+} // namespace
+Status SerializedPyObject::GetComponents(MemoryPool* memory_pool, PyObject** out) {  // Build a Python dict of counts plus a 'data' list of wrapped buffers and return it through `*out`.
+ PyAcquireGIL py_gil;  // Released and re-acquired explicitly around the record batch write below.
+ OwnedRef result(PyDict_New());
+ PyObject* buffers = PyList_New(0);  // Backing list for result['data'].
+ PyObject* num_sparse_tensors = nullptr;
+ // TODO(wesm): Not sure how pedantic we need to be about checking the return
+ // values of these functions. There are other places where we do not check
+ // PyDict_SetItem/SetItemString return value, but these failures would be
+ // quite esoteric
+ PyDict_SetItemString(result.obj(), "num_tensors",
+ PyLong_FromSize_t(this->tensors.size()));
+ RETURN_NOT_OK(CountSparseTensors(this->sparse_tensors, &num_sparse_tensors));  // Yields a per-format dict (coo/csr/csc/csf/ndim_csf) owned by this function.
+ PyDict_SetItemString(result.obj(), "num_sparse_tensors", num_sparse_tensors);
+ PyDict_SetItemString(result.obj(), "ndim_csf", num_sparse_tensors);  // NOTE(review): the same per-format dict is stored again under "ndim_csf", and the reference obtained from CountSparseTensors is not released in the visible code -- confirm both are intended.
+ PyDict_SetItemString(result.obj(), "num_ndarrays",
+ PyLong_FromSize_t(this->ndarrays.size()));
+ PyDict_SetItemString(result.obj(), "num_buffers",
+ PyLong_FromSize_t(this->buffers.size()));
+ PyDict_SetItemString(result.obj(), "data", buffers);
+ Py_DECREF(buffers);  // The dict now holds the list; `buffers` remains usable as a borrowed pointer while `result` is alive.
+ auto PushBuffer = [&buffers](const std::shared_ptr<Buffer>& buffer) {  // Wraps `buffer` as a Python object and appends it to the 'data' list.
+ PyObject* wrapped_buffer = wrap_buffer(buffer);
+ if (PyList_Append(buffers, wrapped_buffer) < 0) {
+ Py_DECREF(wrapped_buffer);
+ }
+ Py_DECREF(wrapped_buffer);  // NOTE(review): as shown, a failed append reaches this second Py_DECREF and still returns OK; an early error return after wrap_buffer and inside the `if` may be missing from this rendering -- confirm.
+ return Status::OK();
+ };
+ constexpr int64_t kInitialCapacity = 1024;  // Initial capacity passed to the in-memory output stream.
+ // Write the record batch describing the object structure
+ py_gil.release();  // No Python API is called until acquire() below.
+ ARROW_ASSIGN_OR_RAISE(auto stream,
+ io::BufferOutputStream::Create(kInitialCapacity, memory_pool));
+ ipc::WriteRecordBatchStream({this->batch}, this->ipc_options, stream.get()));  // NOTE(review): unbalanced ')' -- a status-check wrapper line is probably missing from this rendering.
+ ARROW_ASSIGN_OR_RAISE(auto buffer, stream->Finish());
+ py_gil.acquire();
+ RETURN_NOT_OK(PushBuffer(buffer));  // First entry of 'data' is the serialized record batch.
+ // For each tensor, get a metadata buffer and a buffer for the body
+ for (const auto& tensor : this->tensors) {
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ipc::Message> message,
+ ipc::GetTensorMessage(*tensor, memory_pool));
+ RETURN_NOT_OK(PushBuffer(message->metadata()));
+ RETURN_NOT_OK(PushBuffer(message->body()));
+ }
+ // For each sparse tensor, get a metadata buffer and buffers containing index and data
+ for (const auto& sparse_tensor : this->sparse_tensors) {
+ ipc::IpcPayload payload;
+ RETURN_NOT_OK(ipc::GetSparseTensorPayload(*sparse_tensor, memory_pool, &payload));
+ RETURN_NOT_OK(PushBuffer(payload.metadata));
+ for (const auto& body : payload.body_buffers) {  // One 'data' entry per body buffer, so the entry count per sparse tensor depends on the payload.
+ RETURN_NOT_OK(PushBuffer(body));
+ }
+ }
+ // For each ndarray, get a metadata buffer and a buffer for the body
+ for (const auto& ndarray : this->ndarrays) {
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ipc::Message> message,
+ ipc::GetTensorMessage(*ndarray, memory_pool));
+ RETURN_NOT_OK(PushBuffer(message->metadata()));
+ RETURN_NOT_OK(PushBuffer(message->body()));
+ }
+ for (const auto& buf : this->buffers) {  // Plain buffers are appended last, one entry each.
+ RETURN_NOT_OK(PushBuffer(buf));
+ }
+ *out = result.detach();  // Caller receives ownership of the result dict.
+ return Status::OK();
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/serialize.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/serialize.h
new file mode 100644
index 0000000000..fd207d3e06
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/serialize.h
@@ -0,0 +1,145 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <memory>
+#include <vector>
+#include "arrow/ipc/options.h"
+#include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"
+#include "arrow/status.h"
+// Forward declaring PyObject, see
+// https://mail.python.org/pipermail/python-dev/2003-August/037601.html
+#ifndef PyObject_HEAD
+struct _object;
+typedef _object PyObject;
+namespace arrow {
+class Buffer;
+class DataType;
+class MemoryPool;
+class RecordBatch;
+class Tensor;
+namespace io {
+class OutputStream;
+} // namespace io
+namespace py {
+struct ARROW_PYTHON_EXPORT SerializedPyObject {
+ std::shared_ptr<RecordBatch> batch;  // Single-column batch describing the serialized sequence.
+ std::vector<std::shared_ptr<Tensor>> tensors;  // The four vectors below are referenced from `batch` by position.
+ std::vector<std::shared_ptr<SparseTensor>> sparse_tensors;
+ std::vector<std::shared_ptr<Tensor>> ndarrays;
+ std::vector<std::shared_ptr<Buffer>> buffers;
+ ipc::IpcWriteOptions ipc_options;  // Options used when `batch` is written as an IPC stream.
+ SerializedPyObject();
+ /// \brief Write serialized Python object to OutputStream
+ /// \param[in,out] dst an OutputStream
+ /// \return Status
+ Status WriteTo(io::OutputStream* dst);
+ /// \brief Convert SerializedPyObject to a dict containing the message
+ /// components as Buffer instances with minimal memory allocation
+ ///
+ /// {
+ /// 'num_tensors': M,
+ /// 'num_sparse_tensors': {per-format counts: 'coo', 'csr', 'csc', 'csf', 'ndim_csf'},
+ /// 'num_ndarrays': L, 'num_buffers': K,
+ /// 'data': [Buffer]
+ /// }
+ ///
+ /// 'data' begins with the buffer holding the serialized record batch containing
+ /// the UnionArray that describes the whole object. Each tensor and each ndarray
+ /// then adds a metadata buffer and a body buffer, each sparse tensor adds a metadata
+ /// buffer plus its body buffers, and each plain buffer is appended unchanged.
+ Status GetComponents(MemoryPool* pool, PyObject** out);
+/// \brief Serialize Python sequence as a SerializedPyObject.
+/// \param[in] context Serialization context which contains custom serialization
+/// and deserialization callbacks. Can be any Python object with a
+/// _serialize_callback method for serialization and a _deserialize_callback
+/// method for deserialization. If context is None, no custom serialization
+/// will be attempted.
+/// \param[in] sequence A Python sequence object to serialize to Arrow data
+/// structures
+/// \param[out] out The serialized representation
+/// \return Status; the first error hit while visiting the sequence is returned
+/// Release GIL before calling; the implementation acquires it itself (PyAcquireGIL)
+Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out);
+/// \brief Serialize an Arrow Tensor as a SerializedPyObject.
+/// \param[in] tensor Tensor to be serialized
+/// \param[out] out The serialized representation
+/// \return Status. NOTE(review): the visible part of serialize.cc defines SerializeNdarray with this parameter list but no SerializeTensor -- confirm a definition exists.
+Status SerializeTensor(std::shared_ptr<Tensor> tensor, py::SerializedPyObject* out);
+/// \brief Write the Tensor metadata header to an OutputStream.
+/// \param[in] dtype DataType of the Tensor
+/// \param[in] shape The shape of the tensor
+/// \param[in] tensor_num_bytes The length of the Tensor data in bytes
+/// \param[in] dst The OutputStream to write the Tensor header to
+/// \return Status from serializing and writing a placeholder tensor built from these arguments
+Status WriteNdarrayHeader(std::shared_ptr<DataType> dtype,
+ const std::vector<int64_t>& shape, int64_t tensor_num_bytes,
+ io::OutputStream* dst);
+struct PythonType {  // Struct used only as a scope for the enum below. NOTE(review): only four enumerators are visible in this rendering; the real list may be longer, so do not rely on the numeric values seen here.
+ enum type {
+ INT,
+ PY2INT, // Kept for compatibility
+ DATE64,
+ SET,
+ };
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/type_traits.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/type_traits.h
new file mode 100644
index 0000000000..a941577f76
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/type_traits.h
@@ -0,0 +1,350 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Internal header
+#pragma once
+#include "arrow/python/platform.h"
+#include <cstdint>
+#include <limits>
+#include "arrow/python/numpy_interop.h"
+#include <numpy/halffloat.h>
+#include "arrow/type_fwd.h"
+#include "arrow/util/logging.h"
+namespace arrow {
+namespace py {
+static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();  // INT64_MIN; used below as na_value for the timestamp, duration, date and time32 traits.
+constexpr int64_t kNanosecondsInDay = 86400000000000LL;  // 86400 s * 1e9 ns.
+namespace internal {
+// Type traits for Numpy -> Arrow equivalence: each specialization maps a NumPy type number to an element type, an Arrow type class and a builder class
+template <int TYPE>
+struct npy_traits {};  // Primary template is empty; only the specializations carry members.
+template <>
+struct npy_traits<NPY_BOOL> {
+ typedef uint8_t value_type;  // Element type used for NPY_BOOL data.
+ using TypeClass = BooleanType;
+ using BuilderClass = BooleanBuilder;
+ static constexpr bool supports_nulls = false;  // No in-band null value, so isnull() is always false.
+ static inline bool isnull(uint8_t v) { return false; }
+#define NPY_INT_DECL(TYPE, CapType, T) \
+ template <> \
+ struct npy_traits<NPY_##TYPE> { \
+ typedef T value_type; \
+ using TypeClass = CapType##Type; \
+ using BuilderClass = CapType##Builder; \
+ \
+ static constexpr bool supports_nulls = false; \
+ static inline bool isnull(T v) { return false; } \
+ };
+NPY_INT_DECL(INT8, Int8, int8_t);  // Each use defines npy_traits<NPY_x> with the given C type and the matching Arrow <CapType>Type / <CapType>Builder; integers have no null sentinel.
+NPY_INT_DECL(INT16, Int16, int16_t);
+NPY_INT_DECL(INT32, Int32, int32_t);
+NPY_INT_DECL(INT64, Int64, int64_t);
+NPY_INT_DECL(UINT8, UInt8, uint8_t);
+NPY_INT_DECL(UINT16, UInt16, uint16_t);
+NPY_INT_DECL(UINT32, UInt32, uint32_t);
+NPY_INT_DECL(UINT64, UInt64, uint64_t);
+#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32  // NOTE(review): no matching #endif is visible in this rendering, so which of the following declarations this guards cannot be told from here -- confirm.
+NPY_INT_DECL(INT, Int32, int32_t);  // Extra specializations for NumPy type numbers that are distinct from the fixed-width aliases on some platforms.
+NPY_INT_DECL(UINT, UInt32, uint32_t);
+NPY_INT_DECL(LONGLONG, Int64, int64_t);
+NPY_INT_DECL(ULONGLONG, UInt64, uint64_t);
+template <>
+struct npy_traits<NPY_FLOAT16> {
+ typedef npy_half value_type;
+ using TypeClass = HalfFloatType;
+ using BuilderClass = HalfFloatBuilder;
+ static constexpr npy_half na_sentinel = NPY_HALF_NAN;
+ static constexpr bool supports_nulls = true;
+ static inline bool isnull(npy_half v) { return v == NPY_HALF_NAN; }  // NOTE(review): this is an equality test against one NaN value, unlike the `v != v` test used for float/double below; if npy_half is an integer bit pattern, NaNs with other bit patterns would not match -- confirm whether that is intended.
+template <>
+struct npy_traits<NPY_FLOAT32> {
+ typedef float value_type;
+ using TypeClass = FloatType;
+ using BuilderClass = FloatBuilder;
+ // We need to use quiet_NaN here instead of the NAN macro as on Windows
+ // the NAN macro leads to "division-by-zero" compile-time error with clang.
+ static constexpr float na_sentinel = std::numeric_limits<float>::quiet_NaN();
+ static constexpr bool supports_nulls = true;
+ static inline bool isnull(float v) { return v != v; }  // True only for NaN, since NaN compares unequal to itself.
+template <>
+struct npy_traits<NPY_FLOAT64> {
+ typedef double value_type;
+ using TypeClass = DoubleType;
+ using BuilderClass = DoubleBuilder;
+ static constexpr double na_sentinel = std::numeric_limits<double>::quiet_NaN();
+ static constexpr bool supports_nulls = true;
+ static inline bool isnull(double v) { return v != v; }  // Same NaN self-inequality test as for float.
+template <>
+struct npy_traits<NPY_DATETIME> {
+ typedef int64_t value_type;
+ using TypeClass = TimestampType;
+ using BuilderClass = TimestampBuilder;
+ static constexpr bool supports_nulls = true;
+ static inline bool isnull(int64_t v) {
+ // NaT = -2**63
+ // = -0x8000000000000000
+ // = -9223372036854775808;
+ // = std::numeric_limits<int64_t>::min()
+ return v == std::numeric_limits<int64_t>::min();
+ }
+template <>
+struct npy_traits<NPY_TIMEDELTA> {
+ typedef int64_t value_type;
+ using TypeClass = DurationType;
+ using BuilderClass = DurationBuilder;
+ static constexpr bool supports_nulls = true;
+ static inline bool isnull(int64_t v) {
+ // NaT = -2**63 = std::numeric_limits<int64_t>::min()
+ return v == std::numeric_limits<int64_t>::min();
+ }
+template <>
+struct npy_traits<NPY_OBJECT> {  // Object arrays: no TypeClass/BuilderClass is declared for this specialization.
+ typedef PyObject* value_type;
+ static constexpr bool supports_nulls = true;
+ static inline bool isnull(PyObject* v) { return v == Py_None; }  // Only the None singleton counts as null here.
+// Type traits for Arrow -> Numpy equivalence
+// Note *supports_nulls* means the equivalent Numpy type supports nulls
+template <int TYPE>
+struct arrow_traits {};  // Primary template is empty; only the specializations carry members.
+template <>
+struct arrow_traits<Type::BOOL> {
+ static constexpr int npy_type = NPY_BOOL;
+ static constexpr bool supports_nulls = false;
+ typedef typename npy_traits<NPY_BOOL>::value_type T;  // Element type is taken from the matching npy_traits specialization.
+#define INT_DECL(TYPE) \
+ template <> \
+ struct arrow_traits<Type::TYPE> { \
+ static constexpr int npy_type = NPY_##TYPE; \
+ static constexpr bool supports_nulls = false; \
+ static constexpr double na_value = std::numeric_limits<double>::quiet_NaN(); \
+ typedef typename npy_traits<NPY_##TYPE>::value_type T; \
+ };
+template <>
+struct arrow_traits<Type::HALF_FLOAT> {
+ static constexpr int npy_type = NPY_FLOAT16;
+ static constexpr bool supports_nulls = true;
+ static constexpr uint16_t na_value = NPY_HALF_NAN;  // Null marker is the NPY_HALF_NAN value held as uint16_t.
+ typedef typename npy_traits<NPY_FLOAT16>::value_type T;
+template <>
+struct arrow_traits<Type::FLOAT> {
+ static constexpr int npy_type = NPY_FLOAT32;
+ static constexpr bool supports_nulls = true;
+ static constexpr float na_value = std::numeric_limits<float>::quiet_NaN();
+ typedef typename npy_traits<NPY_FLOAT32>::value_type T;
+template <>
+struct arrow_traits<Type::DOUBLE> {
+ static constexpr int npy_type = NPY_FLOAT64;
+ static constexpr bool supports_nulls = true;
+ static constexpr double na_value = std::numeric_limits<double>::quiet_NaN();
+ typedef typename npy_traits<NPY_FLOAT64>::value_type T;
+template <>
+struct arrow_traits<Type::TIMESTAMP> {
+ static constexpr int npy_type = NPY_DATETIME;
+ static constexpr int64_t npy_shift = 1;  // Multiplier of 1; how npy_shift is applied is decided by code outside this view.
+ static constexpr bool supports_nulls = true;
+ static constexpr int64_t na_value = kPandasTimestampNull;
+ typedef typename npy_traits<NPY_DATETIME>::value_type T;
+template <>
+struct arrow_traits<Type::DURATION> {
+ static constexpr int npy_type = NPY_TIMEDELTA;
+ static constexpr int64_t npy_shift = 1;
+ static constexpr bool supports_nulls = true;
+ static constexpr int64_t na_value = kPandasTimestampNull;
+ typedef typename npy_traits<NPY_TIMEDELTA>::value_type T;
+template <>
+struct arrow_traits<Type::DATE32> {
+ // Data is stored in the FR_D (day) unit
+ static constexpr int npy_type = NPY_DATETIME;
+ static constexpr int64_t npy_shift = 1;
+ static constexpr bool supports_nulls = true;
+ typedef typename npy_traits<NPY_DATETIME>::value_type T;
+ static constexpr int64_t na_value = kPandasTimestampNull;
+ static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); }
+template <>
+struct arrow_traits<Type::DATE64> {
+ // NOTE(review): the original comment here said "FR_D day unit" (same as DATE32), but the shift below is milliseconds per day -- confirm the unit
+ static constexpr int npy_type = NPY_DATETIME;
+ // There are 1000 * 60 * 60 * 24 = 86400000ms in a day
+ static constexpr int64_t npy_shift = 86400000;
+ static constexpr bool supports_nulls = true;
+ typedef typename npy_traits<NPY_DATETIME>::value_type T;
+ static constexpr int64_t na_value = kPandasTimestampNull;
+ static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); }
+template <>
+struct arrow_traits<Type::TIME32> {
+ static constexpr int npy_type = NPY_OBJECT;  // Time values map to NumPy object arrays, not datetime64.
+ static constexpr bool supports_nulls = true;
+ static constexpr int64_t na_value = kPandasTimestampNull;
+ typedef typename npy_traits<NPY_DATETIME>::value_type T;
+template <>
+struct arrow_traits<Type::TIME64> {
+ static constexpr int npy_type = NPY_OBJECT;
+ static constexpr bool supports_nulls = true;
+ typedef typename npy_traits<NPY_DATETIME>::value_type T;
+template <>
+struct arrow_traits<Type::STRING> {
+ static constexpr int npy_type = NPY_OBJECT;  // Strings and binaries map to NumPy object arrays; no element type T is declared.
+ static constexpr bool supports_nulls = true;
+template <>
+struct arrow_traits<Type::BINARY> {
+ static constexpr int npy_type = NPY_OBJECT;
+ static constexpr bool supports_nulls = true;
+static inline NPY_DATETIMEUNIT NumPyFrequency(TimeUnit::type unit) {  // Map an Arrow time unit to the NumPy datetime unit constant.
+ switch (unit) {
+ case TimestampType::Unit::SECOND:
+ return NPY_FR_s;
+ case TimestampType::Unit::MILLI:
+ return NPY_FR_ms;
+ break;  // Unreachable: the return above always leaves the function.
+ case TimestampType::Unit::MICRO:
+ return NPY_FR_us;
+ default:
+ // NANO (any value not matched above is also treated as nanoseconds)
+ return NPY_FR_ns;
+ }
+static inline int NumPyTypeSize(int npy_type) {  // Return the element size in bytes for a NumPy type number.
+ npy_type = fix_numpy_type_num(npy_type);  // Normalizes the type number before the switch; the helper is defined outside this view.
+ switch (npy_type) {
+ case NPY_BOOL:
+ case NPY_INT8:
+ case NPY_UINT8:
+ return 1;
+ case NPY_INT16:
+ case NPY_UINT16:
+ return 2;
+ case NPY_INT32:
+ case NPY_UINT32:
+ return 4;
+ case NPY_INT64:
+ case NPY_UINT64:
+ return 8;
+ case NPY_FLOAT16:
+ return 2;
+ case NPY_FLOAT32:
+ return 4;
+ case NPY_FLOAT64:
+ return 8;
+ return 8;  // NOTE(review): unreachable as shown; a case label (possibly for a datetime type) may be missing from this rendering -- confirm.
+ case NPY_OBJECT:
+ return sizeof(void*);  // Object arrays hold pointers.
+ default:
+ ARROW_CHECK(false) << "unhandled numpy type";
+ break;
+ }
+ return -1;  // Reached only from the default branch, if ARROW_CHECK returns.
+} // namespace internal
+} // namespace py
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/python/visibility.h b/contrib/libs/apache/arrow/cpp/src/arrow/python/visibility.h
new file mode 100644
index 0000000000..c0b343c70e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/python/visibility.h
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#if defined(_WIN32) || defined(__CYGWIN__) // Windows: choose the symbol-export attribute for the arrow_python library
+#if defined(_MSC_VER)
+#pragma warning(disable : 4251)
+#pragma GCC diagnostic ignored "-Wattributes"  // NOTE(review): the #else/#endif lines that separate the MSVC and GCC pragmas, and the two #define lines below, are not visible in this rendering -- confirm against the real file.
+#define ARROW_PYTHON_EXPORT __declspec(dllexport)
+#define ARROW_PYTHON_EXPORT __declspec(dllimport)  // As shown this would redefine the macro; the condition selecting export vs import is not visible here.
+#else // Not Windows
+#define ARROW_PYTHON_EXPORT __attribute__((visibility("default")))
+#endif // Non-Windows
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/converter.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/converter.h
new file mode 100644
index 0000000000..0b29e0f5bc
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/converter.h
@@ -0,0 +1,411 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include <string>
+#include <utility>
+#include <vector>
+#include "arrow/array.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+namespace arrow {
+namespace internal {
+// Forward declaration of the converter factory. The definition appears further
+// down in this header; declaring it here lets the nested (list/struct)
+// converters call it from their Init() methods.
+template <typename BaseConverter, template <typename...> class ConverterTrait>
+static Result<std::unique_ptr<BaseConverter>> MakeConverter(
+    std::shared_ptr<DataType> type,
+    typename BaseConverter::OptionsType options,
+    MemoryPool* pool);
+// Base class of all converters: it owns an ArrayBuilder and exposes hooks for
+// feeding values of type `Input` into it.
+//
+// Subclasses override Init() to create `builder_` and override Append() /
+// Extend() / ExtendMasked() to do the actual conversion; the defaults here
+// return Status::NotImplemented.
+template <typename Input, typename Options>
+class Converter {
+ public:
+  using Self = Converter<Input, Options>;
+  using InputType = Input;
+  using OptionsType = Options;
+  virtual ~Converter() = default;
+  // Stores `type` and `options`, then lets the subclass set itself up through
+  // Init(pool). Returns whatever Init() returns.
+  Status Construct(std::shared_ptr<DataType> type, OptionsType options,
+                   MemoryPool* pool) {
+    type_ = std::move(type);
+    options_ = std::move(options);
+    return Init(pool);
+  }
+  // Conversion entry points; not implemented at this level.
+  virtual Status Append(InputType value) { return Status::NotImplemented("Append"); }
+  virtual Status Extend(InputType values, int64_t size, int64_t offset = 0) {
+    return Status::NotImplemented("Extend");
+  }
+  virtual Status ExtendMasked(InputType values, InputType mask, int64_t size,
+                              int64_t offset = 0) {
+    return Status::NotImplemented("ExtendMasked");
+  }
+  // Accessors for the state set up by Construct() / Init().
+  const std::shared_ptr<ArrayBuilder>& builder() const { return builder_; }
+  const std::shared_ptr<DataType>& type() const { return type_; }
+  OptionsType options() const { return options_; }
+  // Flags set by subclasses in Init(); Chunker reads rewind_on_overflow() to
+  // decide whether to step back one element after a capacity error.
+  bool may_overflow() const { return may_overflow_; }
+  bool rewind_on_overflow() const { return rewind_on_overflow_; }
+  // Forwards the reservation to the underlying builder.
+  virtual Status Reserve(int64_t additional_capacity) {
+    return builder_->Reserve(additional_capacity);
+  }
+  Status AppendNull() { return builder_->AppendNull(); }
+  // Finishes the builder and returns the resulting array.
+  virtual Result<std::shared_ptr<Array>> ToArray() { return builder_->Finish(); }
+  // Same as ToArray(), but returns Slice(0, length) of the finished array.
+  virtual Result<std::shared_ptr<Array>> ToArray(int64_t length) {
+    ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray());
+    return arr->Slice(0, length);
+  }
+  // Wraps the finished array into a single-chunk ChunkedArray.
+  virtual Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() {
+    ARROW_ASSIGN_OR_RAISE(auto array, ToArray());
+    std::vector<std::shared_ptr<Array>> chunks = {std::move(array)};
+    // `chunks` is not used afterwards, so hand it over instead of copying it.
+    return std::make_shared<ChunkedArray>(std::move(chunks));
+  }
+
+ protected:
+  // Subclass hook invoked by Construct(); the default does nothing.
+  virtual Status Init(MemoryPool* pool) { return Status::OK(); }
+  std::shared_ptr<DataType> type_;
+  std::shared_ptr<ArrayBuilder> builder_;
+  OptionsType options_;
+  bool may_overflow_ = false;
+  bool rewind_on_overflow_ = false;
+};
+// Converter base for types whose builder is TypeTraits<ArrowType>::BuilderType
+// and is constructed directly from (type, pool).
+template <typename ArrowType, typename BaseConverter>
+class PrimitiveConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
+
+ protected:
+  // Creates the builder and caches typed views of the type and the builder.
+  Status Init(MemoryPool* pool) override {
+    this->builder_ = std::make_shared<BuilderType>(this->type_, pool);
+    // Narrow variable-sized binary types may overflow
+    this->may_overflow_ = is_binary_like(this->type_->id());
+    primitive_type_ = checked_cast<const ArrowType*>(this->type_.get());
+    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+  // Non-owning; both point into type_ / builder_ and are set by Init().
+  const ArrowType* primitive_type_ = NULLPTR;
+  BuilderType* primitive_builder_ = NULLPTR;
+};
+// Converter base for list-like types: it owns a child converter for the value
+// type and a list builder layered on top of the child's builder.
+template <typename ArrowType, typename BaseConverter,
+          template <typename...> class ConverterTrait>
+class ListConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
+  using ConverterType = typename ConverterTrait<ArrowType>::type;
+
+ protected:
+  // Builds the value converter first, because the list builder is constructed
+  // from the value converter's builder.
+  Status Init(MemoryPool* pool) override {
+    list_type_ = checked_cast<const ArrowType*>(this->type_.get());
+    ARROW_ASSIGN_OR_RAISE(value_converter_,
+                          (MakeConverter<BaseConverter, ConverterTrait>(
+                              list_type_->value_type(), this->options_, pool)));
+    this->builder_ =
+        std::make_shared<BuilderType>(pool, value_converter_->builder(), this->type_);
+    list_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    // Narrow list types may overflow
+    this->may_overflow_ = this->rewind_on_overflow_ =
+        sizeof(typename ArrowType::offset_type) < sizeof(int64_t);
+    return Status::OK();
+  }
+  // Non-owning; both point into type_ / builder_ and are set by Init().
+  const ArrowType* list_type_ = NULLPTR;
+  BuilderType* list_builder_ = NULLPTR;
+  std::unique_ptr<BaseConverter> value_converter_;
+};
+// Converter base for struct types: it owns one child converter per field and
+// a StructBuilder assembled from the children's builders.
+template <typename BaseConverter, template <typename...> class ConverterTrait>
+class StructConverter : public BaseConverter {
+ public:
+  using ConverterType = typename ConverterTrait<StructType>::type;
+  // Reserves in the struct builder and then in every child converter.
+  Status Reserve(int64_t additional_capacity) override {
+    ARROW_RETURN_NOT_OK(this->builder_->Reserve(additional_capacity));
+    for (const auto& child : children_) {
+      ARROW_RETURN_NOT_OK(child->Reserve(additional_capacity));
+    }
+    return Status::OK();
+  }
+
+ protected:
+  // Creates a converter for each field, collects the child builders and
+  // builds the StructBuilder from them.
+  Status Init(MemoryPool* pool) override {
+    std::vector<std::shared_ptr<ArrayBuilder>> child_builders;
+    struct_type_ = checked_cast<const StructType*>(this->type_.get());
+    for (const auto& field : struct_type_->fields()) {
+      ARROW_ASSIGN_OR_RAISE(auto child_converter,
+                            (MakeConverter<BaseConverter, ConverterTrait>(
+                                field->type(), this->options_, pool)));
+      // The struct may overflow as soon as any of its children may.
+      this->may_overflow_ |= child_converter->may_overflow();
+      child_builders.push_back(child_converter->builder());
+      children_.push_back(std::move(child_converter));
+    }
+    // rewind_on_overflow_ is only written inside the field loop upstream, so
+    // it must stay untouched for a struct without fields.
+    if (!struct_type_->fields().empty()) {
+      this->rewind_on_overflow_ = this->may_overflow_;
+    }
+    this->builder_ =
+        std::make_shared<StructBuilder>(this->type_, pool, std::move(child_builders));
+    struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get());
+    return Status::OK();
+  }
+  // Non-owning; both point into type_ / builder_ and are set by Init().
+  const StructType* struct_type_ = NULLPTR;
+  StructBuilder* struct_builder_ = NULLPTR;
+  std::vector<std::unique_ptr<BaseConverter>> children_;
+};
+// Converter base for dictionary types whose values have type `ValueType`; the
+// builder is obtained from MakeDictionaryBuilder().
+template <typename ValueType, typename BaseConverter>
+class DictionaryConverter : public BaseConverter {
+ public:
+  using BuilderType = DictionaryBuilder<ValueType>;
+
+ protected:
+  // Creates the dictionary builder and caches typed views of the dictionary
+  // type, its value type and the builder.
+  Status Init(MemoryPool* pool) override {
+    std::unique_ptr<ArrayBuilder> builder;
+    ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, this->type_, NULLPTR, &builder));
+    this->builder_ = std::move(builder);
+    this->may_overflow_ = false;
+    dict_type_ = checked_cast<const DictionaryType*>(this->type_.get());
+    value_type_ = checked_cast<const ValueType*>(dict_type_->value_type().get());
+    value_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+  // Non-owning; set by Init() and pointing into type_ / builder_.
+  const DictionaryType* dict_type_ = NULLPTR;
+  const ValueType* value_type_ = NULLPTR;
+  BuilderType* value_builder_ = NULLPTR;
+};
+template <typename BaseConverter, template <typename...> class ConverterTrait>
+struct MakeConverterImpl {
+ template <typename T, typename ConverterType = typename ConverterTrait<T>::type>
+ Status Visit(const T&) {
+ out.reset(new ConverterType());
+ return out->Construct(std::move(type), std::move(options), pool);
+ }
+ Status Visit(const DictionaryType& t) {
+ switch (t.value_type()->id()) {
+ case TYPE::type_id: \
+ out = internal::make_unique< \
+ typename ConverterTrait<DictionaryType>::template dictionary_type<TYPE>>(); \
+ break;
+ DICTIONARY_CASE(FixedSizeBinaryType);
+ default:
+ return Status::NotImplemented("DictionaryArray converter for type ", t.ToString(),
+ " not implemented");
+ }
+ return out->Construct(std::move(type), std::move(options), pool);
+ }
+ Status Visit(const DataType& t) { return Status::NotImplemented(t.name()); }
+ std::shared_ptr<DataType> type;
+ typename BaseConverter::OptionsType options;
+ MemoryPool* pool;
+ std::unique_ptr<BaseConverter> out;
+// Creates and constructs the converter that ConverterTrait associates with
+// `type`. Returns the error produced by the type visitor (for example
+// NotImplemented for unsupported types) or by the converter's Construct().
+template <typename BaseConverter, template <typename...> class ConverterTrait>
+static Result<std::unique_ptr<BaseConverter>> MakeConverter(
+    std::shared_ptr<DataType> type, typename BaseConverter::OptionsType options,
+    MemoryPool* pool) {
+  MakeConverterImpl<BaseConverter, ConverterTrait> visitor{
+      std::move(type), std::move(options), pool, NULLPTR};
+  // Dispatch on visitor.type: the `type` parameter was moved into the visitor.
+  ARROW_RETURN_NOT_OK(VisitTypeInline(*visitor.type, &visitor));
+  return std::move(visitor.out);
+}
+// Wraps a converter and splits its output into several chunks: whenever the
+// converter reports a capacity error, the data converted so far is finished
+// as one chunk and the conversion continues in a fresh builder.
+//
+// `length_` counts the elements that belong to the chunk currently being
+// built; `reserved_` is the capacity requested through Reserve() for it.
+template <typename Converter>
+class Chunker {
+ public:
+  using InputType = typename Converter::InputType;
+  explicit Chunker(std::unique_ptr<Converter> converter)
+      : converter_(std::move(converter)) {}
+  // Reserves in the converter and remembers the amount, so that FinishChunk()
+  // can re-reserve the part that has not been consumed yet.
+  Status Reserve(int64_t additional_capacity) {
+    ARROW_RETURN_NOT_OK(converter_->Reserve(additional_capacity));
+    reserved_ += additional_capacity;
+    return Status::OK();
+  }
+  // Appends a null, starting a new chunk if the current one is full.
+  Status AppendNull() {
+    auto status = converter_->AppendNull();
+    if (ARROW_PREDICT_FALSE(status.IsCapacityError())) {
+      if (converter_->builder()->length() == 0) {
+        // Builder length == 0 means the individual element is too large to append.
+        // In this case, no need to try again.
+        return status;
+      }
+      ARROW_RETURN_NOT_OK(FinishChunk());
+      // Retry through this method (as Append() does) so that a successful
+      // retry is counted in length_; otherwise the null would be sliced off
+      // by ToArray(length_) when the chunk is finished.
+      return AppendNull();
+    }
+    ++length_;
+    return status;
+  }
+  // Appends one value, starting a new chunk if the current one is full.
+  Status Append(InputType value) {
+    auto status = converter_->Append(value);
+    if (ARROW_PREDICT_FALSE(status.IsCapacityError())) {
+      if (converter_->builder()->length() == 0) {
+        // The element does not even fit into an empty builder: give up.
+        return status;
+      }
+      ARROW_RETURN_NOT_OK(FinishChunk());
+      return Append(value);
+    }
+    ++length_;
+    return status;
+  }
+  // Converts `values` starting at `offset` until `size` is reached, starting
+  // a new chunk each time the converter reports a capacity error.
+  Status Extend(InputType values, int64_t size, int64_t offset = 0) {
+    return ExtendImpl(
+        [&](int64_t start) { return converter_->Extend(values, size, start); }, size,
+        offset);
+  }
+  // Same as Extend(), but forwards `mask` to the converter's ExtendMasked().
+  Status ExtendMasked(InputType values, InputType mask, int64_t size,
+                      int64_t offset = 0) {
+    return ExtendImpl(
+        [&](int64_t start) {
+          return converter_->ExtendMasked(values, mask, size, start);
+        },
+        size, offset);
+  }
+  // Finishes the current chunk (the first length_ elements of the builder),
+  // stores it and prepares the builder for the next chunk.
+  Status FinishChunk() {
+    ARROW_ASSIGN_OR_RAISE(auto chunk, converter_->ToArray(length_));
+    chunks_.push_back(std::move(chunk));
+    // Reserve space for the remaining items.
+    // Besides being an optimization, it is also required if the converter's
+    // implementation relies on unsafe builder methods in converter->Append().
+    // Clamp at zero: more elements may have been appended than were reserved
+    // (e.g. Reserve() was never called), and a negative reservation would
+    // only corrupt the reserved_ bookkeeping.
+    const int64_t remaining = reserved_ > length_ ? reserved_ - length_ : 0;
+    Reset();
+    return Reserve(remaining);
+  }
+  // Finishes the chunk in progress and returns all chunks produced so far.
+  Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() {
+    ARROW_RETURN_NOT_OK(FinishChunk());
+    return std::make_shared<ChunkedArray>(chunks_);
+  }
+
+ protected:
+  // Shared loop of Extend() and ExtendMasked(). `extend_from(start)` must run
+  // the converter's bulk conversion beginning at element `start` and return
+  // its Status.
+  template <typename ExtendFunc>
+  Status ExtendImpl(ExtendFunc&& extend_from, int64_t size, int64_t offset) {
+    while (offset < size) {
+      const auto length_before = converter_->builder()->length();
+      auto status = extend_from(offset);
+      const auto length_after = converter_->builder()->length();
+      // Progress is measured on the builder because a failing call may have
+      // converted part of the input before returning.
+      const auto num_converted = length_after - length_before;
+      offset += num_converted;
+      length_ += num_converted;
+      if (status.IsCapacityError()) {
+        if (converter_->builder()->length() == 0) {
+          // Builder length == 0 means the individual element is too large to append.
+          // In this case, no need to try again.
+          return status;
+        } else if (converter_->rewind_on_overflow()) {
+          // The list-like and binary-like conversion paths may raise a capacity error,
+          // we need to handle them differently. While the binary-like converters check
+          // the capacity before append/extend the list-like converters just check after
+          // append/extend. Thus depending on the implementation semantics we may need
+          // to rewind (slice) the output chunk by one.
+          length_ -= 1;
+          offset -= 1;
+        }
+        ARROW_RETURN_NOT_OK(FinishChunk());
+      } else if (!status.ok()) {
+        return status;
+      }
+    }
+    return Status::OK();
+  }
+  // Clears the builder and the per-chunk counters.
+  void Reset() {
+    converter_->builder()->Reset();
+    length_ = 0;
+    reserved_ = 0;
+  }
+  int64_t length_ = 0;
+  int64_t reserved_ = 0;
+  std::unique_ptr<Converter> converter_;
+  std::vector<std::shared_ptr<Array>> chunks_;
+};
+// Convenience factory: wraps `converter` into a heap-allocated Chunker<T>.
+template <typename T>
+static Result<std::unique_ptr<Chunker<T>>> MakeChunker(std::unique_ptr<T> converter) {
+  return internal::make_unique<Chunker<T>>(std::move(converter));
+}
+} // namespace internal
+} // namespace arrow