// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "contrib/libs/apache/arrow_next/cpp/src/arrow/tensor/converter_internal.h" #include #include #include #include #include #include "contrib/libs/apache/arrow_next/cpp/src/arrow/buffer.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/status.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/type.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/checked_cast.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/macros.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/visit_type_inline.h" namespace arrow20 { class MemoryPool; namespace internal { namespace { template inline void IncrementRowMajorIndex(std::vector& coord, const std::vector& shape) { const int64_t ndim = shape.size(); ++coord[ndim - 1]; if (coord[ndim - 1] == shape[ndim - 1]) { int64_t d = ndim - 1; while (d > 0 && coord[d] == shape[d]) { coord[d] = 0; ++coord[d - 1]; --d; } } } template void ConvertRowMajorTensor(const Tensor& tensor, c_index_type* indices, c_value_type* values, const int64_t size) { const auto ndim = tensor.ndim(); const auto& shape = tensor.shape(); const c_value_type* tensor_data = reinterpret_cast(tensor.raw_data()); constexpr c_value_type zero = 0; std::vector coord(ndim, 0); for (int64_t n = tensor.size(); n > 0; --n) { const c_value_type x = *tensor_data; if (ARROW_PREDICT_FALSE(x != zero)) { std::copy(coord.begin(), coord.end(), indices); *values++ = x; indices += ndim; } IncrementRowMajorIndex(coord, shape); ++tensor_data; } } template void ConvertColumnMajorTensor(const Tensor& tensor, c_index_type* out_indices, c_value_type* out_values, const int64_t size) { const auto ndim = tensor.ndim(); std::vector indices(ndim * size); std::vector values(size); ConvertRowMajorTensor(tensor, indices.data(), values.data(), size); // transpose indices for (int64_t i = 0; i < size; ++i) { for (int j = 0; j < ndim / 2; ++j) { std::swap(indices[i * ndim + j], indices[i * ndim + ndim - j - 1]); } } // sort indices std::vector order(size); std::iota(order.begin(), order.end(), 0); std::sort(order.begin(), order.end(), [&](const int64_t xi, const int64_t yi) { const int64_t x_offset = xi * ndim; const int64_t y_offset = yi * ndim; for (int j = 0; j < ndim; ++j) { const auto x = indices[x_offset + j]; const auto y = indices[y_offset + j]; if (x < y) return true; if (x > y) return false; } return false; }); // transfer result const auto* indices_data = indices.data(); for (int64_t i = 0; i < size; ++i) { out_values[i] = values[i]; std::copy_n(indices_data, ndim, out_indices); indices_data += ndim; out_indices += ndim; } } template void ConvertStridedTensor(const Tensor& tensor, c_index_type* indices, c_value_type* values, const int64_t size) { using ValueType = typename CTypeTraits::ArrowType; const auto& shape = tensor.shape(); const auto ndim = tensor.ndim(); std::vector coord(ndim, 0); constexpr c_value_type zero = 0; c_value_type x; int64_t i; for (int64_t n = tensor.size(); n > 0; --n) { x = tensor.Value(coord); if (ARROW_PREDICT_FALSE(x != zero)) { *values++ = x; for (i = 0; i < ndim; ++i) { *indices++ = static_cast(coord[i]); } } IncrementRowMajorIndex(coord, shape); } } #define CONVERT_TENSOR(func, index_type, value_type, indices, values, size) \ func(tensor_, reinterpret_cast(indices), \ reinterpret_cast(values), size) // Using ARROW_EXPAND is necessary to expand __VA_ARGS__ correctly on VC++. #define CONVERT_ROW_MAJOR_TENSOR(index_type, value_type, ...) \ ARROW_EXPAND(CONVERT_TENSOR(ConvertRowMajorTensor, index_type, value_type, __VA_ARGS__)) #define CONVERT_COLUMN_MAJOR_TENSOR(index_type, value_type, ...) \ ARROW_EXPAND( \ CONVERT_TENSOR(ConvertColumnMajorTensor, index_type, value_type, __VA_ARGS__)) #define CONVERT_STRIDED_TENSOR(index_type, value_type, ...) \ ARROW_EXPAND(CONVERT_TENSOR(ConvertStridedTensor, index_type, value_type, __VA_ARGS__)) // ---------------------------------------------------------------------- // SparseTensorConverter for SparseCOOIndex class SparseCOOTensorConverter : private SparseTensorConverterMixin { using SparseTensorConverterMixin::AssignIndex; using SparseTensorConverterMixin::IsNonZero; public: SparseCOOTensorConverter(const Tensor& tensor, const std::shared_ptr& index_value_type, MemoryPool* pool) : tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {} Status Convert() { RETURN_NOT_OK(::arrow20::internal::CheckSparseIndexMaximumValue(index_value_type_, tensor_.shape())); const int index_elsize = index_value_type_->byte_width(); const int value_elsize = tensor_.type()->byte_width(); const int64_t ndim = tensor_.ndim(); ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero()); ARROW_ASSIGN_OR_RAISE(auto indices_buffer, AllocateBuffer(index_elsize * ndim * nonzero_count, pool_)); uint8_t* indices = indices_buffer->mutable_data(); ARROW_ASSIGN_OR_RAISE(auto values_buffer, AllocateBuffer(value_elsize * nonzero_count, pool_)); uint8_t* values = values_buffer->mutable_data(); const uint8_t* tensor_data = tensor_.raw_data(); if (ndim <= 1) { const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0]; for (int64_t i = 0; i < count; ++i) { if (std::any_of(tensor_data, tensor_data + value_elsize, IsNonZero)) { AssignIndex(indices, i, index_elsize); std::copy_n(tensor_data, value_elsize, values); indices += index_elsize; values += value_elsize; } tensor_data += value_elsize; } } else if (tensor_.is_row_major()) { DISPATCH(CONVERT_ROW_MAJOR_TENSOR, index_elsize, value_elsize, indices, values, nonzero_count); } else if (tensor_.is_column_major()) { DISPATCH(CONVERT_COLUMN_MAJOR_TENSOR, index_elsize, value_elsize, indices, values, nonzero_count); } else { DISPATCH(CONVERT_STRIDED_TENSOR, index_elsize, value_elsize, indices, values, nonzero_count); } // make results const std::vector indices_shape = {nonzero_count, ndim}; std::vector indices_strides; RETURN_NOT_OK(internal::ComputeRowMajorStrides( checked_cast(*index_value_type_), indices_shape, &indices_strides)); auto coords = std::make_shared(index_value_type_, std::move(indices_buffer), indices_shape, indices_strides); ARROW_ASSIGN_OR_RAISE(sparse_index, SparseCOOIndex::Make(coords, true)); data = std::move(values_buffer); return Status::OK(); } std::shared_ptr sparse_index; std::shared_ptr data; private: const Tensor& tensor_; const std::shared_ptr& index_value_type_; MemoryPool* pool_; }; } // namespace void SparseTensorConverterMixin::AssignIndex(uint8_t* indices, int64_t val, const int elsize) { switch (elsize) { case 1: *indices = static_cast(val); break; case 2: *reinterpret_cast(indices) = static_cast(val); break; case 4: *reinterpret_cast(indices) = static_cast(val); break; case 8: *reinterpret_cast(indices) = val; break; default: break; } } int64_t SparseTensorConverterMixin::GetIndexValue(const uint8_t* value_ptr, const int elsize) { switch (elsize) { case 1: return *value_ptr; case 2: return *reinterpret_cast(value_ptr); case 4: return *reinterpret_cast(value_ptr); case 8: return *reinterpret_cast(value_ptr); default: return 0; } } Status MakeSparseCOOTensorFromTensor(const Tensor& tensor, const std::shared_ptr& index_value_type, MemoryPool* pool, std::shared_ptr* out_sparse_index, std::shared_ptr* out_data) { SparseCOOTensorConverter converter(tensor, index_value_type, pool); RETURN_NOT_OK(converter.Convert()); *out_sparse_index = checked_pointer_cast(converter.sparse_index); *out_data = converter.data; return Status::OK(); } Result> MakeTensorFromSparseCOOTensor( MemoryPool* pool, const SparseCOOTensor* sparse_tensor) { const auto& sparse_index = checked_cast(*sparse_tensor->sparse_index()); const auto& coords = sparse_index.indices(); const auto* coords_data = coords->raw_data(); const int index_elsize = coords->type()->byte_width(); const auto& value_type = checked_cast(*sparse_tensor->type()); const int value_elsize = value_type.byte_width(); ARROW_ASSIGN_OR_RAISE(auto values_buffer, AllocateBuffer(value_elsize * sparse_tensor->size(), pool)); auto values = values_buffer->mutable_data(); std::fill_n(values, value_elsize * sparse_tensor->size(), 0); std::vector strides; RETURN_NOT_OK(ComputeRowMajorStrides(value_type, sparse_tensor->shape(), &strides)); const auto* raw_data = sparse_tensor->raw_data(); const int ndim = sparse_tensor->ndim(); for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) { int64_t offset = 0; for (int j = 0; j < ndim; ++j) { auto index = static_cast( SparseTensorConverterMixin::GetIndexValue(coords_data, index_elsize)); offset += index * strides[j]; coords_data += index_elsize; } std::copy_n(raw_data, value_elsize, values + offset); raw_data += value_elsize; } return std::make_shared(sparse_tensor->type(), std::move(values_buffer), sparse_tensor->shape(), strides, sparse_tensor->dim_names()); } } // namespace internal } // namespace arrow20