// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "contrib/libs/apache/arrow_next/cpp/src/arrow/tensor/converter.h" #include #include #include #include #include #include "contrib/libs/apache/arrow_next/cpp/src/arrow/buffer.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/buffer_builder.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/result.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/status.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/type.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/checked_cast.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/sort.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/visit_type_inline.h" namespace arrow20 { class MemoryPool; namespace internal { namespace { inline void IncrementIndex(std::vector& coord, const std::vector& shape, const std::vector& axis_order) { const int64_t ndim = shape.size(); const int64_t last_axis = axis_order[ndim - 1]; ++coord[last_axis]; if (coord[last_axis] == shape[last_axis]) { int64_t d = ndim - 1; while (d > 0 && coord[axis_order[d]] == shape[axis_order[d]]) { coord[axis_order[d]] = 0; ++coord[axis_order[d - 1]]; --d; } } } // ---------------------------------------------------------------------- // SparseTensorConverter for SparseCSFIndex class SparseCSFTensorConverter : private SparseTensorConverterMixin { using SparseTensorConverterMixin::AssignIndex; using SparseTensorConverterMixin::IsNonZero; public: SparseCSFTensorConverter(const Tensor& tensor, const std::shared_ptr& index_value_type, MemoryPool* pool) : tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {} Status Convert() { RETURN_NOT_OK(::arrow20::internal::CheckSparseIndexMaximumValue(index_value_type_, tensor_.shape())); const int index_elsize = index_value_type_->byte_width(); const int value_elsize = tensor_.type()->byte_width(); const int64_t ndim = tensor_.ndim(); // Axis order as ascending order of dimension size is a good heuristic but is not // necessarily optimal. std::vector axis_order = internal::ArgSort(tensor_.shape()); ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero()); ARROW_ASSIGN_OR_RAISE(auto values_buffer, AllocateBuffer(value_elsize * nonzero_count, pool_)); auto* values = values_buffer->mutable_data(); std::vector counts(ndim, 0); std::vector coord(ndim, 0); std::vector previous_coord(ndim, -1); std::vector indptr_buffer_builders(ndim - 1); std::vector indices_buffer_builders(ndim); const auto* tensor_data = tensor_.raw_data(); uint8_t index_buffer[sizeof(int64_t)]; if (ndim <= 1) { return Status::NotImplemented("TODO for ndim <= 1"); } else { const auto& shape = tensor_.shape(); for (int64_t n = tensor_.size(); n > 0; n--) { const auto offset = tensor_.CalculateValueOffset(coord); const auto xp = tensor_data + offset; if (std::any_of(xp, xp + value_elsize, IsNonZero)) { bool tree_split = false; std::copy_n(xp, value_elsize, values); values += value_elsize; for (int64_t i = 0; i < ndim; ++i) { int64_t dimension = axis_order[i]; tree_split = tree_split || (coord[dimension] != previous_coord[dimension]); if (tree_split) { if (i < ndim - 1) { AssignIndex(index_buffer, counts[i + 1], index_elsize); RETURN_NOT_OK( indptr_buffer_builders[i].Append(index_buffer, index_elsize)); } AssignIndex(index_buffer, coord[dimension], index_elsize); RETURN_NOT_OK( indices_buffer_builders[i].Append(index_buffer, index_elsize)); ++counts[i]; } } previous_coord = coord; } IncrementIndex(coord, shape, axis_order); } } for (int64_t column = 0; column < ndim - 1; ++column) { AssignIndex(index_buffer, counts[column + 1], index_elsize); RETURN_NOT_OK(indptr_buffer_builders[column].Append(index_buffer, index_elsize)); } // make results data = std::move(values_buffer); std::vector> indptr_buffers(ndim - 1); std::vector> indices_buffers(ndim); std::vector indptr_shapes(counts.begin(), counts.end() - 1); std::vector indices_shapes = counts; for (int64_t column = 0; column < ndim; ++column) { RETURN_NOT_OK( indices_buffer_builders[column].Finish(&indices_buffers[column], true)); } for (int64_t column = 0; column < ndim - 1; ++column) { RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true)); } ARROW_ASSIGN_OR_RAISE( sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order, indptr_buffers, indices_buffers)); return Status::OK(); } std::shared_ptr sparse_index; std::shared_ptr data; private: const Tensor& tensor_; const std::shared_ptr& index_value_type_; MemoryPool* pool_; }; class TensorBuilderFromSparseCSFTensor : private SparseTensorConverterMixin { using SparseTensorConverterMixin::GetIndexValue; MemoryPool* pool_; const SparseCSFTensor* sparse_tensor_; const SparseCSFIndex* sparse_index_; const std::vector>& indptr_; const std::vector>& indices_; const std::vector& axis_order_; const std::vector& shape_; const int64_t non_zero_length_; const int ndim_; const int64_t tensor_size_; const FixedWidthType& value_type_; const int value_elsize_; const uint8_t* raw_data_; std::vector strides_; std::shared_ptr values_buffer_; uint8_t* values_; public: TensorBuilderFromSparseCSFTensor(const SparseCSFTensor* sparse_tensor, MemoryPool* pool) : pool_(pool), sparse_tensor_(sparse_tensor), sparse_index_( checked_cast(sparse_tensor->sparse_index().get())), indptr_(sparse_index_->indptr()), indices_(sparse_index_->indices()), axis_order_(sparse_index_->axis_order()), shape_(sparse_tensor->shape()), non_zero_length_(sparse_tensor->non_zero_length()), ndim_(sparse_tensor->ndim()), tensor_size_(sparse_tensor->size()), value_type_(checked_cast(*sparse_tensor->type())), value_elsize_(value_type_.byte_width()), raw_data_(sparse_tensor->raw_data()) {} int ElementSize(const std::shared_ptr& tensor) const { return tensor->type()->byte_width(); } Result> Build() { RETURN_NOT_OK(internal::ComputeRowMajorStrides(value_type_, shape_, &strides_)); ARROW_ASSIGN_OR_RAISE(values_buffer_, AllocateBuffer(value_elsize_ * tensor_size_, pool_)); values_ = values_buffer_->mutable_data(); std::fill_n(values_, value_elsize_ * tensor_size_, 0); const int64_t start = 0; const int64_t stop = indptr_[0]->size() - 1; ExpandValues(0, 0, start, stop); return std::make_shared(sparse_tensor_->type(), std::move(values_buffer_), shape_, strides_, sparse_tensor_->dim_names()); } void ExpandValues(const int64_t dim, const int64_t dim_offset, const int64_t start, const int64_t stop) { const auto& cur_indices = indices_[dim]; const int indices_elsize = ElementSize(cur_indices); const auto* indices_data = cur_indices->raw_data() + start * indices_elsize; if (dim == ndim_ - 1) { for (auto i = start; i < stop; ++i) { const int64_t index = SparseTensorConverterMixin::GetIndexValue(indices_data, indices_elsize); const int64_t offset = dim_offset + index * strides_[axis_order_[dim]]; std::copy_n(raw_data_ + i * value_elsize_, value_elsize_, values_ + offset); indices_data += indices_elsize; } } else { const auto& cur_indptr = indptr_[dim]; const int indptr_elsize = ElementSize(cur_indptr); const auto* indptr_data = cur_indptr->raw_data() + start * indptr_elsize; for (int64_t i = start; i < stop; ++i) { const int64_t index = SparseTensorConverterMixin::GetIndexValue(indices_data, indices_elsize); const int64_t offset = dim_offset + index * strides_[axis_order_[dim]]; const int64_t next_start = GetIndexValue(indptr_data, indptr_elsize); const int64_t next_stop = GetIndexValue(indptr_data + indptr_elsize, indptr_elsize); ExpandValues(dim + 1, offset, next_start, next_stop); indices_data += indices_elsize; indptr_data += indptr_elsize; } } } }; } // namespace Status MakeSparseCSFTensorFromTensor(const Tensor& tensor, const std::shared_ptr& index_value_type, MemoryPool* pool, std::shared_ptr* out_sparse_index, std::shared_ptr* out_data) { SparseCSFTensorConverter converter(tensor, index_value_type, pool); RETURN_NOT_OK(converter.Convert()); *out_sparse_index = checked_pointer_cast(converter.sparse_index); *out_data = converter.data; return Status::OK(); } Result> MakeTensorFromSparseCSFTensor( MemoryPool* pool, const SparseCSFTensor* sparse_tensor) { TensorBuilderFromSparseCSFTensor builder(sparse_tensor, pool); return builder.Build(); } } // namespace internal } // namespace arrow20