// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "contrib/libs/apache/arrow_next/cpp/src/arrow/sparse_tensor.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/tensor/converter.h" #include #include #include #include #include "contrib/libs/apache/arrow_next/cpp/src/arrow/compare.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/type_traits.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/checked_cast.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/logging.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/visit_type_inline.h" namespace arrow20 { class MemoryPool; // ---------------------------------------------------------------------- // SparseIndex Status SparseIndex::ValidateShape(const std::vector& shape) const { if (!std::all_of(shape.begin(), shape.end(), [](int64_t x) { return x >= 0; })) { return Status::Invalid("Shape elements must be positive"); } return Status::OK(); } namespace internal { namespace { template Status CheckSparseIndexMaximumValue(const std::vector& shape) { using c_index_value_type = typename IndexValueType::c_type; constexpr int64_t type_max = static_cast(std::numeric_limits::max()); auto greater_than_type_max = [&](int64_t x) { return x > type_max; }; if (std::any_of(shape.begin(), shape.end(), greater_than_type_max)) { return Status::Invalid("The bit width of the index value type is too small"); } return Status::OK(); } template <> Status CheckSparseIndexMaximumValue(const std::vector& shape) { return Status::OK(); } template <> Status CheckSparseIndexMaximumValue(const std::vector& shape) { return Status::Invalid("UInt64Type cannot be used as IndexValueType of SparseIndex"); } } // namespace #define CALL_CHECK_MAXIMUM_VALUE(TYPE_CLASS) \ case TYPE_CLASS##Type::type_id: \ return CheckSparseIndexMaximumValue(shape); Status CheckSparseIndexMaximumValue(const std::shared_ptr& index_value_type, const std::vector& shape) { switch (index_value_type->id()) { ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_CHECK_MAXIMUM_VALUE); default: return Status::TypeError("Unsupported SparseTensor index value type"); } } #undef CALL_CHECK_MAXIMUM_VALUE Status MakeSparseTensorFromTensor(const Tensor& tensor, SparseTensorFormat::type sparse_format_id, const std::shared_ptr& index_value_type, MemoryPool* pool, std::shared_ptr* out_sparse_index, std::shared_ptr* out_data) { switch (sparse_format_id) { case SparseTensorFormat::COO: return MakeSparseCOOTensorFromTensor(tensor, index_value_type, pool, out_sparse_index, out_data); case SparseTensorFormat::CSR: return MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis::ROW, tensor, index_value_type, pool, out_sparse_index, out_data); case SparseTensorFormat::CSC: return MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis::COLUMN, tensor, index_value_type, pool, out_sparse_index, out_data); case SparseTensorFormat::CSF: return MakeSparseCSFTensorFromTensor(tensor, index_value_type, pool, out_sparse_index, out_data); // LCOV_EXCL_START: ignore program failure default: return Status::Invalid("Invalid sparse tensor format"); // LCOV_EXCL_STOP } } } // namespace internal // ---------------------------------------------------------------------- // SparseCOOIndex namespace { inline Status CheckSparseCOOIndexValidity(const std::shared_ptr& type, const std::vector& shape, const std::vector& strides) { if (!is_integer(type->id())) { return Status::TypeError("Type of SparseCOOIndex indices must be integer"); } if (shape.size() != 2) { return Status::Invalid("SparseCOOIndex indices must be a matrix"); } RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(type, shape)); if (!internal::IsTensorStridesContiguous(type, shape, strides)) { return Status::Invalid("SparseCOOIndex indices must be contiguous"); } return Status::OK(); } void GetCOOIndexTensorRow(const std::shared_ptr& coords, const int64_t row, std::vector* out_index) { const auto& fw_index_value_type = internal::checked_cast(*coords->type()); const size_t indices_elsize = fw_index_value_type.bit_width() / CHAR_BIT; const auto& shape = coords->shape(); const int64_t non_zero_length = shape[0]; DCHECK(0 <= row && row < non_zero_length); const int64_t ndim = shape[1]; out_index->resize(ndim); switch (indices_elsize) { case 1: // Int8, UInt8 for (int64_t i = 0; i < ndim; ++i) { (*out_index)[i] = static_cast(coords->Value({row, i})); } break; case 2: // Int16, UInt16 for (int64_t i = 0; i < ndim; ++i) { (*out_index)[i] = static_cast(coords->Value({row, i})); } break; case 4: // Int32, UInt32 for (int64_t i = 0; i < ndim; ++i) { (*out_index)[i] = static_cast(coords->Value({row, i})); } break; case 8: // Int64 for (int64_t i = 0; i < ndim; ++i) { (*out_index)[i] = coords->Value({row, i}); } break; default: DCHECK(false) << "Must not reach here"; break; } } bool DetectSparseCOOIndexCanonicality(const std::shared_ptr& coords) { DCHECK_EQ(coords->ndim(), 2); const auto& shape = coords->shape(); const int64_t non_zero_length = shape[0]; if (non_zero_length <= 1) return true; const int64_t ndim = shape[1]; std::vector last_index, index; GetCOOIndexTensorRow(coords, 0, &last_index); for (int64_t i = 1; i < non_zero_length; ++i) { GetCOOIndexTensorRow(coords, i, &index); int64_t j = 0; while (j < ndim) { if (last_index[j] > index[j]) { // last_index > index, so we can detect non-canonical here return false; } if (last_index[j] < index[j]) { // last_index < index, so we can skip the remaining dimensions break; } ++j; } if (j == ndim) { // last_index == index, so we can detect non-canonical here return false; } swap(last_index, index); } return true; } } // namespace Result> SparseCOOIndex::Make( const std::shared_ptr& coords, bool is_canonical) { RETURN_NOT_OK( CheckSparseCOOIndexValidity(coords->type(), coords->shape(), coords->strides())); return std::make_shared(coords, is_canonical); } Result> SparseCOOIndex::Make( const std::shared_ptr& coords) { RETURN_NOT_OK( CheckSparseCOOIndexValidity(coords->type(), coords->shape(), coords->strides())); auto is_canonical = DetectSparseCOOIndexCanonicality(coords); return std::make_shared(coords, is_canonical); } Result> SparseCOOIndex::Make( const std::shared_ptr& indices_type, const std::vector& indices_shape, const std::vector& indices_strides, std::shared_ptr indices_data, bool is_canonical) { RETURN_NOT_OK( CheckSparseCOOIndexValidity(indices_type, indices_shape, indices_strides)); return std::make_shared( std::make_shared(indices_type, indices_data, indices_shape, indices_strides), is_canonical); } Result> SparseCOOIndex::Make( const std::shared_ptr& indices_type, const std::vector& indices_shape, const std::vector& indices_strides, std::shared_ptr indices_data) { RETURN_NOT_OK( CheckSparseCOOIndexValidity(indices_type, indices_shape, indices_strides)); auto coords = std::make_shared(indices_type, indices_data, indices_shape, indices_strides); auto is_canonical = DetectSparseCOOIndexCanonicality(coords); return std::make_shared(coords, is_canonical); } Result> SparseCOOIndex::Make( const std::shared_ptr& indices_type, const std::vector& shape, int64_t non_zero_length, std::shared_ptr indices_data, bool is_canonical) { auto ndim = static_cast(shape.size()); if (!is_integer(indices_type->id())) { return Status::TypeError("Type of SparseCOOIndex indices must be integer"); } const int64_t elsize = internal::checked_cast(*indices_type).bit_width() / 8; std::vector indices_shape({non_zero_length, ndim}); std::vector indices_strides({elsize * ndim, elsize}); return Make(indices_type, indices_shape, indices_strides, indices_data, is_canonical); } Result> SparseCOOIndex::Make( const std::shared_ptr& indices_type, const std::vector& shape, int64_t non_zero_length, std::shared_ptr indices_data) { auto ndim = static_cast(shape.size()); if (!is_integer(indices_type->id())) { return Status::TypeError("Type of SparseCOOIndex indices must be integer"); } const int64_t elsize = indices_type->byte_width(); std::vector indices_shape({non_zero_length, ndim}); std::vector indices_strides({elsize * ndim, elsize}); return Make(indices_type, indices_shape, indices_strides, indices_data); } // Constructor with a contiguous NumericTensor SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords, bool is_canonical) : SparseIndexBase(), coords_(coords), is_canonical_(is_canonical) { ARROW_CHECK_OK( CheckSparseCOOIndexValidity(coords_->type(), coords_->shape(), coords_->strides())); } std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); } // ---------------------------------------------------------------------- // SparseCSXIndex namespace internal { Status ValidateSparseCSXIndex(const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const std::vector& indptr_shape, const std::vector& indices_shape, char const* type_name) { if (!is_integer(indptr_type->id())) { return Status::TypeError("Type of ", type_name, " indptr must be integer"); } if (indptr_shape.size() != 1) { return Status::Invalid(type_name, " indptr must be a vector"); } if (!is_integer(indices_type->id())) { return Status::Invalid("Type of ", type_name, " indices must be integer"); } if (indices_shape.size() != 1) { return Status::Invalid(type_name, " indices must be a vector"); } RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indptr_type, indptr_shape)); RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indices_type, indices_shape)); return Status::OK(); } void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const std::vector& indptr_shape, const std::vector& indices_shape, char const* type_name) { ARROW_CHECK_OK(ValidateSparseCSXIndex(indptr_type, indices_type, indptr_shape, indices_shape, type_name)); } } // namespace internal // ---------------------------------------------------------------------- // SparseCSFIndex namespace { inline Status CheckSparseCSFIndexValidity(const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const int64_t num_indptrs, const int64_t num_indices, const int64_t axis_order_size) { if (!is_integer(indptr_type->id())) { return Status::TypeError("Type of SparseCSFIndex indptr must be integer"); } if (!is_integer(indices_type->id())) { return Status::TypeError("Type of SparseCSFIndex indices must be integer"); } if (num_indptrs + 1 != num_indices) { return Status::Invalid( "Length of indices must be equal to length of indptrs + 1 for SparseCSFIndex."); } if (axis_order_size != num_indices) { return Status::Invalid( "Length of indices must be equal to number of dimensions for SparseCSFIndex."); } return Status::OK(); } } // namespace Result> SparseCSFIndex::Make( const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const std::vector& indices_shapes, const std::vector& axis_order, const std::vector>& indptr_data, const std::vector>& indices_data) { int64_t ndim = axis_order.size(); std::vector> indptr(ndim - 1); std::vector> indices(ndim); for (int64_t i = 0; i < ndim - 1; ++i) indptr[i] = std::make_shared(indptr_type, indptr_data[i], std::vector({indices_shapes[i] + 1})); for (int64_t i = 0; i < ndim; ++i) indices[i] = std::make_shared(indices_type, indices_data[i], std::vector({indices_shapes[i]})); RETURN_NOT_OK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(), indices.size(), axis_order.size())); for (auto tensor : indptr) { RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indptr_type, tensor->shape())); } for (auto tensor : indices) { RETURN_NOT_OK(internal::CheckSparseIndexMaximumValue(indices_type, tensor->shape())); } return std::make_shared(indptr, indices, axis_order); } // Constructor with two index vectors SparseCSFIndex::SparseCSFIndex(const std::vector>& indptr, const std::vector>& indices, const std::vector& axis_order) : SparseIndexBase(), indptr_(indptr), indices_(indices), axis_order_(axis_order) { ARROW_CHECK_OK(CheckSparseCSFIndexValidity(indptr_.front()->type(), indices_.front()->type(), indptr_.size(), indices_.size(), axis_order_.size())); } std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } bool SparseCSFIndex::Equals(const SparseCSFIndex& other) const { for (int64_t i = 0; i < static_cast(indices().size()); ++i) { if (!indices()[i]->Equals(*other.indices()[i])) return false; } for (int64_t i = 0; i < static_cast(indptr().size()); ++i) { if (!indptr()[i]->Equals(*other.indptr()[i])) return false; } return axis_order() == other.axis_order(); } // ---------------------------------------------------------------------- // SparseTensor // Constructor with all attributes SparseTensor::SparseTensor(const std::shared_ptr& type, const std::shared_ptr& data, const std::vector& shape, const std::shared_ptr& sparse_index, const std::vector& dim_names) : type_(type), data_(data), shape_(shape), sparse_index_(sparse_index), dim_names_(dim_names) { ARROW_CHECK(is_tensor_supported(type->id())); } const std::string& SparseTensor::dim_name(int i) const { static const std::string kEmpty = ""; if (dim_names_.size() == 0) { return kEmpty; } else { ARROW_CHECK_LT(i, static_cast(dim_names_.size())); return dim_names_[i]; } } int64_t SparseTensor::size() const { return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies()); } bool SparseTensor::Equals(const SparseTensor& other, const EqualOptions& opts) const { return SparseTensorEquals(*this, other, opts); } Result> SparseTensor::ToTensor(MemoryPool* pool) const { switch (format_id()) { case SparseTensorFormat::COO: return MakeTensorFromSparseCOOTensor( pool, internal::checked_cast(this)); break; case SparseTensorFormat::CSR: return MakeTensorFromSparseCSRMatrix( pool, internal::checked_cast(this)); break; case SparseTensorFormat::CSC: return MakeTensorFromSparseCSCMatrix( pool, internal::checked_cast(this)); break; case SparseTensorFormat::CSF: return MakeTensorFromSparseCSFTensor( pool, internal::checked_cast(this)); default: return Status::NotImplemented("Unsupported SparseIndex format type"); } } } // namespace arrow20