From f30402b292d9cf61cabfb4cc23c3f1fee1cd79e1 Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Wed, 17 Sep 2025 16:25:35 +0330 Subject: [PATCH 1/4] resolve negative zero --- cpp/src/arrow/sparse_tensor_test.cc | 148 ++++++++++++++++++++++ cpp/src/arrow/tensor.cc | 12 +- cpp/src/arrow/tensor.h | 7 + cpp/src/arrow/tensor/converter.h | 53 ++++++++ cpp/src/arrow/tensor/converter_internal.h | 88 ------------- cpp/src/arrow/tensor/coo_converter.cc | 131 ++++++++++--------- cpp/src/arrow/tensor/csf_converter.cc | 85 ++++++------- cpp/src/arrow/tensor/csx_converter.cc | 83 ++++++------ 8 files changed, 353 insertions(+), 254 deletions(-) delete mode 100644 cpp/src/arrow/tensor/converter_internal.h diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index c9c28a11b1b39..406c72d0e2ec4 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -413,6 +413,71 @@ TEST_F(TestSparseCOOTensor, TestToTensor) { ASSERT_TRUE(tensor.Equals(*dense_tensor)); } +TEST_F(TestSparseCOOTensor, CreationFromVectorTensorWithNegZero) { + std::vector data{ + -0.0, -0.0, 0.0, -0.0, 4.0, -0.0, -0.0, 0.0, -0.0, -1.0, -0.0, -0.0, + }; + std::vector shape = {12}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(float32(), buffer, shape)); + ASSERT_OK_AND_ASSIGN(auto sparse_coo_tensor, + SparseCOOTensor::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_coo_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_coo_tensor->sparse_index()); + AssertCOOIndex(si->indices(), 0, {4}); + AssertCOOIndex(si->indices(), 1, {9}); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); +} + +TEST_F(TestSparseCOOTensor, CreationFromContiguousDenseTensorWithNegZero) { + // clang-format off + std::vector data{ + -0.0, -0.0, 0.0, + -0.0, 4.0, -0.0, + -0.0, 0.0, -0.0, + -1.0, -0.0, -0.0, + }; + // clang-format on + std::vector shape = {4, 3}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(float32(), buffer, shape)); + ASSERT_OK_AND_ASSIGN(auto sparse_coo_tensor, + SparseCOOTensor::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_coo_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_coo_tensor->sparse_index()); + AssertCOOIndex(si->indices(), 0, {1, 1}); + AssertCOOIndex(si->indices(), 1, {3, 0}); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); +} + +TEST_F(TestSparseCOOTensor, CreationFromNonContiguousDenseTensorWithNegZero) { + // clang-format off + std::vector data{ + -0.0, -0.0, 0.0, 1.0, 2.0, + -0.0, 4.0, -0.0, 0.0, -0.0, + -0.0, 0.0, -0.0, 3.0, 4.0, + -1.0, -0.0, -0.0, 0.0, 0.0, + }; + // clang-format on + std::vector shape = {4, 3}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, + Tensor::Make(float32(), buffer, shape, {20, 4})); + ASSERT_OK_AND_ASSIGN(auto sparse_coo_tensor, + SparseCOOTensor::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_coo_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_coo_tensor->sparse_index()); + AssertCOOIndex(si->indices(), 0, {1, 1}); + AssertCOOIndex(si->indices(), 1, {3, 0}); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); +} + template class TestSparseCOOTensorEquality : public TestSparseTensorBase { public: @@ -869,6 +934,33 @@ TEST_F(TestSparseCSRMatrix, TestToTensor) { ASSERT_TRUE(tensor.Equals(*dense_tensor)); } +TEST_F(TestSparseCSRMatrix, CreationFromTensorWithNegZero) { + // clang-format off + std::vector data{ + -0.0, -0.0, 0.0, + -0.0, 4.0, -0.0, + -0.0, 0.0, -0.0, + -1.0, -0.0, -0.0, + }; + // clang-format on + std::vector shape = {4, 3}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(float32(), buffer, shape)); + ASSERT_OK_AND_ASSIGN(auto sparse_csr_tensor, + SparseCSRMatrix::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_csr_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_csr_tensor->sparse_index()); + const auto* indptr = si->indptr()->data()->data_as(); + const auto* indices = si->indices()->data()->data_as(); + ASSERT_EQ(indptr[2], 1); + ASSERT_EQ(indptr[4], 2); + ASSERT_EQ(indices[0], 1); + ASSERT_EQ(indices[1], 0); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csr_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); +} + template class TestSparseCSRMatrixEquality : public TestSparseTensorBase { public: @@ -1204,6 +1296,33 @@ TEST_F(TestSparseCSCMatrix, TestToTensor) { ASSERT_TRUE(tensor.Equals(*dense_tensor)); } +TEST_F(TestSparseCSCMatrix, CreationFromTensorWithNegZero) { + // clang-format off + std::vector data{ + -0.0, -0.0, 0.0, + -0.0, 4.0, -0.0, + -0.0, 0.0, -0.0, + -1.0, -0.0, -0.0, + }; + // clang-format on + std::vector shape = {4, 3}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(float32(), buffer, shape)); + ASSERT_OK_AND_ASSIGN(auto sparse_csc_tensor, + SparseCSCMatrix::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_csc_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_csc_tensor->sparse_index()); + const auto* indptr = si->indptr()->data()->data_as(); + const auto* indices = si->indices()->data()->data_as(); + ASSERT_EQ(indptr[1], 1); + ASSERT_EQ(indptr[2], 2); + ASSERT_EQ(indices[0], 3); + ASSERT_EQ(indices[1], 1); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csc_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); +} + template class TestSparseCSCMatrixEquality : public TestSparseTensorBase { public: @@ -1479,6 +1598,35 @@ TEST_F(TestSparseCSFTensor, CreationFromZeroTensor) { ASSERT_TRUE(t->Equals(*t_zero)); } +TEST_F(TestSparseCSFTensor, CreationFromTensorWithNegZero) { + // clang-format off + std::vector data{ + -0.0, -0.0, -0.0, -0.0, + 4.0, -0.0, -0.0, -0.0, + 0.0, -1.0, -0.0, -0.0, + }; + // clang-format on + std::vector shape = {3, 4}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(float32(), buffer, shape)); + ASSERT_OK_AND_ASSIGN(auto sparse_csf_tensor, + SparseCSFTensor::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_csf_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_csf_tensor->sparse_index()); + auto indptr = si->indptr()[0]->data()->data_as(); + auto row_indices = si->indices()[0]->data()->data_as(); + auto column_indices = si->indices()[1]->data()->data_as(); + ASSERT_EQ(indptr[1], 1); + ASSERT_EQ(indptr[2], 2); + EXPECT_EQ(row_indices[0], 1); + EXPECT_EQ(row_indices[1], 2); + EXPECT_EQ(column_indices[0], 0); + EXPECT_EQ(column_indices[1], 1); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csf_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); +} + template class TestSparseCSFTensorForIndexValueType : public TestSparseCSFTensorBase { diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 8cdf7f82d2642..d3dd96b81095f 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -35,6 +35,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging_internal.h" +#include "arrow/util/macros.h" #include "arrow/util/unreachable.h" #include "arrow/visit_type_inline.h" @@ -85,7 +86,7 @@ Status ComputeColumnMajorStrides(const FixedWidthType& type, if (!shape.empty() && shape.back() > 0) { total = byte_width; for (size_t i = 0; i < ndim - 1; ++i) { - if (internal::MultiplyWithOverflow(total, shape[i], &total)) { + if (ARROW_PREDICT_FALSE(internal::MultiplyWithOverflow(total, shape[i], &total))) { return Status::Invalid( "Column-major strides computed from shape would not fit in 64-bit " "integer"); @@ -485,13 +486,14 @@ namespace { template int64_t StridedTensorCountNonZero(int dim_index, int64_t offset, const Tensor& tensor) { using c_type = typename TYPE::c_type; - const c_type zero = c_type(0); int64_t nnz = 0; if (dim_index == tensor.ndim() - 1) { for (int64_t i = 0; i < tensor.shape()[dim_index]; ++i) { const auto* ptr = tensor.raw_data() + offset + i * tensor.strides()[dim_index]; - auto& elem = *reinterpret_cast(ptr); - if (elem != zero) ++nnz; + if (auto& elem = *reinterpret_cast(ptr); + internal::is_not_zero(elem)) { + ++nnz; + } } return nnz; } @@ -507,7 +509,7 @@ int64_t ContiguousTensorCountNonZero(const Tensor& tensor) { using c_type = typename TYPE::c_type; auto* data = reinterpret_cast(tensor.raw_data()); return std::count_if(data, data + tensor.size(), - [](const c_type& x) { return x != 0; }); + [](const c_type& x) { return internal::is_not_zero(x); }); } template diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index beb62a11bdce9..7af5cefbcbf18 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -55,6 +55,13 @@ constexpr bool is_tensor_supported(Type::type type_id) { namespace internal { +// TODO(GH-47578): Enable HalfFloatType +template +inline bool is_not_zero(typename ValueDataType::c_type value) { + typename ValueDataType::c_type zero = 0; + return value != zero; +} + ARROW_EXPORT Status ComputeRowMajorStrides(const FixedWidthType& type, const std::vector& shape, diff --git a/cpp/src/arrow/tensor/converter.h b/cpp/src/arrow/tensor/converter.h index 408ab22305fff..fd23f83c8e8f0 100644 --- a/cpp/src/arrow/tensor/converter.h +++ b/cpp/src/arrow/tensor/converter.h @@ -20,6 +20,9 @@ #include "arrow/sparse_tensor.h" // IWYU pragma: export #include +#include + +#include "arrow/visit_type_inline.h" namespace arrow { namespace internal { @@ -63,5 +66,55 @@ Result> MakeTensorFromSparseCSCMatrix( Result> MakeTensorFromSparseCSFTensor( MemoryPool* pool, const SparseCSFTensor* sparse_tensor); +template +struct ConverterVisitor { + explicit ConverterVisitor(Convertor& converter) : converter(converter) {} + template + Status operator()(const ValueType& value, const IndexType& index_type) { + return converter.Convert(value, index_type); + } + + Convertor& converter; +}; + +struct ValueTypeVisitor { + template + enable_if_number Visit(const ValueType& value_type, + const IndexType& index_type, + Function&& function) { + return function(value_type, index_type); + } + + template + Status Visit(const DataType& value_type, const IndexType&, Function&&) { + return Status::Invalid("Invalid value type and the type is ", value_type.name()); + } +}; + +struct IndexAndValueTypeVisitor { + template + enable_if_integer Visit(const IndexType& index_type, + const std::shared_ptr& value_type, + Function&& function) { + ValueTypeVisitor visitor; + return VisitTypeInline(*value_type, &visitor, index_type, + std::forward(function)); + } + + template + Status Visit(const DataType& type, const std::shared_ptr&, Function&&) { + return Status::Invalid("Invalid index type and the type is ", type.name()); + } +}; + +template +Status VisitValueAndIndexType(const std::shared_ptr& value_type, + const std::shared_ptr& index_type, + Function&& function) { + IndexAndValueTypeVisitor visitor; + return VisitTypeInline(*index_type, &visitor, value_type, + std::forward(function)); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/tensor/converter_internal.h b/cpp/src/arrow/tensor/converter_internal.h deleted file mode 100644 index 3a87feaf4b346..0000000000000 --- a/cpp/src/arrow/tensor/converter_internal.h +++ /dev/null @@ -1,88 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/tensor/converter.h" - -#define DISPATCH(ACTION, index_elsize, value_elsize, ...) \ - switch (index_elsize) { \ - case 1: \ - switch (value_elsize) { \ - case 1: \ - ACTION(uint8_t, uint8_t, __VA_ARGS__); \ - break; \ - case 2: \ - ACTION(uint8_t, uint16_t, __VA_ARGS__); \ - break; \ - case 4: \ - ACTION(uint8_t, uint32_t, __VA_ARGS__); \ - break; \ - case 8: \ - ACTION(uint8_t, uint64_t, __VA_ARGS__); \ - break; \ - } \ - break; \ - case 2: \ - switch (value_elsize) { \ - case 1: \ - ACTION(uint16_t, uint8_t, __VA_ARGS__); \ - break; \ - case 2: \ - ACTION(uint16_t, uint16_t, __VA_ARGS__); \ - break; \ - case 4: \ - ACTION(uint16_t, uint32_t, __VA_ARGS__); \ - break; \ - case 8: \ - ACTION(uint16_t, uint64_t, __VA_ARGS__); \ - break; \ - } \ - break; \ - case 4: \ - switch (value_elsize) { \ - case 1: \ - ACTION(uint32_t, uint8_t, __VA_ARGS__); \ - break; \ - case 2: \ - ACTION(uint32_t, uint16_t, __VA_ARGS__); \ - break; \ - case 4: \ - ACTION(uint32_t, uint32_t, __VA_ARGS__); \ - break; \ - case 8: \ - ACTION(uint32_t, uint64_t, __VA_ARGS__); \ - break; \ - } \ - break; \ - case 8: \ - switch (value_elsize) { \ - case 1: \ - ACTION(int64_t, uint8_t, __VA_ARGS__); \ - break; \ - case 2: \ - ACTION(int64_t, uint16_t, __VA_ARGS__); \ - break; \ - case 4: \ - ACTION(int64_t, uint32_t, __VA_ARGS__); \ - break; \ - case 8: \ - ACTION(int64_t, uint64_t, __VA_ARGS__); \ - break; \ - } \ - break; \ - } diff --git a/cpp/src/arrow/tensor/coo_converter.cc b/cpp/src/arrow/tensor/coo_converter.cc index 7e29b668f53ec..57e696a0b18b5 100644 --- a/cpp/src/arrow/tensor/coo_converter.cc +++ b/cpp/src/arrow/tensor/coo_converter.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/tensor/converter_internal.h" +#include "arrow/tensor/converter.h" #include #include @@ -25,26 +25,27 @@ #include "arrow/buffer.h" #include "arrow/status.h" +#include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" -#include "arrow/visit_type_inline.h" namespace arrow { class MemoryPool; namespace internal { + namespace { -template -inline void IncrementRowMajorIndex(std::vector& coord, +template +inline void IncrementRowMajorIndex(std::vector& coord, const std::vector& shape) { const int64_t ndim = shape.size(); ++coord[ndim - 1]; - if (coord[ndim - 1] == shape[ndim - 1]) { + if (static_cast(coord[ndim - 1]) == shape[ndim - 1]) { int64_t d = ndim - 1; - while (d > 0 && coord[d] == shape[d]) { + while (d > 0 && static_cast(coord[d]) == shape[d]) { coord[d] = 0; ++coord[d - 1]; --d; @@ -52,19 +53,20 @@ inline void IncrementRowMajorIndex(std::vector& coord, } } -template -void ConvertRowMajorTensor(const Tensor& tensor, c_index_type* indices, - c_value_type* values, const int64_t size) { +template +void ConvertContinuousTensor(const Tensor& tensor, typename IndexType::c_type* indices, + typename ValueType::c_type* values) { + using ValueCType = typename ValueType::c_type; + using IndexCType = typename IndexType::c_type; + const auto ndim = tensor.ndim(); const auto& shape = tensor.shape(); - const c_value_type* tensor_data = - reinterpret_cast(tensor.raw_data()); + const auto* tensor_data = tensor.data()->data_as(); - constexpr c_value_type zero = 0; - std::vector coord(ndim, 0); + std::vector coord(ndim, 0); for (int64_t n = tensor.size(); n > 0; --n) { - const c_value_type x = *tensor_data; - if (ARROW_PREDICT_FALSE(x != zero)) { + auto x = *tensor_data; + if (is_not_zero(x)) { std::copy(coord.begin(), coord.end(), indices); *values++ = x; indices += ndim; @@ -75,13 +77,25 @@ void ConvertRowMajorTensor(const Tensor& tensor, c_index_type* indices, } } -template -void ConvertColumnMajorTensor(const Tensor& tensor, c_index_type* out_indices, - c_value_type* out_values, const int64_t size) { +template +void ConvertRowMajorTensor(const Tensor& tensor, typename IndexType::c_type* out_indices, + typename ValueType::c_type* out_values) { + ConvertContinuousTensor(tensor, out_indices, out_values); +} + +// TODO(GH-47580): Correct column-major tensor conversion +template +void ConvertColumnMajorTensor(const Tensor& tensor, + typename IndexType::c_type* out_indices, + typename ValueType::c_type* out_values, + const int64_t size) { + using ValueCtype = typename ValueType::c_type; + using IndexCType = typename IndexType::c_type; + const auto ndim = tensor.ndim(); - std::vector indices(ndim * size); - std::vector values(size); - ConvertRowMajorTensor(tensor, indices.data(), values.data(), size); + std::vector indices(ndim * size); + std::vector values(size); + ConvertContinuousTensor(tensor, indices.data(), values.data()); // transpose indices for (int64_t i = 0; i < size; ++i) { @@ -116,23 +130,24 @@ void ConvertColumnMajorTensor(const Tensor& tensor, c_index_type* out_indices, } } -template -void ConvertStridedTensor(const Tensor& tensor, c_index_type* indices, - c_value_type* values, const int64_t size) { - using ValueType = typename CTypeTraits::ArrowType; +template +void ConvertStridedTensor(const Tensor& tensor, typename IndexType::c_type* indices, + typename ValueType::c_type* values) { + using ValueCType = typename ValueType::c_type; + using IndexCType = typename IndexType::c_type; + const auto& shape = tensor.shape(); const auto ndim = tensor.ndim(); std::vector coord(ndim, 0); - constexpr c_value_type zero = 0; - c_value_type x; + ValueCType x; int64_t i; for (int64_t n = tensor.size(); n > 0; --n) { x = tensor.Value(coord); - if (ARROW_PREDICT_FALSE(x != zero)) { + if (is_not_zero(x)) { *values++ = x; for (i = 0; i < ndim; ++i) { - *indices++ = static_cast(coord[i]); + *indices++ = static_cast(coord[i]); } } @@ -140,35 +155,21 @@ void ConvertStridedTensor(const Tensor& tensor, c_index_type* indices, } } -#define CONVERT_TENSOR(func, index_type, value_type, indices, values, size) \ - func(tensor_, reinterpret_cast(indices), \ - reinterpret_cast(values), size) - -// Using ARROW_EXPAND is necessary to expand __VA_ARGS__ correctly on VC++. -#define CONVERT_ROW_MAJOR_TENSOR(index_type, value_type, ...) \ - ARROW_EXPAND(CONVERT_TENSOR(ConvertRowMajorTensor, index_type, value_type, __VA_ARGS__)) - -#define CONVERT_COLUMN_MAJOR_TENSOR(index_type, value_type, ...) \ - ARROW_EXPAND( \ - CONVERT_TENSOR(ConvertColumnMajorTensor, index_type, value_type, __VA_ARGS__)) - -#define CONVERT_STRIDED_TENSOR(index_type, value_type, ...) \ - ARROW_EXPAND(CONVERT_TENSOR(ConvertStridedTensor, index_type, value_type, __VA_ARGS__)) - // ---------------------------------------------------------------------- // SparseTensorConverter for SparseCOOIndex -class SparseCOOTensorConverter : private SparseTensorConverterMixin { - using SparseTensorConverterMixin::AssignIndex; - using SparseTensorConverterMixin::IsNonZero; - +class SparseCOOTensorConverter { public: SparseCOOTensorConverter(const Tensor& tensor, const std::shared_ptr& index_value_type, MemoryPool* pool) : tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {} - Status Convert() { + template + Status Convert(const ValueType&, const IndexType&) { + using ValueCType = typename ValueType::c_type; + using IndexCType = typename IndexType::c_type; + RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_, tensor_.shape())); @@ -180,34 +181,29 @@ class SparseCOOTensorConverter : private SparseTensorConverterMixin { ARROW_ASSIGN_OR_RAISE(auto indices_buffer, AllocateBuffer(index_elsize * ndim * nonzero_count, pool_)); - uint8_t* indices = indices_buffer->mutable_data(); ARROW_ASSIGN_OR_RAISE(auto values_buffer, AllocateBuffer(value_elsize * nonzero_count, pool_)); - uint8_t* values = values_buffer->mutable_data(); - const uint8_t* tensor_data = tensor_.raw_data(); + auto* values = values_buffer->mutable_data_as(); + const auto* tensor_data = tensor_.data()->data_as(); + auto* indices = indices_buffer->mutable_data_as(); if (ndim <= 1) { const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0]; for (int64_t i = 0; i < count; ++i) { - if (std::any_of(tensor_data, tensor_data + value_elsize, IsNonZero)) { - AssignIndex(indices, i, index_elsize); - std::copy_n(tensor_data, value_elsize, values); - - indices += index_elsize; - values += value_elsize; + if (is_not_zero(*tensor_data)) { + *indices++ = static_cast(i); + *values++ = *tensor_data; } - tensor_data += value_elsize; + ++tensor_data; } } else if (tensor_.is_row_major()) { - DISPATCH(CONVERT_ROW_MAJOR_TENSOR, index_elsize, value_elsize, indices, values, - nonzero_count); + ConvertRowMajorTensor(tensor_, indices, values); } else if (tensor_.is_column_major()) { - DISPATCH(CONVERT_COLUMN_MAJOR_TENSOR, index_elsize, value_elsize, indices, values, - nonzero_count); + ConvertColumnMajorTensor(tensor_, indices, values, + nonzero_count); } else { - DISPATCH(CONVERT_STRIDED_TENSOR, index_elsize, value_elsize, indices, values, - nonzero_count); + ConvertStridedTensor(tensor_, indices, values); } // make results @@ -281,13 +277,14 @@ Status MakeSparseCOOTensorFromTensor(const Tensor& tensor, std::shared_ptr* out_sparse_index, std::shared_ptr* out_data) { SparseCOOTensorConverter converter(tensor, index_value_type, pool); - RETURN_NOT_OK(converter.Convert()); - + ConverterVisitor visitor{converter}; + ARROW_RETURN_NOT_OK(VisitValueAndIndexType(tensor.type(), index_value_type, visitor)); *out_sparse_index = checked_pointer_cast(converter.sparse_index); *out_data = converter.data; return Status::OK(); } +// TODO(GH-47580): Enable column-major index tensor Result> MakeTensorFromSparseCOOTensor( MemoryPool* pool, const SparseCOOTensor* sparse_tensor) { const auto& sparse_index = diff --git a/cpp/src/arrow/tensor/csf_converter.cc b/cpp/src/arrow/tensor/csf_converter.cc index f6470e16b7818..8ed5838c4543e 100644 --- a/cpp/src/arrow/tensor/csf_converter.cc +++ b/cpp/src/arrow/tensor/csf_converter.cc @@ -27,10 +27,10 @@ #include "arrow/buffer_builder.h" #include "arrow/result.h" #include "arrow/status.h" +#include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/sort_internal.h" -#include "arrow/visit_type_inline.h" namespace arrow { @@ -57,24 +57,25 @@ inline void IncrementIndex(std::vector& coord, const std::vector& index_value_type, MemoryPool* pool) : tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {} - Status Convert() { + template + Status Convert(const ValueType&, const IndexType&) { + using ValueCType = typename ValueType::c_type; + using IndexCType = typename IndexType::c_type; RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_, tensor_.shape())); + const int64_t ndim = tensor_.ndim(); + if (ndim <= 1) { + return Status::NotImplemented("TODO for ndim <= 1"); + } - const int index_elsize = index_value_type_->byte_width(); const int value_elsize = tensor_.type()->byte_width(); - - const int64_t ndim = tensor_.ndim(); // Axis order as ascending order of dimension size is a good heuristic but is not // necessarily optimal. std::vector axis_order = internal::ArgSort(tensor_.shape()); @@ -82,60 +83,48 @@ class SparseCSFTensorConverter : private SparseTensorConverterMixin { ARROW_ASSIGN_OR_RAISE(auto values_buffer, AllocateBuffer(value_elsize * nonzero_count, pool_)); - auto* values = values_buffer->mutable_data(); std::vector counts(ndim, 0); std::vector coord(ndim, 0); std::vector previous_coord(ndim, -1); - std::vector indptr_buffer_builders(ndim - 1); - std::vector indices_buffer_builders(ndim); - const auto* tensor_data = tensor_.raw_data(); - uint8_t index_buffer[sizeof(int64_t)]; + std::vector> indptr_buffer_builders(ndim - 1); + std::vector> indices_buffer_builders(ndim); - if (ndim <= 1) { - return Status::NotImplemented("TODO for ndim <= 1"); - } else { - const auto& shape = tensor_.shape(); - for (int64_t n = tensor_.size(); n > 0; n--) { - const auto offset = tensor_.CalculateValueOffset(coord); - const auto xp = tensor_data + offset; - - if (std::any_of(xp, xp + value_elsize, IsNonZero)) { - bool tree_split = false; - - std::copy_n(xp, value_elsize, values); - values += value_elsize; - - for (int64_t i = 0; i < ndim; ++i) { - int64_t dimension = axis_order[i]; + auto* values = values_buffer->mutable_data_as(); - tree_split = tree_split || (coord[dimension] != previous_coord[dimension]); - if (tree_split) { - if (i < ndim - 1) { - AssignIndex(index_buffer, counts[i + 1], index_elsize); - RETURN_NOT_OK( - indptr_buffer_builders[i].Append(index_buffer, index_elsize)); - } + const auto& shape = tensor_.shape(); + for (int64_t n = tensor_.size(); n > 0; n--) { + const auto value = tensor_.Value(coord); - AssignIndex(index_buffer, coord[dimension], index_elsize); - RETURN_NOT_OK( - indices_buffer_builders[i].Append(index_buffer, index_elsize)); + if (is_not_zero(value)) { + bool tree_split = false; + *values++ = value; + for (int64_t i = 0; i < ndim; ++i) { + int64_t dimension = axis_order[i]; - ++counts[i]; + tree_split = tree_split || (coord[dimension] != previous_coord[dimension]); + if (tree_split) { + if (i < ndim - 1) { + RETURN_NOT_OK(indptr_buffer_builders[i].Append( + static_cast(counts[i + 1]))); } - } + RETURN_NOT_OK(indices_buffer_builders[i].Append( + static_cast(coord[dimension]))); - previous_coord = coord; + ++counts[i]; + } } - IncrementIndex(coord, shape, axis_order); + previous_coord = coord; } + + IncrementIndex(coord, shape, axis_order); } for (int64_t column = 0; column < ndim - 1; ++column) { - AssignIndex(index_buffer, counts[column + 1], index_elsize); - RETURN_NOT_OK(indptr_buffer_builders[column].Append(index_buffer, index_elsize)); + RETURN_NOT_OK(indptr_buffer_builders[column].Append( + static_cast(counts[column + 1]))); } // make results @@ -272,8 +261,8 @@ Status MakeSparseCSFTensorFromTensor(const Tensor& tensor, std::shared_ptr* out_sparse_index, std::shared_ptr* out_data) { SparseCSFTensorConverter converter(tensor, index_value_type, pool); - RETURN_NOT_OK(converter.Convert()); - + ConverterVisitor visitor{converter}; + ARROW_RETURN_NOT_OK(VisitValueAndIndexType(tensor.type(), index_value_type, visitor)); *out_sparse_index = checked_pointer_cast(converter.sparse_index); *out_data = converter.data; return Status::OK(); diff --git a/cpp/src/arrow/tensor/csx_converter.cc b/cpp/src/arrow/tensor/csx_converter.cc index 679c3a0f1acd7..877021f9c9f2d 100644 --- a/cpp/src/arrow/tensor/csx_converter.cc +++ b/cpp/src/arrow/tensor/csx_converter.cc @@ -24,6 +24,7 @@ #include "arrow/buffer.h" #include "arrow/status.h" +#include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/visit_type_inline.h" @@ -33,24 +34,25 @@ namespace arrow { class MemoryPool; namespace internal { + namespace { // ---------------------------------------------------------------------- // SparseTensorConverter for SparseCSRIndex -class SparseCSXMatrixConverter : private SparseTensorConverterMixin { - using SparseTensorConverterMixin::AssignIndex; - using SparseTensorConverterMixin::IsNonZero; - +class SparseCSXMatrixConverter { public: SparseCSXMatrixConverter(SparseMatrixCompressedAxis axis, const Tensor& tensor, const std::shared_ptr& index_value_type, MemoryPool* pool) : axis_(axis), tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {} - Status Convert() { + template + Status Convert(const ValueType&, const IndexType&) { RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_, tensor_.shape())); + using ValueCType = typename ValueType::c_type; + using IndexCType = typename IndexType::c_type; const int index_elsize = index_value_type_->byte_width(); const int value_elsize = tensor_.type()->byte_width(); @@ -58,6 +60,8 @@ class SparseCSXMatrixConverter : private SparseTensorConverterMixin { const int64_t ndim = tensor_.ndim(); if (ndim > 2) { return Status::Invalid("Invalid tensor dimension"); + } else if (ndim <= 1) { + return Status::NotImplemented("TODO for ndim <= 1"); } const int major_axis = static_cast(axis_); @@ -70,47 +74,34 @@ class SparseCSXMatrixConverter : private SparseTensorConverterMixin { ARROW_ASSIGN_OR_RAISE(auto values_buffer, AllocateBuffer(value_elsize * nonzero_count, pool_)); - auto* values = values_buffer->mutable_data(); - - const auto* tensor_data = tensor_.raw_data(); - - if (ndim <= 1) { - return Status::NotImplemented("TODO for ndim <= 1"); - } else { - ARROW_ASSIGN_OR_RAISE(indptr_buffer, - AllocateBuffer(index_elsize * (n_major + 1), pool_)); - auto* indptr = indptr_buffer->mutable_data(); - - ARROW_ASSIGN_OR_RAISE(indices_buffer, - AllocateBuffer(index_elsize * nonzero_count, pool_)); - auto* indices = indices_buffer->mutable_data(); - - std::vector coords(2); - int64_t k = 0; - std::fill_n(indptr, index_elsize, 0); - indptr += index_elsize; - for (int64_t i = 0; i < n_major; ++i) { - for (int64_t j = 0; j < n_minor; ++j) { - if (axis_ == SparseMatrixCompressedAxis::ROW) { - coords = {i, j}; - } else { - coords = {j, i}; - } - const int64_t offset = tensor_.CalculateValueOffset(coords); - if (std::any_of(tensor_data + offset, tensor_data + offset + value_elsize, - IsNonZero)) { - std::copy_n(tensor_data + offset, value_elsize, values); - values += value_elsize; - - AssignIndex(indices, j, index_elsize); - indices += index_elsize; - - k++; - } + ARROW_ASSIGN_OR_RAISE(indptr_buffer, + AllocateBuffer(index_elsize * (n_major + 1), pool_)); + ARROW_ASSIGN_OR_RAISE(indices_buffer, + AllocateBuffer(index_elsize * nonzero_count, pool_)); + + auto* indptr = indptr_buffer->mutable_data_as(); + auto* values = values_buffer->mutable_data_as(); + auto* indices = indices_buffer->mutable_data_as(); + + std::vector coords(2); + int64_t k = 0; + indptr[0] = 0; + ++indptr; + for (int64_t i = 0; i < n_major; ++i) { + for (int64_t j = 0; j < n_minor; ++j) { + if (axis_ == SparseMatrixCompressedAxis::ROW) { + coords = {i, j}; + } else { + coords = {j, i}; + } + auto value = tensor_.Value(coords); + if (is_not_zero(value)) { + *values++ = value; + *indices++ = static_cast(j); + k++; } - AssignIndex(indptr, k, index_elsize); - indptr += index_elsize; } + *indptr++ = static_cast(k); } std::vector indptr_shape({n_major + 1}); @@ -150,8 +141,8 @@ Status MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis axis, std::shared_ptr* out_sparse_index, std::shared_ptr* out_data) { SparseCSXMatrixConverter converter(axis, tensor, index_value_type, pool); - RETURN_NOT_OK(converter.Convert()); - + ConverterVisitor visitor(converter); + ARROW_RETURN_NOT_OK(VisitValueAndIndexType(tensor.type(), index_value_type, visitor)); *out_sparse_index = converter.sparse_index; *out_data = converter.data; return Status::OK(); From 82ff02542a665b49fae83acef8092845bb177222 Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Tue, 23 Sep 2025 17:39:02 +0330 Subject: [PATCH 2/4] Apply rok suggestion --- cpp/src/arrow/sparse_tensor_test.cc | 354 ++++++++++++++++---------- cpp/src/arrow/tensor/converter.h | 11 +- cpp/src/arrow/tensor/coo_converter.cc | 12 +- 3 files changed, 227 insertions(+), 150 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 406c72d0e2ec4..83597219c70a2 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -413,69 +413,97 @@ TEST_F(TestSparseCOOTensor, TestToTensor) { ASSERT_TRUE(tensor.Equals(*dense_tensor)); } -TEST_F(TestSparseCOOTensor, CreationFromVectorTensorWithNegZero) { - std::vector data{ - -0.0, -0.0, 0.0, -0.0, 4.0, -0.0, -0.0, 0.0, -0.0, -1.0, -0.0, -0.0, - }; - std::vector shape = {12}; - auto buffer = Buffer::FromVector(data); - ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(float32(), buffer, shape)); - ASSERT_OK_AND_ASSIGN(auto sparse_coo_tensor, - SparseCOOTensor::Make(*dense_tensor, int64())); - ASSERT_EQ(2, sparse_coo_tensor->non_zero_length()); - auto si = - internal::checked_pointer_cast(sparse_coo_tensor->sparse_index()); - AssertCOOIndex(si->indices(), 0, {4}); - AssertCOOIndex(si->indices(), 1, {9}); - ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor()); - ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); -} +template +class TestSparseCOOTensorCreationFromNegativeZero + : public TestSparseTensorBase { + public: + using ValueCType = typename ValueType::c_type; -TEST_F(TestSparseCOOTensor, CreationFromContiguousDenseTensorWithNegZero) { - // clang-format off - std::vector data{ - -0.0, -0.0, 0.0, - -0.0, 4.0, -0.0, - -0.0, 0.0, -0.0, - -1.0, -0.0, -0.0, + void SetUp() override { type_ = TypeTraits::type_singleton(); } + + void FromVector() { + std::vector data{ + -0.0, -0.0, 0.0, -0.0, 4.0, -0.0, -0.0, 0.0, -0.0, -1.0, -0.0, -0.0, }; - // clang-format on - std::vector shape = {4, 3}; - auto buffer = Buffer::FromVector(data); - ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(float32(), buffer, shape)); - ASSERT_OK_AND_ASSIGN(auto sparse_coo_tensor, - SparseCOOTensor::Make(*dense_tensor, int64())); - ASSERT_EQ(2, sparse_coo_tensor->non_zero_length()); - auto si = - internal::checked_pointer_cast(sparse_coo_tensor->sparse_index()); - AssertCOOIndex(si->indices(), 0, {1, 1}); - AssertCOOIndex(si->indices(), 1, {3, 0}); - ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor()); - ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); + std::vector shape = {12}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(type_, buffer, shape)); + ASSERT_OK_AND_ASSIGN(auto sparse_coo_tensor, + SparseCOOTensor::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_coo_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_coo_tensor->sparse_index()); + AssertCOOIndex(si->indices(), 0, {4}); + AssertCOOIndex(si->indices(), 1, {9}); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); + } + + void FromContiguousTensor() { + // clang-format off + std::vector data{ + -0.0, -0.0, 0.0, + -0.0, 4.0, -0.0, + -0.0, 0.0, -0.0, + -1.0, -0.0, -0.0, + }; + // clang-format on + std::vector shape = {4, 3}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(type_, buffer, shape)); + ASSERT_OK_AND_ASSIGN(auto sparse_coo_tensor, + SparseCOOTensor::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_coo_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_coo_tensor->sparse_index()); + AssertCOOIndex(si->indices(), 0, {1, 1}); + AssertCOOIndex(si->indices(), 1, {3, 0}); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); + } + + void FromNonContiguousTensor() { + // clang-format off + std::vector data{ + -0.0, -0.0, 0.0, 1.0, 2.0, + -0.0, 4.0, -0.0, 0.0, -0.0, + -0.0, 0.0, -0.0, 3.0, 4.0, + -1.0, -0.0, -0.0, 0.0, 0.0, + }; + // clang-format on + std::vector shape = {4, 3}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, + Tensor::Make(type_, buffer, shape, + {type_->byte_width() * 5, type_->byte_width()})); + ASSERT_OK_AND_ASSIGN(auto sparse_coo_tensor, + SparseCOOTensor::Make(*dense_tensor, int64())); + ASSERT_EQ(12, sparse_coo_tensor->size()); + ASSERT_EQ(2, sparse_coo_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_coo_tensor->sparse_index()); + AssertCOOIndex(si->indices(), 0, {1, 1}); + AssertCOOIndex(si->indices(), 1, {3, 0}); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); + } + + private: + std::shared_ptr type_; +}; + +TYPED_TEST_SUITE(TestSparseCOOTensorCreationFromNegativeZero, RealArrowTypes); + +TYPED_TEST(TestSparseCOOTensorCreationFromNegativeZero, FromVector) { + this->FromVector(); } -TEST_F(TestSparseCOOTensor, CreationFromNonContiguousDenseTensorWithNegZero) { - // clang-format off - std::vector data{ - -0.0, -0.0, 0.0, 1.0, 2.0, - -0.0, 4.0, -0.0, 0.0, -0.0, - -0.0, 0.0, -0.0, 3.0, 4.0, - -1.0, -0.0, -0.0, 0.0, 0.0, - }; - // clang-format on - std::vector shape = {4, 3}; - auto buffer = Buffer::FromVector(data); - ASSERT_OK_AND_ASSIGN(auto dense_tensor, - Tensor::Make(float32(), buffer, shape, {20, 4})); - ASSERT_OK_AND_ASSIGN(auto sparse_coo_tensor, - SparseCOOTensor::Make(*dense_tensor, int64())); - ASSERT_EQ(2, sparse_coo_tensor->non_zero_length()); - auto si = - internal::checked_pointer_cast(sparse_coo_tensor->sparse_index()); - AssertCOOIndex(si->indices(), 0, {1, 1}); - AssertCOOIndex(si->indices(), 1, {3, 0}); - ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_coo_tensor->ToTensor()); - ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); +TYPED_TEST(TestSparseCOOTensorCreationFromNegativeZero, FromContiguousTensor) { + this->FromContiguousTensor(); +} + +TYPED_TEST(TestSparseCOOTensorCreationFromNegativeZero, FromNonContiguousTensor) { + this->FromNonContiguousTensor(); } template @@ -934,31 +962,49 @@ TEST_F(TestSparseCSRMatrix, TestToTensor) { ASSERT_TRUE(tensor.Equals(*dense_tensor)); } -TEST_F(TestSparseCSRMatrix, CreationFromTensorWithNegZero) { - // clang-format off - std::vector data{ - -0.0, -0.0, 0.0, - -0.0, 4.0, -0.0, - -0.0, 0.0, -0.0, - -1.0, -0.0, -0.0, - }; - // clang-format on - std::vector shape = {4, 3}; - auto buffer = Buffer::FromVector(data); - ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(float32(), buffer, shape)); - ASSERT_OK_AND_ASSIGN(auto sparse_csr_tensor, - SparseCSRMatrix::Make(*dense_tensor, int64())); - ASSERT_EQ(2, sparse_csr_tensor->non_zero_length()); - auto si = - internal::checked_pointer_cast(sparse_csr_tensor->sparse_index()); - const auto* indptr = si->indptr()->data()->data_as(); - const auto* indices = si->indices()->data()->data_as(); - ASSERT_EQ(indptr[2], 1); - ASSERT_EQ(indptr[4], 2); - ASSERT_EQ(indices[0], 1); - ASSERT_EQ(indices[1], 0); - ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csr_tensor->ToTensor()); - ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); +template +class TestSparseCSRTensorCreationFromNegativeZero + : public TestSparseTensorBase { + public: + using ValueCType = typename ValueType::c_type; + + void SetUp() override { type_ = TypeTraits::type_singleton(); } + + void FromTensor() { + // clang-format off + std::vector data{ + -0.0, -0.0, 0.0, + -0.0, 4.0, -0.0, + -0.0, 0.0, -0.0, + -1.0, -0.0, -0.0, + }; + // clang-format on + std::vector shape = {4, 3}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(type_, buffer, shape)); + ASSERT_OK_AND_ASSIGN(auto sparse_csr_tensor, + SparseCSRMatrix::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_csr_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_csr_tensor->sparse_index()); + const auto* indptr = si->indptr()->data()->template data_as(); + const auto* indices = si->indices()->data()->template data_as(); + ASSERT_EQ(indptr[2], 1); + ASSERT_EQ(indptr[4], 2); + ASSERT_EQ(indices[0], 1); + ASSERT_EQ(indices[1], 0); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csr_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); + } + + private: + std::shared_ptr type_; +}; + +TYPED_TEST_SUITE(TestSparseCSRTensorCreationFromNegativeZero, RealArrowTypes); + +TYPED_TEST(TestSparseCSRTensorCreationFromNegativeZero, FromTensor) { + this->FromTensor(); } template @@ -1296,31 +1342,49 @@ TEST_F(TestSparseCSCMatrix, TestToTensor) { ASSERT_TRUE(tensor.Equals(*dense_tensor)); } -TEST_F(TestSparseCSCMatrix, CreationFromTensorWithNegZero) { - // clang-format off - std::vector data{ - -0.0, -0.0, 0.0, - -0.0, 4.0, -0.0, - -0.0, 0.0, -0.0, - -1.0, -0.0, -0.0, - }; - // clang-format on - std::vector shape = {4, 3}; - auto buffer = Buffer::FromVector(data); - ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(float32(), buffer, shape)); - ASSERT_OK_AND_ASSIGN(auto sparse_csc_tensor, - SparseCSCMatrix::Make(*dense_tensor, int64())); - ASSERT_EQ(2, sparse_csc_tensor->non_zero_length()); - auto si = - internal::checked_pointer_cast(sparse_csc_tensor->sparse_index()); - const auto* indptr = si->indptr()->data()->data_as(); - const auto* indices = si->indices()->data()->data_as(); - ASSERT_EQ(indptr[1], 1); - ASSERT_EQ(indptr[2], 2); - ASSERT_EQ(indices[0], 3); - ASSERT_EQ(indices[1], 1); - ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csc_tensor->ToTensor()); - ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); +template +class TestSparseCSCTensorCreationFromNegativeZero + : public TestSparseTensorBase { + public: + using ValueCType = typename ValueType::c_type; + + void SetUp() override { type_ = TypeTraits::type_singleton(); } + + void FromTensor() { + // clang-format off + std::vector data{ + -0.0, -0.0, 0.0, + -0.0, 4.0, -0.0, + -0.0, 0.0, -0.0, + -1.0, -0.0, -0.0, + }; + // clang-format on + std::vector shape = {4, 3}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(type_, buffer, shape)); + ASSERT_OK_AND_ASSIGN(auto sparse_csc_tensor, + SparseCSCMatrix::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_csc_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_csc_tensor->sparse_index()); + const auto* indptr = si->indptr()->data()->template data_as(); + const auto* indices = si->indices()->data()->template data_as(); + ASSERT_EQ(indptr[1], 1); + ASSERT_EQ(indptr[2], 2); + ASSERT_EQ(indices[0], 3); + ASSERT_EQ(indices[1], 1); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csc_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); + } + + private: + std::shared_ptr type_; +}; + +TYPED_TEST_SUITE(TestSparseCSCTensorCreationFromNegativeZero, RealArrowTypes); + +TYPED_TEST(TestSparseCSCTensorCreationFromNegativeZero, FromTensor) { + this->FromTensor(); } template @@ -1598,33 +1662,51 @@ TEST_F(TestSparseCSFTensor, CreationFromZeroTensor) { ASSERT_TRUE(t->Equals(*t_zero)); } -TEST_F(TestSparseCSFTensor, CreationFromTensorWithNegZero) { - // clang-format off - std::vector data{ - -0.0, -0.0, -0.0, -0.0, - 4.0, -0.0, -0.0, -0.0, - 0.0, -1.0, -0.0, -0.0, - }; - // clang-format on - std::vector shape = {3, 4}; - auto buffer = Buffer::FromVector(data); - ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(float32(), buffer, shape)); - ASSERT_OK_AND_ASSIGN(auto sparse_csf_tensor, - SparseCSFTensor::Make(*dense_tensor, int64())); - ASSERT_EQ(2, sparse_csf_tensor->non_zero_length()); - auto si = - internal::checked_pointer_cast(sparse_csf_tensor->sparse_index()); - auto indptr = si->indptr()[0]->data()->data_as(); - auto row_indices = si->indices()[0]->data()->data_as(); - auto column_indices = si->indices()[1]->data()->data_as(); - ASSERT_EQ(indptr[1], 1); - ASSERT_EQ(indptr[2], 2); - EXPECT_EQ(row_indices[0], 1); - EXPECT_EQ(row_indices[1], 2); - EXPECT_EQ(column_indices[0], 0); - EXPECT_EQ(column_indices[1], 1); - ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csf_tensor->ToTensor()); - ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); +template +class TestSparseCSFTensorCreationFromNegativeZero + : public TestSparseTensorBase { + public: + using ValueCType = typename ValueType::c_type; + + void SetUp() override { type_ = TypeTraits::type_singleton(); } + + void FromTensor() { + // clang-format off + std::vector data{ + -0.0, -0.0, -0.0, -0.0, + 4.0, -0.0, -0.0, -0.0, + 0.0, -1.0, -0.0, -0.0, + }; + // clang-format on + std::vector shape = {3, 4}; + auto buffer = Buffer::FromVector(data); + ASSERT_OK_AND_ASSIGN(auto dense_tensor, Tensor::Make(type_, buffer, shape)); + ASSERT_OK_AND_ASSIGN(auto sparse_csf_tensor, + SparseCSFTensor::Make(*dense_tensor, int64())); + ASSERT_EQ(2, sparse_csf_tensor->non_zero_length()); + auto si = + internal::checked_pointer_cast(sparse_csf_tensor->sparse_index()); + auto indptr = si->indptr()[0]->data()->template data_as(); + auto row_indices = si->indices()[0]->data()->template data_as(); + auto column_indices = si->indices()[1]->data()->template data_as(); + ASSERT_EQ(indptr[1], 1); + ASSERT_EQ(indptr[2], 2); + EXPECT_EQ(row_indices[0], 1); + EXPECT_EQ(row_indices[1], 2); + EXPECT_EQ(column_indices[0], 0); + EXPECT_EQ(column_indices[1], 1); + ASSERT_OK_AND_ASSIGN(auto new_tensor, sparse_csf_tensor->ToTensor()); + ASSERT_TRUE(new_tensor->Equals(*dense_tensor)); + } + + private: + std::shared_ptr type_; +}; + +TYPED_TEST_SUITE(TestSparseCSFTensorCreationFromNegativeZero, RealArrowTypes); + +TYPED_TEST(TestSparseCSFTensorCreationFromNegativeZero, FromTensor) { + this->FromTensor(); } template diff --git a/cpp/src/arrow/tensor/converter.h b/cpp/src/arrow/tensor/converter.h index fd23f83c8e8f0..f51cb541cf1fc 100644 --- a/cpp/src/arrow/tensor/converter.h +++ b/cpp/src/arrow/tensor/converter.h @@ -66,15 +66,15 @@ Result> MakeTensorFromSparseCSCMatrix( Result> MakeTensorFromSparseCSFTensor( MemoryPool* pool, const SparseCSFTensor* sparse_tensor); -template +template struct ConverterVisitor { - explicit ConverterVisitor(Convertor& converter) : converter(converter) {} + explicit ConverterVisitor(Converter& converter) : converter(converter) {} template Status operator()(const ValueType& value, const IndexType& index_type) { return converter.Convert(value, index_type); } - Convertor& converter; + Converter& converter; }; struct ValueTypeVisitor { @@ -87,7 +87,8 @@ struct ValueTypeVisitor { template Status Visit(const DataType& value_type, const IndexType&, Function&&) { - return Status::Invalid("Invalid value type and the type is ", value_type.name()); + return Status::Invalid("Invalid value type: ", value_type.name(), + ". Expected a number."); } }; @@ -103,7 +104,7 @@ struct IndexAndValueTypeVisitor { template Status Visit(const DataType& type, const std::shared_ptr&, Function&&) { - return Status::Invalid("Invalid index type and the type is ", type.name()); + return Status::Invalid("Invalid index type: ", type.name(), ". Expected integer."); } }; diff --git a/cpp/src/arrow/tensor/coo_converter.cc b/cpp/src/arrow/tensor/coo_converter.cc index 57e696a0b18b5..149db6518d3af 100644 --- a/cpp/src/arrow/tensor/coo_converter.cc +++ b/cpp/src/arrow/tensor/coo_converter.cc @@ -54,8 +54,8 @@ inline void IncrementRowMajorIndex(std::vector& coord, } template -void ConvertContinuousTensor(const Tensor& tensor, typename IndexType::c_type* indices, - typename ValueType::c_type* values) { +void ConvertRowMajorTensor(const Tensor& tensor, typename IndexType::c_type* indices, + typename ValueType::c_type* values) { using ValueCType = typename ValueType::c_type; using IndexCType = typename IndexType::c_type; @@ -77,12 +77,6 @@ void ConvertContinuousTensor(const Tensor& tensor, typename IndexType::c_type* i } } -template -void ConvertRowMajorTensor(const Tensor& tensor, typename IndexType::c_type* out_indices, - typename ValueType::c_type* out_values) { - ConvertContinuousTensor(tensor, out_indices, out_values); -} - // TODO(GH-47580): Correct column-major tensor conversion template void ConvertColumnMajorTensor(const Tensor& tensor, @@ -95,7 +89,7 @@ void ConvertColumnMajorTensor(const Tensor& tensor, const auto ndim = tensor.ndim(); std::vector indices(ndim * size); std::vector values(size); - ConvertContinuousTensor(tensor, indices.data(), values.data()); + ConvertRowMajorTensor(tensor, indices.data(), values.data()); // transpose indices for (int64_t i = 0; i < size; ++i) { From c8c66e1eafde86de432f6181b5bc915859bec1eb Mon Sep 17 00:00:00 2001 From: arash andishgar Date: Fri, 26 Sep 2025 12:11:26 +0330 Subject: [PATCH 3/4] apply rok suggestion --- cpp/src/arrow/sparse_tensor_test.cc | 24 ++++++++++++------------ cpp/src/arrow/tensor.cc | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 83597219c70a2..2d5b2561c5ded 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -423,7 +423,7 @@ class TestSparseCOOTensorCreationFromNegativeZero void FromVector() { std::vector data{ - -0.0, -0.0, 0.0, -0.0, 4.0, -0.0, -0.0, 0.0, -0.0, -1.0, -0.0, -0.0, + -0.0, -0.0, -0.0, -0.0, 4.0, -0.0, -0.0, -0.0, -0.0, -1.0, -0.0, -0.0, }; std::vector shape = {12}; auto buffer = Buffer::FromVector(data); @@ -442,9 +442,9 @@ class TestSparseCOOTensorCreationFromNegativeZero void FromContiguousTensor() { // clang-format off std::vector data{ - -0.0, -0.0, 0.0, + -0.0, -0.0, -0.0, -0.0, 4.0, -0.0, - -0.0, 0.0, -0.0, + -0.0, -0.0, -0.0, -1.0, -0.0, -0.0, }; // clang-format on @@ -465,10 +465,10 @@ class TestSparseCOOTensorCreationFromNegativeZero void FromNonContiguousTensor() { // clang-format off std::vector data{ - -0.0, -0.0, 0.0, 1.0, 2.0, - -0.0, 4.0, -0.0, 0.0, -0.0, - -0.0, 0.0, -0.0, 3.0, 4.0, - -1.0, -0.0, -0.0, 0.0, 0.0, + -0.0, -0.0, -0.0, 1.0, 2.0, + -0.0, 4.0, -0.0, -0.0, -0.0, + -0.0, -0.0, -0.0, 3.0, 4.0, + -1.0, -0.0, -0.0, -0.0, -0.0, }; // clang-format on std::vector shape = {4, 3}; @@ -973,9 +973,9 @@ class TestSparseCSRTensorCreationFromNegativeZero void FromTensor() { // clang-format off std::vector data{ - -0.0, -0.0, 0.0, + -0.0, -0.0, -0.0, -0.0, 4.0, -0.0, - -0.0, 0.0, -0.0, + -0.0, -0.0, -0.0, -1.0, -0.0, -0.0, }; // clang-format on @@ -1353,9 +1353,9 @@ class TestSparseCSCTensorCreationFromNegativeZero void FromTensor() { // clang-format off std::vector data{ - -0.0, -0.0, 0.0, + -0.0, -0.0, -0.0, -0.0, 4.0, -0.0, - -0.0, 0.0, -0.0, + -0.0, -0.0, -0.0, -1.0, -0.0, -0.0, }; // clang-format on @@ -1675,7 +1675,7 @@ class TestSparseCSFTensorCreationFromNegativeZero std::vector data{ -0.0, -0.0, -0.0, -0.0, 4.0, -0.0, -0.0, -0.0, - 0.0, -1.0, -0.0, -0.0, + -0.0, -1.0, -0.0, -0.0, }; // clang-format on std::vector shape = {3, 4}; diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index d3dd96b81095f..7483d079b0b66 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -490,8 +490,8 @@ int64_t StridedTensorCountNonZero(int dim_index, int64_t offset, const Tensor& t if (dim_index == tensor.ndim() - 1) { for (int64_t i = 0; i < tensor.shape()[dim_index]; ++i) { const auto* ptr = tensor.raw_data() + offset + i * tensor.strides()[dim_index]; - if (auto& elem = *reinterpret_cast(ptr); - internal::is_not_zero(elem)) { + auto& elem = *reinterpret_cast(ptr); + if (internal::is_not_zero(elem)) { ++nnz; } } From a90bd1e200f424f3339c2a2344e17a20889647e6 Mon Sep 17 00:00:00 2001 From: arash andishgar Date: Fri, 26 Sep 2025 15:17:21 +0330 Subject: [PATCH 4/4] Apply +0.0,-0.0,0.0 to relevant test cases --- cpp/src/arrow/sparse_tensor_test.cc | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 2d5b2561c5ded..635a858e54196 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -423,7 +423,7 @@ class TestSparseCOOTensorCreationFromNegativeZero void FromVector() { std::vector data{ - -0.0, -0.0, -0.0, -0.0, 4.0, -0.0, -0.0, -0.0, -0.0, -1.0, -0.0, -0.0, + -0.0, -0.0, 0.0, -0.0, 4.0, +0.0, -0.0, -0.0, -0.0, -1.0, 0.0, -0.0, }; std::vector shape = {12}; auto buffer = Buffer::FromVector(data); @@ -442,9 +442,9 @@ class TestSparseCOOTensorCreationFromNegativeZero void FromContiguousTensor() { // clang-format off std::vector data{ - -0.0, -0.0, -0.0, - -0.0, 4.0, -0.0, - -0.0, -0.0, -0.0, + -0.0, 0.0, -0.0, + +0.0, 4.0, -0.0, + -0.0, -0.0, 0.0, -1.0, -0.0, -0.0, }; // clang-format on @@ -465,10 +465,10 @@ class TestSparseCOOTensorCreationFromNegativeZero void FromNonContiguousTensor() { // clang-format off std::vector data{ - -0.0, -0.0, -0.0, 1.0, 2.0, - -0.0, 4.0, -0.0, -0.0, -0.0, - -0.0, -0.0, -0.0, 3.0, 4.0, - -1.0, -0.0, -0.0, -0.0, -0.0, + -0.0, -0.0, 0.0, 1.0, 2.0, + -0.0, 4.0, 0.0, 0.0, -0.0, + -0.0, +0.0, -0.0, 3.0, 4.0, + -1.0, -0.0, -0.0, -0.0, +0.0, }; // clang-format on std::vector shape = {4, 3}; @@ -973,10 +973,10 @@ class TestSparseCSRTensorCreationFromNegativeZero void FromTensor() { // clang-format off std::vector data{ - -0.0, -0.0, -0.0, + -0.0, -0.0, 0.0, -0.0, 4.0, -0.0, - -0.0, -0.0, -0.0, - -1.0, -0.0, -0.0, + +0.0, -0.0, -0.0, + -1.0, -0.0, +0.0, }; // clang-format on std::vector shape = {4, 3}; @@ -1353,9 +1353,9 @@ class TestSparseCSCTensorCreationFromNegativeZero void FromTensor() { // clang-format off std::vector data{ - -0.0, -0.0, -0.0, + -0.0, -0.0, +0.0, -0.0, 4.0, -0.0, - -0.0, -0.0, -0.0, + -0.0, 0.0, -0.0, -1.0, -0.0, -0.0, }; // clang-format on @@ -1673,9 +1673,9 @@ class TestSparseCSFTensorCreationFromNegativeZero void FromTensor() { // clang-format off std::vector data{ - -0.0, -0.0, -0.0, -0.0, - 4.0, -0.0, -0.0, -0.0, - -0.0, -1.0, -0.0, -0.0, + -0.0, -0.0, 0.0, -0.0, + 4.0, +0.0, -0.0, -0.0, + 0.0, -1.0, -0.0, -0.0, }; // clang-format on std::vector shape = {3, 4};