diff --git a/backends/aoti/slim/c10/core/WrapDimMinimal.h b/backends/aoti/slim/c10/core/WrapDimMinimal.h
new file mode 100644
index 00000000000..0a3acc3f54f
--- /dev/null
+++ b/backends/aoti/slim/c10/core/WrapDimMinimal.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+#include
+
+namespace executorch::backends::aoti::slim::c10 {
+
+namespace detail {
+
+/// Slow path for maybe_wrap_dim when dimension needs validation.
+template <typename T>
+inline T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar) {
+  ET_CHECK_MSG(
+      dim_post_expr >= 0,
+      "Rank cannot be negative but got %ld",
+      static_cast<long>(dim_post_expr));
+
+  if (dim_post_expr == 0) {
+    ET_CHECK_MSG(
+        wrap_scalar,
+        "Dimension specified as %ld but tensor has no dimensions",
+        static_cast<long>(dim));
+    // Recursively call with dim_post_expr=1
+    if (dim >= 0 && dim < 1) {
+      return dim;
+    } else if (dim >= -1 && dim < 0) {
+      return dim + 1;
+    }
+    ET_CHECK_MSG(
+        false,
+        "Dimension out of range (expected to be in range of [-1, 0], but got %ld)",
+        static_cast<long>(dim));
+  }
+
+  T min = dim_post_expr * -1;
+  T max = dim_post_expr - 1;
+  ET_CHECK_MSG(
+      min <= dim && dim <= max,
+      "Dimension out of range (expected to be in range of [%ld, %ld], but got %ld)",
+      static_cast<long>(min),
+      static_cast<long>(max),
+      static_cast<long>(dim));
+
+  // Defensive fallback; the check above fails for any dim that skipped the fast path
+  return dim < 0 ? dim + dim_post_expr : dim;
+}
+
+} // namespace detail
+
+/// Wraps a dimension index to handle negative indexing.
+/// For example, dim=-1 with dim_post_expr=3 returns 2.
+///
+/// @param dim The dimension index (may be negative).
+/// @param dim_post_expr The number of dimensions.
+/// @param wrap_scalar If true, allows wrapping for 0-dimensional tensors.
+/// @return The wrapped dimension index (always non-negative).
+template <typename T>
+inline T _maybe_wrap_dim(T dim, T dim_post_expr, bool wrap_scalar = true) {
+  // Inline the fast paths
+  if (SLIMTENSOR_LIKELY(dim_post_expr * -1 <= dim && dim < dim_post_expr)) {
+    if (dim < 0) {
+      return dim + dim_post_expr;
+    }
+    return dim;
+  }
+  // Check edge-cases out-of-line
+  return detail::maybe_wrap_dim_slow(
+      std::move(dim), std::move(dim_post_expr), wrap_scalar);
+}
+
+/// Wraps a dimension index for int64_t.
+inline int64_t
+maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar = true) {
+  return _maybe_wrap_dim(dim, dim_post_expr, wrap_scalar);
+}
+
+/// Wraps a dimension index for size_t.
+inline int64_t +maybe_wrap_dim(int64_t dim, size_t dim_post_expr, bool wrap_scalar = true) { + return _maybe_wrap_dim(dim, static_cast(dim_post_expr), wrap_scalar); +} + +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/core/targets.bzl b/backends/aoti/slim/c10/core/targets.bzl index 5a9b9558938..65c6aaa7707 100644 --- a/backends/aoti/slim/c10/core/targets.bzl +++ b/backends/aoti/slim/c10/core/targets.bzl @@ -67,6 +67,19 @@ def define_common_targets(): ], ) + # Header-only library for WrapDimMinimal + runtime.cxx_library( + name = "wrap_dim_minimal", + headers = [ + "WrapDimMinimal.h", + ], + visibility = ["@EXECUTORCH_CLIENTS"], + exported_deps = [ + "//executorch/backends/aoti/slim/c10/macros:macros", + "//executorch/runtime/platform:platform", + ], + ) + # Combined c10 core library runtime.cxx_library( name = "core", @@ -77,5 +90,6 @@ def define_common_targets(): ":device_type", ":scalar_type", ":sizes_and_strides", + ":wrap_dim_minimal", ], ) diff --git a/backends/aoti/slim/core/SlimTensor.h b/backends/aoti/slim/core/SlimTensor.h index 92b34e8a3e8..0061b0e08b9 100644 --- a/backends/aoti/slim/core/SlimTensor.h +++ b/backends/aoti/slim/core/SlimTensor.h @@ -10,9 +10,12 @@ #include #include +#include #include #include +#include + #include #include #include @@ -254,22 +257,113 @@ class SlimTensor { } /** - * Set sizes and strides together. + * Set sizes, strides, and storage offset together. */ - void set_sizes_and_strides(IntArrayRef sizes, IntArrayRef strides) { + void set_sizes_and_strides( + IntArrayRef sizes, + IntArrayRef strides, + std::optional storage_offset = std::nullopt) { + const size_t new_dim = sizes.size(); ET_CHECK_MSG( - sizes.size() == strides.size(), - "sizes (%zu) and strides (%zu) must have the same length", - sizes.size(), + new_dim == strides.size(), + "dimensionality of sizes (%zu) must match dimensionality of strides (%zu)", + new_dim, strides.size()); - sizes_and_strides_.set_sizes(sizes); - sizes_and_strides_.set_strides(strides); + std::vector new_sizes = toVec(sizes); + std::vector new_strides = toVec(strides); + + // stride calculation logic + bool overflowed = false; + if (new_dim > 0) { + for (int64_t dim = new_dim - 1; dim >= 0; dim--) { + if (strides[dim] >= 0) { + new_strides[dim] = strides[dim]; + } else { + // for negative strides + if (dim == new_dim - 1) { + new_strides[dim] = 1; + } else { + overflowed |= ::c10::mul_overflows( + new_strides[dim + 1], + std::max(new_sizes[dim + 1], 1), + &new_strides[dim]); + } + } + } + } + ET_CHECK_MSG(!overflowed, "Stride calculation overflowed"); + + sizes_and_strides_.set_sizes(makeArrayRef(new_sizes)); + sizes_and_strides_.set_strides(makeArrayRef(new_strides)); + if (storage_offset.has_value()) { + storage_offset_ = *storage_offset; + } refresh_numel(); refresh_contiguous(); } + /** + * Set sizes to a contiguous layout (computes strides automatically). + */ + void set_sizes_contiguous(IntArrayRef sizes) { + std::vector contig_strides = compute_contiguous_strides(sizes); + set_sizes_and_strides(sizes, makeArrayRef(contig_strides)); + } + + // ========================================================================= + // View Operations + // ========================================================================= + + /** + * Returns a view of the tensor with the specified sizes, strides, and + * storage offset. The returned tensor shares the same underlying storage. + * + * @param sizes The sizes of the view. + * @param strides The strides of the view. 
+ * @param storage_offset Offset into storage in number of elements. + * @return A new SlimTensor that is a view of this tensor. + */ + inline SlimTensor as_strided( + IntArrayRef sizes, + IntArrayRef strides, + int64_t storage_offset) const; + + /** + * Overload for initializer lists. + */ + inline SlimTensor as_strided( + std::initializer_list sizes, + std::initializer_list strides, + int64_t storage_offset) const { + return as_strided( + makeArrayRef(sizes), makeArrayRef(strides), storage_offset); + } + + /** + * Modifies this tensor in-place to have the specified sizes, strides, and + * storage offset. The underlying storage remains unchanged. + * + * @param sizes The new sizes. + * @param strides The new strides. + * @param storage_offset New offset into storage in number of elements. + * @return Reference to this tensor. + */ + inline SlimTensor& + as_strided_(IntArrayRef sizes, IntArrayRef strides, int64_t storage_offset); + + /** + * Overload for initializer lists. + */ + inline SlimTensor& as_strided_( + std::initializer_list sizes, + std::initializer_list strides, + int64_t storage_offset) { + return as_strided_( + makeArrayRef(sizes), makeArrayRef(strides), storage_offset); + } + // ========================================================================= // Copy Operation // ========================================================================= @@ -278,7 +372,7 @@ class SlimTensor { * Copy data from another tensor to this tensor. * * Both tensors must have the same numel and dtype. - * Supports CPU-to-CPU and cross-device copies (CPU↔CUDA, CUDA↔CUDA). + * Currently only supports CPU-to-CPU copy (contiguous tensors only). * * @param other The source tensor to copy from * @return Reference to this tensor @@ -371,3 +465,7 @@ class SlimTensor { }; } // namespace executorch::backends::aoti::slim + +// Include view operations implementations (must be after SlimTensor class +// definition) +#include diff --git a/backends/aoti/slim/core/SlimTensorView-incl.h b/backends/aoti/slim/core/SlimTensorView-incl.h new file mode 100644 index 00000000000..f0ed8bc087c --- /dev/null +++ b/backends/aoti/slim/core/SlimTensorView-incl.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace executorch::backends::aoti::slim { + +inline SlimTensor SlimTensor::as_strided( + IntArrayRef sizes, + IntArrayRef strides, + int64_t storage_offset) const { + SlimTensor result = *this; + result.as_strided_(sizes, strides, storage_offset); + return result; +} + +inline SlimTensor& SlimTensor::as_strided_( + IntArrayRef sizes, + IntArrayRef strides, + int64_t storage_offset) { + ET_CHECK_MSG( + sizes.size() == strides.size(), + "as_strided: number of sizes (%zu) must equal number of strides (%zu)", + sizes.size(), + strides.size()); + + for (size_t i = 0; i < sizes.size(); ++i) { + ET_CHECK_MSG( + sizes[i] >= 0, + "as_strided: size at dimension %zu is negative: %ld", + i, + static_cast(sizes[i])); + } + + ET_CHECK_MSG( + storage_offset >= 0, + "as_strided: storage_offset must be non-negative, got: %ld", + static_cast(storage_offset)); + + this->set_sizes_and_strides(sizes, strides, storage_offset); + return *this; +} + +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl index cc74b01b444..408738edd35 100644 --- a/backends/aoti/slim/core/targets.bzl +++ b/backends/aoti/slim/core/targets.bzl @@ -26,6 +26,7 @@ def define_common_targets(): name = "slimtensor", headers = [ "SlimTensor.h", + "SlimTensorView-incl.h", ], visibility = ["@EXECUTORCH_CLIENTS"], exported_deps = [ @@ -34,9 +35,10 @@ def define_common_targets(): "//executorch/backends/aoti/slim/c10/core:device", "//executorch/backends/aoti/slim/c10/core:scalar_type", "//executorch/backends/aoti/slim/c10/core:sizes_and_strides", + "//executorch/backends/aoti/slim/c10/core:wrap_dim_minimal", "//executorch/backends/aoti/slim/util:array_ref_util", "//executorch/backends/aoti/slim/util:size_util", - "//executorch/backends/aoti/slim/c10/cuda:exception", "//executorch/runtime/platform:platform", + "//executorch/backends/aoti/slim/c10/cuda:exception", ], ) diff --git a/backends/aoti/slim/core/test/targets.bzl b/backends/aoti/slim/core/test/targets.bzl index d0991708c7f..e2bd116ffc9 100644 --- a/backends/aoti/slim/core/test/targets.bzl +++ b/backends/aoti/slim/core/test/targets.bzl @@ -7,8 +7,17 @@ def get_backend_mode(): def define_common_targets(): """Define test targets for SlimTensor core module.""" + runtime.cxx_test( + name = "test_slimtensor_dtypes", + srcs = [ + "test_slimtensor_dtypes.cpp", + ], + deps = [ + "//executorch/backends/aoti/slim/factory:empty", + ], + ) - # GPU storage test with CUDA support + # Backend mode specific tests for backend_mode in get_backend_mode(): backend_suffix = "_" + backend_mode if backend_mode == "cuda" else "" @@ -57,12 +66,14 @@ def define_common_targets(): **backend_kwargs ) - runtime.cxx_test( - name = "test_slimtensor_dtypes", - srcs = [ - "test_slimtensor_dtypes.cpp", - ], - deps = [ - "//executorch/backends/aoti/slim/factory:empty", - ], - ) + runtime.cxx_test( + name = "test_as_strided" + backend_suffix, + srcs = [ + "test_as_strided.cpp", + ], + deps = [ + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/factory:empty", + ], + **backend_kwargs + ) diff --git a/backends/aoti/slim/core/test/test_as_strided.cpp b/backends/aoti/slim/core/test/test_as_strided.cpp new file mode 100644 index 00000000000..f73104b5ba0 --- /dev/null +++ b/backends/aoti/slim/core/test/test_as_strided.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include + +#ifdef CUDA_AVAILABLE +#include +#endif + +namespace executorch::backends::aoti::slim { + +// ============================================================================= +// Device trait for parameterized tests +// ============================================================================= + +struct CPUDevice { + static c10::Device device() { + return CPU_DEVICE; + } + static constexpr bool is_cuda = false; +}; + +#ifdef CUDA_AVAILABLE +struct CUDADevice { + static c10::Device device() { + return DEFAULT_CUDA_DEVICE; + } + static constexpr bool is_cuda = true; +}; +#endif + +// ============================================================================= +// Test fixture for parameterized device tests +// ============================================================================= + +template +class AsStridedDeviceTest : public ::testing::Test { + protected: + static c10::Device device() { + return DeviceTrait::device(); + } + + SlimTensor make_tensor( + std::initializer_list sizes, + c10::ScalarType dtype = c10::ScalarType::Float) { + return empty(sizes, dtype, device()); + } + + // Helper to initialize tensor data from CPU (handles both CPU and CUDA) + template + void fill_sequential(SlimTensor& tensor, size_t count) { + if constexpr (DeviceTrait::is_cuda) { +#ifdef CUDA_AVAILABLE + std::vector cpu_data(count); + for (size_t i = 0; i < count; ++i) { + cpu_data[i] = static_cast(i); + } + DeviceTraits::memcpy( + tensor.data_ptr(), + cpu_data.data(), + count * sizeof(T), + DEFAULT_CUDA_DEVICE, + CPU_DEVICE); +#endif + } else { + T* data = static_cast(tensor.data_ptr()); + for (size_t i = 0; i < count; ++i) { + data[i] = static_cast(i); + } + } + } + + // Helper to read a value from tensor (handles both CPU and CUDA) + template + T read_value(void* ptr, size_t offset = 0) { + if constexpr (DeviceTrait::is_cuda) { +#ifdef CUDA_AVAILABLE + T value; + DeviceTraits::memcpy( + &value, + static_cast(ptr) + offset, + sizeof(T), + CPU_DEVICE, + DEFAULT_CUDA_DEVICE); + return value; +#else + return T{}; +#endif + } else { + return *(static_cast(ptr) + offset); + } + } + + // Helper to write a value to tensor (handles both CPU and CUDA) + template + void write_value(void* ptr, T value, size_t offset = 0) { + if constexpr (DeviceTrait::is_cuda) { +#ifdef CUDA_AVAILABLE + DeviceTraits::memcpy( + static_cast(ptr) + offset, + &value, + sizeof(T), + DEFAULT_CUDA_DEVICE, + CPU_DEVICE); +#endif + } else { + *(static_cast(ptr) + offset) = value; + } + } +}; + +// Type list for parameterized tests +using DeviceTypes = ::testing::Types< + CPUDevice +#ifdef CUDA_AVAILABLE + , + CUDADevice +#endif + >; + +TYPED_TEST_SUITE(AsStridedDeviceTest, DeviceTypes); + +// ============================================================================= +// as_strided Basic Tests +// ============================================================================= + +TYPED_TEST(AsStridedDeviceTest, BasicView) { + SlimTensor tensor = this->make_tensor({4, 4}); + this->template fill_sequential(tensor, 16); + + SlimTensor view = tensor.as_strided({2, 2}, {4, 1}, 0); + + EXPECT_EQ(view.size(0), 2); + EXPECT_EQ(view.size(1), 2); + EXPECT_EQ(view.stride(0), 4); + EXPECT_EQ(view.stride(1), 1); + EXPECT_EQ(view.storage_offset(), 0); + EXPECT_EQ(view.numel(), 4); + + // View should share storage + EXPECT_EQ(view.storage().get(), 
tensor.storage().get()); + + // Verify data access through view + EXPECT_FLOAT_EQ(this->template read_value(view.data_ptr(), 0), 0.0f); + EXPECT_FLOAT_EQ(this->template read_value(view.data_ptr(), 1), 1.0f); + EXPECT_FLOAT_EQ(this->template read_value(tensor.data_ptr(), 4), 4.0f); +} + +TYPED_TEST(AsStridedDeviceTest, WithStorageOffset) { + SlimTensor tensor = this->make_tensor({4, 4}); + this->template fill_sequential(tensor, 16); + + SlimTensor view = tensor.as_strided({2, 3}, {4, 1}, 5); + + EXPECT_EQ(view.storage_offset(), 5); + EXPECT_EQ(view.numel(), 6); + + EXPECT_FLOAT_EQ(this->template read_value(view.data_ptr(), 0), 5.0f); +} + +TYPED_TEST(AsStridedDeviceTest, NonContiguousStrides) { + SlimTensor tensor = this->make_tensor({6}); + this->template fill_sequential(tensor, 6); + + SlimTensor view = tensor.as_strided({3}, {2}, 0); + + EXPECT_EQ(view.size(0), 3); + EXPECT_EQ(view.stride(0), 2); + EXPECT_EQ(view.numel(), 3); + EXPECT_FALSE(view.is_contiguous()); + + // Access values through stride (stride=2, so indices 0, 2, 4) + EXPECT_FLOAT_EQ( + this->template read_value(view.data_ptr(), 0 * 2), 0.0f); + EXPECT_FLOAT_EQ( + this->template read_value(view.data_ptr(), 1 * 2), 2.0f); + EXPECT_FLOAT_EQ( + this->template read_value(view.data_ptr(), 2 * 2), 4.0f); +} + +TYPED_TEST(AsStridedDeviceTest, TransposeView) { + SlimTensor tensor = this->make_tensor({3, 4}); + this->template fill_sequential(tensor, 12); + + // Create transposed view (4x3) by swapping sizes and strides + SlimTensor transposed = tensor.as_strided({4, 3}, {1, 4}, 0); + + EXPECT_EQ(transposed.size(0), 4); + EXPECT_EQ(transposed.size(1), 3); + EXPECT_EQ(transposed.stride(0), 1); + EXPECT_EQ(transposed.stride(1), 4); + EXPECT_FALSE(transposed.is_contiguous()); +} + +TYPED_TEST(AsStridedDeviceTest, SharedStorageModification) { + SlimTensor tensor = this->make_tensor({4}); + this->template fill_sequential(tensor, 4); + + SlimTensor view = tensor.as_strided({2}, {1}, 1); + + // Modify through view + this->template write_value(view.data_ptr(), 100.0f, 0); + this->template write_value(view.data_ptr(), 200.0f, 1); + + // Changes should be visible in original tensor + EXPECT_FLOAT_EQ( + this->template read_value(tensor.data_ptr(), 1), 100.0f); + EXPECT_FLOAT_EQ( + this->template read_value(tensor.data_ptr(), 2), 200.0f); +} + +// ============================================================================= +// as_strided_ In-Place Tests +// ============================================================================= + +TYPED_TEST(AsStridedDeviceTest, InPlaceModification) { + SlimTensor tensor = this->make_tensor({4, 4}); + void* original_data = tensor.data_ptr(); + + tensor.as_strided_({2, 8}, {8, 1}, 0); + + EXPECT_EQ(tensor.size(0), 2); + EXPECT_EQ(tensor.size(1), 8); + EXPECT_EQ(tensor.stride(0), 8); + EXPECT_EQ(tensor.stride(1), 1); + EXPECT_EQ(tensor.numel(), 16); + EXPECT_TRUE(tensor.is_contiguous()); + + EXPECT_EQ(tensor.data_ptr(), original_data); +} + +TYPED_TEST(AsStridedDeviceTest, InPlaceWithOffset) { + SlimTensor tensor = this->make_tensor({16}); + + tensor.as_strided_({4}, {1}, 4); + + EXPECT_EQ(tensor.size(0), 4); + EXPECT_EQ(tensor.storage_offset(), 4); + EXPECT_EQ(tensor.numel(), 4); +} + +// ============================================================================= +// as_strided Edge Cases +// ============================================================================= + +TYPED_TEST(AsStridedDeviceTest, ZeroDimView) { + SlimTensor tensor = this->make_tensor({4}); + this->template 
write_value(tensor.data_ptr(), 42.0f, 2); + + SlimTensor scalar_view = tensor.as_strided({}, {}, 2); + + EXPECT_EQ(scalar_view.dim(), 0); + EXPECT_EQ(scalar_view.numel(), 1); + EXPECT_EQ(scalar_view.storage_offset(), 2); + + EXPECT_FLOAT_EQ( + this->template read_value(scalar_view.data_ptr(), 0), 42.0f); +} + +TYPED_TEST(AsStridedDeviceTest, SingleElementView) { + SlimTensor tensor = this->make_tensor({3, 3}); + this->template fill_sequential(tensor, 9); + + SlimTensor view = tensor.as_strided({1, 1}, {3, 1}, 4); + + EXPECT_EQ(view.numel(), 1); + + EXPECT_FLOAT_EQ(this->template read_value(view.data_ptr(), 0), 4.0f); +} + +TYPED_TEST(AsStridedDeviceTest, ZeroStridesBroadcast) { + SlimTensor tensor = this->make_tensor({4}); + this->template write_value(tensor.data_ptr(), 42.0f, 0); + + SlimTensor broadcast = tensor.as_strided({3, 3}, {0, 0}, 0); + + EXPECT_EQ(broadcast.size(0), 3); + EXPECT_EQ(broadcast.size(1), 3); + EXPECT_EQ(broadcast.stride(0), 0); + EXPECT_EQ(broadcast.stride(1), 0); + EXPECT_EQ(broadcast.numel(), 9); + + EXPECT_FLOAT_EQ( + this->template read_value(broadcast.data_ptr(), 0), 42.0f); +} + +// ============================================================================= +// as_strided with Different DTypes +// ============================================================================= + +TYPED_TEST(AsStridedDeviceTest, Int64View) { + SlimTensor tensor = this->make_tensor({8}, c10::ScalarType::Long); + + // Fill with values multiplied by 10 + if constexpr (TypeParam::is_cuda) { +#ifdef CUDA_AVAILABLE + std::vector cpu_data(8); + for (size_t i = 0; i < 8; ++i) { + cpu_data[i] = static_cast(i * 10); + } + DeviceTraits::memcpy( + tensor.data_ptr(), + cpu_data.data(), + 8 * sizeof(int64_t), + DEFAULT_CUDA_DEVICE, + CPU_DEVICE); +#endif + } else { + int64_t* data = static_cast(tensor.data_ptr()); + for (size_t i = 0; i < 8; ++i) { + data[i] = static_cast(i * 10); + } + } + + SlimTensor view = tensor.as_strided({2, 3}, {3, 1}, 1); + + EXPECT_EQ(view.dtype(), c10::ScalarType::Long); + EXPECT_EQ(this->template read_value(view.data_ptr(), 0), 10); +} + +TYPED_TEST(AsStridedDeviceTest, Int8View) { + SlimTensor tensor = this->make_tensor({16}, c10::ScalarType::Char); + + if constexpr (TypeParam::is_cuda) { +#ifdef CUDA_AVAILABLE + std::vector cpu_data(16); + for (size_t i = 0; i < 16; ++i) { + cpu_data[i] = static_cast(i); + } + DeviceTraits::memcpy( + tensor.data_ptr(), + cpu_data.data(), + 16 * sizeof(int8_t), + DEFAULT_CUDA_DEVICE, + CPU_DEVICE); +#endif + } else { + int8_t* data = static_cast(tensor.data_ptr()); + for (size_t i = 0; i < 16; ++i) { + data[i] = static_cast(i); + } + } + + SlimTensor view = tensor.as_strided({4, 2}, {4, 1}, 2); + + EXPECT_EQ(view.dtype(), c10::ScalarType::Char); + EXPECT_EQ(view.itemsize(), 1); + EXPECT_EQ(this->template read_value(view.data_ptr(), 0), 2); +} + +// ============================================================================= +// Multiple Views Share Storage +// ============================================================================= + +TYPED_TEST(AsStridedDeviceTest, MultipleViews) { + SlimTensor tensor = this->make_tensor({12}); + this->template fill_sequential(tensor, 12); + + SlimTensor view1 = tensor.as_strided({4}, {1}, 0); + SlimTensor view2 = tensor.as_strided({4}, {1}, 4); + SlimTensor view3 = tensor.as_strided({4}, {1}, 8); + + EXPECT_EQ(view1.storage().get(), tensor.storage().get()); + EXPECT_EQ(view2.storage().get(), tensor.storage().get()); + EXPECT_EQ(view3.storage().get(), tensor.storage().get()); + + 
EXPECT_FLOAT_EQ(this->template read_value(view1.data_ptr(), 0), 0.0f); + EXPECT_FLOAT_EQ(this->template read_value(view2.data_ptr(), 0), 4.0f); + EXPECT_FLOAT_EQ(this->template read_value(view3.data_ptr(), 0), 8.0f); + + // Modify through one view + this->template write_value(view2.data_ptr(), 100.0f, 0); + + // Visible in original + EXPECT_FLOAT_EQ( + this->template read_value(tensor.data_ptr(), 4), 100.0f); +} + +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/factory/FromBlob.h b/backends/aoti/slim/factory/FromBlob.h new file mode 100644 index 00000000000..b0c659419e9 --- /dev/null +++ b/backends/aoti/slim/factory/FromBlob.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace executorch::backends::aoti::slim { + +/// Creates a SlimTensor that wraps external memory without taking ownership. +/// The returned tensor does NOT own the underlying storage; the caller is +/// responsible for keeping the data alive for the lifetime of the tensor. +/// +/// @param data Pointer to external memory (must not be null). +/// @param sizes The sizes of each dimension. +/// @param strides The strides of each dimension. +/// @param dtype The scalar type of tensor elements. +/// @param device The device where the data resides. +/// @param storage_offset Offset into storage in number of elements. +/// @return A new SlimTensor with non-owning storage. +inline SlimTensor from_blob( + void* data, + IntArrayRef sizes, + IntArrayRef strides, + c10::ScalarType dtype, + const c10::Device& device = CPU_DEVICE, + int64_t storage_offset = 0) { + ET_CHECK_MSG(data != nullptr, "from_blob: data pointer cannot be nullptr"); + + size_t nbytes = compute_storage_nbytes( + sizes, strides, c10::elementSize(dtype), storage_offset); + + Storage storage(new MaybeOwningStorage(device, data, nbytes)); + return SlimTensor(std::move(storage), sizes, strides, dtype, storage_offset); +} + +/// Creates a contiguous SlimTensor that wraps external memory. +/// Computes contiguous strides automatically. +/// +/// @param data Pointer to external memory (must not be null). +/// @param sizes The sizes of each dimension. +/// @param dtype The scalar type of tensor elements. +/// @param device The device where the data resides. +/// @param storage_offset Offset into storage in number of elements. +/// @return A new SlimTensor with non-owning storage and contiguous strides. +inline SlimTensor from_blob( + void* data, + IntArrayRef sizes, + c10::ScalarType dtype, + const c10::Device& device = CPU_DEVICE, + int64_t storage_offset = 0) { + std::vector contig_strides = compute_contiguous_strides(sizes); + return from_blob( + data, sizes, makeArrayRef(contig_strides), dtype, device, storage_offset); +} + +/// Creates a contiguous SlimTensor from an initializer list of sizes. +/// +/// @param data Pointer to external memory (must not be null). +/// @param sizes The sizes as an initializer list. +/// @param dtype The scalar type of tensor elements. +/// @param device The device where the data resides. +/// @param storage_offset Offset into storage in number of elements. +/// @return A new SlimTensor with non-owning storage and contiguous strides. 
+inline SlimTensor from_blob( + void* data, + std::initializer_list sizes, + c10::ScalarType dtype, + const c10::Device& device = CPU_DEVICE, + int64_t storage_offset = 0) { + return from_blob(data, makeArrayRef(sizes), dtype, device, storage_offset); +} + +/// Creates a SlimTensor from initializer lists for both sizes and strides. +/// +/// @param data Pointer to external memory (must not be null). +/// @param sizes The sizes as an initializer list. +/// @param strides The strides as an initializer list. +/// @param dtype The scalar type of tensor elements. +/// @param device The device where the data resides. +/// @param storage_offset Offset into storage in number of elements. +/// @return A new SlimTensor with non-owning storage. +inline SlimTensor from_blob( + void* data, + std::initializer_list sizes, + std::initializer_list strides, + c10::ScalarType dtype, + const c10::Device& device = CPU_DEVICE, + int64_t storage_offset = 0) { + return from_blob( + data, + makeArrayRef(sizes), + makeArrayRef(strides), + dtype, + device, + storage_offset); +} + +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/factory/targets.bzl b/backends/aoti/slim/factory/targets.bzl index d6dc41aa877..5b10967e166 100644 --- a/backends/aoti/slim/factory/targets.bzl +++ b/backends/aoti/slim/factory/targets.bzl @@ -16,3 +16,16 @@ def define_common_targets(): "//executorch/backends/aoti/slim/util:size_util", ], ) + + runtime.cxx_library( + name = "from_blob", + headers = [ + "FromBlob.h", + ], + visibility = ["@EXECUTORCH_CLIENTS"], + exported_deps = [ + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/util:array_ref_util", + "//executorch/backends/aoti/slim/util:size_util", + ], + ) diff --git a/backends/aoti/slim/factory/test/targets.bzl b/backends/aoti/slim/factory/test/targets.bzl index 7bad3067cc0..668d7f75385 100644 --- a/backends/aoti/slim/factory/test/targets.bzl +++ b/backends/aoti/slim/factory/test/targets.bzl @@ -31,3 +31,16 @@ def define_common_targets(): ], **backend_kwargs ) + + runtime.cxx_test( + name = "test_from_blob" + backend_suffix, + srcs = [ + "test_from_blob.cpp", + ], + deps = [ + "//executorch/backends/aoti/slim/core:storage", + "//executorch/backends/aoti/slim/factory:from_blob", + "//executorch/backends/aoti/slim/factory:empty", + ], + **backend_kwargs + ) diff --git a/backends/aoti/slim/factory/test/test_from_blob.cpp b/backends/aoti/slim/factory/test/test_from_blob.cpp new file mode 100644 index 00000000000..16d43d545f3 --- /dev/null +++ b/backends/aoti/slim/factory/test/test_from_blob.cpp @@ -0,0 +1,782 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include + +#ifdef CUDA_AVAILABLE +#include +#endif + +namespace executorch::backends::aoti::slim { + +// ============================================================================= +// from_blob Basic Tests +// ============================================================================= + +TEST(FromBlobTest, BasicConstruction) { + constexpr size_t kNumFloats = 24; + float external_data[kNumFloats]; + + // Initialize external data + for (size_t i = 0; i < kNumFloats; ++i) { + external_data[i] = static_cast(i) * 1.5f; + } + + SlimTensor tensor = + from_blob(external_data, {2, 3, 4}, c10::ScalarType::Float); + + // Verify tensor properties + EXPECT_EQ(tensor.numel(), kNumFloats); + EXPECT_EQ(tensor.dim(), 3); + EXPECT_EQ(tensor.size(0), 2); + EXPECT_EQ(tensor.size(1), 3); + EXPECT_EQ(tensor.size(2), 4); + EXPECT_EQ(tensor.dtype(), c10::ScalarType::Float); + EXPECT_TRUE(tensor.is_cpu()); + EXPECT_TRUE(tensor.is_contiguous()); + EXPECT_EQ(tensor.storage_offset(), 0); + + // Verify data pointer points to external data + EXPECT_EQ(tensor.data_ptr(), static_cast(external_data)); + + // Verify data is accessible through tensor + float* data = static_cast(tensor.data_ptr()); + for (size_t i = 0; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(data[i], static_cast(i) * 1.5f); + } +} + +TEST(FromBlobTest, ModifyThroughTensor) { + constexpr size_t kNumFloats = 16; + float external_data[kNumFloats]; + + for (size_t i = 0; i < kNumFloats; ++i) { + external_data[i] = 0.0f; + } + + SlimTensor tensor = from_blob(external_data, {4, 4}, c10::ScalarType::Float); + + // Modify through tensor + float* data = static_cast(tensor.data_ptr()); + for (size_t i = 0; i < kNumFloats; ++i) { + data[i] = static_cast(i) * 10.0f; + } + + // Verify external data was modified + for (size_t i = 0; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(external_data[i], static_cast(i) * 10.0f); + } +} + +TEST(FromBlobTest, ExternalDataSurvivesTensorDestruction) { + constexpr size_t kNumFloats = 8; + float external_data[kNumFloats]; + + for (size_t i = 0; i < kNumFloats; ++i) { + external_data[i] = static_cast(i) * 2.0f; + } + + { + SlimTensor tensor = + from_blob(external_data, {2, 4}, c10::ScalarType::Float); + + // Modify through tensor + float* data = static_cast(tensor.data_ptr()); + data[0] = 999.0f; + } + // tensor is destroyed here + + // External data should still be accessible + EXPECT_FLOAT_EQ(external_data[0], 999.0f); + for (size_t i = 1; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(external_data[i], static_cast(i) * 2.0f); + } +} + +// ============================================================================= +// from_blob with Strides Tests +// ============================================================================= + +TEST(FromBlobTest, CustomStrides) { + constexpr size_t kBufferSize = 16; + float external_data[kBufferSize]; + + for (size_t i = 0; i < kBufferSize; ++i) { + external_data[i] = static_cast(i); + } + + // Create a 2x3 tensor with custom strides (transpose-like) + SlimTensor tensor = from_blob( + external_data, + {2, 3}, + {1, 4}, // Non-contiguous strides + c10::ScalarType::Float); + + EXPECT_EQ(tensor.size(0), 2); + EXPECT_EQ(tensor.size(1), 3); + EXPECT_EQ(tensor.stride(0), 1); + EXPECT_EQ(tensor.stride(1), 4); + EXPECT_FALSE(tensor.is_contiguous()); +} + +TEST(FromBlobTest, WithStorageOffset) { + constexpr size_t kBufferSize = 20; + float external_data[kBufferSize]; + + for (size_t i = 0; i < kBufferSize; ++i) { + external_data[i] = static_cast(i); + } + + // Create 
tensor with offset of 5 elements + SlimTensor tensor = from_blob( + external_data, + {3, 4}, + c10::ScalarType::Float, + CPU_DEVICE, + 5); // storage_offset = 5 + + EXPECT_EQ(tensor.storage_offset(), 5); + EXPECT_EQ(tensor.numel(), 12); + + // data_ptr() should point to external_data + 5 * sizeof(float) + EXPECT_EQ(tensor.data_ptr(), static_cast(external_data + 5)); + + // Verify first element is external_data[5] + float* data = static_cast(tensor.data_ptr()); + EXPECT_FLOAT_EQ(data[0], 5.0f); +} + +// ============================================================================= +// from_blob with Different DTypes Tests +// ============================================================================= + +TEST(FromBlobTest, Int64Dtype) { + constexpr size_t kNumElements = 6; + int64_t external_data[kNumElements] = {10, 20, 30, 40, 50, 60}; + + SlimTensor tensor = from_blob(external_data, {2, 3}, c10::ScalarType::Long); + + EXPECT_EQ(tensor.dtype(), c10::ScalarType::Long); + EXPECT_EQ(tensor.itemsize(), sizeof(int64_t)); + EXPECT_EQ(tensor.numel(), kNumElements); + + int64_t* data = static_cast(tensor.data_ptr()); + EXPECT_EQ(data[0], 10); + EXPECT_EQ(data[5], 60); +} + +TEST(FromBlobTest, Int8Dtype) { + constexpr size_t kNumElements = 10; + int8_t external_data[kNumElements]; + + for (size_t i = 0; i < kNumElements; ++i) { + external_data[i] = static_cast(i); + } + + SlimTensor tensor = from_blob(external_data, {10}, c10::ScalarType::Char); + + EXPECT_EQ(tensor.dtype(), c10::ScalarType::Char); + EXPECT_EQ(tensor.itemsize(), sizeof(int8_t)); + EXPECT_EQ(tensor.dim(), 1); + + int8_t* data = static_cast(tensor.data_ptr()); + for (size_t i = 0; i < kNumElements; ++i) { + EXPECT_EQ(data[i], static_cast(i)); + } +} + +TEST(FromBlobTest, BoolDtype) { + bool external_data[] = {true, false, true, false, true, true}; + + SlimTensor tensor = from_blob(external_data, {2, 3}, c10::ScalarType::Bool); + + EXPECT_EQ(tensor.dtype(), c10::ScalarType::Bool); + EXPECT_EQ(tensor.numel(), 6); + + bool* data = static_cast(tensor.data_ptr()); + EXPECT_TRUE(data[0]); + EXPECT_FALSE(data[1]); + EXPECT_TRUE(data[2]); +} + +// ============================================================================= +// from_blob Copy Tests +// ============================================================================= + +TEST(FromBlobTest, CopyToOwnedTensor) { + constexpr size_t kNumFloats = 12; + float external_data[kNumFloats]; + + for (size_t i = 0; i < kNumFloats; ++i) { + external_data[i] = static_cast(i) * 3.0f; + } + + SlimTensor src = from_blob(external_data, {3, 4}, c10::ScalarType::Float); + SlimTensor dst = empty({3, 4}, c10::ScalarType::Float); + + dst.copy_(src); + + // Verify dst has the data + float* dst_data = static_cast(dst.data_ptr()); + for (size_t i = 0; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(dst_data[i], static_cast(i) * 3.0f); + } + + // Verify dst is independent of src + external_data[0] = 999.0f; + EXPECT_FLOAT_EQ(dst_data[0], 0.0f); +} + +TEST(FromBlobTest, TensorCopyToFromBlob) { + constexpr size_t kNumFloats = 6; + float src_data[kNumFloats]; + float dst_data[kNumFloats]; + + for (size_t i = 0; i < kNumFloats; ++i) { + src_data[i] = static_cast(i) * 5.0f; + dst_data[i] = 0.0f; + } + + SlimTensor src = from_blob(src_data, {2, 3}, c10::ScalarType::Float); + SlimTensor dst = from_blob(dst_data, {2, 3}, c10::ScalarType::Float); + + dst.copy_(src); + + // Verify dst_data was modified + for (size_t i = 0; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(dst_data[i], static_cast(i) * 5.0f); + } +} + +// 
============================================================================= +// from_blob Shared Storage Tests +// ============================================================================= + +TEST(FromBlobTest, CopiedTensorSharesStorage) { + constexpr size_t kNumFloats = 8; + float external_data[kNumFloats]; + + for (size_t i = 0; i < kNumFloats; ++i) { + external_data[i] = static_cast(i); + } + + SlimTensor tensor1 = from_blob(external_data, {2, 4}, c10::ScalarType::Float); + SlimTensor tensor2 = tensor1; // Copy constructor + + // Both should point to same storage + EXPECT_EQ(tensor1.data_ptr(), tensor2.data_ptr()); + EXPECT_EQ(tensor1.storage().get(), tensor2.storage().get()); + + // Modification through tensor2 affects tensor1 + float* data2 = static_cast(tensor2.data_ptr()); + data2[0] = 100.0f; + + float* data1 = static_cast(tensor1.data_ptr()); + EXPECT_FLOAT_EQ(data1[0], 100.0f); + + // And external data + EXPECT_FLOAT_EQ(external_data[0], 100.0f); +} + +// ============================================================================= +// from_blob with ArrayRef Tests +// ============================================================================= + +TEST(FromBlobTest, WithArrayRef) { + constexpr size_t kNumFloats = 6; + float external_data[kNumFloats]; + + for (size_t i = 0; i < kNumFloats; ++i) { + external_data[i] = static_cast(i); + } + + std::vector sizes = {2, 3}; + std::vector strides = {3, 1}; + + SlimTensor tensor = from_blob( + external_data, + makeArrayRef(sizes), + makeArrayRef(strides), + c10::ScalarType::Float); + + EXPECT_EQ(tensor.size(0), 2); + EXPECT_EQ(tensor.size(1), 3); + EXPECT_EQ(tensor.stride(0), 3); + EXPECT_EQ(tensor.stride(1), 1); + EXPECT_TRUE(tensor.is_contiguous()); +} + +// ============================================================================= +// CUDA from_blob Tests +// Tests are skipped at runtime if CUDA hardware is not available. 
+// ============================================================================= + +#ifdef CUDA_AVAILABLE + +// ============================================================================= +// from_blob CUDA Basic Tests +// ============================================================================= + +TEST(FromBlobCUDATest, BasicConstruction) { + constexpr size_t kNumFloats = 24; + constexpr size_t kNbytes = kNumFloats * sizeof(float); + + // Allocate CUDA memory + float* cuda_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + + // Initialize via CPU buffer + float* cpu_buffer = static_cast( + DeviceTraits::allocate(kNbytes)); + for (size_t i = 0; i < kNumFloats; ++i) { + cpu_buffer[i] = static_cast(i) * 1.5f; + } + DeviceTraits::memcpy( + cuda_data, cpu_buffer, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE); + + SlimTensor tensor = from_blob( + cuda_data, {2, 3, 4}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + // Verify tensor properties + EXPECT_EQ(tensor.numel(), kNumFloats); + EXPECT_EQ(tensor.dim(), 3); + EXPECT_EQ(tensor.size(0), 2); + EXPECT_EQ(tensor.size(1), 3); + EXPECT_EQ(tensor.size(2), 4); + EXPECT_EQ(tensor.dtype(), c10::ScalarType::Float); + EXPECT_TRUE(tensor.is_cuda()); + EXPECT_FALSE(tensor.is_cpu()); + EXPECT_TRUE(tensor.is_contiguous()); + EXPECT_EQ(tensor.storage_offset(), 0); + + // Verify data pointer points to CUDA data + EXPECT_EQ(tensor.data_ptr(), static_cast(cuda_data)); + + // Verify data is accessible by copying back to CPU + float* verify_buffer = static_cast( + DeviceTraits::allocate(kNbytes)); + DeviceTraits::memcpy( + verify_buffer, cuda_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE); + for (size_t i = 0; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(verify_buffer[i], static_cast(i) * 1.5f); + } + + // Clean up + DeviceTraits::free(cpu_buffer); + DeviceTraits::free(verify_buffer); + DeviceTraits::free(cuda_data); +} + +TEST(FromBlobCUDATest, ExternalDataSurvivesTensorDestruction) { + constexpr size_t kNumFloats = 8; + constexpr size_t kNbytes = kNumFloats * sizeof(float); + + // Allocate CUDA memory + float* cuda_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + + // Initialize via CPU buffer + float* cpu_buffer = static_cast( + DeviceTraits::allocate(kNbytes)); + for (size_t i = 0; i < kNumFloats; ++i) { + cpu_buffer[i] = static_cast(i) * 2.0f; + } + DeviceTraits::memcpy( + cuda_data, cpu_buffer, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE); + + { + SlimTensor tensor = from_blob( + cuda_data, {2, 4}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + // Modify first element via CPU buffer and copy back + cpu_buffer[0] = 999.0f; + DeviceTraits::memcpy( + cuda_data, cpu_buffer, sizeof(float), DEFAULT_CUDA_DEVICE, CPU_DEVICE); + } + // tensor is destroyed here + + // External CUDA data should still be accessible + float* verify_buffer = static_cast( + DeviceTraits::allocate(kNbytes)); + DeviceTraits::memcpy( + verify_buffer, cuda_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE); + EXPECT_FLOAT_EQ(verify_buffer[0], 999.0f); + for (size_t i = 1; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(verify_buffer[i], static_cast(i) * 2.0f); + } + + // Clean up + DeviceTraits::free(cpu_buffer); + DeviceTraits::free(verify_buffer); + DeviceTraits::free(cuda_data); +} + +// ============================================================================= +// from_blob CUDA with Strides Tests +// ============================================================================= + +TEST(FromBlobCUDATest, CustomStrides) { + constexpr 
size_t kBufferSize = 16; + constexpr size_t kNbytes = kBufferSize * sizeof(float); + + float* cuda_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + + // Create a 2x3 tensor with custom strides (transpose-like) + SlimTensor tensor = from_blob( + cuda_data, + {2, 3}, + {1, 4}, // Non-contiguous strides + c10::ScalarType::Float, + DEFAULT_CUDA_DEVICE); + + EXPECT_EQ(tensor.size(0), 2); + EXPECT_EQ(tensor.size(1), 3); + EXPECT_EQ(tensor.stride(0), 1); + EXPECT_EQ(tensor.stride(1), 4); + EXPECT_FALSE(tensor.is_contiguous()); + EXPECT_TRUE(tensor.is_cuda()); + + DeviceTraits::free(cuda_data); +} + +TEST(FromBlobCUDATest, WithStorageOffset) { + constexpr size_t kBufferSize = 20; + constexpr size_t kNbytes = kBufferSize * sizeof(float); + + float* cuda_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + + // Initialize via CPU buffer + float* cpu_buffer = static_cast( + DeviceTraits::allocate(kNbytes)); + for (size_t i = 0; i < kBufferSize; ++i) { + cpu_buffer[i] = static_cast(i); + } + DeviceTraits::memcpy( + cuda_data, cpu_buffer, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE); + + // Create tensor with offset of 5 elements + SlimTensor tensor = from_blob( + cuda_data, + {3, 4}, + c10::ScalarType::Float, + DEFAULT_CUDA_DEVICE, + 5); // storage_offset = 5 + + EXPECT_EQ(tensor.storage_offset(), 5); + EXPECT_EQ(tensor.numel(), 12); + EXPECT_TRUE(tensor.is_cuda()); + + // data_ptr() should point to cuda_data + 5 * sizeof(float) + EXPECT_EQ(tensor.data_ptr(), static_cast(cuda_data + 5)); + + // Verify first element is cuda_data[5] by copying back + float first_elem = 0.0f; + DeviceTraits::memcpy( + &first_elem, + cuda_data + 5, + sizeof(float), + CPU_DEVICE, + DEFAULT_CUDA_DEVICE); + EXPECT_FLOAT_EQ(first_elem, 5.0f); + + // Clean up + DeviceTraits::free(cpu_buffer); + DeviceTraits::free(cuda_data); +} + +// ============================================================================= +// from_blob CUDA with Different DTypes Tests +// ============================================================================= + +TEST(FromBlobCUDATest, Int64Dtype) { + constexpr size_t kNumElements = 6; + constexpr size_t kNbytes = kNumElements * sizeof(int64_t); + + int64_t* cuda_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + + int64_t cpu_buffer[kNumElements] = {10, 20, 30, 40, 50, 60}; + DeviceTraits::memcpy( + cuda_data, cpu_buffer, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE); + + SlimTensor tensor = + from_blob(cuda_data, {2, 3}, c10::ScalarType::Long, DEFAULT_CUDA_DEVICE); + + EXPECT_EQ(tensor.dtype(), c10::ScalarType::Long); + EXPECT_EQ(tensor.itemsize(), sizeof(int64_t)); + EXPECT_EQ(tensor.numel(), kNumElements); + EXPECT_TRUE(tensor.is_cuda()); + + // Verify by copying back + int64_t verify_buffer[kNumElements]; + DeviceTraits::memcpy( + verify_buffer, cuda_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE); + EXPECT_EQ(verify_buffer[0], 10); + EXPECT_EQ(verify_buffer[5], 60); + + DeviceTraits::free(cuda_data); +} + +TEST(FromBlobCUDATest, Int8Dtype) { + constexpr size_t kNumElements = 10; + constexpr size_t kNbytes = kNumElements * sizeof(int8_t); + + int8_t* cuda_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + + int8_t cpu_buffer[kNumElements]; + for (size_t i = 0; i < kNumElements; ++i) { + cpu_buffer[i] = static_cast(i); + } + DeviceTraits::memcpy( + cuda_data, cpu_buffer, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE); + + SlimTensor tensor = + from_blob(cuda_data, {10}, 
c10::ScalarType::Char, DEFAULT_CUDA_DEVICE); + + EXPECT_EQ(tensor.dtype(), c10::ScalarType::Char); + EXPECT_EQ(tensor.itemsize(), sizeof(int8_t)); + EXPECT_EQ(tensor.dim(), 1); + EXPECT_TRUE(tensor.is_cuda()); + + // Verify by copying back + int8_t verify_buffer[kNumElements]; + DeviceTraits::memcpy( + verify_buffer, cuda_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE); + for (size_t i = 0; i < kNumElements; ++i) { + EXPECT_EQ(verify_buffer[i], static_cast(i)); + } + + DeviceTraits::free(cuda_data); +} + +// ============================================================================= +// from_blob CUDA Cross-Device Copy Tests +// ============================================================================= + +TEST(FromBlobCUDATest, CopyCPUFromBlobToCUDAFromBlob) { + constexpr size_t kNumFloats = 6; + constexpr size_t kNbytes = kNumFloats * sizeof(float); + + // Create CPU source with from_blob + float cpu_src_data[kNumFloats]; + for (size_t i = 0; i < kNumFloats; ++i) { + cpu_src_data[i] = static_cast(i) * 3.0f; + } + SlimTensor cpu_src = + from_blob(cpu_src_data, {2, 3}, c10::ScalarType::Float, CPU_DEVICE); + + // Create CUDA destination with from_blob + float* cuda_dst_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + SlimTensor cuda_dst = from_blob( + cuda_dst_data, {2, 3}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + // Copy CPU -> CUDA + cuda_dst.copy_(cpu_src); + + // Verify by copying back to CPU + float verify_buffer[kNumFloats]; + DeviceTraits::memcpy( + verify_buffer, cuda_dst_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE); + for (size_t i = 0; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(verify_buffer[i], static_cast(i) * 3.0f); + } + + DeviceTraits::free(cuda_dst_data); +} + +TEST(FromBlobCUDATest, CopyCUDAFromBlobToCPUFromBlob) { + constexpr size_t kNumFloats = 4; + constexpr size_t kNbytes = kNumFloats * sizeof(float); + + // Create and initialize CUDA source with from_blob + float* cuda_src_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + float cpu_init[kNumFloats]; + for (size_t i = 0; i < kNumFloats; ++i) { + cpu_init[i] = static_cast(i) + 100.0f; + } + DeviceTraits::memcpy( + cuda_src_data, cpu_init, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE); + SlimTensor cuda_src = from_blob( + cuda_src_data, {2, 2}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + // Create CPU destination with from_blob + float cpu_dst_data[kNumFloats] = {0.0f, 0.0f, 0.0f, 0.0f}; + SlimTensor cpu_dst = + from_blob(cpu_dst_data, {2, 2}, c10::ScalarType::Float, CPU_DEVICE); + + // Copy CUDA -> CPU + cpu_dst.copy_(cuda_src); + + // Verify CPU destination data + for (size_t i = 0; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(cpu_dst_data[i], static_cast(i) + 100.0f); + } + + DeviceTraits::free(cuda_src_data); +} + +TEST(FromBlobCUDATest, CopyCUDAFromBlobToCUDAFromBlob) { + constexpr size_t kNumFloats = 4; + constexpr size_t kNbytes = kNumFloats * sizeof(float); + + // Create and initialize CUDA source with from_blob + float* cuda_src_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + float cpu_init[kNumFloats]; + for (size_t i = 0; i < kNumFloats; ++i) { + cpu_init[i] = static_cast(i) * 5.0f; + } + DeviceTraits::memcpy( + cuda_src_data, cpu_init, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE); + SlimTensor cuda_src = from_blob( + cuda_src_data, {2, 2}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + // Create CUDA destination with from_blob + float* cuda_dst_data = + static_cast(DeviceTraits::allocate( + kNbytes, 
DEFAULT_CUDA_DEVICE)); + SlimTensor cuda_dst = from_blob( + cuda_dst_data, {2, 2}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + // Copy CUDA -> CUDA + cuda_dst.copy_(cuda_src); + + // Verify by copying back to CPU + float verify_buffer[kNumFloats]; + DeviceTraits::memcpy( + verify_buffer, cuda_dst_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE); + for (size_t i = 0; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(verify_buffer[i], static_cast(i) * 5.0f); + } + + DeviceTraits::free(cuda_src_data); + DeviceTraits::free(cuda_dst_data); +} + +// ============================================================================= +// from_blob CUDA to empty() Copy Tests +// ============================================================================= + +TEST(FromBlobCUDATest, CopyCUDAFromBlobToOwnedCUDATensor) { + constexpr size_t kNumFloats = 12; + constexpr size_t kNbytes = kNumFloats * sizeof(float); + + // Create CUDA source with from_blob + float* cuda_src_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + float cpu_init[kNumFloats]; + for (size_t i = 0; i < kNumFloats; ++i) { + cpu_init[i] = static_cast(i) * 7.0f; + } + DeviceTraits::memcpy( + cuda_src_data, cpu_init, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE); + SlimTensor cuda_src = from_blob( + cuda_src_data, {3, 4}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + // Create owned CUDA destination with empty() + SlimTensor cuda_dst = + empty({3, 4}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + cuda_dst.copy_(cuda_src); + + // Verify by copying back to CPU + float verify_buffer[kNumFloats]; + DeviceTraits::memcpy( + verify_buffer, + cuda_dst.data_ptr(), + kNbytes, + CPU_DEVICE, + DEFAULT_CUDA_DEVICE); + for (size_t i = 0; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(verify_buffer[i], static_cast(i) * 7.0f); + } + + DeviceTraits::free(cuda_src_data); +} + +TEST(FromBlobCUDATest, CopyOwnedCUDATensorToCUDAFromBlob) { + constexpr size_t kNumFloats = 6; + constexpr size_t kNbytes = kNumFloats * sizeof(float); + + // Create owned CUDA source with empty() and initialize via CPU + SlimTensor cuda_src = + empty({2, 3}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + float cpu_init[kNumFloats]; + for (size_t i = 0; i < kNumFloats; ++i) { + cpu_init[i] = static_cast(i) * 11.0f; + } + DeviceTraits::memcpy( + cuda_src.data_ptr(), cpu_init, kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE); + + // Create CUDA destination with from_blob + float* cuda_dst_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + SlimTensor cuda_dst = from_blob( + cuda_dst_data, {2, 3}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + + cuda_dst.copy_(cuda_src); + + // Verify by copying back to CPU + float verify_buffer[kNumFloats]; + DeviceTraits::memcpy( + verify_buffer, cuda_dst_data, kNbytes, CPU_DEVICE, DEFAULT_CUDA_DEVICE); + for (size_t i = 0; i < kNumFloats; ++i) { + EXPECT_FLOAT_EQ(verify_buffer[i], static_cast(i) * 11.0f); + } + + DeviceTraits::free(cuda_dst_data); +} + +// ============================================================================= +// from_blob CUDA Shared Storage Tests +// ============================================================================= + +TEST(FromBlobCUDATest, CopiedTensorSharesStorage) { + constexpr size_t kNumFloats = 8; + constexpr size_t kNbytes = kNumFloats * sizeof(float); + + float* cuda_data = + static_cast(DeviceTraits::allocate( + kNbytes, DEFAULT_CUDA_DEVICE)); + + SlimTensor tensor1 = + from_blob(cuda_data, {2, 4}, c10::ScalarType::Float, DEFAULT_CUDA_DEVICE); + SlimTensor 
tensor2 = tensor1; // Copy constructor + + // Both should point to same storage + EXPECT_EQ(tensor1.data_ptr(), tensor2.data_ptr()); + EXPECT_EQ(tensor1.storage().get(), tensor2.storage().get()); + EXPECT_TRUE(tensor1.is_cuda()); + EXPECT_TRUE(tensor2.is_cuda()); + + DeviceTraits::free(cuda_data); +} + +#endif // CUDA_AVAILABLE + +} // namespace executorch::backends::aoti::slim
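
Usage sketch (illustrative only, not part of the change): the snippet below exercises the three APIs this diff introduces, from_blob to wrap caller-owned memory, as_strided to take a transposed view over the same storage, and maybe_wrap_dim to normalize a negative dimension index. The include paths and the exact element type of the initializer-list overloads are assumptions based on the file locations added above; everything else follows the signatures visible in the diff.

// NOTE: include paths are assumed from the locations of the new headers.
#include <cstdio>
#include <executorch/backends/aoti/slim/c10/core/WrapDimMinimal.h>
#include <executorch/backends/aoti/slim/core/SlimTensor.h>
#include <executorch/backends/aoti/slim/factory/FromBlob.h>

namespace slim = executorch::backends::aoti::slim;

int main() {
  // Caller-owned buffer; from_blob does not take ownership of it.
  float buffer[12];
  for (int i = 0; i < 12; ++i) {
    buffer[i] = static_cast<float>(i);
  }

  // Non-owning, contiguous 3x4 CPU tensor over the buffer.
  slim::SlimTensor t =
      slim::from_blob(buffer, {3, 4}, slim::c10::ScalarType::Float);

  // Transposed 4x3 view sharing the same storage (strides swapped, offset 0).
  slim::SlimTensor tt = t.as_strided({4, 3}, {1, 4}, 0);

  // dim = -1 on a rank-2 tensor wraps to 1.
  int64_t last = slim::c10::maybe_wrap_dim(
      int64_t{-1}, static_cast<int64_t>(tt.dim()));

  // t[1][2] and tt[2][1] address the same element (value 6.0f).
  std::printf(
      "t[1][2]=%.1f tt[2][1]=%.1f wrapped dim=%ld\n",
      static_cast<float*>(t.data_ptr())[1 * 4 + 2],
      static_cast<float*>(tt.data_ptr())[2 * 1 + 1 * 4],
      static_cast<long>(last));
  return 0;
}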