Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions runtime/core/portable_type/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def define_common_targets():
"//executorch/extension/fb/dynamic_shim/...",
"//executorch/kernels/portable/cpu/...",
"//executorch/runtime/core/...",
"//executorch/runtime/executor/...",
"//executorch/runtime/core/exec_aten/...",
"//executorch/runtime/core/portable_type/test/...",
],
Expand Down
46 changes: 46 additions & 0 deletions runtime/executor/memory_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

#include <executorch/runtime/core/hierarchical_allocator.h>
#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/core/portable_type/device.h>
#include <executorch/runtime/core/span.h>

namespace executorch {
namespace runtime {
Expand Down Expand Up @@ -61,6 +63,32 @@ class MemoryManager final {
"method allocator cannot be the same as temp allocator");
}

/**
* Constructs a new MemoryManager with per-buffer device metadata.
*
* The allocator pointers and the device span are stored exactly as given.
* The only validation performed here is that `method_allocator` and
* `temp_allocator` are not the same pointer.
*
* @param[in] method_allocator Same as above.
* @param[in] planned_memory Same as above. May contain a mix of CPU and
* device pointers; HierarchicalAllocator only does pointer arithmetic,
* so device pointers are valid.
* @param[in] temp_allocator Same as above.
* @param[in] planned_buffer_devices One entry per planned memory buffer
* (same count as planned_memory buffers), indicating the device type for
* each buffer. This constructor does not check the count against
* planned_memory. For CPU-only programs, use the 3-arg constructor instead.
* NOTE(review): the span is kept as a member, so the array it refers to
* presumably has to outlive this MemoryManager -- confirm against Span.
*/
MemoryManager(
MemoryAllocator* method_allocator,
HierarchicalAllocator* planned_memory,
MemoryAllocator* temp_allocator,
Span<const etensor::DeviceType> planned_buffer_devices)
: method_allocator_(method_allocator),
planned_memory_(planned_memory),
temp_allocator_(temp_allocator),
planned_buffer_devices_(planned_buffer_devices) {
// Same precondition the other constructors enforce: the method and temp
// allocators must be distinct objects.
ET_CHECK_MSG(
method_allocator != temp_allocator,
"method allocator cannot be the same as temp allocator");
}

/**
* DEPRECATED: Use the constructor without `constant_allocator` instead.
*
Expand Down Expand Up @@ -105,10 +133,28 @@ class MemoryManager final {
return temp_allocator_;
}

/**
 * Accessor for the per-buffer device metadata held by this manager.
 *
 * @returns A span with one DeviceType per planned memory buffer (same count
 *     as the planned_memory buffers). The span is empty when the manager was
 *     built without device metadata (CPU-only program).
 */
Span<const etensor::DeviceType> planned_buffer_devices() const {
  return this->planned_buffer_devices_;
}

/**
* Returns true if any planned buffer is on a non-CPU device.
* When false, the memory setup is CPU-only and follows the legacy path.
*/
bool has_device_memory() const {
return planned_buffer_devices_.size() > 0;
}

private:
MemoryAllocator* method_allocator_;
HierarchicalAllocator* planned_memory_;
MemoryAllocator* temp_allocator_;
Span<const etensor::DeviceType> planned_buffer_devices_;
};

} // namespace runtime
Expand Down
1 change: 1 addition & 0 deletions runtime/executor/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def define_common_targets():
],
exported_deps = [
"//executorch/runtime/core:memory_allocator",
"//executorch/runtime/core/portable_type:portable_type",
],
visibility = ["PUBLIC"],
)
Expand Down
44 changes: 44 additions & 0 deletions runtime/executor/test/memory_manager_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ using namespace ::testing;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::Span;
using executorch::runtime::etensor::DeviceType;

TEST(MemoryManagerTest, MinimalCtor) {
MemoryAllocator method_allocator(0, nullptr);
Expand Down Expand Up @@ -93,3 +95,45 @@ TEST(MemoryManagerTest, CtorWithSameAllocator) {
/*temp_allocator=*/&method_allocator),
"cannot be the same");
}

// The three-argument constructor takes no device metadata, so the resulting
// manager must expose an empty device span and report no device memory.
TEST(MemoryManagerTest, ThreeArgCtorHasNoDeviceMemory) {
  MemoryAllocator method_alloc(0, nullptr);
  MemoryAllocator temp_alloc(0, nullptr);
  HierarchicalAllocator planned({});

  const MemoryManager manager(&method_alloc, &planned, &temp_alloc);

  EXPECT_FALSE(manager.has_device_memory());
  EXPECT_EQ(manager.planned_buffer_devices().size(), 0);
}

// The four-argument constructor must keep the allocator pointers it was given
// and hand back the device metadata unchanged, entry by entry.
TEST(MemoryManagerTest, FourArgCtorWithDeviceMetadata) {
  MemoryAllocator method_alloc(0, nullptr);
  HierarchicalAllocator planned({});
  MemoryAllocator temp_alloc(0, nullptr);

  // Metadata for three planned buffers, in order: CPU, CUDA, CPU.
  DeviceType device_list[] = {
      DeviceType::CPU, DeviceType::CUDA, DeviceType::CPU};
  Span<const DeviceType> device_span(
      device_list, sizeof(device_list) / sizeof(device_list[0]));

  MemoryManager manager(&method_alloc, &planned, &temp_alloc, device_span);

  EXPECT_EQ(manager.method_allocator(), &method_alloc);
  EXPECT_EQ(manager.planned_memory(), &planned);
  EXPECT_EQ(manager.temp_allocator(), &temp_alloc);
  EXPECT_TRUE(manager.has_device_memory());

  Span<const DeviceType> reported = manager.planned_buffer_devices();
  EXPECT_EQ(reported.size(), 3);
  EXPECT_EQ(reported[0], DeviceType::CPU);
  EXPECT_EQ(reported[1], DeviceType::CUDA);
  EXPECT_EQ(reported[2], DeviceType::CPU);
}

// A manager built from only a method allocator carries no device metadata.
TEST(MemoryManagerTest, MinimalCtorHasNoDeviceMemory) {
  MemoryAllocator method_alloc(0, nullptr);

  const MemoryManager manager(&method_alloc);

  EXPECT_FALSE(manager.has_device_memory());
  EXPECT_EQ(manager.planned_buffer_devices().size(), 0);
}
3 changes: 3 additions & 0 deletions runtime/executor/test/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def define_common_targets(is_fbcode = False):
"//executorch/exir/backend/test/...",
"//executorch/runtime/backend/...",
"//executorch/extension/pybindings/...",
"//executorch/extension/module/test/...",
"//executorch/devtools/fb/runners/...",
"//executorch/test/...",
"//executorch/examples/...",
Expand Down Expand Up @@ -326,6 +327,8 @@ def define_common_targets(is_fbcode = False):
deps = [
":managed_memory_manager",
"//executorch/runtime/executor:program",
"//executorch/runtime/core:device_allocator",
"//executorch/runtime/core:device_memory_buffer",
"//executorch/extension/data_loader:file_data_loader",
"//executorch/schema:program",
],
Expand Down
169 changes: 169 additions & 0 deletions runtime/executor/test/tensor_parser_device_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,31 @@
#include <executorch/runtime/executor/tensor_parser.h>

#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/runtime/core/device_allocator.h>
#include <executorch/runtime/core/device_memory_buffer.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/executor/test/managed_memory_manager.h>
#include <executorch/runtime/platform/runtime.h>
#include <executorch/schema/program_generated.h>

#include <gtest/gtest.h>

using executorch::aten::Tensor;
using executorch::runtime::DeviceAllocator;
using executorch::runtime::DeviceMemoryBuffer;
using executorch::runtime::Error;
using executorch::runtime::get_device_allocator;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::MethodMeta;
using executorch::runtime::Program;
using executorch::runtime::register_device_allocator;
using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::deserialization::parseTensor;
using executorch::runtime::etensor::DeviceIndex;
using executorch::runtime::etensor::DeviceType;
using executorch::runtime::testing::ManagedMemoryManager;
using torch::executor::util::FileDataLoader;

Expand All @@ -50,15 +64,77 @@ class ProgramTestFriend final {

using executorch::runtime::testing::ProgramTestFriend;

namespace {

/**
 * Mock CUDA allocator that uses host memory for testing.
 *
 * Every live allocation is tracked so tests can verify that a tensor's
 * data_ptr falls within a "device" memory region. Several allocations may be
 * outstanding at the same time; each one is released only when deallocate()
 * is called with the pointer that allocate() returned for it.
 */
class MockCudaAllocator : public DeviceAllocator {
 public:
  /**
   * Allocates a zero-initialized host buffer of `nbytes` bytes, records it as
   * a live "device" region, and returns its address. The device index is
   * ignored by this mock.
   */
  Result<void*> allocate(size_t nbytes, DeviceIndex /*index*/) override {
    allocate_count_++;
    regions_.push_back(Region{std::make_unique<uint8_t[]>(nbytes), nbytes});
    return static_cast<void*>(regions_.back().data.get());
  }

  /**
   * Releases the region whose base address is `ptr`. The call is always
   * counted; a pointer that does not match a live region frees nothing, so an
   * unrelated allocation is never released by mistake.
   */
  void deallocate(void* ptr, DeviceIndex /*index*/) override {
    deallocate_count_++;
    for (auto it = regions_.begin(); it != regions_.end(); ++it) {
      if (it->data.get() == ptr) {
        regions_.erase(it);
        return;
      }
    }
  }

  /** No-op copy: the mock never moves data; always reports success. */
  Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
    return Error::Ok;
  }

  /** No-op copy: the mock never moves data; always reports success. */
  Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
    return Error::Ok;
  }

  /** This mock always identifies itself as a CUDA allocator. */
  DeviceType device_type() const override {
    return DeviceType::CUDA;
  }

  /**
   * Returns true if `ptr` lies inside any region that is currently allocated
   * (base inclusive, end exclusive). Zero-sized regions contain no addresses.
   */
  bool is_device_ptr(const void* ptr) const {
    const auto* p = static_cast<const uint8_t*>(ptr);
    for (const Region& region : regions_) {
      const uint8_t* base = region.data.get();
      if (p >= base && p < base + region.size) {
        return true;
      }
    }
    return false;
  }

  // Call counters; public so tests can reset and inspect them directly.
  int allocate_count_ = 0;
  int deallocate_count_ = 0;

 private:
  // One live allocation: the owning host buffer and its size in bytes.
  struct Region {
    std::unique_ptr<uint8_t[]> data;
    size_t size;
  };

  std::vector<Region> regions_;
};

} // namespace

static MockCudaAllocator g_mock_cuda;

class TensorParserDeviceTest : public ::testing::Test {
protected:
// Suite-level setup (GoogleTest calls this once, before the first test of the
// suite): initializes the runtime, then registers the file-level mock
// `g_mock_cuda` for DeviceType::CUDA.
static void SetUpTestSuite() {
executorch::runtime::runtime_init();
register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
}

void SetUp() override {
const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
ASSERT_NE(path, nullptr)
<< "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set";
Result<FileDataLoader> loader = FileDataLoader::from(path);
ASSERT_EQ(loader.error(), Error::Ok);
loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));

g_mock_cuda.allocate_count_ = 0;
g_mock_cuda.deallocate_count_ = 0;
}

std::unique_ptr<FileDataLoader> loader_;
Expand Down Expand Up @@ -167,3 +243,96 @@ TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) {
<< " without device annotation should have device_index=0";
}
}
// Backs the method's planned memory with a buffer obtained through the
// registered (mock) CUDA allocator, parses every tensor of the first execution
// plan, and checks that each CUDA tensor's data pointer lands in that buffer.
TEST_F(TensorParserDeviceTest, CudaTensorDataPtrPointsToDeviceMemory) {
  Result<Program> program =
      Program::load(loader_.get(), Program::Verification::Minimal);
  ASSERT_EQ(program.error(), Error::Ok);

  Result<MethodMeta> forward_meta = program->method_meta("forward");
  ASSERT_EQ(forward_meta.error(), Error::Ok);

  // Expected layout of ModuleAddWithDevice:
  //   non_const_buffer_sizes:  [0, 48]  (index 0 is reserved, which leaves a
  //                                      single 48-byte planned buffer)
  //   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA}]
  const size_t planned_count = forward_meta->num_memory_planned_buffers();
  ASSERT_EQ(planned_count, 1);

  // Storage behind the planned-memory spans. `host_storage` gets one slot per
  // planned buffer (left empty for device buffers); `device_storage` owns the
  // device allocations for the rest of the test.
  std::vector<Span<uint8_t>> spans;
  std::vector<std::vector<uint8_t>> host_storage;
  std::vector<DeviceMemoryBuffer> device_storage;

  for (size_t buf = 0; buf < planned_count; ++buf) {
    auto nbytes = forward_meta->memory_planned_buffer_size(buf);
    ASSERT_TRUE(nbytes.ok());
    auto placement = forward_meta->memory_planned_buffer_device(buf);
    ASSERT_TRUE(placement.ok());

    if (!placement->is_cpu()) {
      host_storage.emplace_back(); // empty placeholder
      auto device_buffer = DeviceMemoryBuffer::create(
          nbytes.get(), placement->type(), placement->index());
      ASSERT_TRUE(device_buffer.ok())
          << "DeviceMemoryBuffer::create failed for buffer " << buf;
      spans.emplace_back(device_buffer->as_span());
      device_storage.push_back(std::move(device_buffer.get()));
      continue;
    }

    host_storage.emplace_back(nbytes.get());
    std::vector<uint8_t>& host = host_storage.back();
    spans.emplace_back(host.data(), host.size());
  }

  // Exactly one planned buffer is on the device, so exactly one allocation
  // must have gone through the mock.
  ASSERT_EQ(g_mock_cuda.allocate_count_, 1);

  // Planned memory over the (possibly mixed) CPU/device spans.
  HierarchicalAllocator planned_memory({spans.data(), spans.size()});

  constexpr size_t kMethodAllocBytes = 32 * 1024U;
  auto method_pool = std::make_unique<uint8_t[]>(kMethodAllocBytes);
  MemoryAllocator method_allocator(kMethodAllocBytes, method_pool.get());
  MemoryManager memory_manager(&method_allocator, &planned_memory);

  // Walk the serialized values of the first execution plan.
  const executorch_flatbuffer::Program* fb_program =
      ProgramTestFriend::GetInternalProgram(&program.get());
  auto* plan = fb_program->execution_plan()->GetMutableObject(0);
  auto* values = plan->values();

  int cuda_tensor_count = 0;

  for (size_t idx = 0; idx < values->size(); ++idx) {
    auto* value = values->Get(idx);
    if (value->val_type() != executorch_flatbuffer::KernelTypes::Tensor) {
      continue;
    }

    auto* s_tensor = value->val_as_Tensor();
    auto* extra_info = s_tensor->extra_tensor_info();
    const bool annotated_cuda = extra_info != nullptr &&
        extra_info->device_type() == executorch_flatbuffer::DeviceType::CUDA;

    Result<Tensor> parsed =
        parseTensor(&program.get(), &memory_manager, s_tensor);
    ASSERT_TRUE(parsed.ok())
        << "parseTensor failed at index " << idx << " with error 0x" << std::hex
        << static_cast<uint32_t>(parsed.error());

    Tensor t = parsed.get();

    // Only tensors that are CUDA both in the serialized annotation and after
    // parsing are checked and counted.
    if (!annotated_cuda ||
        t.unsafeGetTensorImpl()->device_type() != DeviceType::CUDA) {
      continue;
    }

    EXPECT_TRUE(g_mock_cuda.is_device_ptr(t.const_data_ptr()))
        << "CUDA tensor at index " << idx
        << " should have data_ptr in device memory, but got CPU memory";
    ++cuda_tensor_count;
  }

  // The delegate has 2 inputs and 1 output, so 3 CUDA tensors are expected,
  // each with its data_ptr inside the mock device buffer.
  EXPECT_EQ(cuda_tensor_count, 3)
      << "All 3 CUDA tensors should have data_ptr in device memory";
}
Loading