Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ hsa_status_t KfdDriver::ImportMemoryHandle(const core::Agent& agent, core::Drive
switch (type) {
case core::ShareType::DMABUF_FD: {
const auto& gpu_agent = static_cast<const GpuAgent&>(agent);
const int dmabuf_fd = *static_cast<int*>(import_handle);
const int dmabuf_fd = static_cast<const core::DriverMemoryHandle*>(import_handle)->dmabuf_fd;

HsaHandleImportDesc desc = {};
desc.device_handle = gpu_agent.libThunkDev();
Expand All @@ -564,7 +564,7 @@ hsa_status_t KfdDriver::ImportMemoryHandle(const core::Agent& agent, core::Drive
return HSA_STATUS_ERROR;
#endif
const auto& gpu_agent = static_cast<const GpuAgent&>(agent);
const auto fabric_handle = *static_cast<const hsa_fabric_handle_t*>(import_handle);
const auto fabric_handle = static_cast<const core::DriverMemoryHandle*>(import_handle)->fabric_handle;

HsaHandleImportDesc desc = {};
desc.device_handle = gpu_agent.libThunkDev();
Expand Down Expand Up @@ -626,8 +626,9 @@ hsa_status_t KfdDriver::CreateShareableHandle(void* va, void* mem, size_t size,

/*
* On Linux, export via KFD first (EXPORT_MEMORY_FLAGS_KFD_DMABUF) so the KFD section of the
* AMDGPU driver has a BO entry, then import into DRM. Re-export from DRM for the shareable fd.
* On Windows, KFD export and DRM export are equivalent.
* AMDGPU driver has a BO entry, then import into DRM. The shareable dmabuf_fd itself is created
* lazily when access is set (see Runtime::VMemorySetAccessPerHandle), so it is not re-exported
* here. On Windows, KFD export and DRM export are equivalent.
*/

core::DriverMemoryHandle kfd_alloc = {};
Expand All @@ -639,32 +640,26 @@ hsa_status_t KfdDriver::CreateShareableHandle(void* va, void* mem, size_t size,
return HSA_STATUS_ERROR;
}

core::DriverMemoryHandle source_handle = {};
source_handle.dmabuf_fd = source_fd;

core::DriverMemoryHandle targetHandle = {};
hsa_status_t ret = ImportMemoryHandle(agent, &targetHandle, core::ShareType::DMABUF_FD,
&source_fd, mem);
&source_handle, mem);
#if defined(__linux__)
rocr::os::DmaBufClose(source_fd);
#endif
if (ret != HSA_STATUS_SUCCESS)
return ret;
assert(targetHandle.size == size);

int shareable_fd = source_fd;
#if defined(__linux__)
// Re-export from DRM; the KFD fd was transient and is already closed.
ret = ExportMemoryHandle(agent, targetHandle, core::ShareType::DMABUF_FD, 0,
&shareable_fd);
if (ret != HSA_STATUS_SUCCESS) {
DestroyMemoryHandle(&targetHandle);
return ret;
}
/*
* We converted mem into a driver handle. The driver handle will keep the reference count
* inside the KMD so we can free the original KFD allocation.
*/
if (HSAKMT_CALL(hsaKmtFreeMemory(mem, size)) != HSAKMT_STATUS_SUCCESS) {
DestroyMemoryHandle(&targetHandle);
rocr::os::DmaBufClose(shareable_fd);
return HSA_STATUS_ERROR;
}
#endif
Expand All @@ -673,19 +668,23 @@ hsa_status_t KfdDriver::CreateShareableHandle(void* va, void* mem, size_t size,
const auto memhandle = reinterpret_cast<HsaMemoryObjectHandle>(targetHandle.handle);
if (HSAKMT_CALL(hsaKmtMemoryGetCpuAddr(devhandle, memhandle, &handle->mmap_offset)) != HSAKMT_STATUS_SUCCESS) {
DestroyMemoryHandle(&targetHandle);
rocr::os::DmaBufClose(shareable_fd);
return HSA_STATUS_ERROR;
}

handle->handle = targetHandle.handle;
handle->dmabuf_fd = shareable_fd;
/*
* Do not hold a shareable dmabuf_fd open for the lifetime of the handle. It is created lazily
* (and closed again) when access is set in Runtime::VMemorySetAccessPerHandle.
*/
handle->dmabuf_fd = -1;
handle->size = size;
return HSA_STATUS_SUCCESS;
}

hsa_status_t KfdDriver::DestroyMemoryHandle(core::DriverMemoryHandle* handle) {
if (handle->dmabuf_fd >= 0) {
hsa_status_t status = rocr::os::DmaBufClose(handle->dmabuf_fd);
handle->dmabuf_fd = -1;
if (status != HSA_STATUS_SUCCESS) return status;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ hsa_status_t KfdVirtioDriver::ImportMemoryHandle(const core::Agent& agent, core:

switch (type) {
case core::ShareType::DMABUF_FD: {
const int dmabuf_fd = *static_cast<int*>(import_handle);
const int dmabuf_fd = static_cast<const core::DriverMemoryHandle*>(import_handle)->dmabuf_fd;
const auto& gpu_agent = static_cast<const GpuAgent&>(agent);
amdgpu_bo_import_result res;
auto ret = vamdgpu_bo_import(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,7 @@ hsa_status_t XdnaDriver::ImportMemoryHandle(const core::Agent& agent, core::Driv

switch (type) {
case core::ShareType::DMABUF_FD: {
const int dmabuf_fd = *static_cast<int*>(import_handle);
const int dmabuf_fd = static_cast<const core::DriverMemoryHandle*>(import_handle)->dmabuf_fd;

drm_prime_handle import_params = {};
import_params.handle = AMDXDNA_INVALID_BO_HANDLE;
Expand Down
5 changes: 3 additions & 2 deletions projects/rocr-runtime/runtime/hsa-runtime/core/inc/driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,9 @@ class Driver {
/// @param[out] handle handle to the imported memory; @p handle->size is set to the
/// imported allocation size in bytes
/// @param[in] type @ref ShareType to import
/// @param[in] import_handle input handle; @p int* for @p DMABUF_FD,
/// @p hsa_fabric_handle_t* for @p FABRIC_HANDLE
/// @param[in] import_handle input handle; @p DriverMemoryHandle* whose
/// @p dmabuf_fd field is read for @p DMABUF_FD and whose
/// @p fabric_handle field is read for @p FABRIC_HANDLE
/// @param[in] mem address of existing buffer, used to bypass import
virtual hsa_status_t ImportMemoryHandle(const core::Agent& agent, DriverMemoryHandle* handle,
ShareType type, void* import_handle,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4022,11 +4022,11 @@ Runtime::MappedHandleAllowedAgent::MappedHandleAllowedAgent(
if (memHandle->imported && memHandle->is_fabric_handle) {
status = targetAgent->driver().ImportMemoryHandle(
*targetAgent, &driver_handle, ShareType::FABRIC_HANDLE,
&memHandle->driver_handle.fabric_handle);
&memHandle->driver_handle);
} else {
status = targetAgent->driver().ImportMemoryHandle(
*targetAgent, &driver_handle, ShareType::DMABUF_FD,
&memHandle->driver_handle.dmabuf_fd);
&memHandle->driver_handle);
}
if (status != HSA_STATUS_SUCCESS)
throw AMD::hsa_exception(status, "Failed to import memory");
Expand Down Expand Up @@ -4056,7 +4056,7 @@ hsa_status_t Runtime::MappedHandleAllowedAgent::EnableAccess(hsa_access_permissi
#endif
auto agentOwner = mappedHandle->mem_handle->agentOwner();
int mmap_fd = -1;
/* Do not check the return value of GetDeviceFd. We do not need mmap_fd so it is valid for mmap_fd to be -1*/
/* Do not check the return value of GetDeviceFd. We do not need mmap_fd in some cases, so it is valid for mmap_fd to be -1*/
agentOwner->driver().GetDeviceFd(agentOwner->node_id(), &mmap_fd);

if (!rocr::os::MapMemory(va, size, PermissionsToMemProt(perms), mmap_fd,
Expand Down Expand Up @@ -4153,6 +4153,11 @@ Runtime::MemoryHandle::MemoryHandle(hsa_fabric_handle_t fabric_handle)
Runtime::MemoryHandle::~MemoryHandle() {
if (driver_handle.handle != 0 && region != nullptr)
agentOwner()->driver().DestroyMemoryHandle(&driver_handle);
Comment thread
dayatsin-amd marked this conversation as resolved.

if (driver_handle.dmabuf_fd >= 0) {
os::DmaBufClose(driver_handle.dmabuf_fd);
driver_handle.dmabuf_fd = -1;
}
}


Expand All @@ -4161,6 +4166,32 @@ hsa_status_t
Runtime::VMemorySetAccessPerHandle(void *va, MappedHandle &mappedHandle,
const hsa_amd_memory_access_desc_t *desc,
const size_t desc_cnt) {
MemoryHandle *memHandle = mappedHandle.mem_handle;

/*
* For locally-created shareable handles CreateShareableHandle leaves dmabuf_fd as -1 to avoid
* holding an fd open for the lifetime of the handle. Export it lazily here so the target agents
* can import the memory below, then close it again before returning.
*/
bool created_dmabuf_fd = false;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure how often a single MemoryHandle is mapped across multiple VA chunks--if not often, then this probably isn't worth worrying about, but here's Claude feedback on this:

[MINOR] Per-chunk re-export amplification — runtime.cpp:4166

When a single MemoryHandle is mapped across multiple VA chunks (e.g., the same handle mapped twice via VMemoryHandleMap), VMemorySetAccess calls
VMemorySetAccessPerHandle once per chunk. Each call finds dmabuf_fd == -1 (the scope guard reset it) and does a fresh ExportMemoryHandle → kernel
ioctl. Cost becomes O(chunks × agents) rather than O(agents). Not a correctness issue, but worth lifting the lazy-export out to the per-handle
level in VMemorySetAccess if this path gets hot.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that situation is very rare. We can ignore it for now.

if (!memHandle->imported && memHandle->driver_handle.dmabuf_fd == -1) {
Agent *ownerAgent = memHandle->agentOwner();
int dmabuf_fd = -1;
hsa_status_t status = ownerAgent->driver().ExportMemoryHandle(
*ownerAgent, memHandle->driver_handle, ShareType::DMABUF_FD, 0, &dmabuf_fd);
if (status != HSA_STATUS_SUCCESS)
return status;
memHandle->driver_handle.dmabuf_fd = dmabuf_fd;
created_dmabuf_fd = true;
}

MAKE_SCOPE_GUARD([&]() {
if (created_dmabuf_fd) {
os::DmaBufClose(memHandle->driver_handle.dmabuf_fd);
memHandle->driver_handle.dmabuf_fd = -1;
}
});

for (int i = 0; i < desc_cnt; i++) {
Agent *targetAgent = Agent::Convert(desc[i].agent_handle);

Expand Down
Loading