Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM #10396

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
63 changes: 54 additions & 9 deletions src/uct/cuda/cuda_copy/cuda_copy_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = {
{NULL}
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need a lock, and if we did, I would use a more descriptive name, like uct_cuda_..._lock.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed


static CUresult (*ctx_set_flags_func)(unsigned);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

uct_cuda_cuCtxSetFlags_func

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed


static int uct_cuda_copy_md_is_dmabuf_supported()
{
int dmabuf_supported = 0;
Expand Down Expand Up @@ -479,7 +483,6 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) {

static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
{
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static size_t total_bytes[UCT_CUDA_MAX_DEVICES];
char dev_name[UCT_CUDA_DEV_NAME_MAX_LEN];

Expand Down Expand Up @@ -515,22 +518,28 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
static void
uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
{
unsigned value = 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's rename to sync_memops_value?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed


#if HAVE_CUDA_FABRIC
ucs_status_t status;
if (!md->sync_memops_set) {
/* Synchronize future DMA operations for all memory types */
status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
if (status == UCS_OK) {
md->sync_memops_set = 1;
if (ctx_set_flags_func != NULL) {
if (!md->sync_memops_set) {
/* Synchronize future DMA operations for all memory types */
status = UCT_CUDADRV_FUNC_LOG_ERR(
rakhmets marked this conversation as resolved.
Show resolved Hide resolved
ctx_set_flags_func(CU_CTX_SYNC_MEMOPS));
if (status == UCS_OK) {
md->sync_memops_set = 1;
}
}

return;
}
#else
unsigned value = 1;
#endif

/* Synchronize for DMA for legacy memory types*/
UCT_CUDADRV_FUNC_LOG_WARN(
cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
(CUdeviceptr)address));
#endif
}

static ucs_status_t
Expand Down Expand Up @@ -823,6 +832,35 @@ static uct_md_ops_t md_ops = {
.detect_memory_type = uct_cuda_copy_md_detect_memory_type
};

static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To simplify the code, we could have this function call the needed function pointer itself, and move the global variable inside it.
Something like
`ucs_status_t uct_cuda_copy_set_ctx_flags(unsigned flags)`
and have it return `UCS_ERR_UNSUPPORTED` if the function pointer is not found.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about it, but went for a two-step approach, as we need to:

  1. disable fabric at init time;
  2. set the flag with md and address as parameters, in case we cannot use cuCtxSetFlags().

{
static ucs_status_t status = UCS_ERR_INVALID_ADDR;
rakhmets marked this conversation as resolved.
Show resolved Hide resolved

#if CUDA_VERSION >= 12000
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cuGetProcAddress() prototype changed at CUDA_VERSION >= 12000, and we know that cuCtxSetFlags() also appeared after 12000, so there is no need to use the older cuGetProcAddress() prototype for this check.

CUdriverProcAddressQueryResult sym_status;
CUresult cu_err;

if (status == UCS_ERR_INVALID_ADDR) {
pthread_mutex_lock(&lock);
rakhmets marked this conversation as resolved.
Show resolved Hide resolved
cu_err = cuGetProcAddress("cuCtxSetFlags", (void**)&ctx_set_flags_func,
12010, CU_GET_PROC_ADDRESS_DEFAULT,
&sym_status);

if ((cu_err == CUDA_SUCCESS) &&
(sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
status = UCS_OK;
} else {
ctx_set_flags_func = NULL;
status = UCS_ERR_UNSUPPORTED;
}

pthread_mutex_unlock(&lock);
}
#endif

return status;
}

static ucs_status_t
uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
const uct_md_config_t *md_config, uct_md_h *md_p)
Expand Down Expand Up @@ -850,6 +888,13 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
md->sync_memops_set = 0;
md->granularity = SIZE_MAX;

status = uct_cuda_copy_md_check_is_ctx_set_flags_supported();
if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) {
ucs_warn("disabled fabric memory allocations as cuda driver "
"library does not support cuCtxSetFlags()");
md->config.enable_fabric = UCS_NO;
}
rakhmets marked this conversation as resolved.
Show resolved Hide resolved

if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) &&
(config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED)) {
ucs_warn("wrong memory type for async memory allocations: \"%s\";"
Expand Down
Loading