From 68a5f5170953a0d6f47661a5a4e84f99eb0b1df3 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 20 Dec 2024 09:36:46 +0000 Subject: [PATCH 1/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 60 +++++++++++++++++++-------- src/uct/cuda/cuda_copy/cuda_copy_md.h | 2 + 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index a185dde3779..fa2fde07681 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -29,6 +29,10 @@ #define UCT_CUDA_DEV_NAME_MAX_LEN 64 #define UCT_CUDA_MAX_DEVICES 32 +#define UCT_CUDA_VERSION_VMM 12030 /* for VMM: cuCtxSetFlags() >= cuda 12.1 */ +#define UCT_CUDA_MAJOR(_version) ((_version) / 1000) +#define UCT_CUDA_MINOR(_version) (((_version) % 1000) / 10) + static const char *uct_cuda_pref_loc[] = { [UCT_CUDA_PREF_LOC_CPU] = "cpu", @@ -515,22 +519,27 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) { + unsigned value = 1; + #if HAVE_CUDA_FABRIC ucs_status_t status; - if (!md->sync_memops_set) { - /* Synchronize future DMA operations for all memory types */ - status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS)); - if (status == UCS_OK) { - md->sync_memops_set = 1; + if (md->config.cuda_ctx_set_flags) { + if (!md->sync_memops_set) { + /* Synchronize future DMA operations for all memory types */ + status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS)); + if (status == UCS_OK) { + md->sync_memops_set = 1; + } } + + return; } -#else - unsigned value = 1; +#endif + /* Synchronize for DMA for legacy memory types*/ UCT_CUDADRV_FUNC_LOG_WARN( cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)address)); -#endif } static ucs_status_t @@ -830,7 +839,7 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, uct_cuda_copy_md_config_t *config = ucs_derived_of(md_config, uct_cuda_copy_md_config_t); uct_cuda_copy_md_t *md; - int dmabuf_supported; + int dmabuf_supported, version; ucs_status_t status; md = ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t"); @@ -840,15 +849,30 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, goto err; } - md->super.ops = &md_ops; - md->super.component = &uct_cuda_copy_component; - md->config.alloc_whole_reg = config->alloc_whole_reg; - md->config.max_reg_ratio = config->max_reg_ratio; - md->config.pref_loc = config->pref_loc; - md->config.enable_fabric = config->enable_fabric; - md->config.dmabuf_supported = 0; - md->sync_memops_set = 0; - md->granularity = SIZE_MAX; + md->super.ops = &md_ops; + md->super.component = &uct_cuda_copy_component; + md->config.alloc_whole_reg = config->alloc_whole_reg; + md->config.max_reg_ratio = config->max_reg_ratio; + md->config.pref_loc = config->pref_loc; + md->config.enable_fabric = config->enable_fabric; + md->config.dmabuf_supported = 0; + md->config.cuda_ctx_set_flags = 1; + md->sync_memops_set = 0; + md->granularity = SIZE_MAX; + + if ((cuDriverGetVersion(&version) == CUDA_SUCCESS) && + (version < UCT_CUDA_VERSION_VMM)) { + if (md->config.enable_fabric != UCS_NO) { + ucs_warn("disabled fabric memory allocations as cuda driver " + "library %d.%d < %d.%d", + UCT_CUDA_MAJOR(version), UCT_CUDA_MINOR(version), + UCT_CUDA_MAJOR(UCT_CUDA_VERSION_VMM), + UCT_CUDA_MINOR(UCT_CUDA_VERSION_VMM)); + } + + md->config.enable_fabric = UCS_NO; + md->config.cuda_ctx_set_flags = 0; + } if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) && (config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED)) { diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h index e14aff739e5..0176a10801a 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.h +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h @@ -36,6 +36,8 @@ typedef struct uct_cuda_copy_md { ucs_ternary_auto_value_t enable_fabric; uct_cuda_pref_loc_t pref_loc; int cuda_async_managed; + int cuda_ctx_set_flags; /* missing cuCtxSetFlags() + below CUDA 12.1 */ } config; } uct_cuda_copy_md_t; From 9fc443096ce3271029a689ae501ff45163632a4d Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 20 Dec 2024 13:53:51 +0000 Subject: [PATCH 2/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 83 +++++++++++++++++---------- src/uct/cuda/cuda_copy/cuda_copy_md.h | 2 - 2 files changed, 52 insertions(+), 33 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index fa2fde07681..8a952d64830 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -29,10 +29,6 @@ #define UCT_CUDA_DEV_NAME_MAX_LEN 64 #define UCT_CUDA_MAX_DEVICES 32 -#define UCT_CUDA_VERSION_VMM 12030 /* for VMM: cuCtxSetFlags() >= cuda 12.1 */ -#define UCT_CUDA_MAJOR(_version) ((_version) / 1000) -#define UCT_CUDA_MINOR(_version) (((_version) % 1000) / 10) - static const char *uct_cuda_pref_loc[] = { [UCT_CUDA_PREF_LOC_CPU] = "cpu", @@ -85,6 +81,10 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = { {NULL} }; +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; + +static CUresult (*ctx_set_flags_func)(unsigned); + static int uct_cuda_copy_md_is_dmabuf_supported() { int dmabuf_supported = 0; @@ -483,7 +483,6 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) { static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) { - static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static size_t total_bytes[UCT_CUDA_MAX_DEVICES]; char dev_name[UCT_CUDA_DEV_NAME_MAX_LEN]; @@ -523,10 +522,11 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) #if HAVE_CUDA_FABRIC ucs_status_t status; - if (md->config.cuda_ctx_set_flags) { + if (ctx_set_flags_func != NULL) { if (!md->sync_memops_set) { /* Synchronize future DMA operations for all memory types */ - status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS)); + status = UCT_CUDADRV_FUNC_LOG_ERR( + ctx_set_flags_func(CU_CTX_SYNC_MEMOPS)); if (status == UCS_OK) { md->sync_memops_set = 1; } @@ -832,6 +832,35 @@ static uct_md_ops_t md_ops = { .detect_memory_type = uct_cuda_copy_md_detect_memory_type }; +static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) +{ + static ucs_status_t status = UCS_ERR_INVALID_ADDR; + +#if CUDA_VERSION >= 12000 + CUdriverProcAddressQueryResult sym_status; + CUresult cu_err; + + if (status == UCS_ERR_INVALID_ADDR) { + pthread_mutex_lock(&lock); + cu_err = cuGetProcAddress("cuCtxSetFlags", (void**)&ctx_set_flags_func, + 12010, CU_GET_PROC_ADDRESS_DEFAULT, + &sym_status); + + if ((cu_err == CUDA_SUCCESS) && + (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { + status = UCS_OK; + } else { + ctx_set_flags_func = NULL; + status = UCS_ERR_UNSUPPORTED; + } + + pthread_mutex_unlock(&lock); + } +#endif + + return status; +} + static ucs_status_t uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, const uct_md_config_t *md_config, uct_md_h *md_p) @@ -839,7 +868,7 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, uct_cuda_copy_md_config_t *config = ucs_derived_of(md_config, uct_cuda_copy_md_config_t); uct_cuda_copy_md_t *md; - int dmabuf_supported, version; + int dmabuf_supported; ucs_status_t status; md = ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t"); @@ -849,29 +878,21 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, goto err; } - md->super.ops = &md_ops; - md->super.component = &uct_cuda_copy_component; - md->config.alloc_whole_reg = config->alloc_whole_reg; - md->config.max_reg_ratio = config->max_reg_ratio; - md->config.pref_loc = config->pref_loc; - md->config.enable_fabric = config->enable_fabric; - md->config.dmabuf_supported = 0; - md->config.cuda_ctx_set_flags = 1; - md->sync_memops_set = 0; - md->granularity = SIZE_MAX; - - if ((cuDriverGetVersion(&version) == CUDA_SUCCESS) && - (version < UCT_CUDA_VERSION_VMM)) { - if (md->config.enable_fabric != UCS_NO) { - ucs_warn("disabled fabric memory allocations as cuda driver " - "library %d.%d < %d.%d", - UCT_CUDA_MAJOR(version), UCT_CUDA_MINOR(version), - UCT_CUDA_MAJOR(UCT_CUDA_VERSION_VMM), - UCT_CUDA_MINOR(UCT_CUDA_VERSION_VMM)); - } - - md->config.enable_fabric = UCS_NO; - md->config.cuda_ctx_set_flags = 0; + md->super.ops = &md_ops; + md->super.component = &uct_cuda_copy_component; + md->config.alloc_whole_reg = config->alloc_whole_reg; + md->config.max_reg_ratio = config->max_reg_ratio; + md->config.pref_loc = config->pref_loc; + md->config.enable_fabric = config->enable_fabric; + md->config.dmabuf_supported = 0; + md->sync_memops_set = 0; + md->granularity = SIZE_MAX; + + status = uct_cuda_copy_md_check_is_ctx_set_flags_supported(); + if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) { + ucs_warn("disabled fabric memory allocations as cuda driver " + "library does not support cuCtxSetFlags()"); + md->config.enable_fabric = UCS_NO; } if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) && diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h index 0176a10801a..e14aff739e5 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.h +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h @@ -36,8 +36,6 @@ typedef struct uct_cuda_copy_md { ucs_ternary_auto_value_t enable_fabric; uct_cuda_pref_loc_t pref_loc; int cuda_async_managed; - int cuda_ctx_set_flags; /* missing cuCtxSetFlags() - below CUDA 12.1 */ } config; } uct_cuda_copy_md_t; From e8c9f9901d03377247a8d81cd93bf5b5cc16bc03 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 20 Dec 2024 17:30:58 +0200 Subject: [PATCH 3/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 8a952d64830..3f77f660b4d 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -525,7 +525,7 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) if (ctx_set_flags_func != NULL) { if (!md->sync_memops_set) { /* Synchronize future DMA operations for all memory types */ - status = UCT_CUDADRV_FUNC_LOG_ERR( + status = UCT_CUDADRV_FUNC_LOG_WARN( ctx_set_flags_func(CU_CTX_SYNC_MEMOPS)); if (status == UCS_OK) { md->sync_memops_set = 1; @@ -834,7 +834,7 @@ static uct_md_ops_t md_ops = { static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) { - static ucs_status_t status = UCS_ERR_INVALID_ADDR; + static ucs_status_t status = UCS_ERR_LAST; #if CUDA_VERSION >= 12000 CUdriverProcAddressQueryResult sym_status; @@ -842,16 +842,18 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) if (status == UCS_ERR_INVALID_ADDR) { pthread_mutex_lock(&lock); - cu_err = cuGetProcAddress("cuCtxSetFlags", (void**)&ctx_set_flags_func, - 12010, CU_GET_PROC_ADDRESS_DEFAULT, - &sym_status); - - if ((cu_err == CUDA_SUCCESS) && - (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { - status = UCS_OK; - } else { - ctx_set_flags_func = NULL; - status = UCS_ERR_UNSUPPORTED; + if (status == UCS_ERR_INVALID_ADDR) { + cu_err = cuGetProcAddress("cuCtxSetFlags", + (void**)&ctx_set_flags_func, 12010, + CU_GET_PROC_ADDRESS_DEFAULT, &sym_status); + + if ((cu_err == CUDA_SUCCESS) && + (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { + status = UCS_OK; + } else { + ctx_set_flags_func = NULL; + status = UCS_ERR_UNSUPPORTED; + } } pthread_mutex_unlock(&lock); From 3b43d299b8a76b8bb7fb19f9a3f18449165ee8c4 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 20 Dec 2024 16:25:17 +0000 Subject: [PATCH 4/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 3f77f660b4d..374a97760dc 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -534,6 +534,8 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) return; } +#else + (void)ctx_set_flags_func; #endif /* Synchronize for DMA for legacy memory types*/ From 2161adf8410bd47f58c285288eeab1bc016d6c75 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 20 Dec 2024 17:16:43 +0000 Subject: [PATCH 5/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 374a97760dc..2657e4b288a 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -893,9 +893,12 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, md->granularity = SIZE_MAX; status = uct_cuda_copy_md_check_is_ctx_set_flags_supported(); - if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) { - ucs_warn("disabled fabric memory allocations as cuda driver " - "library does not support cuCtxSetFlags()"); + if (status != UCS_OK) { + if (md->config.enable_fabric == UCS_YES) { + ucs_warn("disabled fabric memory allocations as cuda driver " + "library does not support cuCtxSetFlags()"); + } + md->config.enable_fabric = UCS_NO; } From 656325344d803786d4e40bbcb60e4888db7b0c4b Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Mon, 6 Jan 2025 20:23:23 +0200 Subject: [PATCH 6/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 46 +++++++++++---------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 2657e4b288a..dadf8bbb985 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -81,9 +81,7 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = { {NULL} }; -static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; - -static CUresult (*ctx_set_flags_func)(unsigned); +static CUresult (*uct_cuda_cuCtxSetFlags_func)(unsigned); static int uct_cuda_copy_md_is_dmabuf_supported() { @@ -483,6 +481,7 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) { static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) { + static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static size_t total_bytes[UCT_CUDA_MAX_DEVICES]; char dev_name[UCT_CUDA_DEV_NAME_MAX_LEN]; @@ -518,15 +517,14 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) { - unsigned value = 1; - -#if HAVE_CUDA_FABRIC + unsigned sync_memops_value = 1; ucs_status_t status; - if (ctx_set_flags_func != NULL) { + + if (uct_cuda_cuCtxSetFlags_func != NULL) { if (!md->sync_memops_set) { /* Synchronize future DMA operations for all memory types */ status = UCT_CUDADRV_FUNC_LOG_WARN( - ctx_set_flags_func(CU_CTX_SYNC_MEMOPS)); + uct_cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS)); if (status == UCS_OK) { md->sync_memops_set = 1; } @@ -534,13 +532,11 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) return; } -#else - (void)ctx_set_flags_func; -#endif /* Synchronize for DMA for legacy memory types*/ UCT_CUDADRV_FUNC_LOG_WARN( - cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + cuPointerSetAttribute(&sync_memops_value, + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)address)); } @@ -842,23 +838,19 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) CUdriverProcAddressQueryResult sym_status; CUresult cu_err; - if (status == UCS_ERR_INVALID_ADDR) { - pthread_mutex_lock(&lock); - if (status == UCS_ERR_INVALID_ADDR) { - cu_err = cuGetProcAddress("cuCtxSetFlags", - (void**)&ctx_set_flags_func, 12010, - CU_GET_PROC_ADDRESS_DEFAULT, &sym_status); + if (status == UCS_ERR_LAST) { + cu_err = cuGetProcAddress("cuCtxSetFlags", + (void**)&uct_cuda_cuCtxSetFlags_func, + 12010, CU_GET_PROC_ADDRESS_DEFAULT, + &sym_status); - if ((cu_err == CUDA_SUCCESS) && - (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { - status = UCS_OK; - } else { - ctx_set_flags_func = NULL; - status = UCS_ERR_UNSUPPORTED; - } + if ((cu_err == CUDA_SUCCESS) && + (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { + status = UCS_OK; + } else { + uct_cuda_cuCtxSetFlags_func = NULL; + status = UCS_ERR_UNSUPPORTED; } - - pthread_mutex_unlock(&lock); } #endif From ff4313c9ead6dcdac11d88b1603bf98b1b001d7f Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 7 Jan 2025 09:02:46 +0000 Subject: [PATCH 7/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index dadf8bbb985..ee7532d477d 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -518,6 +518,8 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) { unsigned sync_memops_value = 1; + +#if HAVE_CUDA_FABRIC ucs_status_t status; if (uct_cuda_cuCtxSetFlags_func != NULL) { @@ -532,6 +534,9 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) return; } +#else + (void)uct_cuda_cuCtxSetFlags_func; +#endif /* Synchronize for DMA for legacy memory types*/ UCT_CUDADRV_FUNC_LOG_WARN( From 2f5e5a5f20a00176768cae27d8b3fc7efe994b2d Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 7 Jan 2025 11:23:39 +0000 Subject: [PATCH 8/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index ee7532d477d..4c5a1766722 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -890,12 +890,16 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, md->granularity = SIZE_MAX; status = uct_cuda_copy_md_check_is_ctx_set_flags_supported(); - if (status != UCS_OK) { + if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) { if (md->config.enable_fabric == UCS_YES) { - ucs_warn("disabled fabric memory allocations as cuda driver " - "library does not support cuCtxSetFlags()"); + ucs_error("failed to enable fabric memory allocations as cuda " + "driver library does not support cuCtxSetFlags()"); + goto err_free_md; } + ucs_diag("disabled fabric memory allocations as cuda driver library " + "does not support cuCtxSetFlags()"); + md->config.enable_fabric = UCS_NO; } From f1601a384334681fe761f099c7b259cc29f95e6c Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 7 Jan 2025 15:52:02 +0000 Subject: [PATCH 9/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 31 ++++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 4c5a1766722..b771cf8be17 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -835,15 +835,15 @@ static uct_md_ops_t md_ops = { .detect_memory_type = uct_cuda_copy_md_detect_memory_type }; -static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) +static int uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) { - static ucs_status_t status = UCS_ERR_LAST; - #if CUDA_VERSION >= 12000 + static int is_supported = -1; + CUdriverProcAddressQueryResult sym_status; CUresult cu_err; - if (status == UCS_ERR_LAST) { + if (is_supported < 0) { cu_err = cuGetProcAddress("cuCtxSetFlags", (void**)&uct_cuda_cuCtxSetFlags_func, 12010, CU_GET_PROC_ADDRESS_DEFAULT, @@ -851,15 +851,18 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) if ((cu_err == CUDA_SUCCESS) && (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { - status = UCS_OK; + is_supported = 1; } else { + ucs_debug("cuda driver library does not support cuCtxSetFlags()"); uct_cuda_cuCtxSetFlags_func = NULL; - status = UCS_ERR_UNSUPPORTED; + is_supported = 0; } } -#endif - return status; + return is_supported; +#else + return 0; +#endif } static ucs_status_t @@ -889,17 +892,15 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, md->sync_memops_set = 0; md->granularity = SIZE_MAX; - status = uct_cuda_copy_md_check_is_ctx_set_flags_supported(); - if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) { + if (!uct_cuda_copy_md_check_is_ctx_set_flags_supported() && + (md->config.enable_fabric != UCS_NO)) { if (md->config.enable_fabric == UCS_YES) { - ucs_error("failed to enable fabric memory allocations as cuda " - "driver library does not support cuCtxSetFlags()"); + ucs_error("failed to enable fabric memory allocations"); + status = UCS_ERR_UNSUPPORTED; goto err_free_md; } - ucs_diag("disabled fabric memory allocations as cuda driver library " - "does not support cuCtxSetFlags()"); - + ucs_diag("disabled fabric memory allocations"); md->config.enable_fabric = UCS_NO; }