From 68a5f5170953a0d6f47661a5a4e84f99eb0b1df3 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 20 Dec 2024 09:36:46 +0000
Subject: [PATCH 1/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 60 +++++++++++++++++++--------
 src/uct/cuda/cuda_copy/cuda_copy_md.h |  2 +
 2 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index a185dde3779..fa2fde07681 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -29,6 +29,10 @@
 #define UCT_CUDA_DEV_NAME_MAX_LEN 64
 #define UCT_CUDA_MAX_DEVICES      32
 
+#define UCT_CUDA_VERSION_VMM     12030 /* for VMM: cuCtxSetFlags() >= cuda 12.1 */
+#define UCT_CUDA_MAJOR(_version) ((_version) / 1000)
+#define UCT_CUDA_MINOR(_version) (((_version) % 1000) / 10)
+
 
 static const char *uct_cuda_pref_loc[] = {
     [UCT_CUDA_PREF_LOC_CPU]  = "cpu",
@@ -515,22 +519,27 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
 static void
 uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 {
+    unsigned value = 1;
+
 #if HAVE_CUDA_FABRIC
     ucs_status_t status;
-    if (!md->sync_memops_set) {
-        /* Synchronize future DMA operations for all memory types */
-        status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
-        if (status == UCS_OK) {
-            md->sync_memops_set = 1;
+    if (md->config.cuda_ctx_set_flags) {
+        if (!md->sync_memops_set) {
+            /* Synchronize future DMA operations for all memory types */
+            status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
+            if (status == UCS_OK) {
+                md->sync_memops_set = 1;
+            }
         }
+
+        return;
     }
-#else
-    unsigned value = 1;
+#endif
+
     /* Synchronize for DMA for legacy memory types*/
     UCT_CUDADRV_FUNC_LOG_WARN(
             cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                   (CUdeviceptr)address));
-#endif
 }
 
 static ucs_status_t
@@ -830,7 +839,7 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     uct_cuda_copy_md_config_t *config = ucs_derived_of(md_config,
                                                        uct_cuda_copy_md_config_t);
     uct_cuda_copy_md_t *md;
-    int dmabuf_supported;
+    int dmabuf_supported, version;
     ucs_status_t status;
 
     md = ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t");
@@ -840,15 +849,30 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
         goto err;
     }
 
-    md->super.ops               = &md_ops;
-    md->super.component         = &uct_cuda_copy_component;
-    md->config.alloc_whole_reg  = config->alloc_whole_reg;
-    md->config.max_reg_ratio    = config->max_reg_ratio;
-    md->config.pref_loc         = config->pref_loc;
-    md->config.enable_fabric    = config->enable_fabric;
-    md->config.dmabuf_supported = 0;
-    md->sync_memops_set         = 0;
-    md->granularity             = SIZE_MAX;
+    md->super.ops                 = &md_ops;
+    md->super.component           = &uct_cuda_copy_component;
+    md->config.alloc_whole_reg    = config->alloc_whole_reg;
+    md->config.max_reg_ratio      = config->max_reg_ratio;
+    md->config.pref_loc           = config->pref_loc;
+    md->config.enable_fabric      = config->enable_fabric;
+    md->config.dmabuf_supported   = 0;
+    md->config.cuda_ctx_set_flags = 1;
+    md->sync_memops_set           = 0;
+    md->granularity               = SIZE_MAX;
+
+    if ((cuDriverGetVersion(&version) == CUDA_SUCCESS) &&
+        (version < UCT_CUDA_VERSION_VMM)) {
+        if (md->config.enable_fabric != UCS_NO) {
+            ucs_warn("disabled fabric memory allocations as cuda driver "
+                     "library %d.%d < %d.%d",
+                     UCT_CUDA_MAJOR(version), UCT_CUDA_MINOR(version),
+                     UCT_CUDA_MAJOR(UCT_CUDA_VERSION_VMM),
+                     UCT_CUDA_MINOR(UCT_CUDA_VERSION_VMM));
+        }
+
+        md->config.enable_fabric      = UCS_NO;
+        md->config.cuda_ctx_set_flags = 0;
+    }
 
     if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) &&
         (config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED)) {
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h
index e14aff739e5..0176a10801a 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.h
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h
@@ -36,6 +36,8 @@ typedef struct uct_cuda_copy_md {
         ucs_ternary_auto_value_t enable_fabric;
         uct_cuda_pref_loc_t      pref_loc;
         int                      cuda_async_managed;
+        int                      cuda_ctx_set_flags; /* missing cuCtxSetFlags()
+                                                        below CUDA 12.1 */
     } config;
 } uct_cuda_copy_md_t;
 

From 9fc443096ce3271029a689ae501ff45163632a4d Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 20 Dec 2024 13:53:51 +0000
Subject: [PATCH 2/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 83 +++++++++++++++++----------
 src/uct/cuda/cuda_copy/cuda_copy_md.h |  2 -
 2 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index fa2fde07681..8a952d64830 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -29,10 +29,6 @@
 #define UCT_CUDA_DEV_NAME_MAX_LEN 64
 #define UCT_CUDA_MAX_DEVICES      32
 
-#define UCT_CUDA_VERSION_VMM     12030 /* for VMM: cuCtxSetFlags() >= cuda 12.1 */
-#define UCT_CUDA_MAJOR(_version) ((_version) / 1000)
-#define UCT_CUDA_MINOR(_version) (((_version) % 1000) / 10)
-
 
 static const char *uct_cuda_pref_loc[] = {
     [UCT_CUDA_PREF_LOC_CPU]  = "cpu",
@@ -85,6 +81,10 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = {
     {NULL}
 };
 
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
+static CUresult (*ctx_set_flags_func)(unsigned);
+
 static int uct_cuda_copy_md_is_dmabuf_supported()
 {
     int dmabuf_supported = 0;
@@ -483,7 +483,6 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) {
 
 static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
 {
-    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
     static size_t total_bytes[UCT_CUDA_MAX_DEVICES];
     char dev_name[UCT_CUDA_DEV_NAME_MAX_LEN];
 
@@ -523,10 +522,11 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 
 #if HAVE_CUDA_FABRIC
     ucs_status_t status;
-    if (md->config.cuda_ctx_set_flags) {
+    if (ctx_set_flags_func != NULL) {
         if (!md->sync_memops_set) {
             /* Synchronize future DMA operations for all memory types */
-            status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
+            status = UCT_CUDADRV_FUNC_LOG_ERR(
+                    ctx_set_flags_func(CU_CTX_SYNC_MEMOPS));
             if (status == UCS_OK) {
                 md->sync_memops_set = 1;
             }
@@ -832,6 +832,35 @@ static uct_md_ops_t md_ops = {
     .detect_memory_type = uct_cuda_copy_md_detect_memory_type
 };
 
+static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
+{
+    static ucs_status_t status = UCS_ERR_INVALID_ADDR;
+
+#if CUDA_VERSION >= 12000
+    CUdriverProcAddressQueryResult sym_status;
+    CUresult cu_err;
+
+    if (status == UCS_ERR_INVALID_ADDR) {
+        pthread_mutex_lock(&lock);
+        cu_err = cuGetProcAddress("cuCtxSetFlags", (void**)&ctx_set_flags_func,
+                                  12010, CU_GET_PROC_ADDRESS_DEFAULT,
+                                  &sym_status);
+
+        if ((cu_err == CUDA_SUCCESS) &&
+            (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
+            status = UCS_OK;
+        } else {
+            ctx_set_flags_func = NULL;
+            status             = UCS_ERR_UNSUPPORTED;
+        }
+
+        pthread_mutex_unlock(&lock);
+    }
+#endif
+
+    return status;
+}
+
 static ucs_status_t
 uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
                       const uct_md_config_t *md_config, uct_md_h *md_p)
@@ -839,7 +868,7 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     uct_cuda_copy_md_config_t *config = ucs_derived_of(md_config,
                                                        uct_cuda_copy_md_config_t);
     uct_cuda_copy_md_t *md;
-    int dmabuf_supported, version;
+    int dmabuf_supported;
     ucs_status_t status;
 
     md = ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t");
@@ -849,29 +878,21 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
         goto err;
     }
 
-    md->super.ops                 = &md_ops;
-    md->super.component           = &uct_cuda_copy_component;
-    md->config.alloc_whole_reg    = config->alloc_whole_reg;
-    md->config.max_reg_ratio      = config->max_reg_ratio;
-    md->config.pref_loc           = config->pref_loc;
-    md->config.enable_fabric      = config->enable_fabric;
-    md->config.dmabuf_supported   = 0;
-    md->config.cuda_ctx_set_flags = 1;
-    md->sync_memops_set           = 0;
-    md->granularity               = SIZE_MAX;
-
-    if ((cuDriverGetVersion(&version) == CUDA_SUCCESS) &&
-        (version < UCT_CUDA_VERSION_VMM)) {
-        if (md->config.enable_fabric != UCS_NO) {
-            ucs_warn("disabled fabric memory allocations as cuda driver "
-                     "library %d.%d < %d.%d",
-                     UCT_CUDA_MAJOR(version), UCT_CUDA_MINOR(version),
-                     UCT_CUDA_MAJOR(UCT_CUDA_VERSION_VMM),
-                     UCT_CUDA_MINOR(UCT_CUDA_VERSION_VMM));
-        }
-
-        md->config.enable_fabric      = UCS_NO;
-        md->config.cuda_ctx_set_flags = 0;
+    md->super.ops               = &md_ops;
+    md->super.component         = &uct_cuda_copy_component;
+    md->config.alloc_whole_reg  = config->alloc_whole_reg;
+    md->config.max_reg_ratio    = config->max_reg_ratio;
+    md->config.pref_loc         = config->pref_loc;
+    md->config.enable_fabric    = config->enable_fabric;
+    md->config.dmabuf_supported = 0;
+    md->sync_memops_set         = 0;
+    md->granularity             = SIZE_MAX;
+
+    status = uct_cuda_copy_md_check_is_ctx_set_flags_supported();
+    if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) {
+        ucs_warn("disabled fabric memory allocations as cuda driver "
+                 "library does not support cuCtxSetFlags()");
+        md->config.enable_fabric = UCS_NO;
     }
 
     if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) &&
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h
index 0176a10801a..e14aff739e5 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.h
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h
@@ -36,8 +36,6 @@ typedef struct uct_cuda_copy_md {
         ucs_ternary_auto_value_t enable_fabric;
         uct_cuda_pref_loc_t      pref_loc;
         int                      cuda_async_managed;
-        int                      cuda_ctx_set_flags; /* missing cuCtxSetFlags()
-                                                        below CUDA 12.1 */
     } config;
 } uct_cuda_copy_md_t;
 

From e8c9f9901d03377247a8d81cd93bf5b5cc16bc03 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 20 Dec 2024 17:30:58 +0200
Subject: [PATCH 3/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 8a952d64830..3f77f660b4d 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -525,7 +525,7 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
     if (ctx_set_flags_func != NULL) {
         if (!md->sync_memops_set) {
             /* Synchronize future DMA operations for all memory types */
-            status = UCT_CUDADRV_FUNC_LOG_ERR(
+            status = UCT_CUDADRV_FUNC_LOG_WARN(
                     ctx_set_flags_func(CU_CTX_SYNC_MEMOPS));
             if (status == UCS_OK) {
                 md->sync_memops_set = 1;
@@ -834,7 +834,7 @@ static uct_md_ops_t md_ops = {
 
 static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
 {
-    static ucs_status_t status = UCS_ERR_INVALID_ADDR;
+    static ucs_status_t status = UCS_ERR_LAST;
 
 #if CUDA_VERSION >= 12000
     CUdriverProcAddressQueryResult sym_status;
@@ -842,16 +842,18 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
 
     if (status == UCS_ERR_INVALID_ADDR) {
         pthread_mutex_lock(&lock);
-        cu_err = cuGetProcAddress("cuCtxSetFlags", (void**)&ctx_set_flags_func,
-                                  12010, CU_GET_PROC_ADDRESS_DEFAULT,
-                                  &sym_status);
-
-        if ((cu_err == CUDA_SUCCESS) &&
-            (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
-            status = UCS_OK;
-        } else {
-            ctx_set_flags_func = NULL;
-            status             = UCS_ERR_UNSUPPORTED;
+        if (status == UCS_ERR_INVALID_ADDR) {
+            cu_err = cuGetProcAddress("cuCtxSetFlags",
+                                      (void**)&ctx_set_flags_func, 12010,
+                                      CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
+
+            if ((cu_err == CUDA_SUCCESS) &&
+                (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
+                status = UCS_OK;
+            } else {
+                ctx_set_flags_func = NULL;
+                status             = UCS_ERR_UNSUPPORTED;
+            }
         }
 
         pthread_mutex_unlock(&lock);

From 3b43d299b8a76b8bb7fb19f9a3f18449165ee8c4 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 20 Dec 2024 16:25:17 +0000
Subject: [PATCH 4/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 3f77f660b4d..374a97760dc 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -534,6 +534,8 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 
         return;
     }
+#else
+    (void)ctx_set_flags_func;
 #endif
 
     /* Synchronize for DMA for legacy memory types*/

From 2161adf8410bd47f58c285288eeab1bc016d6c75 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 20 Dec 2024 17:16:43 +0000
Subject: [PATCH 5/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 374a97760dc..2657e4b288a 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -893,9 +893,12 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     md->granularity             = SIZE_MAX;
 
     status = uct_cuda_copy_md_check_is_ctx_set_flags_supported();
-    if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) {
-        ucs_warn("disabled fabric memory allocations as cuda driver "
-                 "library does not support cuCtxSetFlags()");
+    if (status != UCS_OK) {
+        if (md->config.enable_fabric == UCS_YES) {
+            ucs_warn("disabled fabric memory allocations as cuda driver "
+                     "library does not support cuCtxSetFlags()");
+        }
+
         md->config.enable_fabric = UCS_NO;
     }
 

From 656325344d803786d4e40bbcb60e4888db7b0c4b Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Mon, 6 Jan 2025 20:23:23 +0200
Subject: [PATCH 6/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 46 +++++++++++----------------
 1 file changed, 19 insertions(+), 27 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 2657e4b288a..dadf8bbb985 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -81,9 +81,7 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = {
     {NULL}
 };
 
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
-
-static CUresult (*ctx_set_flags_func)(unsigned);
+static CUresult (*uct_cuda_cuCtxSetFlags_func)(unsigned);
 
 static int uct_cuda_copy_md_is_dmabuf_supported()
 {
@@ -483,6 +481,7 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) {
 
 static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
 {
+    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
     static size_t total_bytes[UCT_CUDA_MAX_DEVICES];
     char dev_name[UCT_CUDA_DEV_NAME_MAX_LEN];
 
@@ -518,15 +517,14 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
 static void
 uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 {
-    unsigned value = 1;
-
-#if HAVE_CUDA_FABRIC
+    unsigned sync_memops_value = 1;
     ucs_status_t status;
-    if (ctx_set_flags_func != NULL) {
+
+    if (uct_cuda_cuCtxSetFlags_func != NULL) {
         if (!md->sync_memops_set) {
             /* Synchronize future DMA operations for all memory types */
             status = UCT_CUDADRV_FUNC_LOG_WARN(
-                    ctx_set_flags_func(CU_CTX_SYNC_MEMOPS));
+                    uct_cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
             if (status == UCS_OK) {
                 md->sync_memops_set = 1;
             }
@@ -534,13 +532,11 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 
         return;
     }
-#else
-    (void)ctx_set_flags_func;
-#endif
 
     /* Synchronize for DMA for legacy memory types*/
     UCT_CUDADRV_FUNC_LOG_WARN(
-            cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+            cuPointerSetAttribute(&sync_memops_value,
+                                  CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                   (CUdeviceptr)address));
 }
 
@@ -842,23 +838,19 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
     CUdriverProcAddressQueryResult sym_status;
     CUresult cu_err;
 
-    if (status == UCS_ERR_INVALID_ADDR) {
-        pthread_mutex_lock(&lock);
-        if (status == UCS_ERR_INVALID_ADDR) {
-            cu_err = cuGetProcAddress("cuCtxSetFlags",
-                                      (void**)&ctx_set_flags_func, 12010,
-                                      CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
+    if (status == UCS_ERR_LAST) {
+        cu_err = cuGetProcAddress("cuCtxSetFlags",
+                                  (void**)&uct_cuda_cuCtxSetFlags_func,
+                                  12010, CU_GET_PROC_ADDRESS_DEFAULT,
+                                  &sym_status);
 
-            if ((cu_err == CUDA_SUCCESS) &&
-                (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
-                status = UCS_OK;
-            } else {
-                ctx_set_flags_func = NULL;
-                status             = UCS_ERR_UNSUPPORTED;
-            }
+        if ((cu_err == CUDA_SUCCESS) &&
+            (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
+            status = UCS_OK;
+        } else {
+            uct_cuda_cuCtxSetFlags_func = NULL;
+            status                      = UCS_ERR_UNSUPPORTED;
         }
-
-        pthread_mutex_unlock(&lock);
     }
 #endif
 

From ff4313c9ead6dcdac11d88b1603bf98b1b001d7f Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 7 Jan 2025 09:02:46 +0000
Subject: [PATCH 7/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index dadf8bbb985..ee7532d477d 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -518,6 +518,8 @@ static void
 uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 {
     unsigned sync_memops_value = 1;
+
+#if HAVE_CUDA_FABRIC
     ucs_status_t status;
 
     if (uct_cuda_cuCtxSetFlags_func != NULL) {
@@ -532,6 +534,9 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 
         return;
     }
+#else
+    (void)uct_cuda_cuCtxSetFlags_func;
+#endif
 
     /* Synchronize for DMA for legacy memory types*/
     UCT_CUDADRV_FUNC_LOG_WARN(

From 2f5e5a5f20a00176768cae27d8b3fc7efe994b2d Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 7 Jan 2025 11:23:39 +0000
Subject: [PATCH 8/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index ee7532d477d..4c5a1766722 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -890,12 +890,16 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     md->granularity             = SIZE_MAX;
 
     status = uct_cuda_copy_md_check_is_ctx_set_flags_supported();
-    if (status != UCS_OK) {
+    if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) {
         if (md->config.enable_fabric == UCS_YES) {
-            ucs_warn("disabled fabric memory allocations as cuda driver "
-                     "library does not support cuCtxSetFlags()");
+            ucs_error("failed to enable fabric memory allocations as cuda "
+                      "driver library does not support cuCtxSetFlags()");
+            goto err_free_md;
         }
 
+        ucs_diag("disabled fabric memory allocations as cuda driver library "
+                 "does not support cuCtxSetFlags()");
+
         md->config.enable_fabric = UCS_NO;
     }
 

From f1601a384334681fe761f099c7b259cc29f95e6c Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 7 Jan 2025 15:52:02 +0000
Subject: [PATCH 9/9] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 31 ++++++++++++++-------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 4c5a1766722..b771cf8be17 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -835,15 +835,15 @@ static uct_md_ops_t md_ops = {
     .detect_memory_type = uct_cuda_copy_md_detect_memory_type
 };
 
-static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
+static int uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
 {
-    static ucs_status_t status = UCS_ERR_LAST;
-
 #if CUDA_VERSION >= 12000
+    static int is_supported = -1;
+
     CUdriverProcAddressQueryResult sym_status;
     CUresult cu_err;
 
-    if (status == UCS_ERR_LAST) {
+    if (is_supported < 0) {
         cu_err = cuGetProcAddress("cuCtxSetFlags",
                                   (void**)&uct_cuda_cuCtxSetFlags_func,
                                   12010, CU_GET_PROC_ADDRESS_DEFAULT,
@@ -851,15 +851,18 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
 
         if ((cu_err == CUDA_SUCCESS) &&
             (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
-            status = UCS_OK;
+            is_supported = 1;
         } else {
+            ucs_debug("cuda driver library does not support cuCtxSetFlags()");
             uct_cuda_cuCtxSetFlags_func = NULL;
-            status                      = UCS_ERR_UNSUPPORTED;
+            is_supported                = 0;
         }
     }
-#endif
 
-    return status;
+    return is_supported;
+#else
+    return 0;
+#endif
 }
 
 static ucs_status_t
@@ -889,17 +892,15 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     md->sync_memops_set         = 0;
     md->granularity             = SIZE_MAX;
 
-    status = uct_cuda_copy_md_check_is_ctx_set_flags_supported();
-    if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) {
+    if (!uct_cuda_copy_md_check_is_ctx_set_flags_supported() &&
+        (md->config.enable_fabric != UCS_NO)) {
         if (md->config.enable_fabric == UCS_YES) {
-            ucs_error("failed to enable fabric memory allocations as cuda "
-                      "driver library does not support cuCtxSetFlags()");
+            ucs_error("failed to enable fabric memory allocations");
+            status = UCS_ERR_UNSUPPORTED;
             goto err_free_md;
         }
 
-        ucs_diag("disabled fabric memory allocations as cuda driver library "
-                 "does not support cuCtxSetFlags()");
-
+        ucs_diag("disabled fabric memory allocations");
         md->config.enable_fabric = UCS_NO;
     }