diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index a185dde3779..b771cf8be17 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -81,6 +81,8 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = {
     {NULL}
 };
 
+static CUresult (*uct_cuda_cuCtxSetFlags_func)(unsigned); /* filled in by cuGetProcAddress(); NULL when not resolved */
+
 static int uct_cuda_copy_md_is_dmabuf_supported()
 {
     int dmabuf_supported = 0;
@@ -515,22 +517,32 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
 static void
 uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 {
+    unsigned sync_memops_value = 1; /* value passed to cuPointerSetAttribute() below */
+
 #if HAVE_CUDA_FABRIC
     ucs_status_t status;
-    if (!md->sync_memops_set) {
-        /* Synchronize future DMA operations for all memory types */
-        status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
-        if (status == UCS_OK) {
-            md->sync_memops_set = 1;
+
+    if (uct_cuda_cuCtxSetFlags_func != NULL) { /* use the context-wide flag only if the symbol was resolved */
+        if (!md->sync_memops_set) {
+            /* Synchronize future DMA operations for all memory types */
+            status = UCT_CUDADRV_FUNC_LOG_WARN(
+                    uct_cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
+            if (status == UCS_OK) {
+                md->sync_memops_set = 1; /* remember success so the flag is not set again for this md */
+            }
         }
+
+        return; /* context-flag path taken: skip the per-pointer attribute below */
     }
 #else
-    unsigned value = 1;
+    (void)uct_cuda_cuCtxSetFlags_func; /* presumably keeps the pointer "used" when fabric support is compiled out */
+#endif /* HAVE_CUDA_FABRIC */
+
     /* Synchronize for DMA for legacy memory types*/
     UCT_CUDADRV_FUNC_LOG_WARN(
-            cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+            cuPointerSetAttribute(&sync_memops_value,
+                                  CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                   (CUdeviceptr)address));
-#endif
 }
 
 static ucs_status_t
@@ -823,6 +835,36 @@ static uct_md_ops_t md_ops = {
     .detect_memory_type = uct_cuda_copy_md_detect_memory_type
 };
 
+static int uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) /* returns 1 if cuCtxSetFlags was resolved, 0 otherwise */
+{
+#if CUDA_VERSION >= 12000
+    static int is_supported = -1; /* cached probe result: -1 not probed yet, 0 unsupported, 1 supported */
+
+    CUdriverProcAddressQueryResult sym_status;
+    CUresult cu_err;
+
+    if (is_supported < 0) { /* probe the driver only on the first call */
+        cu_err = cuGetProcAddress("cuCtxSetFlags",
+                                  (void**)&uct_cuda_cuCtxSetFlags_func,
+                                  12010, CU_GET_PROC_ADDRESS_DEFAULT,
+                                  &sym_status);
+
+        if ((cu_err == CUDA_SUCCESS) &&
+            (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { /* both the call and the symbol query must succeed */
+            is_supported = 1;
+        } else {
+            ucs_debug("cuda driver library does not support cuCtxSetFlags()");
+            uct_cuda_cuCtxSetFlags_func = NULL; /* make sure callers see the symbol as unavailable */
+            is_supported = 0;
+        }
+    }
+
+    return is_supported;
+#else /* CUDA_VERSION < 12000 */
+    return 0;
+#endif /* CUDA_VERSION >= 12000 */
+}
+
 static ucs_status_t
 uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
                       const uct_md_config_t *md_config, uct_md_h *md_p)
@@ -850,6 +892,18 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     md->sync_memops_set = 0;
     md->granularity     = SIZE_MAX;
 
+    if (!uct_cuda_copy_md_check_is_ctx_set_flags_supported() &&
+        (md->config.enable_fabric != UCS_NO)) { /* fabric was not disabled but cuCtxSetFlags is unavailable */
+        if (md->config.enable_fabric == UCS_YES) { /* fabric explicitly requested: fail the open */
+            ucs_error("failed to enable fabric memory allocations");
+            status = UCS_ERR_UNSUPPORTED;
+            goto err_free_md;
+        }
+
+        ucs_diag("disabled fabric memory allocations");
+        md->config.enable_fabric = UCS_NO; /* any other setting: downgrade to "no" and continue */
+    }
+
     if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) &&
         (config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED)) {
         ucs_warn("wrong memory type for async memory allocations: \"%s\";"