openucx · tvegas1 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -81,6 +81,8 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = {
     {NULL}
 };
 
+static CUresult (*uct_cuda_cuCtxSetFlags_func)(unsigned); /* NULL: unresolved */
+
 static int uct_cuda_copy_md_is_dmabuf_supported()
 {
     int dmabuf_supported = 0;
@@ -515,22 +517,32 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
 static void
 uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 {
+    unsigned sync_memops_value = 1; /* passed to cuPointerSetAttribute() */
+
 #if HAVE_CUDA_FABRIC
     ucs_status_t status;
-    if (!md->sync_memops_set) {
-        /* Synchronize future DMA operations for all memory types */
-        status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
-        if (status == UCS_OK) {
-            md->sync_memops_set = 1;
+
+    if (uct_cuda_cuCtxSetFlags_func != NULL) {
+        if (!md->sync_memops_set) {
+            /* Synchronize future DMA operations for all memory types */
+            status = UCT_CUDADRV_FUNC_LOG_WARN(
+                    uct_cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
+            if (status == UCS_OK) {
+                md->sync_memops_set = 1; /* skip the call from now on */
+            }
         }
+
+        return; /* per-pointer attribute below is not set in this path */
     }
 #else
-    unsigned value = 1;
+    (void)uct_cuda_cuCtxSetFlags_func; /* no other use in this branch */
+#endif
+
     /* Synchronize for DMA for legacy memory types*/
     UCT_CUDADRV_FUNC_LOG_WARN(
-            cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+            cuPointerSetAttribute(&sync_memops_value,
+                                  CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                   (CUdeviceptr)address));
-#endif
 }
 
 static ucs_status_t
@@ -823,6 +835,33 @@ static uct_md_ops_t md_ops = {
     .detect_memory_type = uct_cuda_copy_md_detect_memory_type
 };
 
+/* Resolve the CUDA 12.1 cuCtxSetFlags() driver entry point once (cached).
+ * Returns UCS_OK if resolved, UCS_ERR_UNSUPPORTED otherwise. */
+static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
+{
+    static ucs_status_t status = UCS_ERR_LAST; /* sentinel: not probed yet */
+#if CUDA_VERSION >= 12000
+    CUdriverProcAddressQueryResult sym_status;
+    CUresult cu_err;
+
+    if (status == UCS_ERR_LAST) {
+        cu_err = cuGetProcAddress("cuCtxSetFlags",
+                                  (void**)&uct_cuda_cuCtxSetFlags_func,
+                                  12010, CU_GET_PROC_ADDRESS_DEFAULT,
+                                  &sym_status);
+        if ((cu_err == CUDA_SUCCESS) &&
+            (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
+            status = UCS_OK;
+        } else {
+            uct_cuda_cuCtxSetFlags_func = NULL;
+            status                      = UCS_ERR_UNSUPPORTED;
+        }
+    }
+#endif
+    /* Builds without the probe (CUDA < 12.0) must not leak the sentinel */
+    return (status == UCS_ERR_LAST) ? UCS_ERR_UNSUPPORTED : status;
+}
+
 static ucs_status_t
 uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
                       const uct_md_config_t *md_config, uct_md_h *md_p)
@@ -850,6 +889,20 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     md->sync_memops_set         = 0;
     md->granularity             = SIZE_MAX;
 
+    status = uct_cuda_copy_md_check_is_ctx_set_flags_supported();
+    if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) {
+        if (md->config.enable_fabric == UCS_YES) {
+            ucs_error("failed to enable fabric memory allocations as cuda "
+                      "driver library does not support cuCtxSetFlags()");
+            goto err_free_md;
+        }
+
+        ucs_diag("disabled fabric memory allocations as cuda driver library "
+                 "does not support cuCtxSetFlags()");
+
+        md->config.enable_fabric = UCS_NO;
+    }
+
     if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) &&
         (config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED)) {
         ucs_warn("wrong memory type for async memory allocations: \"%s\";"