openucx · tvegas1 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -304,6 +304,58 @@ uct_cuda_copy_mem_alloc_fabric(uct_cuda_copy_md_t *md,
     return UCS_ERR_NO_MEMORY;
 }
 
+typedef CUresult (*uct_cuda_cuCtxSetFlags_t)(unsigned);
+
+static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
+                                      const void *address, int is_vmm)
+{
+    unsigned sync_memops_value = 1;
+#if HAVE_CUDA_FABRIC
+    static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func =
+        (uct_cuda_cuCtxSetFlags_t)ucs_empty_function;
+    CUdriverProcAddressQueryResult sym_status;
+    CUresult cu_err;
+    ucs_status_t status;
+
+    if (md->sync_memops_set) {
+        return;
+    }
+
+    if (cuda_cuCtxSetFlags_func ==
+        (uct_cuda_cuCtxSetFlags_t)ucs_empty_function) {
+        cu_err = cuGetProcAddress("cuCtxSetFlags",
+                                  (void**)&cuda_cuCtxSetFlags_func, 12010,
+                                  CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
+        if ((cu_err != CUDA_SUCCESS) ||
+            (sym_status != CU_GET_PROC_ADDRESS_SUCCESS)) {
+            cuda_cuCtxSetFlags_func = NULL;
+        }
+    }
+
+    if (cuda_cuCtxSetFlags_func != NULL) {
+        /* Synchronize future DMA operations for all memory types */
+        status = UCT_CUDADRV_FUNC_LOG_WARN(
+                    cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
+        if (status == UCS_OK) {
+            md->sync_memops_set = 1;
+        }
+
+        return;
+    }
+
+    if (is_vmm) {
+        ucs_warn("failed to set sync_memops on CUDA VMM without "
+                 "cuCtxSetFlags() (address=%p)", address);
+    }
+#endif
+
+    /* Synchronize for DMA for legacy memory types */
+    UCT_CUDADRV_FUNC_LOG_WARN(
+            cuPointerSetAttribute(&sync_memops_value,
+                                  CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                  (CUdeviceptr)address));
+}
+
 static ucs_status_t
 uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p,
                         ucs_memory_type_t mem_type, unsigned flags,
@@ -379,6 +431,9 @@ uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p,
     }
 
 allocated:
+    uct_cuda_copy_sync_memops(md, (void *)alloc_handle->ptr,
+                              alloc_handle->is_vmm);
+
     *memh_p    = alloc_handle;
     *address_p = (void*)alloc_handle->ptr;
     *length_p  = alloc_handle->length;
@@ -512,27 +567,6 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
     return 1; /* return 1 byte to avoid division by zero */
 }
 
-static void
-uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
-{
-#if HAVE_CUDA_FABRIC
-    ucs_status_t status;
-    if (!md->sync_memops_set) {
-        /* Synchronize future DMA operations for all memory types */
-        status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
-        if (status == UCS_OK) {
-            md->sync_memops_set = 1;
-        }
-    }
-#else
-    unsigned value = 1;
-    /* Synchronize for DMA for legacy memory types*/
-    UCT_CUDADRV_FUNC_LOG_WARN(
-            cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
-                                  (CUdeviceptr)address));
-#endif
-}
-
 static ucs_status_t
 uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
                                   size_t length, ucs_memory_info_t *mem_info)
@@ -636,7 +670,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
         return UCS_ERR_NO_DEVICE;
     }
 
-    uct_cuda_copy_sync_memops(md, address);
+    uct_cuda_copy_sync_memops(md, address, is_vmm);
 
     /* Extending the registration range is disable by configuration */
     if (md->config.alloc_whole_reg == UCS_CONFIG_OFF) {