From 68a5f5170953a0d6f47661a5a4e84f99eb0b1df3 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 20 Dec 2024 09:36:46 +0000
Subject: [PATCH 01/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 60 +++++++++++++++++++--------
 src/uct/cuda/cuda_copy/cuda_copy_md.h |  2 +
 2 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index a185dde3779..fa2fde07681 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -29,6 +29,10 @@
 #define UCT_CUDA_DEV_NAME_MAX_LEN 64
 #define UCT_CUDA_MAX_DEVICES      32
 
+#define UCT_CUDA_VERSION_VMM     12030 /* for VMM: cuCtxSetFlags() >= cuda 12.1 */
+#define UCT_CUDA_MAJOR(_version) ((_version) / 1000)
+#define UCT_CUDA_MINOR(_version) (((_version) % 1000) / 10)
+
 
 static const char *uct_cuda_pref_loc[] = {
     [UCT_CUDA_PREF_LOC_CPU]  = "cpu",
@@ -515,22 +519,27 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
 static void
 uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 {
+    unsigned value = 1;
+
 #if HAVE_CUDA_FABRIC
     ucs_status_t status;
-    if (!md->sync_memops_set) {
-        /* Synchronize future DMA operations for all memory types */
-        status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
-        if (status == UCS_OK) {
-            md->sync_memops_set = 1;
+    if (md->config.cuda_ctx_set_flags) {
+        if (!md->sync_memops_set) {
+            /* Synchronize future DMA operations for all memory types */
+            status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
+            if (status == UCS_OK) {
+                md->sync_memops_set = 1;
+            }
         }
+
+        return;
     }
-#else
-    unsigned value = 1;
+#endif
+
     /* Synchronize for DMA for legacy memory types*/
     UCT_CUDADRV_FUNC_LOG_WARN(
             cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                   (CUdeviceptr)address));
-#endif
 }
 
 static ucs_status_t
@@ -830,7 +839,7 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     uct_cuda_copy_md_config_t *config = ucs_derived_of(md_config,
                                                        uct_cuda_copy_md_config_t);
     uct_cuda_copy_md_t *md;
-    int dmabuf_supported;
+    int dmabuf_supported, version;
     ucs_status_t status;
 
     md = ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t");
@@ -840,15 +849,30 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
         goto err;
     }
 
-    md->super.ops               = &md_ops;
-    md->super.component         = &uct_cuda_copy_component;
-    md->config.alloc_whole_reg  = config->alloc_whole_reg;
-    md->config.max_reg_ratio    = config->max_reg_ratio;
-    md->config.pref_loc         = config->pref_loc;
-    md->config.enable_fabric    = config->enable_fabric;
-    md->config.dmabuf_supported = 0;
-    md->sync_memops_set         = 0;
-    md->granularity             = SIZE_MAX;
+    md->super.ops                 = &md_ops;
+    md->super.component           = &uct_cuda_copy_component;
+    md->config.alloc_whole_reg    = config->alloc_whole_reg;
+    md->config.max_reg_ratio      = config->max_reg_ratio;
+    md->config.pref_loc           = config->pref_loc;
+    md->config.enable_fabric      = config->enable_fabric;
+    md->config.dmabuf_supported   = 0;
+    md->config.cuda_ctx_set_flags = 1;
+    md->sync_memops_set           = 0;
+    md->granularity               = SIZE_MAX;
+
+    if ((cuDriverGetVersion(&version) == CUDA_SUCCESS) &&
+        (version < UCT_CUDA_VERSION_VMM)) {
+        if (md->config.enable_fabric != UCS_NO) {
+            ucs_warn("disabled fabric memory allocations as cuda driver "
+                     "library %d.%d < %d.%d",
+                     UCT_CUDA_MAJOR(version), UCT_CUDA_MINOR(version),
+                     UCT_CUDA_MAJOR(UCT_CUDA_VERSION_VMM),
+                     UCT_CUDA_MINOR(UCT_CUDA_VERSION_VMM));
+        }
+
+        md->config.enable_fabric      = UCS_NO;
+        md->config.cuda_ctx_set_flags = 0;
+    }
 
     if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) &&
         (config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED)) {
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h
index e14aff739e5..0176a10801a 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.h
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h
@@ -36,6 +36,8 @@ typedef struct uct_cuda_copy_md {
         ucs_ternary_auto_value_t enable_fabric;
         uct_cuda_pref_loc_t      pref_loc;
         int                      cuda_async_managed;
+        int                      cuda_ctx_set_flags; /* missing cuCtxSetFlags()
+                                                        below CUDA 12.1 */
     } config;
 } uct_cuda_copy_md_t;
 

From 9fc443096ce3271029a689ae501ff45163632a4d Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 20 Dec 2024 13:53:51 +0000
Subject: [PATCH 02/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 83 +++++++++++++++++----------
 src/uct/cuda/cuda_copy/cuda_copy_md.h |  2 -
 2 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index fa2fde07681..8a952d64830 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -29,10 +29,6 @@
 #define UCT_CUDA_DEV_NAME_MAX_LEN 64
 #define UCT_CUDA_MAX_DEVICES      32
 
-#define UCT_CUDA_VERSION_VMM     12030 /* for VMM: cuCtxSetFlags() >= cuda 12.1 */
-#define UCT_CUDA_MAJOR(_version) ((_version) / 1000)
-#define UCT_CUDA_MINOR(_version) (((_version) % 1000) / 10)
-
 
 static const char *uct_cuda_pref_loc[] = {
     [UCT_CUDA_PREF_LOC_CPU]  = "cpu",
@@ -85,6 +81,10 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = {
     {NULL}
 };
 
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
+static CUresult (*ctx_set_flags_func)(unsigned);
+
 static int uct_cuda_copy_md_is_dmabuf_supported()
 {
     int dmabuf_supported = 0;
@@ -483,7 +483,6 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) {
 
 static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
 {
-    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
     static size_t total_bytes[UCT_CUDA_MAX_DEVICES];
     char dev_name[UCT_CUDA_DEV_NAME_MAX_LEN];
 
@@ -523,10 +522,11 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 
 #if HAVE_CUDA_FABRIC
     ucs_status_t status;
-    if (md->config.cuda_ctx_set_flags) {
+    if (ctx_set_flags_func != NULL) {
         if (!md->sync_memops_set) {
             /* Synchronize future DMA operations for all memory types */
-            status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
+            status = UCT_CUDADRV_FUNC_LOG_ERR(
+                    ctx_set_flags_func(CU_CTX_SYNC_MEMOPS));
             if (status == UCS_OK) {
                 md->sync_memops_set = 1;
             }
@@ -832,6 +832,35 @@ static uct_md_ops_t md_ops = {
     .detect_memory_type = uct_cuda_copy_md_detect_memory_type
 };
 
+static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
+{
+    static ucs_status_t status = UCS_ERR_INVALID_ADDR;
+
+#if CUDA_VERSION >= 12000
+    CUdriverProcAddressQueryResult sym_status;
+    CUresult cu_err;
+
+    if (status == UCS_ERR_INVALID_ADDR) {
+        pthread_mutex_lock(&lock);
+        cu_err = cuGetProcAddress("cuCtxSetFlags", (void**)&ctx_set_flags_func,
+                                  12010, CU_GET_PROC_ADDRESS_DEFAULT,
+                                  &sym_status);
+
+        if ((cu_err == CUDA_SUCCESS) &&
+            (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
+            status = UCS_OK;
+        } else {
+            ctx_set_flags_func = NULL;
+            status             = UCS_ERR_UNSUPPORTED;
+        }
+
+        pthread_mutex_unlock(&lock);
+    }
+#endif
+
+    return status;
+}
+
 static ucs_status_t
 uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
                       const uct_md_config_t *md_config, uct_md_h *md_p)
@@ -839,7 +868,7 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     uct_cuda_copy_md_config_t *config = ucs_derived_of(md_config,
                                                        uct_cuda_copy_md_config_t);
     uct_cuda_copy_md_t *md;
-    int dmabuf_supported, version;
+    int dmabuf_supported;
     ucs_status_t status;
 
     md = ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t");
@@ -849,29 +878,21 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
         goto err;
     }
 
-    md->super.ops                 = &md_ops;
-    md->super.component           = &uct_cuda_copy_component;
-    md->config.alloc_whole_reg    = config->alloc_whole_reg;
-    md->config.max_reg_ratio      = config->max_reg_ratio;
-    md->config.pref_loc           = config->pref_loc;
-    md->config.enable_fabric      = config->enable_fabric;
-    md->config.dmabuf_supported   = 0;
-    md->config.cuda_ctx_set_flags = 1;
-    md->sync_memops_set           = 0;
-    md->granularity               = SIZE_MAX;
-
-    if ((cuDriverGetVersion(&version) == CUDA_SUCCESS) &&
-        (version < UCT_CUDA_VERSION_VMM)) {
-        if (md->config.enable_fabric != UCS_NO) {
-            ucs_warn("disabled fabric memory allocations as cuda driver "
-                     "library %d.%d < %d.%d",
-                     UCT_CUDA_MAJOR(version), UCT_CUDA_MINOR(version),
-                     UCT_CUDA_MAJOR(UCT_CUDA_VERSION_VMM),
-                     UCT_CUDA_MINOR(UCT_CUDA_VERSION_VMM));
-        }
-
-        md->config.enable_fabric      = UCS_NO;
-        md->config.cuda_ctx_set_flags = 0;
+    md->super.ops               = &md_ops;
+    md->super.component         = &uct_cuda_copy_component;
+    md->config.alloc_whole_reg  = config->alloc_whole_reg;
+    md->config.max_reg_ratio    = config->max_reg_ratio;
+    md->config.pref_loc         = config->pref_loc;
+    md->config.enable_fabric    = config->enable_fabric;
+    md->config.dmabuf_supported = 0;
+    md->sync_memops_set         = 0;
+    md->granularity             = SIZE_MAX;
+
+    status = uct_cuda_copy_md_check_is_ctx_set_flags_supported();
+    if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) {
+        ucs_warn("disabled fabric memory allocations as cuda driver "
+                 "library does not support cuCtxSetFlags()");
+        md->config.enable_fabric = UCS_NO;
     }
 
     if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) &&
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h
index 0176a10801a..e14aff739e5 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.h
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h
@@ -36,8 +36,6 @@ typedef struct uct_cuda_copy_md {
         ucs_ternary_auto_value_t enable_fabric;
         uct_cuda_pref_loc_t      pref_loc;
         int                      cuda_async_managed;
-        int                      cuda_ctx_set_flags; /* missing cuCtxSetFlags()
-                                                        below CUDA 12.1 */
     } config;
 } uct_cuda_copy_md_t;
 

From e8c9f9901d03377247a8d81cd93bf5b5cc16bc03 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 20 Dec 2024 17:30:58 +0200
Subject: [PATCH 03/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 8a952d64830..3f77f660b4d 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -525,7 +525,7 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
     if (ctx_set_flags_func != NULL) {
         if (!md->sync_memops_set) {
             /* Synchronize future DMA operations for all memory types */
-            status = UCT_CUDADRV_FUNC_LOG_ERR(
+            status = UCT_CUDADRV_FUNC_LOG_WARN(
                     ctx_set_flags_func(CU_CTX_SYNC_MEMOPS));
             if (status == UCS_OK) {
                 md->sync_memops_set = 1;
@@ -834,7 +834,7 @@ static uct_md_ops_t md_ops = {
 
 static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
 {
-    static ucs_status_t status = UCS_ERR_INVALID_ADDR;
+    static ucs_status_t status = UCS_ERR_LAST;
 
 #if CUDA_VERSION >= 12000
     CUdriverProcAddressQueryResult sym_status;
@@ -842,16 +842,18 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
 
     if (status == UCS_ERR_INVALID_ADDR) {
         pthread_mutex_lock(&lock);
-        cu_err = cuGetProcAddress("cuCtxSetFlags", (void**)&ctx_set_flags_func,
-                                  12010, CU_GET_PROC_ADDRESS_DEFAULT,
-                                  &sym_status);
-
-        if ((cu_err == CUDA_SUCCESS) &&
-            (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
-            status = UCS_OK;
-        } else {
-            ctx_set_flags_func = NULL;
-            status             = UCS_ERR_UNSUPPORTED;
+        if (status == UCS_ERR_INVALID_ADDR) {
+            cu_err = cuGetProcAddress("cuCtxSetFlags",
+                                      (void**)&ctx_set_flags_func, 12010,
+                                      CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
+
+            if ((cu_err == CUDA_SUCCESS) &&
+                (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
+                status = UCS_OK;
+            } else {
+                ctx_set_flags_func = NULL;
+                status             = UCS_ERR_UNSUPPORTED;
+            }
         }
 
         pthread_mutex_unlock(&lock);

From 3b43d299b8a76b8bb7fb19f9a3f18449165ee8c4 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 20 Dec 2024 16:25:17 +0000
Subject: [PATCH 04/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 3f77f660b4d..374a97760dc 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -534,6 +534,8 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 
         return;
     }
+#else
+    (void)ctx_set_flags_func;
 #endif
 
     /* Synchronize for DMA for legacy memory types*/

From 2161adf8410bd47f58c285288eeab1bc016d6c75 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 20 Dec 2024 17:16:43 +0000
Subject: [PATCH 05/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 374a97760dc..2657e4b288a 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -893,9 +893,12 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     md->granularity             = SIZE_MAX;
 
     status = uct_cuda_copy_md_check_is_ctx_set_flags_supported();
-    if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) {
-        ucs_warn("disabled fabric memory allocations as cuda driver "
-                 "library does not support cuCtxSetFlags()");
+    if (status != UCS_OK) {
+        if (md->config.enable_fabric == UCS_YES) {
+            ucs_warn("disabled fabric memory allocations as cuda driver "
+                     "library does not support cuCtxSetFlags()");
+        }
+
         md->config.enable_fabric = UCS_NO;
     }
 

From 656325344d803786d4e40bbcb60e4888db7b0c4b Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Mon, 6 Jan 2025 20:23:23 +0200
Subject: [PATCH 06/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 46 +++++++++++----------------
 1 file changed, 19 insertions(+), 27 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 2657e4b288a..dadf8bbb985 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -81,9 +81,7 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = {
     {NULL}
 };
 
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
-
-static CUresult (*ctx_set_flags_func)(unsigned);
+static CUresult (*uct_cuda_cuCtxSetFlags_func)(unsigned);
 
 static int uct_cuda_copy_md_is_dmabuf_supported()
 {
@@ -483,6 +481,7 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) {
 
 static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
 {
+    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
     static size_t total_bytes[UCT_CUDA_MAX_DEVICES];
     char dev_name[UCT_CUDA_DEV_NAME_MAX_LEN];
 
@@ -518,15 +517,14 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
 static void
 uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 {
-    unsigned value = 1;
-
-#if HAVE_CUDA_FABRIC
+    unsigned sync_memops_value = 1;
     ucs_status_t status;
-    if (ctx_set_flags_func != NULL) {
+
+    if (uct_cuda_cuCtxSetFlags_func != NULL) {
         if (!md->sync_memops_set) {
             /* Synchronize future DMA operations for all memory types */
             status = UCT_CUDADRV_FUNC_LOG_WARN(
-                    ctx_set_flags_func(CU_CTX_SYNC_MEMOPS));
+                    uct_cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
             if (status == UCS_OK) {
                 md->sync_memops_set = 1;
             }
@@ -534,13 +532,11 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 
         return;
     }
-#else
-    (void)ctx_set_flags_func;
-#endif
 
     /* Synchronize for DMA for legacy memory types*/
     UCT_CUDADRV_FUNC_LOG_WARN(
-            cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+            cuPointerSetAttribute(&sync_memops_value,
+                                  CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                   (CUdeviceptr)address));
 }
 
@@ -842,23 +838,19 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
     CUdriverProcAddressQueryResult sym_status;
     CUresult cu_err;
 
-    if (status == UCS_ERR_INVALID_ADDR) {
-        pthread_mutex_lock(&lock);
-        if (status == UCS_ERR_INVALID_ADDR) {
-            cu_err = cuGetProcAddress("cuCtxSetFlags",
-                                      (void**)&ctx_set_flags_func, 12010,
-                                      CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
+    if (status == UCS_ERR_LAST) {
+        cu_err = cuGetProcAddress("cuCtxSetFlags",
+                                  (void**)&uct_cuda_cuCtxSetFlags_func,
+                                  12010, CU_GET_PROC_ADDRESS_DEFAULT,
+                                  &sym_status);
 
-            if ((cu_err == CUDA_SUCCESS) &&
-                (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
-                status = UCS_OK;
-            } else {
-                ctx_set_flags_func = NULL;
-                status             = UCS_ERR_UNSUPPORTED;
-            }
+        if ((cu_err == CUDA_SUCCESS) &&
+            (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
+            status = UCS_OK;
+        } else {
+            uct_cuda_cuCtxSetFlags_func = NULL;
+            status                      = UCS_ERR_UNSUPPORTED;
         }
-
-        pthread_mutex_unlock(&lock);
     }
 #endif
 

From ff4313c9ead6dcdac11d88b1603bf98b1b001d7f Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 7 Jan 2025 09:02:46 +0000
Subject: [PATCH 07/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index dadf8bbb985..ee7532d477d 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -518,6 +518,8 @@ static void
 uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 {
     unsigned sync_memops_value = 1;
+
+#if HAVE_CUDA_FABRIC
     ucs_status_t status;
 
     if (uct_cuda_cuCtxSetFlags_func != NULL) {
@@ -532,6 +534,9 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 
         return;
     }
+#else
+    (void)uct_cuda_cuCtxSetFlags_func;
+#endif
 
     /* Synchronize for DMA for legacy memory types*/
     UCT_CUDADRV_FUNC_LOG_WARN(

From 2f5e5a5f20a00176768cae27d8b3fc7efe994b2d Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 7 Jan 2025 11:23:39 +0000
Subject: [PATCH 08/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index ee7532d477d..4c5a1766722 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -890,12 +890,16 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     md->granularity             = SIZE_MAX;
 
     status = uct_cuda_copy_md_check_is_ctx_set_flags_supported();
-    if (status != UCS_OK) {
+    if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) {
         if (md->config.enable_fabric == UCS_YES) {
-            ucs_warn("disabled fabric memory allocations as cuda driver "
-                     "library does not support cuCtxSetFlags()");
+            ucs_error("failed to enable fabric memory allocations as cuda "
+                      "driver library does not support cuCtxSetFlags()");
+            goto err_free_md;
         }
 
+        ucs_diag("disabled fabric memory allocations as cuda driver library "
+                 "does not support cuCtxSetFlags()");
+
         md->config.enable_fabric = UCS_NO;
     }
 

From f1601a384334681fe761f099c7b259cc29f95e6c Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 7 Jan 2025 15:52:02 +0000
Subject: [PATCH 09/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 31 ++++++++++++++-------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 4c5a1766722..b771cf8be17 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -835,15 +835,15 @@ static uct_md_ops_t md_ops = {
     .detect_memory_type = uct_cuda_copy_md_detect_memory_type
 };
 
-static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
+static int uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
 {
-    static ucs_status_t status = UCS_ERR_LAST;
-
 #if CUDA_VERSION >= 12000
+    static int is_supported = -1;
+
     CUdriverProcAddressQueryResult sym_status;
     CUresult cu_err;
 
-    if (status == UCS_ERR_LAST) {
+    if (is_supported < 0) {
         cu_err = cuGetProcAddress("cuCtxSetFlags",
                                   (void**)&uct_cuda_cuCtxSetFlags_func,
                                   12010, CU_GET_PROC_ADDRESS_DEFAULT,
@@ -851,15 +851,18 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
 
         if ((cu_err == CUDA_SUCCESS) &&
             (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
-            status = UCS_OK;
+            is_supported = 1;
         } else {
+            ucs_debug("cuda driver library does not support cuCtxSetFlags()");
             uct_cuda_cuCtxSetFlags_func = NULL;
-            status                      = UCS_ERR_UNSUPPORTED;
+            is_supported                = 0;
         }
     }
-#endif
 
-    return status;
+    return is_supported;
+#else
+    return 0;
+#endif
 }
 
 static ucs_status_t
@@ -889,17 +892,15 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     md->sync_memops_set         = 0;
     md->granularity             = SIZE_MAX;
 
-    status = uct_cuda_copy_md_check_is_ctx_set_flags_supported();
-    if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) {
+    if (!uct_cuda_copy_md_check_is_ctx_set_flags_supported() &&
+        (md->config.enable_fabric != UCS_NO)) {
         if (md->config.enable_fabric == UCS_YES) {
-            ucs_error("failed to enable fabric memory allocations as cuda "
-                      "driver library does not support cuCtxSetFlags()");
+            ucs_error("failed to enable fabric memory allocations");
+            status = UCS_ERR_UNSUPPORTED;
             goto err_free_md;
         }
 
-        ucs_diag("disabled fabric memory allocations as cuda driver library "
-                 "does not support cuCtxSetFlags()");
-
+        ucs_diag("disabled fabric memory allocations");
         md->config.enable_fabric = UCS_NO;
     }
 

From 8657d5487306916a1802e2f2d2df4253c0f5a5d1 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 10 Jan 2025 09:40:42 +0000
Subject: [PATCH 10/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 83 ++++++++++-----------------
 1 file changed, 30 insertions(+), 53 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index b771cf8be17..9b03f426689 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -81,8 +81,6 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = {
     {NULL}
 };
 
-static CUresult (*uct_cuda_cuCtxSetFlags_func)(unsigned);
-
 static int uct_cuda_copy_md_is_dmabuf_supported()
 {
     int dmabuf_supported = 0;
@@ -514,19 +512,36 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
     return 1; /* return 1 byte to avoid division by zero */
 }
 
-static void
-uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
+typedef CUresult (*uct_cuda_cuCtxSetFlags_t)(unsigned);
+
+static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
+                                      const void *address, int is_vmm)
 {
     unsigned sync_memops_value = 1;
 
-#if HAVE_CUDA_FABRIC
+#if CUDA_VERSION >= 12000
+    static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func =
+            (uct_cuda_cuCtxSetFlags_t)ucs_empty_function;
+    CUdriverProcAddressQueryResult sym_status;
+    CUresult cu_err;
     ucs_status_t status;
 
-    if (uct_cuda_cuCtxSetFlags_func != NULL) {
+    if (cuda_cuCtxSetFlags_func ==
+        (uct_cuda_cuCtxSetFlags_t)ucs_empty_function) {
+        cu_err = cuGetProcAddress("cuCtxSetFlags",
+                                  (void**)&cuda_cuCtxSetFlags_func, 12010,
+                                  CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
+        if ((cu_err != CUDA_SUCCESS) ||
+            (sym_status != CU_GET_PROC_ADDRESS_SUCCESS)) {
+            cuda_cuCtxSetFlags_func = NULL;
+        }
+    }
+
+    if (cuda_cuCtxSetFlags_func != NULL) {
         if (!md->sync_memops_set) {
             /* Synchronize future DMA operations for all memory types */
             status = UCT_CUDADRV_FUNC_LOG_WARN(
-                    uct_cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
+                    cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
             if (status == UCS_OK) {
                 md->sync_memops_set = 1;
             }
@@ -534,11 +549,15 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
 
         return;
     }
-#else
-    (void)uct_cuda_cuCtxSetFlags_func;
 #endif
 
-    /* Synchronize for DMA for legacy memory types*/
+    if (is_vmm) {
+        ucs_fatal("failed to set sync_memops on CUDA VMM without "
+                  "cuCtxSetFlags() (address=%p)",
+                  address);
+    }
+
+    /* Synchronize for DMA for legacy memory types */
     UCT_CUDADRV_FUNC_LOG_WARN(
             cuPointerSetAttribute(&sync_memops_value,
                                   CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
@@ -648,7 +667,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
         return UCS_ERR_NO_DEVICE;
     }
 
-    uct_cuda_copy_sync_memops(md, address);
+    uct_cuda_copy_sync_memops(md, address, is_vmm);
 
     /* Extending the registration range is disable by configuration */
     if (md->config.alloc_whole_reg == UCS_CONFIG_OFF) {
@@ -835,36 +854,6 @@ static uct_md_ops_t md_ops = {
     .detect_memory_type = uct_cuda_copy_md_detect_memory_type
 };
 
-static int uct_cuda_copy_md_check_is_ctx_set_flags_supported(void)
-{
-#if CUDA_VERSION >= 12000
-    static int is_supported = -1;
-
-    CUdriverProcAddressQueryResult sym_status;
-    CUresult cu_err;
-
-    if (is_supported < 0) {
-        cu_err = cuGetProcAddress("cuCtxSetFlags",
-                                  (void**)&uct_cuda_cuCtxSetFlags_func,
-                                  12010, CU_GET_PROC_ADDRESS_DEFAULT,
-                                  &sym_status);
-
-        if ((cu_err == CUDA_SUCCESS) &&
-            (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) {
-            is_supported = 1;
-        } else {
-            ucs_debug("cuda driver library does not support cuCtxSetFlags()");
-            uct_cuda_cuCtxSetFlags_func = NULL;
-            is_supported                = 0;
-        }
-    }
-
-    return is_supported;
-#else
-    return 0;
-#endif
-}
-
 static ucs_status_t
 uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
                       const uct_md_config_t *md_config, uct_md_h *md_p)
@@ -892,18 +881,6 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
     md->sync_memops_set         = 0;
     md->granularity             = SIZE_MAX;
 
-    if (!uct_cuda_copy_md_check_is_ctx_set_flags_supported() &&
-        (md->config.enable_fabric != UCS_NO)) {
-        if (md->config.enable_fabric == UCS_YES) {
-            ucs_error("failed to enable fabric memory allocations");
-            status = UCS_ERR_UNSUPPORTED;
-            goto err_free_md;
-        }
-
-        ucs_diag("disabled fabric memory allocations");
-        md->config.enable_fabric = UCS_NO;
-    }
-
     if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) &&
         (config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED)) {
         ucs_warn("wrong memory type for async memory allocations: \"%s\";"

From 0c27f3179144cb8003629f224cda200dcd8a66b0 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Fri, 10 Jan 2025 11:09:37 +0000
Subject: [PATCH 11/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 9b03f426689..b8892b2f0d8 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -519,7 +519,7 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
 {
     unsigned sync_memops_value = 1;
 
-#if CUDA_VERSION >= 12000
+#if HAVE_CUDA_FABRIC && (CUDA_VERSION >= 12000)
     static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func =
             (uct_cuda_cuCtxSetFlags_t)ucs_empty_function;
     CUdriverProcAddressQueryResult sym_status;

From eb0d1fc84c1433b0956008cd31be181c2f319b1e Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Mon, 13 Jan 2025 08:59:59 +0000
Subject: [PATCH 12/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index b8892b2f0d8..9ed47bd3e7d 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -518,13 +518,16 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
                                       const void *address, int is_vmm)
 {
     unsigned sync_memops_value = 1;
-
-#if HAVE_CUDA_FABRIC && (CUDA_VERSION >= 12000)
-    static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func =
-            (uct_cuda_cuCtxSetFlags_t)ucs_empty_function;
+#if HAVE_CUDA_FABRIC
     CUdriverProcAddressQueryResult sym_status;
     CUresult cu_err;
     ucs_status_t status;
+    uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func =
+        (uct_cuda_cuCtxSetFlags_t)ucs_empty_function;
+
+    if (md->sync_memops_set) {
+        return;
+    }
 
     if (cuda_cuCtxSetFlags_func ==
         (uct_cuda_cuCtxSetFlags_t)ucs_empty_function) {
@@ -538,13 +541,11 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
     }
 
     if (cuda_cuCtxSetFlags_func != NULL) {
-        if (!md->sync_memops_set) {
-            /* Synchronize future DMA operations for all memory types */
-            status = UCT_CUDADRV_FUNC_LOG_WARN(
+        /* Synchronize future DMA operations for all memory types */
+        status = UCT_CUDADRV_FUNC_LOG_WARN(
                     cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
-            if (status == UCS_OK) {
-                md->sync_memops_set = 1;
-            }
+        if (status == UCS_OK) {
+            md->sync_memops_set = 1;
         }
 
         return;
@@ -553,8 +554,7 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
 
     if (is_vmm) {
         ucs_fatal("failed to set sync_memops on CUDA VMM without "
-                  "cuCtxSetFlags() (address=%p)",
-                  address);
+                  "cuCtxSetFlags() (address=%p)", address);
     }
 
     /* Synchronize for DMA for legacy memory types */

From fe0370b82a6c3ec680500c0882d5ad639762f539 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Mon, 13 Jan 2025 12:37:39 +0000
Subject: [PATCH 13/20] UCT/CUDA: Set sync_memops when allocating memory

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 107 +++++++++++++-------------
 1 file changed, 55 insertions(+), 52 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 9ed47bd3e7d..9232bd27fa7 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -304,6 +304,58 @@ uct_cuda_copy_mem_alloc_fabric(uct_cuda_copy_md_t *md,
     return UCS_ERR_NO_MEMORY;
 }
 
+typedef CUresult (*uct_cuda_cuCtxSetFlags_t)(unsigned);
+
+static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
+                                      const void *address, int is_vmm)
+{
+    unsigned sync_memops_value = 1;
+#if HAVE_CUDA_FABRIC
+    static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func =
+        (uct_cuda_cuCtxSetFlags_t)ucs_empty_function;
+    CUdriverProcAddressQueryResult sym_status;
+    CUresult cu_err;
+    ucs_status_t status;
+
+    if (md->sync_memops_set) {
+        return;
+    }
+
+    if (cuda_cuCtxSetFlags_func ==
+        (uct_cuda_cuCtxSetFlags_t)ucs_empty_function) {
+        cu_err = cuGetProcAddress("cuCtxSetFlags",
+                                  (void**)&cuda_cuCtxSetFlags_func, 12010,
+                                  CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
+        if ((cu_err != CUDA_SUCCESS) ||
+            (sym_status != CU_GET_PROC_ADDRESS_SUCCESS)) {
+            cuda_cuCtxSetFlags_func = NULL;
+        }
+    }
+
+    if (cuda_cuCtxSetFlags_func != NULL) {
+        /* Synchronize future DMA operations for all memory types */
+        status = UCT_CUDADRV_FUNC_LOG_WARN(
+                    cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
+        if (status == UCS_OK) {
+            md->sync_memops_set = 1;
+        }
+
+        return;
+    }
+
+    if (is_vmm) {
+        ucs_fatal("failed to set sync_memops on CUDA VMM without "
+                  "cuCtxSetFlags() (address=%p)", address);
+    }
+#endif
+
+    /* Synchronize for DMA for legacy memory types */
+    UCT_CUDADRV_FUNC_LOG_WARN(
+            cuPointerSetAttribute(&sync_memops_value,
+                                  CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                  (CUdeviceptr)address));
+}
+
 static ucs_status_t
 uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p,
                         ucs_memory_type_t mem_type, unsigned flags,
@@ -379,6 +431,9 @@ uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p,
     }
 
 allocated:
+    uct_cuda_copy_sync_memops(md, (void *)alloc_handle->ptr,
+                              alloc_handle->is_vmm);
+
     *memh_p    = alloc_handle;
     *address_p = (void*)alloc_handle->ptr;
     *length_p  = alloc_handle->length;
@@ -512,58 +567,6 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
     return 1; /* return 1 byte to avoid division by zero */
 }
 
-typedef CUresult (*uct_cuda_cuCtxSetFlags_t)(unsigned);
-
-static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
-                                      const void *address, int is_vmm)
-{
-    unsigned sync_memops_value = 1;
-#if HAVE_CUDA_FABRIC
-    CUdriverProcAddressQueryResult sym_status;
-    CUresult cu_err;
-    ucs_status_t status;
-    uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func =
-        (uct_cuda_cuCtxSetFlags_t)ucs_empty_function;
-
-    if (md->sync_memops_set) {
-        return;
-    }
-
-    if (cuda_cuCtxSetFlags_func ==
-        (uct_cuda_cuCtxSetFlags_t)ucs_empty_function) {
-        cu_err = cuGetProcAddress("cuCtxSetFlags",
-                                  (void**)&cuda_cuCtxSetFlags_func, 12010,
-                                  CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
-        if ((cu_err != CUDA_SUCCESS) ||
-            (sym_status != CU_GET_PROC_ADDRESS_SUCCESS)) {
-            cuda_cuCtxSetFlags_func = NULL;
-        }
-    }
-
-    if (cuda_cuCtxSetFlags_func != NULL) {
-        /* Synchronize future DMA operations for all memory types */
-        status = UCT_CUDADRV_FUNC_LOG_WARN(
-                    cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
-        if (status == UCS_OK) {
-            md->sync_memops_set = 1;
-        }
-
-        return;
-    }
-#endif
-
-    if (is_vmm) {
-        ucs_fatal("failed to set sync_memops on CUDA VMM without "
-                  "cuCtxSetFlags() (address=%p)", address);
-    }
-
-    /* Synchronize for DMA for legacy memory types */
-    UCT_CUDADRV_FUNC_LOG_WARN(
-            cuPointerSetAttribute(&sync_memops_value,
-                                  CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
-                                  (CUdeviceptr)address));
-}
-
 static ucs_status_t
 uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
                                   size_t length, ucs_memory_info_t *mem_info)

From ab3d0c740e98d0a5d9641b580c5a8d7cf6b7dda2 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Mon, 13 Jan 2025 12:40:11 +0000
Subject: [PATCH 14/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 9232bd27fa7..05aa7f15ff0 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -344,8 +344,8 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
     }
 
     if (is_vmm) {
-        ucs_fatal("failed to set sync_memops on CUDA VMM without "
-                  "cuCtxSetFlags() (address=%p)", address);
+        ucs_warn("failed to set sync_memops on CUDA VMM without "
+                 "cuCtxSetFlags() (address=%p)", address);
     }
 #endif
 

From a0004c48dcc164204048ce4e643710e9030cab98 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Mon, 13 Jan 2025 16:59:05 +0000
Subject: [PATCH 15/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 05aa7f15ff0..7d342816caa 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -342,12 +342,12 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
 
         return;
     }
+#endif
 
     if (is_vmm) {
-        ucs_warn("failed to set sync_memops on CUDA VMM without "
-                 "cuCtxSetFlags() (address=%p)", address);
+        ucs_fatal("failed to set sync_memops on CUDA VMM without "
+                  "cuCtxSetFlags() (address=%p)", address);
     }
-#endif
 
     /* Synchronize for DMA for legacy memory types */
     UCT_CUDADRV_FUNC_LOG_WARN(
@@ -469,7 +469,6 @@ static int uct_cuda_copy_detect_vmm(void *address,
                                     ucs_memory_type_t *vmm_mem_type,
                                     CUdevice *cuda_device)
 {
-#if HAVE_CUDA_FABRIC
     ucs_status_t status      = UCS_OK;
     CUmemAllocationProp prop = {};
     CUmemGenericAllocationHandle alloc_handle;
@@ -504,9 +503,6 @@ static int uct_cuda_copy_detect_vmm(void *address,
 err:
     UCT_CUDADRV_FUNC_LOG_DEBUG(cuMemRelease(alloc_handle));
     return 1;
-#else
-    return 0;
-#endif
 }
 
 static ucs_status_t uct_cuda_copy_mem_free(uct_md_h md, uct_mem_h memh)

From 81d47f0d391eb29bab577587fd52f14dec681a7a Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 14 Jan 2025 08:18:42 +0000
Subject: [PATCH 16/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 7d342816caa..d318b2b45e4 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -491,12 +491,15 @@ static int uct_cuda_copy_detect_vmm(void *address,
     }
 
     *cuda_device = (CUdevice)prop.location.id;
+#if CUDA_VERSION >= 12020
     if ((prop.location.type == CU_MEM_LOCATION_TYPE_HOST) ||
         (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) ||
         (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) {
         /* TODO: Marking as CUDA to allow cuda_ipc access vmm for now */
         *vmm_mem_type = UCS_MEMORY_TYPE_CUDA;
-    } else if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) {
+    } else
+#endif
+    if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) {
         *vmm_mem_type = UCS_MEMORY_TYPE_CUDA;
     }
 

From edc00284dd3df031472b5e24d6eea6ccef81dc5f Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 14 Jan 2025 14:36:13 +0000
Subject: [PATCH 17/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 config/m4/cuda.m4                     |  7 +++++++
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 11 ++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4
index d3a321916c5..6bed1d1d7f4 100644
--- a/config/m4/cuda.m4
+++ b/config/m4/cuda.m4
@@ -52,6 +52,13 @@ AS_IF([test "x$cuda_checked" != "xyes"],
          AS_IF([test "x$cuda_happy" = "xyes"],
                [AC_CHECK_LIB([cudart], [cudaGetDeviceCount],
                              [CUDART_LIBS="$CUDART_LIBS -lcudart"], [cuda_happy="no"])])
+         # Check optional CUDA driver library symbols and header declarations
+         AS_IF([test "x$cuda_happy" = "xyes"],
+               [AC_CHECK_LIB([cuda], [cuMemRetainAllocationHandle],
+                             [AC_DEFINE([HAVE_CUMEMRETAINALLOCATIONHANDLE], [1],
+                                        [Enable cuMemRetainAllocationHandle() usage])])
+                AC_CHECK_DECLS([CU_MEM_LOCATION_TYPE_HOST],
+                               [], [], [[#include <cuda.h>]])])
 
          # Check nvml header files
          AS_IF([test "x$cuda_happy" = "xyes"],
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index d318b2b45e4..786269e1e7e 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -345,8 +345,9 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
 #endif
 
     if (is_vmm) {
-        ucs_fatal("failed to set sync_memops on CUDA VMM without "
-                  "cuCtxSetFlags() (address=%p)", address);
+        ucs_warn("cannot set sync_memops on CUDA VMM without cuCtxSetFlags() "
+                 "(address=%p)", address);
+        return;
     }
 
     /* Synchronize for DMA for legacy memory types */
@@ -469,6 +470,7 @@ static int uct_cuda_copy_detect_vmm(void *address,
                                     ucs_memory_type_t *vmm_mem_type,
                                     CUdevice *cuda_device)
 {
+#if HAVE_CUMEMRETAINALLOCATIONHANDLE
     ucs_status_t status      = UCS_OK;
     CUmemAllocationProp prop = {};
     CUmemGenericAllocationHandle alloc_handle;
@@ -491,7 +493,7 @@ static int uct_cuda_copy_detect_vmm(void *address,
     }
 
     *cuda_device = (CUdevice)prop.location.id;
-#if CUDA_VERSION >= 12020
+#if HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST
     if ((prop.location.type == CU_MEM_LOCATION_TYPE_HOST) ||
         (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) ||
         (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) {
@@ -506,6 +508,9 @@ static int uct_cuda_copy_detect_vmm(void *address,
 err:
     UCT_CUDADRV_FUNC_LOG_DEBUG(cuMemRelease(alloc_handle));
     return 1;
+#else
+    return 0;
+#endif
 }
 
 static ucs_status_t uct_cuda_copy_mem_free(uct_md_h md, uct_mem_h memh)

From 078a6cc410ad5e6e1432b649154749e6026bcb33 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 14 Jan 2025 20:04:16 +0200
Subject: [PATCH 18/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 786269e1e7e..1fb57648f1f 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -470,7 +470,7 @@ static int uct_cuda_copy_detect_vmm(void *address,
                                     ucs_memory_type_t *vmm_mem_type,
                                     CUdevice *cuda_device)
 {
-#if HAVE_CUMEMRETAINALLOCATIONHANDLE
+#ifdef HAVE_CUMEMRETAINALLOCATIONHANDLE
     ucs_status_t status      = UCS_OK;
     CUmemAllocationProp prop = {};
     CUmemGenericAllocationHandle alloc_handle;
@@ -493,7 +493,7 @@ static int uct_cuda_copy_detect_vmm(void *address,
     }
 
     *cuda_device = (CUdevice)prop.location.id;
-#if HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST
+#ifdef HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST
     if ((prop.location.type == CU_MEM_LOCATION_TYPE_HOST) ||
         (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) ||
         (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) {

From 4ace4b1632533a0b0dd3b26b926dfdbc83e370e0 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 14 Jan 2025 20:06:42 +0200
Subject: [PATCH 19/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index 1fb57648f1f..af99da13553 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -310,7 +310,7 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
                                       const void *address, int is_vmm)
 {
     unsigned sync_memops_value = 1;
-#if HAVE_CUDA_FABRIC
+#ifdef HAVE_CUDA_FABRIC
     static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func =
         (uct_cuda_cuCtxSetFlags_t)ucs_empty_function;
     CUdriverProcAddressQueryResult sym_status;

From 0c39faaa335b256955bf585c519d2170198c9178 Mon Sep 17 00:00:00 2001
From: Thomas Vegas <tvegas@nvidia.com>
Date: Tue, 14 Jan 2025 18:50:27 +0000
Subject: [PATCH 20/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM

---
 src/uct/cuda/cuda_copy/cuda_copy_md.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index af99da13553..786269e1e7e 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -310,7 +310,7 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
                                       const void *address, int is_vmm)
 {
     unsigned sync_memops_value = 1;
-#ifdef HAVE_CUDA_FABRIC
+#if HAVE_CUDA_FABRIC
     static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func =
         (uct_cuda_cuCtxSetFlags_t)ucs_empty_function;
     CUdriverProcAddressQueryResult sym_status;
@@ -470,7 +470,7 @@ static int uct_cuda_copy_detect_vmm(void *address,
                                     ucs_memory_type_t *vmm_mem_type,
                                     CUdevice *cuda_device)
 {
-#ifdef HAVE_CUMEMRETAINALLOCATIONHANDLE
+#if HAVE_CUMEMRETAINALLOCATIONHANDLE
     ucs_status_t status      = UCS_OK;
     CUmemAllocationProp prop = {};
     CUmemGenericAllocationHandle alloc_handle;
@@ -493,7 +493,7 @@ static int uct_cuda_copy_detect_vmm(void *address,
     }
 
     *cuda_device = (CUdevice)prop.location.id;
-#ifdef HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST
+#if HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST
     if ((prop.location.type == CU_MEM_LOCATION_TYPE_HOST) ||
         (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) ||
         (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) {