diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4
index d3a321916c5..6bed1d1d7f4 100644
--- a/config/m4/cuda.m4
+++ b/config/m4/cuda.m4
@@ -52,6 +52,13 @@ AS_IF([test "x$cuda_checked" != "xyes"],
          AS_IF([test "x$cuda_happy" = "xyes"],
                [AC_CHECK_LIB([cudart], [cudaGetDeviceCount],
                              [CUDART_LIBS="$CUDART_LIBS -lcudart"], [cuda_happy="no"])])
+         # Check optional cuda library members; neither one affects cuda_happy
+         AS_IF([test "x$cuda_happy" = "xyes"],
+               [AC_CHECK_LIB([cuda], [cuMemRetainAllocationHandle],
+                             [AC_DEFINE([HAVE_CUMEMRETAINALLOCATIONHANDLE], [1],
+                                        [Enable cuMemRetainAllocationHandle() usage])])
+                AC_CHECK_DECLS([CU_MEM_LOCATION_TYPE_HOST],
+                               [], [], [[#include <cuda.h>]])])
 
          # Check nvml header files
          AS_IF([test "x$cuda_happy" = "xyes"],
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index a185dde3779..786269e1e7e 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -304,6 +304,59 @@ uct_cuda_copy_mem_alloc_fabric(uct_cuda_copy_md_t *md,
     return UCS_ERR_NO_MEMORY;
 }
 
+typedef CUresult (*uct_cuda_cuCtxSetFlags_t)(unsigned);
+/* Enable CU_CTX_SYNC_MEMOPS on the context if possible, else per pointer */
+static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
+                                      const void *address, int is_vmm)
+{
+    unsigned sync_memops_value = 1;
+#if HAVE_CUDA_FABRIC
+    static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func =
+        (uct_cuda_cuCtxSetFlags_t)ucs_empty_function;
+    CUdriverProcAddressQueryResult sym_status;
+    CUresult cu_err;
+    ucs_status_t status;
+    /* Nothing to do once the flag was set successfully through this md */
+    if (md->sync_memops_set) {
+        return;
+    }
+    /* Look up cuCtxSetFlags once; ucs_empty_function means "not tried yet" */
+    if (cuda_cuCtxSetFlags_func ==
+        (uct_cuda_cuCtxSetFlags_t)ucs_empty_function) {
+        cu_err = cuGetProcAddress("cuCtxSetFlags",
+                                  (void**)&cuda_cuCtxSetFlags_func, 12010,
+                                  CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
+        if ((cu_err != CUDA_SUCCESS) ||
+            (sym_status != CU_GET_PROC_ADDRESS_SUCCESS)) {
+            cuda_cuCtxSetFlags_func = NULL; /* symbol not available */
+        }
+    }
+
+    if (cuda_cuCtxSetFlags_func != NULL) {
+        /* Synchronize future DMA operations for all memory types */
+        status = UCT_CUDADRV_FUNC_LOG_WARN(
+                    cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS));
+        if (status == UCS_OK) {
+            md->sync_memops_set = 1;
+        }
+        /* Returns even on failure; the fallback below is not attempted */
+        return;
+    }
+#endif
+    /* Fallback: cuCtxSetFlags not compiled in or not found at runtime */
+    if (is_vmm) {
+        ucs_warn("cannot set sync_memops on CUDA VMM without cuCtxSetFlags() "
+                 "(address=%p)", address);
+        return;
+    }
+
+    /* Synchronize for DMA for legacy memory types */
+    UCT_CUDADRV_FUNC_LOG_WARN(
+            cuPointerSetAttribute(&sync_memops_value,
+                                  CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                  (CUdeviceptr)address));
+}
+
 static ucs_status_t
 uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p,
                         ucs_memory_type_t mem_type, unsigned flags,
@@ -379,6 +432,9 @@ uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p,
     }
 
 allocated:
+    uct_cuda_copy_sync_memops(md, (void *)alloc_handle->ptr,
+                              alloc_handle->is_vmm);
+
     *memh_p    = alloc_handle;
     *address_p = (void*)alloc_handle->ptr;
     *length_p  = alloc_handle->length;
@@ -414,7 +470,7 @@ static int uct_cuda_copy_detect_vmm(void *address,
                                     ucs_memory_type_t *vmm_mem_type,
                                     CUdevice *cuda_device)
 {
-#if HAVE_CUDA_FABRIC
+#if HAVE_CUMEMRETAINALLOCATIONHANDLE
     ucs_status_t status      = UCS_OK;
     CUmemAllocationProp prop = {};
     CUmemGenericAllocationHandle alloc_handle;
@@ -437,12 +493,15 @@ static int uct_cuda_copy_detect_vmm(void *address,
     }
 
     *cuda_device = (CUdevice)prop.location.id;
+#if HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST
     if ((prop.location.type == CU_MEM_LOCATION_TYPE_HOST) ||
         (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) ||
         (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) {
         /* TODO: Marking as CUDA to allow cuda_ipc access vmm for now */
         *vmm_mem_type = UCS_MEMORY_TYPE_CUDA;
-    } else if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) {
+    } else
+#endif
+    if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) {
         *vmm_mem_type = UCS_MEMORY_TYPE_CUDA;
     }
 
@@ -512,27 +571,6 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
     return 1; /* return 1 byte to avoid division by zero */
 }
 
-static void
-uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
-{
-#if HAVE_CUDA_FABRIC
-    ucs_status_t status;
-    if (!md->sync_memops_set) {
-        /* Synchronize future DMA operations for all memory types */
-        status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
-        if (status == UCS_OK) {
-            md->sync_memops_set = 1;
-        }
-    }
-#else
-    unsigned value = 1;
-    /* Synchronize for DMA for legacy memory types*/
-    UCT_CUDADRV_FUNC_LOG_WARN(
-            cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
-                                  (CUdeviceptr)address));
-#endif
-}
-
 static ucs_status_t
 uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
                                   size_t length, ucs_memory_info_t *mem_info)
@@ -636,7 +674,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
         return UCS_ERR_NO_DEVICE;
     }
 
-    uct_cuda_copy_sync_memops(md, address);
+    uct_cuda_copy_sync_memops(md, address, is_vmm);
 
     /* Extending the registration range is disable by configuration */
     if (md->config.alloc_whole_reg == UCS_CONFIG_OFF) {