From 68a5f5170953a0d6f47661a5a4e84f99eb0b1df3 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 20 Dec 2024 09:36:46 +0000 Subject: [PATCH 01/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 60 +++++++++++++++++++-------- src/uct/cuda/cuda_copy/cuda_copy_md.h | 2 + 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index a185dde3779..fa2fde07681 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -29,6 +29,10 @@ #define UCT_CUDA_DEV_NAME_MAX_LEN 64 #define UCT_CUDA_MAX_DEVICES 32 +#define UCT_CUDA_VERSION_VMM 12030 /* for VMM: cuCtxSetFlags() >= cuda 12.1 */ +#define UCT_CUDA_MAJOR(_version) ((_version) / 1000) +#define UCT_CUDA_MINOR(_version) (((_version) % 1000) / 10) + static const char *uct_cuda_pref_loc[] = { [UCT_CUDA_PREF_LOC_CPU] = "cpu", @@ -515,22 +519,27 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) { + unsigned value = 1; + #if HAVE_CUDA_FABRIC ucs_status_t status; - if (!md->sync_memops_set) { - /* Synchronize future DMA operations for all memory types */ - status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS)); - if (status == UCS_OK) { - md->sync_memops_set = 1; + if (md->config.cuda_ctx_set_flags) { + if (!md->sync_memops_set) { + /* Synchronize future DMA operations for all memory types */ + status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS)); + if (status == UCS_OK) { + md->sync_memops_set = 1; + } } + + return; } -#else - unsigned value = 1; +#endif + /* Synchronize for DMA for legacy memory types*/ UCT_CUDADRV_FUNC_LOG_WARN( cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)address)); -#endif } static ucs_status_t @@ -830,7 +839,7 @@ uct_cuda_copy_md_open(uct_component_t *component, const 
char *md_name, uct_cuda_copy_md_config_t *config = ucs_derived_of(md_config, uct_cuda_copy_md_config_t); uct_cuda_copy_md_t *md; - int dmabuf_supported; + int dmabuf_supported, version; ucs_status_t status; md = ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t"); @@ -840,15 +849,30 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, goto err; } - md->super.ops = &md_ops; - md->super.component = &uct_cuda_copy_component; - md->config.alloc_whole_reg = config->alloc_whole_reg; - md->config.max_reg_ratio = config->max_reg_ratio; - md->config.pref_loc = config->pref_loc; - md->config.enable_fabric = config->enable_fabric; - md->config.dmabuf_supported = 0; - md->sync_memops_set = 0; - md->granularity = SIZE_MAX; + md->super.ops = &md_ops; + md->super.component = &uct_cuda_copy_component; + md->config.alloc_whole_reg = config->alloc_whole_reg; + md->config.max_reg_ratio = config->max_reg_ratio; + md->config.pref_loc = config->pref_loc; + md->config.enable_fabric = config->enable_fabric; + md->config.dmabuf_supported = 0; + md->config.cuda_ctx_set_flags = 1; + md->sync_memops_set = 0; + md->granularity = SIZE_MAX; + + if ((cuDriverGetVersion(&version) == CUDA_SUCCESS) && + (version < UCT_CUDA_VERSION_VMM)) { + if (md->config.enable_fabric != UCS_NO) { + ucs_warn("disabled fabric memory allocations as cuda driver " + "library %d.%d < %d.%d", + UCT_CUDA_MAJOR(version), UCT_CUDA_MINOR(version), + UCT_CUDA_MAJOR(UCT_CUDA_VERSION_VMM), + UCT_CUDA_MINOR(UCT_CUDA_VERSION_VMM)); + } + + md->config.enable_fabric = UCS_NO; + md->config.cuda_ctx_set_flags = 0; + } if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) && (config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED)) { diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h index e14aff739e5..0176a10801a 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.h +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h @@ -36,6 +36,8 @@ typedef struct uct_cuda_copy_md 
{ ucs_ternary_auto_value_t enable_fabric; uct_cuda_pref_loc_t pref_loc; int cuda_async_managed; + int cuda_ctx_set_flags; /* missing cuCtxSetFlags() + below CUDA 12.1 */ } config; } uct_cuda_copy_md_t; From 9fc443096ce3271029a689ae501ff45163632a4d Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 20 Dec 2024 13:53:51 +0000 Subject: [PATCH 02/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 83 +++++++++++++++++---------- src/uct/cuda/cuda_copy/cuda_copy_md.h | 2 - 2 files changed, 52 insertions(+), 33 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index fa2fde07681..8a952d64830 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -29,10 +29,6 @@ #define UCT_CUDA_DEV_NAME_MAX_LEN 64 #define UCT_CUDA_MAX_DEVICES 32 -#define UCT_CUDA_VERSION_VMM 12030 /* for VMM: cuCtxSetFlags() >= cuda 12.1 */ -#define UCT_CUDA_MAJOR(_version) ((_version) / 1000) -#define UCT_CUDA_MINOR(_version) (((_version) % 1000) / 10) - static const char *uct_cuda_pref_loc[] = { [UCT_CUDA_PREF_LOC_CPU] = "cpu", @@ -85,6 +81,10 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = { {NULL} }; +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; + +static CUresult (*ctx_set_flags_func)(unsigned); + static int uct_cuda_copy_md_is_dmabuf_supported() { int dmabuf_supported = 0; @@ -483,7 +483,6 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) { static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) { - static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static size_t total_bytes[UCT_CUDA_MAX_DEVICES]; char dev_name[UCT_CUDA_DEV_NAME_MAX_LEN]; @@ -523,10 +522,11 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) #if HAVE_CUDA_FABRIC ucs_status_t status; - if (md->config.cuda_ctx_set_flags) { + if (ctx_set_flags_func != NULL) { if (!md->sync_memops_set) { /* Synchronize future DMA 
operations for all memory types */ - status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS)); + status = UCT_CUDADRV_FUNC_LOG_ERR( + ctx_set_flags_func(CU_CTX_SYNC_MEMOPS)); if (status == UCS_OK) { md->sync_memops_set = 1; } @@ -832,6 +832,35 @@ static uct_md_ops_t md_ops = { .detect_memory_type = uct_cuda_copy_md_detect_memory_type }; +static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) +{ + static ucs_status_t status = UCS_ERR_INVALID_ADDR; + +#if CUDA_VERSION >= 12000 + CUdriverProcAddressQueryResult sym_status; + CUresult cu_err; + + if (status == UCS_ERR_INVALID_ADDR) { + pthread_mutex_lock(&lock); + cu_err = cuGetProcAddress("cuCtxSetFlags", (void**)&ctx_set_flags_func, + 12010, CU_GET_PROC_ADDRESS_DEFAULT, + &sym_status); + + if ((cu_err == CUDA_SUCCESS) && + (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { + status = UCS_OK; + } else { + ctx_set_flags_func = NULL; + status = UCS_ERR_UNSUPPORTED; + } + + pthread_mutex_unlock(&lock); + } +#endif + + return status; +} + static ucs_status_t uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, const uct_md_config_t *md_config, uct_md_h *md_p) @@ -839,7 +868,7 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, uct_cuda_copy_md_config_t *config = ucs_derived_of(md_config, uct_cuda_copy_md_config_t); uct_cuda_copy_md_t *md; - int dmabuf_supported, version; + int dmabuf_supported; ucs_status_t status; md = ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t"); @@ -849,29 +878,21 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, goto err; } - md->super.ops = &md_ops; - md->super.component = &uct_cuda_copy_component; - md->config.alloc_whole_reg = config->alloc_whole_reg; - md->config.max_reg_ratio = config->max_reg_ratio; - md->config.pref_loc = config->pref_loc; - md->config.enable_fabric = config->enable_fabric; - md->config.dmabuf_supported = 0; - md->config.cuda_ctx_set_flags = 1; - md->sync_memops_set = 0; 
- md->granularity = SIZE_MAX; - - if ((cuDriverGetVersion(&version) == CUDA_SUCCESS) && - (version < UCT_CUDA_VERSION_VMM)) { - if (md->config.enable_fabric != UCS_NO) { - ucs_warn("disabled fabric memory allocations as cuda driver " - "library %d.%d < %d.%d", - UCT_CUDA_MAJOR(version), UCT_CUDA_MINOR(version), - UCT_CUDA_MAJOR(UCT_CUDA_VERSION_VMM), - UCT_CUDA_MINOR(UCT_CUDA_VERSION_VMM)); - } - - md->config.enable_fabric = UCS_NO; - md->config.cuda_ctx_set_flags = 0; + md->super.ops = &md_ops; + md->super.component = &uct_cuda_copy_component; + md->config.alloc_whole_reg = config->alloc_whole_reg; + md->config.max_reg_ratio = config->max_reg_ratio; + md->config.pref_loc = config->pref_loc; + md->config.enable_fabric = config->enable_fabric; + md->config.dmabuf_supported = 0; + md->sync_memops_set = 0; + md->granularity = SIZE_MAX; + + status = uct_cuda_copy_md_check_is_ctx_set_flags_supported(); + if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) { + ucs_warn("disabled fabric memory allocations as cuda driver " + "library does not support cuCtxSetFlags()"); + md->config.enable_fabric = UCS_NO; } if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) && diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h index 0176a10801a..e14aff739e5 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.h +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h @@ -36,8 +36,6 @@ typedef struct uct_cuda_copy_md { ucs_ternary_auto_value_t enable_fabric; uct_cuda_pref_loc_t pref_loc; int cuda_async_managed; - int cuda_ctx_set_flags; /* missing cuCtxSetFlags() - below CUDA 12.1 */ } config; } uct_cuda_copy_md_t; From e8c9f9901d03377247a8d81cd93bf5b5cc16bc03 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 20 Dec 2024 17:30:58 +0200 Subject: [PATCH 03/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git 
a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 8a952d64830..3f77f660b4d 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -525,7 +525,7 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) if (ctx_set_flags_func != NULL) { if (!md->sync_memops_set) { /* Synchronize future DMA operations for all memory types */ - status = UCT_CUDADRV_FUNC_LOG_ERR( + status = UCT_CUDADRV_FUNC_LOG_WARN( ctx_set_flags_func(CU_CTX_SYNC_MEMOPS)); if (status == UCS_OK) { md->sync_memops_set = 1; @@ -834,7 +834,7 @@ static uct_md_ops_t md_ops = { static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) { - static ucs_status_t status = UCS_ERR_INVALID_ADDR; + static ucs_status_t status = UCS_ERR_LAST; #if CUDA_VERSION >= 12000 CUdriverProcAddressQueryResult sym_status; @@ -842,16 +842,18 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) if (status == UCS_ERR_INVALID_ADDR) { pthread_mutex_lock(&lock); - cu_err = cuGetProcAddress("cuCtxSetFlags", (void**)&ctx_set_flags_func, - 12010, CU_GET_PROC_ADDRESS_DEFAULT, - &sym_status); - - if ((cu_err == CUDA_SUCCESS) && - (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { - status = UCS_OK; - } else { - ctx_set_flags_func = NULL; - status = UCS_ERR_UNSUPPORTED; + if (status == UCS_ERR_INVALID_ADDR) { + cu_err = cuGetProcAddress("cuCtxSetFlags", + (void**)&ctx_set_flags_func, 12010, + CU_GET_PROC_ADDRESS_DEFAULT, &sym_status); + + if ((cu_err == CUDA_SUCCESS) && + (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { + status = UCS_OK; + } else { + ctx_set_flags_func = NULL; + status = UCS_ERR_UNSUPPORTED; + } } pthread_mutex_unlock(&lock); From 3b43d299b8a76b8bb7fb19f9a3f18449165ee8c4 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 20 Dec 2024 16:25:17 +0000 Subject: [PATCH 04/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 3f77f660b4d..374a97760dc 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -534,6 +534,8 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) return; } +#else + (void)ctx_set_flags_func; #endif /* Synchronize for DMA for legacy memory types*/ From 2161adf8410bd47f58c285288eeab1bc016d6c75 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 20 Dec 2024 17:16:43 +0000 Subject: [PATCH 05/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 374a97760dc..2657e4b288a 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -893,9 +893,12 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, md->granularity = SIZE_MAX; status = uct_cuda_copy_md_check_is_ctx_set_flags_supported(); - if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) { - ucs_warn("disabled fabric memory allocations as cuda driver " - "library does not support cuCtxSetFlags()"); + if (status != UCS_OK) { + if (md->config.enable_fabric == UCS_YES) { + ucs_warn("disabled fabric memory allocations as cuda driver " + "library does not support cuCtxSetFlags()"); + } + md->config.enable_fabric = UCS_NO; } From 656325344d803786d4e40bbcb60e4888db7b0c4b Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Mon, 6 Jan 2025 20:23:23 +0200 Subject: [PATCH 06/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 46 +++++++++++---------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 2657e4b288a..dadf8bbb985 100644 --- 
a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -81,9 +81,7 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = { {NULL} }; -static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; - -static CUresult (*ctx_set_flags_func)(unsigned); +static CUresult (*uct_cuda_cuCtxSetFlags_func)(unsigned); static int uct_cuda_copy_md_is_dmabuf_supported() { @@ -483,6 +481,7 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) { static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) { + static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static size_t total_bytes[UCT_CUDA_MAX_DEVICES]; char dev_name[UCT_CUDA_DEV_NAME_MAX_LEN]; @@ -518,15 +517,14 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) { - unsigned value = 1; - -#if HAVE_CUDA_FABRIC + unsigned sync_memops_value = 1; ucs_status_t status; - if (ctx_set_flags_func != NULL) { + + if (uct_cuda_cuCtxSetFlags_func != NULL) { if (!md->sync_memops_set) { /* Synchronize future DMA operations for all memory types */ status = UCT_CUDADRV_FUNC_LOG_WARN( - ctx_set_flags_func(CU_CTX_SYNC_MEMOPS)); + uct_cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS)); if (status == UCS_OK) { md->sync_memops_set = 1; } @@ -534,13 +532,11 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) return; } -#else - (void)ctx_set_flags_func; -#endif /* Synchronize for DMA for legacy memory types*/ UCT_CUDADRV_FUNC_LOG_WARN( - cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + cuPointerSetAttribute(&sync_memops_value, + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)address)); } @@ -842,23 +838,19 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) CUdriverProcAddressQueryResult sym_status; CUresult cu_err; - if (status == UCS_ERR_INVALID_ADDR) { - pthread_mutex_lock(&lock); - if (status == UCS_ERR_INVALID_ADDR) { - 
cu_err = cuGetProcAddress("cuCtxSetFlags", - (void**)&ctx_set_flags_func, 12010, - CU_GET_PROC_ADDRESS_DEFAULT, &sym_status); + if (status == UCS_ERR_LAST) { + cu_err = cuGetProcAddress("cuCtxSetFlags", + (void**)&uct_cuda_cuCtxSetFlags_func, + 12010, CU_GET_PROC_ADDRESS_DEFAULT, + &sym_status); - if ((cu_err == CUDA_SUCCESS) && - (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { - status = UCS_OK; - } else { - ctx_set_flags_func = NULL; - status = UCS_ERR_UNSUPPORTED; - } + if ((cu_err == CUDA_SUCCESS) && + (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { + status = UCS_OK; + } else { + uct_cuda_cuCtxSetFlags_func = NULL; + status = UCS_ERR_UNSUPPORTED; } - - pthread_mutex_unlock(&lock); } #endif From ff4313c9ead6dcdac11d88b1603bf98b1b001d7f Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 7 Jan 2025 09:02:46 +0000 Subject: [PATCH 07/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index dadf8bbb985..ee7532d477d 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -518,6 +518,8 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) { unsigned sync_memops_value = 1; + +#if HAVE_CUDA_FABRIC ucs_status_t status; if (uct_cuda_cuCtxSetFlags_func != NULL) { @@ -532,6 +534,9 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) return; } +#else + (void)uct_cuda_cuCtxSetFlags_func; +#endif /* Synchronize for DMA for legacy memory types*/ UCT_CUDADRV_FUNC_LOG_WARN( From 2f5e5a5f20a00176768cae27d8b3fc7efe994b2d Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 7 Jan 2025 11:23:39 +0000 Subject: [PATCH 08/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git 
a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index ee7532d477d..4c5a1766722 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -890,12 +890,16 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, md->granularity = SIZE_MAX; status = uct_cuda_copy_md_check_is_ctx_set_flags_supported(); - if (status != UCS_OK) { + if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) { if (md->config.enable_fabric == UCS_YES) { - ucs_warn("disabled fabric memory allocations as cuda driver " - "library does not support cuCtxSetFlags()"); + ucs_error("failed to enable fabric memory allocations as cuda " + "driver library does not support cuCtxSetFlags()"); + goto err_free_md; } + ucs_diag("disabled fabric memory allocations as cuda driver library " + "does not support cuCtxSetFlags()"); + md->config.enable_fabric = UCS_NO; } From f1601a384334681fe761f099c7b259cc29f95e6c Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 7 Jan 2025 15:52:02 +0000 Subject: [PATCH 09/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 31 ++++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 4c5a1766722..b771cf8be17 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -835,15 +835,15 @@ static uct_md_ops_t md_ops = { .detect_memory_type = uct_cuda_copy_md_detect_memory_type }; -static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) +static int uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) { - static ucs_status_t status = UCS_ERR_LAST; - #if CUDA_VERSION >= 12000 + static int is_supported = -1; + CUdriverProcAddressQueryResult sym_status; CUresult cu_err; - if (status == UCS_ERR_LAST) { + if (is_supported < 0) { cu_err = cuGetProcAddress("cuCtxSetFlags", 
(void**)&uct_cuda_cuCtxSetFlags_func, 12010, CU_GET_PROC_ADDRESS_DEFAULT, @@ -851,15 +851,18 @@ static ucs_status_t uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) if ((cu_err == CUDA_SUCCESS) && (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { - status = UCS_OK; + is_supported = 1; } else { + ucs_debug("cuda driver library does not support cuCtxSetFlags()"); uct_cuda_cuCtxSetFlags_func = NULL; - status = UCS_ERR_UNSUPPORTED; + is_supported = 0; } } -#endif - return status; + return is_supported; +#else + return 0; +#endif } static ucs_status_t @@ -889,17 +892,15 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, md->sync_memops_set = 0; md->granularity = SIZE_MAX; - status = uct_cuda_copy_md_check_is_ctx_set_flags_supported(); - if ((status != UCS_OK) && (md->config.enable_fabric != UCS_NO)) { + if (!uct_cuda_copy_md_check_is_ctx_set_flags_supported() && + (md->config.enable_fabric != UCS_NO)) { if (md->config.enable_fabric == UCS_YES) { - ucs_error("failed to enable fabric memory allocations as cuda " - "driver library does not support cuCtxSetFlags()"); + ucs_error("failed to enable fabric memory allocations"); + status = UCS_ERR_UNSUPPORTED; goto err_free_md; } - ucs_diag("disabled fabric memory allocations as cuda driver library " - "does not support cuCtxSetFlags()"); - + ucs_diag("disabled fabric memory allocations"); md->config.enable_fabric = UCS_NO; } From 8657d5487306916a1802e2f2d2df4253c0f5a5d1 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 10 Jan 2025 09:40:42 +0000 Subject: [PATCH 10/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 83 ++++++++++----------------- 1 file changed, 30 insertions(+), 53 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index b771cf8be17..9b03f426689 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -81,8 +81,6 @@ static ucs_config_field_t 
uct_cuda_copy_md_config_table[] = { {NULL} }; -static CUresult (*uct_cuda_cuCtxSetFlags_func)(unsigned); - static int uct_cuda_copy_md_is_dmabuf_supported() { int dmabuf_supported = 0; @@ -514,19 +512,36 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) return 1; /* return 1 byte to avoid division by zero */ } -static void -uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) +typedef CUresult (*uct_cuda_cuCtxSetFlags_t)(unsigned); + +static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, + const void *address, int is_vmm) { unsigned sync_memops_value = 1; -#if HAVE_CUDA_FABRIC +#if CUDA_VERSION >= 12000 + static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func = + (uct_cuda_cuCtxSetFlags_t)ucs_empty_function; + CUdriverProcAddressQueryResult sym_status; + CUresult cu_err; ucs_status_t status; - if (uct_cuda_cuCtxSetFlags_func != NULL) { + if (cuda_cuCtxSetFlags_func == + (uct_cuda_cuCtxSetFlags_t)ucs_empty_function) { + cu_err = cuGetProcAddress("cuCtxSetFlags", + (void**)&cuda_cuCtxSetFlags_func, 12010, + CU_GET_PROC_ADDRESS_DEFAULT, &sym_status); + if ((cu_err != CUDA_SUCCESS) || + (sym_status != CU_GET_PROC_ADDRESS_SUCCESS)) { + cuda_cuCtxSetFlags_func = NULL; + } + } + + if (cuda_cuCtxSetFlags_func != NULL) { if (!md->sync_memops_set) { /* Synchronize future DMA operations for all memory types */ status = UCT_CUDADRV_FUNC_LOG_WARN( - uct_cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS)); + cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS)); if (status == UCS_OK) { md->sync_memops_set = 1; } @@ -534,11 +549,15 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) return; } -#else - (void)uct_cuda_cuCtxSetFlags_func; #endif - /* Synchronize for DMA for legacy memory types*/ + if (is_vmm) { + ucs_fatal("failed to set sync_memops on CUDA VMM without " + "cuCtxSetFlags() (address=%p)", + address); + } + + /* Synchronize for DMA for legacy memory types */ UCT_CUDADRV_FUNC_LOG_WARN( 
cuPointerSetAttribute(&sync_memops_value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, @@ -648,7 +667,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, return UCS_ERR_NO_DEVICE; } - uct_cuda_copy_sync_memops(md, address); + uct_cuda_copy_sync_memops(md, address, is_vmm); /* Extending the registration range is disable by configuration */ if (md->config.alloc_whole_reg == UCS_CONFIG_OFF) { @@ -835,36 +854,6 @@ static uct_md_ops_t md_ops = { .detect_memory_type = uct_cuda_copy_md_detect_memory_type }; -static int uct_cuda_copy_md_check_is_ctx_set_flags_supported(void) -{ -#if CUDA_VERSION >= 12000 - static int is_supported = -1; - - CUdriverProcAddressQueryResult sym_status; - CUresult cu_err; - - if (is_supported < 0) { - cu_err = cuGetProcAddress("cuCtxSetFlags", - (void**)&uct_cuda_cuCtxSetFlags_func, - 12010, CU_GET_PROC_ADDRESS_DEFAULT, - &sym_status); - - if ((cu_err == CUDA_SUCCESS) && - (sym_status == CU_GET_PROC_ADDRESS_SUCCESS)) { - is_supported = 1; - } else { - ucs_debug("cuda driver library does not support cuCtxSetFlags()"); - uct_cuda_cuCtxSetFlags_func = NULL; - is_supported = 0; - } - } - - return is_supported; -#else - return 0; -#endif -} - static ucs_status_t uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, const uct_md_config_t *md_config, uct_md_h *md_p) @@ -892,18 +881,6 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name, md->sync_memops_set = 0; md->granularity = SIZE_MAX; - if (!uct_cuda_copy_md_check_is_ctx_set_flags_supported() && - (md->config.enable_fabric != UCS_NO)) { - if (md->config.enable_fabric == UCS_YES) { - ucs_error("failed to enable fabric memory allocations"); - status = UCS_ERR_UNSUPPORTED; - goto err_free_md; - } - - ucs_diag("disabled fabric memory allocations"); - md->config.enable_fabric = UCS_NO; - } - if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) && (config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED)) { ucs_warn("wrong memory type 
for async memory allocations: \"%s\";" From 0c27f3179144cb8003629f224cda200dcd8a66b0 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Fri, 10 Jan 2025 11:09:37 +0000 Subject: [PATCH 11/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 9b03f426689..b8892b2f0d8 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -519,7 +519,7 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, { unsigned sync_memops_value = 1; -#if CUDA_VERSION >= 12000 +#if HAVE_CUDA_FABRIC && (CUDA_VERSION >= 12000) static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func = (uct_cuda_cuCtxSetFlags_t)ucs_empty_function; CUdriverProcAddressQueryResult sym_status; From eb0d1fc84c1433b0956008cd31be181c2f319b1e Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Mon, 13 Jan 2025 08:59:59 +0000 Subject: [PATCH 12/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index b8892b2f0d8..9ed47bd3e7d 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -518,13 +518,16 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address, int is_vmm) { unsigned sync_memops_value = 1; - -#if HAVE_CUDA_FABRIC && (CUDA_VERSION >= 12000) - static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func = - (uct_cuda_cuCtxSetFlags_t)ucs_empty_function; +#if HAVE_CUDA_FABRIC CUdriverProcAddressQueryResult sym_status; CUresult cu_err; ucs_status_t status; + uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func = + (uct_cuda_cuCtxSetFlags_t)ucs_empty_function; + + if (md->sync_memops_set) { + return; + } if 
(cuda_cuCtxSetFlags_func == (uct_cuda_cuCtxSetFlags_t)ucs_empty_function) { @@ -538,13 +541,11 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, } if (cuda_cuCtxSetFlags_func != NULL) { - if (!md->sync_memops_set) { - /* Synchronize future DMA operations for all memory types */ - status = UCT_CUDADRV_FUNC_LOG_WARN( + /* Synchronize future DMA operations for all memory types */ + status = UCT_CUDADRV_FUNC_LOG_WARN( cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS)); - if (status == UCS_OK) { - md->sync_memops_set = 1; - } + if (status == UCS_OK) { + md->sync_memops_set = 1; } return; @@ -553,8 +554,7 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, if (is_vmm) { ucs_fatal("failed to set sync_memops on CUDA VMM without " - "cuCtxSetFlags() (address=%p)", - address); + "cuCtxSetFlags() (address=%p)", address); } /* Synchronize for DMA for legacy memory types */ From fe0370b82a6c3ec680500c0882d5ad639762f539 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Mon, 13 Jan 2025 12:37:39 +0000 Subject: [PATCH 13/20] UCT/IB/EFA/SRD: Initial interface add --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 107 +++++++++++++------------- 1 file changed, 55 insertions(+), 52 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 9ed47bd3e7d..9232bd27fa7 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -304,6 +304,58 @@ uct_cuda_copy_mem_alloc_fabric(uct_cuda_copy_md_t *md, return UCS_ERR_NO_MEMORY; } +typedef CUresult (*uct_cuda_cuCtxSetFlags_t)(unsigned); + +static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, + const void *address, int is_vmm) +{ + unsigned sync_memops_value = 1; +#if HAVE_CUDA_FABRIC + static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func = + (uct_cuda_cuCtxSetFlags_t)ucs_empty_function; + CUdriverProcAddressQueryResult sym_status; + CUresult cu_err; + ucs_status_t status; + + if (md->sync_memops_set) { + return; + } + + if 
(cuda_cuCtxSetFlags_func == + (uct_cuda_cuCtxSetFlags_t)ucs_empty_function) { + cu_err = cuGetProcAddress("cuCtxSetFlags", + (void**)&cuda_cuCtxSetFlags_func, 12010, + CU_GET_PROC_ADDRESS_DEFAULT, &sym_status); + if ((cu_err != CUDA_SUCCESS) || + (sym_status != CU_GET_PROC_ADDRESS_SUCCESS)) { + cuda_cuCtxSetFlags_func = NULL; + } + } + + if (cuda_cuCtxSetFlags_func != NULL) { + /* Synchronize future DMA operations for all memory types */ + status = UCT_CUDADRV_FUNC_LOG_WARN( + cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS)); + if (status == UCS_OK) { + md->sync_memops_set = 1; + } + + return; + } + + if (is_vmm) { + ucs_fatal("failed to set sync_memops on CUDA VMM without " + "cuCtxSetFlags() (address=%p)", address); + } +#endif + + /* Synchronize for DMA for legacy memory types */ + UCT_CUDADRV_FUNC_LOG_WARN( + cuPointerSetAttribute(&sync_memops_value, + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)address)); +} + static ucs_status_t uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p, ucs_memory_type_t mem_type, unsigned flags, @@ -379,6 +431,9 @@ uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p, } allocated: + uct_cuda_copy_sync_memops(md, (void *)alloc_handle->ptr, + alloc_handle->is_vmm); + *memh_p = alloc_handle; *address_p = (void*)alloc_handle->ptr; *length_p = alloc_handle->length; @@ -512,58 +567,6 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) return 1; /* return 1 byte to avoid division by zero */ } -typedef CUresult (*uct_cuda_cuCtxSetFlags_t)(unsigned); - -static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, - const void *address, int is_vmm) -{ - unsigned sync_memops_value = 1; -#if HAVE_CUDA_FABRIC - CUdriverProcAddressQueryResult sym_status; - CUresult cu_err; - ucs_status_t status; - uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func = - (uct_cuda_cuCtxSetFlags_t)ucs_empty_function; - - if (md->sync_memops_set) { - return; - } - - if (cuda_cuCtxSetFlags_func 
== - (uct_cuda_cuCtxSetFlags_t)ucs_empty_function) { - cu_err = cuGetProcAddress("cuCtxSetFlags", - (void**)&cuda_cuCtxSetFlags_func, 12010, - CU_GET_PROC_ADDRESS_DEFAULT, &sym_status); - if ((cu_err != CUDA_SUCCESS) || - (sym_status != CU_GET_PROC_ADDRESS_SUCCESS)) { - cuda_cuCtxSetFlags_func = NULL; - } - } - - if (cuda_cuCtxSetFlags_func != NULL) { - /* Synchronize future DMA operations for all memory types */ - status = UCT_CUDADRV_FUNC_LOG_WARN( - cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS)); - if (status == UCS_OK) { - md->sync_memops_set = 1; - } - - return; - } -#endif - - if (is_vmm) { - ucs_fatal("failed to set sync_memops on CUDA VMM without " - "cuCtxSetFlags() (address=%p)", address); - } - - /* Synchronize for DMA for legacy memory types */ - UCT_CUDADRV_FUNC_LOG_WARN( - cuPointerSetAttribute(&sync_memops_value, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr)address)); -} - static ucs_status_t uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, size_t length, ucs_memory_info_t *mem_info) From ab3d0c740e98d0a5d9641b580c5a8d7cf6b7dda2 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Mon, 13 Jan 2025 12:40:11 +0000 Subject: [PATCH 14/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 9232bd27fa7..05aa7f15ff0 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -344,8 +344,8 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, } if (is_vmm) { - ucs_fatal("failed to set sync_memops on CUDA VMM without " - "cuCtxSetFlags() (address=%p)", address); + ucs_warn("failed to set sync_memops on CUDA VMM without " + "cuCtxSetFlags() (address=%p)", address); } #endif From a0004c48dcc164204048ce4e643710e9030cab98 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Mon, 13 Jan 2025 16:59:05 
+0000 Subject: [PATCH 15/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 05aa7f15ff0..7d342816caa 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -342,12 +342,12 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, return; } +#endif if (is_vmm) { - ucs_warn("failed to set sync_memops on CUDA VMM without " - "cuCtxSetFlags() (address=%p)", address); + ucs_fatal("failed to set sync_memops on CUDA VMM without " + "cuCtxSetFlags() (address=%p)", address); } -#endif /* Synchronize for DMA for legacy memory types */ UCT_CUDADRV_FUNC_LOG_WARN( @@ -469,7 +469,6 @@ static int uct_cuda_copy_detect_vmm(void *address, ucs_memory_type_t *vmm_mem_type, CUdevice *cuda_device) { -#if HAVE_CUDA_FABRIC ucs_status_t status = UCS_OK; CUmemAllocationProp prop = {}; CUmemGenericAllocationHandle alloc_handle; @@ -504,9 +503,6 @@ static int uct_cuda_copy_detect_vmm(void *address, err: UCT_CUDADRV_FUNC_LOG_DEBUG(cuMemRelease(alloc_handle)); return 1; -#else - return 0; -#endif } static ucs_status_t uct_cuda_copy_mem_free(uct_md_h md, uct_mem_h memh) From 81d47f0d391eb29bab577587fd52f14dec681a7a Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 14 Jan 2025 08:18:42 +0000 Subject: [PATCH 16/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 7d342816caa..d318b2b45e4 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -491,12 +491,15 @@ static int uct_cuda_copy_detect_vmm(void *address, } *cuda_device = (CUdevice)prop.location.id; +#if CUDA_VERSION >= 12020 if ((prop.location.type == 
CU_MEM_LOCATION_TYPE_HOST) || (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) || (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) { /* TODO: Marking as CUDA to allow cuda_ipc access vmm for now */ *vmm_mem_type = UCS_MEMORY_TYPE_CUDA; - } else if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) { + } else +#endif + if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) { *vmm_mem_type = UCS_MEMORY_TYPE_CUDA; } From edc00284dd3df031472b5e24d6eea6ccef81dc5f Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 14 Jan 2025 14:36:13 +0000 Subject: [PATCH 17/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- config/m4/cuda.m4 | 7 +++++++ src/uct/cuda/cuda_copy/cuda_copy_md.c | 11 ++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4 index d3a321916c5..6bed1d1d7f4 100644 --- a/config/m4/cuda.m4 +++ b/config/m4/cuda.m4 @@ -52,6 +52,13 @@ AS_IF([test "x$cuda_checked" != "xyes"], AS_IF([test "x$cuda_happy" = "xyes"], [AC_CHECK_LIB([cudart], [cudaGetDeviceCount], [CUDART_LIBS="$CUDART_LIBS -lcudart"], [cuda_happy="no"])]) + # Check optional cuda library members + AS_IF([test "x$cuda_happy" = "xyes"], + [AC_CHECK_LIB([cuda], [cuMemRetainAllocationHandle], + [AC_DEFINE([HAVE_CUMEMRETAINALLOCATIONHANDLE], [1], + [Enable cuMemRetainAllocationHandle() usage])]), + AC_CHECK_DECLS([CU_MEM_LOCATION_TYPE_HOST], + [], [], [[#include <cuda.h>]])]) # Check nvml header files AS_IF([test "x$cuda_happy" = "xyes"], diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index d318b2b45e4..786269e1e7e 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -345,8 +345,9 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, #endif if (is_vmm) { - ucs_fatal("failed to set sync_memops on CUDA VMM without " - "cuCtxSetFlags() (address=%p)", address); + ucs_warn("cannot set sync_memops on CUDA VMM without cuCtxSetFlags() " +
"(address=%p)", address); + return; } /* Synchronize for DMA for legacy memory types */ @@ -469,6 +470,7 @@ static int uct_cuda_copy_detect_vmm(void *address, ucs_memory_type_t *vmm_mem_type, CUdevice *cuda_device) { +#if HAVE_CUMEMRETAINALLOCATIONHANDLE ucs_status_t status = UCS_OK; CUmemAllocationProp prop = {}; CUmemGenericAllocationHandle alloc_handle; @@ -491,7 +493,7 @@ static int uct_cuda_copy_detect_vmm(void *address, } *cuda_device = (CUdevice)prop.location.id; -#if CUDA_VERSION >= 12020 +#if HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST if ((prop.location.type == CU_MEM_LOCATION_TYPE_HOST) || (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) || (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) { @@ -506,6 +508,9 @@ static int uct_cuda_copy_detect_vmm(void *address, err: UCT_CUDADRV_FUNC_LOG_DEBUG(cuMemRelease(alloc_handle)); return 1; +#else + return 0; +#endif } static ucs_status_t uct_cuda_copy_mem_free(uct_md_h md, uct_mem_h memh) From 078a6cc410ad5e6e1432b649154749e6026bcb33 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 14 Jan 2025 20:04:16 +0200 Subject: [PATCH 18/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 786269e1e7e..1fb57648f1f 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -470,7 +470,7 @@ static int uct_cuda_copy_detect_vmm(void *address, ucs_memory_type_t *vmm_mem_type, CUdevice *cuda_device) { -#if HAVE_CUMEMRETAINALLOCATIONHANDLE +#ifdef HAVE_CUMEMRETAINALLOCATIONHANDLE ucs_status_t status = UCS_OK; CUmemAllocationProp prop = {}; CUmemGenericAllocationHandle alloc_handle; @@ -493,7 +493,7 @@ static int uct_cuda_copy_detect_vmm(void *address, } *cuda_device = (CUdevice)prop.location.id; -#if HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST +#ifdef HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST if 
((prop.location.type == CU_MEM_LOCATION_TYPE_HOST) || (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) || (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) { From 4ace4b1632533a0b0dd3b26b926dfdbc83e370e0 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 14 Jan 2025 20:06:42 +0200 Subject: [PATCH 19/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 1fb57648f1f..af99da13553 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -310,7 +310,7 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address, int is_vmm) { unsigned sync_memops_value = 1; -#if HAVE_CUDA_FABRIC +#ifdef HAVE_CUDA_FABRIC static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func = (uct_cuda_cuCtxSetFlags_t)ucs_empty_function; CUdriverProcAddressQueryResult sym_status; From 0c39faaa335b256955bf585c519d2170198c9178 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 14 Jan 2025 18:50:27 +0000 Subject: [PATCH 20/20] UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index af99da13553..786269e1e7e 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -310,7 +310,7 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address, int is_vmm) { unsigned sync_memops_value = 1; -#ifdef HAVE_CUDA_FABRIC +#if HAVE_CUDA_FABRIC static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func = (uct_cuda_cuCtxSetFlags_t)ucs_empty_function; CUdriverProcAddressQueryResult sym_status; @@ -470,7 +470,7 @@ static int uct_cuda_copy_detect_vmm(void *address, ucs_memory_type_t *vmm_mem_type, CUdevice 
*cuda_device) { -#ifdef HAVE_CUMEMRETAINALLOCATIONHANDLE +#if HAVE_CUMEMRETAINALLOCATIONHANDLE ucs_status_t status = UCS_OK; CUmemAllocationProp prop = {}; CUmemGenericAllocationHandle alloc_handle; @@ -493,7 +493,7 @@ static int uct_cuda_copy_detect_vmm(void *address, } *cuda_device = (CUdevice)prop.location.id; -#ifdef HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST +#if HAVE_DECL_CU_MEM_LOCATION_TYPE_HOST if ((prop.location.type == CU_MEM_LOCATION_TYPE_HOST) || (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) || (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT)) {