-
Notifications
You must be signed in to change notification settings - Fork 432
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM #10396
base: master
Are you sure you want to change the base?
Changes from 14 commits
68a5f51
9fc4430
e8c9f99
3b43d29
2161adf
6563253
ff4313c
2f5e5a5
f1601a3
8657d54
0c27f31
eb0d1fc
fe0370b
ab3d0c7
a0004c4
81d47f0
edc0028
078a6cc
4ace4b1
0c39faa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -304,6 +304,58 @@ uct_cuda_copy_mem_alloc_fabric(uct_cuda_copy_md_t *md, | |
return UCS_ERR_NO_MEMORY; | ||
} | ||
|
||
typedef CUresult (*uct_cuda_cuCtxSetFlags_t)(unsigned); | ||
|
||
static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, | ||
const void *address, int is_vmm) | ||
{ | ||
unsigned sync_memops_value = 1; | ||
#if HAVE_CUDA_FABRIC | ||
static uct_cuda_cuCtxSetFlags_t cuda_cuCtxSetFlags_func = | ||
(uct_cuda_cuCtxSetFlags_t)ucs_empty_function; | ||
CUdriverProcAddressQueryResult sym_status; | ||
CUresult cu_err; | ||
ucs_status_t status; | ||
|
||
if (md->sync_memops_set) { | ||
return; | ||
} | ||
|
||
if (cuda_cuCtxSetFlags_func == | ||
(uct_cuda_cuCtxSetFlags_t)ucs_empty_function) { | ||
cu_err = cuGetProcAddress("cuCtxSetFlags", | ||
(void**)&cuda_cuCtxSetFlags_func, 12010, | ||
CU_GET_PROC_ADDRESS_DEFAULT, &sym_status); | ||
if ((cu_err != CUDA_SUCCESS) || | ||
(sym_status != CU_GET_PROC_ADDRESS_SUCCESS)) { | ||
cuda_cuCtxSetFlags_func = NULL; | ||
} | ||
} | ||
|
||
if (cuda_cuCtxSetFlags_func != NULL) { | ||
/* Synchronize future DMA operations for all memory types */ | ||
status = UCT_CUDADRV_FUNC_LOG_WARN( | ||
cuda_cuCtxSetFlags_func(CU_CTX_SYNC_MEMOPS)); | ||
if (status == UCS_OK) { | ||
md->sync_memops_set = 1; | ||
} | ||
|
||
return; | ||
} | ||
|
||
if (is_vmm) { | ||
ucs_warn("failed to set sync_memops on CUDA VMM without " | ||
"cuCtxSetFlags() (address=%p)", address); | ||
} | ||
#endif | ||
|
||
/* Synchronize for DMA for legacy memory types */ | ||
UCT_CUDADRV_FUNC_LOG_WARN( | ||
cuPointerSetAttribute(&sync_memops_value, | ||
CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, | ||
(CUdeviceptr)address)); | ||
} | ||
|
||
static ucs_status_t | ||
uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p, | ||
ucs_memory_type_t mem_type, unsigned flags, | ||
|
@@ -379,6 +431,9 @@ uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p, | |
} | ||
|
||
allocated: | ||
uct_cuda_copy_sync_memops(md, (void *)alloc_handle->ptr, | ||
yosefe marked this conversation as resolved.
Show resolved
Hide resolved
|
||
alloc_handle->is_vmm); | ||
|
||
*memh_p = alloc_handle; | ||
*address_p = (void*)alloc_handle->ptr; | ||
*length_p = alloc_handle->length; | ||
|
@@ -512,27 +567,6 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) | |
return 1; /* return 1 byte to avoid division by zero */ | ||
} | ||
|
||
static void | ||
uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) | ||
{ | ||
#if HAVE_CUDA_FABRIC | ||
ucs_status_t status; | ||
if (!md->sync_memops_set) { | ||
/* Synchronize future DMA operations for all memory types */ | ||
status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS)); | ||
if (status == UCS_OK) { | ||
md->sync_memops_set = 1; | ||
} | ||
} | ||
#else | ||
unsigned value = 1; | ||
/* Synchronize for DMA for legacy memory types*/ | ||
UCT_CUDADRV_FUNC_LOG_WARN( | ||
cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, | ||
(CUdeviceptr)address)); | ||
#endif | ||
} | ||
|
||
static ucs_status_t | ||
uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, | ||
size_t length, ucs_memory_info_t *mem_info) | ||
|
@@ -636,7 +670,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, | |
return UCS_ERR_NO_DEVICE; | ||
} | ||
|
||
uct_cuda_copy_sync_memops(md, address); | ||
uct_cuda_copy_sync_memops(md, address, is_vmm); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we call it also from cuda allocate flow? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this could end-up calling cuPointerSetAttribute twice when set flags function is not available There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where would be the 2nd time? AFAIK we don't call pointer query on allocated memory (such as rndv fragments) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
|
||
/* Extending the registration range is disable by configuration */ | ||
if (md->config.alloc_whole_reg == UCS_CONFIG_OFF) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tvegas1 Current changes look good to me but @yosefe brought up an issue where library is built with >=12.3 compatible driver version but the system where that library gets used has driver version < 12.1. On such a system, VMM/Mallocasync allocations are allowed (as VMM and MallocAsync is supported on driver versions < 12.1). But there would be a need to report an error or fail even if UCX isn't compiled with HAVE_CUDA_FABRIC (driver version >= 12.3). The condition met here is when UCX is built with >=12.3 driver.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
agree, that's where we have VMM independently allocated, but still we don't have
HAVE_FABRIC
set, in this case I will moveis_vmm
out of the #ifdef and fatal ifis_vmm == 1
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i don't think it will help - if HAVE_FABRIC is not set we will never know in UCX it is VMM memory and assume it is legacy memory. Then we can only hope that cuPointerSetAttribute would fail.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes also enabled detect vmm because:
assuming we built UCX with cuda >= 11 anyways
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let me know if you read it differently: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
double checked actual function prototype with cuda older release online documentation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CI failure on cuda11.0, seems although
cuMemRetainAllocationHandle
is found in headers in /usr/local/cuda-11/include/, it is not availalbe in the link time cuda stub library.