-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add distributed backend (XCCL) (#1105)
### Motivation: As illustrated by the design in the Intel distributed support RFC pytorch/pytorch#141741, this change integrates the Intel GPU distributed backend into PyTorch through torch-xpu-ops. ### Design: USE_XCCL is set to ON by default. Users can manually set it to OFF to disable XCCL compilation. The OneCCL path is first searched in /opt/intel/oneapi/ccl/latest. If not found, it uses the CCL_ROOT flag set by the user after sourcing OneCCL. The USE_C10D_XCCL variable is intended to align with other distributed backend environment variables. The OneCCL library is linked to torch_xpu, in line with the other distributed backends. --------- Co-authored-by: Cheng, Penghui <[email protected]> Co-authored-by: Yutao Xu <[email protected]>
- Loading branch information
1 parent
081596f
commit 3660d74
Showing
9 changed files
with
941 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# FindXCCL: locate the OneCCL (XCCL) installation rooted at the CCL_ROOT
# environment variable.
#
# This will define the following variables:
# XCCL_FOUND             : True if the system has the XCCL library.
# XCCL_INCLUDE_DIR       : Include directories needed to use XCCL
#                          (<root>/include followed by <root>/include/oneapi).
# XCCL_LIBRARY_DIR       : The path to the XCCL library.
# XCCL_LIBRARY           : XCCL library fullname.
# XCCL_NOT_FOUND_MESSAGE : Failure reason; set only when XCCL is not found.

include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)

# The OneCCL environment must be sourced before building so CCL_ROOT is set.
set(XCCL_ROOT $ENV{CCL_ROOT})

# Find the include directory under the OneCCL root.
find_file(
  XCCL_INCLUDE_DIR
  NAMES include
  HINTS ${XCCL_ROOT}
  NO_DEFAULT_PATH
)

# Find include/oneapi under the include directory. Skip the search when the
# root is empty: the hint would otherwise collapse to "/include" and probe the
# filesystem root instead of a OneCCL installation.
if(NOT "${XCCL_ROOT}" STREQUAL "")
  find_file(
    XCCL_INCLUDE_ONEAPI_DIR
    NAMES oneapi
    HINTS ${XCCL_ROOT}/include
    NO_DEFAULT_PATH
  )
endif()

# Find the library directory under the OneCCL root.
find_file(
  XCCL_LIBRARY_DIR
  NAMES lib
  HINTS ${XCCL_ROOT}
  NO_DEFAULT_PATH
)

# Find XCCL library fullname.
find_library(
  XCCL_LIBRARY
  NAMES ccl
  HINTS ${XCCL_LIBRARY_DIR}
  NO_DEFAULT_PATH
)

# Extend the include list only when both lookups succeeded, so a "-NOTFOUND"
# sentinel never ends up inside XCCL_INCLUDE_DIR. REMOVE_DUPLICATES keeps the
# list stable if this module is processed more than once in the same scope.
if(XCCL_INCLUDE_DIR AND XCCL_INCLUDE_ONEAPI_DIR)
  list(APPEND XCCL_INCLUDE_DIR ${XCCL_INCLUDE_ONEAPI_DIR})
  list(REMOVE_DUPLICATES XCCL_INCLUDE_DIR)
endif()

# Let the standard handler set XCCL_FOUND and honour QUIET/REQUIRED. The
# oneapi directory is checked as its own required variable rather than relying
# on a sentinel leaking into XCCL_INCLUDE_DIR.
find_package_handle_standard_args(
  XCCL
  FOUND_VAR XCCL_FOUND
  REQUIRED_VARS
    XCCL_INCLUDE_DIR
    XCCL_INCLUDE_ONEAPI_DIR
    XCCL_LIBRARY_DIR
    XCCL_LIBRARY
  REASON_FAILURE_MESSAGE "OneCCL library not found!!"
)

if(NOT XCCL_FOUND)
  # Expose the reason to callers that report the failure themselves.
  set(XCCL_NOT_FOUND_MESSAGE "OneCCL library not found!!")
  return()
endif()

# Make the discovered locations visible to later find_* calls.
list(APPEND CMAKE_INCLUDE_PATH ${XCCL_INCLUDE_DIR})
list(APPEND CMAKE_LIBRARY_PATH ${XCCL_LIBRARY_DIR})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Defines the torch::xccl imported interface target and enables USE_C10D_XCCL.
# The __XCCL_INCLUDED variable keeps the body from running twice in scopes
# that can see it.
if(NOT __XCCL_INCLUDED)
  set(__XCCL_INCLUDED TRUE)

  # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake.
  find_package(XCCL REQUIRED)
  if(NOT XCCL_FOUND)
    # Safety net only: with REQUIRED, find_package normally stops the
    # configure step itself when XCCL is missing.
    message("${XCCL_NOT_FOUND_MESSAGE}")
    return()
  endif()

  # The include guard above is a variable, while an imported target belongs to
  # the directory that created it. Checking for the target as well avoids a
  # duplicate add_library() error when this file is included again from a
  # scope that does not see __XCCL_INCLUDED (for example, inside a function).
  if(NOT TARGET torch::xccl)
    add_library(torch::xccl INTERFACE IMPORTED)
    set_target_properties(
      torch::xccl
      PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${XCCL_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES "${XCCL_LIBRARY}"
    )
  endif()

  # Publish the switch both in this scope and in the enclosing one.
  set(USE_C10D_XCCL ON)
  set(USE_C10D_XCCL ${USE_C10D_XCCL} PARENT_SCOPE)
endif()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# XCCL sources

# Collect the headers and translation units that sit next to this file.
file(GLOB xccl_h "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp")
file(GLOB xccl_cpp "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")

# Add the XCCL translation units to the source list and hand it to the parent.
list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp})
set(ATen_XPU_XCCL_SRCS ${ATen_XPU_XCCL_SRCS} PARENT_SCOPE)

# Why are the headers copied into the build directory?
# The XCCL backend is registered with PyTorch c10d in
# torch/csrc/distributed/c10d/init.cpp#L27-L29. To align with the other
# backends, the headers are placed in the build tree's
# torch/csrc/distributed/c10d directory. A possible follow-up is to add a
# search path for torch/csrc/distributed/c10d/init.cpp#L27-L29 instead.
foreach(xccl_header IN LISTS xccl_h)
  file(COPY ${xccl_header} DESTINATION "${CMAKE_BINARY_DIR}/torch/csrc/distributed/c10d")
endforeach()
Oops, something went wrong.