Skip to content

Commit

Permalink
Add distributed backend (XCCL) (#1105)
Browse files Browse the repository at this point in the history
### Motivation:
As described in the Intel distributed support RFC
pytorch/pytorch#141741, this change integrates the Intel GPU distributed
backend into PyTorch through torch-xpu-ops.

### Design:
USE_XCCL is set to ON by default. Users can manually set it to OFF to
disable XCCL compilation. The OneCCL path is first searched in
/opt/intel/oneapi/ccl/latest. If not found, it uses the CCL_ROOT flag
set by the user after sourcing OneCCL. The USE_C10D_XCCL variable is
intended to align with other distributed backend environment variables.
The oneCCL library is linked to torch_xpu, in line with the other distributed backends.

---------

Co-authored-by: Cheng, Penghui <[email protected]>
Co-authored-by: Yutao Xu <[email protected]>
  • Loading branch information
3 people authored Jan 7, 2025
1 parent 081596f commit 3660d74
Show file tree
Hide file tree
Showing 9 changed files with 941 additions and 2 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ list(APPEND CMAKE_MODULE_PATH ${TORCH_XPU_OPS_ROOT}/cmake/Modules)
include(${TORCH_XPU_OPS_ROOT}/cmake/SYCL.cmake)
include(${TORCH_XPU_OPS_ROOT}/cmake/BuildFlags.cmake)

# XCCL support is switched on by default; configure with -DUSE_XCCL=OFF to
# leave it out of the build.
option(USE_XCCL "Build with XCCL support" ON)

# The XCCL setup script is only pulled in for non-Windows builds.
if(USE_XCCL AND NOT WIN32)
  include(${TORCH_XPU_OPS_ROOT}/cmake/XCCL.cmake)
endif()

if(BUILD_TEST)
add_subdirectory(${TORCH_XPU_OPS_ROOT}/test/sycl ${CMAKE_BINARY_DIR}/test_sycl)
endif()
Expand Down
62 changes: 62 additions & 0 deletions cmake/Modules/FindXCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Find module for the oneCCL installation that backs XCCL.
#
# The installation prefix is read from the CCL_ROOT environment variable, so
# the oneCCL environment has to be sourced before CMake is configured.
#
# This will define the following variables:
# XCCL_FOUND : True if the system has the XCCL library.
# XCCL_INCLUDE_DIR : Include directories needed to use XCCL
#                    (<root>/include followed by <root>/include/oneapi).
# XCCL_LIBRARY_DIR : The path to the XCCL library.
# XCCL_LIBRARY : XCCL library fullname.
#
# When anything is missing, XCCL_FOUND is set to False,
# XCCL_NOT_FOUND_MESSAGE carries a short explanation, and the module returns
# before find_package_handle_standard_args is reached. Callers therefore have
# to check XCCL_FOUND themselves.

include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)

# we need source OneCCL environment before building.
set(XCCL_ROOT $ENV{CCL_ROOT})

# Derive the nested include hint only when a root is known. With an empty root
# "${XCCL_ROOT}/include/" would collapse to "/include/" and could match an
# unrelated directory on the host.
set(_xccl_include_hint)
if(NOT "${XCCL_ROOT}" STREQUAL "")
  set(_xccl_include_hint "${XCCL_ROOT}/include")
endif()

# Find include path from binary.
# find_file (not find_path) is used on purpose: the result must be the
# "include" directory itself, not the directory that contains it.
find_file(
  XCCL_INCLUDE_DIR
  NAMES include
  HINTS ${XCCL_ROOT}
  NO_DEFAULT_PATH
)

# Find include/oneapi path from include path.
find_file(
  XCCL_INCLUDE_ONEAPI_DIR
  NAMES oneapi
  HINTS ${_xccl_include_hint}
  NO_DEFAULT_PATH
)

# Find library directory from binary.
find_file(
  XCCL_LIBRARY_DIR
  NAMES lib
  HINTS ${XCCL_ROOT}
  NO_DEFAULT_PATH
)

# Find XCCL library fullname.
find_library(
  XCCL_LIBRARY
  NAMES ccl
  HINTS ${XCCL_LIBRARY_DIR}
  NO_DEFAULT_PATH
)

# Validate every lookup explicitly, before the include list is assembled, so a
# "-NOTFOUND" placeholder can never end up inside XCCL_INCLUDE_DIR.
if((NOT XCCL_INCLUDE_DIR)
   OR (NOT XCCL_INCLUDE_ONEAPI_DIR)
   OR (NOT XCCL_LIBRARY_DIR)
   OR (NOT XCCL_LIBRARY))
  set(XCCL_FOUND False)
  set(XCCL_NOT_FOUND_MESSAGE "OneCCL library not found!!")
  return()
endif()

# Both include/ and include/oneapi/ are exposed to consumers. Dropping
# duplicates keeps the list stable if this module runs again in the same scope.
list(APPEND XCCL_INCLUDE_DIR ${XCCL_INCLUDE_ONEAPI_DIR})
list(REMOVE_DUPLICATES XCCL_INCLUDE_DIR)

# Make the oneCCL directories visible to later find_* calls in this scope.
list(APPEND CMAKE_INCLUDE_PATH ${XCCL_INCLUDE_DIR})
list(APPEND CMAKE_LIBRARY_PATH ${XCCL_LIBRARY_DIR})

find_package_handle_standard_args(
  XCCL
  FOUND_VAR XCCL_FOUND
  REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY
  REASON_FAILURE_MESSAGE "${XCCL_NOT_FOUND_MESSAGE}"
)
21 changes: 21 additions & 0 deletions cmake/XCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# XCCL setup: looks up oneCCL through FindXCCL.cmake and, when it is present,
# publishes it as the imported interface target torch::xccl and turns
# USE_C10D_XCCL on in the current scope and in the parent scope.

# Include guard: nothing to do when the marker variable is already set.
if(__XCCL_INCLUDED)
  return()
endif()
set(__XCCL_INCLUDED TRUE)

# XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake.
find_package(XCCL REQUIRED)

# oneCCL is missing: print the reason and stop, leaving USE_C10D_XCCL untouched.
if(NOT XCCL_FOUND)
  message("${XCCL_NOT_FOUND_MESSAGE}")
  return()
endif()

# Wrap the discovered include directories and library in one target so that
# consumers only need to link torch::xccl.
add_library(torch::xccl INTERFACE IMPORTED)
set_property(
  TARGET torch::xccl
  PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${XCCL_INCLUDE_DIR})
set_property(
  TARGET torch::xccl
  PROPERTY INTERFACE_LINK_LIBRARIES ${XCCL_LIBRARY})

# Enable the c10d XCCL backend here and for the including parent scope.
set(USE_C10D_XCCL ON)
set(USE_C10D_XCCL ON PARENT_SCOPE)
8 changes: 7 additions & 1 deletion src/BuildOnLinux.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@ add_library(
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
${ATen_XPU_GEN_SRCS}
${ATen_XPU_XCCL_SRCS})

# With the XCCL backend enabled, torch_xpu_ops links torch::xccl publicly and
# is itself compiled with the USE_C10D_XCCL definition.
if(USE_C10D_XCCL)
  target_link_libraries(
    torch_xpu_ops
    PUBLIC torch::xccl)
  target_compile_definitions(
    torch_xpu_ops
    PRIVATE USE_C10D_XCCL)
endif()

if(BUILD_SEPARATE_OPS)
foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
Expand Down
5 changes: 4 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/Codegen.cmake)
# Reset every source list before the subdirectories are processed.
foreach(xpu_src_list IN ITEMS
    ATen_XPU_CPP_SRCS
    ATen_XPU_NATIVE_CPP_SRCS
    ATen_XPU_SYCL_SRCS
    ATen_XPU_XCCL_SRCS)
  set(${xpu_src_list})
endforeach()

# Cache entry recording the include directory of the XPU sources.
set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory")

add_subdirectory(ATen)

# The XCCL sources are only added when the c10d XCCL backend is enabled.
if(USE_C10D_XCCL)
  add_subdirectory(xccl)
endif()
# As the binary size grows, we have to split libtorch_xpu.so into
# multiple libraries. Because of strict linkage requirements on Windows,
# we add extra logic to resolve 1) cyclic dependencies and 2) symbol visibility.
Expand Down
16 changes: 16 additions & 0 deletions src/xccl/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# XCCL sources
#
# Appends the *.cpp files of this directory to ATen_XPU_XCCL_SRCS (exported to
# the parent scope) and mirrors the *.hpp files into the build tree.

file(GLOB xccl_h "*.hpp")
file(GLOB xccl_cpp "*.cpp")

list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp})

set(ATen_XPU_XCCL_SRCS ${ATen_XPU_XCCL_SRCS} PARENT_SCOPE)

# Why copy the header files to the build directory?
# We want to register the XCCL backend with PyTorch c10d in
# torch/csrc/distributed/c10d/init.cpp#L27-L29. To align with the other
# backends, the headers have to be available in the build tree under
# torch/csrc/distributed/c10d.
# A future solution is to add a search path for
# torch/csrc/distributed/c10d/init.cpp#L27-L29 instead.
#
# configure_file(... COPYONLY) is used instead of file(COPY) so that each
# header becomes a configure-time dependency: editing a header re-runs CMake
# and refreshes the copy, and an unchanged header is not rewritten.
if(xccl_h)
  set(xccl_header_dest "${CMAKE_BINARY_DIR}/torch/csrc/distributed/c10d")
  # The destination must already exist as a directory for configure_file to
  # place each output inside it under the input's file name.
  file(MAKE_DIRECTORY "${xccl_header_dest}")
  foreach(HEADER ${xccl_h})
    configure_file("${HEADER}" "${xccl_header_dest}" COPYONLY)
  endforeach()
endif()
Loading

0 comments on commit 3660d74

Please sign in to comment.