Add distributed backend (XCCL) (#1105)

### Motivation: As illustrated by the design in the Intel distributed support RFC pytorch/pytorch#141741, this integrates the Intel GPU distributed backend into PyTorch via torch-xpu-ops. ### Design: USE_XCCL is set to ON by default. Users can manually set it to OFF to disable XCCL compilation. The OneCCL path is first searched in /opt/intel/oneapi/ccl/latest. If not found, it uses the CCL_ROOT flag set by the user after sourcing OneCCL. The USE_C10D_XCCL variable is intended to align with other distributed backend environment variables. The OneCCL library is linked to torch_xpu, in line with the other distributed backends. --------- Co-authored-by: Cheng, Penghui <[email protected]> Co-authored-by: Yutao Xu <[email protected]>
intel · Jan 7, 2025 · 3660d74 · 3660d74
1 parent 081596f
commit 3660d74
Show file tree

Hide file tree

Showing 9 changed files with 941 additions and 2 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -38,6 +38,12 @@ list(APPEND CMAKE_MODULE_PATH ${TORCH_XPU_OPS_ROOT}/cmake/Modules)
 include(${TORCH_XPU_OPS_ROOT}/cmake/SYCL.cmake)
 include(${TORCH_XPU_OPS_ROOT}/cmake/BuildFlags.cmake)
 
+option(USE_XCCL "Build with XCCL support" ON)
+
+if(NOT WIN32 AND USE_XCCL)
+  include(${TORCH_XPU_OPS_ROOT}/cmake/XCCL.cmake)
+endif()
+
 if(BUILD_TEST)
   add_subdirectory(${TORCH_XPU_OPS_ROOT}/test/sycl ${CMAKE_BINARY_DIR}/test_sycl)
 endif()

diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake
@@ -0,0 +1,72 @@
+# Find the oneCCL (XCCL) installation rooted at the CCL_ROOT environment variable.
+#
+# This will define the following variables:
+# XCCL_FOUND               : True if the system has the XCCL library.
+# XCCL_INCLUDE_DIR         : Include directories needed to use XCCL.
+# XCCL_LIBRARY_DIR         : The path to the XCCL library.
+# XCCL_LIBRARY             : XCCL library fullname.
+
+include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
+
+# The oneCCL environment must be sourced before configuring: CCL_ROOT is read
+# from the environment here, at configure time.
+set(XCCL_ROOT $ENV{CCL_ROOT})
+
+# Locate the `include` directory directly under the oneCCL root.
+find_file(
+  XCCL_INCLUDE_DIR
+  NAMES include
+  HINTS ${XCCL_ROOT}
+  NO_DEFAULT_PATH
+)
+
+# Locate the `oneapi` directory inside that include directory.
+find_file(
+  XCCL_INCLUDE_ONEAPI_DIR
+  NAMES oneapi
+  HINTS ${XCCL_ROOT}/include/
+  NO_DEFAULT_PATH
+)
+
+# Locate the `lib` directory directly under the oneCCL root.
+find_file(
+  XCCL_LIBRARY_DIR
+  NAMES lib
+  HINTS ${XCCL_ROOT}
+  NO_DEFAULT_PATH
+)
+
+# Find XCCL library fullname.
+find_library(
+  XCCL_LIBRARY
+  NAMES ccl
+  HINTS ${XCCL_LIBRARY_DIR}
+  NO_DEFAULT_PATH
+)
+
+# Check every lookup result on its own, before the include directories are
+# merged into one list: a list that contains a *-NOTFOUND entry is not reliably
+# false in if(), so testing the merged list could hide a failed lookup.
+if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_INCLUDE_ONEAPI_DIR)
+   OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY))
+  # Report "not found" softly (without find_package_handle_standard_args) and
+  # leave the decision to the caller via XCCL_FOUND / XCCL_NOT_FOUND_MESSAGE.
+  set(XCCL_FOUND False)
+  set(XCCL_NOT_FOUND_MESSAGE "OneCCL library not found!!")
+  return()
+endif()
+
+# Both `include` and `include/oneapi` are exported as include directories.
+list(APPEND XCCL_INCLUDE_DIR ${XCCL_INCLUDE_ONEAPI_DIR})
+list(REMOVE_DUPLICATES XCCL_INCLUDE_DIR)
+
+# Add the oneCCL directories to the search paths used by later find_* calls.
+list(APPEND CMAKE_INCLUDE_PATH "${XCCL_INCLUDE_DIR}")
+list(APPEND CMAKE_LIBRARY_PATH "${XCCL_LIBRARY_DIR}")
+
+find_package_handle_standard_args(
+  XCCL
+  FOUND_VAR XCCL_FOUND
+  REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY
+  REASON_FAILURE_MESSAGE "${XCCL_NOT_FOUND_MESSAGE}"
+)
diff --git a/cmake/XCCL.cmake b/cmake/XCCL.cmake
@@ -0,0 +1,26 @@
+# Sets up the torch::xccl imported target and enables USE_C10D_XCCL when oneCCL is found.
+if(NOT __XCCL_INCLUDED)
+  set(__XCCL_INCLUDED TRUE)
+
+  # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake.
+  # XCCL is optional: a missing oneCCL disables the backend instead of failing
+  # the configure step, so the package is not requested as REQUIRED.
+  find_package(XCCL)
+  if(NOT XCCL_FOUND)
+    message(WARNING "${XCCL_NOT_FOUND_MESSAGE} Building without XCCL support.")
+    return()
+  endif()
+
+  # Interface target carrying the oneCCL include directories and library.
+  add_library(torch::xccl INTERFACE IMPORTED)
+  set_property(
+    TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+    ${XCCL_INCLUDE_DIR})
+  set_property(
+    TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES
+    ${XCCL_LIBRARY})
+
+  # Enable the c10d XCCL backend in this scope and in the parent scope.
+  set(USE_C10D_XCCL ON)
+  set(USE_C10D_XCCL ${USE_C10D_XCCL} PARENT_SCOPE)
+endif()
diff --git a/src/BuildOnLinux.cmake b/src/BuildOnLinux.cmake
@@ -8,7 +8,13 @@ add_library(
   STATIC
   ${ATen_XPU_CPP_SRCS}
   ${ATen_XPU_NATIVE_CPP_SRCS}
-  ${ATen_XPU_GEN_SRCS})
+  ${ATen_XPU_GEN_SRCS}
+  ${ATen_XPU_XCCL_SRCS})
+
+if(USE_C10D_XCCL)
+  target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL)
+  target_link_libraries(torch_xpu_ops PUBLIC torch::xccl)
+endif()
 
 if(BUILD_SEPARATE_OPS)
   foreach(sycl_src ${ATen_XPU_SYCL_SRCS})

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -4,11 +4,14 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/Codegen.cmake)
 set(ATen_XPU_CPP_SRCS)
 set(ATen_XPU_NATIVE_CPP_SRCS)
 set(ATen_XPU_SYCL_SRCS)
+set(ATen_XPU_XCCL_SRCS)
 
 set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory")
 
 add_subdirectory(ATen)
-
+if(USE_C10D_XCCL)
+  add_subdirectory(xccl)
+endif()
 # With the increasement of bin size, we have to split libtorch_xpu.so into
 # multiple libraries. Because of strict linkage requirements on Windows,
 # we add extra logics to resolve, 1) Cyclic dependence, 2) Make symbols visible.

diff --git a/src/xccl/CMakeLists.txt b/src/xccl/CMakeLists.txt
@@ -0,0 +1,24 @@
+# XCCL sources
+
+file(GLOB xccl_h "*.hpp")
+file(GLOB xccl_cpp "*.cpp")
+
+list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp})
+
+set(ATen_XPU_XCCL_SRCS ${ATen_XPU_XCCL_SRCS} PARENT_SCOPE)
+
+# Why copy the header files to the build directory?
+# We want to register the XCCL backend with PyTorch c10d in
+# torch/csrc/distributed/c10d/init.cpp#L27-L29. To align with the other
+# backends, the headers are copied into the build tree's
+# torch/csrc/distributed/c10d directory.
+# A future solution is to add a search path for
+# torch/csrc/distributed/c10d/init.cpp#L27-L29 instead.
+#
+# configure_file(COPYONLY) is used rather than file(COPY) so that CMake re-runs
+# and refreshes the copies whenever a header changes.
+set(xccl_header_dest "${CMAKE_BINARY_DIR}/torch/csrc/distributed/c10d")
+foreach(HEADER ${xccl_h})
+  get_filename_component(xccl_header_name "${HEADER}" NAME)
+  configure_file("${HEADER}" "${xccl_header_dest}/${xccl_header_name}" COPYONLY)
+endforeach()