From c0e1dc1c864840da352de3fac11b6fd717979479 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 29 Aug 2024 09:28:58 +0000 Subject: [PATCH 01/96] Happy Init --- CMakeLists.txt | 6 + caffe2/CMakeLists.txt | 11 + caffe2/core/macros.h.in | 1 + cmake/Dependencies.cmake | 16 + cmake/External/xccl.cmake | 13 + cmake/Modules/FindXCCL.cmake | 68 ++ cmake/Summary.cmake | 5 + setup.py | 4 + torch/CMakeLists.txt | 7 + .../distributed/c10d/ProcessGroupXCCL.cpp | 356 +++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 140 +++ torch/csrc/distributed/c10d/XCCLUtils.hpp | 334 +++++++ torch/csrc/xpu/xccl.cpp | 923 ++++++++++++++++++ torch/csrc/xpu/xccl.h | 112 +++ 14 files changed, 1996 insertions(+) create mode 100644 cmake/External/xccl.cmake create mode 100644 cmake/Modules/FindXCCL.cmake create mode 100644 torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp create mode 100644 torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp create mode 100644 torch/csrc/distributed/c10d/XCCLUtils.hpp create mode 100644 torch/csrc/xpu/xccl.cpp create mode 100644 torch/csrc/xpu/xccl.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5139c0a478e788..89ef59681bfff4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -275,6 +275,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) cmake_dependent_option(USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_XCCL "Use XCCL" ON + "USE_XPU;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" @@ -353,6 +355,8 @@ cmake_dependent_option(USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option(USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) +cmake_dependent_option(USE_C10D_XCCL "USE C10D XCCL" ON + "USE_DISTRIBUTED;USE_XCCL" OFF) cmake_dependent_option(USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( @@ -365,6 +369,8 @@ cmake_dependent_option( USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option( USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) +cmake_dependent_option( + USE_C10D_XCCL "USE C10D XCCL" ON "USE_DISTRIBUTED;USE_XCCL" OFF) cmake_dependent_option( USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 8ed93cdff0479c..2c4da5fd50f10c 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1057,6 +1057,10 @@ if(USE_XPU) # 2. Using add_custom_command in torch-xpu-ops to define sycl device sources # compilation. add_custom_command requires an explicit dependency. 
     list(APPEND ${Caffe2_XPU_INCLUDE} ${TORCH_XPU_OPS_DIR}/src/ATen/)
+    # if(USE_XCCL)
+    #   list(APPEND Caffe2_GPU_SRCS
+    #     ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp)
+    # endif()
     set(TORCH_XPU_OPS_PYTORCH_DEPS ATEN_CPU_FILES_GEN_TARGET)
     add_subdirectory(${TORCH_ROOT}/third_party/torch-xpu-ops
@@ -1065,6 +1069,10 @@ if(USE_XPU)
       message(WARNING "Failed to include ATen XPU implementation target")
     else()
       target_link_libraries(torch_xpu PRIVATE torch_xpu_ops)
+      if(USE_XCCL)
+        target_link_libraries(torch_xpu PRIVATE __caffe2_xccl)
+        target_compile_definitions(torch_xpu PRIVATE USE_XCCL)
+      endif()
       if(MSVC)
         # Windows
         target_link_libraries(torch_xpu PRIVATE
@@ -1365,6 +1373,9 @@ if(USE_DISTRIBUTED)
       target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
     endif()
   endif()
+  if(USE_C10D_XCCL)
+    target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL)
+  endif()
   if(USE_MPI AND USE_C10D_MPI)
     if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
       set_source_files_properties(
diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in
index 2929f105b31faa..e5398a83cad947 100644
--- a/caffe2/core/macros.h.in
+++ b/caffe2/core/macros.h.in
@@ -45,6 +45,7 @@
   {"USE_CUDNN", "${USE_CUDNN}"}, \
   {"CUDNN_VERSION", "${CUDNN_VERSION}"}, \
   {"USE_NCCL", "${USE_NCCL}"}, \
+  {"USE_XCCL", "${USE_XCCL}"}, \
   {"USE_MPI", "${USE_MPI}"}, \
   {"USE_GFLAGS", "${USE_GFLAGS}"}, \
   {"USE_GLOG", "${USE_GLOG}"}, \
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index ef33a3165340c1..49fb525afbf8a8 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1150,6 +1150,22 @@ if(USE_CUDA)
   include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
 endif()
 
+# ---[ XCCL
+if(USE_XCCL)
+  if(NOT USE_XPU)
+    message(WARNING
+      "Not using XPU, so disabling USE_XCCL. Suppress this warning with "
+      "-DUSE_XCCL=OFF.")
+    caffe2_update_option(USE_XCCL OFF)
+  elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    message(WARNING "USE_XCCL is currently only supported under Linux.")
+    caffe2_update_option(USE_XCCL OFF)
+  else()
+    include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake)
+    list(APPEND Caffe2_XPU_DEPENDENCY_LIBS __caffe2_xccl)
+  endif()
+endif()
+
 if(USE_DISTRIBUTED AND USE_TENSORPIPE)
   if(MSVC)
     message(WARNING "Tensorpipe cannot be used on Windows.")
diff --git a/cmake/External/xccl.cmake b/cmake/External/xccl.cmake
new file mode 100644
index 00000000000000..d1e8f33881b80b
--- /dev/null
+++ b/cmake/External/xccl.cmake
@@ -0,0 +1,13 @@
+if(NOT __XCCL_INCLUDED)
+  set(__XCCL_INCLUDED TRUE)
+
+  if(USE_XCCL)
+    # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake.
+    find_package(XCCL REQUIRED)
+    if(XCCL_FOUND)
+      add_library(__caffe2_xccl INTERFACE)
+      target_link_libraries(__caffe2_xccl INTERFACE ${XCCL_LIBRARY})
+      target_include_directories(__caffe2_xccl INTERFACE ${XCCL_INCLUDE_DIR})
+    endif()
+  endif()
+endif()
diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake
new file mode 100644
index 00000000000000..3f30e8cd23d6e7
--- /dev/null
+++ b/cmake/Modules/FindXCCL.cmake
@@ -0,0 +1,68 @@
+# This will define the following variables:
+# XCCL_FOUND : True if the system has the XCCL library.
+# XCCL_INCLUDE_DIR : Include directories needed to use XCCL.
+# XCCL_LIBRARY_DIR : The path to the XCCL library.
+# XCCL_LIBRARY : XCCL library full name.
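+#
+# A minimal usage sketch (illustration only; it assumes the oneAPI environment
+# script has been sourced so that the CCL_ROOT environment variable points at
+# the oneCCL installation, and "my_target" is a placeholder target name):
+#
+#   find_package(XCCL REQUIRED)
+#   target_include_directories(my_target PRIVATE ${XCCL_INCLUDE_DIR})
+#   target_link_libraries(my_target PRIVATE ${XCCL_LIBRARY})
+#
+# Only CCL_ROOT is consulted; default system paths are deliberately excluded
+# via NO_DEFAULT_PATH below.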
+ +include(FindPackageHandleStandardArgs) + +set(XCCL_ROOT "") +if(DEFINED ENV{CCL_ROOT}) + set(XCCL_ROOT $ENV{CCL_ROOT}) +endif() + +string(COMPARE EQUAL "${XCCL_ROOT}" "" nosyclfound) +if(nosyclfound) + set(XCCL_FOUND False) + set(XCCL_REASON_FAILURE "XCCL library not set!!") + set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}") + return() +endif() + +# Find include path from binary. +find_file( + XCCL_INCLUDE_DIR + NAMES include + HINTS ${XCCL_ROOT} + NO_DEFAULT_PATH +) + +# Find include/sycl path from include path. +find_file( + XCCL_INCLUDE_ONEAPI_DIR + NAMES oneapi + HINTS ${XCCL_ROOT}/include/ + NO_DEFAULT_PATH +) + +list(APPEND XCCL_INCLUDE_DIR ${XCCL_INCLUDE_ONEAPI_DIR}) + +# Find library directory from binary. +find_file( + XCCL_LIBRARY_DIR + NAMES lib + HINTS ${XCCL_ROOT} + NO_DEFAULT_PATH +) + +# Find XCCL library fullname. +find_library( + XCCL_LIBRARY + NAMES ccl + HINTS ${XCCL_LIBRARY_DIR} + NO_DEFAULT_PATH +) + +if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY)) + set(XCCL_FOUND False) + set(XCCL_REASON_FAILURE "XCCL library is incomplete!!") + set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}") + return() +endif() + +find_package_handle_standard_args( + XCCL + FOUND_VAR XCCL_FOUND + REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY + REASON_FAILURE_MESSAGE "${XCCL_REASON_FAILURE}" +) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index d51c451589c2c4..0b601cf2a6a329 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -153,6 +153,11 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}") endif() message(STATUS " USE_ITT : ${USE_ITT}") + message(STATUS " USE_XCCL : ${USE_XCCL}") + if(${USE_XCCL}) + message(STATUS " XCCL include path : ${XCCL_INCLUDE_DIR}") + message(STATUS " XCCL library : ${XCCL_LIBRARY}") + endif() message(STATUS " USE_NCCL : ${USE_NCCL}") if(${USE_NCCL}) message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}") diff --git a/setup.py b/setup.py index 92f1e2ddc7bcd3..e6191c0616db4a 100644 --- a/setup.py +++ b/setup.py @@ -645,6 +645,10 @@ def run(self): report("-- Building NCCL library") else: report("-- Not using NCCL") + if cmake_cache_vars["USE_XCCL"]: + report("-- Building XCCL library") + else: + report("-- Not using XCCL") if cmake_cache_vars["USE_DISTRIBUTED"]: if IS_WINDOWS: report("-- Building without distributed package") diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index bb949a081c95e9..8ab7d7aeb095b6 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -282,6 +282,9 @@ if(USE_DISTRIBUTED) if(USE_NCCL) list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) endif() + if(USE_XCCL) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_xccl) + endif() # Same for MPI. 
if(USE_MPI) list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX) @@ -345,6 +348,10 @@ if(BUILD_LIBTORCHLESS) target_compile_definitions(torch_python PRIVATE USE_C10D_NCCL) endif() + if(USE_XPU AND USE_C10D_XCCL) + target_compile_definitions(torch_python PRIVATE USE_C10D_XCCL) + endif() + if(USE_DISTRIBUTED) target_compile_definitions(torch_python PRIVATE USE_DISTRIBUTED) endif() diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp new file mode 100644 index 00000000000000..9466a0c091c99c --- /dev/null +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -0,0 +1,356 @@ +#include +#include +#include +#include + +#ifdef USE_C10D_XCCL +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10d { + +namespace { +std::map xcclOps = + { + {ReduceOp::MIN, ccl::reduction::min}, + {ReduceOp::MAX, ccl::reduction::max}, + {ReduceOp::SUM, ccl::reduction::sum}, + {ReduceOp::PRODUCT, ccl::reduction::prod}, + }; + +std::map xcclDatatypes = + { + {at::kByte, ccl::datatype::uint8}, + {at::kChar, ccl::datatype::int8}, + {at::kShort, ccl::datatype::int16}, + {at::kInt, ccl::datatype::int32}, + {at::kLong, ccl::datatype::int64}, + {at::kHalf, ccl::datatype::float16}, + {at::kFloat, ccl::datatype::float32}, + {at::kDouble, ccl::datatype::float64}, + {at::kBFloat16, ccl::datatype::bfloat16}, + {at::kBool, ccl::datatype::uint8}, + }; + +void check_gpu_single_tensor( + const at::Tensor& tensor +) { + if (!tensor.is_xpu() || tensor.is_sparse()) { + C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + } + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + } + } +} + +} // namespace + +namespace { + +ProcessGroupXCCL::WorkXCCL::WorkXCCL(std::vector> outputTensors, + int rank, + c10d::OpType opType, + const c10::optional>& inputTensors) + : Work(rank, opType, nullptr, inputTensors), + outputTensors_(std::move(outputTensors)), + future_(createFutureAsOutput(outputTensors) + ); + +c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL::getFuture() { + return future_; +} + +c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank, + int size) +{ + return c10::make_intrusive(store, rank, size); +} + +c10::intrusive_ptr ProcessGroupNCCL::initWork( + at::Device& device, + int rank, + OpType opType, + const std::vector& inputs, + const std::vector& outputs, + bool record) { + auto r = c10::make_intrusive( + device, + rank, + opType, + seqCollective_, + profilingTitle, + profilingTitle != nullptr ? std::optional>(inputs) + : std::nullopt, + desyncDebug_, + enableTiming_.load(), + dist_debug_level_); + if (record) { + bool isP2P = isP2POp(opType); + r->trace_id_ = NCCLTraceBuffer::get()->record( + local_id_, + std::make_tuple(pg_uid_, pg_desc_), + seqCollective_, + seqP2P_, + op_id_, + profilingTitle ? 
profilingTitle : "", + inputs, + outputs, + r->ncclStartEvent_.get(), + r->ncclEndEvent_.get(), + options_->timeout, + pgStatus_, + isP2P); + } + return r; +} + +ProcessGroupXCCL::~ProcessGroupXCCL() +{ +} + +std::shared_ptr ProcessGroupXCCL::getXCCLComm( + const std::string& deviceKey, + at::Device& device) { + + if (deviceKey.empty()) { + C10_THROW_ERROR( + DistBackendError, + "Not able to create/get the CCL Communicator since " + "the devices are empty "); + } + + { + std::lock_guard lock(mutex_); + if (devXCCLCommMap_.find(deviceKey) != devXCCLCommMap_.end()) { + return devXCCLCommMap_[deviceKey]; + } + } + + std::shared_ptr xcclComm; + + XCCL_KVS kvs = get_kvs(rank_, store_); + + int numRanks, rank; + numRanks = getSize(); + rank = getRank(); + + ccl::vector_class> devs_rank; + c10::impl::VirtualGuardImpl impl(device.type()); + c10::Stream stream = impl.getStream(device); + auto q = get_sycl_queue(stream); + auto ctx = ccl::create_context(q.get_context()); + devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); + auto xcclComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + + { + std::lock_guard lock(mutex_); + inInitializationCommMap_.emplace(deviceKey, ncclComm); + } + + auto it = inInitializationCommMap_.find(deviceKey); + if (it != inInitializationCommMap_.end()) { + devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); + inInitializationCommMap_.erase(deviceKey); + + ncclCommDevIdxMapMutex.lock(); + ncclCommDevIdxMap.emplace(ncclComm, device.index()); + ncclCommDevIdxMapMutex.unlock(); + } + + it = devXCCLCommMap_.find(deviceKey); + TORCH_INTERNAL_ASSERT( + it != devXCCLCommMap_.end(), "Communicators not populated in cache!"); + + return it->second; +} + +template +c10::intrusive_ptr ProcessGroupNCCL::collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType) { + + auto device = input.device(); + const auto key = std::to_string(device.index()); + auto ncclComm = getXCCLComm(key, device); + + std::vector inputs{input}; + std::vector outputs{output}; + + auto work = + initWork(device, rank_, opType, profilingTitle, inputs, outputs, enqueue); + + // Store references to outputs to be used by WorkNCCL::result and operator<<. + work->outputs_ = + std::make_shared>(std::move(outputs)); + + if (avoidRecordStreams) { + work->stashed_for_allocator_safety_ = + std::make_shared>(); + work->stashed_for_allocator_safety_->push_back(input); + } + + at::cuda::OptionalCUDAGuard gpuGuard; + + // Start event should only be recorded before the ncclGroupStart() + if (work->timingEnabled_) { + work->ncclStartEvent_->record(ncclStream); + } + + pre(ncclStream, work); + + ncclComm_t comm = ncclComm->getNcclComm(); + + // Both `inputs' and `outputs' are created on a worker stream and used in + // different ncclStreams. Hence, both must record the ncclStream to + // prevent being freed before the collective finishes. + // + // We only record `inputs' here, and leave recording `outputs' to `fn' for + // operations where `inputs' and `outputs' are not the same. + // + // See [Sync Streams]. 
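+  // NOTE: the allocator/stream bookkeeping below still uses the CUDA/NCCL
+  // helpers this function was adapted from; an XPU build would presumably
+  // substitute the XPU caching allocator and the XCCL stream here.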
+ if (!avoidRecordStreams) { + if (!input.is_sparse()) { + c10::cuda::CUDACachingAllocator::recordStream( + input.storage().data_ptr(), ncclStream); + } else { + // for sparse input case record streams on both index and value + // tensors + c10::cuda::CUDACachingAllocator::recordStream( + input.values().storage().data_ptr(), ncclStream); + c10::cuda::CUDACachingAllocator::recordStream( + input.indices().storage().data_ptr(), ncclStream); + } + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + C10D_NCCL_CHECK( + fn(input, output, comm, ncclStream), + ncclComm->getNcclCommFailureReason()); +#else + C10D_NCCL_CHECK_TIMEOUT( + fn(input, output, comm, ncclStream), + comm, + ncclComm->getNcclCommFailureReason()); +#endif + + post(ncclStream, work); + + // End event should only be recorded after the ncclGroupEnd() + if (!coalescing_state_) { + work->ncclEndEvent_->record(ncclStream); + } + work->ncclComm_ = ncclComm; + + { + c10::cuda::CUDAMultiStreamGuard streamGuard(ncclStream); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + + // Add a callback that runs profiling end callbacks. wrapCallback() in CUDA + // future blocks the stream this callback runs on the corresponding + // ncclEndEvents_ ensuring appropriate synchronization. + if (work->recordFunctionEndCallback_) { + work->future_->addCallback( + [work](at::ivalue::Future& /* unused */) { + work->recordFunctionEndCallback_(); + }, + // uses_future = false allows us to skip synchronization in + // ivalue::Future, but is only valid as long as the lambda doesn't use + // the "Future" argument. + /*uses_future=*/false); + } + work->future_->markCompleted(at::IValue(*work->outputs_)); + } + + // Set appropriate work parameters. + work->blockingWait_ = blockingWait_; + work->avoidRecordStreams_ = avoidRecordStreams; + work->opTimeout_ = options_->timeout; + work->store_ = store_; + // Record size info for debug. 
We only record the size on the first device as + // multi-device per process is deprecated + work->numelIn_ = input.numel(); + work->numelOut_ = output.numel(); + + // Notify graphs before we check the capture status preemptively + at::cuda::CUDAGraph::inc_pending_event_queries(); + if (enqueue) { + workEnqueue(work); + } else { + at::cuda::CUDAGraph::dec_pending_event_queries(); + } + + return work; +} + +c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( + at::Tensor& tensor, + const AllreduceOptions& opts) { + return collective( + tensor, + tensor, + [&](at::Tensor& input, + at::Tensor& output, + ncclComm_t comm, + at::cuda::CUDAStream& stream) { + auto ncclDataType = getNcclDataType(input.scalar_type()); + auto ncclReduceOp = + getNcclReduceOp(opts.reduceOp, input, ncclDataType, comm); + return ncclAllReduce( + input.data_ptr(), + output.data_ptr(), + input.numel(), + ncclDataType, + ncclReduceOp, + comm, + stream.stream()); + }, + OpType::ALLREDUCE, + "nccl:all_reduce"); +} + +c10::intrusive_ptr ProcessGroupXCCL::allreduce( + std::vector& tensors, + const AllreduceOptions& opts) +{ + TORCH_CHECK(tensors.size() == 1, "Expecting one tensor only but got multiple"); + auto tensor = tensors.back(); + check_gpu_single_tensor(tensor); + if (opts.reduceOp == ReduceOp::SUM) { + TORCH_CHECK(false, "Cannot use ReduceOp SUM with XPU") + } + return allreduce_impl(tensor, opts); +} + + +} + +} \ No newline at end of file diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp new file mode 100644 index 00000000000000..39f3c1a5e89964 --- /dev/null +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -0,0 +1,140 @@ +#pragma once + +#if defined(__linux__) +#include +#include +#include +#include +#endif + +#ifdef USE_C10D_XCCL + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace c10d { + +constexpr const char* XCCL_BACKEND_NAME = "xccl"; + +class ProcessGroupXCCL : public Backend { +public: + class WorkXCCL : public Work { + public: + WorkXCCL( + std::vector> outputTensors, + int rank = -1, + OpType opType = UNKNOWN, + const c10::optional>& inputTensors = c10::nullopt) + : outputTensors_(std::move(outputTensors)) {} + + WorkXCCL(const WorkXCCL& w) + : outputTensors_(w.outputTensors_), events_(w.events_) {} + + ~WorkXCCL() override { + // Ensures all events are properly handled before destruction + for (auto& event : events_) { + event.wait(); + } + } + + bool isCompleted() override { + for (const auto& event : events_) { + if (!event.test()) { + return false; + } + } + return true; + } + + bool isSuccess() const override { + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); + } + + void abort() override { + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); + } + + void synchronize() override { + for (auto& event : events_) { + event.wait(); + } + } + + void wait() override { + for (auto& event : events_) { + call_with_lock(globalMutex, [&]() { + CCL_CHECK(event.wait()); + }); + } + events_.clear(); + } + + c10::intrusive_ptr getFuture() override; + + std::vector result() override { + return outputTensors_.empty() ? 
std::vector() : outputTensors_[0]; + } + + protected: + friend class ProcessGroupXCCL; + std::vector events_; + const std::vector> outputTensors_; + c10::intrusive_ptr future_; + }; + + explicit ProcessGroupXCCL(const c10::intrusive_ptr& store, + int rank, + int size) + : store_(store), rank_(rank), size_(size) { + } + + virtual ~ProcessGroupXCCL(); + + const std::string getBackendName() const override { + return std::string(XCCL_BACKEND_NAME); + } + + c10::intrusive_ptr allreduce( + std::vector& tensors, + const AllreduceOptions& opts = AllreduceOptions()) override; + + c10::intrusive_ptr barrier( + const BarrierOptions& opts = BarrierOptions()) override; + + static c10::intrusive_ptr createProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank = -1, + int size = -1); + +private: + int rank_; + int size_; + +public: + std::unordered_map> + inInitializationCommMap_; + std::unordered_map> devXCCLCommMap_; + c10::intrusive_ptr store_; + std::mutex mutex_; +}; + +} // namespace c10d + +#endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/XCCLUtils.hpp b/torch/csrc/distributed/c10d/XCCLUtils.hpp new file mode 100644 index 00000000000000..d52f3df8ea466d --- /dev/null +++ b/torch/csrc/distributed/c10d/XCCLUtils.hpp @@ -0,0 +1,334 @@ +#pragma once + +#ifdef USE_C10D_XCCL + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +// RAII wrapper for NCCL communicator +class XCCLComm { + public: + explicit XCCLComm(ncclComm_t ncclComm) + : ncclComm_(ncclComm), + aborted_(false), + ncclAsyncErr_(ncclSuccess), + commFailureReason_(c10::nullopt), + initialized_(false) {} + + NCCLComm() : NCCLComm(nullptr) {} + + ~NCCLComm() noexcept { + // Add lock in this destructor, as aborted_ needs to be read after memory + // barrier here. + std::unique_lock lock(mutex_); + if (ncclComm_ && !aborted_) { +#ifdef ENABLE_NCCL_ERROR_CHECKING + // Use ncclCommAbort instead of ncclCommDestroy here since + // ncclCommDestroy could block forever waiting for work to complete on + // the communicator. 
+ C10D_NCCL_ASSERT(::ncclCommAbort(ncclComm_)); +#else + C10D_NCCL_ASSERT(::ncclCommDestroy(ncclComm_)); +#endif + } + } + + static std::shared_ptr create( + int numRanks, + int rank, + ncclUniqueId commId) { + auto comm = std::make_shared(); + C10D_NCCL_CHECK( + ncclCommInitRank(&(comm->ncclComm_), numRanks, commId, rank), + c10::nullopt); + comm->ncclId_ = commId; + comm->rank_ = rank; + comm->initialized_ = true; + return comm; + } + +#ifdef NCCL_HAS_COMM_NONBLOCKING + static std::shared_ptr create( + int numRanks, + int rank, + ncclUniqueId commId, + ncclConfig_t& config) { + auto comm = std::make_shared(); + bool isInitialized = false; + if (nccl_use_nonblocking()) { + config.blocking = 0; + LOG(INFO) << "Rank " << rank + << ": creating NCCL communicator in nonblocking mode"; + C10D_NCCL_CHECK_NONBLOCKING( + ncclCommInitRankConfig( + &(comm->ncclComm_), numRanks, commId, rank, &config), + c10::nullopt); + } else { + C10D_NCCL_CHECK( + ncclCommInitRankConfig( + &(comm->ncclComm_), numRanks, commId, rank, &config), + c10::nullopt); + // under blocking mode, comm is initialized after NCCL CHECK + isInitialized = true; + } + comm->ncclId_ = commId; + comm->rank_ = rank; + comm->initialized_ = isInitialized; + return comm; + } +#endif + +#ifdef NCCL_HAS_COMM_SPLIT + static std::shared_ptr split( + NCCLComm* source, + int color_id, + int rank, + ncclConfig_t& config) { + auto comm = std::make_shared(); + C10D_NCCL_CHECK( + ncclCommSplit( + source->ncclComm_, color_id, rank, &(comm->ncclComm_), &config), + c10::nullopt); + ++source->ncclCommSplitCounter_; + comm->rank_ = rank; + return comm; + } +#endif + +#if defined(IS_NCCL_EXP) && defined(NCCL_COMM_DUMP) + std::unordered_map ncclCommDump() { + std::unordered_map dump; + if (isAborted()) { + LOG(INFO) << "Communicator was aborted before trying to dump its state."; + return dump; + } + C10D_NCCL_CHECK(::ncclCommDump(ncclComm_, dump), c10::nullopt); + return dump; + } +#endif + + ncclUniqueId getNcclId() { + return ncclId_; + } + + // Must not be copyable + NCCLComm(const NCCLComm&) = delete; + NCCLComm& operator=(const NCCLComm&) = delete; + + // Do not support move assignment as there is no valid use case + NCCLComm& operator=(NCCLComm&& other) = delete; + + // Move constructable + NCCLComm(NCCLComm&& other) { + // Using other's lock, as it reads other's states + // Can not use this.mutex_, as this object is being constructed. + std::unique_lock lock(other.mutex_); + std::swap(ncclComm_, other.ncclComm_); + std::swap(aborted_, other.aborted_); + std::swap(ncclAsyncErr_, other.ncclAsyncErr_); + std::swap(initialized_, other.initialized_); + } + + ncclComm_t getNcclComm(); + + c10::optional getNcclCommFailureReason() const { + std::unique_lock lock(mutex_); + return commFailureReason_; + } + + void ncclCommAbort( + c10::optional commFailureReason = c10::nullopt) { + std::unique_lock lock(mutex_); +#ifdef ENABLE_NCCL_ERROR_CHECKING + if (aborted_) { + // Should not abort twice. + return; + } + +#ifdef NCCL_HAS_COMM_REGISTER + // Deregister all registered segments before aborting. + for (auto& it : registeredSegmentHandles_) { + void* handle = it.second; + C10D_NCCL_CHECK( + ::ncclCommDeregister(ncclComm_, handle), + c10::str( + "Failed to deregister segment handle ", + handle, + " on ncclComm_ ", + ncclComm_)); + } + registeredSegmentHandles_.clear(); +#endif + + // Set true failure reason if provided by ProcessGroupNCCL (e.g. 
work + // timeout) + commFailureReason_ = commFailureReason; + LOG(INFO) << "Aborting ncclComm_ " << ncclComm_ << " with reason: " + << (commFailureReason ? *commFailureReason + : "No abort reason provided."); +#ifndef NCCL_HAS_COMM_NONBLOCKING + C10D_NCCL_CHECK(::ncclCommAbort(ncclComm_), commFailureReason_); +#else + C10D_NCCL_CHECK_TIMEOUT( + ::ncclCommAbort(ncclComm_), ncclComm_, commFailureReason_); +#endif + aborted_ = true; + ncclComm_ = nullptr; + + // Set an appropriate error so that we avoid using the communicator. + if (ncclAsyncErr_ == ncclSuccess) { + ncclAsyncErr_ = ncclSystemError; + } +#else + // This is a NOOP, if error checks are disabled. + return; +#endif + } + + bool isAborted() const { + std::unique_lock lock(mutex_); + return aborted_; + } + + uint64_t getCommSplitCounter() const { + return ncclCommSplitCounter_; + } + + ncclResult_t checkForNcclError() { + std::unique_lock lock(mutex_); +#ifdef ENABLE_NCCL_ERROR_CHECKING + if (ncclAsyncErr_ != ncclSuccess) { + return ncclAsyncErr_; + } + C10D_NCCL_CHECK( + ncclCommGetAsyncError(ncclComm_, &ncclAsyncErr_), commFailureReason_); + return ncclAsyncErr_; +#else + // Always return success, if error checks are disabled. + return ncclSuccess; +#endif + } + + ncclResult_t registerSegment(void* ptr, size_t size) { + std::unique_lock lock(mutex_); +#ifdef NCCL_HAS_COMM_REGISTER + // We register only segments from cache allocator + // which are guaranteed to be with disjoint addr ranges. Thus, a ptr always + // maps to a unique handle and should not be registered before the current + // ptr is deregistered and freed. + TORCH_CHECK( + registeredSegmentHandles_.count(ptr) == 0, + "Segment with ptr ", + ptr, + " has already been registered on ncclComm_ ", + ncclComm_); + + void* handle; + C10D_NCCL_CHECK( + ncclCommRegister(ncclComm_, ptr, size, &handle), + c10::str( + "Failed to register segment with ptr ", + ptr, + ", size ", + size, + " on ncclComm_ ", + ncclComm_)); + registeredSegmentHandles_[ptr] = handle; + return ncclSuccess; +#else + return ncclInvalidUsage; +#endif + } + + ncclResult_t deregisterSegment(void* ptr) { + std::unique_lock lock(mutex_); +#ifdef NCCL_HAS_COMM_REGISTER + TORCH_CHECK( + registeredSegmentHandles_.count(ptr) == 1, + "Segment with ptr ", + ptr, + " is not registered on ncclComm_ ", + ncclComm_); + + void* handle = registeredSegmentHandles_[ptr]; + C10D_NCCL_CHECK( + ncclCommDeregister(ncclComm_, handle), + c10::str( + "Failed to deregister segment handle ", + handle, + ", with ptr ", + ptr, + " on ncclComm_ ", + ncclComm_)); + registeredSegmentHandles_.erase(ptr); + return ncclSuccess; +#else + return ncclInvalidUsage; +#endif + } + + friend class ProcessGroupNCCL; + + protected: + // a helper function to wait until the communicator is initialized; + void waitUntilInitialized(int timeoutSecs); + ncclComm_t ncclComm_; + // Unique nccl_id for this communicator. + ncclUniqueId ncclId_; + bool aborted_; + uint64_t ncclCommSplitCounter_{0}; + ncclResult_t ncclAsyncErr_; + mutable std::mutex mutex_; + // Rank that this communicator corresponds to. + int rank_; + // Optional reason for communicator failure, provided by ProcessGroupNCCL for + // better error messaging. + c10::optional commFailureReason_; + bool initialized_{false}; +#ifdef NCCL_HAS_COMM_REGISTER + // Stores handlers for tensors registered by NCCL + std::unordered_map registeredSegmentHandles_; +#endif +}; + +// Helper that automatically cleans up premul sums. 
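+// A premul-sum reduction op is obtained from ncclRedOpCreatePreMulSum and has
+// to be released with ncclRedOpDestroy on the same communicator once the
+// collective using it has been issued; this wrapper ties that cleanup to
+// scope exit.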
+struct ncclRedOpRAII { + ncclRedOpRAII() = default; + ncclRedOpRAII(ncclRedOp_t op) : op_(op) {} + ncclRedOpRAII(ncclRedOp_t op, ncclComm_t comm) + : op_(op), comm_(comm), premul_sum_(true) {} + ncclRedOpRAII(const ncclRedOpRAII&) = delete; + ncclRedOpRAII& operator=(const ncclRedOpRAII&) = delete; + ncclRedOpRAII(ncclRedOpRAII&& tmp) : ncclRedOpRAII() { + std::swap(tmp.op_, this->op_); + std::swap(tmp.comm_, this->comm_); + std::swap(tmp.premul_sum_, this->premul_sum_); + } +#if defined(ENABLE_NCCL_PREMUL_SUM_SUPPORT) + ~ncclRedOpRAII() { + if (premul_sum_) { + ncclRedOpDestroy(op_, comm_); + } + } +#endif + operator ncclRedOp_t() const { + return op_; + } + ncclRedOp_t op_; + ncclComm_t comm_; + bool premul_sum_ = false; +}; + +} // namespace c10d + +#endif // USE_C10D_NCCL + diff --git a/torch/csrc/xpu/xccl.cpp b/torch/csrc/xpu/xccl.cpp new file mode 100644 index 00000000000000..5304b43f57d410 --- /dev/null +++ b/torch/csrc/xpu/xccl.cpp @@ -0,0 +1,923 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +xcclComm_t* to_xccl_comm(torch::xpu::xccl::xcclComm_t* var) { + return reinterpret_cast(var); +} + +xcclComm_t to_xccl_comm(torch::xpu::xccl::xcclComm_t var) { + return reinterpret_cast(var); +} + + +xcclDataType_t to_nccl_data_type(c10::ScalarType type) { + switch (type) { + case at::kFloat: + return ccl::datatype::float32; + case at::kHalf: + return ccl::datatype::float16; + case at::kDouble: + return ccl::datatype::float64; + case at::kLong: + return ccl::datatype::int64; + case at::kInt: + return ccl::datatype::int32; + case at::kChar: + return ccl::datatype::int8; + case at::kByte: + return ccl::datatype::uint8; + case at::kBool: + return ccl::datatype::uint8; + case at::kBFloat16: + return ccl::datatype::bfloat16; + default: + TORCH_CHECK(false, "Unconvertible XCCL type ", type); + } +} + +ncclDataType_t to_xccl_data_type(const at::Tensor& t) { + if (!t.is_xpu()) { + TORCH_CHECK( + false, + "XCCL only supports XPU tensors, but got a tensor on ", + t.device()); + } + return to_xccl_data_type(t.scalar_type()); +} + +ccl::reduction to_xccl_red_op(int var) { + return (ccl::reduction)(var); +} + +namespace torch::xpu::xccl { + +XCCL_KVS get_kvs(int rank, c10d::Store& store) { + if (kvs) + return kvs; + // Each process group is with different store, so we use the unique key for + // broadcast the bootstrap network information. 
+ std::string storeKey = "ccl_kvs"; + + // Rank 0 broadcast the bootstrap network information to other ranks + if (rank == 0) { + kvs = ccl::create_main_kvs(); + ccl::kvs::address_type main_addr = kvs->get_address(); + auto ccl_kvs_addr = std::vector(main_addr.begin(), main_addr.end()); + store.set(storeKey, ccl_kvs_addr); + } + else { + auto ccl_kvs_addr = store.get(storeKey); + if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { + throw std::runtime_error( + "Unexpected ccl kvs addr from the store\n"); + } + ccl::kvs::address_type main_addr; + std::copy_n(std::make_move_iterator(ccl_kvs_addr.begin()), + ccl::kvs::address_max_size, + main_addr.begin()); + kvs = ccl::create_kvs(main_addr); + } + + return kvs; +} + + +using namespace at; + +namespace detail { + +void xcclCommInitAll(xcclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { + for(int i = 0; i < nranks; i++) { + newcomm[i] = ccl::create_communicator(nranks, i, get_kvs_addr) + } + c10::Stream dpcpp_stream = impl.getStream(devices[0]); + ccl::vector_class> devs_rank; + newcomm = ccl::create_communicators(nranks, devs_rank, ctx, ) +} + +struct XcclCommList { + std::unique_ptr comms; + int ndevices; + XcclCommList(const std::vector& devices) + : comms(new xcclComm_t[devices.size()]), ndevices(devices.size()) { + xcclCommInitAll( + to_xccl_comm(comms.get()), devices.size(), devices.data()); + } + NcclCommList(NcclCommList&& foo) = default; + ~NcclCommList() { + if (comms) { + for (const auto i : c10::irange(ndevices)) { + comm_destroy(comms[i]); + } + } + } + ArrayRef ref() const { + return ArrayRef(comms.get(), ndevices); + } +}; + +using device_list = std::vector; +// accesses to this object have to be guarded by THC's CudaFreeMutex +std::unordered_map> _communicators; +static std::unordered_map> + _communicators; + +ArrayRef get_communicators(TensorList inputs) { + static auto get_device = [](const at::Tensor& t) -> int { + return t.get_device(); + }; + device_list devices = fmap(inputs, get_device); + auto it = _communicators.find(devices); + if (it == _communicators.end()) { + it = _communicators.emplace(devices, devices).first; + } + return it->second; +} + +static inline void check_tensor( + const at::Tensor& input, + const std::optional& output, + int input_multiplier, + int output_multiplier, + int64_t ref_numel, + ScalarType ref_dtype) { + auto check_one = [&](const at::Tensor& tensor) { + if (!tensor.is_xpu() || tensor.is_sparse()) { + throw std::runtime_error( + "input and output elements have to be xpu dense Tensors"); + } + + if (ref_dtype != tensor.scalar_type()) { + throw std::runtime_error( + "all inputs and outputs must be of the same Tensor dtype"); + } + + if (!tensor.is_contiguous()) { + throw std::runtime_error("all inputs and outputs have to be contiguous"); + } + }; + + check_one(input); + + // all inputs must be same size + if (input.numel() != ref_numel) { + throw std::runtime_error( + "all inputs must have the same number of elements"); + } + + if (output) { + check_one(*output); + + // inputs and outputs must be on same device respectively + if (input.get_device() != output->get_device()) { + throw std::runtime_error("input and output must be on the same device"); + } + + if (output->numel() * output_multiplier != ref_numel * input_multiplier) { + throw std::runtime_error( + "output must be of size input_size * size_multiplier"); + } + } +} + +void check_inputs( + TensorList inputs, + TensorList outputs, + int input_multiplier, + int output_multiplier) { + // len(inputs) == len(outputs) + 
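+  // input_multiplier/output_multiplier describe the size relationship that
+  // check_tensor enforces: output.numel() * output_multiplier must equal
+  // input.numel() * input_multiplier (e.g. all_gather below passes
+  // input_multiplier = len, reduce_scatter passes output_multiplier = len).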
size_t len = inputs.size(); + + if (len <= 0) { + throw std::runtime_error("input sequence can't be empty"); + } + + if (len != outputs.size()) { + std::stringstream err; + err << "inputs and outputs sequences have to be of the same length, but got input of length " + << len << " and output of length " << outputs.size(); + throw std::runtime_error(err.str()); + } + + device_set devices; + int64_t numel = inputs[0].numel(); + auto dtype = inputs[0].scalar_type(); + + for (const auto i : c10::irange(len)) { + auto input = inputs[i]; + auto output = outputs[i]; + + check_tensor( + input, output, input_multiplier, output_multiplier, numel, dtype); + + auto input_device = input.get_device(); + // inputs must be on unique devices + if (devices.test(input_device)) { + throw std::runtime_error("inputs must be on unique devices"); + } + devices.set(input_device); + } +} + +void check_inputs( + TensorList inputs, + const at::Tensor& output, + int root, + int input_multiplier, + int output_multiplier) { + auto len = inputs.size(); + + if (len <= 0) { + throw std::runtime_error("input sequence can't be empty"); + } + + device_set devices; + int64_t numel = inputs[0].numel(); + auto dtype = inputs[0].scalar_type(); + + for (const auto i : c10::irange(len)) { + auto input = inputs[i]; + + check_tensor( + input, + i == static_cast>(root) + ? std::optional{output} + : std::nullopt, + input_multiplier, + output_multiplier, + numel, + dtype); + + auto input_device = input.get_device(); + // inputs must be on unique devices + if (devices.test(input_device)) { + throw std::runtime_error("inputs must be on unique devices"); + } + devices.set(input_device); + } +} + +} // namespace detail + +bool is_available(TensorList tensors) { +#ifdef USE_XCCL + device_set devices; + for (auto& tensor : tensors) { + if (!tensor.is_xpu() || tensor.is_sparse()) + return false; + if (!tensor.is_contiguous()) + return false; + auto device = tensor.get_device(); + if (devices[device]) + return false; + devices[device] = true; + } + return true; +#else + return false; +#endif +} + +std::uint64_t version() { +#if defined(NCCL_MAJOR) + constexpr std::uint64_t ver = (((uint64_t)NCCL_MAJOR) << 32) | + (((uint64_t)NCCL_MINOR) << 16) | ((uint64_t)NCCL_PATCH); + return ver; +#elif defined(USE_NCCL) + // return major version "1" + return ((uint64_t)1) << 32; +#else + return 0; +#endif +} + +ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank) { +#ifdef USE_XCCL + using namespace torch::xpu::xccl::detail; + xcclComm_t comm; + ncclUniqueId id = comm_id; + NCCL_CHECK(ncclCommInitRank( + to_nccl_comm(&comm), nranks, *(to_nccl_unique_id(&id)), rank)); + return comm; +#else + return nullptr; +#endif +} + + +namespace { +// NCCL changed the numerical type used for count between NCCL1 and NCCL2. +// So we use the following struct, which gets the type of the second argument +// of T, if T is a function type, with ncclBcast, to get that type statically +// and programmatically. + +template +struct GetSecondArgType; + +template +struct GetSecondArgType { + typedef typename std::decay::type type; +}; + +constexpr auto count_max = + std::numeric_limits::type>::max(); + +// Since NCCL 2.12.10, NCCL supports send/recv 0 byte: +// https://github.com/NVIDIA/nccl/issues/696. 
The issue of skipping send/recv +// is that it can cause deadlock when a rank send and recv 0 bytes so it's +// completely skipping the collective, causing mismatch across ranks +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR > 13))) +template +constexpr bool _nccl_should_send_recv(C10_UNUSED T _unused_) { + return true; +} +#else +// old NCCL uses 0 byte message for synchronization +// Avoid send/recv when message size is zero +template +inline bool _nccl_should_send_recv(T value) { + return value != 0; +} +#endif +} // namespace + +size_t get_max_count() { + return count_max; +} + +void broadcast( + TensorList tensors, + const stream_list& streams, + const comm_list& user_comms) { +#ifdef USE_NCCL + using namespace torch::cuda::nccl::detail; + check_inputs(tensors, tensors, 1, 1); + auto data_type = to_nccl_data_type(tensors[0]); + int64_t numel = tensors[0].numel(); + + const auto comms = user_comms.empty() ? get_communicators(tensors) + : ArrayRef(user_comms); + + AutoNcclGroup nccl_group_guard; + at::cuda::OptionalCUDAGuard device_guard; + for (size_t i = 0, num_tensors = tensors.size(); i < num_tensors; i++) { + auto device = tensors[i].get_device(); + device_guard.set_index(device); + // Default to the current stream + const auto stream = (streams.empty() || !streams[i]) + ? at::cuda::getCurrentCUDAStream(device).stream() + : streams[i]->stream(); + TORCH_CHECK( + static_cast(numel) <= static_cast(count_max), + "Broadcast tensor has ", + numel, + " elements, which exceeds the " + "maximum NCCL supports (", + count_max, + ")"); + ncclComm_t comm = comms[i]; + NCCL_CHECK(ncclBcast( + tensors[i].data_ptr(), + numel, + data_type, + 0, + to_nccl_comm(comm), + stream)); + } +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void reduce( + const std::vector& inputs, + at::Tensor& output, + int32_t root, + int32_t op, + const stream_list& streams, + const comm_list& user_comms) { +#ifdef USE_NCCL + using namespace torch::cuda::nccl::detail; + TORCH_CHECK( + root >= 0 && static_cast(root) < inputs.size(), "invalid root"); + + check_inputs(inputs, output, root, 1, 1); + const auto len = inputs.size(); + + auto data_type = to_nccl_data_type(inputs[0]); + + const auto count = inputs[0].numel(); + auto comms_ref = user_comms.empty() ? get_communicators(inputs) + : ArrayRef(user_comms); + + AutoNcclGroup nccl_group_guard; + at::cuda::OptionalCUDAGuard device_guard; + for (const auto i : c10::irange(len)) { + auto device = inputs[i].device().index(); + device_guard.set_index(device); + // Default to the current stream + const auto stream = (streams.empty() || !streams[i]) + ? at::cuda::getCurrentCUDAStream(device).stream() + : streams[i]->stream(); + + ncclComm_t comm = comms_ref[i]; + NCCL_CHECK(ncclReduce( + inputs[i].data_ptr(), + static_cast>(root) == i + ? 
output.data_ptr() + : nullptr, + count, + data_type, + to_nccl_red_op(op), + root, + to_nccl_comm(comm), + stream)); + } +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void reduce( + std::vector& inputs, + int32_t root, + int32_t op, + const stream_list& streams, + const comm_list& user_comms) { + reduce(inputs, /*output=*/inputs[root], root, op, streams, user_comms); +} + +void all_reduce( + const std::vector& inputs, + std::vector& outputs, + int32_t op, + const stream_list& streams, + const comm_list& user_comms) { +#ifdef USE_NCCL + using namespace torch::cuda::nccl::detail; + check_inputs(inputs, outputs, 1, 1); + const auto len = inputs.size(); + + auto data_type = to_nccl_data_type(inputs[0]); + + const auto count = inputs[0].numel(); + auto comms_ref = user_comms.empty() ? get_communicators(inputs) + : ArrayRef(user_comms); + + AutoNcclGroup nccl_group_guard; + at::cuda::OptionalCUDAGuard device_guard; + for (const auto i : c10::irange(len)) { + auto device = inputs[i].device().index(); + device_guard.set_index(device); + // Default to the current stream + const auto stream = (streams.empty() || !streams[i]) + ? at::cuda::getCurrentCUDAStream(device).stream() + : streams[i]->stream(); + + ncclComm_t comm = comms_ref[i]; + NCCL_CHECK(ncclAllReduce( + inputs[i].data_ptr(), + outputs[i].data_ptr(), + count, + data_type, + to_nccl_red_op(op), + to_nccl_comm(comm), + stream)); + } +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void reduce_scatter( + const std::vector& inputs, + std::vector& outputs, + int32_t op, + const stream_list& streams, + const comm_list& user_comms) { +#ifdef USE_NCCL + using namespace torch::cuda::nccl::detail; + const auto len = inputs.size(); + check_inputs(inputs, outputs, 1, len); + + auto data_type = to_nccl_data_type(inputs[0]); + + const auto count = inputs[0].numel() / len; + auto comms_ref = user_comms.empty() ? get_communicators(inputs) + : ArrayRef(user_comms); + + AutoNcclGroup nccl_group_guard; + at::cuda::OptionalCUDAGuard device_guard; + for (const auto i : c10::irange(len)) { + auto device = inputs[i].device().index(); + device_guard.set_index(device); + // Default to the current stream + const auto stream = (streams.empty() || !streams[i]) + ? at::cuda::getCurrentCUDAStream(device).stream() + : streams[i]->stream(); + + ncclComm_t comm = comms_ref[i]; + NCCL_CHECK(ncclReduceScatter( + inputs[i].data_ptr(), + outputs[i].data_ptr(), + count, + data_type, + to_nccl_red_op(op), + to_nccl_comm(comm), + stream)); + } +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void all_gather( + const std::vector& inputs, + std::vector& outputs, + const stream_list& streams, + const comm_list& user_comms) { +#ifdef USE_NCCL + using namespace torch::cuda::nccl::detail; + const auto len = inputs.size(); + check_inputs(inputs, outputs, len, 1); + + auto data_type = to_nccl_data_type(inputs[0]); + + const auto count = inputs[0].numel(); + auto comms_ref = user_comms.empty() ? get_communicators(inputs) + : ArrayRef(user_comms); + + AutoNcclGroup nccl_group_guard; + at::cuda::OptionalCUDAGuard device_guard; + for (const auto i : c10::irange(len)) { + auto device = inputs[i].device().index(); + device_guard.set_index(device); + // Default to the current stream + const auto stream = (streams.empty() || !streams[i]) + ? 
at::cuda::getCurrentCUDAStream(device).stream() + : streams[i]->stream(); + + ncclComm_t comm = comms_ref[i]; +#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) + NCCL_CHECK(ncclAllGather( + inputs[i].data_ptr(), + outputs[i].data_ptr(), + count, + data_type, + to_nccl_comm(comm), + stream)); +#else + NCCL_CHECK(ncclAllGather( + inputs[i].data_ptr(), + count, + data_type, + outputs[i].data_ptr(), + to_nccl_comm(comm), + stream)); +#endif + } +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void all2all_single_equal_split( + at::Tensor& input, + at::Tensor& output, + int size, + ncclComm_t _comm, + at::cuda::CUDAStream& stream) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; + + int numranks; + auto type = to_nccl_data_type(input); + size_t count = input.numel() / size; + size_t rankdiff = input.nbytes() / size; + const auto* sendbuff = reinterpret_cast(input.const_data_ptr()); + auto* recvbuff = reinterpret_cast(output.data_ptr()); + auto comm = to_nccl_comm(_comm); +#if defined(USE_ROCM) + NCCL_CHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream)); +#else + NCCL_CHECK(ncclCommCount(comm, &numranks)); + NCCL_CHECK(ncclGroupStart()); + for (const auto r : c10::irange(numranks)) { + if (_nccl_should_send_recv(count)) { + NCCL_CHECK( + ncclSend(sendbuff + r * rankdiff, count, type, r, comm, stream)); + NCCL_CHECK( + ncclRecv(recvbuff + r * rankdiff, count, type, r, comm, stream)); + } + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclGroupEnd()); +#else + NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); +#endif +#endif +#else + AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void all2all_single_unequal_split( + void* sendbuff, + const size_t* sendcounts, + const size_t* senddispls, + void* recvbuff, + const size_t* recvcounts, + const size_t* recvdispls, + size_t size, + c10::ScalarType _type, + ncclComm_t _comm, + at::cuda::CUDAStream& stream) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; + + auto type = to_nccl_data_type(_type); + auto comm = to_nccl_comm(_comm); + int numranks; + NCCL_CHECK(ncclCommCount(comm, &numranks)); + NCCL_CHECK(ncclGroupStart()); + for (const auto r : c10::irange(numranks)) { + if (_nccl_should_send_recv(sendcounts[r])) { + NCCL_CHECK(ncclSend( + ((char*)sendbuff) + senddispls[r] * size, + sendcounts[r], + type, + r, + comm, + stream)); + } + if (_nccl_should_send_recv(recvcounts[r])) { + NCCL_CHECK(ncclRecv( + ((char*)recvbuff) + recvdispls[r] * size, + recvcounts[r], + type, + r, + comm, + stream)); + } + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclGroupEnd()); +#else + NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); +#endif +#else + AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void all2all( + std::vector& outputTensors, + std::vector& inputTensors, + ncclComm_t _comm, + at::cuda::CUDAStream& stream) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; + auto comm = to_nccl_comm(_comm); + + NCCL_CHECK(ncclGroupStart()); + for (const auto r : c10::irange(outputTensors.size())) { + at::Tensor& input = 
inputTensors[r]; + at::Tensor& output = outputTensors[r]; + + if (_nccl_should_send_recv(input.numel())) { + NCCL_CHECK(ncclSend( + input.data_ptr(), + input.numel(), + to_nccl_data_type(input), + r, + comm, + stream.stream())); + } + if (_nccl_should_send_recv(output.numel())) { + NCCL_CHECK(ncclRecv( + output.data_ptr(), + output.numel(), + to_nccl_data_type(output), + r, + comm, + stream.stream())); + } + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclGroupEnd()); +#else + NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); +#endif +#else + AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void send( + const at::Tensor& input, + ncclComm_t comm, + at::cuda::CUDAStream stream, + int dst) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclSend( + input.data_ptr(), + input.numel(), + to_nccl_data_type(input), + dst, + to_nccl_comm(comm), + stream.stream())); +#else + NCCL_CHECK_TIMEOUT( + ncclSend( + input.data_ptr(), + input.numel(), + to_nccl_data_type(input), + dst, + to_nccl_comm(comm), + stream.stream()), + comm); +#endif +#else + AT_ERROR("Send is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void recv( + at::Tensor& output, + ncclComm_t comm, + at::cuda::CUDAStream stream, + int src) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclRecv( + output.data_ptr(), + output.numel(), + to_nccl_data_type(output), + src, + to_nccl_comm(comm), + stream.stream())); +#else + NCCL_CHECK_TIMEOUT( + ncclRecv( + output.data_ptr(), + output.numel(), + to_nccl_data_type(output), + src, + to_nccl_comm(comm), + stream.stream()), + comm); +#endif +#else + AT_ERROR("Recv is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void gather( + const at::Tensor& inputs, + std::vector& outputs, + ncclComm_t _comm, + at::cuda::CUDAStream& stream, + int32_t root) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; + + auto comm = to_nccl_comm(_comm); + int numranks, cur_rank; + NCCL_CHECK(ncclCommCount(comm, &numranks)); + NCCL_CHECK(ncclCommUserRank(comm, &cur_rank)); + + size_t count = inputs.numel(); + auto type = to_nccl_data_type(inputs); + const auto* sendbuff = reinterpret_cast(inputs.const_data_ptr()); + + NCCL_CHECK(ncclGroupStart()); + + if (cur_rank == root) { + for (const auto r : c10::irange(numranks)) { + if (r != root) { + auto* recvbuff = reinterpret_cast(outputs[r].data_ptr()); + NCCL_CHECK(ncclRecv(recvbuff, count, type, r, comm, stream)); + } else { + // on its own rank, simply copy from the input + outputs[r].copy_(inputs); + } + } + } else { + NCCL_CHECK(ncclSend(sendbuff, count, type, root, comm, stream)); + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclGroupEnd()); +#else + NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); +#endif + +#else + AT_ERROR("gather is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void scatter( + const std::vector& 
inputs, + at::Tensor& outputs, + ncclComm_t _comm, + at::cuda::CUDAStream& stream, + int32_t root) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; + + auto comm = to_nccl_comm(_comm); + int numranks, cur_rank; +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclCommCount(comm, &numranks)); + NCCL_CHECK(ncclCommUserRank(comm, &cur_rank)); +#else + NCCL_CHECK_TIMEOUT(ncclCommCount(comm, &numranks), _comm); + NCCL_CHECK_TIMEOUT(ncclCommUserRank(comm, &cur_rank), _comm); +#endif + NCCL_CHECK(ncclGroupStart()); + if (cur_rank == root) { + for (const auto r : c10::irange(numranks)) { + if (r != root) { + size_t send_count = inputs[r].numel(); + auto send_type = to_nccl_data_type(inputs[r]); + const auto* sendbuff = + reinterpret_cast(inputs[r].const_data_ptr()); + NCCL_CHECK(ncclSend(sendbuff, send_count, send_type, r, comm, stream)); + } else { + // on its own rank, simply copy it to the output + outputs.copy_(inputs[r]); + } + } + } else { + size_t recv_count = outputs.numel(); + auto recv_type = to_nccl_data_type(outputs); + auto* recvbuff = reinterpret_cast(outputs.data_ptr()); + NCCL_CHECK(ncclRecv(recvbuff, recv_count, recv_type, root, comm, stream)); + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclGroupEnd()); +#else + NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); +#endif +#else + AT_ERROR("scatter is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +} // namespace torch::cuda::nccl + diff --git a/torch/csrc/xpu/xccl.h b/torch/csrc/xpu/xccl.h new file mode 100644 index 00000000000000..d844f166ec5ab1 --- /dev/null +++ b/torch/csrc/xpu/xccl.h @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace torch::xpu::xccl { + +using xcclComm_t = ccl::communicator; + +using XCCL_KVS = ccl::shared_ptr_class; + +ccl::shared_ptr_class kvs; +std::vector kvs_addr; + +XCCL_KVS get_kvs(int rank, c10d::Store& store) +class Comms { +public: + + explicit Comms(ccl::vector_class &comms) : + comms(std::move(comms)), streams{} {} + + explicit Comms(ccl::vector_class &comms, ccl::vector_class &streams, std::vector &torch_streams) : + comms(std::move(comms)), streams(std::move(streams)), torch_streams(std::move(torch_streams)) {} + + ~Comms() noexcept(false) {} + + Comms() = delete; + + Comms(const Comms &) = delete; + + Comms &operator=(const Comms &) = delete; + + Comms(Comms &&other) : comms(std::move(other.comms)), streams(std::move(other.streams)), + torch_streams(std::move(other.torch_streams)) {} + + Comms &operator=(Comms &&other) { + std::swap(comms, other.comms); + std::swap(streams, other.streams); + std::swap(torch_streams, other.torch_streams); + return *this; + } + +public: + // The Communicators used by XCCL + ccl::vector_class comms; + // The streams used by XCCL + ccl::vector_class streams; + // one to one mapping the torch streams to the ccl::stream. 
+ std::vector torch_streams; +}; + +enum class xcclRedOp { Sum = 0, Prod = 1, Max = 2, Min = 3}; + +enum class xcclDataType { + Int8 = 0, + Char = 0, + Uint8 = 1, + Int32 = 2, + Int = 2, + Uint32 = 3, + Int64 = 4, + Uint64 = 5, + Float16 = 6, + Half = 6, + Float32 = 7, + Float = 7, + Float64 = 8, + Double = 8, + Bfloat16 = 9, + NumTypes = 10 +}; + +namespace detail { + + at::ArrayRef get_communicators( + at::TensorList inputs); + void check_inputs( + at::TensorList inputs, + at::TensorList outputs, + int input_multiplier, + int output_multiplier); + void check_inputs( + at::TensorList inputs, + const at::Tensor& output, + int root, + int input_multiplier, + int output_multiplier); + +} // namespace detail + +using comm_list = std::vector; +using stream_list = std::vector>; + + std::uint64_t version(); + const char* version_suffix(); + +bool is_available(at::TensorList tensors); + +comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank); + void comm_destroy(ncclComm_t comm); + +void all_reduce( + const std::vector& inputs, + std::vector& outputs, + int32_t op = static_cast(xcclRedOp::Sum), + const stream_list& streams = {}, + const comm_list& user_comms = {}); +} // namespace torch::xpu::xccl + From 93a4bdb962b9c8e5bffa0f9b5716dbe6df05bda4 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 30 Aug 2024 07:44:51 +0000 Subject: [PATCH 02/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 323 +++---- .../distributed/c10d/ProcessGroupXCCL.hpp | 56 +- torch/csrc/distributed/c10d/XCCLUtils.hpp | 334 ------- torch/csrc/xpu/xccl.cpp | 850 +++--------------- torch/csrc/xpu/xccl.h | 56 +- 5 files changed, 295 insertions(+), 1324 deletions(-) delete mode 100644 torch/csrc/distributed/c10d/XCCLUtils.hpp diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 9466a0c091c99c..3325691c3a8531 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -14,9 +14,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -31,16 +28,14 @@ namespace c10d { namespace { -std::map xcclOps = - { +std::map xcclOps = { {ReduceOp::MIN, ccl::reduction::min}, {ReduceOp::MAX, ccl::reduction::max}, {ReduceOp::SUM, ccl::reduction::sum}, {ReduceOp::PRODUCT, ccl::reduction::prod}, - }; +}; -std::map xcclDatatypes = - { +std::map xcclDatatypes = { {at::kByte, ccl::datatype::uint8}, {at::kChar, ccl::datatype::int8}, {at::kShort, ccl::datatype::int16}, @@ -51,96 +46,89 @@ std::map xcclDatatypes = {at::kDouble, ccl::datatype::float64}, {at::kBFloat16, ccl::datatype::bfloat16}, {at::kBool, ccl::datatype::uint8}, - }; +}; -void check_gpu_single_tensor( - const at::Tensor& tensor -) { +void check_gpu_single_tensor(const at::Tensor& tensor) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); } if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); - } + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); } } - } // namespace +ccl::datatype getXcclDataType(at::ScalarType type) { + auto it = xcclDatatypes.find(type); + TORCH_CHECK_WITH( + TypeError, + it != xcclDatatypes.end(), + "Input tensor data type is not supported for XCCL process group: ", + type); + return it->second; +} + +} // namespace c10d + namespace { -ProcessGroupXCCL::WorkXCCL::WorkXCCL(std::vector> outputTensors, - int rank, - c10d::OpType opType, - const 
c10::optional>& inputTensors) - : Work(rank, opType, nullptr, inputTensors), - outputTensors_(std::move(outputTensors)), - future_(createFutureAsOutput(outputTensors) - ); +static std::mutex xcclCommDevIdxMapMutex; +static std::unordered_map, int> xcclCommDevIdxMap; + +template < + template + class WorkXCCL, + typename RunF, + typename CommType, + typename InputType, + typename OutputType, + typename attr_t> +c10::intrusive_ptr make_work_ccl( + const std::vector& inputs, + const std::vector& outputs, + RunF f, + CommType& comms, + attr_t& attr, + int rank, + c10d::OpType op_type) { + c10::intrusive_ptr> + ret_ptr = c10::make_intrusive< + WorkCCL>( + inputs, outputs, f, comms, attr, rank, op_type); + return ret_ptr; +} -c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL::getFuture() { +ProcessGroupXCCL::WorkXCCL::WorkXCCL( + std::vector> outputTensors, + int rank, + c10d::OpType opType, + const c10::optional>& inputTensors) + : Work(rank, opType, nullptr, inputTensors), + outputTensors_(std::move(outputTensors)), + future_(createFutureAsOutput(outputTensors)) {} + +c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL:: + getFuture() { return future_; } c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, - int size) -{ + int size) { return c10::make_intrusive(store, rank, size); } -c10::intrusive_ptr ProcessGroupNCCL::initWork( - at::Device& device, - int rank, - OpType opType, - const std::vector& inputs, - const std::vector& outputs, - bool record) { - auto r = c10::make_intrusive( - device, - rank, - opType, - seqCollective_, - profilingTitle, - profilingTitle != nullptr ? std::optional>(inputs) - : std::nullopt, - desyncDebug_, - enableTiming_.load(), - dist_debug_level_); - if (record) { - bool isP2P = isP2POp(opType); - r->trace_id_ = NCCLTraceBuffer::get()->record( - local_id_, - std::make_tuple(pg_uid_, pg_desc_), - seqCollective_, - seqP2P_, - op_id_, - profilingTitle ? 
profilingTitle : "", - inputs, - outputs, - r->ncclStartEvent_.get(), - r->ncclEndEvent_.get(), - options_->timeout, - pgStatus_, - isP2P); - } - return r; -} - -ProcessGroupXCCL::~ProcessGroupXCCL() -{ -} +ProcessGroupXCCL::~ProcessGroupXCCL() {} std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, at::Device& device) { - if (deviceKey.empty()) { C10_THROW_ERROR( DistBackendError, "Not able to create/get the CCL Communicator since " - "the devices are empty "); + "the devices are empty "); } { @@ -164,11 +152,11 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( auto q = get_sycl_queue(stream); auto ctx = ccl::create_context(q.get_context()); devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - auto xcclComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + xcclComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); { std::lock_guard lock(mutex_); - inInitializationCommMap_.emplace(deviceKey, ncclComm); + inInitializationCommMap_.emplace(deviceKey, xcclComm); } auto it = inInitializationCommMap_.find(deviceKey); @@ -176,9 +164,9 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); inInitializationCommMap_.erase(deviceKey); - ncclCommDevIdxMapMutex.lock(); - ncclCommDevIdxMap.emplace(ncclComm, device.index()); - ncclCommDevIdxMapMutex.unlock(); + xcclCommDevIdxMapMutex.lock(); + xcclCommDevIdxMap.emplace(xcclComm, device.index()); + xcclCommDevIdxMapMutex.unlock(); } it = devXCCLCommMap_.find(deviceKey); @@ -189,168 +177,87 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( } template -c10::intrusive_ptr ProcessGroupNCCL::collective( +c10::intrusive_ptr ProcessGroupXCCL::collective( at::Tensor& input, at::Tensor& output, Fn fn, PreProcess pre, PostProcess post, OpType opType) { + using traits = function_traits; + using attr_t = typename traits::template arg<2>::type; + attr_t attr = ccl::create_operation_attr(); auto device = input.device(); const auto key = std::to_string(device.index()); - auto ncclComm = getXCCLComm(key, device); + auto xcclComm = getXCCLComm(key, device); std::vector inputs{input}; std::vector outputs{output}; - auto work = - initWork(device, rank_, opType, profilingTitle, inputs, outputs, enqueue); - - // Store references to outputs to be used by WorkNCCL::result and operator<<. - work->outputs_ = - std::make_shared>(std::move(outputs)); - - if (avoidRecordStreams) { - work->stashed_for_allocator_safety_ = - std::make_shared>(); - work->stashed_for_allocator_safety_->push_back(input); - } + c10::intrusive_ptr work; + // work = + // initWork(device, rank_, opType, profilingTitle, inputs, outputs, + // enqueue); - at::cuda::OptionalCUDAGuard gpuGuard; - - // Start event should only be recorded before the ncclGroupStart() - if (work->timingEnabled_) { - work->ncclStartEvent_->record(ncclStream); - } - - pre(ncclStream, work); - - ncclComm_t comm = ncclComm->getNcclComm(); - - // Both `inputs' and `outputs' are created on a worker stream and used in - // different ncclStreams. Hence, both must record the ncclStream to - // prevent being freed before the collective finishes. - // - // We only record `inputs' here, and leave recording `outputs' to `fn' for - // operations where `inputs' and `outputs' are not the same. - // - // See [Sync Streams]. 
- if (!avoidRecordStreams) { - if (!input.is_sparse()) { - c10::cuda::CUDACachingAllocator::recordStream( - input.storage().data_ptr(), ncclStream); - } else { - // for sparse input case record streams on both index and value - // tensors - c10::cuda::CUDACachingAllocator::recordStream( - input.values().storage().data_ptr(), ncclStream); - c10::cuda::CUDACachingAllocator::recordStream( - input.indices().storage().data_ptr(), ncclStream); - } - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - C10D_NCCL_CHECK( - fn(input, output, comm, ncclStream), - ncclComm->getNcclCommFailureReason()); -#else - C10D_NCCL_CHECK_TIMEOUT( - fn(input, output, comm, ncclStream), - comm, - ncclComm->getNcclCommFailureReason()); -#endif - - post(ncclStream, work); - - // End event should only be recorded after the ncclGroupEnd() - if (!coalescing_state_) { - work->ncclEndEvent_->record(ncclStream); - } - work->ncclComm_ = ncclComm; - - { - c10::cuda::CUDAMultiStreamGuard streamGuard(ncclStream); - std::vector devices{device}; - work->future_ = c10::make_intrusive( - c10::ListType::create(c10::TensorType::get()), devices); - - // Add a callback that runs profiling end callbacks. wrapCallback() in CUDA - // future blocks the stream this callback runs on the corresponding - // ncclEndEvents_ ensuring appropriate synchronization. - if (work->recordFunctionEndCallback_) { - work->future_->addCallback( - [work](at::ivalue::Future& /* unused */) { - work->recordFunctionEndCallback_(); - }, - // uses_future = false allows us to skip synchronization in - // ivalue::Future, but is only valid as long as the lambda doesn't use - // the "Future" argument. - /*uses_future=*/false); - } - work->future_->markCompleted(at::IValue(*work->outputs_)); - } - - // Set appropriate work parameters. - work->blockingWait_ = blockingWait_; - work->avoidRecordStreams_ = avoidRecordStreams; - work->opTimeout_ = options_->timeout; - work->store_ = store_; - // Record size info for debug. 
We only record the size on the first device as - // multi-device per process is deprecated - work->numelIn_ = input.numel(); - work->numelOut_ = output.numel(); - - // Notify graphs before we check the capture status preemptively - at::cuda::CUDAGraph::inc_pending_event_queries(); - if (enqueue) { - workEnqueue(work); - } else { - at::cuda::CUDAGraph::dec_pending_event_queries(); - } + work = make_work_ccl( + inputs, outputs, fn, xcclComm, attr, rank_, op_type); + // pre(ncclStream, work); + // ncclComm_t comm = ncclComm->getNcclComm(); + // post(ncclStream, work); return work; } -c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( - at::Tensor& tensor, +template +c10::intrusive_ptr ProcessGroupNCCL::collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + OpType opType) { + return collective( + input, + output, + fn, + [](std::vector&) {}, + [](std::vector&) {}, + opType); +} + +c10::intrusive_ptr ProcessGroupXCCL::allreduce( + std::vector& tensors, const AllreduceOptions& opts) { + TORCH_CHECK( + tensors.size() == 1, "Expecting one tensor only but got multiple"); + auto tensor = tensors.back(); + check_gpu_single_tensor(tensor); + if (opts.reduceOp == ReduceOp::AVG) { + TORCH_CHECK(false, "Cannot use ReduceOp AVG with XPU") + } return collective( tensor, tensor, [&](at::Tensor& input, at::Tensor& output, - ncclComm_t comm, - at::cuda::CUDAStream& stream) { - auto ncclDataType = getNcclDataType(input.scalar_type()); - auto ncclReduceOp = - getNcclReduceOp(opts.reduceOp, input, ncclDataType, comm); - return ncclAllReduce( + ccl::allreduce_attr attr, + xcclComm_t comm, + ccl::stream& stream) { + ccl::event ret_evt; + ccl::datatype datatype = getXcclDataType(input.scalar_type()); + ret_evt = ccl::allreduce( input.data_ptr(), output.data_ptr(), - input.numel(), - ncclDataType, - ncclReduceOp, + (size_t)input.numel(), + getXcclDataType(input.scalar_type()), + xcclOp.at(opts.reduceOp), comm, - stream.stream()); + stream, + attr); + return ret_evt; }, - OpType::ALLREDUCE, - "nccl:all_reduce"); + OpType::ALLREDUCE); } -c10::intrusive_ptr ProcessGroupXCCL::allreduce( - std::vector& tensors, - const AllreduceOptions& opts) -{ - TORCH_CHECK(tensors.size() == 1, "Expecting one tensor only but got multiple"); - auto tensor = tensors.back(); - check_gpu_single_tensor(tensor); - if (opts.reduceOp == ReduceOp::SUM) { - TORCH_CHECK(false, "Cannot use ReduceOp SUM with XPU") - } - return allreduce_impl(tensor, opts); -} - - -} +} // namespace -} \ No newline at end of file +#endif // USE_C10D_XCCL \ No newline at end of file diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 39f3c1a5e89964..51801ed992edcc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -10,6 +10,7 @@ #ifdef USE_C10D_XCCL #include +#include #include #include #include @@ -34,24 +35,25 @@ namespace c10d { constexpr const char* XCCL_BACKEND_NAME = "xccl"; class ProcessGroupXCCL : public Backend { -public: + public: class WorkXCCL : public Work { - public: - WorkXCCL( + public: + WorkXCCL( std::vector> outputTensors, int rank = -1, OpType opType = UNKNOWN, - const c10::optional>& inputTensors = c10::nullopt) - : outputTensors_(std::move(outputTensors)) {} + const c10::optional>& inputTensors = + c10::nullopt) + : Work(rank, opType), outputTensors_(std::move(outputTensors)) {} WorkXCCL(const WorkXCCL& w) : outputTensors_(w.outputTensors_), events_(w.events_) {} ~WorkXCCL() override { - // Ensures 
all events are properly handled before destruction - for (auto& event : events_) { - event.wait(); - } + // Ensures all events are properly handled before destruction + for (auto& event : events_) { + event.wait(); + } } bool isCompleted() override { @@ -64,11 +66,12 @@ class ProcessGroupXCCL : public Backend { } bool isSuccess() const override { - TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); + TORCH_CHECK( + false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); } void abort() override { - TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); } void synchronize() override { @@ -78,34 +81,37 @@ class ProcessGroupXCCL : public Backend { } void wait() override { + std::lock_guard lock(mutex_); for (auto& event : events_) { - call_with_lock(globalMutex, [&]() { - CCL_CHECK(event.wait()); - }); + CCL_CHECK(event.wait()); } events_.clear(); } - c10::intrusive_ptr getFuture() override; + c10::intrusive_ptr getFuture() override { + TORCH_CHECK( + false, "ProcessGroupXCCL::WorkXCCL::getFuture not implemented"); + } std::vector result() override { - return outputTensors_.empty() ? std::vector() : outputTensors_[0]; + return outputTensors_.empty() ? std::vector() + : outputTensors_[0]; } - protected: + protected: friend class ProcessGroupXCCL; std::vector events_; const std::vector> outputTensors_; c10::intrusive_ptr future_; }; - explicit ProcessGroupXCCL(const c10::intrusive_ptr& store, - int rank, - int size) - : store_(store), rank_(rank), size_(size) { - } + explicit ProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank, + int size) + : store_(store), rank_(rank), size_(size) {} - virtual ~ProcessGroupXCCL(); + ProcessGroupXCCL::~ProcessGroupXCCL() = default; const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); @@ -123,11 +129,11 @@ class ProcessGroupXCCL : public Backend { int rank = -1, int size = -1); -private: + private: int rank_; int size_; -public: + public: std::unordered_map> inInitializationCommMap_; std::unordered_map> devXCCLCommMap_; diff --git a/torch/csrc/distributed/c10d/XCCLUtils.hpp b/torch/csrc/distributed/c10d/XCCLUtils.hpp deleted file mode 100644 index d52f3df8ea466d..00000000000000 --- a/torch/csrc/distributed/c10d/XCCLUtils.hpp +++ /dev/null @@ -1,334 +0,0 @@ -#pragma once - -#ifdef USE_C10D_XCCL - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -// RAII wrapper for NCCL communicator -class XCCLComm { - public: - explicit XCCLComm(ncclComm_t ncclComm) - : ncclComm_(ncclComm), - aborted_(false), - ncclAsyncErr_(ncclSuccess), - commFailureReason_(c10::nullopt), - initialized_(false) {} - - NCCLComm() : NCCLComm(nullptr) {} - - ~NCCLComm() noexcept { - // Add lock in this destructor, as aborted_ needs to be read after memory - // barrier here. - std::unique_lock lock(mutex_); - if (ncclComm_ && !aborted_) { -#ifdef ENABLE_NCCL_ERROR_CHECKING - // Use ncclCommAbort instead of ncclCommDestroy here since - // ncclCommDestroy could block forever waiting for work to complete on - // the communicator. 
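// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the committed diff): caller-side use of the
// WorkXCCL object defined above.  The driver is hypothetical; it relies only
// on the allreduce(), wait() and result() members introduced in this series
// and assumes `t` already lives on an XPU device.
// ---------------------------------------------------------------------------
#include <torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp>

void allreduce_and_wait(c10d::ProcessGroupXCCL& pg, at::Tensor& t) {
  std::vector<at::Tensor> tensors{t};
  c10::intrusive_ptr<c10d::Work> work = pg.allreduce(tensors);
  work->wait();                   // blocks until every recorded ccl::event completes
  auto outputs = work->result();  // outputTensors_[0], i.e. the reduced tensors
}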
- C10D_NCCL_ASSERT(::ncclCommAbort(ncclComm_)); -#else - C10D_NCCL_ASSERT(::ncclCommDestroy(ncclComm_)); -#endif - } - } - - static std::shared_ptr create( - int numRanks, - int rank, - ncclUniqueId commId) { - auto comm = std::make_shared(); - C10D_NCCL_CHECK( - ncclCommInitRank(&(comm->ncclComm_), numRanks, commId, rank), - c10::nullopt); - comm->ncclId_ = commId; - comm->rank_ = rank; - comm->initialized_ = true; - return comm; - } - -#ifdef NCCL_HAS_COMM_NONBLOCKING - static std::shared_ptr create( - int numRanks, - int rank, - ncclUniqueId commId, - ncclConfig_t& config) { - auto comm = std::make_shared(); - bool isInitialized = false; - if (nccl_use_nonblocking()) { - config.blocking = 0; - LOG(INFO) << "Rank " << rank - << ": creating NCCL communicator in nonblocking mode"; - C10D_NCCL_CHECK_NONBLOCKING( - ncclCommInitRankConfig( - &(comm->ncclComm_), numRanks, commId, rank, &config), - c10::nullopt); - } else { - C10D_NCCL_CHECK( - ncclCommInitRankConfig( - &(comm->ncclComm_), numRanks, commId, rank, &config), - c10::nullopt); - // under blocking mode, comm is initialized after NCCL CHECK - isInitialized = true; - } - comm->ncclId_ = commId; - comm->rank_ = rank; - comm->initialized_ = isInitialized; - return comm; - } -#endif - -#ifdef NCCL_HAS_COMM_SPLIT - static std::shared_ptr split( - NCCLComm* source, - int color_id, - int rank, - ncclConfig_t& config) { - auto comm = std::make_shared(); - C10D_NCCL_CHECK( - ncclCommSplit( - source->ncclComm_, color_id, rank, &(comm->ncclComm_), &config), - c10::nullopt); - ++source->ncclCommSplitCounter_; - comm->rank_ = rank; - return comm; - } -#endif - -#if defined(IS_NCCL_EXP) && defined(NCCL_COMM_DUMP) - std::unordered_map ncclCommDump() { - std::unordered_map dump; - if (isAborted()) { - LOG(INFO) << "Communicator was aborted before trying to dump its state."; - return dump; - } - C10D_NCCL_CHECK(::ncclCommDump(ncclComm_, dump), c10::nullopt); - return dump; - } -#endif - - ncclUniqueId getNcclId() { - return ncclId_; - } - - // Must not be copyable - NCCLComm(const NCCLComm&) = delete; - NCCLComm& operator=(const NCCLComm&) = delete; - - // Do not support move assignment as there is no valid use case - NCCLComm& operator=(NCCLComm&& other) = delete; - - // Move constructable - NCCLComm(NCCLComm&& other) { - // Using other's lock, as it reads other's states - // Can not use this.mutex_, as this object is being constructed. - std::unique_lock lock(other.mutex_); - std::swap(ncclComm_, other.ncclComm_); - std::swap(aborted_, other.aborted_); - std::swap(ncclAsyncErr_, other.ncclAsyncErr_); - std::swap(initialized_, other.initialized_); - } - - ncclComm_t getNcclComm(); - - c10::optional getNcclCommFailureReason() const { - std::unique_lock lock(mutex_); - return commFailureReason_; - } - - void ncclCommAbort( - c10::optional commFailureReason = c10::nullopt) { - std::unique_lock lock(mutex_); -#ifdef ENABLE_NCCL_ERROR_CHECKING - if (aborted_) { - // Should not abort twice. - return; - } - -#ifdef NCCL_HAS_COMM_REGISTER - // Deregister all registered segments before aborting. - for (auto& it : registeredSegmentHandles_) { - void* handle = it.second; - C10D_NCCL_CHECK( - ::ncclCommDeregister(ncclComm_, handle), - c10::str( - "Failed to deregister segment handle ", - handle, - " on ncclComm_ ", - ncclComm_)); - } - registeredSegmentHandles_.clear(); -#endif - - // Set true failure reason if provided by ProcessGroupNCCL (e.g. 
work - // timeout) - commFailureReason_ = commFailureReason; - LOG(INFO) << "Aborting ncclComm_ " << ncclComm_ << " with reason: " - << (commFailureReason ? *commFailureReason - : "No abort reason provided."); -#ifndef NCCL_HAS_COMM_NONBLOCKING - C10D_NCCL_CHECK(::ncclCommAbort(ncclComm_), commFailureReason_); -#else - C10D_NCCL_CHECK_TIMEOUT( - ::ncclCommAbort(ncclComm_), ncclComm_, commFailureReason_); -#endif - aborted_ = true; - ncclComm_ = nullptr; - - // Set an appropriate error so that we avoid using the communicator. - if (ncclAsyncErr_ == ncclSuccess) { - ncclAsyncErr_ = ncclSystemError; - } -#else - // This is a NOOP, if error checks are disabled. - return; -#endif - } - - bool isAborted() const { - std::unique_lock lock(mutex_); - return aborted_; - } - - uint64_t getCommSplitCounter() const { - return ncclCommSplitCounter_; - } - - ncclResult_t checkForNcclError() { - std::unique_lock lock(mutex_); -#ifdef ENABLE_NCCL_ERROR_CHECKING - if (ncclAsyncErr_ != ncclSuccess) { - return ncclAsyncErr_; - } - C10D_NCCL_CHECK( - ncclCommGetAsyncError(ncclComm_, &ncclAsyncErr_), commFailureReason_); - return ncclAsyncErr_; -#else - // Always return success, if error checks are disabled. - return ncclSuccess; -#endif - } - - ncclResult_t registerSegment(void* ptr, size_t size) { - std::unique_lock lock(mutex_); -#ifdef NCCL_HAS_COMM_REGISTER - // We register only segments from cache allocator - // which are guaranteed to be with disjoint addr ranges. Thus, a ptr always - // maps to a unique handle and should not be registered before the current - // ptr is deregistered and freed. - TORCH_CHECK( - registeredSegmentHandles_.count(ptr) == 0, - "Segment with ptr ", - ptr, - " has already been registered on ncclComm_ ", - ncclComm_); - - void* handle; - C10D_NCCL_CHECK( - ncclCommRegister(ncclComm_, ptr, size, &handle), - c10::str( - "Failed to register segment with ptr ", - ptr, - ", size ", - size, - " on ncclComm_ ", - ncclComm_)); - registeredSegmentHandles_[ptr] = handle; - return ncclSuccess; -#else - return ncclInvalidUsage; -#endif - } - - ncclResult_t deregisterSegment(void* ptr) { - std::unique_lock lock(mutex_); -#ifdef NCCL_HAS_COMM_REGISTER - TORCH_CHECK( - registeredSegmentHandles_.count(ptr) == 1, - "Segment with ptr ", - ptr, - " is not registered on ncclComm_ ", - ncclComm_); - - void* handle = registeredSegmentHandles_[ptr]; - C10D_NCCL_CHECK( - ncclCommDeregister(ncclComm_, handle), - c10::str( - "Failed to deregister segment handle ", - handle, - ", with ptr ", - ptr, - " on ncclComm_ ", - ncclComm_)); - registeredSegmentHandles_.erase(ptr); - return ncclSuccess; -#else - return ncclInvalidUsage; -#endif - } - - friend class ProcessGroupNCCL; - - protected: - // a helper function to wait until the communicator is initialized; - void waitUntilInitialized(int timeoutSecs); - ncclComm_t ncclComm_; - // Unique nccl_id for this communicator. - ncclUniqueId ncclId_; - bool aborted_; - uint64_t ncclCommSplitCounter_{0}; - ncclResult_t ncclAsyncErr_; - mutable std::mutex mutex_; - // Rank that this communicator corresponds to. - int rank_; - // Optional reason for communicator failure, provided by ProcessGroupNCCL for - // better error messaging. - c10::optional commFailureReason_; - bool initialized_{false}; -#ifdef NCCL_HAS_COMM_REGISTER - // Stores handlers for tensors registered by NCCL - std::unordered_map registeredSegmentHandles_; -#endif -}; - -// Helper that automatically cleans up premul sums. 
-struct ncclRedOpRAII { - ncclRedOpRAII() = default; - ncclRedOpRAII(ncclRedOp_t op) : op_(op) {} - ncclRedOpRAII(ncclRedOp_t op, ncclComm_t comm) - : op_(op), comm_(comm), premul_sum_(true) {} - ncclRedOpRAII(const ncclRedOpRAII&) = delete; - ncclRedOpRAII& operator=(const ncclRedOpRAII&) = delete; - ncclRedOpRAII(ncclRedOpRAII&& tmp) : ncclRedOpRAII() { - std::swap(tmp.op_, this->op_); - std::swap(tmp.comm_, this->comm_); - std::swap(tmp.premul_sum_, this->premul_sum_); - } -#if defined(ENABLE_NCCL_PREMUL_SUM_SUPPORT) - ~ncclRedOpRAII() { - if (premul_sum_) { - ncclRedOpDestroy(op_, comm_); - } - } -#endif - operator ncclRedOp_t() const { - return op_; - } - ncclRedOp_t op_; - ncclComm_t comm_; - bool premul_sum_ = false; -}; - -} // namespace c10d - -#endif // USE_C10D_NCCL - diff --git a/torch/csrc/xpu/xccl.cpp b/torch/csrc/xpu/xccl.cpp index 5304b43f57d410..747c5bf3eb1103 100644 --- a/torch/csrc/xpu/xccl.cpp +++ b/torch/csrc/xpu/xccl.cpp @@ -1,22 +1,17 @@ #include -#include -#include #include -#include -#include #include #include #include -#include +#include #include #include #include #include - xcclComm_t* to_xccl_comm(torch::xpu::xccl::xcclComm_t* var) { return reinterpret_cast(var); } @@ -25,8 +20,7 @@ xcclComm_t to_xccl_comm(torch::xpu::xccl::xcclComm_t var) { return reinterpret_cast(var); } - -xcclDataType_t to_nccl_data_type(c10::ScalarType type) { +ccl::datatype to_nccl_data_type(c10::ScalarType type) { switch (type) { case at::kFloat: return ccl::datatype::float32; @@ -78,77 +72,77 @@ XCCL_KVS get_kvs(int rank, c10d::Store& store) { if (rank == 0) { kvs = ccl::create_main_kvs(); ccl::kvs::address_type main_addr = kvs->get_address(); - auto ccl_kvs_addr = std::vector(main_addr.begin(), main_addr.end()); + auto ccl_kvs_addr = + std::vector(main_addr.begin(), main_addr.end()); store.set(storeKey, ccl_kvs_addr); - } - else { + } else { auto ccl_kvs_addr = store.get(storeKey); if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { - throw std::runtime_error( - "Unexpected ccl kvs addr from the store\n"); + throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); } ccl::kvs::address_type main_addr; - std::copy_n(std::make_move_iterator(ccl_kvs_addr.begin()), - ccl::kvs::address_max_size, - main_addr.begin()); + std::copy_n( + std::make_move_iterator(ccl_kvs_addr.begin()), + ccl::kvs::address_max_size, + main_addr.begin()); kvs = ccl::create_kvs(main_addr); } return kvs; } - using namespace at; namespace detail { -void xcclCommInitAll(xcclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { - for(int i = 0; i < nranks; i++) { - newcomm[i] = ccl::create_communicator(nranks, i, get_kvs_addr) - } - c10::Stream dpcpp_stream = impl.getStream(devices[0]); - ccl::vector_class> devs_rank; - newcomm = ccl::create_communicators(nranks, devs_rank, ctx, ) -} - -struct XcclCommList { - std::unique_ptr comms; - int ndevices; - XcclCommList(const std::vector& devices) - : comms(new xcclComm_t[devices.size()]), ndevices(devices.size()) { - xcclCommInitAll( - to_xccl_comm(comms.get()), devices.size(), devices.data()); - } - NcclCommList(NcclCommList&& foo) = default; - ~NcclCommList() { - if (comms) { - for (const auto i : c10::irange(ndevices)) { - comm_destroy(comms[i]); - } - } - } - ArrayRef ref() const { - return ArrayRef(comms.get(), ndevices); - } -}; - -using device_list = std::vector; -// accesses to this object have to be guarded by THC's CudaFreeMutex -std::unordered_map> _communicators; -static std::unordered_map> - _communicators; - -ArrayRef 
get_communicators(TensorList inputs) { - static auto get_device = [](const at::Tensor& t) -> int { - return t.get_device(); - }; - device_list devices = fmap(inputs, get_device); - auto it = _communicators.find(devices); - if (it == _communicators.end()) { - it = _communicators.emplace(devices, devices).first; - } - return it->second; -} +// void xcclCommInitAll(xcclComm_t* newcomm, int nranks, ncclUniqueId commId, +// int myrank) { +// for(int i = 0; i < nranks; i++) { +// newcomm[i] = ccl::create_communicator(nranks, i, get_kvs_addr) +// } +// c10::Stream dpcpp_stream = impl.getStream(devices[0]); +// ccl::vector_class> devs_rank; +// newcomm = ccl::create_communicators(nranks, devs_rank, ctx, ) +// } + +// struct XcclCommList { +// std::unique_ptr comms; +// int ndevices; +// XcclCommList(const std::vector& devices) +// : comms(new xcclComm_t[devices.size()]), ndevices(devices.size()) { +// xcclCommInitAll( +// to_xccl_comm(comms.get()), devices.size(), devices.data()); +// } +// NcclCommList(NcclCommList&& foo) = default; +// ~NcclCommList() { +// if (comms) { +// for (const auto i : c10::irange(ndevices)) { +// comm_destroy(comms[i]); +// } +// } +// } +// ArrayRef ref() const { +// return ArrayRef(comms.get(), ndevices); +// } +// }; + +// using device_list = std::vector; +// // accesses to this object have to be guarded by THC's CudaFreeMutex +// std::unordered_map> _communicators; +// static std::unordered_map> +// _communicators; + +// ArrayRef get_communicators(TensorList inputs) { +// static auto get_device = [](const at::Tensor& t) -> int { +// return t.get_device(); +// }; +// device_list devices = fmap(inputs, get_device); +// auto it = _communicators.find(devices); +// if (it == _communicators.end()) { +// it = _communicators.emplace(devices, devices).first; +// } +// return it->second; +// } static inline void check_tensor( const at::Tensor& input, @@ -275,649 +269,85 @@ void check_inputs( } // namespace detail -bool is_available(TensorList tensors) { -#ifdef USE_XCCL - device_set devices; - for (auto& tensor : tensors) { - if (!tensor.is_xpu() || tensor.is_sparse()) - return false; - if (!tensor.is_contiguous()) - return false; - auto device = tensor.get_device(); - if (devices[device]) - return false; - devices[device] = true; - } - return true; -#else - return false; -#endif -} - -std::uint64_t version() { -#if defined(NCCL_MAJOR) - constexpr std::uint64_t ver = (((uint64_t)NCCL_MAJOR) << 32) | - (((uint64_t)NCCL_MINOR) << 16) | ((uint64_t)NCCL_PATCH); - return ver; -#elif defined(USE_NCCL) - // return major version "1" - return ((uint64_t)1) << 32; -#else - return 0; -#endif -} - -ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank) { -#ifdef USE_XCCL - using namespace torch::xpu::xccl::detail; - xcclComm_t comm; - ncclUniqueId id = comm_id; - NCCL_CHECK(ncclCommInitRank( - to_nccl_comm(&comm), nranks, *(to_nccl_unique_id(&id)), rank)); - return comm; -#else - return nullptr; -#endif -} - - -namespace { -// NCCL changed the numerical type used for count between NCCL1 and NCCL2. -// So we use the following struct, which gets the type of the second argument -// of T, if T is a function type, with ncclBcast, to get that type statically -// and programmatically. 
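// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the committed diff): turning the KVS
// exchanged through the c10d store above into a per-rank oneCCL communicator,
// mirroring what ProcessGroupXCCL::getXCCLComm does later in the series.  The
// SYCL queue `q` and the world size are assumed to come from the caller, and
// the sketch uses oneCCL's create_communicators() overload (one communicator
// per local device), whereas the patch text calls a singular variant.
// ---------------------------------------------------------------------------
#include <sycl/sycl.hpp>
#include <oneapi/ccl.hpp>
#include <torch/csrc/distributed/c10d/Store.hpp>
#include <torch/csrc/xpu/xccl.h>

ccl::communicator make_comm_sketch(
    c10d::Store& store, int rank, int world_size, sycl::queue& q) {
  // Rank 0 publishes the oneCCL bootstrap address; other ranks read it back.
  auto kvs = torch::xpu::xccl::get_kvs(rank, store);
  auto ctx = ccl::create_context(q.get_context());
  ccl::vector_class<ccl::pair_class<int, ccl::device>> devs_rank;
  devs_rank.emplace_back(rank, ccl::create_device(q.get_device()));
  auto comms = ccl::create_communicators(world_size, devs_rank, ctx, kvs);
  return std::move(comms[0]);   // single device per process in this sketch
}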
- -template -struct GetSecondArgType; - -template -struct GetSecondArgType { - typedef typename std::decay::type type; -}; - -constexpr auto count_max = - std::numeric_limits::type>::max(); - -// Since NCCL 2.12.10, NCCL supports send/recv 0 byte: -// https://github.com/NVIDIA/nccl/issues/696. The issue of skipping send/recv -// is that it can cause deadlock when a rank send and recv 0 bytes so it's -// completely skipping the collective, causing mismatch across ranks -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR > 13))) -template -constexpr bool _nccl_should_send_recv(C10_UNUSED T _unused_) { - return true; -} -#else -// old NCCL uses 0 byte message for synchronization -// Avoid send/recv when message size is zero -template -inline bool _nccl_should_send_recv(T value) { - return value != 0; -} -#endif -} // namespace - -size_t get_max_count() { - return count_max; -} - -void broadcast( - TensorList tensors, - const stream_list& streams, - const comm_list& user_comms) { -#ifdef USE_NCCL - using namespace torch::cuda::nccl::detail; - check_inputs(tensors, tensors, 1, 1); - auto data_type = to_nccl_data_type(tensors[0]); - int64_t numel = tensors[0].numel(); - - const auto comms = user_comms.empty() ? get_communicators(tensors) - : ArrayRef(user_comms); - - AutoNcclGroup nccl_group_guard; - at::cuda::OptionalCUDAGuard device_guard; - for (size_t i = 0, num_tensors = tensors.size(); i < num_tensors; i++) { - auto device = tensors[i].get_device(); - device_guard.set_index(device); - // Default to the current stream - const auto stream = (streams.empty() || !streams[i]) - ? at::cuda::getCurrentCUDAStream(device).stream() - : streams[i]->stream(); - TORCH_CHECK( - static_cast(numel) <= static_cast(count_max), - "Broadcast tensor has ", - numel, - " elements, which exceeds the " - "maximum NCCL supports (", - count_max, - ")"); - ncclComm_t comm = comms[i]; - NCCL_CHECK(ncclBcast( - tensors[i].data_ptr(), - numel, - data_type, - 0, - to_nccl_comm(comm), - stream)); - } -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void reduce( - const std::vector& inputs, - at::Tensor& output, - int32_t root, - int32_t op, - const stream_list& streams, - const comm_list& user_comms) { -#ifdef USE_NCCL - using namespace torch::cuda::nccl::detail; - TORCH_CHECK( - root >= 0 && static_cast(root) < inputs.size(), "invalid root"); - - check_inputs(inputs, output, root, 1, 1); - const auto len = inputs.size(); - - auto data_type = to_nccl_data_type(inputs[0]); - - const auto count = inputs[0].numel(); - auto comms_ref = user_comms.empty() ? get_communicators(inputs) - : ArrayRef(user_comms); - - AutoNcclGroup nccl_group_guard; - at::cuda::OptionalCUDAGuard device_guard; - for (const auto i : c10::irange(len)) { - auto device = inputs[i].device().index(); - device_guard.set_index(device); - // Default to the current stream - const auto stream = (streams.empty() || !streams[i]) - ? at::cuda::getCurrentCUDAStream(device).stream() - : streams[i]->stream(); - - ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(ncclReduce( - inputs[i].data_ptr(), - static_cast>(root) == i - ? 
output.data_ptr() - : nullptr, - count, - data_type, - to_nccl_red_op(op), - root, - to_nccl_comm(comm), - stream)); - } -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void reduce( - std::vector& inputs, - int32_t root, - int32_t op, - const stream_list& streams, - const comm_list& user_comms) { - reduce(inputs, /*output=*/inputs[root], root, op, streams, user_comms); -} - -void all_reduce( - const std::vector& inputs, - std::vector& outputs, - int32_t op, - const stream_list& streams, - const comm_list& user_comms) { -#ifdef USE_NCCL - using namespace torch::cuda::nccl::detail; - check_inputs(inputs, outputs, 1, 1); - const auto len = inputs.size(); - - auto data_type = to_nccl_data_type(inputs[0]); - - const auto count = inputs[0].numel(); - auto comms_ref = user_comms.empty() ? get_communicators(inputs) - : ArrayRef(user_comms); - - AutoNcclGroup nccl_group_guard; - at::cuda::OptionalCUDAGuard device_guard; - for (const auto i : c10::irange(len)) { - auto device = inputs[i].device().index(); - device_guard.set_index(device); - // Default to the current stream - const auto stream = (streams.empty() || !streams[i]) - ? at::cuda::getCurrentCUDAStream(device).stream() - : streams[i]->stream(); - - ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(ncclAllReduce( - inputs[i].data_ptr(), - outputs[i].data_ptr(), - count, - data_type, - to_nccl_red_op(op), - to_nccl_comm(comm), - stream)); - } -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void reduce_scatter( - const std::vector& inputs, - std::vector& outputs, - int32_t op, - const stream_list& streams, - const comm_list& user_comms) { -#ifdef USE_NCCL - using namespace torch::cuda::nccl::detail; - const auto len = inputs.size(); - check_inputs(inputs, outputs, 1, len); - - auto data_type = to_nccl_data_type(inputs[0]); - - const auto count = inputs[0].numel() / len; - auto comms_ref = user_comms.empty() ? get_communicators(inputs) - : ArrayRef(user_comms); - - AutoNcclGroup nccl_group_guard; - at::cuda::OptionalCUDAGuard device_guard; - for (const auto i : c10::irange(len)) { - auto device = inputs[i].device().index(); - device_guard.set_index(device); - // Default to the current stream - const auto stream = (streams.empty() || !streams[i]) - ? at::cuda::getCurrentCUDAStream(device).stream() - : streams[i]->stream(); - - ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(ncclReduceScatter( - inputs[i].data_ptr(), - outputs[i].data_ptr(), - count, - data_type, - to_nccl_red_op(op), - to_nccl_comm(comm), - stream)); - } -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void all_gather( - const std::vector& inputs, - std::vector& outputs, - const stream_list& streams, - const comm_list& user_comms) { -#ifdef USE_NCCL - using namespace torch::cuda::nccl::detail; - const auto len = inputs.size(); - check_inputs(inputs, outputs, len, 1); - - auto data_type = to_nccl_data_type(inputs[0]); - - const auto count = inputs[0].numel(); - auto comms_ref = user_comms.empty() ? get_communicators(inputs) - : ArrayRef(user_comms); - - AutoNcclGroup nccl_group_guard; - at::cuda::OptionalCUDAGuard device_guard; - for (const auto i : c10::irange(len)) { - auto device = inputs[i].device().index(); - device_guard.set_index(device); - // Default to the current stream - const auto stream = (streams.empty() || !streams[i]) - ? 
at::cuda::getCurrentCUDAStream(device).stream() - : streams[i]->stream(); - - ncclComm_t comm = comms_ref[i]; -#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - NCCL_CHECK(ncclAllGather( - inputs[i].data_ptr(), - outputs[i].data_ptr(), - count, - data_type, - to_nccl_comm(comm), - stream)); -#else - NCCL_CHECK(ncclAllGather( - inputs[i].data_ptr(), - count, - data_type, - outputs[i].data_ptr(), - to_nccl_comm(comm), - stream)); -#endif - } -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void all2all_single_equal_split( - at::Tensor& input, - at::Tensor& output, - int size, - ncclComm_t _comm, - at::cuda::CUDAStream& stream) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; - - int numranks; - auto type = to_nccl_data_type(input); - size_t count = input.numel() / size; - size_t rankdiff = input.nbytes() / size; - const auto* sendbuff = reinterpret_cast(input.const_data_ptr()); - auto* recvbuff = reinterpret_cast(output.data_ptr()); - auto comm = to_nccl_comm(_comm); -#if defined(USE_ROCM) - NCCL_CHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream)); -#else - NCCL_CHECK(ncclCommCount(comm, &numranks)); - NCCL_CHECK(ncclGroupStart()); - for (const auto r : c10::irange(numranks)) { - if (_nccl_should_send_recv(count)) { - NCCL_CHECK( - ncclSend(sendbuff + r * rankdiff, count, type, r, comm, stream)); - NCCL_CHECK( - ncclRecv(recvbuff + r * rankdiff, count, type, r, comm, stream)); - } - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclGroupEnd()); -#else - NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); -#endif -#endif -#else - AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void all2all_single_unequal_split( - void* sendbuff, - const size_t* sendcounts, - const size_t* senddispls, - void* recvbuff, - const size_t* recvcounts, - const size_t* recvdispls, - size_t size, - c10::ScalarType _type, - ncclComm_t _comm, - at::cuda::CUDAStream& stream) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; - - auto type = to_nccl_data_type(_type); - auto comm = to_nccl_comm(_comm); - int numranks; - NCCL_CHECK(ncclCommCount(comm, &numranks)); - NCCL_CHECK(ncclGroupStart()); - for (const auto r : c10::irange(numranks)) { - if (_nccl_should_send_recv(sendcounts[r])) { - NCCL_CHECK(ncclSend( - ((char*)sendbuff) + senddispls[r] * size, - sendcounts[r], - type, - r, - comm, - stream)); - } - if (_nccl_should_send_recv(recvcounts[r])) { - NCCL_CHECK(ncclRecv( - ((char*)recvbuff) + recvdispls[r] * size, - recvcounts[r], - type, - r, - comm, - stream)); - } - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclGroupEnd()); -#else - NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); -#endif -#else - AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void all2all( - std::vector& outputTensors, - std::vector& inputTensors, - ncclComm_t _comm, - at::cuda::CUDAStream& stream) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; - auto comm = to_nccl_comm(_comm); - - NCCL_CHECK(ncclGroupStart()); - for (const auto r : c10::irange(outputTensors.size())) { - at::Tensor& input = 
inputTensors[r]; - at::Tensor& output = outputTensors[r]; - - if (_nccl_should_send_recv(input.numel())) { - NCCL_CHECK(ncclSend( - input.data_ptr(), - input.numel(), - to_nccl_data_type(input), - r, - comm, - stream.stream())); - } - if (_nccl_should_send_recv(output.numel())) { - NCCL_CHECK(ncclRecv( - output.data_ptr(), - output.numel(), - to_nccl_data_type(output), - r, - comm, - stream.stream())); - } - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclGroupEnd()); -#else - NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); -#endif -#else - AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void send( - const at::Tensor& input, - ncclComm_t comm, - at::cuda::CUDAStream stream, - int dst) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclSend( - input.data_ptr(), - input.numel(), - to_nccl_data_type(input), - dst, - to_nccl_comm(comm), - stream.stream())); -#else - NCCL_CHECK_TIMEOUT( - ncclSend( - input.data_ptr(), - input.numel(), - to_nccl_data_type(input), - dst, - to_nccl_comm(comm), - stream.stream()), - comm); -#endif -#else - AT_ERROR("Send is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void recv( - at::Tensor& output, - ncclComm_t comm, - at::cuda::CUDAStream stream, - int src) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclRecv( - output.data_ptr(), - output.numel(), - to_nccl_data_type(output), - src, - to_nccl_comm(comm), - stream.stream())); -#else - NCCL_CHECK_TIMEOUT( - ncclRecv( - output.data_ptr(), - output.numel(), - to_nccl_data_type(output), - src, - to_nccl_comm(comm), - stream.stream()), - comm); -#endif -#else - AT_ERROR("Recv is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void gather( - const at::Tensor& inputs, - std::vector& outputs, - ncclComm_t _comm, - at::cuda::CUDAStream& stream, - int32_t root) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; - - auto comm = to_nccl_comm(_comm); - int numranks, cur_rank; - NCCL_CHECK(ncclCommCount(comm, &numranks)); - NCCL_CHECK(ncclCommUserRank(comm, &cur_rank)); - - size_t count = inputs.numel(); - auto type = to_nccl_data_type(inputs); - const auto* sendbuff = reinterpret_cast(inputs.const_data_ptr()); - - NCCL_CHECK(ncclGroupStart()); - - if (cur_rank == root) { - for (const auto r : c10::irange(numranks)) { - if (r != root) { - auto* recvbuff = reinterpret_cast(outputs[r].data_ptr()); - NCCL_CHECK(ncclRecv(recvbuff, count, type, r, comm, stream)); - } else { - // on its own rank, simply copy from the input - outputs[r].copy_(inputs); - } - } - } else { - NCCL_CHECK(ncclSend(sendbuff, count, type, root, comm, stream)); - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclGroupEnd()); -#else - NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); -#endif - -#else - AT_ERROR("gather is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void scatter( - const std::vector& 
inputs, - at::Tensor& outputs, - ncclComm_t _comm, - at::cuda::CUDAStream& stream, - int32_t root) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; - - auto comm = to_nccl_comm(_comm); - int numranks, cur_rank; -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclCommCount(comm, &numranks)); - NCCL_CHECK(ncclCommUserRank(comm, &cur_rank)); -#else - NCCL_CHECK_TIMEOUT(ncclCommCount(comm, &numranks), _comm); - NCCL_CHECK_TIMEOUT(ncclCommUserRank(comm, &cur_rank), _comm); -#endif - NCCL_CHECK(ncclGroupStart()); - if (cur_rank == root) { - for (const auto r : c10::irange(numranks)) { - if (r != root) { - size_t send_count = inputs[r].numel(); - auto send_type = to_nccl_data_type(inputs[r]); - const auto* sendbuff = - reinterpret_cast(inputs[r].const_data_ptr()); - NCCL_CHECK(ncclSend(sendbuff, send_count, send_type, r, comm, stream)); - } else { - // on its own rank, simply copy it to the output - outputs.copy_(inputs[r]); - } - } - } else { - size_t recv_count = outputs.numel(); - auto recv_type = to_nccl_data_type(outputs); - auto* recvbuff = reinterpret_cast(outputs.data_ptr()); - NCCL_CHECK(ncclRecv(recvbuff, recv_count, recv_type, root, comm, stream)); - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclGroupEnd()); -#else - NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); -#endif -#else - AT_ERROR("scatter is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -} // namespace torch::cuda::nccl - +// std::uint64_t version() { +// #if defined(NCCL_MAJOR) +// constexpr std::uint64_t ver = (((uint64_t)NCCL_MAJOR) << 32) | +// (((uint64_t)NCCL_MINOR) << 16) | ((uint64_t)NCCL_PATCH); +// return ver; +// #elif defined(USE_NCCL) +// // return major version "1" +// return ((uint64_t)1) << 32; +// #else +// return 0; +// #endif +// } + +// ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank) +// { #ifdef USE_XCCL +// using namespace torch::xpu::xccl::detail; +// xcclComm_t comm; +// ncclUniqueId id = comm_id; +// NCCL_CHECK(ncclCommInitRank( +// to_nccl_comm(&comm), nranks, *(to_nccl_unique_id(&id)), rank)); +// return comm; +// #else +// return nullptr; +// #endif +// } + +// namespace { + +// ret_evt = torch::xpu::xccl::all_reduce( +// input, +// output, +// datatype, +// xcclOp.at(opts.reduceOp), +// comm, +// attr, +// stream, +// root); + +// void all_reduce( +// at::Tensor& input, +// at::Tensor& output, +// ccl::datatype datatype, +// ccl::reduction op, +// const stream_list& streams, +// const comm_list& user_comms) { +// #ifdef USE_XCCL +// using namespace torch::cuda::nccl::detail; +// check_inputs(inputs, outputs, 1, 1); +// const auto len = inputs.size(); + +// auto data_type = to_nccl_data_type(inputs[0]); + +// const auto count = inputs[0].numel(); +// auto comms_ref = user_comms.empty() ? get_communicators(inputs) +// : ArrayRef(user_comms); + +// AutoNcclGroup nccl_group_guard; +// at::cuda::OptionalCUDAGuard device_guard; +// for (const auto i : c10::irange(len)) { +// auto device = inputs[i].device().index(); +// device_guard.set_index(device); +// // Default to the current stream +// const auto stream = (streams.empty() || !streams[i]) +// ? 
at::cuda::getCurrentCUDAStream(device).stream() +// : streams[i]->stream(); + +// ncclComm_t comm = comms_ref[i]; +// NCCL_CHECK(ncclAllReduce( +// inputs[i].data_ptr(), +// outputs[i].data_ptr(), +// count, +// data_type, +// to_nccl_red_op(op), +// to_nccl_comm(comm), +// stream)); +// } +// #else +// AT_ERROR("PyTorch built without NCCL support"); +// #endif +// } + +} // namespace torch::xpu::xccl diff --git a/torch/csrc/xpu/xccl.h b/torch/csrc/xpu/xccl.h index d844f166ec5ab1..f0f2b57a1dc9f7 100644 --- a/torch/csrc/xpu/xccl.h +++ b/torch/csrc/xpu/xccl.h @@ -12,47 +12,11 @@ using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; -ccl::shared_ptr_class kvs; -std::vector kvs_addr; +XCCL_KVS kvs; -XCCL_KVS get_kvs(int rank, c10d::Store& store) -class Comms { -public: +XCCL_KVS get_kvs(int rank, c10d::Store& store); - explicit Comms(ccl::vector_class &comms) : - comms(std::move(comms)), streams{} {} - - explicit Comms(ccl::vector_class &comms, ccl::vector_class &streams, std::vector &torch_streams) : - comms(std::move(comms)), streams(std::move(streams)), torch_streams(std::move(torch_streams)) {} - - ~Comms() noexcept(false) {} - - Comms() = delete; - - Comms(const Comms &) = delete; - - Comms &operator=(const Comms &) = delete; - - Comms(Comms &&other) : comms(std::move(other.comms)), streams(std::move(other.streams)), - torch_streams(std::move(other.torch_streams)) {} - - Comms &operator=(Comms &&other) { - std::swap(comms, other.comms); - std::swap(streams, other.streams); - std::swap(torch_streams, other.torch_streams); - return *this; - } - -public: - // The Communicators used by XCCL - ccl::vector_class comms; - // The streams used by XCCL - ccl::vector_class streams; - // one to one mapping the torch streams to the ccl::stream. 
- std::vector torch_streams; -}; - -enum class xcclRedOp { Sum = 0, Prod = 1, Max = 2, Min = 3}; +enum class xcclRedOp { Sum = 0, Prod = 1, Max = 2, Min = 3 }; enum class xcclDataType { Int8 = 0, @@ -75,14 +39,13 @@ enum class xcclDataType { namespace detail { - at::ArrayRef get_communicators( - at::TensorList inputs); - void check_inputs( +at::ArrayRef get_communicators(at::TensorList inputs); +void check_inputs( at::TensorList inputs, at::TensorList outputs, int input_multiplier, int output_multiplier); - void check_inputs( +void check_inputs( at::TensorList inputs, const at::Tensor& output, int root, @@ -94,13 +57,13 @@ namespace detail { using comm_list = std::vector; using stream_list = std::vector>; - std::uint64_t version(); - const char* version_suffix(); +std::uint64_t version(); +const char* version_suffix(); bool is_available(at::TensorList tensors); comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank); - void comm_destroy(ncclComm_t comm); +void comm_destroy(ncclComm_t comm); void all_reduce( const std::vector& inputs, @@ -109,4 +72,3 @@ void all_reduce( const stream_list& streams = {}, const comm_list& user_comms = {}); } // namespace torch::xpu::xccl - From ba6c4b7f4d14bc7e8345125215246c5a5153b520 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 30 Aug 2024 08:11:14 +0000 Subject: [PATCH 03/96] update --- caffe2/CMakeLists.txt | 8 ++++---- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 3 --- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 2c4da5fd50f10c..28e7d0c96ba877 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,6 +1014,10 @@ elseif(USE_CUDA) endif() if(USE_XPU) + if(USE_XCCL) + list(APPEND Caffe2_XPU_SRCS + ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) + endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) @@ -1057,10 +1061,6 @@ if(USE_XPU) # 2. Using add_custom_command in torch-xpu-ops to define sycl device sources # compilation. add_custom_command requires an explicit dependency. 
list(APPEND ${Caffe2_XPU_INCLUDE} ${TORCH_XPU_OPS_DIR}/src/ATen/) - # if(USE_XCCL) - # list(APPEND Caffe2_GPU_SRCS - # ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) - # endif() set(TORCH_XPU_OPS_PYTORCH_DEPS ATEN_CPU_FILES_GEN_TARGET) add_subdirectory(${TORCH_ROOT}/third_party/torch-xpu-ops diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 3325691c3a8531..d901259f400c5e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -11,15 +11,12 @@ #include #include -#include -#include #include #include #include #include #include #include -#include #include #include #include diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 51801ed992edcc..c7e17b491ffce6 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -10,7 +10,7 @@ #ifdef USE_C10D_XCCL #include -#include +#include #include #include #include From 68a6aeecd90e3cf9f993a9df236121cd223102c8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 30 Aug 2024 09:07:54 +0000 Subject: [PATCH 04/96] update --- torch/csrc/xpu/xccl.cpp | 159 +++++++++++++++++++--------------------- torch/csrc/xpu/xccl.h | 23 +++--- 2 files changed, 89 insertions(+), 93 deletions(-) diff --git a/torch/csrc/xpu/xccl.cpp b/torch/csrc/xpu/xccl.cpp index 747c5bf3eb1103..6224b19254dbfe 100644 --- a/torch/csrc/xpu/xccl.cpp +++ b/torch/csrc/xpu/xccl.cpp @@ -5,22 +5,15 @@ #include #include -#include +#include #include #include #include #include -xcclComm_t* to_xccl_comm(torch::xpu::xccl::xcclComm_t* var) { - return reinterpret_cast(var); -} - -xcclComm_t to_xccl_comm(torch::xpu::xccl::xcclComm_t var) { - return reinterpret_cast(var); -} -ccl::datatype to_nccl_data_type(c10::ScalarType type) { +ccl::datatype to_xccl_data_type(c10::ScalarType type) { switch (type) { case at::kFloat: return ccl::datatype::float32; @@ -45,7 +38,7 @@ ccl::datatype to_nccl_data_type(c10::ScalarType type) { } } -ncclDataType_t to_xccl_data_type(const at::Tensor& t) { +ccl::datatype to_xccl_data_type(const at::Tensor& t) { if (!t.is_xpu()) { TORCH_CHECK( false, @@ -61,11 +54,13 @@ ccl::reduction to_xccl_red_op(int var) { namespace torch::xpu::xccl { +XCCL_KVS kvs; +std::mutex kvs_mutex; + XCCL_KVS get_kvs(int rank, c10d::Store& store) { + std::lock_guard lock(kvs_mutex); if (kvs) return kvs; - // Each process group is with different store, so we use the unique key for - // broadcast the bootstrap network information. 
std::string storeKey = "ccl_kvs"; // Rank 0 broadcast the bootstrap network information to other ranks @@ -82,9 +77,9 @@ XCCL_KVS get_kvs(int rank, c10d::Store& store) { } ccl::kvs::address_type main_addr; std::copy_n( - std::make_move_iterator(ccl_kvs_addr.begin()), - ccl::kvs::address_max_size, - main_addr.begin()); + ccl_kvs_addr.begin(), + ccl::kvs::address_max_size, + main_addr.begin()); kvs = ccl::create_kvs(main_addr); } @@ -190,82 +185,82 @@ static inline void check_tensor( } } -void check_inputs( - TensorList inputs, - TensorList outputs, - int input_multiplier, - int output_multiplier) { - // len(inputs) == len(outputs) - size_t len = inputs.size(); +// void check_inputs( +// TensorList inputs, +// TensorList outputs, +// int input_multiplier, +// int output_multiplier) { +// // len(inputs) == len(outputs) +// size_t len = inputs.size(); - if (len <= 0) { - throw std::runtime_error("input sequence can't be empty"); - } +// if (len <= 0) { +// throw std::runtime_error("input sequence can't be empty"); +// } - if (len != outputs.size()) { - std::stringstream err; - err << "inputs and outputs sequences have to be of the same length, but got input of length " - << len << " and output of length " << outputs.size(); - throw std::runtime_error(err.str()); - } +// if (len != outputs.size()) { +// std::stringstream err; +// err << "inputs and outputs sequences have to be of the same length, but got input of length " +// << len << " and output of length " << outputs.size(); +// throw std::runtime_error(err.str()); +// } - device_set devices; - int64_t numel = inputs[0].numel(); - auto dtype = inputs[0].scalar_type(); +// device_set devices; +// int64_t numel = inputs[0].numel(); +// auto dtype = inputs[0].scalar_type(); - for (const auto i : c10::irange(len)) { - auto input = inputs[i]; - auto output = outputs[i]; +// for (const auto i : c10::irange(len)) { +// auto input = inputs[i]; +// auto output = outputs[i]; - check_tensor( - input, output, input_multiplier, output_multiplier, numel, dtype); +// check_tensor( +// input, output, input_multiplier, output_multiplier, numel, dtype); - auto input_device = input.get_device(); - // inputs must be on unique devices - if (devices.test(input_device)) { - throw std::runtime_error("inputs must be on unique devices"); - } - devices.set(input_device); - } -} +// auto input_device = input.get_device(); +// // inputs must be on unique devices +// if (devices.test(input_device)) { +// throw std::runtime_error("inputs must be on unique devices"); +// } +// devices.set(input_device); +// } +// } -void check_inputs( - TensorList inputs, - const at::Tensor& output, - int root, - int input_multiplier, - int output_multiplier) { - auto len = inputs.size(); +// void check_inputs( +// TensorList inputs, +// const at::Tensor& output, +// int root, +// int input_multiplier, +// int output_multiplier) { +// auto len = inputs.size(); - if (len <= 0) { - throw std::runtime_error("input sequence can't be empty"); - } +// if (len <= 0) { +// throw std::runtime_error("input sequence can't be empty"); +// } - device_set devices; - int64_t numel = inputs[0].numel(); - auto dtype = inputs[0].scalar_type(); - - for (const auto i : c10::irange(len)) { - auto input = inputs[i]; - - check_tensor( - input, - i == static_cast>(root) - ? 
std::optional{output} - : std::nullopt, - input_multiplier, - output_multiplier, - numel, - dtype); - - auto input_device = input.get_device(); - // inputs must be on unique devices - if (devices.test(input_device)) { - throw std::runtime_error("inputs must be on unique devices"); - } - devices.set(input_device); - } -} +// device_set devices; +// int64_t numel = inputs[0].numel(); +// auto dtype = inputs[0].scalar_type(); + +// for (const auto i : c10::irange(len)) { +// auto input = inputs[i]; + +// check_tensor( +// input, +// i == static_cast>(root) +// ? std::optional{output} +// : std::nullopt, +// input_multiplier, +// output_multiplier, +// numel, +// dtype); + +// auto input_device = input.get_device(); +// // inputs must be on unique devices +// if (devices.test(input_device)) { +// throw std::runtime_error("inputs must be on unique devices"); +// } +// devices.set(input_device); +// } +// } } // namespace detail diff --git a/torch/csrc/xpu/xccl.h b/torch/csrc/xpu/xccl.h index f0f2b57a1dc9f7..31fc594e71cc0b 100644 --- a/torch/csrc/xpu/xccl.h +++ b/torch/csrc/xpu/xccl.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace torch::xpu::xccl { @@ -12,7 +13,7 @@ using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; -XCCL_KVS kvs; +extern XCCL_KVS kvs; XCCL_KVS get_kvs(int rank, c10d::Store& store); @@ -54,21 +55,21 @@ void check_inputs( } // namespace detail -using comm_list = std::vector; -using stream_list = std::vector>; +// using comm_list = std::vector; +// using stream_list = std::vector>; std::uint64_t version(); const char* version_suffix(); bool is_available(at::TensorList tensors); -comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank); -void comm_destroy(ncclComm_t comm); +// comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank); +// void comm_destroy(xcclComm_t comm); -void all_reduce( - const std::vector& inputs, - std::vector& outputs, - int32_t op = static_cast(xcclRedOp::Sum), - const stream_list& streams = {}, - const comm_list& user_comms = {}); +// void all_reduce( +// const std::vector& inputs, +// std::vector& outputs, +// int32_t op = static_cast(xcclRedOp::Sum), +// const stream_list& streams = {}, +// const comm_list& user_comms = {}); } // namespace torch::xpu::xccl From 31eeee95ff611e8c8ab88dce0b096e45db808f41 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 2 Sep 2024 03:30:54 +0000 Subject: [PATCH 05/96] register --- build_variables.bzl | 1 + torch/_C/_distributed_c10d.pyi | 9 ++++++ .../distributed/c10d/ProcessGroupXCCL.cpp | 16 +++++----- .../distributed/c10d/ProcessGroupXCCL.hpp | 29 +++++++------------ torch/csrc/distributed/c10d/init.cpp | 22 ++++++++++++++ 5 files changed, 51 insertions(+), 26 deletions(-) diff --git a/build_variables.bzl b/build_variables.bzl index e05c94bd83f577..80a575324aa8b3 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -542,6 +542,7 @@ libtorch_distributed_extra_sources = [ "torch/csrc/distributed/autograd/rpc_messages/rref_backward_req.cpp", "torch/csrc/distributed/autograd/rpc_messages/rref_backward_resp.cpp", "torch/csrc/distributed/c10d/HashStore.cpp", + "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", "torch/csrc/distributed/rpc/agent_utils.cpp", "torch/csrc/distributed/rpc/message.cpp", "torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp", diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 94e8578bbfff62..b2cba6905901f3 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ 
-697,3 +697,12 @@ class ProcessGroupCudaP2P(Backend): storage_offset: Optional[int] = 0, ) -> torch.Tensor: ... def _shutdown(self) -> None: ... + +class ProcessGroupXCC(Backend): + def __init__( + self, + store: Store, + rank: int, + size: int, + timeout: timedelta, + ): ... \ No newline at end of file diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index d901259f400c5e..1fe069575c7143 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -70,7 +70,7 @@ ccl::datatype getXcclDataType(at::ScalarType type) { namespace { static std::mutex xcclCommDevIdxMapMutex; -static std::unordered_map, int> xcclCommDevIdxMap; +static std::unordered_map, int> xcclCommDevIdxMap; template < template @@ -118,7 +118,7 @@ c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( ProcessGroupXCCL::~ProcessGroupXCCL() {} -std::shared_ptr ProcessGroupXCCL::getXCCLComm( +std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, at::Device& device) { if (deviceKey.empty()) { @@ -135,7 +135,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( } } - std::shared_ptr xcclComm; + std::shared_ptr xcclComm_t; XCCL_KVS kvs = get_kvs(rank_, store_); @@ -149,11 +149,11 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( auto q = get_sycl_queue(stream); auto ctx = ccl::create_context(q.get_context()); devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - xcclComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + xcclComm_t = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); { std::lock_guard lock(mutex_); - inInitializationCommMap_.emplace(deviceKey, xcclComm); + inInitializationCommMap_.emplace(deviceKey, xcclComm_t); } auto it = inInitializationCommMap_.find(deviceKey); @@ -162,7 +162,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( inInitializationCommMap_.erase(deviceKey); xcclCommDevIdxMapMutex.lock(); - xcclCommDevIdxMap.emplace(xcclComm, device.index()); + xcclCommDevIdxMap.emplace(xcclComm_t, device.index()); xcclCommDevIdxMapMutex.unlock(); } @@ -187,7 +187,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto device = input.device(); const auto key = std::to_string(device.index()); - auto xcclComm = getXCCLComm(key, device); + auto xcclComm_t = getXCCLComm(key, device); std::vector inputs{input}; std::vector outputs{output}; @@ -198,7 +198,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( // enqueue); work = make_work_ccl( - inputs, outputs, fn, xcclComm, attr, rank_, op_type); + inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); // pre(ncclStream, work); // ncclComm_t comm = ncclComm->getNcclComm(); // post(ncclStream, work); diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index c7e17b491ffce6..6c4a40f0a3ee77 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -25,14 +25,13 @@ #include #include -#include #include #include -#include namespace c10d { constexpr const char* XCCL_BACKEND_NAME = "xccl"; +using namespace torch::xpu::xccl; class ProcessGroupXCCL : public Backend { public: @@ -41,14 +40,11 @@ class ProcessGroupXCCL : public Backend { WorkXCCL( std::vector> outputTensors, int rank = -1, - OpType opType = UNKNOWN, + OpType opType = OpType::UNKNOWN, const c10::optional>& inputTensors = c10::nullopt) : Work(rank, opType), outputTensors_(std::move(outputTensors)) {} - WorkXCCL(const 
WorkXCCL& w) - : outputTensors_(w.outputTensors_), events_(w.events_) {} - ~WorkXCCL() override { // Ensures all events are properly handled before destruction for (auto& event : events_) { @@ -57,7 +53,7 @@ class ProcessGroupXCCL : public Backend { } bool isCompleted() override { - for (const auto& event : events_) { + for (auto& event : events_) { if (!event.test()) { return false; } @@ -80,14 +76,15 @@ class ProcessGroupXCCL : public Backend { } } - void wait() override { - std::lock_guard lock(mutex_); + void wait() { + std::unique_lock lock(mutex_); for (auto& event : events_) { - CCL_CHECK(event.wait()); + event.wait(); } events_.clear(); } + c10::intrusive_ptr getFuture() override { TORCH_CHECK( false, "ProcessGroupXCCL::WorkXCCL::getFuture not implemented"); @@ -109,9 +106,9 @@ class ProcessGroupXCCL : public Backend { const c10::intrusive_ptr& store, int rank, int size) - : store_(store), rank_(rank), size_(size) {} + : store_(store), Backend(rank, size) {} - ProcessGroupXCCL::~ProcessGroupXCCL() = default; + ~ProcessGroupXCCL() = default; const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); @@ -129,14 +126,10 @@ class ProcessGroupXCCL : public Backend { int rank = -1, int size = -1); - private: - int rank_; - int size_; - public: - std::unordered_map> + std::unordered_map> inInitializationCommMap_; - std::unordered_map> devXCCLCommMap_; + std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; }; diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index c8f9dff37f06e2..e12e96f9fe882f 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -37,6 +37,11 @@ #include #endif +#ifdef USE_C10D_XCCL +#include +#endif + + #include #include #include @@ -2877,6 +2882,23 @@ Example:: py::call_guard()); #endif +#ifdef USE_C10D_XCCL + auto processGroupXCCL = + intrusive_ptr_no_gil_destructor_class_<::c10d::ProcessGroupXCCL>( + module, "ProcessGroupXCCL", backend) + .def( + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, + int rank, + int size) { + return c10::make_intrusive<::c10d::ProcessGroupXCCL>( + store, rank, size); + }), + py::arg("store"), + py::arg("rank"), + py::arg("size"), + py::call_guard()); +#endif + py::enum_<::c10d::OpType>(module, "OpType") .value("BROADCAST", ::c10d::OpType::BROADCAST) .value("ALLREDUCE", ::c10d::OpType::ALLREDUCE) From b977abcad2464bff802df68318ea658014cad63e Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 2 Sep 2024 06:58:33 +0000 Subject: [PATCH 06/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 33 +++++++++---------- .../distributed/c10d/ProcessGroupXCCL.hpp | 10 +++--- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 1fe069575c7143..12f33316c08f86 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -65,10 +65,6 @@ ccl::datatype getXcclDataType(at::ScalarType type) { return it->second; } -} // namespace c10d - -namespace { - static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; @@ -116,7 +112,13 @@ c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( return c10::make_intrusive(store, rank, size); } -ProcessGroupXCCL::~ProcessGroupXCCL() {} +ProcessGroupXCCL::ProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank, + int size) + : Backend(rank, size), 
store_(store) {} + +ProcessGroupXCCL::~ProcessGroupXCCL() = default; std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, @@ -135,7 +137,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( } } - std::shared_ptr xcclComm_t; + std::shared_ptr XCCLComm; XCCL_KVS kvs = get_kvs(rank_, store_); @@ -149,11 +151,11 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( auto q = get_sycl_queue(stream); auto ctx = ccl::create_context(q.get_context()); devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - xcclComm_t = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); { std::lock_guard lock(mutex_); - inInitializationCommMap_.emplace(deviceKey, xcclComm_t); + inInitializationCommMap_.emplace(deviceKey, XCCLComm); } auto it = inInitializationCommMap_.find(deviceKey); @@ -162,7 +164,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( inInitializationCommMap_.erase(deviceKey); xcclCommDevIdxMapMutex.lock(); - xcclCommDevIdxMap.emplace(xcclComm_t, device.index()); + xcclCommDevIdxMap.emplace(XCCLComm, device.index()); xcclCommDevIdxMapMutex.unlock(); } @@ -193,15 +195,9 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( std::vector outputs{output}; c10::intrusive_ptr work; - // work = - // initWork(device, rank_, opType, profilingTitle, inputs, outputs, - // enqueue); work = make_work_ccl( inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); - // pre(ncclStream, work); - // ncclComm_t comm = ncclComm->getNcclComm(); - // post(ncclStream, work); return work; } @@ -255,6 +251,9 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( OpType::ALLREDUCE); } -} // namespace +// c10::intrusive_ptr barrier( +// const BarrierOptions& opts = BarrierOptions()) override; + +} // namespace c10d -#endif // USE_C10D_XCCL \ No newline at end of file +#endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6c4a40f0a3ee77..7e59180eb9b57e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -83,11 +83,9 @@ class ProcessGroupXCCL : public Backend { } events_.clear(); } - - + c10::intrusive_ptr getFuture() override { - TORCH_CHECK( - false, "ProcessGroupXCCL::WorkXCCL::getFuture not implemented"); + return future_; } std::vector result() override { @@ -118,8 +116,8 @@ class ProcessGroupXCCL : public Backend { std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - c10::intrusive_ptr barrier( - const BarrierOptions& opts = BarrierOptions()) override; + // c10::intrusive_ptr barrier( + // const BarrierOptions& opts = BarrierOptions()) override; static c10::intrusive_ptr createProcessGroupXCCL( const c10::intrusive_ptr& store, From 486b61a9f78bdc530da1185bdd4023098e987f78 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 3 Sep 2024 01:41:04 +0000 Subject: [PATCH 07/96] update --- torch/_C/_distributed_c10d.pyi | 3 +-- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 15 ++++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index b2cba6905901f3..0c97185519d28f 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -698,11 +698,10 @@ class ProcessGroupCudaP2P(Backend): ) -> torch.Tensor: ... def _shutdown(self) -> None: ... 
-class ProcessGroupXCC(Backend): +class ProcessGroupXCCL(Backend): def __init__( self, store: Store, rank: int, size: int, - timeout: timedelta, ): ... \ No newline at end of file diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 7e59180eb9b57e..0c5f4fa5aeccf7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -76,13 +76,14 @@ class ProcessGroupXCCL : public Backend { } } - void wait() { - std::unique_lock lock(mutex_); - for (auto& event : events_) { - event.wait(); - } - events_.clear(); - } + bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; + // void wait() { + // std::unique_lock lock(mutex_); + // for (auto& event : events_) { + // event.wait(); + // } + // events_.clear(); + // } c10::intrusive_ptr getFuture() override { return future_; From 6844932aeb7b4c8aff6e0d4bac5bf32ede5e0a5b Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 3 Sep 2024 02:25:24 +0000 Subject: [PATCH 08/96] fix typo and register frontend --- cmake/Dependencies.cmake | 2 +- cmake/Modules/FindXCCL.cmake | 2 +- torch/distributed/distributed_c10d.py | 29 ++++++++++++++++++++++++--- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 49fb525afbf8a8..cb204eada5f689 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1154,7 +1154,7 @@ endif() if(USE_XCCL) if(NOT USE_XPU) message(WARNING - "Not using XPU, so disabling USE_NUSE_XCCLCCL. Suppress this warning with " + "Not using XPU, so disabling USE_XCCL. Suppress this warning with " "-DUSE_XCCL=OFF.") caffe2_update_option(USE_XCCL OFF) elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Linux") diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake index 3f30e8cd23d6e7..56b7fc0f7dcf32 100644 --- a/cmake/Modules/FindXCCL.cmake +++ b/cmake/Modules/FindXCCL.cmake @@ -27,7 +27,7 @@ find_file( NO_DEFAULT_PATH ) -# Find include/sycl path from include path. +# Find include/oneapi path from include path. find_file( XCCL_INCLUDE_ONEAPI_DIR NAMES oneapi diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 45e096985143a3..d178f976c5682d 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -130,6 +130,7 @@ _NCCL_AVAILABLE = True _GLOO_AVAILABLE = True _UCC_AVAILABLE = True +_XCCL_AVAILABLE = True _pickler = pickle.Pickler _unpickler = pickle.Unpickler @@ -193,6 +194,14 @@ def _export_c_types() -> None: except ImportError: _UCC_AVAILABLE = False +try: + from torch._C._distributed_c10d import ProcessGroupXCCL + + ProcessGroupXCCL.__module__ = "torch.distributed.distributed_c10d" + __all__ += ["ProcessGroupXCCL"] +except ImportError: + _XCCL_AVAILABLE = False + logger = logging.getLogger(__name__) PG_WRAPPER_STORE_PREFIX = "pg_wrapper" @@ -222,7 +231,7 @@ class Backend(str): """ An enum-like class for backends. - Available backends: GLOO, NCCL, UCC, MPI, and other registered backends. + Available backends: GLOO, NCCL, UCC, MPI, XCCL, and other registered backends. The values of this class are lowercase strings, e.g., ``"gloo"``. They can be accessed as attributes, e.g., ``Backend.NCCL``. 
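For orientation, a minimal usage sketch of the frontend path this patch wires up; it is not part of the diff itself. It assumes a build with USE_XCCL/USE_C10D_XCCL enabled, an XPU-capable runtime, and that the "xccl" backend string resolves to ProcessGroupXCCL once the registration below lands; the rendezvous values and per-rank device mapping are placeholders.

import os
import torch
import torch.distributed as dist

# Placeholder single-node rendezvous; a real launcher such as torchrun sets these.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])

# "xccl" is the backend string this patch registers for XPU devices.
dist.init_process_group(backend="xccl", rank=rank, world_size=world_size)

t = torch.ones(16, device=f"xpu:{rank}")   # assumes one XPU per rank
dist.all_reduce(t)   # allreduce is the first collective ProcessGroupXCCL implements
dist.destroy_process_group()
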
@@ -242,6 +251,7 @@ class Backend(str): NCCL = "nccl" UCC = "ucc" MPI = "mpi" + XCCL = "XCCL" _BackendPlugin = namedtuple("_BackendPlugin", ["creator_fn", "extended_api"]) @@ -1097,6 +1107,9 @@ def is_ucc_available() -> bool: """Check if the UCC backend is available.""" return _UCC_AVAILABLE +def is_xccl_available() -> bool: + """Check if the XCCL backend is available.""" + return _XCCL_AVAILABLE def is_backend_available(backend: str) -> bool: """ @@ -1385,7 +1398,7 @@ def init_process_group( Args: backend (str or Backend, optional): The backend to use. Depending on - build-time configurations, valid values include ``mpi``, ``gloo``, + build-time configurations, valid values include ``mpi``, ``gloo``, ``xccl``, ``nccl``, and ``ucc``. If the backend is not provided, then both a ``gloo`` and ``nccl`` backend will be created, see notes below for how multiple backends are managed. This field can be given as a lowercase string @@ -1762,7 +1775,6 @@ def _new_process_group_helper( pg_options = ProcessGroupNCCL.Options() pg_options.is_high_priority_stream = False pg_options._timeout = timeout - if split_from: pg_options.split_from = split_from pg_options.split_color = _process_group_color(global_ranks_in_group) @@ -1781,6 +1793,17 @@ def _new_process_group_helper( backend_prefix_store, group_rank, group_size, timeout=timeout ) backend_type = ProcessGroup.BackendType.UCC + elif backend_str == Backend.XCCL: + if not is_xccl_available(): + raise RuntimeError("Distributed package doesn't have XCCL built in") + if pg_options is not None: + assert isinstance( + pg_options, ProcessGroupXCCL.Options + ), "Expected pg_options argument to be of type ProcessGroupXCCL.Options" + backend_class = ProcessGroupXCCL( + backend_prefix_store, group_rank, group_size + ) + backend_type = ProcessGroup.BackendType.XCCL else: assert ( backend_str.upper() in Backend._plugins From 7f6f8b96bb2bba92b8bc4e912e414da65d521f2d Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 3 Sep 2024 02:30:15 +0000 Subject: [PATCH 09/96] update --- torch/distributed/distributed_c10d.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index d178f976c5682d..26cb1cda1db8cb 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -87,6 +87,7 @@ "is_nccl_available", "is_torchelastic_launched", "is_ucc_available", + "is_xccl_available", "isend", "monitored_barrier", "new_group", From be683207a1e745f4973410b63da180f8d2a46578 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 3 Sep 2024 06:03:13 +0000 Subject: [PATCH 10/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 30 ++++++-- .../distributed/c10d/ProcessGroupXCCL.hpp | 74 +++++++++++-------- 2 files changed, 66 insertions(+), 38 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 12f33316c08f86..ef60d0546b0df8 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -53,7 +53,6 @@ void check_gpu_single_tensor(const at::Tensor& tensor) { C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); } } -} // namespace ccl::datatype getXcclDataType(at::ScalarType type) { auto it = xcclDatatypes.find(type); @@ -64,6 +63,9 @@ ccl::datatype getXcclDataType(at::ScalarType type) { type); return it->second; } +} // namespace + + static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; @@ -91,14 +93,26 @@ 
c10::intrusive_ptr make_work_ccl( return ret_ptr; } +// ProcessGroupXCCL::WorkXCCL::WorkXCCL( +// std::vector> outputTensors, +// int rank, +// c10d::OpType opType, +// const c10::optional>& inputTensors) +// : Work(rank, opType, nullptr, inputTensors), +// outputTensors_(std::move(outputTensors)), +// future_(createFutureAsOutput(outputTensors)) {} + ProcessGroupXCCL::WorkXCCL::WorkXCCL( - std::vector> outputTensors, + at::Device& device, int rank, - c10d::OpType opType, - const c10::optional>& inputTensors) - : Work(rank, opType, nullptr, inputTensors), - outputTensors_(std::move(outputTensors)), - future_(createFutureAsOutput(outputTensors)) {} + OpType opType, + const std::optional>& inputs) + : Work(rank, opType, "profilingTitle", inputs), device_(device) {} + +ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) + : Work(w.rank_, w.opType_), device_(w.device_) {} + +ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL:: getFuture() { @@ -198,7 +212,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work = make_work_ccl( inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); - + // work->events_.emplace_back(fn); return work; } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 0c5f4fa5aeccf7..0b3a50a4c1fffd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -38,27 +38,34 @@ class ProcessGroupXCCL : public Backend { class WorkXCCL : public Work { public: WorkXCCL( - std::vector> outputTensors, - int rank = -1, - OpType opType = OpType::UNKNOWN, - const c10::optional>& inputTensors = - c10::nullopt) - : Work(rank, opType), outputTensors_(std::move(outputTensors)) {} - - ~WorkXCCL() override { - // Ensures all events are properly handled before destruction - for (auto& event : events_) { - event.wait(); - } - } - + at::Device& device, + int rank, + OpType opType, + const std::optional>& inputs = std::nullopt); + // WorkXCCL( + // std::vector> outputTensors, + // int rank = -1, + // OpType opType = OpType::UNKNOWN, + // const c10::optional>& inputTensors = + // c10::nullopt) + // : Work(rank, opType), outputTensors_(std::move(outputTensors)) {} + WorkXCCL(const WorkXCCL& w); + // ~WorkXCCL() override { + // // Ensures all events are properly handled before destruction + // for (auto& event : events_) { + // event.wait(); + // } + // } + ~WorkXCCL() override; bool isCompleted() override { - for (auto& event : events_) { - if (!event.test()) { - return false; - } - } - return true; + TORCH_CHECK( + false, "ProcessGroupXCCL::WorkXCCL::isCompleted not implemented"); + // for (auto& event : events_) { + // if (!event.test()) { + // return false; + // } + // } + // return true; } bool isSuccess() const override { @@ -71,9 +78,11 @@ class ProcessGroupXCCL : public Backend { } void synchronize() override { - for (auto& event : events_) { - event.wait(); - } + TORCH_CHECK( + false, "ProcessGroupXCCL::WorkXCCL::synchronize not implemented"); + // for (auto& event : events_) { + // event.wait(); + // } } bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; @@ -84,28 +93,33 @@ class ProcessGroupXCCL : public Backend { // } // events_.clear(); // } - + c10::intrusive_ptr getFuture() override { return future_; } std::vector result() override { - return outputTensors_.empty() ? 
std::vector() - : outputTensors_[0]; + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); + // return outputTensors_.empty() ? std::vector() + // : outputTensors_[0]; } protected: - friend class ProcessGroupXCCL; - std::vector events_; - const std::vector> outputTensors_; + at::Device device_; + // std::vector events_; + // std::shared_ptr xcclComm_; + // const std::vector> outputTensors_; + private: + std::shared_ptr> outputs_; c10::intrusive_ptr future_; + friend class ProcessGroupXCCL; }; explicit ProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, int size) - : store_(store), Backend(rank, size) {} + : store_(store), Backend(rank, size) {} ~ProcessGroupXCCL() = default; From 2e21d4f21803175ef7e697d1a22fced2777deb39 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 3 Sep 2024 08:50:16 +0000 Subject: [PATCH 11/96] update --- torch/csrc/distributed/c10d/Ops.cpp | 1 + .../distributed/c10d/ProcessGroupXCCL.cpp | 107 ++++++++++++------ .../distributed/c10d/ProcessGroupXCCL.hpp | 14 +-- 3 files changed, 81 insertions(+), 41 deletions(-) diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index ae822ad3975049..03a5e42874594e 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -181,6 +181,7 @@ IMPL_BROADCAST(PrivateUse1) IMPL_ALLREDUCE(CPU) IMPL_ALLREDUCE(CUDA) +IMPL_ALLREDUCE(XPU) IMPL_ALLREDUCE(PrivateUse1) #define IMPL_ALLREDUCE_COALESCED(DEV) \ diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ef60d0546b0df8..5e2e179d32af37 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -45,7 +45,7 @@ std::map xcclDatatypes = { {at::kBool, ccl::datatype::uint8}, }; -void check_gpu_single_tensor(const at::Tensor& tensor) { +void check_xpu_single_tensor(const at::Tensor& tensor) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); } @@ -65,33 +65,31 @@ ccl::datatype getXcclDataType(at::ScalarType type) { } } // namespace - - static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; -template < - template - class WorkXCCL, - typename RunF, - typename CommType, - typename InputType, - typename OutputType, - typename attr_t> -c10::intrusive_ptr make_work_ccl( - const std::vector& inputs, - const std::vector& outputs, - RunF f, - CommType& comms, - attr_t& attr, - int rank, - c10d::OpType op_type) { - c10::intrusive_ptr> - ret_ptr = c10::make_intrusive< - WorkCCL>( - inputs, outputs, f, comms, attr, rank, op_type); - return ret_ptr; -} +// template < +// template +// class WorkXCCL, +// typename RunF, +// typename CommType, +// typename InputType, +// typename OutputType, +// typename attr_t> +// c10::intrusive_ptr make_work_ccl( +// const std::vector& inputs, +// const std::vector& outputs, +// RunF f, +// CommType& comms, +// attr_t& attr, +// int rank, +// c10d::OpType op_type) { +// c10::intrusive_ptr> +// ret_ptr = c10::make_intrusive< +// WorkCCL>( +// inputs, outputs, f, comms, attr, rank, op_type); +// return ret_ptr; +// } // ProcessGroupXCCL::WorkXCCL::WorkXCCL( // std::vector> outputTensors, @@ -107,10 +105,14 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( int rank, OpType opType, const std::optional>& inputs) - : Work(rank, opType, "profilingTitle", inputs), device_(device) {} + : Work(rank, opType, "profilingTitle", inputs), device_(device) { + unsigned char 
enable_timing = 0; + xcclEndEvent_ = std::make_shared(enable_timing); +} ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) - : Work(w.rank_, w.opType_), device_(w.device_) {} + : Work(w.rank_, w.opType_), device_(w.device_), + xcclEndEvent_(w.xcclEndEvent_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -119,6 +121,12 @@ c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL:: return future_; } +void ProcessGroupXCCL::WorkXCCL::synchronize() { + auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); + // Block the current stream on the XCCL stream + xcclEndEvent_->block(currentStream); +} + c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, @@ -134,6 +142,20 @@ ProcessGroupXCCL::ProcessGroupXCCL( ProcessGroupXCCL::~ProcessGroupXCCL() = default; +c10::intrusive_ptr ProcessGroupXCCL::initWork( + at::Device& device, + int rank, + OpType opType, + const std::vector& inputs, + const std::vector& outputs) { + auto r = c10::make_intrusive( + device, + rank, + opType, + std::optional>(inputs)); + return r; +} + std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, at::Device& device) { @@ -162,7 +184,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( ccl::vector_class> devs_rank; c10::impl::VirtualGuardImpl impl(device.type()); c10::Stream stream = impl.getStream(device); - auto q = get_sycl_queue(stream); + auto q = at::xpu::XPUStream(stream).queue(); auto ctx = ccl::create_context(q.get_context()); devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); @@ -172,6 +194,8 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( inInitializationCommMap_.emplace(deviceKey, XCCLComm); } + xcclStreams_.emplace(deviceKey, std::move(stream)); + auto it = inInitializationCommMap_.find(deviceKey); if (it != inInitializationCommMap_.end()) { devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); @@ -203,21 +227,38 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto device = input.device(); const auto key = std::to_string(device.index()); - auto xcclComm_t = getXCCLComm(key, device); + auto comm = getXCCLComm(key, device); + auto xcclStream = xcclStreams_.at(key); std::vector inputs{input}; std::vector outputs{output}; c10::intrusive_ptr work; - work = make_work_ccl( - inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); + work =initWork(device, rank_, op_type); + // work = make_work_ccl( + // inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); // work->events_.emplace_back(fn); + work->outputs_ = + std::make_shared>(std::move(outputs)); + c10::xpu::XPUCachingAllocator::recordStream( + input.storage().data_ptr(), xcclStream); + + auto ccl_stream = ccl::create_stream(at::xpu::XPUStream(xcclStream).queue()); + fn(input, output, attr, comm, ccl_stream); + + work->xcclEndEvent_->record(xcclStream); + c10::MultiStreamGuard streamGuard(xcclStream); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + work->future_->markCompleted(at::IValue(*work->outputs_)); + return work; } template -c10::intrusive_ptr ProcessGroupNCCL::collective( +c10::intrusive_ptr ProcessGroupXCCL::collective( at::Tensor& input, at::Tensor& output, Fn fn, @@ -237,7 +278,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( TORCH_CHECK( tensors.size() == 1, "Expecting one tensor only but got multiple"); auto tensor = tensors.back(); - check_gpu_single_tensor(tensor); + 
check_xpu_single_tensor(tensor); if (opts.reduceOp == ReduceOp::AVG) { TORCH_CHECK(false, "Cannot use ReduceOp AVG with XPU") } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 0b3a50a4c1fffd..02eddb7acb8ec0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include #include @@ -77,13 +79,7 @@ class ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); } - void synchronize() override { - TORCH_CHECK( - false, "ProcessGroupXCCL::WorkXCCL::synchronize not implemented"); - // for (auto& event : events_) { - // event.wait(); - // } - } + void synchronize() override; bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; // void wait() { @@ -106,6 +102,7 @@ class ProcessGroupXCCL : public Backend { protected: at::Device device_; + std::shared_ptr xcclEndEvent_; // std::vector events_; // std::shared_ptr xcclComm_; // const std::vector> outputTensors_; @@ -121,7 +118,7 @@ class ProcessGroupXCCL : public Backend { int size) : store_(store), Backend(rank, size) {} - ~ProcessGroupXCCL() = default; + ~ProcessGroupXCCL() override; const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); @@ -140,6 +137,7 @@ class ProcessGroupXCCL : public Backend { int size = -1); public: + std::unordered_map xcclStreams_; std::unordered_map> inInitializationCommMap_; std::unordered_map> devXCCLCommMap_; From c9ef78fdc5c8872246c74e5a1949d5a7c94726c5 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 4 Sep 2024 04:25:38 +0000 Subject: [PATCH 12/96] update --- build_variables.bzl | 1 + torch/csrc/distributed/c10d/ProcessGroup.hpp | 1 + .../distributed/c10d/ProcessGroupXCCL.cpp | 50 ++++--------------- .../distributed/c10d/ProcessGroupXCCL.hpp | 15 ++---- 4 files changed, 16 insertions(+), 51 deletions(-) diff --git a/build_variables.bzl b/build_variables.bzl index 80a575324aa8b3..55a3f0023b571f 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -786,6 +786,7 @@ libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [ ] libtorch_python_xpu_sources = [ + "torch/csrc/xpu/xccl.cpp", "torch/csrc/xpu/Event.cpp", "torch/csrc/xpu/Module.cpp", "torch/csrc/xpu/Stream.cpp", diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index acf8c9c354a76b..85142caf0ac7c7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -70,6 +70,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { UCC = 3, MPI = 4, CUSTOM = 5, + XCCL = 6, }; // Not used, set for backwards compatibility and only used for TypeDef in diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 5e2e179d32af37..8be7c6451fcdd0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -3,7 +3,7 @@ #include #include -#ifdef USE_C10D_XCCL +// #ifdef USE_C10D_XCCL #include #include #include @@ -68,38 +68,6 @@ ccl::datatype getXcclDataType(at::ScalarType type) { static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; -// template < -// template -// class WorkXCCL, -// typename RunF, -// typename CommType, -// typename InputType, -// typename OutputType, -// typename attr_t> -// 
c10::intrusive_ptr make_work_ccl( -// const std::vector& inputs, -// const std::vector& outputs, -// RunF f, -// CommType& comms, -// attr_t& attr, -// int rank, -// c10d::OpType op_type) { -// c10::intrusive_ptr> -// ret_ptr = c10::make_intrusive< -// WorkCCL>( -// inputs, outputs, f, comms, attr, rank, op_type); -// return ret_ptr; -// } - -// ProcessGroupXCCL::WorkXCCL::WorkXCCL( -// std::vector> outputTensors, -// int rank, -// c10d::OpType opType, -// const c10::optional>& inputTensors) -// : Work(rank, opType, nullptr, inputTensors), -// outputTensors_(std::move(outputTensors)), -// future_(createFutureAsOutput(outputTensors)) {} - ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, @@ -116,6 +84,11 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; +bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { + synchronize(); + return true; +} + c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL:: getFuture() { return future_; @@ -267,8 +240,10 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( input, output, fn, - [](std::vector&) {}, - [](std::vector&) {}, + [](at::xpu::XPUStream&, + c10::intrusive_ptr& work) {}, + [](at::xpu::XPUStream&, + c10::intrusive_ptr& work) {}, opType); } @@ -306,9 +281,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( OpType::ALLREDUCE); } -// c10::intrusive_ptr barrier( -// const BarrierOptions& opts = BarrierOptions()) override; - } // namespace c10d -#endif // USE_C10D_XCCL +// #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 02eddb7acb8ec0..d14d677205ecbb 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -7,7 +7,7 @@ #include #endif -#ifdef USE_C10D_XCCL +// #ifdef USE_C10D_XCCL #include #include @@ -35,7 +35,7 @@ namespace c10d { constexpr const char* XCCL_BACKEND_NAME = "xccl"; using namespace torch::xpu::xccl; -class ProcessGroupXCCL : public Backend { +class TORCH_XPU_API ProcessGroupXCCL : public Backend { public: class WorkXCCL : public Work { public: @@ -82,13 +82,6 @@ class ProcessGroupXCCL : public Backend { void synchronize() override; bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; - // void wait() { - // std::unique_lock lock(mutex_); - // for (auto& event : events_) { - // event.wait(); - // } - // events_.clear(); - // } c10::intrusive_ptr getFuture() override { return future_; @@ -96,8 +89,6 @@ class ProcessGroupXCCL : public Backend { std::vector result() override { TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); - // return outputTensors_.empty() ? 
std::vector() - // : outputTensors_[0]; } protected: @@ -147,4 +138,4 @@ class ProcessGroupXCCL : public Backend { } // namespace c10d -#endif // USE_C10D_XCCL +// #endif // USE_C10D_XCCL From 076db36d3da015427b53c473484d59a0b5ebcd21 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 4 Sep 2024 07:46:38 +0000 Subject: [PATCH 13/96] update --- build_variables.bzl | 2 +- caffe2/CMakeLists.txt | 5 ++--- cmake/Summary.cmake | 1 + torch/CMakeLists.txt | 5 +++++ torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 2 -- torch/csrc/xpu/xccl.h | 2 ++ 6 files changed, 11 insertions(+), 6 deletions(-) diff --git a/build_variables.bzl b/build_variables.bzl index 55a3f0023b571f..b903a55b17439b 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -542,7 +542,6 @@ libtorch_distributed_extra_sources = [ "torch/csrc/distributed/autograd/rpc_messages/rref_backward_req.cpp", "torch/csrc/distributed/autograd/rpc_messages/rref_backward_resp.cpp", "torch/csrc/distributed/c10d/HashStore.cpp", - "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", "torch/csrc/distributed/rpc/agent_utils.cpp", "torch/csrc/distributed/rpc/message.cpp", "torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp", @@ -787,6 +786,7 @@ libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [ libtorch_python_xpu_sources = [ "torch/csrc/xpu/xccl.cpp", + "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", "torch/csrc/xpu/Event.cpp", "torch/csrc/xpu/Module.cpp", "torch/csrc/xpu/Stream.cpp", diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 28e7d0c96ba877..01d280cb3fc7c4 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1015,13 +1015,12 @@ endif() if(USE_XPU) if(USE_XCCL) - list(APPEND Caffe2_XPU_SRCS - ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) + list(APPEND Caffe2_XPU_SRCS + ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) - # ATen XPU implementation set(TORCH_XPU_OPS_DIR ${TORCH_ROOT}/third_party/torch-xpu-ops) set(TORCH_XPU_OPS_REPO_URL https://github.com/intel/torch-xpu-ops.git) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 0b601cf2a6a329..229ff112ab3187 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -155,6 +155,7 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_ITT : ${USE_ITT}") message(STATUS " USE_XCCL : ${USE_XCCL}") if(${USE_XCCL}) + message(STATUS " USE_C10D_XCCL : ${USE_C10D_XCCL}") message(STATUS " XCCL include path : ${XCCL_INCLUDE_DIR}") message(STATUS " XCCL library : ${XCCL_LIBRARY}") endif() diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 8ab7d7aeb095b6..f50ae4e02c3386 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -165,6 +165,9 @@ if(USE_XPU) append_filelist("libtorch_python_xpu_sources" TORCH_PYTHON_SRCS) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_XPU) + # if(USE_XCCL) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::xpurt) + # endif() endif() if(USE_CUDNN OR USE_ROCM) @@ -419,6 +422,8 @@ endif() target_compile_definitions(torch_python PRIVATE "-DTHP_BUILD_MAIN_LIB") target_link_libraries(torch_python PRIVATE ${TORCH_LIB} ${TORCH_PYTHON_LINK_LIBRARIES}) +target_link_libraries(torch_python PRIVATE torch::xpurt) +target_link_libraries(torch_python PRIVATE c10_xpu) target_compile_definitions(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS}) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp 
b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index d14d677205ecbb..01a5966b811069 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -11,8 +11,6 @@ #include #include -#include -#include #include #include #include diff --git a/torch/csrc/xpu/xccl.h b/torch/csrc/xpu/xccl.h index 31fc594e71cc0b..c7a67975bb286c 100644 --- a/torch/csrc/xpu/xccl.h +++ b/torch/csrc/xpu/xccl.h @@ -6,6 +6,8 @@ #include #include #include +#include +#include namespace torch::xpu::xccl { From 8d739aca40d4f6f59458478093af304c2f327b86 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 4 Sep 2024 10:09:43 +0000 Subject: [PATCH 14/96] update --- build_variables.bzl | 1 - caffe2/CMakeLists.txt | 8 +- .../distributed/c10d/ProcessGroupXCCL.cpp | 61 +-- .../distributed/c10d/ProcessGroupXCCL.hpp | 40 +- torch/csrc/xpu/xccl.cpp | 348 ------------------ torch/csrc/xpu/xccl.h | 77 ---- 6 files changed, 80 insertions(+), 455 deletions(-) delete mode 100644 torch/csrc/xpu/xccl.cpp delete mode 100644 torch/csrc/xpu/xccl.h diff --git a/build_variables.bzl b/build_variables.bzl index b903a55b17439b..cff70d00320b0e 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -785,7 +785,6 @@ libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [ ] libtorch_python_xpu_sources = [ - "torch/csrc/xpu/xccl.cpp", "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", "torch/csrc/xpu/Event.cpp", "torch/csrc/xpu/Module.cpp", diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 01d280cb3fc7c4..55339880a82a37 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,10 +1014,10 @@ elseif(USE_CUDA) endif() if(USE_XPU) - if(USE_XCCL) - list(APPEND Caffe2_XPU_SRCS - ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) - endif() + # if(USE_XCCL) + # list(APPEND Caffe2_XPU_SRCS + # ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) + # endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 8be7c6451fcdd0..ffd566f10f854a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -45,6 +46,36 @@ std::map xcclDatatypes = { {at::kBool, ccl::datatype::uint8}, }; +XCCL_KVS kvs; +std::mutex kvs_mutex; + +XCCL_KVS get_kvs(int rank, c10d::Store& store) { + std::lock_guard lock(kvs_mutex); + if (kvs) + return kvs; + std::string storeKey = "ccl_kvs"; + + // Rank 0 broadcast the bootstrap network information to other ranks + if (rank == 0) { + kvs = ccl::create_main_kvs(); + ccl::kvs::address_type main_addr = kvs->get_address(); + auto ccl_kvs_addr = + std::vector(main_addr.begin(), main_addr.end()); + store.set(storeKey, ccl_kvs_addr); + } else { + auto ccl_kvs_addr = store.get(storeKey); + if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { + throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); + } + ccl::kvs::address_type main_addr; + std::copy_n( + ccl_kvs_addr.begin(), ccl::kvs::address_max_size, main_addr.begin()); + kvs = ccl::create_kvs(main_addr); + } + + return kvs; +} + void check_xpu_single_tensor(const at::Tensor& tensor) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); @@ -89,11 +120,6 @@ bool 
ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { return true; } -c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL:: - getFuture() { - return future_; -} - void ProcessGroupXCCL::WorkXCCL::synchronize() { auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); // Block the current stream on the XCCL stream @@ -107,12 +133,6 @@ c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( return c10::make_intrusive(store, rank, size); } -ProcessGroupXCCL::ProcessGroupXCCL( - const c10::intrusive_ptr& store, - int rank, - int size) - : Backend(rank, size), store_(store) {} - ProcessGroupXCCL::~ProcessGroupXCCL() = default; c10::intrusive_ptr ProcessGroupXCCL::initWork( @@ -148,7 +168,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( std::shared_ptr XCCLComm; - XCCL_KVS kvs = get_kvs(rank_, store_); + XCCL_KVS kvs = get_kvs(rank_, *store_); int numRanks, rank; numRanks = getSize(); @@ -157,7 +177,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( ccl::vector_class> devs_rank; c10::impl::VirtualGuardImpl impl(device.type()); c10::Stream stream = impl.getStream(device); - auto q = at::xpu::XPUStream(stream).queue(); + sycl::queue& q = c10::xpu::XPUStream(stream).queue(); auto ctx = ccl::create_context(q.get_context()); devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); @@ -208,20 +228,20 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( c10::intrusive_ptr work; - work =initWork(device, rank_, op_type); - // work = make_work_ccl( - // inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); - // work->events_.emplace_back(fn); + work = initWork(device, rank_, opType); + work->outputs_ = std::make_shared>(std::move(outputs)); c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), xcclStream); - auto ccl_stream = ccl::create_stream(at::xpu::XPUStream(xcclStream).queue()); + auto ccl_stream = ccl::create_stream(xcclStream.queue()); fn(input, output, attr, comm, ccl_stream); work->xcclEndEvent_->record(xcclStream); - c10::MultiStreamGuard streamGuard(xcclStream); + + std::vector streams = {xcclStream.unwrap()}; + c10::MultiStreamGuard streamGuard(streams); std::vector devices{device}; work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); @@ -266,13 +286,12 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( xcclComm_t comm, ccl::stream& stream) { ccl::event ret_evt; - ccl::datatype datatype = getXcclDataType(input.scalar_type()); ret_evt = ccl::allreduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), getXcclDataType(input.scalar_type()), - xcclOp.at(opts.reduceOp), + xcclOps.at(opts.reduceOp), comm, stream, attr); diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 01a5966b811069..b43403f52f31ab 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -8,9 +8,11 @@ #endif // #ifdef USE_C10D_XCCL - #include -#include +#include +#include +#include +// #include #include #include #include @@ -24,14 +26,17 @@ #include #include +#include +#include #include #include #include - namespace c10d { +using xcclComm_t = ccl::communicator; +using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; -using namespace torch::xpu::xccl; +// using namespace torch::xpu::xccl; class TORCH_XPU_API ProcessGroupXCCL : public Backend { public: @@ -113,6 +118,33 @@ class TORCH_XPU_API 
ProcessGroupXCCL : public Backend { return std::string(XCCL_BACKEND_NAME); } + std::shared_ptr getXCCLComm( + const std::string& deviceKey, + at::Device& device); + + virtual c10::intrusive_ptr initWork( + at::Device& device, + int rank, + OpType opType, + const std::vector& inputs = {}, + const std::vector& outputs = {}); + + template + c10::intrusive_ptr collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + OpType opType); + + template + c10::intrusive_ptr collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType); + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; diff --git a/torch/csrc/xpu/xccl.cpp b/torch/csrc/xpu/xccl.cpp deleted file mode 100644 index 6224b19254dbfe..00000000000000 --- a/torch/csrc/xpu/xccl.cpp +++ /dev/null @@ -1,348 +0,0 @@ -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - - -ccl::datatype to_xccl_data_type(c10::ScalarType type) { - switch (type) { - case at::kFloat: - return ccl::datatype::float32; - case at::kHalf: - return ccl::datatype::float16; - case at::kDouble: - return ccl::datatype::float64; - case at::kLong: - return ccl::datatype::int64; - case at::kInt: - return ccl::datatype::int32; - case at::kChar: - return ccl::datatype::int8; - case at::kByte: - return ccl::datatype::uint8; - case at::kBool: - return ccl::datatype::uint8; - case at::kBFloat16: - return ccl::datatype::bfloat16; - default: - TORCH_CHECK(false, "Unconvertible XCCL type ", type); - } -} - -ccl::datatype to_xccl_data_type(const at::Tensor& t) { - if (!t.is_xpu()) { - TORCH_CHECK( - false, - "XCCL only supports XPU tensors, but got a tensor on ", - t.device()); - } - return to_xccl_data_type(t.scalar_type()); -} - -ccl::reduction to_xccl_red_op(int var) { - return (ccl::reduction)(var); -} - -namespace torch::xpu::xccl { - -XCCL_KVS kvs; -std::mutex kvs_mutex; - -XCCL_KVS get_kvs(int rank, c10d::Store& store) { - std::lock_guard lock(kvs_mutex); - if (kvs) - return kvs; - std::string storeKey = "ccl_kvs"; - - // Rank 0 broadcast the bootstrap network information to other ranks - if (rank == 0) { - kvs = ccl::create_main_kvs(); - ccl::kvs::address_type main_addr = kvs->get_address(); - auto ccl_kvs_addr = - std::vector(main_addr.begin(), main_addr.end()); - store.set(storeKey, ccl_kvs_addr); - } else { - auto ccl_kvs_addr = store.get(storeKey); - if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { - throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); - } - ccl::kvs::address_type main_addr; - std::copy_n( - ccl_kvs_addr.begin(), - ccl::kvs::address_max_size, - main_addr.begin()); - kvs = ccl::create_kvs(main_addr); - } - - return kvs; -} - -using namespace at; - -namespace detail { - -// void xcclCommInitAll(xcclComm_t* newcomm, int nranks, ncclUniqueId commId, -// int myrank) { -// for(int i = 0; i < nranks; i++) { -// newcomm[i] = ccl::create_communicator(nranks, i, get_kvs_addr) -// } -// c10::Stream dpcpp_stream = impl.getStream(devices[0]); -// ccl::vector_class> devs_rank; -// newcomm = ccl::create_communicators(nranks, devs_rank, ctx, ) -// } - -// struct XcclCommList { -// std::unique_ptr comms; -// int ndevices; -// XcclCommList(const std::vector& devices) -// : comms(new xcclComm_t[devices.size()]), ndevices(devices.size()) { -// xcclCommInitAll( -// to_xccl_comm(comms.get()), devices.size(), devices.data()); -// } -// NcclCommList(NcclCommList&& foo) = default; 
-// ~NcclCommList() { -// if (comms) { -// for (const auto i : c10::irange(ndevices)) { -// comm_destroy(comms[i]); -// } -// } -// } -// ArrayRef ref() const { -// return ArrayRef(comms.get(), ndevices); -// } -// }; - -// using device_list = std::vector; -// // accesses to this object have to be guarded by THC's CudaFreeMutex -// std::unordered_map> _communicators; -// static std::unordered_map> -// _communicators; - -// ArrayRef get_communicators(TensorList inputs) { -// static auto get_device = [](const at::Tensor& t) -> int { -// return t.get_device(); -// }; -// device_list devices = fmap(inputs, get_device); -// auto it = _communicators.find(devices); -// if (it == _communicators.end()) { -// it = _communicators.emplace(devices, devices).first; -// } -// return it->second; -// } - -static inline void check_tensor( - const at::Tensor& input, - const std::optional& output, - int input_multiplier, - int output_multiplier, - int64_t ref_numel, - ScalarType ref_dtype) { - auto check_one = [&](const at::Tensor& tensor) { - if (!tensor.is_xpu() || tensor.is_sparse()) { - throw std::runtime_error( - "input and output elements have to be xpu dense Tensors"); - } - - if (ref_dtype != tensor.scalar_type()) { - throw std::runtime_error( - "all inputs and outputs must be of the same Tensor dtype"); - } - - if (!tensor.is_contiguous()) { - throw std::runtime_error("all inputs and outputs have to be contiguous"); - } - }; - - check_one(input); - - // all inputs must be same size - if (input.numel() != ref_numel) { - throw std::runtime_error( - "all inputs must have the same number of elements"); - } - - if (output) { - check_one(*output); - - // inputs and outputs must be on same device respectively - if (input.get_device() != output->get_device()) { - throw std::runtime_error("input and output must be on the same device"); - } - - if (output->numel() * output_multiplier != ref_numel * input_multiplier) { - throw std::runtime_error( - "output must be of size input_size * size_multiplier"); - } - } -} - -// void check_inputs( -// TensorList inputs, -// TensorList outputs, -// int input_multiplier, -// int output_multiplier) { -// // len(inputs) == len(outputs) -// size_t len = inputs.size(); - -// if (len <= 0) { -// throw std::runtime_error("input sequence can't be empty"); -// } - -// if (len != outputs.size()) { -// std::stringstream err; -// err << "inputs and outputs sequences have to be of the same length, but got input of length " -// << len << " and output of length " << outputs.size(); -// throw std::runtime_error(err.str()); -// } - -// device_set devices; -// int64_t numel = inputs[0].numel(); -// auto dtype = inputs[0].scalar_type(); - -// for (const auto i : c10::irange(len)) { -// auto input = inputs[i]; -// auto output = outputs[i]; - -// check_tensor( -// input, output, input_multiplier, output_multiplier, numel, dtype); - -// auto input_device = input.get_device(); -// // inputs must be on unique devices -// if (devices.test(input_device)) { -// throw std::runtime_error("inputs must be on unique devices"); -// } -// devices.set(input_device); -// } -// } - -// void check_inputs( -// TensorList inputs, -// const at::Tensor& output, -// int root, -// int input_multiplier, -// int output_multiplier) { -// auto len = inputs.size(); - -// if (len <= 0) { -// throw std::runtime_error("input sequence can't be empty"); -// } - -// device_set devices; -// int64_t numel = inputs[0].numel(); -// auto dtype = inputs[0].scalar_type(); - -// for (const auto i : c10::irange(len)) { -// auto input 
= inputs[i]; - -// check_tensor( -// input, -// i == static_cast>(root) -// ? std::optional{output} -// : std::nullopt, -// input_multiplier, -// output_multiplier, -// numel, -// dtype); - -// auto input_device = input.get_device(); -// // inputs must be on unique devices -// if (devices.test(input_device)) { -// throw std::runtime_error("inputs must be on unique devices"); -// } -// devices.set(input_device); -// } -// } - -} // namespace detail - -// std::uint64_t version() { -// #if defined(NCCL_MAJOR) -// constexpr std::uint64_t ver = (((uint64_t)NCCL_MAJOR) << 32) | -// (((uint64_t)NCCL_MINOR) << 16) | ((uint64_t)NCCL_PATCH); -// return ver; -// #elif defined(USE_NCCL) -// // return major version "1" -// return ((uint64_t)1) << 32; -// #else -// return 0; -// #endif -// } - -// ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank) -// { #ifdef USE_XCCL -// using namespace torch::xpu::xccl::detail; -// xcclComm_t comm; -// ncclUniqueId id = comm_id; -// NCCL_CHECK(ncclCommInitRank( -// to_nccl_comm(&comm), nranks, *(to_nccl_unique_id(&id)), rank)); -// return comm; -// #else -// return nullptr; -// #endif -// } - -// namespace { - -// ret_evt = torch::xpu::xccl::all_reduce( -// input, -// output, -// datatype, -// xcclOp.at(opts.reduceOp), -// comm, -// attr, -// stream, -// root); - -// void all_reduce( -// at::Tensor& input, -// at::Tensor& output, -// ccl::datatype datatype, -// ccl::reduction op, -// const stream_list& streams, -// const comm_list& user_comms) { -// #ifdef USE_XCCL -// using namespace torch::cuda::nccl::detail; -// check_inputs(inputs, outputs, 1, 1); -// const auto len = inputs.size(); - -// auto data_type = to_nccl_data_type(inputs[0]); - -// const auto count = inputs[0].numel(); -// auto comms_ref = user_comms.empty() ? get_communicators(inputs) -// : ArrayRef(user_comms); - -// AutoNcclGroup nccl_group_guard; -// at::cuda::OptionalCUDAGuard device_guard; -// for (const auto i : c10::irange(len)) { -// auto device = inputs[i].device().index(); -// device_guard.set_index(device); -// // Default to the current stream -// const auto stream = (streams.empty() || !streams[i]) -// ? 
at::cuda::getCurrentCUDAStream(device).stream() -// : streams[i]->stream(); - -// ncclComm_t comm = comms_ref[i]; -// NCCL_CHECK(ncclAllReduce( -// inputs[i].data_ptr(), -// outputs[i].data_ptr(), -// count, -// data_type, -// to_nccl_red_op(op), -// to_nccl_comm(comm), -// stream)); -// } -// #else -// AT_ERROR("PyTorch built without NCCL support"); -// #endif -// } - -} // namespace torch::xpu::xccl diff --git a/torch/csrc/xpu/xccl.h b/torch/csrc/xpu/xccl.h deleted file mode 100644 index c7a67975bb286c..00000000000000 --- a/torch/csrc/xpu/xccl.h +++ /dev/null @@ -1,77 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace torch::xpu::xccl { - -using xcclComm_t = ccl::communicator; - -using XCCL_KVS = ccl::shared_ptr_class; - -extern XCCL_KVS kvs; - -XCCL_KVS get_kvs(int rank, c10d::Store& store); - -enum class xcclRedOp { Sum = 0, Prod = 1, Max = 2, Min = 3 }; - -enum class xcclDataType { - Int8 = 0, - Char = 0, - Uint8 = 1, - Int32 = 2, - Int = 2, - Uint32 = 3, - Int64 = 4, - Uint64 = 5, - Float16 = 6, - Half = 6, - Float32 = 7, - Float = 7, - Float64 = 8, - Double = 8, - Bfloat16 = 9, - NumTypes = 10 -}; - -namespace detail { - -at::ArrayRef get_communicators(at::TensorList inputs); -void check_inputs( - at::TensorList inputs, - at::TensorList outputs, - int input_multiplier, - int output_multiplier); -void check_inputs( - at::TensorList inputs, - const at::Tensor& output, - int root, - int input_multiplier, - int output_multiplier); - -} // namespace detail - -// using comm_list = std::vector; -// using stream_list = std::vector>; - -std::uint64_t version(); -const char* version_suffix(); - -bool is_available(at::TensorList tensors); - -// comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank); -// void comm_destroy(xcclComm_t comm); - -// void all_reduce( -// const std::vector& inputs, -// std::vector& outputs, -// int32_t op = static_cast(xcclRedOp::Sum), -// const stream_list& streams = {}, -// const comm_list& user_comms = {}); -} // namespace torch::xpu::xccl From fb9746bd8c15ebaeef525926cc7b5e112e51dddd Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 5 Sep 2024 05:56:21 +0000 Subject: [PATCH 15/96] register again --- torch/_C/_distributed_c10d.pyi | 1 + torch/csrc/distributed/c10d/ProcessGroup.cpp | 2 ++ torch/csrc/distributed/c10d/ProcessGroup.hpp | 2 ++ .../distributed/c10d/ProcessGroupXCCL.cpp | 30 ++++++++++++------- torch/csrc/distributed/c10d/init.cpp | 1 + torch/distributed/distributed_c10d.py | 11 +++++-- 6 files changed, 35 insertions(+), 12 deletions(-) diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 0c97185519d28f..53011cde6b178a 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -309,6 +309,7 @@ class ProcessGroup: UNDEFINED = ... GLOO = ... NCCL = ... + XCCL = ... UCC = ... MPI = ... CUSTOM = ... 
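The diffs below register XCCL as a ProcessGroup backend type and move communicator creation onto the host-rendezvous overload of ccl::create_communicator. For orientation, a condensed standalone sketch of the store-backed KVS handshake this series relies on; it is distilled from the patch rather than added by it, the oneCCL and c10d header paths are assumed, and error handling is omitted.

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

#include <oneapi/ccl.hpp>
#include <torch/csrc/distributed/c10d/Store.hpp>

// Rank 0 creates the main KVS and publishes its bootstrap address through the
// c10d store; every other rank reads the address back and attaches to it.
// ccl::init() is expected to have been called once per process before this point.
static ccl::shared_ptr_class<ccl::kvs> rendezvous(int rank, c10d::Store& store) {
  const std::string key = "ccl_kvs";
  if (rank == 0) {
    auto kvs = ccl::create_main_kvs();
    ccl::kvs::address_type addr = kvs->get_address();
    store.set(key, std::vector<uint8_t>(addr.begin(), addr.end()));
    return kvs;
  }
  auto bytes = store.get(key);
  ccl::kvs::address_type addr{};
  std::copy_n(bytes.begin(), ccl::kvs::address_max_size, addr.begin());
  return ccl::create_kvs(addr);
}

// With the shared KVS in hand, each rank builds its communicator:
//   ccl::communicator comm =
//       ccl::create_communicator(worldSize, rank, rendezvous(rank, store));
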
diff --git a/torch/csrc/distributed/c10d/ProcessGroup.cpp b/torch/csrc/distributed/c10d/ProcessGroup.cpp index 75635bc68aed4f..70356b3bf382ce 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.cpp @@ -21,6 +21,8 @@ static ProcessGroup::BackendType strToBackendType(std::string_view backend) { return ProcessGroup::BackendType::GLOO; } else if (backend == "nccl") { return ProcessGroup::BackendType::NCCL; + } else if (backend == "xccl") { + return ProcessGroup::BackendType::XCCL; } else if (backend == "ucc") { return ProcessGroup::BackendType::UCC; } else if (backend == "mpi") { diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 85142caf0ac7c7..73fc2bda701327 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -490,6 +490,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // TODO: HACK for backend name to get sequence number for that backend. if (backendType == ProcessGroup::BackendType::GLOO || backendType == ProcessGroup::BackendType::NCCL || + backendType == ProcessGroup::BackendType::XCCL || backendType == ProcessGroup::BackendType::UCC) { getDefaultBackend()->setSequenceNumberForGroup(); } else { @@ -511,6 +512,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // TODO: HACK for backend name to get sequence number for that backend. if (backendType == ProcessGroup::BackendType::GLOO || backendType == ProcessGroup::BackendType::NCCL || + backendType == ProcessGroup::BackendType::XCCL || backendType == ProcessGroup::BackendType::UCC) { return getDefaultBackend()->getSequenceNumberForGroup(); } else { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ffd566f10f854a..e21be88ef83d16 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -178,9 +178,16 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( c10::impl::VirtualGuardImpl impl(device.type()); c10::Stream stream = impl.getStream(device); sycl::queue& q = c10::xpu::XPUStream(stream).queue(); - auto ctx = ccl::create_context(q.get_context()); - devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + // const sycl::context& sycl_ctx = q.get_context(); + // sycl::context sycl_ctx = q.get_context(); + // ccl::generic_context_type ccl_ctx(sycl_ctx); + // auto ctx = ccl::create_context(ccl_ctx.get()); + + // auto ctx = ccl::create_context(q.get_context()); + // devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); + // XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + XCCLComm = std::make_shared(ccl::create_communicator(numRanks, rank, kvs)); + { std::lock_guard lock(mutex_); @@ -222,7 +229,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); - auto xcclStream = xcclStreams_.at(key); + auto stream = xcclStreams_.at(key); std::vector inputs{input}; std::vector outputs{output}; @@ -233,14 +240,17 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->outputs_ = std::make_shared>(std::move(outputs)); c10::xpu::XPUCachingAllocator::recordStream( - input.storage().data_ptr(), xcclStream); + input.storage().data_ptr(), stream); - auto ccl_stream = ccl::create_stream(xcclStream.queue()); - fn(input, output, attr, 
comm, ccl_stream); + // auto ccl_stream = ccl::create_stream(stream.queue()); + auto ccl_stream = ccl::create_stream(); + + fn(input, output, attr, *comm, ccl_stream); + // fn(input, output, attr, comm, ccl_stream); - work->xcclEndEvent_->record(xcclStream); + work->xcclEndEvent_->record(stream); - std::vector streams = {xcclStream.unwrap()}; + std::vector streams = {stream.unwrap()}; c10::MultiStreamGuard streamGuard(streams); std::vector devices{device}; work->future_ = c10::make_intrusive( @@ -283,7 +293,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( [&](at::Tensor& input, at::Tensor& output, ccl::allreduce_attr attr, - xcclComm_t comm, + xcclComm_t& comm, ccl::stream& stream) { ccl::event ret_evt; ret_evt = ccl::allreduce( diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index e12e96f9fe882f..5d200bb6eeb9cf 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -2237,6 +2237,7 @@ The hook must have the following signature: .value("UNDEFINED", ::c10d::ProcessGroup::BackendType::UNDEFINED) .value("GLOO", ::c10d::ProcessGroup::BackendType::GLOO) .value("NCCL", ::c10d::ProcessGroup::BackendType::NCCL) + .value("XCCL", ::c10d::ProcessGroup::BackendType::XCCL) .value("UCC", ::c10d::ProcessGroup::BackendType::UCC) .value("MPI", ::c10d::ProcessGroup::BackendType::MPI) .value("CUSTOM", ::c10d::ProcessGroup::BackendType::CUSTOM) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 26cb1cda1db8cb..3f68609905bb5a 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -252,22 +252,24 @@ class Backend(str): NCCL = "nccl" UCC = "ucc" MPI = "mpi" - XCCL = "XCCL" + XCCL = "xccl" _BackendPlugin = namedtuple("_BackendPlugin", ["creator_fn", "extended_api"]) _plugins: Dict[str, _BackendPlugin] = {} - backend_list = [UNDEFINED, GLOO, NCCL, UCC, MPI] + backend_list = [UNDEFINED, GLOO, NCCL, XCCL, UCC, MPI] default_device_backend_map: Dict[str, str] = { "cpu": GLOO, "cuda": NCCL, + "xpu": XCCL, } backend_capability: Dict[str, List[str]] = { GLOO: ["cpu", "cuda"], NCCL: ["cuda"], + XCCL: ["xpu"], UCC: ["cpu", "cuda"], MPI: ["cpu", "cuda"], } @@ -276,6 +278,7 @@ class Backend(str): UNDEFINED: ProcessGroup.BackendType.UNDEFINED, GLOO: ProcessGroup.BackendType.GLOO, NCCL: ProcessGroup.BackendType.NCCL, + XCCL: ProcessGroup.BackendType.XCCL, UCC: ProcessGroup.BackendType.UCC, } @@ -1364,6 +1367,10 @@ def _set_pg_timeout(timeout: timedelta, group: Optional[ProcessGroup] = None) -> backends.add(backend) # type: ignore[arg-type] elif is_gloo_available() and isinstance(backend, ProcessGroupGloo): backends.add(backend) # type: ignore[arg-type] + if torch.device("xpu") in devices and is_xpu_available(): + backend = group._get_backend(torch.device("xpu")) + if isinstance(backend, ProcessGroupXCCL): + backends.add(backend) # type: ignore[arg-type] if len(backends) == 0: warnings.warn("Set timeout is now only supported for either nccl or gloo.") for backend in backends: From 4f73180371f3560acbb4750d9e366c3dc3feea40 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 5 Sep 2024 07:42:13 +0000 Subject: [PATCH 16/96] update --- torch/csrc/distributed/c10d/Ops.cpp | 1 + torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index 03a5e42874594e..4979c57384fcb4 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ 
b/torch/csrc/distributed/c10d/Ops.cpp @@ -516,6 +516,7 @@ REGISTER_C10D_OP(alltoall_) REGISTER_C10D_OP(alltoall_base_) REGISTER_C10D_OP(barrier) +REGISTER_C10D_OP1(allreduce_, XPU) // The following ops are specialized, register them separately TORCH_LIBRARY_IMPL(c10d, CPU, m) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index b43403f52f31ab..9ad20797afcb6d 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -38,7 +38,7 @@ using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; // using namespace torch::xpu::xccl; -class TORCH_XPU_API ProcessGroupXCCL : public Backend { +class TORCH_API ProcessGroupXCCL : public Backend { public: class WorkXCCL : public Work { public: From 7c2f0180b16ecc681836708474ec4c79b09e12fa Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 6 Sep 2024 10:10:50 +0000 Subject: [PATCH 17/96] update --- caffe2/CMakeLists.txt | 11 ++++--- .../distributed/c10d/ProcessGroupXCCL.cpp | 23 ++++++-------- .../distributed/c10d/ProcessGroupXCCL.hpp | 30 ++----------------- 3 files changed, 19 insertions(+), 45 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 55339880a82a37..2119dd19328000 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,10 +1014,6 @@ elseif(USE_CUDA) endif() if(USE_XPU) - # if(USE_XCCL) - # list(APPEND Caffe2_XPU_SRCS - # ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) - # endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) @@ -1373,7 +1369,14 @@ if(USE_DISTRIBUTED) endif() endif() if(USE_C10D_XCCL) + # if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # set_source_files_properties( + # ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp + # PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_SYCL") + # target_sources(torch_xpu PRIVATE ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp) + # endif() target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) + target_compile_definitions(torch_xpu PUBLIC CCL_ENABLE_SYCL) endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e21be88ef83d16..cabdb9f61433bc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -3,7 +3,7 @@ #include #include -// #ifdef USE_C10D_XCCL +#ifdef USE_C10D_XCCL #include #include #include @@ -174,20 +174,16 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( numRanks = getSize(); rank = getRank(); - ccl::vector_class> devs_rank; c10::impl::VirtualGuardImpl impl(device.type()); c10::Stream stream = impl.getStream(device); sycl::queue& q = c10::xpu::XPUStream(stream).queue(); - // const sycl::context& sycl_ctx = q.get_context(); - // sycl::context sycl_ctx = q.get_context(); - // ccl::generic_context_type ccl_ctx(sycl_ctx); - // auto ctx = ccl::create_context(ccl_ctx.get()); - // auto ctx = ccl::create_context(q.get_context()); - // devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - // XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); - XCCLComm = std::make_shared(ccl::create_communicator(numRanks, rank, kvs)); + auto ctx = 
ccl::create_context(q.get_context()); + ccl::vector_class> devs_rank; + devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); + auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, kvs); + XCCLComm = std::make_shared(std::move(comms[0])); { std::lock_guard lock(mutex_); @@ -242,11 +238,10 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), stream); - // auto ccl_stream = ccl::create_stream(stream.queue()); - auto ccl_stream = ccl::create_stream(); + auto ccl_stream = ccl::create_stream(stream.queue()); + // auto ccl_stream = ccl::create_stream(); fn(input, output, attr, *comm, ccl_stream); - // fn(input, output, attr, comm, ccl_stream); work->xcclEndEvent_->record(stream); @@ -312,4 +307,4 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( } // namespace c10d -// #endif // USE_C10D_XCCL +#endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 9ad20797afcb6d..f8b9d15bd65484 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -7,12 +7,11 @@ #include #endif -// #ifdef USE_C10D_XCCL +#ifdef USE_C10D_XCCL #include #include #include #include -// #include #include #include #include @@ -36,7 +35,6 @@ namespace c10d { using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; -// using namespace torch::xpu::xccl; class TORCH_API ProcessGroupXCCL : public Backend { public: @@ -47,30 +45,11 @@ class TORCH_API ProcessGroupXCCL : public Backend { int rank, OpType opType, const std::optional>& inputs = std::nullopt); - // WorkXCCL( - // std::vector> outputTensors, - // int rank = -1, - // OpType opType = OpType::UNKNOWN, - // const c10::optional>& inputTensors = - // c10::nullopt) - // : Work(rank, opType), outputTensors_(std::move(outputTensors)) {} WorkXCCL(const WorkXCCL& w); - // ~WorkXCCL() override { - // // Ensures all events are properly handled before destruction - // for (auto& event : events_) { - // event.wait(); - // } - // } ~WorkXCCL() override; bool isCompleted() override { TORCH_CHECK( false, "ProcessGroupXCCL::WorkXCCL::isCompleted not implemented"); - // for (auto& event : events_) { - // if (!event.test()) { - // return false; - // } - // } - // return true; } bool isSuccess() const override { @@ -97,9 +76,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { protected: at::Device device_; std::shared_ptr xcclEndEvent_; - // std::vector events_; - // std::shared_ptr xcclComm_; - // const std::vector> outputTensors_; private: std::shared_ptr> outputs_; c10::intrusive_ptr future_; @@ -110,7 +86,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { const c10::intrusive_ptr& store, int rank, int size) - : store_(store), Backend(rank, size) {} + : Backend(rank, size), store_(store) {} ~ProcessGroupXCCL() override; @@ -168,4 +144,4 @@ class TORCH_API ProcessGroupXCCL : public Backend { } // namespace c10d -// #endif // USE_C10D_XCCL +#endif // USE_C10D_XCCL From 229a80ac530ff1e976b7670826e953b73ac6f5b8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 9 Sep 2024 08:12:16 +0000 Subject: [PATCH 18/96] refine cmake --- build_variables.bzl | 5 ++++- caffe2/CMakeLists.txt | 19 +++++++++---------- cmake/Dependencies.cmake | 2 +- cmake/External/xccl.cmake | 10 +++++++--- torch/CMakeLists.txt | 6 ++---- .../distributed/c10d/ProcessGroupXCCL.hpp | 3 +-- 6 files changed, 24 
insertions(+), 21 deletions(-) diff --git a/build_variables.bzl b/build_variables.bzl index cff70d00320b0e..98b721617b609c 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -700,6 +700,10 @@ libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_s "torch/csrc/cuda/nccl.cpp", ] +libtorch_xpu_distributed_extra_sources = [ + "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", +] + torch_cpp_srcs = [ "torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA "torch/csrc/api/src/data/datasets/mnist.cpp", @@ -785,7 +789,6 @@ libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [ ] libtorch_python_xpu_sources = [ - "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", "torch/csrc/xpu/Event.cpp", "torch/csrc/xpu/Module.cpp", "torch/csrc/xpu/Stream.cpp", diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 2119dd19328000..9f242febb94711 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,9 +1014,14 @@ elseif(USE_CUDA) endif() if(USE_XPU) + if(USE_DISTRIBUTED) + append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS) + # message(FATAL_ERROR ${Caffe2_XPU_SRCS}) + endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) + # ATen XPU implementation set(TORCH_XPU_OPS_DIR ${TORCH_ROOT}/third_party/torch-xpu-ops) set(TORCH_XPU_OPS_REPO_URL https://github.com/intel/torch-xpu-ops.git) @@ -1064,10 +1069,6 @@ if(USE_XPU) message(WARNING "Failed to include ATen XPU implementation target") else() target_link_libraries(torch_xpu PRIVATE torch_xpu_ops) - if(USE_XCCL) - target_link_libraries(torch_xpu PRIVATE __caffe2_xccl) - target_compile_definitions(torch_xpu PRIVATE USE_XCCL) - endif() if(MSVC) # Windows target_link_libraries(torch_xpu PRIVATE @@ -1082,6 +1083,10 @@ if(USE_XPU) include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS}) endif() + if(USE_XCCL) + target_link_libraries(torch_xpu PRIVATE torch::xccl) + target_compile_definitions(torch_xpu PRIVATE USE_XCCL) + endif() endif() if(NOT MSVC AND USE_XNNPACK) @@ -1369,12 +1374,6 @@ if(USE_DISTRIBUTED) endif() endif() if(USE_C10D_XCCL) - # if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - # set_source_files_properties( - # ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp - # PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_SYCL") - # target_sources(torch_xpu PRIVATE ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp) - # endif() target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC CCL_ENABLE_SYCL) endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index cb204eada5f689..8abea841fcf61c 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1162,7 +1162,7 @@ if(USE_XCCL) caffe2_update_option(USE_XCCL OFF) else() include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake) - list(APPEND Caffe2_XPU_DEPENDENCY_LIBS __caffe2_xccl) + list(APPEND Caffe2_XPU_DEPENDENCY_LIBS torch::xccl) endif() endif() diff --git a/cmake/External/xccl.cmake b/cmake/External/xccl.cmake index d1e8f33881b80b..56205b381b1324 100644 --- a/cmake/External/xccl.cmake +++ b/cmake/External/xccl.cmake @@ -5,9 +5,13 @@ if(NOT __XCCL_INCLUDED) # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake. 
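  # Note on the change just below, as read from this diff: the plain __caffe2_xccl
  # INTERFACE helper becomes a namespaced IMPORTED target, torch::xccl, matching the
  # torch:: naming already used for other external dependencies such as torch::xpurt.
  # The usage requirements stay the same (INTERFACE_INCLUDE_DIRECTORIES carries
  # XCCL_INCLUDE_DIR, INTERFACE_LINK_LIBRARIES carries XCCL_LIBRARY), so consumers
  # such as torch_xpu simply link torch::xccl and inherit both.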
find_package(XCCL REQUIRED) if(XCCL_FOUND) - add_library(__caffe2_xccl INTERFACE) - target_link_libraries(__caffe2_xccl INTERFACE ${XCCL_LIBRARY}) - target_include_directories(__caffe2_xccl INTERFACE ${XCCL_INCLUDE_DIR}) + add_library(torch::xccl INTERFACE IMPORTED) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${XCCL_INCLUDE_DIR}) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES + ${XCCL_LIBRARY}) endif() endif() endif() diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index f50ae4e02c3386..5bca5ac72452ec 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -163,11 +163,9 @@ endif() if(USE_XPU) include(${TORCH_ROOT}/cmake/public/xpu.cmake) append_filelist("libtorch_python_xpu_sources" TORCH_PYTHON_SRCS) - + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_XPU) - # if(USE_XCCL) list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::xpurt) - # endif() endif() if(USE_CUDNN OR USE_ROCM) @@ -286,7 +284,7 @@ if(USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) endif() if(USE_XCCL) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_xccl) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::xccl) endif() # Same for MPI. if(USE_MPI) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index f8b9d15bd65484..829e07816589fc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -10,8 +10,7 @@ #ifdef USE_C10D_XCCL #include #include -#include -#include +#include #include #include #include From 746007b5fc6edb8ad98f6e7e1811ffe7623240f5 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 9 Sep 2024 09:46:48 +0000 Subject: [PATCH 19/96] register all dist op and enable getXcclReduceOp --- caffe2/CMakeLists.txt | 8 +- torch/csrc/distributed/c10d/Ops.cpp | 20 ++- .../distributed/c10d/ProcessGroupXCCL.cpp | 50 +++++-- .../distributed/c10d/ProcessGroupXCCL.hpp | 141 +++++++++++++++++- 4 files changed, 198 insertions(+), 21 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 9f242febb94711..ae183e32d17e7d 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,9 +1014,8 @@ elseif(USE_CUDA) endif() if(USE_XPU) - if(USE_DISTRIBUTED) + if(USE_XCCL) append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS) - # message(FATAL_ERROR ${Caffe2_XPU_SRCS}) endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake @@ -1375,7 +1374,10 @@ if(USE_DISTRIBUTED) endif() if(USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) - target_compile_definitions(torch_xpu PUBLIC CCL_ENABLE_SYCL) + # target_compile_definitions(torch_xpu PUBLIC CCL_ENABLE_SYCL) + set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp + PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_SYCL") endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index 4979c57384fcb4..48d2b3ed1bf69a 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -79,6 +79,7 @@ namespace { } IMPL_SEND(CPU) +IMPL_SEND(XPU) IMPL_SEND(CUDA) IMPL_SEND(PrivateUse1) @@ -94,6 +95,7 @@ IMPL_SEND(PrivateUse1) } IMPL_RECV(CPU) +IMPL_RECV(XPU) IMPL_RECV(CUDA) IMPL_RECV(PrivateUse1) @@ -108,6 +110,7 @@ IMPL_RECV(PrivateUse1) } 
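Note on the reduction mapping introduced later in this patch (the getXcclReduceOp helper in ProcessGroupXCCL.cpp): bool tensors are stored as uint8, so summing them across many ranks could overflow, and ReduceOp.SUM is therefore remapped to max, which over values in {0, 1} is exactly a logical OR. A small sketch of the observable behaviour, assuming the xccl process group from the earlier sketch:

    flag = torch.tensor([dist.get_rank() == 0], dtype=torch.bool, device="xpu")
    dist.all_reduce(flag, op=dist.ReduceOp.SUM)   # runs as max over {0, 1}, i.e. a logical OR
    # flag ends up True on every rank, with no uint8 overflow at any world size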
IMPL_RECV_ANY_SOURCE(CPU) +IMPL_RECV_ANY_SOURCE(XPU) IMPL_RECV_ANY_SOURCE(CUDA) IMPL_RECV_ANY_SOURCE(PrivateUse1) @@ -131,6 +134,7 @@ IMPL_RECV_ANY_SOURCE(PrivateUse1) } IMPL_REDUCE(CPU) +IMPL_REDUCE(XPU) IMPL_REDUCE(CUDA) IMPL_REDUCE(PrivateUse1) @@ -156,6 +160,7 @@ IMPL_REDUCE(PrivateUse1) } IMPL_BROADCAST(CPU) +IMPL_BROADCAST(XPU) IMPL_BROADCAST(CUDA) IMPL_BROADCAST(PrivateUse1) @@ -199,6 +204,7 @@ IMPL_ALLREDUCE(PrivateUse1) } IMPL_ALLREDUCE_COALESCED(CPU) +IMPL_ALLREDUCE_COALESCED(XPU) IMPL_ALLREDUCE_COALESCED(CUDA) IMPL_ALLREDUCE_COALESCED(PrivateUse1) @@ -223,6 +229,7 @@ IMPL_ALLREDUCE_COALESCED(PrivateUse1) // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast) IMPL_ALLGATHER(CPU) +IMPL_ALLGATHER(XPU) IMPL_ALLGATHER(CUDA) IMPL_ALLGATHER(PrivateUse1) @@ -243,6 +250,7 @@ IMPL_ALLGATHER(PrivateUse1) } IMPL__ALLGATHER_BASE(CPU) +IMPL__ALLGATHER_BASE(XPU) IMPL__ALLGATHER_BASE(CUDA) IMPL__ALLGATHER_BASE(PrivateUse1) @@ -259,6 +267,7 @@ IMPL__ALLGATHER_BASE(PrivateUse1) } IMPL_ALLGATHER_COALESCED(CPU) +IMPL_ALLGATHER_COALESCED(XPU) IMPL_ALLGATHER_COALESCED(CUDA) IMPL_ALLGATHER_COALESCED(PrivateUse1) @@ -274,6 +283,7 @@ IMPL_ALLGATHER_COALESCED(PrivateUse1) } IMPL_ALLGATHER_INTO_TENSOR_COALESCED(CPU) +IMPL_ALLGATHER_INTO_TENSOR_COALESCED(XPU) IMPL_ALLGATHER_INTO_TENSOR_COALESCED(CUDA) IMPL_ALLGATHER_INTO_TENSOR_COALESCED(PrivateUse1) @@ -297,6 +307,7 @@ IMPL_ALLGATHER_INTO_TENSOR_COALESCED(PrivateUse1) } IMPL_REDUCE_SCATTER(CPU) +IMPL_REDUCE_SCATTER(XPU) IMPL_REDUCE_SCATTER(CUDA) IMPL_REDUCE_SCATTER(PrivateUse1) @@ -321,6 +332,7 @@ IMPL_REDUCE_SCATTER(PrivateUse1) } IMPL__REDUCE_SCATTER_BASE(CPU) +IMPL__REDUCE_SCATTER_BASE(XPU) IMPL__REDUCE_SCATTER_BASE(CUDA) IMPL__REDUCE_SCATTER_BASE(PrivateUse1) @@ -342,6 +354,7 @@ IMPL__REDUCE_SCATTER_BASE(PrivateUse1) } IMPL_REDUCE_SCATTER_TENSOR_COALESCED(CPU) +IMPL_REDUCE_SCATTER_TENSOR_COALESCED(XPU) IMPL_REDUCE_SCATTER_TENSOR_COALESCED(CUDA) IMPL_REDUCE_SCATTER_TENSOR_COALESCED(PrivateUse1) @@ -361,6 +374,7 @@ IMPL_REDUCE_SCATTER_TENSOR_COALESCED(PrivateUse1) } IMPL_GATHER(CPU) +IMPL_GATHER(XPU) IMPL_GATHER(CUDA) IMPL_GATHER(PrivateUse1) @@ -383,6 +397,7 @@ IMPL_GATHER(PrivateUse1) } IMPL_SCATTER(CPU) +IMPL_SCATTER(XPU) IMPL_SCATTER(CUDA) IMPL_SCATTER(PrivateUse1) @@ -404,6 +419,7 @@ IMPL_SCATTER(PrivateUse1) } IMPL_ALLTOALL(CPU) +IMPL_ALLTOALL(XPU) IMPL_ALLTOALL(CUDA) IMPL_ALLTOALL(PrivateUse1) @@ -425,6 +441,7 @@ IMPL_ALLTOALL(PrivateUse1) } IMPL_ALLTOALL_BASE(CPU) +IMPL_ALLTOALL_BASE(XPU) IMPL_ALLTOALL_BASE(CUDA) IMPL_ALLTOALL_BASE(PrivateUse1) @@ -440,6 +457,7 @@ IMPL_ALLTOALL_BASE(PrivateUse1) } IMPL_BARRIER(CPU) +IMPL_BARRIER(XPU) IMPL_BARRIER(CUDA) IMPL_BARRIER(PrivateUse1) // NOLINTEND(cppcoreguidelines-pro-type-const-cast) @@ -492,6 +510,7 @@ namespace { #define REGISTER_C10D_OP(FUNC) \ REGISTER_C10D_OP1(FUNC, CPU) \ REGISTER_C10D_OP1(FUNC, CUDA) \ + REGISTER_C10D_OP1(FUNC, XPU) \ REGISTER_C10D_OP1(FUNC, PrivateUse1) // Now we start to register ops with the three device keys @@ -516,7 +535,6 @@ REGISTER_C10D_OP(alltoall_) REGISTER_C10D_OP(alltoall_base_) REGISTER_C10D_OP(barrier) -REGISTER_C10D_OP1(allreduce_, XPU) // The following ops are specialized, register them separately TORCH_LIBRARY_IMPL(c10d, CPU, m) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index cabdb9f61433bc..f6ef0ae0a6ebee 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -94,6 +94,39 @@ ccl::datatype getXcclDataType(at::ScalarType 
type) { type); return it->second; } + +ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { + try { + if (input.scalar_type() == at::kBool) { + if (reduceOp == ReduceOp::SUM) { + // For bool tensors, map sum to max, which both represent a bitwise or. + // This is to prevent overflow issues with sum, since we use uint8 to + // represent a bool (see xcclDatatypes mapping align with cuda). + return ccl::reduction::max; + } + } + return xcclOps.at(reduceOp); + } catch (const std::out_of_range&) { + switch (reduceOp) { + case ReduceOp::AVG: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp AVG with XCCL"); + break; + case ReduceOp::BAND: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BAND with XCCL"); + break; + case ReduceOp::BOR: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BOR with XCCL"); + break; + case ReduceOp::BXOR: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BXOR with XCCL"); + break; + default: + C10_THROW_ERROR(ValueError, "Unhandled ReduceOp"); + break; + } + } +} + } // namespace static std::mutex xcclCommDevIdxMapMutex; @@ -110,7 +143,8 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( } ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) - : Work(w.rank_, w.opType_), device_(w.device_), + : Work(w.rank_, w.opType_), + device_(w.device_), xcclEndEvent_(w.xcclEndEvent_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -142,10 +176,7 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( const std::vector& inputs, const std::vector& outputs) { auto r = c10::make_intrusive( - device, - rank, - opType, - std::optional>(inputs)); + device, rank, opType, std::optional>(inputs)); return r; } @@ -237,9 +268,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( std::make_shared>(std::move(outputs)); c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), stream); - + auto ccl_stream = ccl::create_stream(stream.queue()); - // auto ccl_stream = ccl::create_stream(); fn(input, output, attr, *comm, ccl_stream); @@ -290,13 +320,15 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( ccl::allreduce_attr attr, xcclComm_t& comm, ccl::stream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); ccl::event ret_evt; ret_evt = ccl::allreduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), - getXcclDataType(input.scalar_type()), - xcclOps.at(opts.reduceOp), + xcclDataType, + xcclReduceOp, comm, stream, attr); diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 829e07816589fc..bd74ca745ed644 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -8,9 +8,9 @@ #endif #ifdef USE_C10D_XCCL +#include #include #include -#include #include #include #include @@ -75,6 +75,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { protected: at::Device device_; std::shared_ptr xcclEndEvent_; + private: std::shared_ptr> outputs_; c10::intrusive_ptr future_; @@ -89,6 +90,11 @@ class TORCH_API ProcessGroupXCCL : public Backend { ~ProcessGroupXCCL() override; + static c10::intrusive_ptr createProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank = -1, + int size = -1); + const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); } @@ -124,13 +130,133 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - // 
c10::intrusive_ptr barrier( - // const BarrierOptions& opts = BarrierOptions()) override; + c10::intrusive_ptr allreduce_coalesced( + std::vector& tensors, + const AllreduceCoalescedOptions& opts = + AllreduceCoalescedOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_coalesced not implemented"); + } - static c10::intrusive_ptr createProcessGroupXCCL( - const c10::intrusive_ptr& store, - int rank = -1, - int size = -1); + c10::intrusive_ptr reduce( + std::vector& tensors, + const ReduceOptions& opts = ReduceOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::reduce not implemented"); + } + + c10::intrusive_ptr broadcast( + std::vector& tensors, + const BroadcastOptions& opts = BroadcastOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::broadcast not implemented"); + } + + c10::intrusive_ptr allreduce_sparse( + std::vector& tensors, + const AllreduceOptions& opts = AllreduceOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_sparse not implemented"); + } + + c10::intrusive_ptr allgather( + std::vector>& outputTensors, + std::vector& inputTensors, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allgather not implemented"); + } + + c10::intrusive_ptr _allgather_base( + at::Tensor& outputbuffer, + at::Tensor& inputbuffer, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::_allgather_base not implemented"); + } + + c10::intrusive_ptr allgather_coalesced( + std::vector>& outputTensorLists, + std::vector& inputTensors, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allgather_coalesced not implemented"); + } + + c10::intrusive_ptr allgather_into_tensor_coalesced( + std::vector& outputs, + std::vector& inputs, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK( + false, + "ProcessGroupXCCL::allgather_into_tensor_coalesced not implemented"); + } + + c10::intrusive_ptr reduce_scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::reduce_scatter not implemented"); + } + + c10::intrusive_ptr _reduce_scatter_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK( + false, "ProcessGroupXCCL::_reduce_scatter_base not implemented"); + } + + c10::intrusive_ptr reduce_scatter_tensor_coalesced( + std::vector& outputs, + std::vector& inputs, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK( + false, + "ProcessGroupXCCL::reduce_scatter_tensor_coalesced not implemented"); + } + + c10::intrusive_ptr barrier( + const BarrierOptions& opts = BarrierOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::barrier not implemented"); + } + + c10::intrusive_ptr alltoall_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + std::vector& outputSplitSizes, + std::vector& inputSplitSizes, + const AllToAllOptions& opts = AllToAllOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::alltoall_base not implemented"); + } + + c10::intrusive_ptr alltoall( + std::vector& outputTensors, + std::vector& inputTensors, + const AllToAllOptions& opts = AllToAllOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::alltoall not implemented"); + } + + c10::intrusive_ptr send( + std::vector& tensors, + int 
dstRank, + int tag) override { + TORCH_CHECK(false, "ProcessGroupXCCL::send not implemented"); + } + + c10::intrusive_ptr recv( + std::vector& tensors, + int srcRank, + int tag) override { + TORCH_CHECK(false, "ProcessGroupXCCL::recv not implemented"); + } + + c10::intrusive_ptr gather( + std::vector>& outputTensors, + std::vector& inputTensors, + const GatherOptions& opts = GatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::gather not implemented"); + } + + c10::intrusive_ptr scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ScatterOptions& opts = ScatterOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::scatter not implemented"); + } public: std::unordered_map xcclStreams_; @@ -140,7 +266,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr store_; std::mutex mutex_; }; - } // namespace c10d #endif // USE_C10D_XCCL From 5195f523342e7176ef5912ce17bd73598c13b8d6 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 9 Sep 2024 09:50:10 +0000 Subject: [PATCH 20/96] update --- caffe2/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index ae183e32d17e7d..a51b2938c0ff73 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1374,7 +1374,6 @@ if(USE_DISTRIBUTED) endif() if(USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) - # target_compile_definitions(torch_xpu PUBLIC CCL_ENABLE_SYCL) set_source_files_properties( ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_SYCL") From 2eb044620774692913cc083e58b53717dc22b004 Mon Sep 17 00:00:00 2001 From: "Han, Chao1" Date: Tue, 10 Sep 2024 17:19:54 +0800 Subject: [PATCH 21/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index f6ef0ae0a6ebee..4e0b7db3592093 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -309,9 +309,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( tensors.size() == 1, "Expecting one tensor only but got multiple"); auto tensor = tensors.back(); check_xpu_single_tensor(tensor); - if (opts.reduceOp == ReduceOp::AVG) { - TORCH_CHECK(false, "Cannot use ReduceOp AVG with XPU") - } return collective( tensor, tensor, From 0f6176270833f9f5866c491e6ed58f57822724a5 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 10 Sep 2024 02:36:59 +0000 Subject: [PATCH 22/96] update flag --- caffe2/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index a51b2938c0ff73..d44a8da210462f 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1376,7 +1376,7 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) set_source_files_properties( ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp - PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_SYCL") + PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_ZE;CCL_ENABLE_SYCL") endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") From 227e98decb633a108faaf50ab34641d446aa7774 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 10 Sep 2024 02:59:13 +0000 Subject: [PATCH 23/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index bd74ca745ed644..2f16df6450fe62 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -149,12 +149,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::broadcast not implemented"); } - c10::intrusive_ptr allreduce_sparse( - std::vector& tensors, - const AllreduceOptions& opts = AllreduceOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_sparse not implemented"); - } - c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, From df81919f64c2b2bf42ca732b51a372679035fc20 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 11 Sep 2024 04:43:01 +0000 Subject: [PATCH 24/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 4e0b7db3592093..790c02675b03bf 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -265,7 +265,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work = initWork(device, rank_, opType); work->outputs_ = - std::make_shared>(std::move(outputs)); + std::make_shared>(outputs); c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), stream); From b0c05928c607cf80b3883c935c57721abca935e5 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 11 Sep 2024 04:49:00 +0000 Subject: [PATCH 25/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 790c02675b03bf..e690cc1f57aa43 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -264,8 +264,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work = initWork(device, rank_, opType); - work->outputs_ = - std::make_shared>(outputs); + work->outputs_ = std::make_shared>(outputs); c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), stream); From 366d20849aeea8197ffc94cdb4851b054d3c2c07 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 11 Sep 2024 06:23:46 +0000 Subject: [PATCH 26/96] rm redundance code --- torch/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 5bca5ac72452ec..af678d11e7f325 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -165,7 +165,6 @@ if(USE_XPU) append_filelist("libtorch_python_xpu_sources" TORCH_PYTHON_SRCS) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_XPU) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::xpurt) endif() if(USE_CUDNN OR USE_ROCM) @@ -420,8 +419,6 @@ endif() target_compile_definitions(torch_python PRIVATE "-DTHP_BUILD_MAIN_LIB") target_link_libraries(torch_python PRIVATE ${TORCH_LIB} ${TORCH_PYTHON_LINK_LIBRARIES}) -target_link_libraries(torch_python PRIVATE torch::xpurt) -target_link_libraries(torch_python PRIVATE c10_xpu) target_compile_definitions(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS}) From 3530e43f74742f60ca2f121be920916e1e6a4e14 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 11 Sep 2024 09:57:00 +0000 Subject: [PATCH 27/96] enable timeout --- .../distributed/c10d/ProcessGroupXCCL.cpp | 80 +++++++++++++++++-- 
.../distributed/c10d/ProcessGroupXCCL.hpp | 42 +++++++--- 2 files changed, 102 insertions(+), 20 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e690cc1f57aa43..421336b4872a5a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -131,13 +131,16 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; +constexpr int64_t kSynchronizeBusyWaitMillis = 10; ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, OpType opType, const std::optional>& inputs) - : Work(rank, opType, "profilingTitle", inputs), device_(device) { + : Work(rank, opType, "profilingTitle", inputs), + device_(device), + workStartTime_(std::chrono::steady_clock::now()) { unsigned char enable_timing = 0; xcclEndEvent_ = std::make_shared(enable_timing); } @@ -145,26 +148,85 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) : Work(w.rank_, w.opType_), device_(w.device_), + blockingWait_(w.blockingWait_), + workStartTime_(w.workStartTime_), xcclEndEvent_(w.xcclEndEvent_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; -bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { - synchronize(); +bool ProcessGroupXCCL::WorkXCCL::checkTimeout( + std::optional timeout) { + auto currentTimepoint = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast( + currentTimepoint - workStartTime_); + std::chrono::milliseconds opTimeout = std::chrono::milliseconds(60000); + + auto workTimeout = timeout ? *timeout : opTimeout; + + if (timeElapsed < workTimeout) + return false; + return true; +} + +bool ProcessGroupXCCL::WorkXCCL::isCompleted() { + for (auto& ret : rets) { + bool flag; + try { + TORCH_CHECK(flag = ret.test()); + } catch (...) { + finishAWorkXCCLError(std::current_exception()); + return true; + } + if (!flag) { + return false; + } + } return true; } void ProcessGroupXCCL::WorkXCCL::synchronize() { + synchronizeInternal(kNoTimeout); +} + +void ProcessGroupXCCL::WorkXCCL::synchronizeStream() { auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); // Block the current stream on the XCCL stream xcclEndEvent_->block(currentStream); } -c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( +void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( + std::chrono::milliseconds timeout) { + synchronizeStream(); + + if (blockingWait_) { + while (!isCompleted()) { + bool timedOut = checkTimeout( + timeout == kNoTimeout ? 
std::nullopt : std::make_optional(timeout)); + if (timedOut) { + break; + } + std::this_thread::sleep_for( + std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); + } + } +} + +bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { + synchronizeInternal(timeout); + for (auto& event : rets) { + event.wait(); + } + rets.clear(); + return true; +} + +ProcessGroupXCCL::ProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, - int size) { - return c10::make_intrusive(store, rank, size); + int size) + : Backend(rank, size), store_(store) { + blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); + init(); } ProcessGroupXCCL::~ProcessGroupXCCL() = default; @@ -264,13 +326,14 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work = initWork(device, rank_, opType); - work->outputs_ = std::make_shared>(outputs); + work->outputs_ = + std::make_shared>(std::move(outputs)); c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), stream); auto ccl_stream = ccl::create_stream(stream.queue()); - fn(input, output, attr, *comm, ccl_stream); + work->addResult(fn(input, output, attr, *comm, ccl_stream)); work->xcclEndEvent_->record(stream); @@ -280,6 +343,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); + work->blockingWait_ = blockingWait_; return work; } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 2f16df6450fe62..7bb3a14d6e1446 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -31,6 +31,10 @@ #include namespace c10d { +static std::vector TORCH_XCCL_BLOCKING_WAIT = { + "TORCH_XCCL_BLOCKING_WAIT", + "XCCL_BLOCKING_WAIT"}; + using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; @@ -46,11 +50,13 @@ class TORCH_API ProcessGroupXCCL : public Backend { const std::optional>& inputs = std::nullopt); WorkXCCL(const WorkXCCL& w); ~WorkXCCL() override; - bool isCompleted() override { - TORCH_CHECK( - false, "ProcessGroupXCCL::WorkXCCL::isCompleted not implemented"); + + void addResult(ccl::event&& result) { + rets.push_back(std::move(result)); } + bool isCompleted() override; + bool isSuccess() const override { TORCH_CHECK( false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); @@ -62,6 +68,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { void synchronize() override; + void synchronizeStream(); + bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; c10::intrusive_ptr getFuture() override { @@ -72,29 +80,38 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); } + bool checkTimeout( + std::optional timeout = std::nullopt); + protected: at::Device device_; std::shared_ptr xcclEndEvent_; + bool blockingWait_ = false; + std::chrono::time_point workStartTime_; + std::vector rets; private: + void finishAWorkXCCLError(std::exception_ptr eptr) { + future_->setError(eptr); + finish(eptr); + } + void synchronizeInternal(std::chrono::milliseconds timeout); std::shared_ptr> outputs_; c10::intrusive_ptr future_; friend class ProcessGroupXCCL; }; - explicit ProcessGroupXCCL( + ProcessGroupXCCL(const c10::intrusive_ptr& store, int rank, int size); + + C10_DEPRECATED ProcessGroupXCCL( const 
c10::intrusive_ptr& store, int rank, - int size) - : Backend(rank, size), store_(store) {} + int size, + const std::string& groupName) + : ProcessGroupXCCL(store, rank, size) {} ~ProcessGroupXCCL() override; - static c10::intrusive_ptr createProcessGroupXCCL( - const c10::intrusive_ptr& store, - int rank = -1, - int size = -1); - const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); } @@ -252,13 +269,14 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::scatter not implemented"); } - public: + protected: std::unordered_map xcclStreams_; std::unordered_map> inInitializationCommMap_; std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; + bool blockingWait_ = false; }; } // namespace c10d From 485ae8b9015bec2afee97b1653d9362e440fd11c Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 01:02:50 +0000 Subject: [PATCH 28/96] add oneccl env --- .../distributed/c10d/ProcessGroupXCCL.cpp | 12 +++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 26 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 421336b4872a5a..e008669ca8ad79 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -227,6 +227,18 @@ ProcessGroupXCCL::ProcessGroupXCCL( : Backend(rank, size), store_(store) { blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); + + { + int local_rank = getXCCLEnvVar("LOCAL_RANK"); + int local_world_size = getXCCLEnvVar("LOCAL_WORLD_SIZE"); + if (local_rank == -1 || local_world_size == -1) { + local_rank = rank; + local_world_size = size; + } + setXCCLEnvVar("CCL_PROCESS_LAUNCHER", "none"); + setXCCLEnvVar("CCL_LOCAL_RANK", local_rank); + setXCCLEnvVar("CCL_LOCAL_SIZE", local_world_size); + } } ProcessGroupXCCL::~ProcessGroupXCCL() = default; diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 7bb3a14d6e1446..eca66a33922d55 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -31,6 +31,32 @@ #include namespace c10d { +namespace { +int getXCCLEnvVar(std::string envVarName) { + char* stringValue = std::getenv(envVarName.c_str()); + if (stringValue != nullptr) { + try { + int val = std::stoi(stringValue); + return val; + } catch (std::exception& e) { + TORCH_CHECK( + false, + "Invalid value for environment variable: " + std::string(envVarName)); + } + } else { + return -1; + } +} + +void setXCCLEnvVar(std::string envVarName, int val) { + setenv(envVarName.c_str(), std::to_string(val).c_str(), val); +} + +void setXCCLEnvVar(std::string envVarName, std::string val) { + setenv(envVarName.c_str(), val.c_str(), 1); +} +} // namespace + static std::vector TORCH_XCCL_BLOCKING_WAIT = { "TORCH_XCCL_BLOCKING_WAIT", "XCCL_BLOCKING_WAIT"}; From 0cfd224d34a357c0586ad0da4c0e19def4e36d47 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 01:06:22 +0000 Subject: [PATCH 29/96] update --- torch/CMakeLists.txt | 2 +- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index af678d11e7f325..9a91b26d54cfb4 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -163,7 +163,7 @@ endif() if(USE_XPU) include(${TORCH_ROOT}/cmake/public/xpu.cmake) 
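Note on the oneCCL environment handling added in the previous patch (add oneccl env): at construction time the process group disables oneCCL's own process launcher and forwards the per-node rank and size that launchers such as torchrun export, falling back to the group's global rank and size when LOCAL_RANK / LOCAL_WORLD_SIZE are absent. Roughly the following, written here as a Python sketch of that C++ logic (rank and world_size stand in for the group's values); relatedly, setting TORCH_XCCL_BLOCKING_WAIT (or XCCL_BLOCKING_WAIT) opts into the blocking wait path from the enable-timeout patch:

    import os

    local_rank = int(os.environ.get("LOCAL_RANK", rank))
    local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", world_size))
    os.environ["CCL_PROCESS_LAUNCHER"] = "none"
    os.environ["CCL_LOCAL_RANK"] = str(local_rank)
    os.environ["CCL_LOCAL_SIZE"] = str(local_world_size)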
append_filelist("libtorch_python_xpu_sources" TORCH_PYTHON_SRCS) - + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_XPU) endif() diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e008669ca8ad79..e550225e19cb79 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -260,7 +260,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( if (deviceKey.empty()) { C10_THROW_ERROR( DistBackendError, - "Not able to create/get the CCL Communicator since " + "Not able to create/get the XCCL Communicator since " "the devices are empty "); } From b99fd8cf26fd0dae68826be199d09f39ac2af01d Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 03:20:47 +0000 Subject: [PATCH 30/96] Add simple test --- test/distributed/test_c10d_xccl.py | 221 ++++++++++++++++++ test/run_test.py | 1 + torch/testing/_internal/common_distributed.py | 6 + 3 files changed, 228 insertions(+) create mode 100644 test/distributed/test_c10d_xccl.py diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py new file mode 100644 index 00000000000000..33a2f196c3b5d1 --- /dev/null +++ b/test/distributed/test_c10d_xccl.py @@ -0,0 +1,221 @@ +# Owner(s): ["oncall: distributed"] + +import copy +import logging +import math +import operator +import os +import random +import sys +import tempfile +from functools import reduce +from unittest import mock, SkipTest + +import torch +import torch.distributed as c10d + + +if not c10d.is_available() or not c10d.is_xccl_available(): + print("c10d XCCL not available, skipping tests", file=sys.stderr) + sys.exit(0) + +import test_c10d_common + +import torch.distributed as dist +import torch.nn.functional as F +import torch.testing._internal.common_utils as common +from torch import nn +from torch.nn.parallel import DistributedDataParallel +from torch.testing._internal.common_distributed import ( + MultiProcessTestCase, + requires_xccl, +) +from torch.testing._internal.common_utils import ( + retry_on_connect_failures, + run_tests, + TestCase, +) + +def simple_reduce_tests(rank, world_size): + tests = [ + ( + c10d.ReduceOp.SUM, + torch.tensor([rank + 1.0]), + torch.tensor([float(world_size * (world_size + 1) / 2)]), + ), + ( + c10d.ReduceOp.PRODUCT, + torch.tensor([rank + 1.0]), + torch.tensor([float(math.factorial(world_size))]), + ), + ( + c10d.ReduceOp.MIN, + torch.tensor([rank + 1.0]), + torch.tensor([1.0]), + ), + ( + c10d.ReduceOp.MAX, + torch.tensor([rank + 1.0]), + torch.tensor([world_size]), + ), + ] + + return tests + + +class RendezvousEnvTest(TestCase): + @retry_on_connect_failures + @requires_xccl() + def test_common_errors(self): + vars = { + "WORLD_SIZE": "1", + "RANK": "0", + "MASTER_ADDR": "127.0.0.1", + "MASTER_PORT": str(common.find_free_port()), + } + + class Env: + def __init__(self, vars): + self.env_patcher = mock.patch.dict(os.environ, vars, clear=True) + + def __enter__(self): + self.env_patcher.start() + + def __exit__(self, type, value, traceback): + self.env_patcher.stop() + + def without(d, key): + d = d.copy() + d.pop(key) + return d + + def withouts(d, keys): + d = d.copy() + for key in keys: + d.pop(key) + return d + + with Env(without(vars, "WORLD_SIZE")): + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + with self.assertRaisesRegex(ValueError, "WORLD_SIZE expected"): + gen = c10d.rendezvous("env://") + next(gen) + c10d.init_process_group(backend="xccl", world_size=1) + self.assertEqual(c10d.get_rank(), 0) 
+ self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(without(vars, "RANK")): + self.assertEqual(None, os.environ.get("RANK")) + with self.assertRaisesRegex(ValueError, "RANK expected"): + gen = c10d.rendezvous("env://") + next(gen) + c10d.init_process_group(backend="xccl", rank=0) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): + self.assertEqual(None, os.environ.get("RANK")) + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + c10d.init_process_group(backend="xccl", rank=0, world_size=1) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(vars): + c10d.init_process_group(backend="xccl") + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(without(vars, "MASTER_ADDR")): + self.assertEqual(None, os.environ.get("MASTER_ADDR")) + with self.assertRaisesRegex(ValueError, "MASTER_ADDR expected"): + gen = c10d.rendezvous("env://") + next(gen) + + with Env(without(vars, "MASTER_PORT")): + self.assertEqual(None, os.environ.get("MASTER_PORT")) + with self.assertRaisesRegex(ValueError, "MASTER_PORT expected"): + gen = c10d.rendezvous("env://") + next(gen) + + with Env(without(vars, "WORLD_SIZE")): + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + gen = c10d.rendezvous(f"env://?world_size={1}") + _, _, size = next(gen) + self.assertEqual(size, 1) + + with Env(without(vars, "RANK")): + self.assertEqual(None, os.environ.get("RANK")) + gen = c10d.rendezvous(f"env://?rank={0}") + _, rank, _ = next(gen) + self.assertEqual(rank, 0) + + with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): + self.assertEqual(None, os.environ.get("RANK")) + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + gen = c10d.rendezvous(f"env://?rank={0}&world_size={1}") + _, rank, size = next(gen) + self.assertEqual(rank, 0) + self.assertEqual(size, 1) + +class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): + @requires_xccl() + @retry_on_connect_failures + def test_default_store_timeout_nccl(self): + self._test_default_store_timeout("xccl") + +class ProcessGroupXCCLTest(MultiProcessTestCase): + def _create_process_group_xccl(self): + store = c10d.FileStore(self.file_name, self.world_size) + return c10d.ProcessGroupXCCL(store, self.rank, self.world_size) + + def setUp(self): + super().setUp() + self._spawn_processes() + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + def _test_allreduce_basics(self, fn): + pg = self._create_process_group_xccl() + device = torch.device("xpu:" + str(self.rank)) + # Single input tests + tests = simple_reduce_tests(self.rank, self.world_size) + for op, input, expected in tests: + opts = c10d.AllreduceOptions() + opts.reduceOp = op + tensor = fn(input.to(device)) + fut = pg.allreduce([tensor], opts).get_future() + fut.wait() + result = fut.value() + self.assertEqual(expected, result[0], exact_dtype=False) + + x = fn(torch.tensor([self.rank + 1.0], device = device)) + fut = pg.allreduce(x).get_future() + fut.wait() + result = fut.value() + self.assertEqual( + torch.tensor([float(self.world_size * (self.world_size + 1) / 2)]), + result[0], + ) + + @requires_xccl() + def test_allreduce_basics(self): + self._test_allreduce_basics(lambda t: t.clone()) + + + +if __name__ == "__main__": + assert ( + not torch.xpu._initialized + 
), "test_distributed must not have initialized XPU context on main process" + + run_tests() + diff --git a/test/run_test.py b/test/run_test.py index 80a724e129a7a2..02a37ffee07375 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -1105,6 +1105,7 @@ def run_ci_sanity_check(test: ShardedTest, test_directory, options): "distributed/test_c10d_nccl": run_test_with_subprocess, "distributed/test_c10d_gloo": run_test_with_subprocess, "distributed/test_c10d_ucc": run_test_with_subprocess, + "distributed/test_c10d_xccl": run_test_with_subprocess, "distributed/test_c10d_common": run_test_with_subprocess, "distributed/test_c10d_spawn_gloo": run_test_with_subprocess, "distributed/test_c10d_spawn_nccl": run_test_with_subprocess, diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index d59102232f7db7..ff83bc8ab66666 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -320,6 +320,12 @@ def requires_nccl(): "c10d was not compiled with the NCCL backend", ) +def requires_xccl(): + return skip_but_pass_in_sandcastle_if( + not c10d.is_xccl_available(), + "c10d was not compiled with the XCCL backend", + ) + def requires_ucc(): return skip_but_pass_in_sandcastle_if( not c10d.is_ucc_available(), From dc41d6adf4b831029432c9cb9f10eacbedd85278 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 03:27:54 +0000 Subject: [PATCH 31/96] update --- test/run_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index 02a37ffee07375..80a724e129a7a2 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -1105,7 +1105,6 @@ def run_ci_sanity_check(test: ShardedTest, test_directory, options): "distributed/test_c10d_nccl": run_test_with_subprocess, "distributed/test_c10d_gloo": run_test_with_subprocess, "distributed/test_c10d_ucc": run_test_with_subprocess, - "distributed/test_c10d_xccl": run_test_with_subprocess, "distributed/test_c10d_common": run_test_with_subprocess, "distributed/test_c10d_spawn_gloo": run_test_with_subprocess, "distributed/test_c10d_spawn_nccl": run_test_with_subprocess, From 4c3f49f2cffcab2718f5169d52f42d7b6ee36f0d Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 10 Sep 2024 05:46:06 +0000 Subject: [PATCH 32/96] enable coalese --- .../distributed/c10d/ProcessGroupXCCL.cpp | 137 ++++++++++++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 25 +++- 2 files changed, 159 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e550225e19cb79..3c73e7547b50aa 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -26,6 +26,22 @@ namespace c10d { namespace { + +// wait nonblocking implement +AutoXcclGroup::AutoXcclGroup() { + comm_ = nullptr; + ccl::group_start(); +} + +AutoNcclGroup::AutoNcclGroup(xcclComm_t comm) { + comm_ = comm; + ccl::group_start(); +} + +AutoNcclGroup::~AutoNcclGroup() noexcept(false) { + ccl::group_end(); +} + std::map xcclOps = { {ReduceOp::MIN, ccl::reduction::min}, {ReduceOp::MAX, ccl::reduction::max}, @@ -85,6 +101,34 @@ void check_xpu_single_tensor(const at::Tensor& tensor) { } } +int64_t check_xpu_tensors_same_device(const std::vector& tensors) { + if (tensors.size() == 0) { + C10_THROW_ERROR(ValueError, "Tensor list must be nonempty"); + } + + const auto& first = tensors.front(); + + int64_t total_numel = 0; + for (const auto& t : tensors) { + if 
(!t.is_xpu() || t.is_sparse()) { + C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + } + if (t.scalar_type() != first.scalar_type()) { + C10_THROW_ERROR(TypeError, "Tensors must have identical type"); + } + if (!t.is_non_overlapping_and_dense()) { + C10_THROW_ERROR(ValueError, "Tensors must be non-overlapping and dense"); + } + TORCH_CHECK_WITH( + ValueError, + t.get_device() == tensors[0].get_device(), + "Expected list of tensors on the same device"); + total_numel += t.numel(); + } + + return total_numel; +} + ccl::datatype getXcclDataType(at::ScalarType type) { auto it = xcclDatatypes.find(type); TORCH_CHECK_WITH( @@ -133,6 +177,9 @@ static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; constexpr int64_t kSynchronizeBusyWaitMillis = 10; +// Before implementing send/recv, the xcclActiveGroupCounter_ variable has no effect. +thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; + ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, @@ -314,6 +361,16 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( return it->second; } +void ProcessGroupXCCL::groupStart() { + ccl::group_start(); + ++xcclActiveGroupCounter_; +} + +void ProcessGroupXCCL::groupEnd() { + ccl::group_end(); + --xcclActiveGroupCounter_; +} + template c10::intrusive_ptr ProcessGroupXCCL::collective( at::Tensor& input, @@ -377,6 +434,53 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( opType); } +template +c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( + std::vector& inputs, + std::vector& outputs, + Fn fn, + OpType opType) { + + using traits = function_traits; + using attr_t = typename traits::template arg<2>::type; + attr_t attr = ccl::create_operation_attr(); + + auto device = inputs[0].device(); + const auto key = std::to_string(device.index()); + auto comm = getXCCLComm(key, device); + + auto stream = xcclStreams_.at(key); + auto ccl_stream = ccl::create_stream(stream.queue()); + + c10::intrusive_ptr work; + + work = initWork(device, rank_, opType); + + work->outputs_ = + std::make_shared>(std::move(outputs)); + + { + AutoXcclGroup xccl_group_guard(comm); + for (const auto i : c10::irange(inputs.size())) { + c10::xpu::XPUCachingAllocator::recordStream( + inputs[i].storage().data_ptr(), stream); + fn(inputs[i], outputs[i], attr, *comm, ccl_stream); + } + } + + work->xcclEndEvent_->record(stream); + + std::vector streams = {stream.unwrap()}; + c10::MultiStreamGuard streamGuard(streams); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + work->future_->markCompleted(at::IValue(*work->outputs_)); + + return work; + +} + c10::intrusive_ptr ProcessGroupXCCL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { @@ -409,6 +513,39 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( OpType::ALLREDUCE); } +c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( + std::vector& tensors, + const AllreduceCoalescedOptions& opts = + AllreduceCoalescedOptions()) { + check_xpu_tensors_same_device(tensors); + TORCH_CHECK( + !isFloat8Type(tensors.back().scalar_type()), + "Float8 dtypes are not currenlty supported for XCCL reductions"); + + return collectiveCoalesced( + tensors, + tensors, + [&](at::Tensor& input, + at::Tensor& output, + xcclComm_t& comm, + ccl::stream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::allreduce( + 
input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + xcclReduceOp, + comm, + stream, + attr); + return ret_evt; + }, + OpType::COALESCED); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index eca66a33922d55..6fa066e83b976c 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -65,6 +65,15 @@ using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; +namespace { +struct AutoXcclGroup { + AutoXcclGroup(); + AutoXcclGroup(xcclComm_t comm); + ~AutoXcclGroup() noexcept(false); + xcclComm_t comm_; +}; +} // namespace + class TORCH_API ProcessGroupXCCL : public Backend { public: class WorkXCCL : public Work { @@ -169,6 +178,13 @@ class TORCH_API ProcessGroupXCCL : public Backend { PostProcess post, OpType opType); + template + c10::intrusive_ptr collectiveCoalesced( + std::vector& input, + std::vector& output, + Fn fn, + OpType opType); + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; @@ -176,9 +192,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = - AllreduceCoalescedOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_coalesced not implemented"); - } + AllreduceCoalescedOptions()) override; c10::intrusive_ptr reduce( std::vector& tensors, @@ -281,6 +295,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::recv not implemented"); } + void groupStart(); + + void groupEnd(); + c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, @@ -303,6 +321,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr store_; std::mutex mutex_; bool blockingWait_ = false; + static thread_local uint64_t xcclActiveGroupCounter_; }; } // namespace c10d From afa2adc754130feedc55440500a5bc413c42965c Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 11 Sep 2024 05:34:46 +0000 Subject: [PATCH 33/96] Support broadcast --- .../distributed/c10d/ProcessGroupXCCL.cpp | 36 +++++++++++++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 3c73e7547b50aa..74a102ddf0ad3a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -546,6 +546,42 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( OpType::COALESCED); } +c10::intrusive_ptr ProcessGroupXCCL::broadcast( + std::vector& tensors, + const BroadcastOptions& opts) { + TORCH_CHECK( + tensors.size() == 1, "Expecting one tensor only but got multiple"); + auto tensor = tensors.back(); + if (tensor.is_complex()) { + tensor = at::view_as_real(tensor); + } + check_xpu_single_tensor(tensor); + + const auto root = opts.rootRank + opts.rootTensor; + + return collective( + tensor, + tensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::broadcast_attr attr, + xcclComm_t& comm, + ccl::stream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type()); + ccl::event ret_evt; + ret_evt = ccl::broadcast( + input.data_ptr(), + (size_t)input.numel(), + xcclDataType, + root, + comm, + 
stream, + attr); + return ret_evt; + }, + OpType::BROADCAST); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6fa066e83b976c..75f2d944bf72a7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -202,9 +202,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr broadcast( std::vector& tensors, - const BroadcastOptions& opts = BroadcastOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::broadcast not implemented"); - } + const BroadcastOptions& opts = BroadcastOptions()) override; c10::intrusive_ptr allgather( std::vector>& outputTensors, From 8efb5d0d397a8171f04347570aec4b6d2d5a810b Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 04:52:38 +0000 Subject: [PATCH 34/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 74a102ddf0ad3a..ac36fddf80ac4a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -457,14 +457,14 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( work = initWork(device, rank_, opType); work->outputs_ = - std::make_shared>(std::move(outputs)); + std::make_shared>(outputs); { AutoXcclGroup xccl_group_guard(comm); for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( inputs[i].storage().data_ptr(), stream); - fn(inputs[i], outputs[i], attr, *comm, ccl_stream); + work->addResult(fn(inputs[i], outputs[i], attr, *comm, ccl_stream)); } } @@ -476,6 +476,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); + work->blockingWait_ = blockingWait_; return work; From e85c26816e3dacf7244cc0d4f5abe1914f79fe66 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 04:54:33 +0000 Subject: [PATCH 35/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 5 ----- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 2 -- 2 files changed, 7 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ac36fddf80ac4a..ce482a97952c34 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -33,11 +33,6 @@ AutoXcclGroup::AutoXcclGroup() { ccl::group_start(); } -AutoNcclGroup::AutoNcclGroup(xcclComm_t comm) { - comm_ = comm; - ccl::group_start(); -} - AutoNcclGroup::~AutoNcclGroup() noexcept(false) { ccl::group_end(); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 75f2d944bf72a7..25f3a1653a0c45 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -68,9 +68,7 @@ constexpr const char* XCCL_BACKEND_NAME = "xccl"; namespace { struct AutoXcclGroup { AutoXcclGroup(); - AutoXcclGroup(xcclComm_t comm); ~AutoXcclGroup() noexcept(false); - xcclComm_t comm_; }; } // namespace From 7488dbd780dd703f39a49e81a8d488040fab4572 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 05:01:19 +0000 Subject: [PATCH 36/96] update --- 
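Reviewer note: a minimal usage sketch of the collectives wired up in the patches above (allreduce, allreduce_coalesced, broadcast). This is only an illustration, not part of any commit; it assumes a torchrun-style launch that sets RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT, and that this build was configured with USE_XCCL/USE_C10D_XCCL so the "xccl" backend is registered for XPU tensors.

    # usage sketch only -- assumes torchrun provides the env:// rendezvous variables
    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="xccl")
    rank = dist.get_rank()
    device = torch.device(f"xpu:{rank}")

    t = torch.ones(4096, device=device)
    dist.all_reduce(t)                 # dispatches to ProcessGroupXCCL::allreduce
    dist.broadcast(t, src=0)           # dispatches to ProcessGroupXCCL::broadcast

    # Coalesced path: several allreduces issued inside one group_start()/group_end().
    bufs = [torch.full((1024,), float(rank + 1), device=device) for _ in range(3)]
    dist.all_reduce_coalesced(bufs)    # dispatches to ProcessGroupXCCL::allreduce_coalesced

    dist.destroy_process_group()

The coalesced call above is what exercises the AutoXcclGroup guard introduced earlier in this series; each tensor in the list is reduced inside a single oneCCL group so the device sees one fused submission instead of three independent collectives.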
.../distributed/c10d/ProcessGroupXCCL.cpp | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ce482a97952c34..ea5220e3dac77b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -29,11 +29,23 @@ namespace { // wait nonblocking implement AutoXcclGroup::AutoXcclGroup() { +<<<<<<< HEAD + ccl::group_start(); +} + +AutoXcclGroup::AutoXcclGroup(std::shared_ptr comm) { + comm_ = std::move(comm); + ccl::group_start(); +} + +AutoXcclGroup::~AutoXcclGroup() noexcept(false) { +======= comm_ = nullptr; ccl::group_start(); } AutoNcclGroup::~AutoNcclGroup() noexcept(false) { +>>>>>>> e85c26816e3dacf7244cc0d4f5abe1914f79fe66 ccl::group_end(); } @@ -175,6 +187,10 @@ constexpr int64_t kSynchronizeBusyWaitMillis = 10; // Before implementing send/recv, the xcclActiveGroupCounter_ variable has no effect. thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; +// Before implementing send/recv, the xcclActiveGroupCounter_ variable has no +// effect. +thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; + ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, @@ -435,7 +451,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( std::vector& outputs, Fn fn, OpType opType) { - using traits = function_traits; using attr_t = typename traits::template arg<2>::type; attr_t attr = ccl::create_operation_attr(); @@ -511,8 +526,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( std::vector& tensors, - const AllreduceCoalescedOptions& opts = - AllreduceCoalescedOptions()) { + const AllreduceCoalescedOptions& opts) { check_xpu_tensors_same_device(tensors); TORCH_CHECK( !isFloat8Type(tensors.back().scalar_type()), @@ -523,6 +537,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( tensors, [&](at::Tensor& input, at::Tensor& output, + ccl::allreduce_attr attr, xcclComm_t& comm, ccl::stream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); From e5d6f3728c58fbf62ef9f5f864041730455df2d3 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 05:09:16 +0000 Subject: [PATCH 37/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 24 +++---------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ea5220e3dac77b..03ba5824baf2e5 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -29,23 +29,10 @@ namespace { // wait nonblocking implement AutoXcclGroup::AutoXcclGroup() { -<<<<<<< HEAD - ccl::group_start(); -} - -AutoXcclGroup::AutoXcclGroup(std::shared_ptr comm) { - comm_ = std::move(comm); ccl::group_start(); } AutoXcclGroup::~AutoXcclGroup() noexcept(false) { -======= - comm_ = nullptr; - ccl::group_start(); -} - -AutoNcclGroup::~AutoNcclGroup() noexcept(false) { ->>>>>>> e85c26816e3dacf7244cc0d4f5abe1914f79fe66 ccl::group_end(); } @@ -184,9 +171,6 @@ static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; constexpr int64_t kSynchronizeBusyWaitMillis = 10; -// Before implementing send/recv, the xcclActiveGroupCounter_ variable has no effect. 
-thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; - // Before implementing send/recv, the xcclActiveGroupCounter_ variable has no // effect. thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; @@ -466,11 +450,10 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( work = initWork(device, rank_, opType); - work->outputs_ = - std::make_shared>(outputs); - + work->outputs_ = std::make_shared>(outputs); + { - AutoXcclGroup xccl_group_guard(comm); + AutoXcclGroup xccl_group_guard; for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( inputs[i].storage().data_ptr(), stream); @@ -489,7 +472,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( work->blockingWait_ = blockingWait_; return work; - } c10::intrusive_ptr ProcessGroupXCCL::allreduce( From 0da5e777f3792331f373918ff54514acab824ee9 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 09:21:05 +0000 Subject: [PATCH 38/96] add allgather --- .../distributed/c10d/ProcessGroupXCCL.cpp | 259 ++++++++++++++++-- .../distributed/c10d/ProcessGroupXCCL.hpp | 39 ++- 2 files changed, 257 insertions(+), 41 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 03ba5824baf2e5..79d67eb8fdb809 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,15 @@ XCCL_KVS get_kvs(int rank, c10d::Store& store) { return kvs; } +bool check_same_size(const std::vector& input_tensors) { + for (const auto& input_tensor : input_tensors) { + if (!input_tensors[0].is_same_size(input_tensor)) { + return false; + } + } + return true; +} + void check_xpu_single_tensor(const at::Tensor& tensor) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); @@ -190,9 +200,9 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) : Work(w.rank_, w.opType_), device_(w.device_), + xcclEndEvent_(w.xcclEndEvent_), blockingWait_(w.blockingWait_), - workStartTime_(w.workStartTime_), - xcclEndEvent_(w.xcclEndEvent_) {} + workStartTime_(w.workStartTime_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -366,10 +376,17 @@ void ProcessGroupXCCL::groupEnd() { --xcclActiveGroupCounter_; } -template +// align with good design single-device style, input_t and output_t due to +// allgatherv need vector output +template < + typename Fn, + typename input_t, + typename output_t, + typename PreProcess, + typename PostProcess> c10::intrusive_ptr ProcessGroupXCCL::collective( - at::Tensor& input, - at::Tensor& output, + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, @@ -378,26 +395,50 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( using attr_t = typename traits::template arg<2>::type; attr_t attr = ccl::create_operation_attr(); - auto device = input.device(); + auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); auto stream = xcclStreams_.at(key); - std::vector inputs{input}; - std::vector outputs{output}; c10::intrusive_ptr work; work = initWork(device, rank_, opType); - work->outputs_ = - std::make_shared>(std::move(outputs)); - c10::xpu::XPUCachingAllocator::recordStream( - input.storage().data_ptr(), stream); + { // Do we need to store the result of the operation? 
+ std::variant, std::vector>> + outputs; + std::visit( + [&work](auto&& outputData) { + using T = std::decay_t; + + if constexpr (std::is_same_v>) { + work->outputs_ = std::make_shared>( + std::move(outputData)); + } else if constexpr (std::is_same_v< + T, + std::vector>>) { + std::vector flattened; + for (auto& vec : outputData) { + flattened.insert(flattened.end(), vec.begin(), vec.end()); + } + work->outputs_ = + std::make_shared>(std::move(flattened)); + } + }, + outputs); + } + + pre(stream, work); + + for (const auto& input : inputs) { + c10::xpu::XPUCachingAllocator::recordStream( + input.storage().data_ptr(), stream); + } - auto ccl_stream = ccl::create_stream(stream.queue()); + work->addResult(fn(inputs[0], outputs[0], attr, *comm, stream)); - work->addResult(fn(input, output, attr, *comm, ccl_stream)); + post(stream, work); work->xcclEndEvent_->record(stream); @@ -412,20 +453,38 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( return work; } -template +template < + typename Fn, + typename input_t, + typename output_t, + typename PreProcess, + typename PostProcess> +c10::intrusive_ptr ProcessGroupXCCL::collective( + input_t& input, + output_t& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType) { + auto inputs = std::vector{input}; + auto outputs = std::vector{output}; + return collective(inputs, outputs, fn, pre, post, opType); +} + +template c10::intrusive_ptr ProcessGroupXCCL::collective( - at::Tensor& input, - at::Tensor& output, + input_t& input, + output_t& output, Fn fn, OpType opType) { return collective( input, output, fn, - [](at::xpu::XPUStream&, - c10::intrusive_ptr& work) {}, - [](at::xpu::XPUStream&, - c10::intrusive_ptr& work) {}, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, opType); } @@ -444,7 +503,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( auto comm = getXCCLComm(key, device); auto stream = xcclStreams_.at(key); - auto ccl_stream = ccl::create_stream(stream.queue()); c10::intrusive_ptr work; @@ -457,7 +515,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( inputs[i].storage().data_ptr(), stream); - work->addResult(fn(inputs[i], outputs[i], attr, *comm, ccl_stream)); + work->addResult(fn(inputs[i], outputs[i], attr, *comm, stream)); } } @@ -488,7 +546,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( at::Tensor& output, ccl::allreduce_attr attr, xcclComm_t& comm, - ccl::stream& stream) { + at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); ccl::event ret_evt; @@ -499,7 +557,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( xcclDataType, xcclReduceOp, comm, - stream, + ccl::create_stream(stream.queue()), attr); return ret_evt; }, @@ -521,7 +579,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( at::Tensor& output, ccl::allreduce_attr attr, xcclComm_t& comm, - ccl::stream& stream) { + at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); ccl::event ret_evt; @@ -532,7 +590,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( xcclDataType, xcclReduceOp, comm, - stream, + ccl::create_stream(stream.queue()), attr); return ret_evt; }, @@ -559,7 +617,7 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( at::Tensor& output, ccl::broadcast_attr attr, 
xcclComm_t& comm, - ccl::stream& stream) { + at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); ccl::event ret_evt; ret_evt = ccl::broadcast( @@ -568,13 +626,156 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( xcclDataType, root, comm, - stream, + ccl::create_stream(stream.queue()), attr); return ret_evt; }, OpType::BROADCAST); } +c10::intrusive_ptr ProcessGroupXCCL::allgather( + std::vector>& outputTensors, + std::vector& inputTensors, + const AllgatherOptions& opts) { + TORCH_CHECK( + inputTensors.size() == 1, "Expecting one tensor only but got multiple"); + // @lint-ignore CLANGTIDY + auto inputTensor = inputTensors.back(); + check_xpu_single_tensor(inputTensor); + // @lint-ignore CLANGTIDY + std::vector& outputTensors_ = outputTensors.back(); + + bool same_size = check_same_size(outputTensors_); + if (same_size) { + // Flatten a vector of tensors into a single, stacked tensor. + at::Tensor outputFlattened = newLikeFlat(outputTensors_); + + return collective( + inputTensor, + outputFlattened, + [&](at::Tensor& input, + at::Tensor& output, + ccl::allgather_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(input.scalar_type()); + ccl::event ret_evt; + + ret_evt = ccl::allgather( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + [](at::xpu::XPUStream&, + c10::intrusive_ptr& work) {}, + [&](at::xpu::XPUStream& Stream, + c10::intrusive_ptr& work) { + // Copy the flattened output tensors to the outputs. + c10::StreamGuard guard(Stream); + for (const auto j : c10::irange(outputTensors_.size())) { + c10::xpu::XPUCachingAllocator::recordStream( + outputTensors_[j].storage().data_ptr(), Stream); + outputTensors_[j].copy_(outputFlattened[j], true); + } + }, + OpType::ALLGATHER); + } else { + // xccl implemented allgatherv, so broadcast_oop not needed + return collective( + inputTensor, + outputTensors_, + [=](at::Tensor& input, + const std::vector& outputs, + ccl::allgatherv_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + ccl::event ret_evt; + auto xcclDataType = getXcclDataType(input.scalar_type()); + + std::vector recvCounts(outputs.size(), 0); + std::transform( + outputs.begin(), + outputs.end(), + recvCounts.begin(), + [](const at::Tensor& t) { return t.numel(); }); + + TORCH_CHECK( + (size_t)input.numel() == recvCounts[rank_], + "allgather: send and recv count doesn't match"); + + std::vector recvBufs(outputs.size(), nullptr); + std::transform( + outputs.begin(), + outputs.end(), + recvBufs.begin(), + [](const at::Tensor& t) { return t.data_ptr(); }); + + ret_evt = ccl::allgatherv( + input.data_ptr(), + (size_t)input.numel(), + recvBufs, + recvCounts, + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + c10d::OpType::ALLGATHER); + } +} + +c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( + at::Tensor& output_tensor, + at::Tensor& input_tensor, + const AllgatherOptions& opts) { + check_xpu_single_tensor(input_tensor); + check_xpu_single_tensor(output_tensor); + + if (input_tensor.dtype() != output_tensor.dtype()) { + C10_THROW_ERROR( + TypeError, "output tensor must have the same type as input tensor"); + } + + if (input_tensor.numel() * size_ != output_tensor.numel()) { + C10_THROW_ERROR( + ValueError, + "output tensor size must be equal to 
world_size times input tensor size"); + } + + return collective( + input_tensor, + output_tensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::allgather_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(input.scalar_type()); + ccl::event ret_evt; + + ret_evt = ccl::allgather( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + OpType::_ALLGATHER_BASE); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 25f3a1653a0c45..71d6a7ec653152 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -160,17 +160,36 @@ class TORCH_API ProcessGroupXCCL : public Backend { const std::vector& inputs = {}, const std::vector& outputs = {}); - template + template c10::intrusive_ptr collective( - at::Tensor& input, - at::Tensor& output, + input_t& input, + output_t& output, Fn fn, OpType opType); - template + template < + typename Fn, + typename input_t, + typename output_t, + typename PreProcess, + typename PostProcess> c10::intrusive_ptr collective( - at::Tensor& input, - at::Tensor& output, + input_t& input, + output_t& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType); + + template < + typename Fn, + typename input_t, + typename output_t, + typename PreProcess, + typename PostProcess> + c10::intrusive_ptr collective( + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, @@ -205,16 +224,12 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allgather not implemented"); - } + const AllgatherOptions& opts = AllgatherOptions()) override; c10::intrusive_ptr _allgather_base( at::Tensor& outputbuffer, at::Tensor& inputbuffer, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::_allgather_base not implemented"); - } + const AllgatherOptions& opts = AllgatherOptions()) override; c10::intrusive_ptr allgather_coalesced( std::vector>& outputTensorLists, From 0ad5677130743821757939de262c58d934afd19c Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 02:43:53 +0000 Subject: [PATCH 39/96] support allgather_into_tensor_coalesced --- .../distributed/c10d/ProcessGroupXCCL.cpp | 28 ++++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 6 +--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 79d67eb8fdb809..e78bc4e49871bd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -762,7 +762,6 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( output.storage().data_ptr(), stream); auto xcclDataType = getXcclDataType(input.scalar_type()); ccl::event ret_evt; - ret_evt = ccl::allgather( input.data_ptr(), output.data_ptr(), @@ -776,6 +775,33 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( OpType::_ALLGATHER_BASE); } +c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( + 
std::vector& outputs, + std::vector& inputs, + const AllgatherOptions& opts) { + return collectiveCoalesced( + inputs, + outputs, + [&](at::Tensor& input, + at::Tensor& output, + ccl::allgather_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type()); + ccl::event ret_evt; + ret_evt = ccl::allgather( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + OpType::COALESCED); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 71d6a7ec653152..94ee71ab0190cb 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -241,11 +241,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr allgather_into_tensor_coalesced( std::vector& outputs, std::vector& inputs, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK( - false, - "ProcessGroupXCCL::allgather_into_tensor_coalesced not implemented"); - } + const AllgatherOptions& opts = AllgatherOptions()) override; c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, From 009e334af7ee713d015907c1103027282e74f3ef Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 05:30:33 +0000 Subject: [PATCH 40/96] support reduce_scatter --- .../distributed/c10d/ProcessGroupXCCL.cpp | 198 +++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 18 +- 2 files changed, 206 insertions(+), 10 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e78bc4e49871bd..4792ba86682f03 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -272,6 +272,13 @@ bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { return true; } +constexpr const char* MULTI_DEVICE_ERROR_MSG = + "Expecting one tensor only but got multiple. You are probably using multiple " + "devices under one thread. The support for such usage has been deprecated. " + "For details, please refer to " + "https://pytorch.org/docs/stable/distributed.html#multi-gpu-collective-functions. " + "ProcessGroupXCCL continues supporting multi-process and multi-thread modes."; + ProcessGroupXCCL::ProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, @@ -376,7 +383,50 @@ void ProcessGroupXCCL::groupEnd() { --xcclActiveGroupCounter_; } -// align with good design single-device style, input_t and output_t due to +// TODO: wait p2p enable +static constexpr int CoalActive = 0x01, CoalColl = 0x02; +void ProcessGroupXCCL::startCoalescing() { + coalescedDevice_.set_index(-1); + coalescedComm_ = nullptr; + coalescing_state_ |= CoalActive; + groupStart(); +} + +c10::intrusive_ptr ProcessGroupXCCL::endCoalescing(OpType optype) { + if (coalescedComm_ == nullptr) { + // There is no actual work being coalesced, return here + groupEnd(); + coalescing_state_ = 0; + return nullptr; + } + TORCH_CHECK( + coalescedDevice_.index() >= 0, + "Somthing went wrong. 
Did you call end_coalescing before start_coalescing?"); + + auto comm = coalescedComm_; + auto device = coalescedDevice_; + + const auto key = std::to_string(device.index()); + auto stream = xcclStreams_.at(key); + + auto work = initWork(device, rank_, optype); + work->blockingWait_ = blockingWait_; + + groupEnd(); + + work->xcclEndEvent_->record(stream); + + coalescing_state_ = 0; + coalescedComm_ = nullptr; + return work; +} + +c10::intrusive_ptr ProcessGroupXCCL::endCoalescing() { + // Default OpType to COALESCED if not specified + return endCoalescing(OpType::COALESCED); +} + +// align with single-device style, input_t and output_t due to // allgatherv need vector output template < typename Fn, @@ -399,6 +449,21 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); + if (coalescing_state_ & CoalActive) { + coalescing_state_ |= CoalColl; + if (coalescedDevice_.index() < 0) { + coalescedDevice_ = device; + } else { + TORCH_CHECK( + coalescedDevice_.index() == device.index(), MULTI_DEVICE_ERROR_MSG); + } + if (coalescedComm_ == nullptr) { + coalescedComm_ = comm; + } else { + TORCH_CHECK(coalescedComm_ == comm, MULTI_DEVICE_ERROR_MSG); + } + } + auto stream = xcclStreams_.at(key); c10::intrusive_ptr work; @@ -502,6 +567,21 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); + if (coalescing_state_ & CoalActive) { + coalescing_state_ |= CoalColl; + if (coalescedDevice_.index() < 0) { + coalescedDevice_ = device; + } else { + TORCH_CHECK( + coalescedDevice_.index() == device.index(), MULTI_DEVICE_ERROR_MSG); + } + if (coalescedComm_ == nullptr) { + coalescedComm_ = comm; + } else { + TORCH_CHECK(coalescedComm_ == comm, MULTI_DEVICE_ERROR_MSG); + } + } + auto stream = xcclStreams_.at(key); c10::intrusive_ptr work; @@ -535,8 +615,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( c10::intrusive_ptr ProcessGroupXCCL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { - TORCH_CHECK( - tensors.size() == 1, "Expecting one tensor only but got multiple"); + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); check_xpu_single_tensor(tensor); return collective( @@ -600,8 +679,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( c10::intrusive_ptr ProcessGroupXCCL::broadcast( std::vector& tensors, const BroadcastOptions& opts) { - TORCH_CHECK( - tensors.size() == 1, "Expecting one tensor only but got multiple"); + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); if (tensor.is_complex()) { tensor = at::view_as_real(tensor); @@ -633,12 +711,46 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( OpType::BROADCAST); } +c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const ReduceOptions& opts) { + if (outputTensor.numel() != inputTensor.numel()) { + C10_THROW_ERROR( + ValueError, + "Tensor input and output of _reduce_oop must have the same number of elements "); + } + return collective( + inputTensor, + outputTensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::reduce_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + const int root = opts.rootRank + opts.rootTensor; + const auto xcclDataType = getXcclDataType(input.scalar_type()); + const auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = 
ccl::reduce( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + xcclReduceOp, + root, + comm, + ccl::create_stream(stream.queue())); + return ret_evt; + }, + OpType::REDUCE); +} + c10::intrusive_ptr ProcessGroupXCCL::allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts) { - TORCH_CHECK( - inputTensors.size() == 1, "Expecting one tensor only but got multiple"); + TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG); // @lint-ignore CLANGTIDY auto inputTensor = inputTensors.back(); check_xpu_single_tensor(inputTensor); @@ -802,6 +914,78 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( OpType::COALESCED); } +c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ReduceScatterOptions& opts) { + TORCH_CHECK(outputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + // @lint-ignore CLANGTIDY + auto outputTensor = outputTensors.back(); + check_xpu_single_tensor(outputTensor); + // @lint-ignore CLANGTIDY + auto inputTensors_ = inputTensors.back(); + TORCH_CHECK( + !isFloat8Type(outputTensor.scalar_type()), + "Float8 dtypes are not currenlty supported for NCCL reductions"); + + bool same_size = check_same_size(inputTensors_); + if (same_size) { + // Flatten a vector of tensors into a single, stacked tensor. + at::Tensor inputFlattened = newLikeFlat(inputTensors_); + return collective( + inputFlattened, + outputTensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::reduce_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::reduce_scatter( + input.data_ptr(), + output.data_ptr(), + (size_t)output.numel(), + xcclDataType, + xcclReduceOp, + comm, + ccl::create_stream(stream.queue())); + return ret_evt; + }, + [&](at::xpu::XPUStream& Stream, + c10::intrusive_ptr& work) { + // Copy the input tensors to the flattened inputs. + c10::StreamGuard guard(Stream); + for (const auto j : c10::irange(inputTensors_.size())) { + c10::xpu::XPUCachingAllocator::recordStream( + inputTensors_[j].storage().data_ptr(), Stream); + inputFlattened[j].copy_(inputTensors_[j], true); + } + }, + [&](at::xpu::XPUStream&, + c10::intrusive_ptr&) {}, + OpType::REDUCE_SCATTER); + } else { + const auto num_reduces = inputTensors_.size(); + startCoalescing(); + for (const int i : c10::irange(num_reduces)) { + auto& input = inputTensors_[i]; + auto& output = (i == rank_) ? 
outputTensor : input; + auto reduceOpts = ReduceOptions{ + opts.reduceOp, + static_cast(i), + static_cast(0), + opts.timeout}; + _reduce_oop(output, input, reduceOpts); + } + auto work = endCoalescing(OpType::REDUCE_SCATTER); + return work; + } +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 94ee71ab0190cb..3b72fd4261f5cb 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -149,6 +149,12 @@ class TORCH_API ProcessGroupXCCL : public Backend { return std::string(XCCL_BACKEND_NAME); } + void startCoalescing() override; + + c10::intrusive_ptr endCoalescing() override; + + c10::intrusive_ptr endCoalescing(OpType optype); + std::shared_ptr getXCCLComm( const std::string& deviceKey, at::Device& device); @@ -221,6 +227,11 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector& tensors, const BroadcastOptions& opts = BroadcastOptions()) override; + c10::intrusive_ptr _reduce_oop( + at::Tensor& outputTensors, + at::Tensor& inputTensors, + const ReduceOptions& opts = ReduceOptions()); + c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, @@ -246,9 +257,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::reduce_scatter not implemented"); - } + const ReduceScatterOptions& opts = ReduceScatterOptions()) override; c10::intrusive_ptr _reduce_scatter_base( at::Tensor& outputTensor, @@ -327,6 +336,9 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; + int coalescing_state_ = 0; + at::Device coalescedDevice_ = at::Device("xpu"); + std::shared_ptr coalescedComm_ = nullptr; bool blockingWait_ = false; static thread_local uint64_t xcclActiveGroupCounter_; }; From ecbd9894c4dcca31d8b10746231c3a0d2d155d85 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 06:26:27 +0000 Subject: [PATCH 41/96] refine test cases --- test/distributed/test_c10d_common.py | 5 +- test/distributed/test_c10d_xccl.py | 168 +++++++++++++++++- torch/distributed/distributed_c10d.py | 4 +- torch/testing/_internal/common_distributed.py | 5 +- 4 files changed, 173 insertions(+), 9 deletions(-) diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 6a0621f3f49913..0c1426d0e29c21 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -66,8 +66,9 @@ def gpus_for_rank(world_size): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. 
""" - visible_devices = list(range(torch.cuda.device_count())) - gpus_per_process = torch.cuda.device_count() // world_size + device_count = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() + visible_devices = list(range(device_count)) + gpus_per_process = device_count // world_size gpus_for_rank = [] for rank in range(world_size): gpus_for_rank.append( diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py index 33a2f196c3b5d1..a998af7b16ef98 100644 --- a/test/distributed/test_c10d_xccl.py +++ b/test/distributed/test_c10d_xccl.py @@ -7,7 +7,10 @@ import os import random import sys + +import time import tempfile +from datetime import timedelta from functools import reduce from unittest import mock, SkipTest @@ -20,6 +23,7 @@ sys.exit(0) import test_c10d_common +from test_c10d_common import DoubleGpuNet, gpus_for_rank, ModuleForDdpCommHook import torch.distributed as dist import torch.nn.functional as F @@ -29,8 +33,12 @@ from torch.testing._internal.common_distributed import ( MultiProcessTestCase, requires_xccl, + init_multigpu_helper, + skip_if_lt_x_gpu, ) from torch.testing._internal.common_utils import ( + skip_but_pass_in_sandcastle_if, + TEST_XPU, retry_on_connect_failures, run_tests, TestCase, @@ -62,10 +70,12 @@ def simple_reduce_tests(rank, world_size): return tests +TEST_MULTIXPU = torch.xpu.device_count() > 1 class RendezvousEnvTest(TestCase): @retry_on_connect_failures @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test") def test_common_errors(self): vars = { "WORLD_SIZE": "1", @@ -164,13 +174,23 @@ def withouts(d, keys): class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): @requires_xccl() @retry_on_connect_failures + @skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test") def test_default_store_timeout_nccl(self): self._test_default_store_timeout("xccl") class ProcessGroupXCCLTest(MultiProcessTestCase): - def _create_process_group_xccl(self): + def _create_process_group_xccl(self, timeout=timedelta(seconds=600), device_id=None): store = c10d.FileStore(self.file_name, self.world_size) - return c10d.ProcessGroupXCCL(store, self.rank, self.world_size) + c10d.init_process_group( + "xccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timeout, + device_id=device_id, + ) + pg = c10d.distributed_c10d._get_default_group() + return pg def setUp(self): super().setUp() @@ -182,7 +202,76 @@ def tearDown(self): os.remove(self.file_name) except OSError: pass - + + @property + def world_size(self): + return 2 + + @property + def rank_to_GPU(self): + # return rank to GPU map + return init_multigpu_helper(self.world_size, "xccl") + + @requires_xccl() + @skip_but_pass_in_sandcastle_if( + torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs" + ) + def test_close_multi_pg_unordered(self): + pg = self._create_process_group_xccl() + device = self.rank_to_GPU[self.rank][0] + t = torch.rand(10, 10, device=device) + # First allreduce to initialize default PG's communicator. 
+ pg.allreduce(t).wait() + new_pg1 = c10d.new_group([0, 1]) + new_pg2 = c10d.new_group([0, 1]) + if self.rank == 0 or self.rank == 1: + t1 = torch.rand(10, 10, device=device) + t2 = torch.rand(10, 10, device=device) + new_pg1.allreduce(t1).wait() + new_pg2.allreduce(t2).wait() + if self.rank == 0: + dist.destroy_process_group(new_pg2) + # force destruction of pg2 first + del new_pg2 + dist.destroy_process_group(new_pg1) + del new_pg1 + if self.rank == 1: + c10d.destroy_process_group(new_pg1) + # force destruction of pg1 first + del new_pg1 + dist.destroy_process_group(new_pg2) + del new_pg2 + dist.destroy_process_group() + + @requires_xccl() + @skip_but_pass_in_sandcastle_if( + torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs" + ) + def test_file_store_check(self): + # self.file_name is created using "delete=False" + # e.g., self.file_name = tempfile.NamedTemporaryFile(delete=False).name + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group( + backend="xccl", rank=self.rank, world_size=self.world_size, store=store + ) + pg = dist.distributed_c10d._get_default_group() + self.assertEqual(pg.rank(), self.rank) + self.assertEqual(pg.size(), self.world_size) + # give enough time for check() to be executed multiple times + time.sleep(2) + dist.destroy_process_group() + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIXPU, "XCCL test requires 2+ GPUs") + def test_set_process_group_desc(self): + device = torch.device(f"xpu:{self.rank}") + pg_default = self._create_process_group_xccl(device_id=device) + self.assertEqual(pg_default.group_desc, "default_pg") + pg_1 = c10d.new_group([0, 1], group_desc="test_purpose") + self.assertEqual(pg_1.group_desc, "test_purpose") + pg_2 = c10d.new_group([0, 1]) + self.assertEqual(pg_2.group_desc, "undefined") + def _test_allreduce_basics(self, fn): pg = self._create_process_group_xccl() device = torch.device("xpu:" + str(self.rank)) @@ -210,6 +299,79 @@ def _test_allreduce_basics(self, fn): def test_allreduce_basics(self): self._test_allreduce_basics(lambda t: t.clone()) +class DistributedDataParallelTest( + test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase +): + def setUp(self): + super().setUp() + self._spawn_processes() + + def _get_process_group(self): + store = self._get_store() + c10d.init_process_group( + "xccl", store=store, rank=self.rank, world_size=self.world_size + ) + return c10d.distributed_c10d._get_default_group() + + def _test_xccl_backend( + self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False + ): + process_group = self._get_process_group() + self._test_ddp_with_process_group( + process_group, devices, device_ids, multi_device, gradient_as_bucket_view + ) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_xccl_backend_multi_device_ids_not_allowed(self): + int_devices = list(range(torch.xpu.device_count())) + devices = [torch.device("xpu:" + str(i)) for i in int_devices] + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." 
+ ): + self._test_xccl_backend(devices, int_devices) + + @requires_xccl() + @skip_if_lt_x_gpu(4) + def test_ddp_multi_device_module_config(self): + gpus = gpus_for_rank(self.world_size)[self.rank] + + self.assertTrue(len(gpus) >= 2, "expecting at least 2 gpus per process") + + process_group = self._get_process_group() + + gpus = gpus[:2] + model = DoubleGpuNet(gpus) + + with self.assertRaisesRegex( + ValueError, + "DistributedDataParallel device_ids and output_device arguments only work with " + "single-device/multiple-device GPU modules or CPU modules", + ): + ddp_model = DistributedDataParallel( + model, output_device=gpus[1], process_group=process_group + ) + + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." + ): + ddp_model = DistributedDataParallel( + model, device_ids=gpus, process_group=process_group + ) + + with self.assertRaisesRegex( + ValueError, "input module must be on the same type of devices" + ): + model.fc1 = model.fc1.cpu() + ddp_model = DistributedDataParallel(model, process_group=process_group) + + model = model.cpu() + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." + ): + ddp_model = DistributedDataParallel( + model, device_ids=gpus, process_group=process_group + ) if __name__ == "__main__": diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 3f68609905bb5a..d0781765c090ff 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1672,10 +1672,10 @@ def _new_process_group_helper( "created, please use a different group name" ) - if device_id is not None and (device_id.index is None or device_id.type != "cuda"): + if device_id is not None and (device_id.index is None or (device_id.type != "cuda" and device_id.type != "xpu")): raise ValueError( "init_process_group device_id parameter must be a cuda device with an " - "id, e.g. cuda:0, not just cuda or cpu" + "id, e.g. cuda:0, xpu, not just cuda or xpu or cpu" ) # Note: _new_process_group_helper is only called from init_process_group, which always provides a timeout value diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index ff83bc8ab66666..554114b7bbcb1c 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -180,7 +180,8 @@ def skip_if_lt_x_gpu(x): def decorator(func): @wraps(func) def wrapper(*args, **kwargs): - if torch.cuda.is_available() and torch.cuda.device_count() >= x: + if (torch.cuda.is_available() and torch.cuda.device_count() >= x) or \ + (torch.xpu.is_available() and torch.xpu.device_count() >= x): return func(*args, **kwargs) sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) @@ -469,7 +470,7 @@ def init_multigpu_helper(world_size: int, backend: str): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. 
""" - nGPUs = torch.cuda.device_count() + nGPUs = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() visible_devices = range(nGPUs) # If rank is less than or equal to number of available GPU's From a23ffb2f6bd537359570701a63820ce7e0ab52dc Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 06:34:06 +0000 Subject: [PATCH 42/96] update ut --- test/distributed/test_c10d_xccl.py | 73 ------------------------------ 1 file changed, 73 deletions(-) diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py index a998af7b16ef98..3843a695f766c9 100644 --- a/test/distributed/test_c10d_xccl.py +++ b/test/distributed/test_c10d_xccl.py @@ -299,79 +299,6 @@ def _test_allreduce_basics(self, fn): def test_allreduce_basics(self): self._test_allreduce_basics(lambda t: t.clone()) -class DistributedDataParallelTest( - test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase -): - def setUp(self): - super().setUp() - self._spawn_processes() - - def _get_process_group(self): - store = self._get_store() - c10d.init_process_group( - "xccl", store=store, rank=self.rank, world_size=self.world_size - ) - return c10d.distributed_c10d._get_default_group() - - def _test_xccl_backend( - self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False - ): - process_group = self._get_process_group() - self._test_ddp_with_process_group( - process_group, devices, device_ids, multi_device, gradient_as_bucket_view - ) - - @requires_xccl() - @skip_if_lt_x_gpu(2) - def test_xccl_backend_multi_device_ids_not_allowed(self): - int_devices = list(range(torch.xpu.device_count())) - devices = [torch.device("xpu:" + str(i)) for i in int_devices] - with self.assertRaisesRegex( - ValueError, "device_ids can only be None or contain a single element." - ): - self._test_xccl_backend(devices, int_devices) - - @requires_xccl() - @skip_if_lt_x_gpu(4) - def test_ddp_multi_device_module_config(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - - self.assertTrue(len(gpus) >= 2, "expecting at least 2 gpus per process") - - process_group = self._get_process_group() - - gpus = gpus[:2] - model = DoubleGpuNet(gpus) - - with self.assertRaisesRegex( - ValueError, - "DistributedDataParallel device_ids and output_device arguments only work with " - "single-device/multiple-device GPU modules or CPU modules", - ): - ddp_model = DistributedDataParallel( - model, output_device=gpus[1], process_group=process_group - ) - - with self.assertRaisesRegex( - ValueError, "device_ids can only be None or contain a single element." - ): - ddp_model = DistributedDataParallel( - model, device_ids=gpus, process_group=process_group - ) - - with self.assertRaisesRegex( - ValueError, "input module must be on the same type of devices" - ): - model.fc1 = model.fc1.cpu() - ddp_model = DistributedDataParallel(model, process_group=process_group) - - model = model.cpu() - with self.assertRaisesRegex( - ValueError, "device_ids can only be None or contain a single element." 
- ): - ddp_model = DistributedDataParallel( - model, device_ids=gpus, process_group=process_group - ) if __name__ == "__main__": From 1d02dfe83b8c6546cd4200cc53b19aed38d025b9 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 06:47:42 +0000 Subject: [PATCH 43/96] add mpi check --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e550225e19cb79..b51d299b47d8de 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -228,7 +228,7 @@ ProcessGroupXCCL::ProcessGroupXCCL( blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); - { + if (!with_mpirun()) { int local_rank = getXCCLEnvVar("LOCAL_RANK"); int local_world_size = getXCCLEnvVar("LOCAL_WORLD_SIZE"); if (local_rank == -1 || local_world_size == -1) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index eca66a33922d55..6d946acbea804b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -55,6 +55,13 @@ void setXCCLEnvVar(std::string envVarName, int val) { void setXCCLEnvVar(std::string envVarName, std::string val) { setenv(envVarName.c_str(), val.c_str(), 1); } + +bool with_mpirun() { + return (getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || + getenv("PMI_RANK") || getenv("PMI_SIZE") || getenv("PMIX_RANK")) + ? true + : false; +} } // namespace static std::vector TORCH_XCCL_BLOCKING_WAIT = { From c485bd82da66537b3323110c97bbb5d01c57ed67 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 06:56:23 +0000 Subject: [PATCH 44/96] update datatype map --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 8 ++++---- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 4 ---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index b51d299b47d8de..fd02226a1dd772 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -36,7 +36,6 @@ std::map xcclOps = { std::map xcclDatatypes = { {at::kByte, ccl::datatype::uint8}, {at::kChar, ccl::datatype::int8}, - {at::kShort, ccl::datatype::int16}, {at::kInt, ccl::datatype::int32}, {at::kLong, ccl::datatype::int64}, {at::kHalf, ccl::datatype::float16}, @@ -148,9 +147,9 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) : Work(w.rank_, w.opType_), device_(w.device_), + xcclEndEvent_(w.xcclEndEvent_), blockingWait_(w.blockingWait_), - workStartTime_(w.workStartTime_), - xcclEndEvent_(w.xcclEndEvent_) {} + workStartTime_(w.workStartTime_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -174,7 +173,8 @@ bool ProcessGroupXCCL::WorkXCCL::isCompleted() { try { TORCH_CHECK(flag = ret.test()); } catch (...) 
{ - finishAWorkXCCLError(std::current_exception()); + future_->setError(std::current_exception()); + finish(std::current_exception()); return true; } if (!flag) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6d946acbea804b..6d4cc5097ebbc5 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -124,10 +124,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector rets; private: - void finishAWorkXCCLError(std::exception_ptr eptr) { - future_->setError(eptr); - finish(eptr); - } void synchronizeInternal(std::chrono::milliseconds timeout); std::shared_ptr> outputs_; c10::intrusive_ptr future_; From 2d1ae87592eeab86bd5ef20706c5e373073c5ff2 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 07:22:25 +0000 Subject: [PATCH 45/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 9 +++++++-- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index fd02226a1dd772..8f689ec80eb12a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -167,14 +167,19 @@ bool ProcessGroupXCCL::WorkXCCL::checkTimeout( return true; } +void ProcessGroupXCCL::WorkXCCL::finishWorkXcclError( + const std::exception_ptr& eptr) { + future_->setError(eptr); + finish(eptr); +} + bool ProcessGroupXCCL::WorkXCCL::isCompleted() { for (auto& ret : rets) { bool flag; try { TORCH_CHECK(flag = ret.test()); } catch (...) { - future_->setError(std::current_exception()); - finish(std::current_exception()); + finishWorkXcclError(std::current_exception()); return true; } if (!flag) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6d4cc5097ebbc5..37e36047a63c16 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -88,6 +88,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { rets.push_back(std::move(result)); } + void finishWorkXcclError(const std::exception_ptr& eptr); + bool isCompleted() override; bool isSuccess() const override { From 61842614f089399f77991278e84e2d0d29e71a44 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 07:47:05 +0000 Subject: [PATCH 46/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 8f59b7a13d4d3e..cc6b9d36869b58 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -62,6 +62,11 @@ bool with_mpirun() { ? 
true : false; } + +struct AutoXcclGroup { + AutoXcclGroup(); + ~AutoXcclGroup() noexcept(false); +}; } // namespace static std::vector TORCH_XCCL_BLOCKING_WAIT = { @@ -72,13 +77,6 @@ using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; -namespace { -struct AutoXcclGroup { - AutoXcclGroup(); - ~AutoXcclGroup() noexcept(false); -}; -} // namespace - class TORCH_API ProcessGroupXCCL : public Backend { public: class WorkXCCL : public Work { From 91d26d94f27289af653ba7cebba301c20607ae61 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 08:50:44 +0000 Subject: [PATCH 47/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 29 ++++--------------- .../distributed/c10d/ProcessGroupXCCL.hpp | 7 ----- 2 files changed, 6 insertions(+), 30 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 8f689ec80eb12a..fbca2c7f470247 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -167,26 +167,11 @@ bool ProcessGroupXCCL::WorkXCCL::checkTimeout( return true; } -void ProcessGroupXCCL::WorkXCCL::finishWorkXcclError( - const std::exception_ptr& eptr) { - future_->setError(eptr); - finish(eptr); -} - bool ProcessGroupXCCL::WorkXCCL::isCompleted() { - for (auto& ret : rets) { - bool flag; - try { - TORCH_CHECK(flag = ret.test()); - } catch (...) { - finishWorkXcclError(std::current_exception()); - return true; - } - if (!flag) { - return false; - } + if (xcclEndEvent_ && xcclEndEvent_->query()) { + return true; } - return true; + return false; } void ProcessGroupXCCL::WorkXCCL::synchronize() { @@ -218,10 +203,6 @@ void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { synchronizeInternal(timeout); - for (auto& event : rets) { - event.wait(); - } - rets.clear(); return true; } @@ -233,6 +214,8 @@ ProcessGroupXCCL::ProcessGroupXCCL( blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); + // Intel oneCCL requires passing CCL_LOCAL_RANK and CCL_LOCAL_SIZE for non-MPI + // launchers. 
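+  // When the process was started by mpirun/mpiexec, the MPI/PMI environment
+  // (MPI_LOCALRANKID, PMI_RANK, PMIX_RANK, ...) already provides this
+  // information, so with_mpirun() lets the explicit export below be skipped.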
if (!with_mpirun()) { int local_rank = getXCCLEnvVar("LOCAL_RANK"); int local_world_size = getXCCLEnvVar("LOCAL_WORLD_SIZE"); @@ -350,7 +333,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto ccl_stream = ccl::create_stream(stream.queue()); - work->addResult(fn(input, output, attr, *comm, ccl_stream)); + fn(input, output, attr, *comm, ccl_stream); work->xcclEndEvent_->record(stream); diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 37e36047a63c16..96f7e46e7c378d 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -84,12 +84,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { WorkXCCL(const WorkXCCL& w); ~WorkXCCL() override; - void addResult(ccl::event&& result) { - rets.push_back(std::move(result)); - } - - void finishWorkXcclError(const std::exception_ptr& eptr); - bool isCompleted() override; bool isSuccess() const override { @@ -123,7 +117,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::shared_ptr xcclEndEvent_; bool blockingWait_ = false; std::chrono::time_point workStartTime_; - std::vector rets; private: void synchronizeInternal(std::chrono::milliseconds timeout); From 2a83d68e7562bb40b3e21ed15d15303189864070 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 01:25:16 +0000 Subject: [PATCH 48/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index fbca2c7f470247..7b0336960b7285 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -319,8 +319,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto comm = getXCCLComm(key, device); auto stream = xcclStreams_.at(key); - std::vector inputs{input}; - std::vector outputs{output}; c10::intrusive_ptr work; From 7f62b869297f4e199ae0bda426e4964b3257cd2a Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 01:26:18 +0000 Subject: [PATCH 49/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 7b0336960b7285..6b57a6c5471b36 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -319,6 +319,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto comm = getXCCLComm(key, device); auto stream = xcclStreams_.at(key); + std::vector outputs{output}; c10::intrusive_ptr work; From c48f5eb3c00d50f24abbc037e94fe0724c41f8b6 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 02:12:36 +0000 Subject: [PATCH 50/96] Support reduce_scatter_base --- .../distributed/c10d/ProcessGroupXCCL.cpp | 52 +++++++++++++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 7 +-- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 1a854853117615..f2dde28c4e3b94 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -54,6 +54,11 @@ std::map xcclDatatypes = { {at::kDouble, ccl::datatype::float64}, {at::kBFloat16, ccl::datatype::bfloat16}, {at::kBool, ccl::datatype::uint8}, + // use for allgather + {at::kFloat8_e5m2, ccl::datatype::uint8}, + 
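+    // The float8 entries below are mapped onto uint8 so the data can be moved
+    // (e.g. by allgather) without a native ccl float8 type; reductions on
+    // float8 inputs are rejected separately via the isFloat8Type checks.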
{at::kFloat8_e4m3fn, ccl::datatype::uint8}, + {at::kFloat8_e4m3fnuz, ccl::datatype::uint8}, + {at::kFloat8_e5m2fnuz, ccl::datatype::uint8}, }; XCCL_KVS kvs; @@ -991,6 +996,53 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( } } +c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const ReduceScatterOptions& opts) { + if (inputTensor.dtype() != outputTensor.dtype()) { + C10_THROW_ERROR( + TypeError, "input tensor must be the same type as the output tensor."); + } + + if (inputTensor.numel() != outputTensor.numel() * size_) { + C10_THROW_ERROR( + ValueError, + "input tensor must be the same size as output size times world size"); + } + + // @lint-ignore CLANGTIDY + const auto& tensor = outputTensor; + TORCH_CHECK( + !isFloat8Type(tensor.scalar_type()), + "Float8 dtypes are not currenlty supported for NCCL reductions"); + + return collective( + inputTensor, + outputTensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::reduce_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::reduce_scatter( + input.data_ptr(), + output.data_ptr(), + (size_t)output.numel(), + xcclDataType, + xcclReduceOp, + comm, + ccl::create_stream(stream.queue())); + return ret_evt; + }, + OpType::_REDUCE_SCATTER_BASE); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index cc6b9d36869b58..42b26740d49055 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -117,7 +117,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { } std::vector result() override { - TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); + return *outputs_; } bool checkTimeout( @@ -265,10 +265,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr _reduce_scatter_base( at::Tensor& outputTensor, at::Tensor& inputTensor, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK( - false, "ProcessGroupXCCL::_reduce_scatter_base not implemented"); - } + const ReduceScatterOptions& opts = ReduceScatterOptions()) override; c10::intrusive_ptr reduce_scatter_tensor_coalesced( std::vector& outputs, From 9b17dc4911f7b4074db79b6dc573aefbb74bbcb2 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 02:53:06 +0000 Subject: [PATCH 51/96] Support reduce_scatter_tensor_coalesced --- .../distributed/c10d/ProcessGroupXCCL.cpp | 37 ++++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 6 +-- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index f2dde28c4e3b94..dfd5d78a5b970b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -936,7 +936,7 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( auto inputTensors_ = inputTensors.back(); TORCH_CHECK( !isFloat8Type(outputTensor.scalar_type()), - "Float8 dtypes are not currenlty supported for NCCL reductions"); + "Float8 dtypes are not currenlty supported for XCCL reductions"); bool same_size = check_same_size(inputTensors_); if (same_size) { @@ 
-1015,7 +1015,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( const auto& tensor = outputTensor; TORCH_CHECK( !isFloat8Type(tensor.scalar_type()), - "Float8 dtypes are not currenlty supported for NCCL reductions"); + "Float8 dtypes are not currenlty supported for XCCL reductions"); return collective( inputTensor, @@ -1043,6 +1043,39 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( OpType::_REDUCE_SCATTER_BASE); } +c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( + std::vector& outputs, + std::vector& inputs, + const ReduceScatterOptions& opts) { + TORCH_CHECK( + !isFloat8Type(inputs.back().scalar_type()), + "Float8 dtypes are not currenlty supported for XCCL reductions"); + return collectiveCoalesced( + inputs, + outputs, + [&](at::Tensor& input, + at::Tensor& output, + ccl::reduce_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::reduce_scatter( + input.data_ptr(), + output.data_ptr(), + (size_t)output.numel(), + xcclDataType, + xcclReduceOp, + comm, + ccl::create_stream(stream.queue())); + return ret_evt; + }, + OpType::COALESCED); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 42b26740d49055..2357aad73bb512 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -270,11 +270,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr reduce_scatter_tensor_coalesced( std::vector& outputs, std::vector& inputs, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK( - false, - "ProcessGroupXCCL::reduce_scatter_tensor_coalesced not implemented"); - } + const ReduceScatterOptions& opts = ReduceScatterOptions()) override; c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override { From 6cb32272695f76a3f8522473db999f8036f4c771 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 03:33:48 +0000 Subject: [PATCH 52/96] support barrier --- .../distributed/c10d/ProcessGroupXCCL.cpp | 59 +++++++++++++++++-- .../distributed/c10d/ProcessGroupXCCL.hpp | 10 +++- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index dfd5d78a5b970b..670fc343c04cbd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -271,6 +271,10 @@ void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); } } + if (barrierTensor_.defined()) { + auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); + currentStream.synchronize(); + } } bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { @@ -333,6 +337,8 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( "the devices are empty "); } + usedDeviceIdxs_.insert(device.index()); + { std::lock_guard lock(mutex_); if (devXCCLCommMap_.find(deviceKey) != devXCCLCommMap_.end()) { @@ -622,12 +628,9 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( return work; } -c10::intrusive_ptr ProcessGroupXCCL::allreduce( - std::vector& tensors, +c10::intrusive_ptr 
ProcessGroupXCCL::allreduce_impl( + at::Tensor& tensor, const AllreduceOptions& opts) { - TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); - auto tensor = tensors.back(); - check_xpu_single_tensor(tensor); return collective( tensor, tensor, @@ -653,6 +656,19 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( OpType::ALLREDUCE); } +c10::intrusive_ptr ProcessGroupXCCL::allreduce( + std::vector& tensors, + const AllreduceOptions& opts) { + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + auto tensor = tensors.back(); + check_xpu_single_tensor(tensor); + TORCH_CHECK( + !isFloat8Type(tensor.scalar_type()), + "Float8 dtypes are not currenlty supported for XCCL reductions"); + + return allreduce_impl(tensor, opts); +} + c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { @@ -1076,6 +1092,39 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( OpType::COALESCED); } +c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { + // Device to use for barrier + int barDevIdx = -1; + + // See nccl barrier comments + if (!opts.device_ids.empty()) { + barDevIdx = opts.device_ids[0]; + } else if (getBoundDeviceId()) { + barDevIdx = (*getBoundDeviceId()).index(); + } else if (!usedDeviceIdxs_.empty()) { + barDevIdx = *usedDeviceIdxs_.begin(); + } else { + barDevIdx = + static_cast(rank_ % at::detail::getXPUHooks().getNumGPUs()); + } + + TORCH_CHECK_WITH( + ValueError, + barDevIdx >= 0, + "Failed to infer a GPU device id to perform barrier. "); + auto barDevice = at::Device(at::DeviceType::XPU, barDevIdx); + + at::Tensor barrierTensor = + at::zeros({1}, at::TensorOptions().device(barDevice).dtype(at::kFloat)); + + auto work = allreduce_impl(barrierTensor); + + auto xcclWork = dynamic_cast(work.get()); + TORCH_CHECK(xcclWork); + xcclWork->barrierTensor_ = std::move(barrierTensor); + return work; +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 2357aad73bb512..80bf9a3dc5749f 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -126,6 +126,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { protected: at::Device device_; std::shared_ptr xcclEndEvent_; + at::Tensor barrierTensor_; bool blockingWait_ = false; std::chrono::time_point workStartTime_; std::vector rets; @@ -211,6 +212,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, OpType opType); + c10::intrusive_ptr allreduce_impl( + at::Tensor& tensor, + const AllreduceOptions& opts = AllreduceOptions()); + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; @@ -273,9 +278,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { const ReduceScatterOptions& opts = ReduceScatterOptions()) override; c10::intrusive_ptr barrier( - const BarrierOptions& opts = BarrierOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::barrier not implemented"); - } + const BarrierOptions& opts = BarrierOptions()) override; c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, @@ -332,6 +335,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; + std::set usedDeviceIdxs_; int coalescing_state_ = 0; at::Device coalescedDevice_ = at::Device("xpu"); std::shared_ptr coalescedComm_ = nullptr; From 
d858c81606962b0e439578dbb6e11247f278ce80 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 06:10:35 +0000 Subject: [PATCH 53/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index c45e995ebd12c0..df1510ce9162d9 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -464,7 +464,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( } auto stream = xcclStreams_.at(key); - std::vector outputs{output}; c10::intrusive_ptr work; From fea20f5081463691700954986458eefa8c07df17 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 06:13:00 +0000 Subject: [PATCH 54/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index df1510ce9162d9..c7d9a10d9bf706 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -594,7 +594,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( inputs[i].storage().data_ptr(), stream); - work->addResult(fn(inputs[i], outputs[i], attr, *comm, stream)); + fn(inputs[i], outputs[i], attr, *comm, stream); } } From e0e27f3d72454bbae0f10113362b12068bdfeaa8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 08:04:20 +0000 Subject: [PATCH 55/96] update --- torch/csrc/distributed/c10d/ProcessGroup.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 73fc2bda701327..f5e87c9be999ea 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -590,6 +590,11 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { tensor = at::empty( {1}, at::TensorOptions().device(at::DeviceType::CUDA).dtype(at::kByte)); + } else if (backendType_ == c10d::ProcessGroup::BackendType::XCCL) { + // set xpu tensor for override cpu dispatch + tensor = at::empty( + {1}, + at::TensorOptions().device(at::DeviceType::XPU).dtype(at::kByte)); } else { // Default to using cpu implementation tensor = at::empty( From 029026d547055b17679846a024f53131c1cf7bdf Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 18 Sep 2024 01:55:53 +0000 Subject: [PATCH 56/96] add ut --- test/distributed/test_c10d_ops_xccl.py | 852 +++++++++++++++++++++++++ 1 file changed, 852 insertions(+) create mode 100644 test/distributed/test_c10d_ops_xccl.py diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py new file mode 100644 index 00000000000000..5d041058ead41b --- /dev/null +++ b/test/distributed/test_c10d_ops_xccl.py @@ -0,0 +1,852 @@ +# Owner(s): ["oncall: distributed"] +# This test file contains positive tests for c10d with XCCL backend. +# During the test, it is expected that ProcessGroup will not be aborted, destroyed or incur fatal error. +# Please be mindful of this when adding tests here. +# If you need to add tests for group creation, abort or destroy, please add tests in test_c10d_xccl.py. + +# There are two ways to launch tests in this file: +# 1. Run this file directly with `python test_c10d_ops_xccl.py` +# 2. 
Use multi-process launcher, e.g. `torchrun --standalone --nproc-per-node 2 test_c10d_ops_xccl.py` + +import math +import os +import sys +import tempfile + +import torch +import torch.distributed as c10d + + +if not c10d.is_available() or not c10d.is_xccl_available(): + print("c10d XCCL not available, skipping tests", file=sys.stderr) + sys.exit(0) + + +import torch.distributed as dist +from torch.testing._internal.common_distributed import ( + init_multigpu_helper, + MultiProcContinousTest, + requires_xccl, +) +from torch.testing._internal.common_utils import ( + skip_but_pass_in_sandcastle_if, + skipIfRocm, + TEST_WITH_DEV_DBG_ASAN, + TEST_XPU, +) + + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr + ) + sys.exit(0) + +TEST_MULTIGPU = TEST_XPU and torch.xpu.device_count() >= 2 + +class ProcessGroupXCCLOpTest(MultiProcContinousTest): + @classmethod + def backend_str(cls) -> str: + return "xccl" + + # @classmethod + # def opts(cls): + # opts = c10d.ProcessGroupXCCL.Options() + # return opts + + @property + def rank_to_GPU(self): + # return rank to GPU map + return init_multigpu_helper(self.world_size, "xccl") + + # TODO: wait reduce + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_empty_tensors(self): + # pg = self.pg + # local_device_idx = self.rank_to_GPU[self.rank][0] + + # xs = [torch.FloatTensor([]).xpu(local_device_idx)] + # pg.broadcast(xs).wait() + # self.assertEqual(0, xs[0].numel()) + + # pg.allreduce(xs).wait() + # self.assertEqual(0, xs[0].numel()) + + # pg.reduce(xs).wait() + # self.assertEqual(0, xs[0].numel()) + + # ys = [ + # [ + # torch.FloatTensor([]).xpu(local_device_idx) + # for _ in range(self.world_size) + # ] + # ] + # pg.allgather(ys, xs).wait() + # for y in ys[0]: + # self.assertEqual(0, y.numel()) + + # ys = [torch.FloatTensor([]).xpu(local_device_idx)] + # xs = [ + # [ + # torch.FloatTensor([]).xpu(local_device_idx) + # for _ in range(self.world_size) + # ] + # ] + # pg.reduce_scatter(ys, xs).wait() + # self.assertEqual(0, ys[0].numel()) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_broadcast_ops(self): + pg = self.pg + + def broadcast(xs, rootRank, rootTensor): + opts = c10d.BroadcastOptions() + opts.rootRank = rootRank + opts.rootTensor = rootTensor + work = pg.broadcast(xs, opts) + work.wait() + return xs + + # Every rank is root once + for i in range(self.world_size): + # Run with 1 input tensor + x = torch.tensor([self.rank]).xpu(self.rank_to_GPU[self.rank][0]) + output = broadcast([x], i, 0) + self.assertEqual(torch.tensor([i]), output[0]) + + expected_tensor = torch.empty([i + 1, i + 1]).fill_(i + 1) + xs = [ + torch.empty([i + 1, i + 1]).fill_(-1).xpu(device=device_idx) + for device_idx in self.rank_to_GPU[self.rank] + ] + + # test with multiple input tensors (multiple gpu in one rank) + for j in range(len(xs)): + if self.rank == i: + xs[j] = expected_tensor.xpu(device=self.rank_to_GPU[self.rank][j]) + + broadcast(xs, i, j) + + for tensor in xs: + self.assertEqual(tensor, expected_tensor) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_allreduce_ops(self): + device_count = torch.xpu.device_count() + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] + + def allreduce(tensors, op): + opts = c10d.AllreduceOptions() + opts.reduceOp = op + work = pg.allreduce(tensors, opts) + 
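+            # wait() funnels into WorkXCCL::synchronizeInternal(); with
+            # TORCH_XCCL_BLOCKING_WAIT=1 the host additionally polls for
+            # completion and enforces the op timeout.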
work.wait() + + # Sum + tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + + allreduce(tensors, c10d.ReduceOp.SUM) + + ndev = self.world_size + self.assertEqual( + torch.tensor([ndev * (ndev + 1) // 2]), + tensors[0], + ) + + # Product + tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + + allreduce(tensors, c10d.ReduceOp.PRODUCT) + self.assertEqual(torch.tensor([math.factorial(self.world_size)]), tensors[0]) + + # Min + tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + + allreduce(tensors, c10d.ReduceOp.MIN) + self.assertEqual(torch.tensor([1]), tensors[0]) + + # Max + tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + + allreduce(tensors, c10d.ReduceOp.MAX) + self.assertEqual(torch.tensor([self.world_size]), tensors[0]) + + for op, err in zip( + (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR), + ("ReduceOp.BAND", "ReduceOp.BOR", "ReduceOp.BXOR"), + ): + with self.assertRaisesRegex(ValueError, "Cannot use " + err + " with XCCL"): + allreduce(tensors, op) + + # TODO: wait all2all + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_alltoall_ops_with_xpufree_race(self): + # pg = self.pg + # opts = c10d.AllToAllOptions() + # local_device = f"xpu:{self.rank_to_GPU[self.rank][0]}" + # torch.xpu.set_device(local_device) + # input = torch.rand(1000, 1000, device=local_device) + # output = torch.rand(1000, 1000, device=local_device) + # race_tensors = [] + # # create some tensors to race with alltoall collective + # for _ in range(10): + # tmp = [] + # for i in range(5): + # tmp.append(torch.rand(10 ** (3 + i), device=local_device)) + # race_tensors.append(tmp) + + # for i in range(10): + # race_tensors.pop() + # work = pg.alltoall_base(output, input, [], [], opts) + # # this triggers xpuFree + # torch.xpu.empty_cache() + # work.wait() + # torch.xpu.synchronize(device=local_device) + + # TODO: wait reduce + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_reduce_ops(self): + # pg = self.pg + # local_device_id = self.rank_to_GPU[self.rank][0] + + # def reduce(xs, rootRank, rootTensor, op=None): + # opts = c10d.ReduceOptions() + # opts.rootRank = rootRank + # opts.rootTensor = rootTensor + # if op: + # opts.reduceOp = op + # work = pg.reduce(xs, opts) + # work.wait() + + # # for every root tensor + # for rt in range(self.world_size): + # tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + + # reduce(tensors, rt, 0) + + # if self.rank == rt: + # self.assertEqual( + # torch.tensor([self.world_size * (self.world_size + 1) // 2]), + # tensors[0], + # ) + # else: + # self.assertEqual( + # torch.tensor([self.rank + 1]), + # tensors[0], + # ) + + # for op, err in zip( + # (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR), + # ("ReduceOp.BAND", "ReduceOp.BOR", "ReduceOp.BXOR"), + # ): + # with self.assertRaisesRegex( + # ValueError, "Cannot use " + err + " with XCCL" + # ): + # reduce(tensors, self.rank, rt, op) + + # # Premul sum + # if torch.xpu.xccl.version() >= (2, 11, 1): + # for factor in (3.0, torch.tensor([5.0], device=local_device_id)): + # if isinstance(factor, torch.Tensor): + # factor_ref = factor.cpu().item() + # else: + # factor_ref = factor + # float_tensors = [ + # torch.tensor( + # [self.rank + 1.0], device=f"xpu:{local_device_id}" + # ) + # ] + # float_tensors_ref = [ + # torch.tensor( + # [(self.rank + 1.0) * factor_ref], + # device=f"xpu:{local_device_id}", + # ) + 
# ] + + # reduce(float_tensors_ref, rt, 0) + # reduce(float_tensors, rt, 0, c10d._make_xccl_premul_sum(factor)) + # if self.rank == rt: + # self.assertEqual(float_tensors_ref[0], float_tensors[0]) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_allgather_ops(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + + def allgather(output_ts, input_ts): + work = pg.allgather(output_ts, input_ts) + return work.wait() + + tensors = [torch.empty(2, 2).fill_(2).xpu(device=i) for i in local_device_ids] + output_tensors = [] + expected_output = [] + + output_per_gpu = ( + [torch.empty(2, 2).fill_(-1)] * len(local_device_ids) * self.world_size + ) + expected_per_gpu = ( + [torch.empty(2, 2).fill_(2)] * len(local_device_ids) * self.world_size + ) + + for gpu in local_device_ids: + output_tensors.append([t.xpu(device=gpu) for t in output_per_gpu]) + expected_output.append([t.xpu(device=gpu) for t in expected_per_gpu]) + + result = allgather(output_tensors, tensors) + + # Verification + self.assertEqual(output_tensors, expected_output) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_allgather_base_ops(self): + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] + + def allgather_base(output_t, input_t): + work = pg._allgather_base(output_t, input_t) + work.wait() + + # allgather_base is GPU number agnostic. + # Each rank contribute one tensor regardless of GPU counts + tensor = torch.tensor([self.rank]).xpu(local_device_id) + output_t = torch.empty((self.world_size), dtype=tensor.dtype).xpu( + local_device_id + ) + + allgather_base(output_t, tensor) + + # Verification + self.assertEqual(torch.arange(self.world_size), output_t) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_allgather_base_basics(self): + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] + + def allgather_base(output_t, input_t): + work = pg._allgather_base(output_t, input_t) + work.wait() + + # anticipate an error + with self.assertRaisesRegex( + ValueError, + "output tensor size must be equal to world_size times input tensor size", + ): + tensor = torch.tensor([self.rank]).xpu(local_device_id) + output_t = torch.empty((self.world_size + 1), dtype=tensor.dtype).xpu( + local_device_id + ) + # fails the check because output_t is not correctly sized + allgather_base(output_t, tensor) + + # anticipate an error + with self.assertRaisesRegex( + TypeError, "output tensor must have the same type as input tensor" + ): + tensor = torch.tensor([self.rank], dtype=torch.float).xpu(local_device_id) + output_t = torch.empty((self.world_size + 1), dtype=torch.long).xpu( + local_device_id + ) + # fails the check because the dtype is different + allgather_base(output_t, tensor) + + # TODO: wait gather + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_gather_ops(self): + # pg = self.pg + # local_device_ids = self.rank_to_GPU[self.rank] + # num_gpus = len(local_device_ids) + + # def gather(output_t, input_t, rootRank): + # opts = c10d.GatherOptions() + # opts.rootRank = rootRank + # if rootRank == self.rank: + # work = pg.gather(output_t, input_t, opts) + # else: + # work = pg.gather([], input_t, opts) + # work.wait() + + # # init input + # tensors = [] + # for device_id in local_device_ids: + # 
tensors.append(torch.tensor([self.rank]).xpu(device_id)) + + # # init output + # output_ts = [] + # for idx in range(num_gpus): + # gpu_idx = local_device_ids[idx] + # output_ts.append([]) + # for rank in range(self.world_size): + # output_ts[idx].append(torch.tensor([-1]).xpu(gpu_idx)) + + # expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] + # for rank in range(self.world_size): + # gather(output_ts, tensors, rank) + # if rank == self.rank: + # self.assertEqual(expected, output_ts) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_gather_stress(self): + # pg = self.pg + # local_device_ids = self.rank_to_GPU[self.rank] + # num_gpus = len(local_device_ids) + + # def gather(output_t, input_t, rootRank): + # opts = c10d.GatherOptions() + # opts.rootRank = rootRank + # if rootRank == self.rank: + # work = pg.gather(output_t, input_t, opts) + # else: + # work = pg.gather([], input_t, opts) + # work.wait() + + # stress_length = 1000 + + # # init input + # tensors = [] + # for i in range(stress_length): + # tensors.append([]) + # for device_id in local_device_ids: + # tensors[i].append(torch.tensor([self.rank]).xpu(device_id)) + + # # init output + # output_ts = [] + # for i in range(stress_length): + # output_ts.append([[] for _ in range(num_gpus)]) + # for idx, ls in enumerate(output_ts[i]): + # gpu_idx = local_device_ids[idx] + # for _ in range(self.world_size): + # ls.append(torch.tensor([-1]).xpu(gpu_idx)) + + # expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] + # for i in range(stress_length): + # for rank in range(self.world_size): + # gather(output_ts[i], tensors[i], rank) + # # Verification + # if rank == self.rank: + # self.assertEqual(output_ts[i], expected) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_gather_checks(self): + # pg = self.pg + # device_id = self.rank_to_GPU[self.rank][0] + + # # init input + # tensor = torch.tensor([self.rank]).xpu(device_id) + + # # init output + # output_ts = [] + # for rank in range(self.world_size): + # output_ts.append(torch.tensor([-1]).xpu(device_id)) + + # with self.assertRaisesRegex(ValueError, "invalid root rank"): + # opts = c10d.GatherOptions() + # opts.rootRank = -1 + # pg.gather([output_ts], [tensor], opts) + + # with self.assertRaisesRegex(TypeError, "incompatible function arguments"): + # pg.gather([output_ts], [tensor], 0) + + # with self.assertRaisesRegex(ValueError, "invalid root rank"): + # opts = c10d.GatherOptions() + # opts.rootRank = self.world_size + # pg.gather([output_ts], [tensor], opts) + + # with self.assertRaisesRegex( + # # throws error message from dispatcher + # RuntimeError, + # "There were no tensor arguments to this function", + # ): + # opts = c10d.GatherOptions() + # opts.rootRank = 0 + # pg.gather([output_ts], [], opts) + + # TODO: wait scatter + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_scatter_ops(self): + # pg = self.pg + # local_device_ids = self.rank_to_GPU[self.rank] + # num_gpus = len(local_device_ids) + + # def scatter(output_t, input_t, rootRank): + # opts = c10d.ScatterOptions() + # opts.rootRank = rootRank + # if rootRank == self.rank: + # work = pg.scatter(output_t, input_t, opts) + # else: + # work = pg.scatter(output_t, [], opts) + # work.wait() + + # # init output + # tensors = [] + # for device_id in local_device_ids: + # 
tensors.append(torch.tensor([-1]).xpu(device_id)) + + # # init input + # scatter_list = [] + # for idx in range(num_gpus): + # gpu_idx = local_device_ids[idx] + # scatter_list.append([]) + # for rank in range(self.world_size): + # scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) + + # # test each rank to scatter + # expected = [torch.tensor([self.rank])] + # for rank in range(self.world_size): + # scatter(tensors, scatter_list, rank) + # self.assertEqual(expected, tensors) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_scatter_stress(self): + # pg = self.pg + # local_device_ids = self.rank_to_GPU[self.rank] + # num_gpus = len(local_device_ids) + + # def scatter(output_t, input_t, rootRank): + # opts = c10d.ScatterOptions() + # opts.rootRank = rootRank + # if rootRank == self.rank: + # work = pg.scatter(output_t, input_t, opts) + # else: + # work = pg.scatter(output_t, [], opts) + # work.wait() + + # stress_length = 1000 + + # # init output + # tensors = [] + # for i in range(stress_length): + # tensors.append([]) + # for device_id in local_device_ids: + # tensors[i].append(torch.tensor([-1]).xpu(device_id)) + + # # init input + # scatter_list = [] + # for i in range(stress_length): + # scatter_list.append([[] for _ in range(num_gpus)]) + # for idx, ls in enumerate(scatter_list[i]): + # gpu_idx = local_device_ids[idx] + # for rank in range(self.world_size): + # ls.append(torch.tensor([rank]).xpu(gpu_idx)) + + # # test each rank to scatter + # expected = [torch.tensor([self.rank])] + # for i in range(stress_length): + # for rank in range(self.world_size): + # scatter(tensors[i], scatter_list[i], rank) + # # Verification + # self.assertEqual(tensors[i], expected) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_scatter_checks(self): + # pg = self.pg + # local_device_ids = self.rank_to_GPU[self.rank] + # num_gpus = len(local_device_ids) + + # # init output + # tensors = [] + # for device_id in local_device_ids: + # tensors.append(torch.tensor([-1]).xpu(device_id)) + + # # init input + # scatter_list = [] + # for idx in range(num_gpus): + # gpu_idx = local_device_ids[idx] + # scatter_list.append([]) + # for rank in range(self.world_size): + # scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) + + # with self.assertRaisesRegex(ValueError, "invalid root rank"): + # opts = c10d.ScatterOptions() + # opts.rootRank = -1 + # pg.scatter(tensors, scatter_list, opts) + + # with self.assertRaisesRegex(TypeError, "incompatible function arguments"): + # pg.scatter(tensors, scatter_list, 0) + + # with self.assertRaisesRegex(ValueError, "invalid root rank"): + # opts = c10d.ScatterOptions() + # opts.rootRank = self.world_size + # pg.scatter(tensors, scatter_list, opts) + + # with self.assertRaisesRegex( + # # throws error message from dispatcher + # RuntimeError, + # "There were no tensor arguments to this function", + # ): + # opts = c10d.ScatterOptions() + # opts.rootRank = 0 + # pg.scatter([], scatter_list, opts) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_reduce_scatter_base_basics(self): + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] + + def reduce_scatter_base(output_t, input_t): + work = pg._reduce_scatter_base(output_t, input_t) + work.wait() + + # anticipate an error + with self.assertRaisesRegex( + ValueError, + "input tensor must be the same size 
as output size times world size", + ): + input_t = torch.tensor([self.rank]).xpu(local_device_id) + output_t = torch.empty((self.world_size + 1), dtype=input_t.dtype).xpu( + local_device_id + ) + # fails the check because output_t is not correctly sized + reduce_scatter_base(output_t, input_t) + + # anticipate an error + with self.assertRaisesRegex( + TypeError, "input tensor must be the same type as the output tensor." + ): + tensor = torch.tensor([self.rank], dtype=torch.float).xpu(local_device_id) + output_t = torch.empty((self.world_size + 1), dtype=torch.long).xpu( + local_device_id + ) + # fails the check because the dtype is different + reduce_scatter_base(output_t, tensor) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_reduce_scatter_ops(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) + + def reduce_scatter(outputs, input_lists, op): + opts = c10d.ReduceScatterOptions() + opts.reduceOp = op + work = pg.reduce_scatter(outputs, input_lists, opts) + work.wait() + + output = [torch.tensor([0]).xpu(i) for i in local_device_ids] + + # GPU/rank + # 0 [1], [2], [3], [4] + # 1 [2], [3], [4], [5] + # 2 [3], [4], [5], [6] + # 3 [4], [5], [6], [7] + + # Sum + tensor_lists = [] + input_per_gpu = [] + + for i in range(self.world_size): + input_per_gpu.append(torch.tensor([self.rank + i + 1])) + + for gpu in local_device_ids: + tensor_lists.append([t.xpu(device=gpu) for t in input_per_gpu]) + + reduce_scatter(output, tensor_lists, c10d.ReduceOp.SUM) + + for i in range(num_gpus): + expected = torch.tensor( + [ + (1 + self.world_size) * self.world_size // 2 + + self.world_size * self.rank + ] + ) + + self.assertEqual(expected, output[i]) + + # Min + reduce_scatter(output, tensor_lists, c10d.ReduceOp.MIN) + + for i in range(num_gpus): + expected = torch.tensor([self.rank + 1 + i]) + self.assertEqual(expected, output[i]) + + # Max + reduce_scatter(output, tensor_lists, c10d.ReduceOp.MAX) + + for i in range(num_gpus): + expected = torch.tensor([self.rank + self.world_size + i]) + self.assertEqual(expected, output[i]) + + # Product + reduce_scatter(output, tensor_lists, c10d.ReduceOp.PRODUCT) + + # math package don't have math.perm until python 3.8, so + # we implement a naive version here. + def perm(n, k): + prod_val = n + for val in range(n - k + 1, n): + prod_val *= val + return prod_val + + for i in range(num_gpus): + prod_val = perm(self.rank + self.world_size, self.world_size) + + expected = torch.tensor([prod_val]) + self.assertEqual(expected, output[i]) + + # Test the input params overridden scenarios, aka, when the input is + # a list and output is just one tensor. 
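+        # (The single-tensor-output form below is expected to produce the same
+        #  reductions as the list variant above; only the call signature differs.)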
+ # Sum + output_tensor = torch.empty_like(input_per_gpu[0][0]).xpu(self.rank) + input_list = [tensor[0].xpu(self.rank) for tensor in input_per_gpu] + pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.SUM).wait() + expected = torch.tensor( + (1 + self.world_size) * self.world_size // 2 + self.world_size * self.rank + ) + self.assertEqual(expected, output_tensor) + + # Min + pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.MIN).wait() + expected = torch.tensor(self.rank + 1) + self.assertEqual(expected, output_tensor) + + # Max + pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.MAX).wait() + expected = torch.tensor(self.rank + self.world_size) + self.assertEqual(expected, output_tensor) + + # Product + pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.PRODUCT).wait() + prod_val = self.rank + 1 + for k in range(1, self.world_size): + prod_val = prod_val * (self.rank + 1 + k) + expected = torch.tensor(prod_val) + self.assertEqual(expected, output_tensor) + + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_reduce_scatter_base_ops(self): + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] + + def reduce_scatter_base(output_t, input_t): + work = pg._reduce_scatter_base(output_t, input_t) + work.wait() + + # reduce_scatter_base is GPU number agnostic. + # Each rank contribute one tensor regardless of GPU counts + output_t = torch.empty([1]).xpu(local_device_id) + tensor = torch.arange(self.world_size, dtype=output_t.dtype).xpu( + local_device_id + ) + + reduce_scatter_base(output_t, tensor) + + # Verification + self.assertEqual(output_t[0], self.rank * self.world_size) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_barrier(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + + def allreduce(tensors): + opts = c10d.AllreduceOptions() + work = pg.allreduce(tensors, opts) + return work + + # Making the collective to operate on + # 1, 2, 3, 4, .... 
len(local_device_ids) GPUs + tensors_list = [[] for _ in range(len(local_device_ids))] + + for i in range(1, len(local_device_ids) + 1): + for j in range(i): + tensors_list[i - 1].append( + torch.tensor([j + 1]).xpu(local_device_ids[j]) + ) + + works = [] + for tensors in tensors_list: + work = allreduce(tensors) + works.append(work) + + # Barrier will ensure that all previous work is completed + pg.barrier().wait() + + for i in range(1, len(local_device_ids) + 1): + for j in range(i): + self.assertEqual( + torch.tensor([(j + 1) * self.world_size]), tensors_list[i - 1][j] + ) + + # TODO: wait send/recv + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_send_recv(self): + # pg = self.pg + # device = self.rank_to_GPU[self.rank][0] + + # # Generate the same random tensor + # torch.manual_seed(0) + # send_tensor = torch.rand(10, 10, device=device) + # if self.rank == 0: + # dist.send(send_tensor, 1) + # if self.rank == 1: + # recv_tensor = torch.rand(10, 10, device=device) + # dist.recv(recv_tensor, 0) + # self.assertEqual(send_tensor, recv_tensor) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_send_recv_complex(self): + # pg = self.pg + # device = self.rank_to_GPU[self.rank][0] + + # # Generate the same random tensor + # torch.manual_seed(0) + # send_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) + # if self.rank == 0: + # dist.send(send_tensor, 1) + # if self.rank == 1: + # recv_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) + # dist.recv(recv_tensor, 0) + # self.assertEqual(send_tensor, recv_tensor) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_send_recv_object_list(self): + # device = self.rank_to_GPU[self.rank][0] + + # val = 99 if self.rank == 0 else None + # object_list = [val] * self.world_size + # if self.rank == 0: + # dist.send_object_list(object_list, 1, device=device) + # if self.rank == 1: + # dist.recv_object_list(object_list, 0, device=device) + # self.assertEqual(object_list[0], 99) + + +if __name__ == "__main__": + rank = int(os.getenv("RANK", -1)) + world_size = int(os.getenv("WORLD_SIZE", 2)) + + if rank != -1: + # Launched with torchrun or other multi-proc launchers. Directly run the test. + ProcessGroupXCCLOpTest.run_rank(rank, world_size) + else: + # Launched as a single process. Spawn subprocess to run the tests. + # Also need a rendezvous file for `init_process_group` purpose. 
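+        # The spawned ranks are expected to rendezvous through rdvz_file; for a
+        # real multi-GPU run, the `torchrun --standalone --nproc-per-node 2`
+        # invocation documented at the top of this file is the simpler path.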
+ rdvz_file = tempfile.NamedTemporaryFile(delete=False).name + torch.multiprocessing.spawn( + ProcessGroupXCCLOpTest.run_rank, + nprocs=world_size, + args=(world_size, rdvz_file), + ) + From 682f40fe87c2fa2f725eae3550f557fdc19c60ef Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 18 Sep 2024 06:16:31 +0000 Subject: [PATCH 57/96] Support all2all_base --- .../distributed/c10d/ProcessGroupXCCL.cpp | 102 +++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +- 2 files changed, 101 insertions(+), 5 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index c7d9a10d9bf706..7f3e3719cdd307 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -100,12 +100,23 @@ bool check_same_size(const std::vector& input_tensors) { return true; } -void check_xpu_single_tensor(const at::Tensor& tensor) { +void check_xpu_single_tensor( + const at::Tensor& tensor, + const bool p2p = false // whether operation is a P2P operation +) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); } + // Skip the following requirements for P2P operations if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + if (p2p) { + TORCH_WARN_ONCE( + "Detected non-contiguous tensor in P2P operations. It is user " + "responsibility to guarantee that source and destination tensors have " + "the same contiguity format."); + } else { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + } } } @@ -1108,6 +1119,93 @@ c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { return work; } +c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + std::vector& outputSplitSizes, + std::vector& inputSplitSizes, + const AllToAllOptions& /* unused */) { + check_xpu_single_tensor(outputTensor, true); + check_xpu_single_tensor(inputTensor, true); + if (outputSplitSizes.size() == 0 && inputSplitSizes.size() == 0) { + TORCH_CHECK( + outputTensor.numel() == inputTensor.numel() && + outputTensor.scalar_type() == inputTensor.scalar_type(), + "xpu_alltoall_base: tensors are not equal in size or data type"); + TORCH_CHECK( + outputTensor.size(0) % size_ == 0, + "xpu_alltoall_base: tensor's dim 0 does not divide equally across group size"); + return collective( + inputTensor, + outputTensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::alltoall_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(output.scalar_type()); + ccl::event ret_evt; + ret_evt = ccl::alltoall( + input.data_ptr(), + output.data_ptr(), + (size_t)output.numel() / comm.size(), + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + OpType::ALLTOALL_BASE); + } else { + c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); + c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); + + return collective( + inputTensor, + outputTensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::alltoall_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + std::vector sendCounts(size_); + std::vector recvCounts(size_); + bool inputSplitsEqual = inputSplitSizes.size() == 0; + bool outputSplitsEqual = outputSplitSizes.size() == 0; + + size_t inLen = 
input.numel(); + size_t outLen = output.numel(); + if (inLen) + inLen /= (inputSplitsEqual ? size_ : input.size(0)); + if (outLen) + outLen /= (outputSplitsEqual ? size_ : output.size(0)); + + for (int i = 0; i < size_; i++) { + sendCounts[i] = + (inputSplitsEqual ? inLen : inputSplitSizes[i] * inLen); + recvCounts[i] = + (outputSplitsEqual ? outLen : outputSplitSizes[i] * outLen); + } + auto xcclDataType = getXcclDataType(output.scalar_type()); + ccl::event ret_evt; + + ret_evt = ccl::alltoallv( + input.data_ptr(), + sendCounts, + output.data_ptr(), + recvCounts, + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + OpType::ALLTOALL_BASE); + } +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 8e17435a4ce1b6..0147ef3744384e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -278,9 +278,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Tensor& inputTensor, std::vector& outputSplitSizes, std::vector& inputSplitSizes, - const AllToAllOptions& opts = AllToAllOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::alltoall_base not implemented"); - } + const AllToAllOptions& opts = AllToAllOptions()) override; c10::intrusive_ptr alltoall( std::vector& outputTensors, From 2694617e6c4027875360e050051fa5128bcd7261 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 18 Sep 2024 06:53:24 +0000 Subject: [PATCH 58/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 7f3e3719cdd307..16766b1190b072 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1158,7 +1158,8 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( return ret_evt; }, OpType::ALLTOALL_BASE); - } else { + } + else { c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); @@ -1167,7 +1168,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::alltoall_attr attr, + ccl::alltoallv_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { std::vector sendCounts(size_); From 612df4271896bde6e0d30cb0eb2a07a2bae346cf Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 18 Sep 2024 08:57:33 +0000 Subject: [PATCH 59/96] support all2all --- test/distributed/test_c10d_ops_xccl.py | 49 ++++---- .../distributed/c10d/ProcessGroupXCCL.cpp | 116 +++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +- 3 files changed, 139 insertions(+), 30 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py index 5d041058ead41b..a59d03a1750e1c 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -181,31 +181,30 @@ def allreduce(tensors, op): with self.assertRaisesRegex(ValueError, "Cannot use " + err + " with XCCL"): allreduce(tensors, op) - # TODO: wait all2all - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_alltoall_ops_with_xpufree_race(self): - # pg = self.pg - # opts = c10d.AllToAllOptions() - # local_device = f"xpu:{self.rank_to_GPU[self.rank][0]}" - # 
torch.xpu.set_device(local_device) - # input = torch.rand(1000, 1000, device=local_device) - # output = torch.rand(1000, 1000, device=local_device) - # race_tensors = [] - # # create some tensors to race with alltoall collective - # for _ in range(10): - # tmp = [] - # for i in range(5): - # tmp.append(torch.rand(10 ** (3 + i), device=local_device)) - # race_tensors.append(tmp) - - # for i in range(10): - # race_tensors.pop() - # work = pg.alltoall_base(output, input, [], [], opts) - # # this triggers xpuFree - # torch.xpu.empty_cache() - # work.wait() - # torch.xpu.synchronize(device=local_device) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_alltoall_ops_with_xpufree_race(self): + pg = self.pg + opts = c10d.AllToAllOptions() + local_device = f"xpu:{self.rank_to_GPU[self.rank][0]}" + torch.xpu.set_device(local_device) + input = torch.rand(1000, 1000, device=local_device) + output = torch.rand(1000, 1000, device=local_device) + race_tensors = [] + # create some tensors to race with alltoall collective + for _ in range(10): + tmp = [] + for i in range(5): + tmp.append(torch.rand(10 ** (3 + i), device=local_device)) + race_tensors.append(tmp) + + for i in range(10): + race_tensors.pop() + work = pg.alltoall_base(output, input, [], [], opts) + # this triggers xpuFree + torch.xpu.empty_cache() + work.wait() + torch.xpu.synchronize(device=local_device) # TODO: wait reduce # @requires_xccl() diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 16766b1190b072..5d43694def146c 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -91,6 +91,44 @@ XCCL_KVS get_kvs(int rank, c10d::Store& store) { return kvs; } +bool computeLengthsAndCheckAndGetFlat( + const std::vector& tensors, + std::vector& lengths, + at::Tensor& flatTensor, + int64_t& flatLength) { + int64_t groupSize = tensors.size(); + auto firstTensor = tensors[0]; + int64_t totalSize = 0; + bool isFlat = true; + + auto storage = firstTensor.storage(); + int64_t firstStorageOffset = firstTensor.storage_offset(); + + for (int i = 0; i < groupSize; i++) { + auto& curTensor = tensors[i]; + int64_t length = curTensor.numel(); + lengths[i] = length; + totalSize += length; + + if (isFlat && + (!storage.is_alias_of(curTensor.storage()) || + curTensor.storage_offset() != + firstStorageOffset + totalSize - length)) { + isFlat = false; + } + } + + flatLength = totalSize; + + if (isFlat) { + flatTensor = firstTensor; + } else { + flatTensor = at::empty({totalSize}, firstTensor.options()); + } + + return isFlat; +} + bool check_same_size(const std::vector& input_tensors) { for (const auto& input_tensor : input_tensors) { if (!input_tensors[0].is_same_size(input_tensor)) { @@ -1158,8 +1196,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( return ret_evt; }, OpType::ALLTOALL_BASE); - } - else { + } else { c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); @@ -1207,6 +1244,81 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( } } +c10::intrusive_ptr ProcessGroupXCCL::alltoall( + std::vector& outputTensors, + std::vector& inputTensors, + const AllToAllOptions& /* unused */) { + auto device = outputTensors[0].device(); + for (const auto r : c10::irange(outputTensors.size())) { + check_xpu_single_tensor(outputTensors[r], true); + check_xpu_single_tensor(inputTensors[r], true); + 
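+    // p2p=true only relaxes the contiguity requirement to a warning (see
+    // check_xpu_single_tensor); tensors must still be dense XPU tensors on a
+    // single device, which the check below enforces.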
TORCH_CHECK( + device == outputTensors[r].device() && + device == inputTensors[r].device(), + "Tensors must be on the same device") + } + + return collective( + inputTensors, + outputTensors, + [&](at::Tensor& /* unused */, + at::Tensor& /* unused */, + ccl::alltoallv_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::OptionalStreamGuard stream_guard(stream.unwrap()); + at::Tensor flatInput; + at::Tensor flatOutput; + + std::vector sendCounts(size_); + std::vector recvCounts(size_); + + int64_t flatSendCount; + int64_t flatRecvCount; + + bool isInputFlat = computeLengthsAndCheckAndGetFlat( + inputTensors, sendCounts, flatInput, flatSendCount); + bool isOutputFlat = computeLengthsAndCheckAndGetFlat( + outputTensors, recvCounts, flatOutput, flatRecvCount); + if (!isInputFlat) { + auto flatInputSplits = flatInput.split_with_sizes( + c10::IntArrayRef((int64_t*)sendCounts.data(), sendCounts.size()), + 0); + + for (int i = 0; i < size_; i++) { + flatInputSplits[i].copy_(inputTensors[i].view({-1})); + } + } + + auto xcclDataType = getXcclDataType(flatOutput.scalar_type()); + ccl::event ret_evt; + ret_evt = ccl::alltoallv( + flatInput.data_ptr(), + sendCounts, + flatOutput.data_ptr(), + recvCounts, + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + + if (!isOutputFlat) { + ret_evt.wait(); + auto flatOutputSplits = flatOutput.split_with_sizes( + c10::IntArrayRef((int64_t*)recvCounts.data(), recvCounts.size()), + 0); + + for (int i = 0; i < size_; i++) { + outputTensors[i].view({-1}).copy_(flatOutputSplits[i]); + } + } + + stream.synchronize(); + return ret_evt; + }, + OpType::ALLTOALL); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 0147ef3744384e..cfef4ace195f26 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -283,9 +283,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr alltoall( std::vector& outputTensors, std::vector& inputTensors, - const AllToAllOptions& opts = AllToAllOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::alltoall not implemented"); - } + const AllToAllOptions& opts = AllToAllOptions()) override; c10::intrusive_ptr send( std::vector& tensors, From 001dac2fafb6d658326d579bfb86216ef04e6077 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 19 Sep 2024 03:33:32 +0000 Subject: [PATCH 60/96] use lintrunner format code --- setup.py | 2 +- test/distributed/test_c10d_common.py | 6 +++- test/distributed/test_c10d_xccl.py | 33 ++++++++----------- torch/_C/_distributed_c10d.pyi | 2 +- torch/csrc/distributed/c10d/Ops.cpp | 2 +- torch/csrc/distributed/c10d/init.cpp | 1 - torch/distributed/distributed_c10d.py | 9 +++-- torch/testing/_internal/common_distributed.py | 2 +- 8 files changed, 29 insertions(+), 28 deletions(-) diff --git a/setup.py b/setup.py index e6191c0616db4a..ad48f4b0108633 100644 --- a/setup.py +++ b/setup.py @@ -648,7 +648,7 @@ def run(self): if cmake_cache_vars["USE_XCCL"]: report("-- Building XCCL library") else: - report("-- Not using XCCL") + report("-- Not using XCCL") if cmake_cache_vars["USE_DISTRIBUTED"]: if IS_WINDOWS: report("-- Building without distributed package") diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 0c1426d0e29c21..3e5538d57e38ae 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -66,7 +66,11 @@ 
def gpus_for_rank(world_size): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. """ - device_count = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() + device_count = ( + torch.xpu.device_count() + if torch.xpu.is_available() + else torch.cuda.device_count() + ) visible_devices = list(range(device_count)) gpus_per_process = device_count // world_size gpus_for_rank = [] diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py index 3843a695f766c9..704cdd414e554b 100644 --- a/test/distributed/test_c10d_xccl.py +++ b/test/distributed/test_c10d_xccl.py @@ -1,18 +1,11 @@ # Owner(s): ["oncall: distributed"] -import copy -import logging import math -import operator import os -import random import sys - import time -import tempfile from datetime import timedelta -from functools import reduce -from unittest import mock, SkipTest +from unittest import mock import torch import torch.distributed as c10d @@ -23,27 +16,23 @@ sys.exit(0) import test_c10d_common -from test_c10d_common import DoubleGpuNet, gpus_for_rank, ModuleForDdpCommHook import torch.distributed as dist -import torch.nn.functional as F import torch.testing._internal.common_utils as common -from torch import nn -from torch.nn.parallel import DistributedDataParallel from torch.testing._internal.common_distributed import ( + init_multigpu_helper, MultiProcessTestCase, requires_xccl, - init_multigpu_helper, - skip_if_lt_x_gpu, ) from torch.testing._internal.common_utils import ( - skip_but_pass_in_sandcastle_if, - TEST_XPU, retry_on_connect_failures, run_tests, + skip_but_pass_in_sandcastle_if, + TEST_XPU, TestCase, ) + def simple_reduce_tests(rank, world_size): tests = [ ( @@ -70,8 +59,10 @@ def simple_reduce_tests(rank, world_size): return tests + TEST_MULTIXPU = torch.xpu.device_count() > 1 + class RendezvousEnvTest(TestCase): @retry_on_connect_failures @requires_xccl() @@ -171,6 +162,7 @@ def withouts(d, keys): self.assertEqual(rank, 0) self.assertEqual(size, 1) + class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): @requires_xccl() @retry_on_connect_failures @@ -178,8 +170,11 @@ class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): def test_default_store_timeout_nccl(self): self._test_default_store_timeout("xccl") + class ProcessGroupXCCLTest(MultiProcessTestCase): - def _create_process_group_xccl(self, timeout=timedelta(seconds=600), device_id=None): + def _create_process_group_xccl( + self, timeout=timedelta(seconds=600), device_id=None + ): store = c10d.FileStore(self.file_name, self.world_size) c10d.init_process_group( "xccl", @@ -286,7 +281,7 @@ def _test_allreduce_basics(self, fn): result = fut.value() self.assertEqual(expected, result[0], exact_dtype=False) - x = fn(torch.tensor([self.rank + 1.0], device = device)) + x = fn(torch.tensor([self.rank + 1.0], device=device)) fut = pg.allreduce(x).get_future() fut.wait() result = fut.value() @@ -300,11 +295,9 @@ def test_allreduce_basics(self): self._test_allreduce_basics(lambda t: t.clone()) - if __name__ == "__main__": assert ( not torch.xpu._initialized ), "test_distributed must not have initialized XPU context on main process" run_tests() - diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 53011cde6b178a..6033d969925972 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -705,4 +705,4 @@ class ProcessGroupXCCL(Backend): store: Store, rank: int, size: int, - ): ... 
\ No newline at end of file + ): ... diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index 48d2b3ed1bf69a..699c54236f6412 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -510,7 +510,7 @@ namespace { #define REGISTER_C10D_OP(FUNC) \ REGISTER_C10D_OP1(FUNC, CPU) \ REGISTER_C10D_OP1(FUNC, CUDA) \ - REGISTER_C10D_OP1(FUNC, XPU) \ + REGISTER_C10D_OP1(FUNC, XPU) \ REGISTER_C10D_OP1(FUNC, PrivateUse1) // Now we start to register ops with the three device keys diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 5d200bb6eeb9cf..e3ed6d6bd4bcb4 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -41,7 +41,6 @@ #include #endif - #include #include #include diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index d0781765c090ff..9fa3224873c9fc 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1111,10 +1111,12 @@ def is_ucc_available() -> bool: """Check if the UCC backend is available.""" return _UCC_AVAILABLE + def is_xccl_available() -> bool: """Check if the XCCL backend is available.""" return _XCCL_AVAILABLE + def is_backend_available(backend: str) -> bool: """ Check backend availability. @@ -1367,7 +1369,7 @@ def _set_pg_timeout(timeout: timedelta, group: Optional[ProcessGroup] = None) -> backends.add(backend) # type: ignore[arg-type] elif is_gloo_available() and isinstance(backend, ProcessGroupGloo): backends.add(backend) # type: ignore[arg-type] - if torch.device("xpu") in devices and is_xpu_available(): + if torch.device("xpu") in devices and is_xccl_available(): backend = group._get_backend(torch.device("xpu")) if isinstance(backend, ProcessGroupXCCL): backends.add(backend) # type: ignore[arg-type] @@ -1672,7 +1674,10 @@ def _new_process_group_helper( "created, please use a different group name" ) - if device_id is not None and (device_id.index is None or (device_id.type != "cuda" and device_id.type != "xpu")): + if device_id is not None and ( + device_id.index is None + or (device_id.type != "cuda" and device_id.type != "xpu") + ): raise ValueError( "init_process_group device_id parameter must be a cuda device with an " "id, e.g. 
cuda:0, xpu, not just cuda or xpu or cpu" diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 554114b7bbcb1c..26bdcce6103120 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -326,7 +326,7 @@ def requires_xccl(): not c10d.is_xccl_available(), "c10d was not compiled with the XCCL backend", ) - + def requires_ucc(): return skip_but_pass_in_sandcastle_if( not c10d.is_ucc_available(), From f13b44908b4b6366f8af7bb2bdbfd4d1a2e3758c Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 19 Sep 2024 05:35:15 +0000 Subject: [PATCH 61/96] rm allgatherv align with nccl --- .../distributed/c10d/ProcessGroupXCCL.cpp | 126 ++++++++---------- .../distributed/c10d/ProcessGroupXCCL.hpp | 41 +++--- 2 files changed, 74 insertions(+), 93 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index c7d9a10d9bf706..25181d2b9d2498 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -268,11 +268,7 @@ bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { } constexpr const char* MULTI_DEVICE_ERROR_MSG = - "Expecting one tensor only but got multiple. You are probably using multiple " - "devices under one thread. The support for such usage has been deprecated. " - "For details, please refer to " - "https://pytorch.org/docs/stable/distributed.html#multi-gpu-collective-functions. " - "ProcessGroupXCCL continues supporting multi-process and multi-thread modes."; + "Expecting one tensor only but got multiple"; ProcessGroupXCCL::ProcessGroupXCCL( const c10::intrusive_ptr& store, @@ -425,17 +421,10 @@ c10::intrusive_ptr ProcessGroupXCCL::endCoalescing() { return endCoalescing(OpType::COALESCED); } -// align with single-device style, input_t and output_t due to -// allgatherv need vector output -template < - typename Fn, - typename input_t, - typename output_t, - typename PreProcess, - typename PostProcess> +template c10::intrusive_ptr ProcessGroupXCCL::collective( - std::vector& inputs, - std::vector& outputs, + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, @@ -517,28 +506,23 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( return work; } -template < - typename Fn, - typename input_t, - typename output_t, - typename PreProcess, - typename PostProcess> +template c10::intrusive_ptr ProcessGroupXCCL::collective( - input_t& input, - output_t& output, + at::Tensor& input, + at::Tensor& output, Fn fn, PreProcess pre, PostProcess post, OpType opType) { - auto inputs = std::vector{input}; - auto outputs = std::vector{output}; + auto inputs = std::vector{input}; + auto outputs = std::vector{output}; return collective(inputs, outputs, fn, pre, post, opType); } -template +template c10::intrusive_ptr ProcessGroupXCCL::collective( - input_t& input, - output_t& output, + at::Tensor& input, + at::Tensor& output, Fn fn, OpType opType) { return collective( @@ -720,6 +704,39 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( OpType::BROADCAST); } +c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const BroadcastOptions& opts) { + if (outputTensor.numel() != inputTensor.numel()) { + C10_THROW_ERROR( + ValueError, + "Tensor input and output of _broadcast_oop must have the same number of elements "); + } + const auto root = opts.rootRank + opts.rootTensor; + return 
collective(
+      inputTensor,
+      outputTensor,
+      [&](at::Tensor& input,
+          at::Tensor& output,
+          ccl::broadcast_attr attr,
+          xcclComm_t& comm,
+          at::xpu::XPUStream& stream) {
+        auto xcclDataType = getXcclDataType(input.scalar_type());
+        ccl::event ret_evt;
+        ret_evt = ccl::broadcast(
+            input.data_ptr(),
+            (size_t)input.numel(),
+            xcclDataType,
+            root,
+            comm,
+            ccl::create_stream(stream.queue()),
+            attr);
+        return ret_evt;
+      },
+      OpType::BROADCAST);
+}
+
 c10::intrusive_ptr<Work> ProcessGroupXCCL::_reduce_oop(
     at::Tensor& outputTensor,
     at::Tensor& inputTensor,
@@ -808,48 +825,17 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allgather(
       },
       OpType::ALLGATHER);
   } else {
-    // xccl implemented allgatherv, so broadcast_oop not needed
-    return collective(
-        inputTensor,
-        outputTensors_,
-        [=](at::Tensor& input,
-            const std::vector<at::Tensor>& outputs,
-            ccl::allgatherv_attr attr,
-            xcclComm_t& comm,
-            at::xpu::XPUStream& stream) {
-          ccl::event ret_evt;
-          auto xcclDataType = getXcclDataType(input.scalar_type());
-
-          std::vector<size_t> recvCounts(outputs.size(), 0);
-          std::transform(
-              outputs.begin(),
-              outputs.end(),
-              recvCounts.begin(),
-              [](const at::Tensor& t) { return t.numel(); });
-
-          TORCH_CHECK(
-              (size_t)input.numel() == recvCounts[rank_],
-              "allgather: send and recv count doesn't match");
-
-          std::vector<void*> recvBufs(outputs.size(), nullptr);
-          std::transform(
-              outputs.begin(),
-              outputs.end(),
-              recvBufs.begin(),
-              [](const at::Tensor& t) { return t.data_ptr(); });
-
-          ret_evt = ccl::allgatherv(
-              input.data_ptr(),
-              (size_t)input.numel(),
-              recvBufs,
-              recvCounts,
-              xcclDataType,
-              comm,
-              ccl::create_stream(stream.queue()),
-              attr);
-          return ret_evt;
-        },
-        c10d::OpType::ALLGATHER);
+    const auto num_reduces = outputTensors_.size();
+    startCoalescing();
+    for (const int i : c10::irange(num_reduces)) {
+      auto& output = outputTensors_[i];
+      auto& input = (i == rank_) ? 
inputTensor : output; + auto broadcastOpts = BroadcastOptions{ + static_cast(i), static_cast(0), opts.timeout}; + _broadcast_oop(output, input, broadcastOpts); + } + auto work = endCoalescing(OpType::ALLGATHER); + return work; } } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 8e17435a4ce1b6..f50c2bbc4dc4bd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -163,36 +163,26 @@ class TORCH_API ProcessGroupXCCL : public Backend { const std::vector& inputs = {}, const std::vector& outputs = {}); - template + template c10::intrusive_ptr collective( - input_t& input, - output_t& output, + at::Tensor& input, + at::Tensor& output, Fn fn, OpType opType); - template < - typename Fn, - typename input_t, - typename output_t, - typename PreProcess, - typename PostProcess> + template c10::intrusive_ptr collective( - input_t& input, - output_t& output, + at::Tensor& input, + at::Tensor& output, Fn fn, PreProcess pre, PostProcess post, OpType opType); - template < - typename Fn, - typename input_t, - typename output_t, - typename PreProcess, - typename PostProcess> + template c10::intrusive_ptr collective( - std::vector& inputs, - std::vector& outputs, + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, @@ -224,15 +214,20 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::reduce not implemented"); } - c10::intrusive_ptr broadcast( - std::vector& tensors, - const BroadcastOptions& opts = BroadcastOptions()) override; - c10::intrusive_ptr _reduce_oop( at::Tensor& outputTensors, at::Tensor& inputTensors, const ReduceOptions& opts = ReduceOptions()); + c10::intrusive_ptr broadcast( + std::vector& tensors, + const BroadcastOptions& opts = BroadcastOptions()) override; + + c10::intrusive_ptr _broadcast_oop( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const BroadcastOptions& opts); + c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, From af29d9650d9be5d99ea3f0b62d62c8ac1107e994 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 19 Sep 2024 06:03:24 +0000 Subject: [PATCH 62/96] Support reduce --- test/distributed/test_c10d_ops_xccl.py | 169 ++++++++---------- .../distributed/c10d/ProcessGroupXCCL.cpp | 62 +++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +- 3 files changed, 135 insertions(+), 100 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py index a59d03a1750e1c..0e278b0e2deab4 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -59,42 +59,41 @@ def rank_to_GPU(self): # return rank to GPU map return init_multigpu_helper(self.world_size, "xccl") - # TODO: wait reduce - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_empty_tensors(self): - # pg = self.pg - # local_device_idx = self.rank_to_GPU[self.rank][0] - - # xs = [torch.FloatTensor([]).xpu(local_device_idx)] - # pg.broadcast(xs).wait() - # self.assertEqual(0, xs[0].numel()) - - # pg.allreduce(xs).wait() - # self.assertEqual(0, xs[0].numel()) - - # pg.reduce(xs).wait() - # self.assertEqual(0, xs[0].numel()) - - # ys = [ - # [ - # torch.FloatTensor([]).xpu(local_device_idx) - # for _ in range(self.world_size) - # ] - # ] - # pg.allgather(ys, xs).wait() - # for y in ys[0]: - # self.assertEqual(0, y.numel()) - - # ys = 
[torch.FloatTensor([]).xpu(local_device_idx)] - # xs = [ - # [ - # torch.FloatTensor([]).xpu(local_device_idx) - # for _ in range(self.world_size) - # ] - # ] - # pg.reduce_scatter(ys, xs).wait() - # self.assertEqual(0, ys[0].numel()) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_empty_tensors(self): + pg = self.pg + local_device_idx = self.rank_to_GPU[self.rank][0] + + xs = [torch.FloatTensor([]).xpu(local_device_idx)] + pg.broadcast(xs).wait() + self.assertEqual(0, xs[0].numel()) + + pg.allreduce(xs).wait() + self.assertEqual(0, xs[0].numel()) + + pg.reduce(xs).wait() + self.assertEqual(0, xs[0].numel()) + + ys = [ + [ + torch.FloatTensor([]).xpu(local_device_idx) + for _ in range(self.world_size) + ] + ] + pg.allgather(ys, xs).wait() + for y in ys[0]: + self.assertEqual(0, y.numel()) + + ys = [torch.FloatTensor([]).xpu(local_device_idx)] + xs = [ + [ + torch.FloatTensor([]).xpu(local_device_idx) + for _ in range(self.world_size) + ] + ] + pg.reduce_scatter(ys, xs).wait() + self.assertEqual(0, ys[0].numel()) @requires_xccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") @@ -206,71 +205,47 @@ def test_alltoall_ops_with_xpufree_race(self): work.wait() torch.xpu.synchronize(device=local_device) - # TODO: wait reduce - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_reduce_ops(self): - # pg = self.pg - # local_device_id = self.rank_to_GPU[self.rank][0] + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_reduce_ops(self): + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] - # def reduce(xs, rootRank, rootTensor, op=None): - # opts = c10d.ReduceOptions() - # opts.rootRank = rootRank - # opts.rootTensor = rootTensor - # if op: - # opts.reduceOp = op - # work = pg.reduce(xs, opts) - # work.wait() + def reduce(xs, rootRank, rootTensor, op=None): + opts = c10d.ReduceOptions() + opts.rootRank = rootRank + opts.rootTensor = rootTensor + if op: + opts.reduceOp = op + work = pg.reduce(xs, opts) + work.wait() - # # for every root tensor - # for rt in range(self.world_size): - # tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + # for every root tensor + for rt in range(self.world_size): + tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] - # reduce(tensors, rt, 0) + reduce(tensors, rt, 0) + + if self.rank == rt: + self.assertEqual( + torch.tensor([self.world_size * (self.world_size + 1) // 2]), + tensors[0], + ) + else: + self.assertEqual( + torch.tensor([self.rank + 1]), + tensors[0], + ) + + for op, err in zip( + (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR), + ("ReduceOp.BAND", "ReduceOp.BOR", "ReduceOp.BXOR"), + ): + with self.assertRaisesRegex( + ValueError, "Cannot use " + err + " with XCCL" + ): + reduce(tensors, self.rank, rt, op) - # if self.rank == rt: - # self.assertEqual( - # torch.tensor([self.world_size * (self.world_size + 1) // 2]), - # tensors[0], - # ) - # else: - # self.assertEqual( - # torch.tensor([self.rank + 1]), - # tensors[0], - # ) - - # for op, err in zip( - # (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR), - # ("ReduceOp.BAND", "ReduceOp.BOR", "ReduceOp.BXOR"), - # ): - # with self.assertRaisesRegex( - # ValueError, "Cannot use " + err + " with XCCL" - # ): - # reduce(tensors, self.rank, rt, op) - - # # Premul sum - # if torch.xpu.xccl.version() >= (2, 11, 1): - # for factor in 
(3.0, torch.tensor([5.0], device=local_device_id)): - # if isinstance(factor, torch.Tensor): - # factor_ref = factor.cpu().item() - # else: - # factor_ref = factor - # float_tensors = [ - # torch.tensor( - # [self.rank + 1.0], device=f"xpu:{local_device_id}" - # ) - # ] - # float_tensors_ref = [ - # torch.tensor( - # [(self.rank + 1.0) * factor_ref], - # device=f"xpu:{local_device_id}", - # ) - # ] - - # reduce(float_tensors_ref, rt, 0) - # reduce(float_tensors, rt, 0, c10d._make_xccl_premul_sum(factor)) - # if self.rank == rt: - # self.assertEqual(float_tensors_ref[0], float_tensors[0]) @requires_xccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 5d43694def146c..1ba775b9239879 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -228,6 +228,18 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { } } +bool complexViewAsRealAllowed(const ReduceOp reduceOp) { + switch (reduceOp) { + case ReduceOp::SUM: + return true; + case ReduceOp::UNUSED: + return true; + default: + return false; + } + return false; +} + } // namespace static std::mutex xcclCommDevIdxMapMutex; @@ -693,6 +705,14 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( const AllreduceOptions& opts) { TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); + if (tensor.is_complex()) { + TORCH_CHECK( + complexViewAsRealAllowed(opts.reduceOp), + "all_reduce does not support", + opts.reduceOp, + "on complex tensors"); + tensor = at::view_as_real(tensor); + } check_xpu_single_tensor(tensor); TORCH_CHECK( !isFloat8Type(tensor.scalar_type()), @@ -769,6 +789,48 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( OpType::BROADCAST); } +c10::intrusive_ptr ProcessGroupXCCL::reduce( + std::vector& tensors, + const ReduceOptions& opts) { + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + // @lint-ignore CLANGTIDY + auto tensor = tensors.back(); + if (tensor.is_complex()) { + TORCH_CHECK( + complexViewAsRealAllowed(opts.reduceOp), + "reduce does not support", + opts.reduceOp, + "on complex tensors"); + tensor = at::view_as_real(tensor); + } + check_xpu_single_tensor(tensor); + + return collective( + tensor, + tensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::reduce_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + const int root = opts.rootRank + opts.rootTensor; + const auto xcclDataType = getXcclDataType(input.scalar_type()); + const auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::reduce( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + xcclReduceOp, + root, + comm, + ccl::create_stream(stream.queue())); + return ret_evt; + }, + OpType::REDUCE); +} + c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( at::Tensor& outputTensor, at::Tensor& inputTensor, diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index cfef4ace195f26..f7b946aab603f0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -220,9 +220,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr reduce( std::vector& tensors, - const ReduceOptions& opts = ReduceOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::reduce not 
implemented"); - } + const ReduceOptions& opts = ReduceOptions()) override; c10::intrusive_ptr broadcast( std::vector& tensors, From 20b118822d797a5dd15c1cb6745336fcbb1e5aa7 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 19 Sep 2024 06:43:18 +0000 Subject: [PATCH 63/96] Support gather --- test/distributed/test_c10d_ops_xccl.py | 201 +++++++++--------- .../distributed/c10d/ProcessGroupXCCL.cpp | 96 +++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +- 3 files changed, 197 insertions(+), 104 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py index 0e278b0e2deab4..3076444f2e4786 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -332,119 +332,118 @@ def allgather_base(output_t, input_t): # fails the check because the dtype is different allgather_base(output_t, tensor) - # TODO: wait gather - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_gather_ops(self): - # pg = self.pg - # local_device_ids = self.rank_to_GPU[self.rank] - # num_gpus = len(local_device_ids) - - # def gather(output_t, input_t, rootRank): - # opts = c10d.GatherOptions() - # opts.rootRank = rootRank - # if rootRank == self.rank: - # work = pg.gather(output_t, input_t, opts) - # else: - # work = pg.gather([], input_t, opts) - # work.wait() - - # # init input - # tensors = [] - # for device_id in local_device_ids: - # tensors.append(torch.tensor([self.rank]).xpu(device_id)) - - # # init output - # output_ts = [] - # for idx in range(num_gpus): - # gpu_idx = local_device_ids[idx] - # output_ts.append([]) - # for rank in range(self.world_size): - # output_ts[idx].append(torch.tensor([-1]).xpu(gpu_idx)) - - # expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] - # for rank in range(self.world_size): - # gather(output_ts, tensors, rank) - # if rank == self.rank: - # self.assertEqual(expected, output_ts) - - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_gather_stress(self): - # pg = self.pg - # local_device_ids = self.rank_to_GPU[self.rank] - # num_gpus = len(local_device_ids) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_gather_ops(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) - # def gather(output_t, input_t, rootRank): - # opts = c10d.GatherOptions() - # opts.rootRank = rootRank - # if rootRank == self.rank: - # work = pg.gather(output_t, input_t, opts) - # else: - # work = pg.gather([], input_t, opts) - # work.wait() + def gather(output_t, input_t, rootRank): + opts = c10d.GatherOptions() + opts.rootRank = rootRank + if rootRank == self.rank: + work = pg.gather(output_t, input_t, opts) + else: + work = pg.gather([], input_t, opts) + work.wait() - # stress_length = 1000 + # init input + tensors = [] + for device_id in local_device_ids: + tensors.append(torch.tensor([self.rank]).xpu(device_id)) + + # init output + output_ts = [] + for idx in range(num_gpus): + gpu_idx = local_device_ids[idx] + output_ts.append([]) + for rank in range(self.world_size): + output_ts[idx].append(torch.tensor([-1]).xpu(gpu_idx)) + + expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] + for rank in range(self.world_size): + gather(output_ts, tensors, rank) + if rank == self.rank: + self.assertEqual(expected, output_ts) - # # init input - # 
tensors = [] - # for i in range(stress_length): - # tensors.append([]) - # for device_id in local_device_ids: - # tensors[i].append(torch.tensor([self.rank]).xpu(device_id)) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_gather_stress(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) - # # init output - # output_ts = [] - # for i in range(stress_length): - # output_ts.append([[] for _ in range(num_gpus)]) - # for idx, ls in enumerate(output_ts[i]): - # gpu_idx = local_device_ids[idx] - # for _ in range(self.world_size): - # ls.append(torch.tensor([-1]).xpu(gpu_idx)) + def gather(output_t, input_t, rootRank): + opts = c10d.GatherOptions() + opts.rootRank = rootRank + if rootRank == self.rank: + work = pg.gather(output_t, input_t, opts) + else: + work = pg.gather([], input_t, opts) + work.wait() - # expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] - # for i in range(stress_length): - # for rank in range(self.world_size): - # gather(output_ts[i], tensors[i], rank) - # # Verification - # if rank == self.rank: - # self.assertEqual(output_ts[i], expected) + stress_length = 1000 + + # init input + tensors = [] + for i in range(stress_length): + tensors.append([]) + for device_id in local_device_ids: + tensors[i].append(torch.tensor([self.rank]).xpu(device_id)) + + # init output + output_ts = [] + for i in range(stress_length): + output_ts.append([[] for _ in range(num_gpus)]) + for idx, ls in enumerate(output_ts[i]): + gpu_idx = local_device_ids[idx] + for _ in range(self.world_size): + ls.append(torch.tensor([-1]).xpu(gpu_idx)) + + expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] + for i in range(stress_length): + for rank in range(self.world_size): + gather(output_ts[i], tensors[i], rank) + # Verification + if rank == self.rank: + self.assertEqual(output_ts[i], expected) - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_gather_checks(self): - # pg = self.pg - # device_id = self.rank_to_GPU[self.rank][0] + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_gather_checks(self): + pg = self.pg + device_id = self.rank_to_GPU[self.rank][0] - # # init input - # tensor = torch.tensor([self.rank]).xpu(device_id) + # init input + tensor = torch.tensor([self.rank]).xpu(device_id) - # # init output - # output_ts = [] - # for rank in range(self.world_size): - # output_ts.append(torch.tensor([-1]).xpu(device_id)) + # init output + output_ts = [] + for rank in range(self.world_size): + output_ts.append(torch.tensor([-1]).xpu(device_id)) - # with self.assertRaisesRegex(ValueError, "invalid root rank"): - # opts = c10d.GatherOptions() - # opts.rootRank = -1 - # pg.gather([output_ts], [tensor], opts) + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.GatherOptions() + opts.rootRank = -1 + pg.gather([output_ts], [tensor], opts) - # with self.assertRaisesRegex(TypeError, "incompatible function arguments"): - # pg.gather([output_ts], [tensor], 0) + with self.assertRaisesRegex(TypeError, "incompatible function arguments"): + pg.gather([output_ts], [tensor], 0) - # with self.assertRaisesRegex(ValueError, "invalid root rank"): - # opts = c10d.GatherOptions() - # opts.rootRank = self.world_size - # pg.gather([output_ts], [tensor], opts) + with self.assertRaisesRegex(ValueError, "invalid root rank"): + 
opts = c10d.GatherOptions() + opts.rootRank = self.world_size + pg.gather([output_ts], [tensor], opts) - # with self.assertRaisesRegex( - # # throws error message from dispatcher - # RuntimeError, - # "There were no tensor arguments to this function", - # ): - # opts = c10d.GatherOptions() - # opts.rootRank = 0 - # pg.gather([output_ts], [], opts) + with self.assertRaisesRegex( + # throws error message from dispatcher + RuntimeError, + "There were no tensor arguments to this function", + ): + opts = c10d.GatherOptions() + opts.rootRank = 0 + pg.gather([output_ts], [], opts) # TODO: wait scatter # @requires_xccl() diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 1ba775b9239879..c34583d14c2017 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -672,6 +672,102 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( return work; } +c10::intrusive_ptr ProcessGroupXCCL::gather( + std::vector>& outputTensors, + std::vector& inputTensors, + const GatherOptions& opts) { + static auto invalidArgument = [](const std::string& msg) { + C10_THROW_ERROR(ValueError, "ProcessGroupXCCL::gather: " + msg); + }; + + assertRootRank(invalidArgument, opts.rootRank, size_); + + TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + // @lint-ignore CLANGTIDY + auto inputTensor = inputTensors.back(); + + std::vector outputs; + + if (getRank() == opts.rootRank) { + if (outputTensors.size() != 1) { + std::stringstream ss; + ss << "requires a single-element output list containing a list with " + << getSize() << " tensors."; + invalidArgument(ss.str()); + } else if (outputTensors[0].size() != static_cast(getSize())) { + std::stringstream ss; + ss << "Incorrect output list size " << outputTensors[0].size() + << ". 
Output list size should be " << getSize() + << ", same as size of the process group."; + invalidArgument(ss.str()); + } + + const auto& options = inputTensor.options(); + const auto& sizes = inputTensor.sizes(); + assertTypeAndSizesMatch(invalidArgument, outputTensors[0], options, sizes); + outputs = outputTensors[0]; + } else { + // if not in the root rank, initialize outputs as empty list + if (outputTensors.size() != 0) { + invalidArgument("requires empty output on non-root"); + } + outputs = {}; + // append a empty tensor to the list, we don't use it but the + // `collective` template function requires it to invoke its function + outputs.emplace_back(); + } + + auto inputs = std::vector{inputTensor}; + return collective( + inputs, + outputs, // just to fit the collective interface + [&](at::Tensor& /* unused */, + at::Tensor& /* unused */, + ccl::allgather_attr attr, // just to fit interface + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + const auto root = opts.rootRank; + if (getRank() == root) { + for (auto output : outputs) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + } + } + { + ccl::event ret_evt; + auto xcclDataType = getXcclDataType(inputTensor.scalar_type()); + if (rank_ == root) { + for (const auto r : c10::irange(size_)) { + if (r != root) { + // do receive + ret_evt = ccl::recv( + outputs[r].data_ptr(), + (size_t)inputTensor.numel(), + xcclDataType, + r, + comm, + ccl::create_stream(stream.queue())); + } else { + // on its own rank, simply copy from the input + outputs[r].copy_(inputTensor); + } + } + } else { + // do send + ret_evt = ccl::send( + inputTensor.data_ptr(), + (size_t)inputTensor.numel(), + xcclDataType, + root, + comm, + ccl::create_stream(stream.queue())); + } + return ret_evt; + } + }, + OpType::GATHER); +} + c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( at::Tensor& tensor, const AllreduceOptions& opts) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index f7b946aab603f0..ec0e1b805f579f 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -304,9 +304,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, - const GatherOptions& opts = GatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::gather not implemented"); - } + const GatherOptions& opts = GatherOptions()) override; c10::intrusive_ptr scatter( std::vector& outputTensors, From 1463eca58ba59dd98a39ebc425fc9c7bd93ef164 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 19 Sep 2024 07:14:25 +0000 Subject: [PATCH 64/96] Support scatter --- test/distributed/test_c10d_ops_xccl.py | 233 +++++++++--------- .../distributed/c10d/ProcessGroupXCCL.cpp | 101 ++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +- 3 files changed, 218 insertions(+), 120 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py index 3076444f2e4786..8cfce2be164d9f 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -445,125 +445,124 @@ def test_gather_checks(self): opts.rootRank = 0 pg.gather([output_ts], [], opts) - # TODO: wait scatter - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_scatter_ops(self): - # pg = self.pg - # local_device_ids = self.rank_to_GPU[self.rank] - # num_gpus = 
len(local_device_ids) - - # def scatter(output_t, input_t, rootRank): - # opts = c10d.ScatterOptions() - # opts.rootRank = rootRank - # if rootRank == self.rank: - # work = pg.scatter(output_t, input_t, opts) - # else: - # work = pg.scatter(output_t, [], opts) - # work.wait() - - # # init output - # tensors = [] - # for device_id in local_device_ids: - # tensors.append(torch.tensor([-1]).xpu(device_id)) - - # # init input - # scatter_list = [] - # for idx in range(num_gpus): - # gpu_idx = local_device_ids[idx] - # scatter_list.append([]) - # for rank in range(self.world_size): - # scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) - - # # test each rank to scatter - # expected = [torch.tensor([self.rank])] - # for rank in range(self.world_size): - # scatter(tensors, scatter_list, rank) - # self.assertEqual(expected, tensors) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_scatter_ops(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_scatter_stress(self): - # pg = self.pg - # local_device_ids = self.rank_to_GPU[self.rank] - # num_gpus = len(local_device_ids) - - # def scatter(output_t, input_t, rootRank): - # opts = c10d.ScatterOptions() - # opts.rootRank = rootRank - # if rootRank == self.rank: - # work = pg.scatter(output_t, input_t, opts) - # else: - # work = pg.scatter(output_t, [], opts) - # work.wait() - - # stress_length = 1000 - - # # init output - # tensors = [] - # for i in range(stress_length): - # tensors.append([]) - # for device_id in local_device_ids: - # tensors[i].append(torch.tensor([-1]).xpu(device_id)) - - # # init input - # scatter_list = [] - # for i in range(stress_length): - # scatter_list.append([[] for _ in range(num_gpus)]) - # for idx, ls in enumerate(scatter_list[i]): - # gpu_idx = local_device_ids[idx] - # for rank in range(self.world_size): - # ls.append(torch.tensor([rank]).xpu(gpu_idx)) - - # # test each rank to scatter - # expected = [torch.tensor([self.rank])] - # for i in range(stress_length): - # for rank in range(self.world_size): - # scatter(tensors[i], scatter_list[i], rank) - # # Verification - # self.assertEqual(tensors[i], expected) + def scatter(output_t, input_t, rootRank): + opts = c10d.ScatterOptions() + opts.rootRank = rootRank + if rootRank == self.rank: + work = pg.scatter(output_t, input_t, opts) + else: + work = pg.scatter(output_t, [], opts) + work.wait() - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_scatter_checks(self): - # pg = self.pg - # local_device_ids = self.rank_to_GPU[self.rank] - # num_gpus = len(local_device_ids) - - # # init output - # tensors = [] - # for device_id in local_device_ids: - # tensors.append(torch.tensor([-1]).xpu(device_id)) - - # # init input - # scatter_list = [] - # for idx in range(num_gpus): - # gpu_idx = local_device_ids[idx] - # scatter_list.append([]) - # for rank in range(self.world_size): - # scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) - - # with self.assertRaisesRegex(ValueError, "invalid root rank"): - # opts = c10d.ScatterOptions() - # opts.rootRank = -1 - # pg.scatter(tensors, scatter_list, opts) - - # with self.assertRaisesRegex(TypeError, "incompatible function arguments"): - # pg.scatter(tensors, scatter_list, 0) - - # with self.assertRaisesRegex(ValueError, 
"invalid root rank"): - # opts = c10d.ScatterOptions() - # opts.rootRank = self.world_size - # pg.scatter(tensors, scatter_list, opts) - - # with self.assertRaisesRegex( - # # throws error message from dispatcher - # RuntimeError, - # "There were no tensor arguments to this function", - # ): - # opts = c10d.ScatterOptions() - # opts.rootRank = 0 - # pg.scatter([], scatter_list, opts) + # init output + tensors = [] + for device_id in local_device_ids: + tensors.append(torch.tensor([-1]).xpu(device_id)) + + # init input + scatter_list = [] + for idx in range(num_gpus): + gpu_idx = local_device_ids[idx] + scatter_list.append([]) + for rank in range(self.world_size): + scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) + + # test each rank to scatter + expected = [torch.tensor([self.rank])] + for rank in range(self.world_size): + scatter(tensors, scatter_list, rank) + self.assertEqual(expected, tensors) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_scatter_stress(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) + + def scatter(output_t, input_t, rootRank): + opts = c10d.ScatterOptions() + opts.rootRank = rootRank + if rootRank == self.rank: + work = pg.scatter(output_t, input_t, opts) + else: + work = pg.scatter(output_t, [], opts) + work.wait() + + stress_length = 1000 + + # init output + tensors = [] + for i in range(stress_length): + tensors.append([]) + for device_id in local_device_ids: + tensors[i].append(torch.tensor([-1]).xpu(device_id)) + + # init input + scatter_list = [] + for i in range(stress_length): + scatter_list.append([[] for _ in range(num_gpus)]) + for idx, ls in enumerate(scatter_list[i]): + gpu_idx = local_device_ids[idx] + for rank in range(self.world_size): + ls.append(torch.tensor([rank]).xpu(gpu_idx)) + + # test each rank to scatter + expected = [torch.tensor([self.rank])] + for i in range(stress_length): + for rank in range(self.world_size): + scatter(tensors[i], scatter_list[i], rank) + # Verification + self.assertEqual(tensors[i], expected) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_scatter_checks(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) + + # init output + tensors = [] + for device_id in local_device_ids: + tensors.append(torch.tensor([-1]).xpu(device_id)) + + # init input + scatter_list = [] + for idx in range(num_gpus): + gpu_idx = local_device_ids[idx] + scatter_list.append([]) + for rank in range(self.world_size): + scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.ScatterOptions() + opts.rootRank = -1 + pg.scatter(tensors, scatter_list, opts) + + with self.assertRaisesRegex(TypeError, "incompatible function arguments"): + pg.scatter(tensors, scatter_list, 0) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.ScatterOptions() + opts.rootRank = self.world_size + pg.scatter(tensors, scatter_list, opts) + + with self.assertRaisesRegex( + # throws error message from dispatcher + RuntimeError, + "There were no tensor arguments to this function", + ): + opts = c10d.ScatterOptions() + opts.rootRank = 0 + pg.scatter([], scatter_list, opts) @requires_xccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") diff --git 
a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index c34583d14c2017..638b969fb2b5e7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -768,6 +768,107 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( OpType::GATHER); } +c10::intrusive_ptr ProcessGroupXCCL::scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ScatterOptions& opts) { + static auto invalidArgument = [](const std::string& msg) { + C10_THROW_ERROR(ValueError, "ProcessGroupXCCL::scatter: " + msg); + }; + + assertRootRank(invalidArgument, opts.rootRank, size_); + + TORCH_CHECK(outputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + auto outputTensor = outputTensors.back(); + + std::vector inputs; + + if (getRank() == opts.rootRank) { + if (inputTensors.size() != 1) { + std::stringstream ss; + ss << "requires a single-element input list containing a list with " + << getSize() << " tensors."; + invalidArgument(ss.str()); + } else if (inputTensors[0].size() != static_cast(getSize())) { + std::stringstream ss; + ss << "Incorrect input list size " << inputTensors[0].size() + << ". Input list size should be " << getSize() + << ", same as size of the process group."; + invalidArgument(ss.str()); + } + + const auto& options = outputTensor.options(); + const auto& sizes = outputTensor.sizes(); + assertTypeAndSizesMatch(invalidArgument, inputTensors[0], options, sizes); + inputs = inputTensors[0]; + } else { + // if not in the root rank, initialize inputTensors as empty place holder + // with an empty list + if (inputTensors.size() != 0) { + invalidArgument("requires empty input on non-root"); + } + inputs = {}; + // append a empty tensor to the list, we don't use it but the + // `collective` template function requires it to invoke its function + inputs.emplace_back(); + } + + const auto root = opts.rootRank; + + auto outputs = std::vector{outputTensor}; + return collective( + outputs, + inputs, // just to fit the collective interface + [&](at::Tensor& /* unused */, + at::Tensor& /* unused */, + ccl::allgather_attr attr, // just to fit interface + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + if (getRank() == root) { + for (auto input : inputs) { + c10::xpu::XPUCachingAllocator::recordStream( + input.storage().data_ptr(), stream); + } + } + { + ccl::event ret_evt; + if (rank_ == root) { + for (const auto r : c10::irange(size_)) { + if (r != root) { + // do send + size_t send_count = inputs[r].numel(); + auto send_type = getXcclDataType(inputs[r].scalar_type()); + ret_evt = ccl::send( + inputs[r].data_ptr(), + send_count, + send_type, + r, + comm, + ccl::create_stream(stream.queue())); + } else { + // on its own rank, simply copy from the input + outputTensor.copy_(inputs[r]); + } + } + } else { + // do receive + size_t recv_count = outputTensor.numel(); + auto recv_type = getXcclDataType(outputTensor.scalar_type()); + ret_evt = ccl::recv( + outputTensor.data_ptr(), + recv_count, + recv_type, + root, + comm, + ccl::create_stream(stream.queue())); + } + + return ret_evt; + } + }, + OpType::SCATTER); +} + c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( at::Tensor& tensor, const AllreduceOptions& opts) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index ec0e1b805f579f..690aec54e8cc0b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -309,9 +309,7 @@ 
class TORCH_API ProcessGroupXCCL : public Backend {
   c10::intrusive_ptr<Work> scatter(
       std::vector<at::Tensor>& outputTensors,
       std::vector<std::vector<at::Tensor>>& inputTensors,
-      const ScatterOptions& opts = ScatterOptions()) override {
-    TORCH_CHECK(false, "ProcessGroupXCCL::scatter not implemented");
-  }
+      const ScatterOptions& opts = ScatterOptions()) override;
 
  protected:
   std::unordered_map<std::string, at::xpu::XPUStream> xcclStreams_;

From 156c2ac9bccb6159cd538de3e54ec484d2399787 Mon Sep 17 00:00:00 2001
From: hanchao
Date: Thu, 19 Sep 2024 09:21:27 +0000
Subject: [PATCH 65/96] update

---
 torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp |  2 +-
 torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp
index 6b57a6c5471b36..5aeeb62bee1ece 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp
@@ -52,7 +52,7 @@ XCCL_KVS get_kvs(int rank, c10d::Store& store) {
   std::lock_guard<std::mutex> lock(kvs_mutex);
   if (kvs)
     return kvs;
-  std::string storeKey = "ccl_kvs";
+  std::string storeKey = "xccl_kvs";
   // Rank 0 broadcast the bootstrap network information to other ranks
   if (rank == 0) {
diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp
index 96f7e46e7c378d..14a9f398a8cbe7 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp
@@ -48,12 +48,13 @@ int getXCCLEnvVar(std::string envVarName) {
   }
 }
 
-void setXCCLEnvVar(std::string envVarName, int val) {
-  setenv(envVarName.c_str(), std::to_string(val).c_str(), val);
-}
-
-void setXCCLEnvVar(std::string envVarName, std::string val) {
-  setenv(envVarName.c_str(), val.c_str(), 1);
+template <typename T>
+void setXCCLEnvVar(const std::string& envVarName, T val) {
+  if constexpr (std::is_same_v<T, int>) {
+    setenv(envVarName.c_str(), std::to_string(val).c_str(), 1);
+  } else if constexpr (std::is_same_v<T, std::string>) {
+    setenv(envVarName.c_str(), val.c_str(), 1);
+  }
 }
 
 bool with_mpirun() {

From 652da01588ecb64e354707bc52496c28c01f07ce Mon Sep 17 00:00:00 2001
From: hanchao
Date: Thu, 29 Aug 2024 09:28:58 +0000
Subject: [PATCH 66/96] Xccl process group for Pytorch

---
 CMakeLists.txt                                |   6 +
 build_variables.bzl                           |   4 +
 caffe2/CMakeLists.txt                         |  13 +
 caffe2/core/macros.h.in                       |   1 +
 cmake/Dependencies.cmake                      |  16 +
 cmake/External/xccl.cmake                     |  17 +
 cmake/Modules/FindXCCL.cmake                  |  68 +++
 cmake/Summary.cmake                           |   6 +
 setup.py                                      |   4 +
 test/distributed/test_c10d_common.py          |   9 +-
 test/distributed/test_c10d_xccl.py            | 303 +++++++++++++
 torch/CMakeLists.txt                          |   7 +
 torch/_C/_distributed_c10d.pyi                |   9 +
 torch/csrc/distributed/c10d/Ops.cpp           |  20 +
 torch/csrc/distributed/c10d/ProcessGroup.cpp  |   2 +
 torch/csrc/distributed/c10d/ProcessGroup.hpp  |   3 +
 .../distributed/c10d/ProcessGroupXCCL.cpp     | 401 ++++++++++++++++++
 .../distributed/c10d/ProcessGroupXCCL.hpp     | 308 ++++++++++++++
 torch/csrc/distributed/c10d/init.cpp          |  22 +
 torch/distributed/distributed_c10d.py         |  48 ++-
 torch/testing/_internal/common_distributed.py |  11 +-
 21 files changed, 1268 insertions(+), 10 deletions(-)
 create mode 100644 cmake/External/xccl.cmake
 create mode 100644 cmake/Modules/FindXCCL.cmake
 create mode 100644 test/distributed/test_c10d_xccl.py
 create mode 100644 torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp
 create mode 100644 torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5139c0a478e788..89ef59681bfff4 100644
--- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -275,6 +275,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) cmake_dependent_option(USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_XCCL "Use XCCL" ON + "USE_XPU;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" @@ -353,6 +355,8 @@ cmake_dependent_option(USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option(USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) +cmake_dependent_option(USE_C10D_XCCL "USE C10D XCCL" ON + "USE_DISTRIBUTED;USE_XCCL" OFF) cmake_dependent_option(USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( @@ -365,6 +369,8 @@ cmake_dependent_option( USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option( USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) +cmake_dependent_option( + USE_C10D_XCCL "USE C10D XCCL" ON "USE_DISTRIBUTED;USE_XCCL" OFF) cmake_dependent_option( USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( diff --git a/build_variables.bzl b/build_variables.bzl index e05c94bd83f577..98b721617b609c 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -700,6 +700,10 @@ libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_s "torch/csrc/cuda/nccl.cpp", ] +libtorch_xpu_distributed_extra_sources = [ + "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", +] + torch_cpp_srcs = [ "torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA "torch/csrc/api/src/data/datasets/mnist.cpp", diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 8ed93cdff0479c..d44a8da210462f 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,6 +1014,9 @@ elseif(USE_CUDA) endif() if(USE_XPU) + if(USE_XCCL) + append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS) + endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) @@ -1079,6 +1082,10 @@ if(USE_XPU) include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS}) endif() + if(USE_XCCL) + target_link_libraries(torch_xpu PRIVATE torch::xccl) + target_compile_definitions(torch_xpu PRIVATE USE_XCCL) + endif() endif() if(NOT MSVC AND USE_XNNPACK) @@ -1365,6 +1372,12 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL) endif() endif() + if(USE_C10D_XCCL) + target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) + set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp + PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_ZE;CCL_ENABLE_SYCL") + endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set_source_files_properties( diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in index 2929f105b31faa..e5398a83cad947 100644 --- a/caffe2/core/macros.h.in +++ b/caffe2/core/macros.h.in @@ -45,6 +45,7 @@ {"USE_CUDNN", "${USE_CUDNN}"}, \ {"CUDNN_VERSION", "${CUDNN_VERSION}"}, \ {"USE_NCCL", "${USE_NCCL}"}, \ + {"USE_XCCL", "${USE_XCCL}"}, \ {"USE_MPI", "${USE_MPI}"}, \ {"USE_GFLAGS", "${USE_GFLAGS}"}, \ {"USE_GLOG", 
"${USE_GLOG}"}, \ diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ef33a3165340c1..8abea841fcf61c 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1150,6 +1150,22 @@ if(USE_CUDA) include_directories(SYSTEM ${CUB_INCLUDE_DIRS}) endif() +# ---[ XCCL +if(USE_XCCL) + if(NOT USE_XPU) + message(WARNING + "Not using XPU, so disabling USE_XCCL. Suppress this warning with " + "-DUSE_XCCL=OFF.") + caffe2_update_option(USE_XCCL OFF) + elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Linux") + message(WARNING "USE_XCCL is currently only supported under Linux.") + caffe2_update_option(USE_XCCL OFF) + else() + include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake) + list(APPEND Caffe2_XPU_DEPENDENCY_LIBS torch::xccl) + endif() +endif() + if(USE_DISTRIBUTED AND USE_TENSORPIPE) if(MSVC) message(WARNING "Tensorpipe cannot be used on Windows.") diff --git a/cmake/External/xccl.cmake b/cmake/External/xccl.cmake new file mode 100644 index 00000000000000..56205b381b1324 --- /dev/null +++ b/cmake/External/xccl.cmake @@ -0,0 +1,17 @@ +if(NOT __XCCL_INCLUDED) + set(__XCCL_INCLUDED TRUE) + + if(USE_XCCL) + # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake. + find_package(XCCL REQUIRED) + if(XCCL_FOUND) + add_library(torch::xccl INTERFACE IMPORTED) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${XCCL_INCLUDE_DIR}) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES + ${XCCL_LIBRARY}) + endif() + endif() +endif() diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake new file mode 100644 index 00000000000000..56b7fc0f7dcf32 --- /dev/null +++ b/cmake/Modules/FindXCCL.cmake @@ -0,0 +1,68 @@ +# This will define the following variables: +# XCCL_FOUND : True if the system has the XCCL library. +# XCCL_INCLUDE_DIR : Include directories needed to use XCCL. +# XCCL_LIBRARY_DIR :The path to the XCCL library. +# XCCL_LIBRARY : XCCL library fullname. + +include(FindPackageHandleStandardArgs) + +set(XCCL_ROOT "") +if(DEFINED ENV{CCL_ROOT}) + set(XCCL_ROOT $ENV{CCL_ROOT}) +endif() + +string(COMPARE EQUAL "${XCCL_ROOT}" "" nosyclfound) +if(nosyclfound) + set(XCCL_FOUND False) + set(XCCL_REASON_FAILURE "XCCL library not set!!") + set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}") + return() +endif() + +# Find include path from binary. +find_file( + XCCL_INCLUDE_DIR + NAMES include + HINTS ${XCCL_ROOT} + NO_DEFAULT_PATH +) + +# Find include/oneapi path from include path. +find_file( + XCCL_INCLUDE_ONEAPI_DIR + NAMES oneapi + HINTS ${XCCL_ROOT}/include/ + NO_DEFAULT_PATH +) + +list(APPEND XCCL_INCLUDE_DIR ${XCCL_INCLUDE_ONEAPI_DIR}) + +# Find library directory from binary. +find_file( + XCCL_LIBRARY_DIR + NAMES lib + HINTS ${XCCL_ROOT} + NO_DEFAULT_PATH +) + +# Find XCCL library fullname. 
+find_library( + XCCL_LIBRARY + NAMES ccl + HINTS ${XCCL_LIBRARY_DIR} + NO_DEFAULT_PATH +) + +if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY)) + set(XCCL_FOUND False) + set(XCCL_REASON_FAILURE "XCCL library is incomplete!!") + set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}") + return() +endif() + +find_package_handle_standard_args( + XCCL + FOUND_VAR XCCL_FOUND + REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY + REASON_FAILURE_MESSAGE "${XCCL_REASON_FAILURE}" +) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index d51c451589c2c4..229ff112ab3187 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -153,6 +153,12 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}") endif() message(STATUS " USE_ITT : ${USE_ITT}") + message(STATUS " USE_XCCL : ${USE_XCCL}") + if(${USE_XCCL}) + message(STATUS " USE_C10D_XCCL : ${USE_C10D_XCCL}") + message(STATUS " XCCL include path : ${XCCL_INCLUDE_DIR}") + message(STATUS " XCCL library : ${XCCL_LIBRARY}") + endif() message(STATUS " USE_NCCL : ${USE_NCCL}") if(${USE_NCCL}) message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}") diff --git a/setup.py b/setup.py index 92f1e2ddc7bcd3..ad48f4b0108633 100644 --- a/setup.py +++ b/setup.py @@ -645,6 +645,10 @@ def run(self): report("-- Building NCCL library") else: report("-- Not using NCCL") + if cmake_cache_vars["USE_XCCL"]: + report("-- Building XCCL library") + else: + report("-- Not using XCCL") if cmake_cache_vars["USE_DISTRIBUTED"]: if IS_WINDOWS: report("-- Building without distributed package") diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 6a0621f3f49913..3e5538d57e38ae 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -66,8 +66,13 @@ def gpus_for_rank(world_size): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. 
""" - visible_devices = list(range(torch.cuda.device_count())) - gpus_per_process = torch.cuda.device_count() // world_size + device_count = ( + torch.xpu.device_count() + if torch.xpu.is_available() + else torch.cuda.device_count() + ) + visible_devices = list(range(device_count)) + gpus_per_process = device_count // world_size gpus_for_rank = [] for rank in range(world_size): gpus_for_rank.append( diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py new file mode 100644 index 00000000000000..704cdd414e554b --- /dev/null +++ b/test/distributed/test_c10d_xccl.py @@ -0,0 +1,303 @@ +# Owner(s): ["oncall: distributed"] + +import math +import os +import sys +import time +from datetime import timedelta +from unittest import mock + +import torch +import torch.distributed as c10d + + +if not c10d.is_available() or not c10d.is_xccl_available(): + print("c10d XCCL not available, skipping tests", file=sys.stderr) + sys.exit(0) + +import test_c10d_common + +import torch.distributed as dist +import torch.testing._internal.common_utils as common +from torch.testing._internal.common_distributed import ( + init_multigpu_helper, + MultiProcessTestCase, + requires_xccl, +) +from torch.testing._internal.common_utils import ( + retry_on_connect_failures, + run_tests, + skip_but_pass_in_sandcastle_if, + TEST_XPU, + TestCase, +) + + +def simple_reduce_tests(rank, world_size): + tests = [ + ( + c10d.ReduceOp.SUM, + torch.tensor([rank + 1.0]), + torch.tensor([float(world_size * (world_size + 1) / 2)]), + ), + ( + c10d.ReduceOp.PRODUCT, + torch.tensor([rank + 1.0]), + torch.tensor([float(math.factorial(world_size))]), + ), + ( + c10d.ReduceOp.MIN, + torch.tensor([rank + 1.0]), + torch.tensor([1.0]), + ), + ( + c10d.ReduceOp.MAX, + torch.tensor([rank + 1.0]), + torch.tensor([world_size]), + ), + ] + + return tests + + +TEST_MULTIXPU = torch.xpu.device_count() > 1 + + +class RendezvousEnvTest(TestCase): + @retry_on_connect_failures + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test") + def test_common_errors(self): + vars = { + "WORLD_SIZE": "1", + "RANK": "0", + "MASTER_ADDR": "127.0.0.1", + "MASTER_PORT": str(common.find_free_port()), + } + + class Env: + def __init__(self, vars): + self.env_patcher = mock.patch.dict(os.environ, vars, clear=True) + + def __enter__(self): + self.env_patcher.start() + + def __exit__(self, type, value, traceback): + self.env_patcher.stop() + + def without(d, key): + d = d.copy() + d.pop(key) + return d + + def withouts(d, keys): + d = d.copy() + for key in keys: + d.pop(key) + return d + + with Env(without(vars, "WORLD_SIZE")): + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + with self.assertRaisesRegex(ValueError, "WORLD_SIZE expected"): + gen = c10d.rendezvous("env://") + next(gen) + c10d.init_process_group(backend="xccl", world_size=1) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(without(vars, "RANK")): + self.assertEqual(None, os.environ.get("RANK")) + with self.assertRaisesRegex(ValueError, "RANK expected"): + gen = c10d.rendezvous("env://") + next(gen) + c10d.init_process_group(backend="xccl", rank=0) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): + self.assertEqual(None, os.environ.get("RANK")) + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + 
c10d.init_process_group(backend="xccl", rank=0, world_size=1) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(vars): + c10d.init_process_group(backend="xccl") + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(without(vars, "MASTER_ADDR")): + self.assertEqual(None, os.environ.get("MASTER_ADDR")) + with self.assertRaisesRegex(ValueError, "MASTER_ADDR expected"): + gen = c10d.rendezvous("env://") + next(gen) + + with Env(without(vars, "MASTER_PORT")): + self.assertEqual(None, os.environ.get("MASTER_PORT")) + with self.assertRaisesRegex(ValueError, "MASTER_PORT expected"): + gen = c10d.rendezvous("env://") + next(gen) + + with Env(without(vars, "WORLD_SIZE")): + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + gen = c10d.rendezvous(f"env://?world_size={1}") + _, _, size = next(gen) + self.assertEqual(size, 1) + + with Env(without(vars, "RANK")): + self.assertEqual(None, os.environ.get("RANK")) + gen = c10d.rendezvous(f"env://?rank={0}") + _, rank, _ = next(gen) + self.assertEqual(rank, 0) + + with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): + self.assertEqual(None, os.environ.get("RANK")) + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + gen = c10d.rendezvous(f"env://?rank={0}&world_size={1}") + _, rank, size = next(gen) + self.assertEqual(rank, 0) + self.assertEqual(size, 1) + + +class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): + @requires_xccl() + @retry_on_connect_failures + @skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test") + def test_default_store_timeout_nccl(self): + self._test_default_store_timeout("xccl") + + +class ProcessGroupXCCLTest(MultiProcessTestCase): + def _create_process_group_xccl( + self, timeout=timedelta(seconds=600), device_id=None + ): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "xccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timeout, + device_id=device_id, + ) + pg = c10d.distributed_c10d._get_default_group() + return pg + + def setUp(self): + super().setUp() + self._spawn_processes() + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + @property + def rank_to_GPU(self): + # return rank to GPU map + return init_multigpu_helper(self.world_size, "xccl") + + @requires_xccl() + @skip_but_pass_in_sandcastle_if( + torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs" + ) + def test_close_multi_pg_unordered(self): + pg = self._create_process_group_xccl() + device = self.rank_to_GPU[self.rank][0] + t = torch.rand(10, 10, device=device) + # First allreduce to initialize default PG's communicator. 
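+        # (XCCL communicators are created lazily, on the first collective
+        # issued through a process group.)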
+ pg.allreduce(t).wait() + new_pg1 = c10d.new_group([0, 1]) + new_pg2 = c10d.new_group([0, 1]) + if self.rank == 0 or self.rank == 1: + t1 = torch.rand(10, 10, device=device) + t2 = torch.rand(10, 10, device=device) + new_pg1.allreduce(t1).wait() + new_pg2.allreduce(t2).wait() + if self.rank == 0: + dist.destroy_process_group(new_pg2) + # force destruction of pg2 first + del new_pg2 + dist.destroy_process_group(new_pg1) + del new_pg1 + if self.rank == 1: + c10d.destroy_process_group(new_pg1) + # force destruction of pg1 first + del new_pg1 + dist.destroy_process_group(new_pg2) + del new_pg2 + dist.destroy_process_group() + + @requires_xccl() + @skip_but_pass_in_sandcastle_if( + torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs" + ) + def test_file_store_check(self): + # self.file_name is created using "delete=False" + # e.g., self.file_name = tempfile.NamedTemporaryFile(delete=False).name + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group( + backend="xccl", rank=self.rank, world_size=self.world_size, store=store + ) + pg = dist.distributed_c10d._get_default_group() + self.assertEqual(pg.rank(), self.rank) + self.assertEqual(pg.size(), self.world_size) + # give enough time for check() to be executed multiple times + time.sleep(2) + dist.destroy_process_group() + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIXPU, "XCCL test requires 2+ GPUs") + def test_set_process_group_desc(self): + device = torch.device(f"xpu:{self.rank}") + pg_default = self._create_process_group_xccl(device_id=device) + self.assertEqual(pg_default.group_desc, "default_pg") + pg_1 = c10d.new_group([0, 1], group_desc="test_purpose") + self.assertEqual(pg_1.group_desc, "test_purpose") + pg_2 = c10d.new_group([0, 1]) + self.assertEqual(pg_2.group_desc, "undefined") + + def _test_allreduce_basics(self, fn): + pg = self._create_process_group_xccl() + device = torch.device("xpu:" + str(self.rank)) + # Single input tests + tests = simple_reduce_tests(self.rank, self.world_size) + for op, input, expected in tests: + opts = c10d.AllreduceOptions() + opts.reduceOp = op + tensor = fn(input.to(device)) + fut = pg.allreduce([tensor], opts).get_future() + fut.wait() + result = fut.value() + self.assertEqual(expected, result[0], exact_dtype=False) + + x = fn(torch.tensor([self.rank + 1.0], device=device)) + fut = pg.allreduce(x).get_future() + fut.wait() + result = fut.value() + self.assertEqual( + torch.tensor([float(self.world_size * (self.world_size + 1) / 2)]), + result[0], + ) + + @requires_xccl() + def test_allreduce_basics(self): + self._test_allreduce_basics(lambda t: t.clone()) + + +if __name__ == "__main__": + assert ( + not torch.xpu._initialized + ), "test_distributed must not have initialized XPU context on main process" + + run_tests() diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index bb949a081c95e9..9a91b26d54cfb4 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -282,6 +282,9 @@ if(USE_DISTRIBUTED) if(USE_NCCL) list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) endif() + if(USE_XCCL) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::xccl) + endif() # Same for MPI. 
if(USE_MPI) list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX) @@ -345,6 +348,10 @@ if(BUILD_LIBTORCHLESS) target_compile_definitions(torch_python PRIVATE USE_C10D_NCCL) endif() + if(USE_XPU AND USE_C10D_XCCL) + target_compile_definitions(torch_python PRIVATE USE_C10D_XCCL) + endif() + if(USE_DISTRIBUTED) target_compile_definitions(torch_python PRIVATE USE_DISTRIBUTED) endif() diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 94e8578bbfff62..6033d969925972 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -309,6 +309,7 @@ class ProcessGroup: UNDEFINED = ... GLOO = ... NCCL = ... + XCCL = ... UCC = ... MPI = ... CUSTOM = ... @@ -697,3 +698,11 @@ class ProcessGroupCudaP2P(Backend): storage_offset: Optional[int] = 0, ) -> torch.Tensor: ... def _shutdown(self) -> None: ... + +class ProcessGroupXCCL(Backend): + def __init__( + self, + store: Store, + rank: int, + size: int, + ): ... diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index ae822ad3975049..699c54236f6412 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -79,6 +79,7 @@ namespace { } IMPL_SEND(CPU) +IMPL_SEND(XPU) IMPL_SEND(CUDA) IMPL_SEND(PrivateUse1) @@ -94,6 +95,7 @@ IMPL_SEND(PrivateUse1) } IMPL_RECV(CPU) +IMPL_RECV(XPU) IMPL_RECV(CUDA) IMPL_RECV(PrivateUse1) @@ -108,6 +110,7 @@ IMPL_RECV(PrivateUse1) } IMPL_RECV_ANY_SOURCE(CPU) +IMPL_RECV_ANY_SOURCE(XPU) IMPL_RECV_ANY_SOURCE(CUDA) IMPL_RECV_ANY_SOURCE(PrivateUse1) @@ -131,6 +134,7 @@ IMPL_RECV_ANY_SOURCE(PrivateUse1) } IMPL_REDUCE(CPU) +IMPL_REDUCE(XPU) IMPL_REDUCE(CUDA) IMPL_REDUCE(PrivateUse1) @@ -156,6 +160,7 @@ IMPL_REDUCE(PrivateUse1) } IMPL_BROADCAST(CPU) +IMPL_BROADCAST(XPU) IMPL_BROADCAST(CUDA) IMPL_BROADCAST(PrivateUse1) @@ -181,6 +186,7 @@ IMPL_BROADCAST(PrivateUse1) IMPL_ALLREDUCE(CPU) IMPL_ALLREDUCE(CUDA) +IMPL_ALLREDUCE(XPU) IMPL_ALLREDUCE(PrivateUse1) #define IMPL_ALLREDUCE_COALESCED(DEV) \ @@ -198,6 +204,7 @@ IMPL_ALLREDUCE(PrivateUse1) } IMPL_ALLREDUCE_COALESCED(CPU) +IMPL_ALLREDUCE_COALESCED(XPU) IMPL_ALLREDUCE_COALESCED(CUDA) IMPL_ALLREDUCE_COALESCED(PrivateUse1) @@ -222,6 +229,7 @@ IMPL_ALLREDUCE_COALESCED(PrivateUse1) // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast) IMPL_ALLGATHER(CPU) +IMPL_ALLGATHER(XPU) IMPL_ALLGATHER(CUDA) IMPL_ALLGATHER(PrivateUse1) @@ -242,6 +250,7 @@ IMPL_ALLGATHER(PrivateUse1) } IMPL__ALLGATHER_BASE(CPU) +IMPL__ALLGATHER_BASE(XPU) IMPL__ALLGATHER_BASE(CUDA) IMPL__ALLGATHER_BASE(PrivateUse1) @@ -258,6 +267,7 @@ IMPL__ALLGATHER_BASE(PrivateUse1) } IMPL_ALLGATHER_COALESCED(CPU) +IMPL_ALLGATHER_COALESCED(XPU) IMPL_ALLGATHER_COALESCED(CUDA) IMPL_ALLGATHER_COALESCED(PrivateUse1) @@ -273,6 +283,7 @@ IMPL_ALLGATHER_COALESCED(PrivateUse1) } IMPL_ALLGATHER_INTO_TENSOR_COALESCED(CPU) +IMPL_ALLGATHER_INTO_TENSOR_COALESCED(XPU) IMPL_ALLGATHER_INTO_TENSOR_COALESCED(CUDA) IMPL_ALLGATHER_INTO_TENSOR_COALESCED(PrivateUse1) @@ -296,6 +307,7 @@ IMPL_ALLGATHER_INTO_TENSOR_COALESCED(PrivateUse1) } IMPL_REDUCE_SCATTER(CPU) +IMPL_REDUCE_SCATTER(XPU) IMPL_REDUCE_SCATTER(CUDA) IMPL_REDUCE_SCATTER(PrivateUse1) @@ -320,6 +332,7 @@ IMPL_REDUCE_SCATTER(PrivateUse1) } IMPL__REDUCE_SCATTER_BASE(CPU) +IMPL__REDUCE_SCATTER_BASE(XPU) IMPL__REDUCE_SCATTER_BASE(CUDA) IMPL__REDUCE_SCATTER_BASE(PrivateUse1) @@ -341,6 +354,7 @@ IMPL__REDUCE_SCATTER_BASE(PrivateUse1) } IMPL_REDUCE_SCATTER_TENSOR_COALESCED(CPU) +IMPL_REDUCE_SCATTER_TENSOR_COALESCED(XPU) IMPL_REDUCE_SCATTER_TENSOR_COALESCED(CUDA) 
IMPL_REDUCE_SCATTER_TENSOR_COALESCED(PrivateUse1) @@ -360,6 +374,7 @@ IMPL_REDUCE_SCATTER_TENSOR_COALESCED(PrivateUse1) } IMPL_GATHER(CPU) +IMPL_GATHER(XPU) IMPL_GATHER(CUDA) IMPL_GATHER(PrivateUse1) @@ -382,6 +397,7 @@ IMPL_GATHER(PrivateUse1) } IMPL_SCATTER(CPU) +IMPL_SCATTER(XPU) IMPL_SCATTER(CUDA) IMPL_SCATTER(PrivateUse1) @@ -403,6 +419,7 @@ IMPL_SCATTER(PrivateUse1) } IMPL_ALLTOALL(CPU) +IMPL_ALLTOALL(XPU) IMPL_ALLTOALL(CUDA) IMPL_ALLTOALL(PrivateUse1) @@ -424,6 +441,7 @@ IMPL_ALLTOALL(PrivateUse1) } IMPL_ALLTOALL_BASE(CPU) +IMPL_ALLTOALL_BASE(XPU) IMPL_ALLTOALL_BASE(CUDA) IMPL_ALLTOALL_BASE(PrivateUse1) @@ -439,6 +457,7 @@ IMPL_ALLTOALL_BASE(PrivateUse1) } IMPL_BARRIER(CPU) +IMPL_BARRIER(XPU) IMPL_BARRIER(CUDA) IMPL_BARRIER(PrivateUse1) // NOLINTEND(cppcoreguidelines-pro-type-const-cast) @@ -491,6 +510,7 @@ namespace { #define REGISTER_C10D_OP(FUNC) \ REGISTER_C10D_OP1(FUNC, CPU) \ REGISTER_C10D_OP1(FUNC, CUDA) \ + REGISTER_C10D_OP1(FUNC, XPU) \ REGISTER_C10D_OP1(FUNC, PrivateUse1) // Now we start to register ops with the three device keys diff --git a/torch/csrc/distributed/c10d/ProcessGroup.cpp b/torch/csrc/distributed/c10d/ProcessGroup.cpp index 75635bc68aed4f..70356b3bf382ce 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.cpp @@ -21,6 +21,8 @@ static ProcessGroup::BackendType strToBackendType(std::string_view backend) { return ProcessGroup::BackendType::GLOO; } else if (backend == "nccl") { return ProcessGroup::BackendType::NCCL; + } else if (backend == "xccl") { + return ProcessGroup::BackendType::XCCL; } else if (backend == "ucc") { return ProcessGroup::BackendType::UCC; } else if (backend == "mpi") { diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index acf8c9c354a76b..73fc2bda701327 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -70,6 +70,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { UCC = 3, MPI = 4, CUSTOM = 5, + XCCL = 6, }; // Not used, set for backwards compatibility and only used for TypeDef in @@ -489,6 +490,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // TODO: HACK for backend name to get sequence number for that backend. if (backendType == ProcessGroup::BackendType::GLOO || backendType == ProcessGroup::BackendType::NCCL || + backendType == ProcessGroup::BackendType::XCCL || backendType == ProcessGroup::BackendType::UCC) { getDefaultBackend()->setSequenceNumberForGroup(); } else { @@ -510,6 +512,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // TODO: HACK for backend name to get sequence number for that backend. 
if (backendType == ProcessGroup::BackendType::GLOO || backendType == ProcessGroup::BackendType::NCCL || + backendType == ProcessGroup::BackendType::XCCL || backendType == ProcessGroup::BackendType::UCC) { return getDefaultBackend()->getSequenceNumberForGroup(); } else { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp new file mode 100644 index 00000000000000..5aeeb62bee1ece --- /dev/null +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -0,0 +1,401 @@ +#include +#include +#include +#include + +#ifdef USE_C10D_XCCL +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10d { + +namespace { +std::map xcclOps = { + {ReduceOp::MIN, ccl::reduction::min}, + {ReduceOp::MAX, ccl::reduction::max}, + {ReduceOp::SUM, ccl::reduction::sum}, + {ReduceOp::PRODUCT, ccl::reduction::prod}, +}; + +std::map xcclDatatypes = { + {at::kByte, ccl::datatype::uint8}, + {at::kChar, ccl::datatype::int8}, + {at::kInt, ccl::datatype::int32}, + {at::kLong, ccl::datatype::int64}, + {at::kHalf, ccl::datatype::float16}, + {at::kFloat, ccl::datatype::float32}, + {at::kDouble, ccl::datatype::float64}, + {at::kBFloat16, ccl::datatype::bfloat16}, + {at::kBool, ccl::datatype::uint8}, +}; + +XCCL_KVS kvs; +std::mutex kvs_mutex; + +XCCL_KVS get_kvs(int rank, c10d::Store& store) { + std::lock_guard lock(kvs_mutex); + if (kvs) + return kvs; + std::string storeKey = "xccl_kvs"; + + // Rank 0 broadcast the bootstrap network information to other ranks + if (rank == 0) { + kvs = ccl::create_main_kvs(); + ccl::kvs::address_type main_addr = kvs->get_address(); + auto ccl_kvs_addr = + std::vector(main_addr.begin(), main_addr.end()); + store.set(storeKey, ccl_kvs_addr); + } else { + auto ccl_kvs_addr = store.get(storeKey); + if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { + throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); + } + ccl::kvs::address_type main_addr; + std::copy_n( + ccl_kvs_addr.begin(), ccl::kvs::address_max_size, main_addr.begin()); + kvs = ccl::create_kvs(main_addr); + } + + return kvs; +} + +void check_xpu_single_tensor(const at::Tensor& tensor) { + if (!tensor.is_xpu() || tensor.is_sparse()) { + C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + } + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + } +} + +ccl::datatype getXcclDataType(at::ScalarType type) { + auto it = xcclDatatypes.find(type); + TORCH_CHECK_WITH( + TypeError, + it != xcclDatatypes.end(), + "Input tensor data type is not supported for XCCL process group: ", + type); + return it->second; +} + +ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { + try { + if (input.scalar_type() == at::kBool) { + if (reduceOp == ReduceOp::SUM) { + // For bool tensors, map sum to max, which both represent a bitwise or. + // This is to prevent overflow issues with sum, since we use uint8 to + // represent a bool (see xcclDatatypes mapping align with cuda). 
+ return ccl::reduction::max; + } + } + return xcclOps.at(reduceOp); + } catch (const std::out_of_range&) { + switch (reduceOp) { + case ReduceOp::AVG: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp AVG with XCCL"); + break; + case ReduceOp::BAND: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BAND with XCCL"); + break; + case ReduceOp::BOR: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BOR with XCCL"); + break; + case ReduceOp::BXOR: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BXOR with XCCL"); + break; + default: + C10_THROW_ERROR(ValueError, "Unhandled ReduceOp"); + break; + } + } +} + +} // namespace + +static std::mutex xcclCommDevIdxMapMutex; +static std::unordered_map, int> xcclCommDevIdxMap; +constexpr int64_t kSynchronizeBusyWaitMillis = 10; + +ProcessGroupXCCL::WorkXCCL::WorkXCCL( + at::Device& device, + int rank, + OpType opType, + const std::optional>& inputs) + : Work(rank, opType, "profilingTitle", inputs), + device_(device), + workStartTime_(std::chrono::steady_clock::now()) { + unsigned char enable_timing = 0; + xcclEndEvent_ = std::make_shared(enable_timing); +} + +ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) + : Work(w.rank_, w.opType_), + device_(w.device_), + xcclEndEvent_(w.xcclEndEvent_), + blockingWait_(w.blockingWait_), + workStartTime_(w.workStartTime_) {} + +ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; + +bool ProcessGroupXCCL::WorkXCCL::checkTimeout( + std::optional timeout) { + auto currentTimepoint = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast( + currentTimepoint - workStartTime_); + std::chrono::milliseconds opTimeout = std::chrono::milliseconds(60000); + + auto workTimeout = timeout ? *timeout : opTimeout; + + if (timeElapsed < workTimeout) + return false; + return true; +} + +bool ProcessGroupXCCL::WorkXCCL::isCompleted() { + if (xcclEndEvent_ && xcclEndEvent_->query()) { + return true; + } + return false; +} + +void ProcessGroupXCCL::WorkXCCL::synchronize() { + synchronizeInternal(kNoTimeout); +} + +void ProcessGroupXCCL::WorkXCCL::synchronizeStream() { + auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); + // Block the current stream on the XCCL stream + xcclEndEvent_->block(currentStream); +} + +void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( + std::chrono::milliseconds timeout) { + synchronizeStream(); + + if (blockingWait_) { + while (!isCompleted()) { + bool timedOut = checkTimeout( + timeout == kNoTimeout ? std::nullopt : std::make_optional(timeout)); + if (timedOut) { + break; + } + std::this_thread::sleep_for( + std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); + } + } +} + +bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { + synchronizeInternal(timeout); + return true; +} + +ProcessGroupXCCL::ProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank, + int size) + : Backend(rank, size), store_(store) { + blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); + init(); + + // Intel oneCCL requires passing CCL_LOCAL_RANK and CCL_LOCAL_SIZE for non-MPI + // launchers. 
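+  // For non-MPI launches, CCL_PROCESS_LAUNCHER is set to "none" and the local
+  // rank/size are taken from LOCAL_RANK / LOCAL_WORLD_SIZE, falling back to
+  // this process group's rank/size when those variables are not exported.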
+ if (!with_mpirun()) { + int local_rank = getXCCLEnvVar("LOCAL_RANK"); + int local_world_size = getXCCLEnvVar("LOCAL_WORLD_SIZE"); + if (local_rank == -1 || local_world_size == -1) { + local_rank = rank; + local_world_size = size; + } + setXCCLEnvVar("CCL_PROCESS_LAUNCHER", "none"); + setXCCLEnvVar("CCL_LOCAL_RANK", local_rank); + setXCCLEnvVar("CCL_LOCAL_SIZE", local_world_size); + } +} + +ProcessGroupXCCL::~ProcessGroupXCCL() = default; + +c10::intrusive_ptr ProcessGroupXCCL::initWork( + at::Device& device, + int rank, + OpType opType, + const std::vector& inputs, + const std::vector& outputs) { + auto r = c10::make_intrusive( + device, rank, opType, std::optional>(inputs)); + return r; +} + +std::shared_ptr ProcessGroupXCCL::getXCCLComm( + const std::string& deviceKey, + at::Device& device) { + if (deviceKey.empty()) { + C10_THROW_ERROR( + DistBackendError, + "Not able to create/get the XCCL Communicator since " + "the devices are empty "); + } + + { + std::lock_guard lock(mutex_); + if (devXCCLCommMap_.find(deviceKey) != devXCCLCommMap_.end()) { + return devXCCLCommMap_[deviceKey]; + } + } + + std::shared_ptr XCCLComm; + + XCCL_KVS kvs = get_kvs(rank_, *store_); + + int numRanks, rank; + numRanks = getSize(); + rank = getRank(); + + c10::impl::VirtualGuardImpl impl(device.type()); + c10::Stream stream = impl.getStream(device); + sycl::queue& q = c10::xpu::XPUStream(stream).queue(); + + auto ctx = ccl::create_context(q.get_context()); + ccl::vector_class> devs_rank; + devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); + + auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, kvs); + XCCLComm = std::make_shared(std::move(comms[0])); + + { + std::lock_guard lock(mutex_); + inInitializationCommMap_.emplace(deviceKey, XCCLComm); + } + + xcclStreams_.emplace(deviceKey, std::move(stream)); + + auto it = inInitializationCommMap_.find(deviceKey); + if (it != inInitializationCommMap_.end()) { + devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); + inInitializationCommMap_.erase(deviceKey); + + xcclCommDevIdxMapMutex.lock(); + xcclCommDevIdxMap.emplace(XCCLComm, device.index()); + xcclCommDevIdxMapMutex.unlock(); + } + + it = devXCCLCommMap_.find(deviceKey); + TORCH_INTERNAL_ASSERT( + it != devXCCLCommMap_.end(), "Communicators not populated in cache!"); + + return it->second; +} + +template +c10::intrusive_ptr ProcessGroupXCCL::collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType) { + using traits = function_traits; + using attr_t = typename traits::template arg<2>::type; + attr_t attr = ccl::create_operation_attr(); + + auto device = input.device(); + const auto key = std::to_string(device.index()); + auto comm = getXCCLComm(key, device); + + auto stream = xcclStreams_.at(key); + std::vector outputs{output}; + + c10::intrusive_ptr work; + + work = initWork(device, rank_, opType); + + work->outputs_ = + std::make_shared>(std::move(outputs)); + c10::xpu::XPUCachingAllocator::recordStream( + input.storage().data_ptr(), stream); + + auto ccl_stream = ccl::create_stream(stream.queue()); + + fn(input, output, attr, *comm, ccl_stream); + + work->xcclEndEvent_->record(stream); + + std::vector streams = {stream.unwrap()}; + c10::MultiStreamGuard streamGuard(streams); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + work->future_->markCompleted(at::IValue(*work->outputs_)); + work->blockingWait_ = blockingWait_; + + return 
work; +} + +template +c10::intrusive_ptr ProcessGroupXCCL::collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + OpType opType) { + return collective( + input, + output, + fn, + [](at::xpu::XPUStream&, + c10::intrusive_ptr& work) {}, + [](at::xpu::XPUStream&, + c10::intrusive_ptr& work) {}, + opType); +} + +c10::intrusive_ptr ProcessGroupXCCL::allreduce( + std::vector& tensors, + const AllreduceOptions& opts) { + TORCH_CHECK( + tensors.size() == 1, "Expecting one tensor only but got multiple"); + auto tensor = tensors.back(); + check_xpu_single_tensor(tensor); + return collective( + tensor, + tensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::allreduce_attr attr, + xcclComm_t& comm, + ccl::stream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::allreduce( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + xcclReduceOp, + comm, + stream, + attr); + return ret_evt; + }, + OpType::ALLREDUCE); +} + +} // namespace c10d + +#endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp new file mode 100644 index 00000000000000..14a9f398a8cbe7 --- /dev/null +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -0,0 +1,308 @@ +#pragma once + +#if defined(__linux__) +#include +#include +#include +#include +#endif + +#ifdef USE_C10D_XCCL +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +namespace c10d { + +namespace { +int getXCCLEnvVar(std::string envVarName) { + char* stringValue = std::getenv(envVarName.c_str()); + if (stringValue != nullptr) { + try { + int val = std::stoi(stringValue); + return val; + } catch (std::exception& e) { + TORCH_CHECK( + false, + "Invalid value for environment variable: " + std::string(envVarName)); + } + } else { + return -1; + } +} + +template +void setXCCLEnvVar(const std::string& envVarName, T val) { + if constexpr (std::is_same_v) { + setenv(envVarName.c_str(), std::to_string(val).c_str(), 1); + } else if constexpr (std::is_same_v) { + setenv(envVarName.c_str(), val.c_str(), 1); + } +} + +bool with_mpirun() { + return (getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || + getenv("PMI_RANK") || getenv("PMI_SIZE") || getenv("PMIX_RANK")) + ? 
true + : false; +} +} // namespace + +static std::vector TORCH_XCCL_BLOCKING_WAIT = { + "TORCH_XCCL_BLOCKING_WAIT", + "XCCL_BLOCKING_WAIT"}; + +using xcclComm_t = ccl::communicator; +using XCCL_KVS = ccl::shared_ptr_class; +constexpr const char* XCCL_BACKEND_NAME = "xccl"; + +class TORCH_API ProcessGroupXCCL : public Backend { + public: + class WorkXCCL : public Work { + public: + WorkXCCL( + at::Device& device, + int rank, + OpType opType, + const std::optional>& inputs = std::nullopt); + WorkXCCL(const WorkXCCL& w); + ~WorkXCCL() override; + + bool isCompleted() override; + + bool isSuccess() const override { + TORCH_CHECK( + false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); + } + + void abort() override { + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); + } + + void synchronize() override; + + void synchronizeStream(); + + bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; + + c10::intrusive_ptr getFuture() override { + return future_; + } + + std::vector result() override { + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); + } + + bool checkTimeout( + std::optional timeout = std::nullopt); + + protected: + at::Device device_; + std::shared_ptr xcclEndEvent_; + bool blockingWait_ = false; + std::chrono::time_point workStartTime_; + + private: + void synchronizeInternal(std::chrono::milliseconds timeout); + std::shared_ptr> outputs_; + c10::intrusive_ptr future_; + friend class ProcessGroupXCCL; + }; + + ProcessGroupXCCL(const c10::intrusive_ptr& store, int rank, int size); + + C10_DEPRECATED ProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank, + int size, + const std::string& groupName) + : ProcessGroupXCCL(store, rank, size) {} + + ~ProcessGroupXCCL() override; + + const std::string getBackendName() const override { + return std::string(XCCL_BACKEND_NAME); + } + + std::shared_ptr getXCCLComm( + const std::string& deviceKey, + at::Device& device); + + virtual c10::intrusive_ptr initWork( + at::Device& device, + int rank, + OpType opType, + const std::vector& inputs = {}, + const std::vector& outputs = {}); + + template + c10::intrusive_ptr collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + OpType opType); + + template + c10::intrusive_ptr collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType); + + c10::intrusive_ptr allreduce( + std::vector& tensors, + const AllreduceOptions& opts = AllreduceOptions()) override; + + c10::intrusive_ptr allreduce_coalesced( + std::vector& tensors, + const AllreduceCoalescedOptions& opts = + AllreduceCoalescedOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_coalesced not implemented"); + } + + c10::intrusive_ptr reduce( + std::vector& tensors, + const ReduceOptions& opts = ReduceOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::reduce not implemented"); + } + + c10::intrusive_ptr broadcast( + std::vector& tensors, + const BroadcastOptions& opts = BroadcastOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::broadcast not implemented"); + } + + c10::intrusive_ptr allgather( + std::vector>& outputTensors, + std::vector& inputTensors, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allgather not implemented"); + } + + c10::intrusive_ptr _allgather_base( + at::Tensor& outputbuffer, + at::Tensor& inputbuffer, + const AllgatherOptions& opts = AllgatherOptions()) override { + 
TORCH_CHECK(false, "ProcessGroupXCCL::_allgather_base not implemented"); + } + + c10::intrusive_ptr allgather_coalesced( + std::vector>& outputTensorLists, + std::vector& inputTensors, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allgather_coalesced not implemented"); + } + + c10::intrusive_ptr allgather_into_tensor_coalesced( + std::vector& outputs, + std::vector& inputs, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK( + false, + "ProcessGroupXCCL::allgather_into_tensor_coalesced not implemented"); + } + + c10::intrusive_ptr reduce_scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::reduce_scatter not implemented"); + } + + c10::intrusive_ptr _reduce_scatter_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK( + false, "ProcessGroupXCCL::_reduce_scatter_base not implemented"); + } + + c10::intrusive_ptr reduce_scatter_tensor_coalesced( + std::vector& outputs, + std::vector& inputs, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK( + false, + "ProcessGroupXCCL::reduce_scatter_tensor_coalesced not implemented"); + } + + c10::intrusive_ptr barrier( + const BarrierOptions& opts = BarrierOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::barrier not implemented"); + } + + c10::intrusive_ptr alltoall_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + std::vector& outputSplitSizes, + std::vector& inputSplitSizes, + const AllToAllOptions& opts = AllToAllOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::alltoall_base not implemented"); + } + + c10::intrusive_ptr alltoall( + std::vector& outputTensors, + std::vector& inputTensors, + const AllToAllOptions& opts = AllToAllOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::alltoall not implemented"); + } + + c10::intrusive_ptr send( + std::vector& tensors, + int dstRank, + int tag) override { + TORCH_CHECK(false, "ProcessGroupXCCL::send not implemented"); + } + + c10::intrusive_ptr recv( + std::vector& tensors, + int srcRank, + int tag) override { + TORCH_CHECK(false, "ProcessGroupXCCL::recv not implemented"); + } + + c10::intrusive_ptr gather( + std::vector>& outputTensors, + std::vector& inputTensors, + const GatherOptions& opts = GatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::gather not implemented"); + } + + c10::intrusive_ptr scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ScatterOptions& opts = ScatterOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::scatter not implemented"); + } + + protected: + std::unordered_map xcclStreams_; + std::unordered_map> + inInitializationCommMap_; + std::unordered_map> devXCCLCommMap_; + c10::intrusive_ptr store_; + std::mutex mutex_; + bool blockingWait_ = false; +}; +} // namespace c10d + +#endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index c8f9dff37f06e2..e3ed6d6bd4bcb4 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -37,6 +37,10 @@ #include #endif +#ifdef USE_C10D_XCCL +#include +#endif + #include #include #include @@ -2232,6 +2236,7 @@ The hook must have the following signature: .value("UNDEFINED", ::c10d::ProcessGroup::BackendType::UNDEFINED) 
.value("GLOO", ::c10d::ProcessGroup::BackendType::GLOO) .value("NCCL", ::c10d::ProcessGroup::BackendType::NCCL) + .value("XCCL", ::c10d::ProcessGroup::BackendType::XCCL) .value("UCC", ::c10d::ProcessGroup::BackendType::UCC) .value("MPI", ::c10d::ProcessGroup::BackendType::MPI) .value("CUSTOM", ::c10d::ProcessGroup::BackendType::CUSTOM) @@ -2877,6 +2882,23 @@ Example:: py::call_guard()); #endif +#ifdef USE_C10D_XCCL + auto processGroupXCCL = + intrusive_ptr_no_gil_destructor_class_<::c10d::ProcessGroupXCCL>( + module, "ProcessGroupXCCL", backend) + .def( + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, + int rank, + int size) { + return c10::make_intrusive<::c10d::ProcessGroupXCCL>( + store, rank, size); + }), + py::arg("store"), + py::arg("rank"), + py::arg("size"), + py::call_guard()); +#endif + py::enum_<::c10d::OpType>(module, "OpType") .value("BROADCAST", ::c10d::OpType::BROADCAST) .value("ALLREDUCE", ::c10d::OpType::ALLREDUCE) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 45e096985143a3..9fa3224873c9fc 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -87,6 +87,7 @@ "is_nccl_available", "is_torchelastic_launched", "is_ucc_available", + "is_xccl_available", "isend", "monitored_barrier", "new_group", @@ -130,6 +131,7 @@ _NCCL_AVAILABLE = True _GLOO_AVAILABLE = True _UCC_AVAILABLE = True +_XCCL_AVAILABLE = True _pickler = pickle.Pickler _unpickler = pickle.Unpickler @@ -193,6 +195,14 @@ def _export_c_types() -> None: except ImportError: _UCC_AVAILABLE = False +try: + from torch._C._distributed_c10d import ProcessGroupXCCL + + ProcessGroupXCCL.__module__ = "torch.distributed.distributed_c10d" + __all__ += ["ProcessGroupXCCL"] +except ImportError: + _XCCL_AVAILABLE = False + logger = logging.getLogger(__name__) PG_WRAPPER_STORE_PREFIX = "pg_wrapper" @@ -222,7 +232,7 @@ class Backend(str): """ An enum-like class for backends. - Available backends: GLOO, NCCL, UCC, MPI, and other registered backends. + Available backends: GLOO, NCCL, UCC, MPI, XCCL, and other registered backends. The values of this class are lowercase strings, e.g., ``"gloo"``. They can be accessed as attributes, e.g., ``Backend.NCCL``. @@ -242,21 +252,24 @@ class Backend(str): NCCL = "nccl" UCC = "ucc" MPI = "mpi" + XCCL = "xccl" _BackendPlugin = namedtuple("_BackendPlugin", ["creator_fn", "extended_api"]) _plugins: Dict[str, _BackendPlugin] = {} - backend_list = [UNDEFINED, GLOO, NCCL, UCC, MPI] + backend_list = [UNDEFINED, GLOO, NCCL, XCCL, UCC, MPI] default_device_backend_map: Dict[str, str] = { "cpu": GLOO, "cuda": NCCL, + "xpu": XCCL, } backend_capability: Dict[str, List[str]] = { GLOO: ["cpu", "cuda"], NCCL: ["cuda"], + XCCL: ["xpu"], UCC: ["cpu", "cuda"], MPI: ["cpu", "cuda"], } @@ -265,6 +278,7 @@ class Backend(str): UNDEFINED: ProcessGroup.BackendType.UNDEFINED, GLOO: ProcessGroup.BackendType.GLOO, NCCL: ProcessGroup.BackendType.NCCL, + XCCL: ProcessGroup.BackendType.XCCL, UCC: ProcessGroup.BackendType.UCC, } @@ -1098,6 +1112,11 @@ def is_ucc_available() -> bool: return _UCC_AVAILABLE +def is_xccl_available() -> bool: + """Check if the XCCL backend is available.""" + return _XCCL_AVAILABLE + + def is_backend_available(backend: str) -> bool: """ Check backend availability. 
@@ -1350,6 +1369,10 @@ def _set_pg_timeout(timeout: timedelta, group: Optional[ProcessGroup] = None) -> backends.add(backend) # type: ignore[arg-type] elif is_gloo_available() and isinstance(backend, ProcessGroupGloo): backends.add(backend) # type: ignore[arg-type] + if torch.device("xpu") in devices and is_xccl_available(): + backend = group._get_backend(torch.device("xpu")) + if isinstance(backend, ProcessGroupXCCL): + backends.add(backend) # type: ignore[arg-type] if len(backends) == 0: warnings.warn("Set timeout is now only supported for either nccl or gloo.") for backend in backends: @@ -1385,7 +1408,7 @@ def init_process_group( Args: backend (str or Backend, optional): The backend to use. Depending on - build-time configurations, valid values include ``mpi``, ``gloo``, + build-time configurations, valid values include ``mpi``, ``gloo``, ``xccl``, ``nccl``, and ``ucc``. If the backend is not provided, then both a ``gloo`` and ``nccl`` backend will be created, see notes below for how multiple backends are managed. This field can be given as a lowercase string @@ -1651,10 +1674,13 @@ def _new_process_group_helper( "created, please use a different group name" ) - if device_id is not None and (device_id.index is None or device_id.type != "cuda"): + if device_id is not None and ( + device_id.index is None + or (device_id.type != "cuda" and device_id.type != "xpu") + ): raise ValueError( "init_process_group device_id parameter must be a cuda device with an " - "id, e.g. cuda:0, not just cuda or cpu" + "id, e.g. cuda:0, xpu, not just cuda or xpu or cpu" ) # Note: _new_process_group_helper is only called from init_process_group, which always provides a timeout value @@ -1762,7 +1788,6 @@ def _new_process_group_helper( pg_options = ProcessGroupNCCL.Options() pg_options.is_high_priority_stream = False pg_options._timeout = timeout - if split_from: pg_options.split_from = split_from pg_options.split_color = _process_group_color(global_ranks_in_group) @@ -1781,6 +1806,17 @@ def _new_process_group_helper( backend_prefix_store, group_rank, group_size, timeout=timeout ) backend_type = ProcessGroup.BackendType.UCC + elif backend_str == Backend.XCCL: + if not is_xccl_available(): + raise RuntimeError("Distributed package doesn't have XCCL built in") + if pg_options is not None: + assert isinstance( + pg_options, ProcessGroupXCCL.Options + ), "Expected pg_options argument to be of type ProcessGroupXCCL.Options" + backend_class = ProcessGroupXCCL( + backend_prefix_store, group_rank, group_size + ) + backend_type = ProcessGroup.BackendType.XCCL else: assert ( backend_str.upper() in Backend._plugins diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index d59102232f7db7..26bdcce6103120 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -180,7 +180,8 @@ def skip_if_lt_x_gpu(x): def decorator(func): @wraps(func) def wrapper(*args, **kwargs): - if torch.cuda.is_available() and torch.cuda.device_count() >= x: + if (torch.cuda.is_available() and torch.cuda.device_count() >= x) or \ + (torch.xpu.is_available() and torch.xpu.device_count() >= x): return func(*args, **kwargs) sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) @@ -320,6 +321,12 @@ def requires_nccl(): "c10d was not compiled with the NCCL backend", ) +def requires_xccl(): + return skip_but_pass_in_sandcastle_if( + not c10d.is_xccl_available(), + "c10d was not compiled with the XCCL backend", + ) + def requires_ucc(): return 
skip_but_pass_in_sandcastle_if( not c10d.is_ucc_available(), @@ -463,7 +470,7 @@ def init_multigpu_helper(world_size: int, backend: str): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. """ - nGPUs = torch.cuda.device_count() + nGPUs = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() visible_devices = range(nGPUs) # If rank is less than or equal to number of available GPU's From a71d69a50684d8e6c6edd2ddc285f7589a44914d Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 20 Sep 2024 03:39:00 +0000 Subject: [PATCH 67/96] Align latest --- torch/distributed/distributed_c10d.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 2d9357bbd15a44..4bbb1c41011231 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1819,10 +1819,10 @@ def _new_process_group_helper( elif backend_str == Backend.XCCL: if not is_xccl_available(): raise RuntimeError("Distributed package doesn't have XCCL built in") - if pg_options is not None: + if backend_options is not None: assert isinstance( - pg_options, ProcessGroupXCCL.Options - ), "Expected pg_options argument to be of type ProcessGroupXCCL.Options" + backend_options, ProcessGroupXCCL.Options + ), "Expected backend_options argument to be of type ProcessGroupXCCL.Options" backend_class = ProcessGroupXCCL( backend_prefix_store, group_rank, group_size ) From 4bf448dd02db537095d0e0ec275116246d0bac92 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 20 Sep 2024 05:21:28 +0000 Subject: [PATCH 68/96] update --- torch/distributed/distributed_c10d.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index dfae588345c726..4bbb1c41011231 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1819,17 +1819,10 @@ def _new_process_group_helper( elif backend_str == Backend.XCCL: if not is_xccl_available(): raise RuntimeError("Distributed package doesn't have XCCL built in") -<<<<<<< HEAD - if pg_options is not None: - assert isinstance( - pg_options, ProcessGroupXCCL.Options - ), "Expected pg_options argument to be of type ProcessGroupXCCL.Options" -======= if backend_options is not None: assert isinstance( backend_options, ProcessGroupXCCL.Options ), "Expected backend_options argument to be of type ProcessGroupXCCL.Options" ->>>>>>> xccl-bak backend_class = ProcessGroupXCCL( backend_prefix_store, group_rank, group_size ) From 1f83fbf89397b132db708279369dbe1940b527a6 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 20 Sep 2024 05:48:39 +0000 Subject: [PATCH 69/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 25 +------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index acdcae2eea4cde..32030f45e73ae3 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -458,29 +457,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work = initWork(device, rank_, opType); - { // Do we need to store the result of the operation? 
- std::variant, std::vector>> - outputs; - std::visit( - [&work](auto&& outputData) { - using T = std::decay_t; - - if constexpr (std::is_same_v>) { - work->outputs_ = std::make_shared>( - std::move(outputData)); - } else if constexpr (std::is_same_v< - T, - std::vector>>) { - std::vector flattened; - for (auto& vec : outputData) { - flattened.insert(flattened.end(), vec.begin(), vec.end()); - } - work->outputs_ = - std::make_shared>(std::move(flattened)); - } - }, - outputs); - } + work->outputs_ = std::make_shared>(outputs); pre(stream, work); From 4f4ecf476317fc0a12b768dc77f7b109ba01020e Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 20 Sep 2024 07:34:29 +0000 Subject: [PATCH 70/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 32030f45e73ae3..e973ce110ab0bc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -4,6 +4,7 @@ #include #ifdef USE_C10D_XCCL +#include #include #include #include @@ -459,6 +460,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->outputs_ = std::make_shared>(outputs); + at::xpu::OptionalXPUGuard gpuGuard(device); + pre(stream, work); for (const auto& input : inputs) { @@ -470,7 +473,9 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( post(stream, work); - work->xcclEndEvent_->record(stream); + if (!coalescing_state_) { + work->xcclEndEvent_->record(stream); + } std::vector streams = {stream.unwrap()}; c10::MultiStreamGuard streamGuard(streams); @@ -550,6 +555,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( work->outputs_ = std::make_shared>(outputs); + at::xpu::OptionalXPUGuard gpuGuard(device); + { AutoXcclGroup xccl_group_guard; for (const auto i : c10::irange(inputs.size())) { From b6bc4a82376d23ccaf9ad951ff3a016677bfc9f8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 20 Sep 2024 08:01:50 +0000 Subject: [PATCH 71/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 034f201dc75bd7..59cd218b7f1c25 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -733,6 +733,10 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( return ret_evt; } }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, OpType::GATHER); } @@ -834,6 +838,10 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( return ret_evt; } }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, OpType::SCATTER); } @@ -1545,6 +1553,10 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall( stream.synchronize(); return ret_evt; }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, OpType::ALLTOALL); } From 1fbb7edfa494196a90a2445d74aaea7e0966ca7b Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 23 Sep 2024 07:15:42 +0000 Subject: [PATCH 72/96] support p2p --- test/distributed/test_c10d_ops_xccl.py | 89 ++++--- .../distributed/c10d/ProcessGroupXCCL.cpp | 226 +++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 30 ++- 3 files changed, 281 insertions(+), 64 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py 
b/test/distributed/test_c10d_ops_xccl.py index 8cfce2be164d9f..279ec0eb03ecf8 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -758,51 +758,50 @@ def allreduce(tensors): torch.tensor([(j + 1) * self.world_size]), tensors_list[i - 1][j] ) - # TODO: wait send/recv - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_send_recv(self): - # pg = self.pg - # device = self.rank_to_GPU[self.rank][0] - - # # Generate the same random tensor - # torch.manual_seed(0) - # send_tensor = torch.rand(10, 10, device=device) - # if self.rank == 0: - # dist.send(send_tensor, 1) - # if self.rank == 1: - # recv_tensor = torch.rand(10, 10, device=device) - # dist.recv(recv_tensor, 0) - # self.assertEqual(send_tensor, recv_tensor) - - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_send_recv_complex(self): - # pg = self.pg - # device = self.rank_to_GPU[self.rank][0] - - # # Generate the same random tensor - # torch.manual_seed(0) - # send_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) - # if self.rank == 0: - # dist.send(send_tensor, 1) - # if self.rank == 1: - # recv_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) - # dist.recv(recv_tensor, 0) - # self.assertEqual(send_tensor, recv_tensor) - - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_send_recv_object_list(self): - # device = self.rank_to_GPU[self.rank][0] - - # val = 99 if self.rank == 0 else None - # object_list = [val] * self.world_size - # if self.rank == 0: - # dist.send_object_list(object_list, 1, device=device) - # if self.rank == 1: - # dist.recv_object_list(object_list, 0, device=device) - # self.assertEqual(object_list[0], 99) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_send_recv(self): + pg = self.pg + device = self.rank_to_GPU[self.rank][0] + + # Generate the same random tensor + torch.manual_seed(0) + send_tensor = torch.rand(10, 10, device=device) + if self.rank == 0: + dist.send(send_tensor, 1) + if self.rank == 1: + recv_tensor = torch.rand(10, 10, device=device) + dist.recv(recv_tensor, 0) + self.assertEqual(send_tensor, recv_tensor) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_send_recv_complex(self): + pg = self.pg + device = self.rank_to_GPU[self.rank][0] + + # Generate the same random tensor + torch.manual_seed(0) + send_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) + if self.rank == 0: + dist.send(send_tensor, 1) + if self.rank == 1: + recv_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) + dist.recv(recv_tensor, 0) + self.assertEqual(send_tensor, recv_tensor) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_send_recv_object_list(self): + device = self.rank_to_GPU[self.rank][0] + + val = 99 if self.rank == 0 else None + object_list = [val] * self.world_size + if self.rank == 0: + dist.send_object_list(object_list, 1, device=device) + if self.rank == 1: + dist.recv_object_list(object_list, 0, device=device) + self.assertEqual(object_list[0], 99) if __name__ == "__main__": diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 59cd218b7f1c25..27a11506f5e309 
100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -228,6 +228,14 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { } } +void syncStream( + at::Device& device, + at::xpu::XPUEvent& xcclEvent, + at::xpu::XPUStream& xcclStream) { + xcclEvent.record(at::xpu::getCurrentXPUStream(device.index())); + xcclEvent.block(xcclStream); +} + bool complexViewAsRealAllowed(const ReduceOp reduceOp) { switch (reduceOp) { case ReduceOp::SUM: @@ -245,9 +253,6 @@ bool complexViewAsRealAllowed(const ReduceOp reduceOp) { static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; constexpr int64_t kSynchronizeBusyWaitMillis = 10; - -// Before implementing send/recv, the xcclActiveGroupCounter_ variable has no -// effect. thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; ProcessGroupXCCL::WorkXCCL::WorkXCCL( @@ -369,7 +374,10 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, - at::Device& device) { + at::Device& device, + OpType opType, + int p2pRank, + bool isSendRecvSelf) { if (deviceKey.empty()) { C10_THROW_ERROR( DistBackendError, @@ -387,12 +395,29 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( } std::shared_ptr XCCLComm; - XCCL_KVS kvs = get_kvs(rank_, *store_); + bool batchP2P = xcclActiveGroupCounter_ > 0; + bool singleP2POp = isP2POp(opType, batchP2P); + + at::xpu::OptionalXPUGuard gpuGuard(device); + + for (const auto i : c10::irange(xcclActiveGroupCounter_)) { + (void)i; + ccl::group_end(); + } + int numRanks, rank; - numRanks = getSize(); - rank = getRank(); + if (!singleP2POp) { + numRanks = getSize(); + rank = getRank(); + } else if (isSendRecvSelf) { + numRanks = 1; + rank = 0; + } else { + numRanks = 2; + rank = p2pRank; + } c10::impl::VirtualGuardImpl impl(device.type()); c10::Stream stream = impl.getStream(device); @@ -410,7 +435,13 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( inInitializationCommMap_.emplace(deviceKey, XCCLComm); } + for (const auto i : c10::irange(xcclActiveGroupCounter_)) { + (void)i; + ccl::group_start(); + } + xcclStreams_.emplace(deviceKey, std::move(stream)); + xcclEvents_.emplace(deviceKey, at::xpu::XPUEvent()); auto it = inInitializationCommMap_.find(deviceKey); if (it != inInitializationCommMap_.end()) { @@ -440,7 +471,7 @@ void ProcessGroupXCCL::groupEnd() { } // TODO: wait p2p enable -static constexpr int CoalActive = 0x01, CoalColl = 0x02; +static constexpr int CoalActive = 0x01, CoalColl = 0x02, CoalP2P = 0x04; void ProcessGroupXCCL::startCoalescing() { coalescedDevice_.set_index(-1); coalescedComm_ = nullptr; @@ -496,7 +527,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto device = inputs[0].device(); const auto key = std::to_string(device.index()); - auto comm = getXCCLComm(key, device); + auto comm = getXCCLComm(key, device, opType); if (coalescing_state_ & CoalActive) { coalescing_state_ |= CoalColl; @@ -514,9 +545,9 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( } auto stream = xcclStreams_.at(key); + syncStream(device, xcclEvents_[key], stream); c10::intrusive_ptr work; - work = initWork(device, rank_, opType); work->outputs_ = std::make_shared>(outputs); @@ -591,7 +622,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( auto device = inputs[0].device(); const auto key = std::to_string(device.index()); - auto comm = getXCCLComm(key, device); + auto comm = getXCCLComm(key, device, opType); if 
(coalescing_state_ & CoalActive) { coalescing_state_ |= CoalColl; @@ -609,9 +640,9 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( } auto stream = xcclStreams_.at(key); + syncStream(device, xcclEvents_[key], stream); c10::intrusive_ptr work; - work = initWork(device, rank_, opType); work->outputs_ = std::make_shared>(outputs); @@ -640,6 +671,177 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( return work; } +template +c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( + at::Tensor& tensor, + Fn fn, + int peer, + OpType opType, + PreProcess pre, + PostProcess post) { + using traits = function_traits; + using attr_t = typename traits::template arg<1>::type; + attr_t attr = ccl::create_operation_attr(); + + auto device = tensor.device(); + std::string key; + int p2pRank = 0, p2pTargetRank = 0; + bool isSendRecvSelf = false; + + bool batchP2P = xcclActiveGroupCounter_ > 0; + if (batchP2P) { + key = std::to_string(device.index()); + p2pRank = rank_; + p2pTargetRank = peer; + } else { + int lowRank = rank_ < peer ? rank_ : peer; + int highRank = rank_ < peer ? peer : rank_; + key = std::to_string(lowRank) + ":" + std::to_string(highRank); + p2pRank = rank_ <= peer ? 0 : 1; + isSendRecvSelf = rank_ == peer; + p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank; + } + + auto comm = getXCCLComm(key, device, opType, p2pRank, isSendRecvSelf); + + if (coalescing_state_ & CoalActive) { + coalescing_state_ |= CoalP2P; + if (coalescedDevice_.index() < 0) { + coalescedDevice_ = device; + } else { + TORCH_CHECK( + coalescedDevice_.index() == device.index(), MULTI_DEVICE_ERROR_MSG); + } + if (coalescedComm_ == nullptr) { + coalescedComm_ = comm; + } else { + TORCH_CHECK(coalescedComm_ == comm, MULTI_DEVICE_ERROR_MSG); + } + } + + auto stream = xcclStreams_.at(key); + syncStream(device, xcclEvents_[key], stream); + + c10::intrusive_ptr work; + if (!coalescing_state_) { + work = initWork(device, rank_, opType); + work->outputs_ = std::make_shared>(); + work->outputs_->push_back(tensor); + } + + at::xpu::OptionalXPUGuard gpuGuard(device); + + if (!coalescing_state_) { + pre(stream, work); + } + + c10::xpu::XPUCachingAllocator::recordStream( + tensor.storage().data_ptr(), stream); + + fn(tensor, attr, *comm, stream, p2pTargetRank); + + if (!coalescing_state_) { + post(stream); + + work->xcclEndEvent_->record(stream); + work->blockingWait_ = blockingWait_; + + { + std::vector streams = {stream.unwrap()}; + c10::MultiStreamGuard streamGuard(streams); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + work->future_->markCompleted(at::IValue(*work->outputs_)); + } + return work; + } else { + return nullptr; + } +} + +template +c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( + at::Tensor& tensor, + Fn fn, + int peer, + OpType opType) { + return pointToPoint( + tensor, + fn, + peer, + opType, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, + [](at::xpu::XPUStream&) {}); +} + +c10::intrusive_ptr ProcessGroupXCCL::send( + std::vector& tensors, + int dstRank, + int /* unused */) { + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + // @lint-ignore CLANGTIDY + auto tensor = tensors.back(); + check_xpu_single_tensor(tensor, true); + + auto ret = pointToPoint( + tensor, + [&](at::Tensor& input, + ccl::pt2pt_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream, + int dst) { + ccl::event ret_evt; + auto xcclDataType = getXcclDataType(input.scalar_type()); + ret_evt = ccl::send( + input.data_ptr(), 
+ (size_t)input.numel(), + xcclDataType, + dst, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + dstRank, + OpType::SEND); + return ret; +} + +c10::intrusive_ptr ProcessGroupXCCL::recv( + std::vector& tensors, + int srcRank, + int /* unused */) { + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + // @lint-ignore CLANGTIDY + auto tensor = tensors.back(); + check_xpu_single_tensor(tensor, true); + + auto ret = pointToPoint( + tensor, + [&](at::Tensor& output, + ccl::pt2pt_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream, + int src) { + ccl::event ret_evt; + auto xcclDataType = getXcclDataType(output.scalar_type()); + ret_evt = ccl::recv( + output.data_ptr(), + (size_t)output.numel(), + xcclDataType, + src, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + srcRank, + OpType::RECV); + return ret; +} + c10::intrusive_ptr ProcessGroupXCCL::gather( std::vector>& outputTensors, std::vector& inputTensors, diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index bc69ec992e8649..033396e87fdedc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -155,7 +155,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::shared_ptr getXCCLComm( const std::string& deviceKey, - at::Device& device); + at::Device& device, + OpType opType, + int p2pRank = 0, + bool isSendRecvSelf = false); virtual c10::intrusive_ptr initWork( at::Device& device, @@ -196,6 +199,22 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, OpType opType); + template + c10::intrusive_ptr pointToPoint( + at::Tensor& tensor, + Fn fn, + int peer, + OpType opType); + + template + c10::intrusive_ptr pointToPoint( + at::Tensor& tensor, + Fn fn, + int peer, + OpType opType, + PreProcess pre, + PostProcess post); + c10::intrusive_ptr allreduce_impl( at::Tensor& tensor, const AllreduceOptions& opts = AllreduceOptions()); @@ -282,16 +301,12 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr send( std::vector& tensors, int dstRank, - int tag) override { - TORCH_CHECK(false, "ProcessGroupXCCL::send not implemented"); - } + int tag) override; c10::intrusive_ptr recv( std::vector& tensors, int srcRank, - int tag) override { - TORCH_CHECK(false, "ProcessGroupXCCL::recv not implemented"); - } + int tag) override; void groupStart(); @@ -309,6 +324,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { protected: std::unordered_map xcclStreams_; + std::unordered_map xcclEvents_; std::unordered_map> inInitializationCommMap_; std::unordered_map> devXCCLCommMap_; From 88bea257af5931dc9e083f1cf7027e802d587f8d Mon Sep 17 00:00:00 2001 From: hanchao Date: Sun, 29 Sep 2024 09:05:17 +0000 Subject: [PATCH 73/96] refine findccl code --- cmake/Modules/FindXCCL.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake index 56b7fc0f7dcf32..a717ad1dafc653 100644 --- a/cmake/Modules/FindXCCL.cmake +++ b/cmake/Modules/FindXCCL.cmake @@ -11,10 +11,10 @@ if(DEFINED ENV{CCL_ROOT}) set(XCCL_ROOT $ENV{CCL_ROOT}) endif() -string(COMPARE EQUAL "${XCCL_ROOT}" "" nosyclfound) -if(nosyclfound) +string(COMPARE EQUAL "${XCCL_ROOT}" "" nocclfound) +if(nocclfound) set(XCCL_FOUND False) - set(XCCL_REASON_FAILURE "XCCL library not set!!") + set(XCCL_REASON_FAILURE "OneCCL library not found!!") set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}") return() 
 endif()
@@ -55,7 +55,7 @@ find_library(
 if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY))
     set(XCCL_FOUND False)
-    set(XCCL_REASON_FAILURE "XCCL library is incomplete!!")
+    set(XCCL_REASON_FAILURE "OneCCL library not found!!")
     set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
     return()
 endif()

From f6ea93450c1b8bd10b709ee5b4076ac25c6413a2 Mon Sep 17 00:00:00 2001
From: hanchao
Date: Mon, 30 Sep 2024 04:41:54 +0000
Subject: [PATCH 74/96] Add comments for build xccl

---
 caffe2/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 2160399a3ea296..54ec7db0cad87e 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1013,6 +1013,10 @@ elseif(USE_CUDA)
 endif()
 if(USE_XPU)
+  # If the SYCL runtime and the oneCCL runtime are both installed on the system,
+  # the build flags default to USE_XPU=ON, USE_XCCL=ON and USE_C10D_XCCL=ON,
+  # and the XCCL backend will be built into libtorch_xpu.
+  # Manually set `USE_XCCL=OFF` to disable building the XCCL backend.
   if(USE_XCCL)
     append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS)
   endif()

From 31d092d72303b08dcdfa0b2fd8b4e4ae45d3dffd Mon Sep 17 00:00:00 2001
From: hanchao
Date: Wed, 9 Oct 2024 08:18:35 +0000
Subject: [PATCH 75/96] minor fix

---
 caffe2/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 54ec7db0cad87e..16a8834225915c 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1085,7 +1085,7 @@ if(USE_XPU)
     include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS})
   endif()
-  if(USE_XCCL)
+  if(USE_C10D_XCCL)
     target_link_libraries(torch_xpu PRIVATE torch::xccl)
     target_compile_definitions(torch_xpu PRIVATE USE_XCCL)
   endif()

From cbea299190dcc0e90796fac38cb7b1adb2a34e1a Mon Sep 17 00:00:00 2001
From: hanchao
Date: Wed, 9 Oct 2024 10:31:25 +0000
Subject: [PATCH 76/96] rm duplicate code and refine cmake

---
 CMakeLists.txt            | 2 --
 caffe2/CMakeLists.txt     | 4 ++--
 cmake/Dependencies.cmake  | 1 -
 cmake/External/xccl.cmake | 3 +++
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0318fcb4d1ec04..60fc8aae14173b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -369,8 +369,6 @@ cmake_dependent_option(
   USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
 cmake_dependent_option(
   USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF)
-cmake_dependent_option(
-  USE_C10D_XCCL "USE C10D XCCL" ON "USE_DISTRIBUTED;USE_XCCL" OFF)
 cmake_dependent_option(
   USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
 cmake_dependent_option(
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 16a8834225915c..b4ec018019f165 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1085,7 +1085,7 @@ if(USE_XPU)
     include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS})
   endif()
-  if(USE_C10D_XCCL)
+  if(USE_XCCL)
     target_link_libraries(torch_xpu PRIVATE torch::xccl)
     target_compile_definitions(torch_xpu PRIVATE USE_XCCL)
   endif()
@@ -1374,7 +1374,7 @@ if(USE_DISTRIBUTED)
       target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
     endif()
   endif()
-  if(USE_C10D_XCCL)
+  if(USE_XPU AND USE_C10D_XCCL)
     target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL)
     set_source_files_properties(
       ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 3e59b813d31381..ee38f19773af81 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1163,7
+1163,6 @@ if(USE_XCCL) caffe2_update_option(USE_XCCL OFF) else() include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake) - list(APPEND Caffe2_XPU_DEPENDENCY_LIBS torch::xccl) endif() endif() diff --git a/cmake/External/xccl.cmake b/cmake/External/xccl.cmake index 56205b381b1324..467bb830e0b6cf 100644 --- a/cmake/External/xccl.cmake +++ b/cmake/External/xccl.cmake @@ -12,6 +12,9 @@ if(NOT __XCCL_INCLUDED) set_property( TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES ${XCCL_LIBRARY}) + else() + set(USE_XCCL OFF) + set(USE_C10D_XCCL OFF) endif() endif() endif() From ef261c6f3de6d9cd25c12dbf149fca83482996e2 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 10 Oct 2024 00:30:33 +0000 Subject: [PATCH 77/96] update cmake --- cmake/Dependencies.cmake | 3 +++ cmake/External/xccl.cmake | 25 ++++++++++--------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ee38f19773af81..f90846e89c7549 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1163,6 +1163,9 @@ if(USE_XCCL) caffe2_update_option(USE_XCCL OFF) else() include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake) + if(NOT XCCL_FOUND) + caffe2_update_option(USE_XCCL OFF) + endif() endif() endif() diff --git a/cmake/External/xccl.cmake b/cmake/External/xccl.cmake index 467bb830e0b6cf..acb7cee87593e1 100644 --- a/cmake/External/xccl.cmake +++ b/cmake/External/xccl.cmake @@ -1,20 +1,15 @@ if(NOT __XCCL_INCLUDED) set(__XCCL_INCLUDED TRUE) - if(USE_XCCL) - # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake. - find_package(XCCL REQUIRED) - if(XCCL_FOUND) - add_library(torch::xccl INTERFACE IMPORTED) - set_property( - TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES - ${XCCL_INCLUDE_DIR}) - set_property( - TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES - ${XCCL_LIBRARY}) - else() - set(USE_XCCL OFF) - set(USE_C10D_XCCL OFF) - endif() + # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake. 
+ find_package(XCCL REQUIRED) + if(XCCL_FOUND) + add_library(torch::xccl INTERFACE IMPORTED) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${XCCL_INCLUDE_DIR}) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES + ${XCCL_LIBRARY}) endif() endif() From 6c648cdbc1260f55256eebff0e0a0d6981b66694 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 24 Sep 2024 05:19:22 +0000 Subject: [PATCH 78/96] hidden xccl specific --- .../distributed/c10d/ProcessGroupXCCL.cpp | 104 +++--------------- .../distributed/c10d/ProcessGroupXCCL.hpp | 103 ++++++++++------- torch/csrc/distributed/c10d/Utils.hpp | 25 +++++ 3 files changed, 106 insertions(+), 126 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 5aeeb62bee1ece..d26d25ae03e39a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,11 +1,9 @@ +#ifdef USE_C10D_XCCL + #include #include -#include -#include - -#ifdef USE_C10D_XCCL -#include #include +#include #include #include #include @@ -13,15 +11,7 @@ #include #include -#include -#include -#include #include -#include -#include -#include -#include -#include namespace c10d { @@ -45,36 +35,6 @@ std::map xcclDatatypes = { {at::kBool, ccl::datatype::uint8}, }; -XCCL_KVS kvs; -std::mutex kvs_mutex; - -XCCL_KVS get_kvs(int rank, c10d::Store& store) { - std::lock_guard lock(kvs_mutex); - if (kvs) - return kvs; - std::string storeKey = "xccl_kvs"; - - // Rank 0 broadcast the bootstrap network information to other ranks - if (rank == 0) { - kvs = ccl::create_main_kvs(); - ccl::kvs::address_type main_addr = kvs->get_address(); - auto ccl_kvs_addr = - std::vector(main_addr.begin(), main_addr.end()); - store.set(storeKey, ccl_kvs_addr); - } else { - auto ccl_kvs_addr = store.get(storeKey); - if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { - throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); - } - ccl::kvs::address_type main_addr; - std::copy_n( - ccl_kvs_addr.begin(), ccl::kvs::address_max_size, main_addr.begin()); - kvs = ccl::create_kvs(main_addr); - } - - return kvs; -} - void check_xpu_single_tensor(const at::Tensor& tensor) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); @@ -106,23 +66,9 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { } return xcclOps.at(reduceOp); } catch (const std::out_of_range&) { - switch (reduceOp) { - case ReduceOp::AVG: - C10_THROW_ERROR(ValueError, "Cannot use ReduceOp AVG with XCCL"); - break; - case ReduceOp::BAND: - C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BAND with XCCL"); - break; - case ReduceOp::BOR: - C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BOR with XCCL"); - break; - case ReduceOp::BXOR: - C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BXOR with XCCL"); - break; - default: - C10_THROW_ERROR(ValueError, "Unhandled ReduceOp"); - break; - } + C10_THROW_ERROR( + ValueError, + "Cannot use ReduceOp." 
+ reduce_op_to_string(reduceOp) + " with XCCL"); } } @@ -153,20 +99,6 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; -bool ProcessGroupXCCL::WorkXCCL::checkTimeout( - std::optional timeout) { - auto currentTimepoint = std::chrono::steady_clock::now(); - auto timeElapsed = std::chrono::duration_cast( - currentTimepoint - workStartTime_); - std::chrono::milliseconds opTimeout = std::chrono::milliseconds(60000); - - auto workTimeout = timeout ? *timeout : opTimeout; - - if (timeElapsed < workTimeout) - return false; - return true; -} - bool ProcessGroupXCCL::WorkXCCL::isCompleted() { if (xcclEndEvent_ && xcclEndEvent_->query()) { return true; @@ -178,23 +110,23 @@ void ProcessGroupXCCL::WorkXCCL::synchronize() { synchronizeInternal(kNoTimeout); } -void ProcessGroupXCCL::WorkXCCL::synchronizeStream() { - auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); - // Block the current stream on the XCCL stream - xcclEndEvent_->block(currentStream); -} - void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( std::chrono::milliseconds timeout) { - synchronizeStream(); - + auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); + xcclEndEvent_->block(currentStream); if (blockingWait_) { while (!isCompleted()) { - bool timedOut = checkTimeout( - timeout == kNoTimeout ? std::nullopt : std::make_optional(timeout)); - if (timedOut) { - break; + auto currentTimepoint = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast( + currentTimepoint - workStartTime_); + if (timeElapsed >= timeout) { + std::string exceptionMsg = c10::str( + "Work ran for ", + timeElapsed.count(), + " milliseconds before timing out."); + TORCH_CHECK(false, exceptionMsg) } + std::this_thread::sleep_for( std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 14a9f398a8cbe7..99b815f2138b4e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -28,43 +28,8 @@ #include #include #include -#include namespace c10d { -namespace { -int getXCCLEnvVar(std::string envVarName) { - char* stringValue = std::getenv(envVarName.c_str()); - if (stringValue != nullptr) { - try { - int val = std::stoi(stringValue); - return val; - } catch (std::exception& e) { - TORCH_CHECK( - false, - "Invalid value for environment variable: " + std::string(envVarName)); - } - } else { - return -1; - } -} - -template -void setXCCLEnvVar(const std::string& envVarName, T val) { - if constexpr (std::is_same_v) { - setenv(envVarName.c_str(), std::to_string(val).c_str(), 1); - } else if constexpr (std::is_same_v) { - setenv(envVarName.c_str(), val.c_str(), 1); - } -} - -bool with_mpirun() { - return (getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || - getenv("PMI_RANK") || getenv("PMI_SIZE") || getenv("PMIX_RANK")) - ? 
true - : false; -} -} // namespace - static std::vector TORCH_XCCL_BLOCKING_WAIT = { "TORCH_XCCL_BLOCKING_WAIT", "XCCL_BLOCKING_WAIT"}; @@ -98,8 +63,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { void synchronize() override; - void synchronizeStream(); - bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; c10::intrusive_ptr getFuture() override { @@ -110,9 +73,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); } - bool checkTimeout( - std::optional timeout = std::nullopt); - protected: at::Device device_; std::shared_ptr xcclEndEvent_; @@ -302,7 +262,70 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr store_; std::mutex mutex_; bool blockingWait_ = false; + + private: + XCCL_KVS kvs; + std::mutex kvs_mutex; + XCCL_KVS get_kvs(int rank, c10d::Store& store) { + std::lock_guard lock(kvs_mutex); + if (kvs) + return kvs; + std::string storeKey = "xccl_kvs"; + // Rank 0 broadcast the bootstrap network information to other ranks + if (rank == 0) { + kvs = ccl::create_main_kvs(); + ccl::kvs::address_type main_addr = kvs->get_address(); + auto ccl_kvs_addr = + std::vector(main_addr.begin(), main_addr.end()); + store.set(storeKey, ccl_kvs_addr); + } else { + auto ccl_kvs_addr = store.get(storeKey); + if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { + throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); + } + ccl::kvs::address_type main_addr; + std::copy_n( + ccl_kvs_addr.begin(), ccl::kvs::address_max_size, main_addr.begin()); + kvs = ccl::create_kvs(main_addr); + } + return kvs; + } }; + +namespace { +int getXCCLEnvVar(std::string envVarName) { + char* stringValue = std::getenv(envVarName.c_str()); + if (stringValue != nullptr) { + try { + int val = std::stoi(stringValue); + return val; + } catch (std::exception& e) { + TORCH_CHECK( + false, + "Invalid value for environment variable: " + std::string(envVarName)); + } + } else { + return -1; + } +} + +template +void setXCCLEnvVar(const std::string& envVarName, T val) { + if constexpr (std::is_same_v) { + setenv(envVarName.c_str(), std::to_string(val).c_str(), 1); + } else if constexpr (std::is_same_v) { + setenv(envVarName.c_str(), val.c_str(), 1); + } +} + +bool with_mpirun() { + return (getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || + getenv("PMI_RANK") || getenv("PMI_SIZE") || getenv("PMIX_RANK")) + ? 
true + : false; +} + +} // namespace } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp index ea4a4653bc35fc..73e37e0437c459 100644 --- a/torch/csrc/distributed/c10d/Utils.hpp +++ b/torch/csrc/distributed/c10d/Utils.hpp @@ -557,6 +557,31 @@ size_t computeLengthsAndOffsets( return offset; } +inline std::string reduce_op_to_string(c10d::ReduceOp op) { + switch (op) { + case c10d::ReduceOp::SUM: + return "SUM"; + case c10d::ReduceOp::PRODUCT: + return "PRODUCT"; + case c10d::ReduceOp::MIN: + return "MIN"; + case c10d::ReduceOp::MAX: + return "MAX"; + case c10d::ReduceOp::BAND: + return "BAND"; + case c10d::ReduceOp::BOR: + return "BOR"; + case c10d::ReduceOp::BXOR: + return "BXOR"; + case c10d::ReduceOp::AVG: + return "AVG"; + case c10d::ReduceOp::PREMUL_SUM: + return "PREMUL_SUM"; + default: + return "UNKNOWN"; + } +} + using RankType = uint32_t; using SizeType = uint64_t; From e621fe6010382c3c4e614df4ace6a861f598442d Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 11 Oct 2024 01:55:18 +0000 Subject: [PATCH 79/96] fix ci fail --- test/distributed/test_c10d_common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index d96abb1ca82675..903df26bba9f6f 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -1836,6 +1836,9 @@ def test_init_process_group_for_all_backends(self): elif backend == dist.Backend.UCC: if not dist.is_ucc_available(): continue + elif backend == dist.Backend.XCCL: + if not dist.is_xccl_available(): + continue # Multi-threaded PG is defined as a pure python class. # Its pg.name() does not going through Pybind, so its backend name # is still "threaded" instead of "custom". 
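For context, the backend check added above ends up exercising ordinary c10d usage of the new backend. A minimal usage sketch (not part of the patch series), assuming the backend is registered under the name "xccl", an XPU device is visible, and a single-process env:// rendezvous:

import os
import torch
import torch.distributed as dist

# Placeholder rendezvous settings for a single-process run.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

dist.init_process_group(backend="xccl", rank=0, world_size=1)
t = torch.ones(8, device="xpu")
dist.all_reduce(t)  # routed to ProcessGroupXCCL::allreduce
dist.destroy_process_group()
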
From f85a8451003bf14b6bc72ddd7799c7bc239bd8b4 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 12 Oct 2024 02:35:00 +0000 Subject: [PATCH 80/96] rm ccl attr --- .../distributed/c10d/ProcessGroupXCCL.cpp | 78 ++++--------------- 1 file changed, 17 insertions(+), 61 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 3a795c817bec22..365640d1377781 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,9 +1,9 @@ #ifdef USE_C10D_XCCL -#include -#include #include +#include #include +#include #include #include #include @@ -11,7 +11,6 @@ #include #include -#include #include #include @@ -454,10 +453,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( PreProcess pre, PostProcess post, OpType opType) { - using traits = function_traits; - using attr_t = typename traits::template arg<2>::type; - attr_t attr = ccl::create_operation_attr(); - auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); @@ -494,7 +489,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( input.storage().data_ptr(), stream); } - fn(inputs[0], outputs[0], attr, *comm, stream); + fn(inputs[0], outputs[0], *comm, stream); post(stream, work); @@ -549,10 +544,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( std::vector& outputs, Fn fn, OpType opType) { - using traits = function_traits; - using attr_t = typename traits::template arg<2>::type; - attr_t attr = ccl::create_operation_attr(); - auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); @@ -587,7 +578,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( inputs[i].storage().data_ptr(), stream); - fn(inputs[i], outputs[i], attr, *comm, stream); + fn(inputs[i], outputs[i], *comm, stream); } } @@ -612,10 +603,6 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( OpType opType, PreProcess pre, PostProcess post) { - using traits = function_traits; - using attr_t = typename traits::template arg<1>::type; - attr_t attr = ccl::create_operation_attr(); - auto device = tensor.device(); std::string key; int p2pRank = 0, p2pTargetRank = 0; @@ -671,7 +658,7 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( c10::xpu::XPUCachingAllocator::recordStream( tensor.storage().data_ptr(), stream); - fn(tensor, attr, *comm, stream, p2pTargetRank); + fn(tensor, *comm, stream, p2pTargetRank); if (!coalescing_state_) { post(stream); @@ -721,7 +708,6 @@ c10::intrusive_ptr ProcessGroupXCCL::send( auto ret = pointToPoint( tensor, [&](at::Tensor& input, - ccl::pt2pt_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream, int dst) { @@ -733,8 +719,7 @@ c10::intrusive_ptr ProcessGroupXCCL::send( xcclDataType, dst, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, dstRank, @@ -754,7 +739,6 @@ c10::intrusive_ptr ProcessGroupXCCL::recv( auto ret = pointToPoint( tensor, [&](at::Tensor& output, - ccl::pt2pt_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream, int src) { @@ -766,8 +750,7 @@ c10::intrusive_ptr ProcessGroupXCCL::recv( xcclDataType, src, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, srcRank, @@ -826,7 +809,6 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( 
outputs, // just to fit the collective interface [&](at::Tensor& /* unused */, at::Tensor& /* unused */, - ccl::allgather_attr attr, // just to fit interface xcclComm_t& comm, at::xpu::XPUStream& stream) { const auto root = opts.rootRank; @@ -928,7 +910,6 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( inputs, // just to fit the collective interface [&](at::Tensor& /* unused */, at::Tensor& /* unused */, - ccl::allgather_attr attr, // just to fit interface xcclComm_t& comm, at::xpu::XPUStream& stream) { if (getRank() == root) { @@ -988,7 +969,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( tensor, [&](at::Tensor& input, at::Tensor& output, - ccl::allreduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); @@ -1001,8 +981,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( xcclDataType, xcclReduceOp, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::ALLREDUCE); @@ -1042,7 +1021,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( tensors, [&](at::Tensor& input, at::Tensor& output, - ccl::allreduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); @@ -1055,8 +1033,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( xcclDataType, xcclReduceOp, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::COALESCED); @@ -1079,7 +1056,6 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( tensor, [&](at::Tensor& input, at::Tensor& output, - ccl::broadcast_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); @@ -1090,8 +1066,7 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( xcclDataType, root, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::BROADCAST); @@ -1112,7 +1087,6 @@ c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::broadcast_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); @@ -1123,8 +1097,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( xcclDataType, root, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::BROADCAST); @@ -1151,7 +1124,6 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce( tensor, [&](at::Tensor& input, at::Tensor& output, - ccl::reduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { const int root = opts.rootRank + opts.rootTensor; @@ -1186,7 +1158,6 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::reduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { const int root = opts.rootRank + opts.rootTensor; @@ -1228,7 +1199,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather( outputFlattened, [&](at::Tensor& input, at::Tensor& output, - ccl::allgather_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1242,8 +1212,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather( (size_t)input.numel(), xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, [](at::xpu::XPUStream&, @@ -1297,7 +1266,6 @@ c10::intrusive_ptr 
ProcessGroupXCCL::_allgather_base( output_tensor, [&](at::Tensor& input, at::Tensor& output, - ccl::allgather_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1310,8 +1278,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( (size_t)input.numel(), xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::_ALLGATHER_BASE); @@ -1326,7 +1293,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( outputs, [&](at::Tensor& input, at::Tensor& output, - ccl::allgather_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); @@ -1337,8 +1303,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( (size_t)input.numel(), xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::COALESCED); @@ -1367,7 +1332,6 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::reduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1442,7 +1406,6 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::reduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1475,7 +1438,6 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( outputs, [&](at::Tensor& input, at::Tensor& output, - ccl::reduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1550,7 +1512,6 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::alltoall_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1563,8 +1524,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( (size_t)output.numel() / comm.size(), xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::ALLTOALL_BASE); @@ -1577,7 +1537,6 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::alltoallv_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { std::vector sendCounts(size_); @@ -1608,8 +1567,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( recvCounts, xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::ALLTOALL_BASE); @@ -1635,7 +1593,6 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall( outputTensors, [&](at::Tensor& /* unused */, at::Tensor& /* unused */, - ccl::alltoallv_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::OptionalStreamGuard stream_guard(stream.unwrap()); @@ -1671,8 +1628,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall( recvCounts, xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); if (!isOutputFlat) { ret_evt.wait(); From 56a5e7ff6ca99025855e8da554967f6362287ba5 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 17 Oct 2024 00:25:04 +0000 Subject: [PATCH 81/96] Refine specific code --- cmake/Modules/FindXCCL.cmake | 7 +- .../distributed/c10d/ProcessGroupXCCL.cpp | 209 ++++++++---------- 
.../distributed/c10d/ProcessGroupXCCL.hpp | 194 ++-------------- torch/csrc/distributed/c10d/Utils.hpp | 2 +- 4 files changed, 115 insertions(+), 297 deletions(-) diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake index a717ad1dafc653..18f7ac642d54e9 100644 --- a/cmake/Modules/FindXCCL.cmake +++ b/cmake/Modules/FindXCCL.cmake @@ -6,9 +6,10 @@ include(FindPackageHandleStandardArgs) -set(XCCL_ROOT "") -if(DEFINED ENV{CCL_ROOT}) - set(XCCL_ROOT $ENV{CCL_ROOT}) +set(XCCL_ROOT "/opt/intel/oneapi/ccl/latest") +if (NOT EXISTS "${XCCL_ROOT}") + message(STATUS "Default OneCCL not found, using current environment OneAPI") + set(XCCL_ROOT $ENV{ONEAPI_ROOT}/ccl/latest) endif() string(COMPARE EQUAL "${XCCL_ROOT}" "" nocclfound) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index d26d25ae03e39a..ef007825a118ed 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,5 +1,6 @@ #ifdef USE_C10D_XCCL +#include #include #include #include @@ -9,21 +10,20 @@ #include #include -#include #include #include namespace c10d { namespace { -std::map xcclOps = { +const std::map xcclOps = { {ReduceOp::MIN, ccl::reduction::min}, {ReduceOp::MAX, ccl::reduction::max}, {ReduceOp::SUM, ccl::reduction::sum}, {ReduceOp::PRODUCT, ccl::reduction::prod}, }; -std::map xcclDatatypes = { +const std::map xcclDatatypes = { {at::kByte, ccl::datatype::uint8}, {at::kChar, ccl::datatype::int8}, {at::kInt, ccl::datatype::int32}, @@ -35,16 +35,22 @@ std::map xcclDatatypes = { {at::kBool, ccl::datatype::uint8}, }; -void check_xpu_single_tensor(const at::Tensor& tensor) { - if (!tensor.is_xpu() || tensor.is_sparse()) { - C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); - } - if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); +void checkXPUTensor(at::Tensor& tensor) { + if (!tensor.is_xpu() || tensor.is_sparse() || tensor.is_complex()) { + C10_THROW_ERROR( + ValueError, "Tensors must be XPU and dense and non-complex"); + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + } } } -ccl::datatype getXcclDataType(at::ScalarType type) { +ccl::datatype getXcclDataType( + at::ScalarType type, + bool is_reduction_op = false) { + TORCH_CHECK( + !isFloat8Type(type) && is_reduction_op, + "Float8 dtypes are not currenlty supported for XCCL reductions"); auto it = xcclDatatypes.find(type); TORCH_CHECK_WITH( TypeError, @@ -56,26 +62,27 @@ ccl::datatype getXcclDataType(at::ScalarType type) { ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { try { - if (input.scalar_type() == at::kBool) { - if (reduceOp == ReduceOp::SUM) { - // For bool tensors, map sum to max, which both represent a bitwise or. - // This is to prevent overflow issues with sum, since we use uint8 to - // represent a bool (see xcclDatatypes mapping align with cuda). - return ccl::reduction::max; - } + if (input.scalar_type() == at::kBool && reduceOp == ReduceOp::SUM) { + // Map sum to max for bool tensors to avoid overflow issues with sum. + return ccl::reduction::max; } return xcclOps.at(reduceOp); } catch (const std::out_of_range&) { C10_THROW_ERROR( ValueError, - "Cannot use ReduceOp." + reduce_op_to_string(reduceOp) + " with XCCL"); + "Cannot use ReduceOp." 
+ reduceOpToString(reduceOp) + " with XCCL"); } } +void syncStream( + at::Device& device, + at::xpu::XPUEvent& xcclEvent, + at::xpu::XPUStream& xcclStream) { + xcclEvent.record(at::xpu::getCurrentXPUStream(device.index())); + xcclEvent.block(xcclStream); +} } // namespace -static std::mutex xcclCommDevIdxMapMutex; -static std::unordered_map, int> xcclCommDevIdxMap; constexpr int64_t kSynchronizeBusyWaitMillis = 10; ProcessGroupXCCL::WorkXCCL::WorkXCCL( @@ -86,8 +93,7 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( : Work(rank, opType, "profilingTitle", inputs), device_(device), workStartTime_(std::chrono::steady_clock::now()) { - unsigned char enable_timing = 0; - xcclEndEvent_ = std::make_shared(enable_timing); + xcclEndEvent_ = std::make_shared(); } ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) @@ -121,12 +127,9 @@ void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( currentTimepoint - workStartTime_); if (timeElapsed >= timeout) { std::string exceptionMsg = c10::str( - "Work ran for ", - timeElapsed.count(), - " milliseconds before timing out."); + "Work ran time out after ", timeElapsed.count(), " milliseconds."); TORCH_CHECK(false, exceptionMsg) } - std::this_thread::sleep_for( std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); } @@ -145,20 +148,6 @@ ProcessGroupXCCL::ProcessGroupXCCL( : Backend(rank, size), store_(store) { blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); - - // Intel oneCCL requires passing CCL_LOCAL_RANK and CCL_LOCAL_SIZE for non-MPI - // launchers. - if (!with_mpirun()) { - int local_rank = getXCCLEnvVar("LOCAL_RANK"); - int local_world_size = getXCCLEnvVar("LOCAL_WORLD_SIZE"); - if (local_rank == -1 || local_world_size == -1) { - local_rank = rank; - local_world_size = size; - } - setXCCLEnvVar("CCL_PROCESS_LAUNCHER", "none"); - setXCCLEnvVar("CCL_LOCAL_RANK", local_rank); - setXCCLEnvVar("CCL_LOCAL_SIZE", local_world_size); - } } ProcessGroupXCCL::~ProcessGroupXCCL() = default; @@ -177,97 +166,74 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, at::Device& device) { - if (deviceKey.empty()) { - C10_THROW_ERROR( - DistBackendError, - "Not able to create/get the XCCL Communicator since " - "the devices are empty "); - } - + TORCH_CHECK_WITH( + DistBackendError, + !deviceKey.empty(), + "Not able to create/get " + "XCCL Communicator since the devices are empty "); { + // todo: why do we need mutex here? 
std::lock_guard lock(mutex_); if (devXCCLCommMap_.find(deviceKey) != devXCCLCommMap_.end()) { return devXCCLCommMap_[deviceKey]; } } - std::shared_ptr XCCLComm; - - XCCL_KVS kvs = get_kvs(rank_, *store_); - int numRanks, rank; numRanks = getSize(); rank = getRank(); c10::impl::VirtualGuardImpl impl(device.type()); - c10::Stream stream = impl.getStream(device); + c10::Stream stream = + impl.getStreamFromGlobalPool(device, /*isHighPriority=*/false); sycl::queue& q = c10::xpu::XPUStream(stream).queue(); auto ctx = ccl::create_context(q.get_context()); ccl::vector_class> devs_rank; devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, kvs); - XCCLComm = std::make_shared(std::move(comms[0])); + auto xccl_kvs = get_kvs(rank_, *store_); + auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, xccl_kvs); + std::shared_ptr XCCLComm = + std::make_shared(std::move(comms[0])); - { - std::lock_guard lock(mutex_); - inInitializationCommMap_.emplace(deviceKey, XCCLComm); - } + std::lock_guard lock(mutex_); + devXCCLCommMap_.emplace(deviceKey, XCCLComm); + xcclStreamsMap_.emplace(deviceKey, std::move(stream)); + xcclEventsMap_.emplace(deviceKey, at::xpu::XPUEvent()); - xcclStreams_.emplace(deviceKey, std::move(stream)); - - auto it = inInitializationCommMap_.find(deviceKey); - if (it != inInitializationCommMap_.end()) { - devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); - inInitializationCommMap_.erase(deviceKey); - - xcclCommDevIdxMapMutex.lock(); - xcclCommDevIdxMap.emplace(XCCLComm, device.index()); - xcclCommDevIdxMapMutex.unlock(); - } - - it = devXCCLCommMap_.find(deviceKey); - TORCH_INTERNAL_ASSERT( - it != devXCCLCommMap_.end(), "Communicators not populated in cache!"); - - return it->second; + return XCCLComm; } template c10::intrusive_ptr ProcessGroupXCCL::collective( - at::Tensor& input, - at::Tensor& output, + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, OpType opType) { - using traits = function_traits; - using attr_t = typename traits::template arg<2>::type; - attr_t attr = ccl::create_operation_attr(); - - auto device = input.device(); + auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); - auto stream = xcclStreams_.at(key); - std::vector outputs{output}; + auto stream = xcclStreamsMap_.at(key); + syncStream(device, xcclEventsMap_[key], stream); c10::intrusive_ptr work; - work = initWork(device, rank_, opType); - - work->outputs_ = - std::make_shared>(std::move(outputs)); - c10::xpu::XPUCachingAllocator::recordStream( - input.storage().data_ptr(), stream); - - auto ccl_stream = ccl::create_stream(stream.queue()); - - fn(input, output, attr, *comm, ccl_stream); + work->outputs_ = std::make_shared>(outputs); + + at::xpu::OptionalXPUGuard gpuGuard(device); + pre(stream, work); + for (const auto i : c10::irange(inputs.size())) { + c10::xpu::XPUCachingAllocator::recordStream( + inputs[i].storage().data_ptr(), stream); + fn(inputs[i], outputs[i], *comm, stream); + } + post(stream, work); work->xcclEndEvent_->record(stream); - std::vector streams = {stream.unwrap()}; c10::MultiStreamGuard streamGuard(streams); std::vector devices{device}; @@ -279,51 +245,52 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( return work; } -template -c10::intrusive_ptr ProcessGroupXCCL::collective( - at::Tensor& input, - at::Tensor& output, - Fn fn, - OpType opType) { - return collective( - input, - output, 
- fn, - [](at::xpu::XPUStream&, - c10::intrusive_ptr& work) {}, - [](at::xpu::XPUStream&, - c10::intrusive_ptr& work) {}, - opType); -} - c10::intrusive_ptr ProcessGroupXCCL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { TORCH_CHECK( tensors.size() == 1, "Expecting one tensor only but got multiple"); auto tensor = tensors.back(); - check_xpu_single_tensor(tensor); + checkXPUTensor(tensor); + + RECORD_PARAM_COMMS_DATA( + // static_cast( + // this->getSequenceNumberForGroup() + 1), // seq + 1 to match + // collective + 1, + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + rank_, // rank + "allreduce", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + 0, // globalRankStart + 1, // globalRankStride + this->getSize()); // worldSize + return collective( tensor, tensor, [&](at::Tensor& input, at::Tensor& output, - ccl::allreduce_attr attr, xcclComm_t& comm, - ccl::stream& stream) { - auto xcclDataType = getXcclDataType(input.scalar_type()); + at::xpu::XPUStream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::allreduce( + auto ccl_stream = ccl::create_stream(stream.queue()); + ccl::allreduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), xcclDataType, xcclReduceOp, comm, - stream, - attr); - return ret_evt; + ccl_stream); + return; }, OpType::ALLREDUCE); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 99b815f2138b4e..5dc003e3dba6b2 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -35,7 +35,6 @@ static std::vector TORCH_XCCL_BLOCKING_WAIT = { "XCCL_BLOCKING_WAIT"}; using xcclComm_t = ccl::communicator; -using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; class TORCH_API ProcessGroupXCCL : public Backend { @@ -52,11 +51,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { bool isCompleted() override; - bool isSuccess() const override { - TORCH_CHECK( - false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); - } - void abort() override { TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); } @@ -70,7 +64,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { } std::vector result() override { - TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); + return *outputs_; } protected: @@ -117,12 +111,24 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Tensor& input, at::Tensor& output, Fn fn, - OpType opType); + OpType opType) { + auto inputs = std::vector{input}; + auto outputs = std::vector{output}; + return collective( + inputs, + outputs, + fn, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) {}, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) {}, + opType); + } template c10::intrusive_ptr collective( - at::Tensor& input, - at::Tensor& output, + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, @@ -132,141 +138,20 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - c10::intrusive_ptr allreduce_coalesced( - std::vector& tensors, - const AllreduceCoalescedOptions& opts = - 
AllreduceCoalescedOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_coalesced not implemented"); - } - - c10::intrusive_ptr reduce( - std::vector& tensors, - const ReduceOptions& opts = ReduceOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::reduce not implemented"); - } - - c10::intrusive_ptr broadcast( - std::vector& tensors, - const BroadcastOptions& opts = BroadcastOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::broadcast not implemented"); - } - - c10::intrusive_ptr allgather( - std::vector>& outputTensors, - std::vector& inputTensors, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allgather not implemented"); - } - - c10::intrusive_ptr _allgather_base( - at::Tensor& outputbuffer, - at::Tensor& inputbuffer, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::_allgather_base not implemented"); - } - - c10::intrusive_ptr allgather_coalesced( - std::vector>& outputTensorLists, - std::vector& inputTensors, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allgather_coalesced not implemented"); - } - - c10::intrusive_ptr allgather_into_tensor_coalesced( - std::vector& outputs, - std::vector& inputs, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK( - false, - "ProcessGroupXCCL::allgather_into_tensor_coalesced not implemented"); - } - - c10::intrusive_ptr reduce_scatter( - std::vector& outputTensors, - std::vector>& inputTensors, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::reduce_scatter not implemented"); - } - - c10::intrusive_ptr _reduce_scatter_base( - at::Tensor& outputTensor, - at::Tensor& inputTensor, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK( - false, "ProcessGroupXCCL::_reduce_scatter_base not implemented"); - } - - c10::intrusive_ptr reduce_scatter_tensor_coalesced( - std::vector& outputs, - std::vector& inputs, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK( - false, - "ProcessGroupXCCL::reduce_scatter_tensor_coalesced not implemented"); - } - - c10::intrusive_ptr barrier( - const BarrierOptions& opts = BarrierOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::barrier not implemented"); - } - - c10::intrusive_ptr alltoall_base( - at::Tensor& outputTensor, - at::Tensor& inputTensor, - std::vector& outputSplitSizes, - std::vector& inputSplitSizes, - const AllToAllOptions& opts = AllToAllOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::alltoall_base not implemented"); - } - - c10::intrusive_ptr alltoall( - std::vector& outputTensors, - std::vector& inputTensors, - const AllToAllOptions& opts = AllToAllOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::alltoall not implemented"); - } - - c10::intrusive_ptr send( - std::vector& tensors, - int dstRank, - int tag) override { - TORCH_CHECK(false, "ProcessGroupXCCL::send not implemented"); - } - - c10::intrusive_ptr recv( - std::vector& tensors, - int srcRank, - int tag) override { - TORCH_CHECK(false, "ProcessGroupXCCL::recv not implemented"); - } - - c10::intrusive_ptr gather( - std::vector>& outputTensors, - std::vector& inputTensors, - const GatherOptions& opts = GatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::gather not implemented"); - } - - c10::intrusive_ptr scatter( - std::vector& 
outputTensors, - std::vector>& inputTensors, - const ScatterOptions& opts = ScatterOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::scatter not implemented"); - } - protected: - std::unordered_map xcclStreams_; - std::unordered_map> - inInitializationCommMap_; + std::unordered_map xcclStreamsMap_; + std::unordered_map xcclEventsMap_; std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; bool blockingWait_ = false; private: - XCCL_KVS kvs; std::mutex kvs_mutex; - XCCL_KVS get_kvs(int rank, c10d::Store& store) { + ccl::shared_ptr_class kvs; + + ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store) { + // todo: why do we need the mutex here? std::lock_guard lock(kvs_mutex); if (kvs) return kvs; @@ -291,41 +176,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { return kvs; } }; - -namespace { -int getXCCLEnvVar(std::string envVarName) { - char* stringValue = std::getenv(envVarName.c_str()); - if (stringValue != nullptr) { - try { - int val = std::stoi(stringValue); - return val; - } catch (std::exception& e) { - TORCH_CHECK( - false, - "Invalid value for environment variable: " + std::string(envVarName)); - } - } else { - return -1; - } -} - -template -void setXCCLEnvVar(const std::string& envVarName, T val) { - if constexpr (std::is_same_v) { - setenv(envVarName.c_str(), std::to_string(val).c_str(), 1); - } else if constexpr (std::is_same_v) { - setenv(envVarName.c_str(), val.c_str(), 1); - } -} - -bool with_mpirun() { - return (getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || - getenv("PMI_RANK") || getenv("PMI_SIZE") || getenv("PMIX_RANK")) - ? true - : false; -} - -} // namespace } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp index 73e37e0437c459..e27ec363ba1cc9 100644 --- a/torch/csrc/distributed/c10d/Utils.hpp +++ b/torch/csrc/distributed/c10d/Utils.hpp @@ -557,7 +557,7 @@ size_t computeLengthsAndOffsets( return offset; } -inline std::string reduce_op_to_string(c10d::ReduceOp op) { +inline std::string reduceOpToString(c10d::ReduceOp op) { switch (op) { case c10d::ReduceOp::SUM: return "SUM"; From a062f9f8bfc2cb81c4f082515f324b7c5f65dbb8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 17 Oct 2024 00:59:38 +0000 Subject: [PATCH 82/96] accept comments --- torch/csrc/distributed/c10d/ProcessGroup.hpp | 4 ++-- .../distributed/c10d/ProcessGroupXCCL.cpp | 19 ------------------- .../distributed/c10d/ProcessGroupXCCL.hpp | 3 ++- torch/distributed/distributed_c10d.py | 8 ++------ 4 files changed, 6 insertions(+), 28 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 83d2729fc43d43..31c974a061e4a2 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -51,8 +51,8 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { NCCL = 2, UCC = 3, MPI = 4, - CUSTOM = 5, - XCCL = 6, + XCCL = 5, + CUSTOM = 6, }; static std::string backendTypeToString(const BackendType& type) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ef007825a118ed..90fb4c3f9cbd75 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -253,25 +253,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( auto tensor = tensors.back(); checkXPUTensor(tensor); - RECORD_PARAM_COMMS_DATA( - // static_cast( - // 
this->getSequenceNumberForGroup() + 1), // seq + 1 to match - // collective - 1, - std::make_tuple(pg_uid_, pg_desc_), // PG name tuple - tensors, // inputTensors - tensors, // outputTensors - rank_, // rank - "allreduce", // collective name - tensor.numel(), // inNelems - tensor.numel(), // outNelems - tensor.scalar_type(), // dType - std::vector(), // inSplitSizes - std::vector(), // outSplitSizes - 0, // globalRankStart - 1, // globalRankStride - this->getSize()); // worldSize - return collective( tensor, tensor, diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 5dc003e3dba6b2..6e6eb16d62d620 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -138,6 +138,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; + void setSequenceNumberForGroup() override {} + protected: std::unordered_map xcclStreamsMap_; std::unordered_map xcclEventsMap_; @@ -151,7 +153,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { ccl::shared_ptr_class kvs; ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store) { - // todo: why do we need the mutex here? std::lock_guard lock(kvs_mutex); if (kvs) return kvs; diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 4bbb1c41011231..fc4ca55dbd0237 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1675,13 +1675,9 @@ def _new_process_group_helper( "created, please use a different group name" ) - if device_id is not None and ( - device_id.index is None - or (device_id.type != "cuda" and device_id.type != "xpu") - ): + if device_id is not None and device_id.index is None: raise ValueError( - "init_process_group device_id parameter must be a cuda device with an " - "id, e.g. 
cuda:0, xpu, not just cuda or xpu or cpu" + "init_process_group device_id parameter must be a device with an index" ) # Note: _new_process_group_helper is only called from init_process_group, which always provides a timeout value From 86b66c3b0ff8c731e54453c210e0b0eb321c3e89 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 21 Oct 2024 01:54:13 +0000 Subject: [PATCH 83/96] refine code --- caffe2/CMakeLists.txt | 3 -- test/distributed/test_c10d_common.py | 13 +++--- torch/csrc/distributed/c10d/ProcessGroup.hpp | 17 ++++---- .../distributed/c10d/ProcessGroupXCCL.cpp | 39 +++++++++-------- .../distributed/c10d/ProcessGroupXCCL.hpp | 43 +++++++++++-------- torch/testing/_internal/common_distributed.py | 14 +++++- 6 files changed, 72 insertions(+), 57 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index b4ec018019f165..25bd7f700f68a2 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1376,9 +1376,6 @@ if(USE_DISTRIBUTED) endif() if(USE_XPU AND USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) - set_source_files_properties( - ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp - PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_ZE;CCL_ENABLE_SYCL") endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 903df26bba9f6f..d3cb65f7befb1d 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -31,6 +31,7 @@ from torch.testing._internal.common_distributed import ( MultiProcessTestCase, skip_if_lt_x_gpu, + get_device_count, ) from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, @@ -60,17 +61,13 @@ torch.backends.cuda.matmul.allow_tf32 = False -def gpus_for_rank(world_size): +def gpus_for_rank(world_size, backend): """Multigpu tests are designed to simulate the multi nodes with multi GPUs on each node. Nccl backend requires equal #GPUs in each process. On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. 
""" - device_count = ( - torch.xpu.device_count() - if torch.xpu.is_available() - else torch.cuda.device_count() - ) + device_count = get_device_count(backend) visible_devices = list(range(device_count)) gpus_per_process = device_count // world_size gpus_for_rank = [] @@ -833,7 +830,7 @@ def update_parameters(model): def _gpu_model_with_ddp_comm_hook( self, process_group, hook=None, gradient_as_bucket_view=False, state=None ): - device_id = gpus_for_rank(self.world_size)[self.rank][0] + device_id = gpus_for_rank(self.world_size, process_group.name())[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], @@ -850,7 +847,7 @@ def _gpu_model_with_ddp_comm_hook( def _gpu_model_with_builtin_ddp_comm_hook( self, process_group, hook=None, gradient_as_bucket_view=False ): - device_id = gpus_for_rank(self.world_size)[self.rank][0] + device_id = gpus_for_rank(self.world_size, process_group.name())[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 31c974a061e4a2..b3eac70e871bf7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -131,6 +131,13 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { return backendType_; }; + inline bool backendSupportsSequenceNumbers(BackendType backendType) { + if (backendType == BackendType::GLOO || backendType == BackendType::NCCL || + backendType == BackendType::XCCL || backendType == BackendType::UCC) + return true; + return false; + } + virtual void startCoalescing(c10::DeviceType deviceType) { // only nccl has implemented startCoalescing so only execute for nccl // backends @@ -508,10 +515,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { virtual void setSequenceNumberForGroup() { auto backendType = getBackendType(); // TODO: HACK for backend name to get sequence number for that backend. - if (backendType == ProcessGroup::BackendType::GLOO || - backendType == ProcessGroup::BackendType::NCCL || - backendType == ProcessGroup::BackendType::XCCL || - backendType == ProcessGroup::BackendType::UCC) { + if (backendSupportsSequenceNumbers(backendType)) { getDefaultBackend()->setSequenceNumberForGroup(); } else { TORCH_CHECK( @@ -530,10 +534,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { auto backendType = getBackendType(); // TODO: HACK for backend name to get sequence number for that backend. 
- if (backendType == ProcessGroup::BackendType::GLOO || - backendType == ProcessGroup::BackendType::NCCL || - backendType == ProcessGroup::BackendType::XCCL || - backendType == ProcessGroup::BackendType::UCC) { + if (backendSupportsSequenceNumbers(backendType)) { return getDefaultBackend()->getSequenceNumberForGroup(); } else { TORCH_CHECK( diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 90fb4c3f9cbd75..41e4e43436270a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,17 +1,8 @@ #ifdef USE_C10D_XCCL #include +#include #include -#include -#include -#include -#include -#include -#include -#include - -#include -#include namespace c10d { @@ -89,10 +80,13 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, OpType opType, + uint64_t seq, + const char* profilingTitle, const std::optional>& inputs) - : Work(rank, opType, "profilingTitle", inputs), + : Work(rank, opType, profilingTitle, inputs), device_(device), - workStartTime_(std::chrono::steady_clock::now()) { + workStartTime_(std::chrono::steady_clock::now()), + seq_(seq) { xcclEndEvent_ = std::make_shared(); } @@ -101,7 +95,8 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) device_(w.device_), xcclEndEvent_(w.xcclEndEvent_), blockingWait_(w.blockingWait_), - workStartTime_(w.workStartTime_) {} + workStartTime_(w.workStartTime_), + seq_(w.seq_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -156,10 +151,16 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( at::Device& device, int rank, OpType opType, + const char* profilingTitle, const std::vector& inputs, const std::vector& outputs) { auto r = c10::make_intrusive( - device, rank, opType, std::optional>(inputs)); + device, + rank, + opType, + seqCollective_, + profilingTitle, + std::optional>(inputs)); return r; } @@ -212,7 +213,10 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( Fn fn, PreProcess pre, PostProcess post, - OpType opType) { + OpType opType, + const char* profilingTitle) { + seqCollective_++; + auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); @@ -221,7 +225,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( syncStream(device, xcclEventsMap_[key], stream); c10::intrusive_ptr work; - work = initWork(device, rank_, opType); + work = initWork(device, rank_, opType, profilingTitle); work->outputs_ = std::make_shared>(outputs); at::xpu::OptionalXPUGuard gpuGuard(device); @@ -273,7 +277,8 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( ccl_stream); return; }, - OpType::ALLREDUCE); + OpType::ALLREDUCE, + "xccl:all_reduce"); } } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6e6eb16d62d620..f9761c652dc1a0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -1,33 +1,24 @@ #pragma once -#if defined(__linux__) -#include -#include -#include -#include -#endif - #ifdef USE_C10D_XCCL -#include +// We will define those flags in XCCL backend file instead of passing to gcc +// compiler. 
+#define CCL_ENABLE_ZE +#define CCL_ENABLE_SYCL + #include -#include #include -#include -#include - -#include -#include #include -#include #include #include -#include #include +#include +#include #include #include #include -#include +#include namespace c10d { static std::vector TORCH_XCCL_BLOCKING_WAIT = { @@ -45,6 +36,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Device& device, int rank, OpType opType, + uint64_t seq, + const char* profilingTitle = nullptr, const std::optional>& inputs = std::nullopt); WorkXCCL(const WorkXCCL& w); ~WorkXCCL() override; @@ -63,6 +56,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { return future_; } + uint64_t getSequencenumber() const override { + return seq_; + } + std::vector result() override { return *outputs_; } @@ -72,6 +69,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::shared_ptr xcclEndEvent_; bool blockingWait_ = false; std::chrono::time_point workStartTime_; + uint64_t seq_; private: void synchronizeInternal(std::chrono::milliseconds timeout); @@ -103,6 +101,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Device& device, int rank, OpType opType, + const char* profilingTitle = nullptr, const std::vector& inputs = {}, const std::vector& outputs = {}); @@ -111,7 +110,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Tensor& input, at::Tensor& output, Fn fn, - OpType opType) { + OpType opType, + const char* profilingTitle = nullptr) { auto inputs = std::vector{input}; auto outputs = std::vector{output}; return collective( @@ -132,13 +132,17 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, PreProcess pre, PostProcess post, - OpType opType); + OpType opType, + const char* profilingTitle = nullptr); c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; void setSequenceNumberForGroup() override {} + uint64_t getSequenceNumberForGroup() override { + return seqCollective_; + } protected: std::unordered_map xcclStreamsMap_; @@ -147,6 +151,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr store_; std::mutex mutex_; bool blockingWait_ = false; + uint64_t seqCollective_{0}; private: std::mutex kvs_mutex; diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 9ec38c9ca671c2..3e1664690b7132 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -93,8 +93,9 @@ class DistTestCases: # Sets showing that something is implemented backend_feature = {} - backend_feature["gpu"] = {"nccl", "gloo", "ucc"} + backend_feature["gpu"] = {"nccl", "gloo", "ucc", "xccl"} backend_feature["cuda"] = {"nccl", "gloo", "ucc"} + backend_feature["cuda"] = {"xccl"} backend_feature["ddp"] = {"nccl", "gloo", "ucc"} backend_feature["subgroup"] = {"nccl", "gloo", "ucc"} backend_feature["plugin"] = set() @@ -462,6 +463,15 @@ def compute_sum(fn, world_size: int): ] ] +# Returns the number of GPUs, currently only for CUDA and XPU. 
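To make the intent of the test-helper changes concrete, here is a small illustrative sketch (not part of the patch) of how the test utilities divide the visible devices evenly across ranks once the device count has been obtained for the chosen backend; the function name and the example numbers below are made up for illustration only.

# Illustrative only: mirrors how gpus_for_rank()/init_multigpu_helper()
# split the visible devices evenly across ranks in this patch series.
def split_devices(world_size: int, device_count: int):
    per_rank = device_count // world_size
    visible = list(range(device_count))
    return [visible[r * per_rank:(r + 1) * per_rank] for r in range(world_size)]

# split_devices(world_size=2, device_count=8) -> [[0, 1, 2, 3], [4, 5, 6, 7]]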
+def get_device_count(backend: str): + assert c10d.is_backend_available(backend) + if backend in backend_feature.get("cuda", set()): + return torch.cuda.device_count() + elif backend in backend_feature.get("xpu", set()): + return torch.xpu.device_count() + else: + raise ValueError(f"Unsupported backend: {backend}") # HELPER FOR MULTIGPU TESTS def init_multigpu_helper(world_size: int, backend: str): @@ -470,7 +480,7 @@ def init_multigpu_helper(world_size: int, backend: str): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. """ - nGPUs = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() + nGPUs = get_device_count(backend) visible_devices = range(nGPUs) # If rank is less than or equal to number of available GPU's From d9ce6368c51d64a4379efdfa26804888026185f4 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 21 Oct 2024 06:11:52 +0000 Subject: [PATCH 84/96] align to latest --- .../distributed/c10d/ProcessGroupXCCL.cpp | 531 ++++++------------ .../distributed/c10d/ProcessGroupXCCL.hpp | 83 ++- 2 files changed, 228 insertions(+), 386 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index a89a7c48a01ffb..956e80482af28d 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -82,40 +82,38 @@ void check_xpu_single_tensor( const at::Tensor& tensor, const bool p2p = false // whether operation is a P2P operation ) { - if (!tensor.is_xpu() || tensor.is_sparse()) { - C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); - } - // Skip the following requirements for P2P operations - if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - if (p2p) { - TORCH_WARN_ONCE( - "Detected non-contiguous tensor in P2P operations. It is user " - "responsibility to guarantee that source and destination tensors have " - "the same contiguity format."); - } else { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + if (!tensor.is_xpu() || tensor.is_sparse() || tensor.is_complex()) { + C10_THROW_ERROR( + ValueError, "Tensors must be XPU and dense and non-complex"); + + // Skip the following requirements for P2P operations + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + if (p2p) { + TORCH_WARN_ONCE( + "Detected non-contiguous tensor in P2P operations. 
It is user " + "responsibility to guarantee that source and destination tensors have " + "the same contiguity format."); + } else { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + } } } } - int64_t check_xpu_tensors_same_device(const std::vector& tensors) { - if (tensors.size() == 0) { - C10_THROW_ERROR(ValueError, "Tensor list must be nonempty"); - } + TORCH_CHECK_WITH( + ValueError, tensors.size() != 0, "Tensor list must be nonempty"); const auto& first = tensors.front(); int64_t total_numel = 0; for (const auto& t : tensors) { - if (!t.is_xpu() || t.is_sparse()) { - C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + if (!t.is_xpu() || t.is_sparse() || t.is_complex()) { + C10_THROW_ERROR( + ValueError, "Tensors must be XPU and dense and non-complex"); } if (t.scalar_type() != first.scalar_type()) { C10_THROW_ERROR(TypeError, "Tensors must have identical type"); } - if (!t.is_non_overlapping_and_dense()) { - C10_THROW_ERROR(ValueError, "Tensors must be non-overlapping and dense"); - } TORCH_CHECK_WITH( ValueError, t.get_device() == tensors[0].get_device(), @@ -126,7 +124,12 @@ int64_t check_xpu_tensors_same_device(const std::vector& tensors) { return total_numel; } -ccl::datatype getXcclDataType(at::ScalarType type) { +ccl::datatype getXcclDataType( + at::ScalarType type, + bool is_reduction_op = false) { + TORCH_CHECK( + !(isFloat8Type(type) && is_reduction_op), + "Float8 dtypes are not currently supported for XCCL reductions"); auto it = xcclDatatypes.find(type); TORCH_CHECK_WITH( TypeError, @@ -158,18 +161,6 @@ void syncStream( xcclEvent.block(xcclStream); } -bool complexViewAsRealAllowed(const ReduceOp reduceOp) { - switch (reduceOp) { - case ReduceOp::SUM: - return true; - case ReduceOp::UNUSED: - return true; - default: - return false; - } - return false; -} - } // namespace constexpr int64_t kSynchronizeBusyWaitMillis = 10; @@ -286,7 +277,6 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( usedDeviceIdxs_.insert(device.index()); { - // todo: why do we need mutex here?
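getXCCLComm() above caches one communicator per device-index key, so the backend expects each rank to pin a single XPU before issuing collectives. A minimal end-to-end sketch of that usage, assuming a build with USE_XCCL/USE_C10D_XCCL, an available XPU runtime, and placeholder rendezvous settings:

import os
import torch
import torch.distributed as dist

def run(rank: int, world_size: int) -> None:
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")  # placeholder rendezvous address
    os.environ.setdefault("MASTER_PORT", "29500")      # placeholder rendezvous port
    torch.xpu.set_device(rank % torch.xpu.device_count())  # one device per rank
    dist.init_process_group("xccl", rank=rank, world_size=world_size)
    t = torch.ones(8, device="xpu")
    dist.all_reduce(t)  # routed to ProcessGroupXCCL::allreduce
    dist.destroy_process_group()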
std::lock_guard lock(mutex_); if (devXCCLCommMap_.find(deviceKey) != devXCCLCommMap_.end()) { return devXCCLCommMap_[deviceKey]; @@ -301,11 +291,6 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( at::xpu::OptionalXPUGuard gpuGuard(device); - for (const auto i : c10::irange(xcclActiveGroupCounter_)) { - (void)i; - ccl::group_end(); - } - int numRanks, rank; if (!singleP2POp) { numRanks = getSize(); @@ -329,37 +314,14 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( auto xccl_kvs = get_kvs(rank_, *store_); auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, xccl_kvs); - std::shared_ptr XCCLComm = - std::make_shared(std::move(comms[0])); + XCCLComm = std::make_shared(std::move(comms[0])); std::lock_guard lock(mutex_); devXCCLCommMap_.emplace(deviceKey, XCCLComm); xcclStreamsMap_.emplace(deviceKey, std::move(stream)); xcclEventsMap_.emplace(deviceKey, at::xpu::XPUEvent()); - for (const auto i : c10::irange(xcclActiveGroupCounter_)) { - (void)i; - ccl::group_start(); - } - - xcclStreams_.emplace(deviceKey, std::move(stream)); - xcclEvents_.emplace(deviceKey, at::xpu::XPUEvent()); - - auto it = inInitializationCommMap_.find(deviceKey); - if (it != inInitializationCommMap_.end()) { - devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); - inInitializationCommMap_.erase(deviceKey); - - xcclCommDevIdxMapMutex.lock(); - xcclCommDevIdxMap.emplace(XCCLComm, device.index()); - xcclCommDevIdxMapMutex.unlock(); - } - - it = devXCCLCommMap_.find(deviceKey); - TORCH_INTERNAL_ASSERT( - it != devXCCLCommMap_.end(), "Communicators not populated in cache!"); - - return it->second; + return XCCLComm; } void ProcessGroupXCCL::groupStart() { @@ -396,7 +358,7 @@ c10::intrusive_ptr ProcessGroupXCCL::endCoalescing(OpType optype) { auto device = coalescedDevice_; const auto key = std::to_string(device.index()); - auto stream = xcclStreams_.at(key); + auto stream = xcclStreamsMap_.at(key); auto work = initWork(device, rank_, optype); work->blockingWait_ = blockingWait_; @@ -422,7 +384,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( Fn fn, PreProcess pre, PostProcess post, - OpType opType) { + OpType opType, + const char* profilingTitle) { auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); @@ -442,8 +405,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( } } - auto stream = xcclStreams_.at(key); - syncStream(device, xcclEvents_[key], stream); + auto stream = xcclStreamsMap_.at(key); + syncStream(device, xcclEventsMap_[key], stream); c10::intrusive_ptr work; work = initWork(device, rank_, opType); @@ -454,13 +417,12 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( pre(stream, work); - for (const auto& input : inputs) { + for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( - input.storage().data_ptr(), stream); + inputs[i].storage().data_ptr(), stream); + fn(inputs[i], outputs[i], *comm, stream); } - fn(inputs[0], outputs[0], *comm, stream); - post(stream, work); if (!coalescing_state_) { @@ -478,97 +440,13 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( return work; } -template -c10::intrusive_ptr ProcessGroupXCCL::collective( - std::vector& inputs, - std::vector& outputs, - Fn fn, - PreProcess pre, - PostProcess post, - OpType opType) { - auto inputs = std::vector{input}; - auto outputs = std::vector{output}; - return collective(inputs, outputs, fn, pre, post, opType); -} - template -c10::intrusive_ptr ProcessGroupXCCL::collective( - at::Tensor& input, - 
at::Tensor& output, - Fn fn, - OpType opType) { - return collective( - input, - output, - fn, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - opType); -} - -template -c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( - std::vector& inputs, - std::vector& outputs, - Fn fn, - OpType opType) { - auto device = inputs[0].device(); - const auto key = std::to_string(device.index()); - auto comm = getXCCLComm(key, device, opType); - - if (coalescing_state_ & CoalActive) { - coalescing_state_ |= CoalColl; - if (coalescedDevice_.index() < 0) { - coalescedDevice_ = device; - } else { - TORCH_CHECK( - coalescedDevice_.index() == device.index(), MULTI_DEVICE_ERROR_MSG); - } - if (coalescedComm_ == nullptr) { - coalescedComm_ = comm; - } else { - TORCH_CHECK(coalescedComm_ == comm, MULTI_DEVICE_ERROR_MSG); - } - } - - auto stream = xcclStreamsMap_.at(key); - syncStream(device, xcclEventsMap_[key], stream); - - c10::intrusive_ptr work; - work = initWork(device, rank_, opType, profilingTitle); - work->outputs_ = std::make_shared>(outputs); - - at::xpu::OptionalXPUGuard gpuGuard(device); - pre(stream, work); - for (const auto i : c10::irange(inputs.size())) { - c10::xpu::XPUCachingAllocator::recordStream( - inputs[i].storage().data_ptr(), stream); - fn(inputs[i], outputs[i], *comm, stream); - } - post(stream, work); - - work->xcclEndEvent_->record(stream); - std::vector streams = {stream.unwrap()}; - c10::MultiStreamGuard streamGuard(streams); - std::vector devices{device}; - work->future_ = c10::make_intrusive( - c10::ListType::create(c10::TensorType::get()), devices); - work->future_->markCompleted(at::IValue(*work->outputs_)); - work->blockingWait_ = blockingWait_; - - return work; -} - -template c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( at::Tensor& tensor, Fn fn, int peer, OpType opType, - PreProcess pre, - PostProcess post) { + const char* profilingTitle) { auto device = tensor.device(); std::string key; int p2pRank = 0, p2pTargetRank = 0; @@ -605,63 +483,43 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( } } - auto stream = xcclStreams_.at(key); - syncStream(device, xcclEvents_[key], stream); + auto stream = xcclStreamsMap_.at(key); + syncStream(device, xcclEventsMap_[key], stream); - c10::intrusive_ptr work; if (!coalescing_state_) { + c10::intrusive_ptr work; work = initWork(device, rank_, opType); work->outputs_ = std::make_shared>(); work->outputs_->push_back(tensor); - } - - at::xpu::OptionalXPUGuard gpuGuard(device); - - if (!coalescing_state_) { - pre(stream, work); - } - c10::xpu::XPUCachingAllocator::recordStream( - tensor.storage().data_ptr(), stream); + at::xpu::OptionalXPUGuard gpuGuard(device); - fn(tensor, *comm, stream, p2pTargetRank); + c10::xpu::XPUCachingAllocator::recordStream( + tensor.storage().data_ptr(), stream); - if (!coalescing_state_) { - post(stream); + fn(tensor, *comm, stream, p2pTargetRank); work->xcclEndEvent_->record(stream); work->blockingWait_ = blockingWait_; - - { - std::vector streams = {stream.unwrap()}; - c10::MultiStreamGuard streamGuard(streams); - std::vector devices{device}; - work->future_ = c10::make_intrusive( - c10::ListType::create(c10::TensorType::get()), devices); - work->future_->markCompleted(at::IValue(*work->outputs_)); - } + std::vector streams = {stream.unwrap()}; + c10::MultiStreamGuard streamGuard(streams); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + 
work->future_->markCompleted(at::IValue(*work->outputs_)); return work; } else { + at::xpu::OptionalXPUGuard gpuGuard(device); + + c10::xpu::XPUCachingAllocator::recordStream( + tensor.storage().data_ptr(), stream); + + fn(tensor, *comm, stream, p2pTargetRank); + return nullptr; } } -template -c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( - at::Tensor& tensor, - Fn fn, - int peer, - OpType opType) { - return pointToPoint( - tensor, - fn, - peer, - opType, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - [](at::xpu::XPUStream&) {}); -} - c10::intrusive_ptr ProcessGroupXCCL::send( std::vector& tensors, int dstRank, @@ -677,19 +535,19 @@ c10::intrusive_ptr ProcessGroupXCCL::send( xcclComm_t& comm, at::xpu::XPUStream& stream, int dst) { - ccl::event ret_evt; auto xcclDataType = getXcclDataType(input.scalar_type()); - ret_evt = ccl::send( + ccl::send( input.data_ptr(), (size_t)input.numel(), xcclDataType, dst, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, dstRank, - OpType::SEND); + OpType::SEND, + c10::str("xccl:send ", rank_, "->", dstRank).c_str()); return ret; } @@ -708,19 +566,19 @@ c10::intrusive_ptr ProcessGroupXCCL::recv( xcclComm_t& comm, at::xpu::XPUStream& stream, int src) { - ccl::event ret_evt; auto xcclDataType = getXcclDataType(output.scalar_type()); - ret_evt = ccl::recv( + ccl::recv( output.data_ptr(), (size_t)output.numel(), xcclDataType, src, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, srcRank, - OpType::RECV); + OpType::RECV, + c10::str("xccl:recv ", rank_, "<-", srcRank).c_str()); return ret; } @@ -785,13 +643,12 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( } } { - ccl::event ret_evt; auto xcclDataType = getXcclDataType(inputTensor.scalar_type()); if (rank_ == root) { for (const auto r : c10::irange(size_)) { if (r != root) { // do receive - ret_evt = ccl::recv( + ccl::recv( outputs[r].data_ptr(), (size_t)inputTensor.numel(), xcclDataType, @@ -805,7 +662,7 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( } } else { // do send - ret_evt = ccl::send( + ccl::send( inputTensor.data_ptr(), (size_t)inputTensor.numel(), xcclDataType, @@ -813,13 +670,9 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( comm, ccl::create_stream(stream.queue())); } - return ret_evt; + return; } }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, OpType::GATHER); } @@ -885,14 +738,13 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( } } { - ccl::event ret_evt; if (rank_ == root) { for (const auto r : c10::irange(size_)) { if (r != root) { // do send size_t send_count = inputs[r].numel(); auto send_type = getXcclDataType(inputs[r].scalar_type()); - ret_evt = ccl::send( + ccl::send( inputs[r].data_ptr(), send_count, send_type, @@ -908,7 +760,7 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( // do receive size_t recv_count = outputTensor.numel(); auto recv_type = getXcclDataType(outputTensor.scalar_type()); - ret_evt = ccl::recv( + ccl::recv( outputTensor.data_ptr(), recv_count, recv_type, @@ -917,13 +769,9 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( ccl::create_stream(stream.queue())); } - return ret_evt; + return; } }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, OpType::SCATTER); } @@ -937,7 +785,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( at::Tensor& output, xcclComm_t& comm, at::xpu::XPUStream& stream) { - auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclDataType = 
getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); auto ccl_stream = ccl::create_stream(stream.queue()); ccl::allreduce( @@ -948,7 +796,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( xcclReduceOp, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, OpType::ALLREDUCE, "xccl:all_reduce"); @@ -959,30 +807,35 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( const AllreduceOptions& opts) { TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); - if (tensor.is_complex()) { - TORCH_CHECK( - complexViewAsRealAllowed(opts.reduceOp), - "all_reduce does not support", - opts.reduceOp, - "on complex tensors"); - tensor = at::view_as_real(tensor); - } check_xpu_single_tensor(tensor); - TORCH_CHECK( - !isFloat8Type(tensor.scalar_type()), - "Float8 dtypes are not currenlty supported for XCCL reductions"); - return allreduce_impl(tensor, opts); + return collective( + tensor, + tensor, + [&](at::Tensor& input, + at::Tensor& output, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type(), true); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::allreduce( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + xcclReduceOp, + comm, + ccl::create_stream(stream.queue())); + return; + }, + OpType::ALLREDUCE, + "xccl:all_reduce"); } c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { check_xpu_tensors_same_device(tensors); - TORCH_CHECK( - !isFloat8Type(tensors.back().scalar_type()), - "Float8 dtypes are not currenlty supported for XCCL reductions"); - return collectiveCoalesced( tensors, tensors, @@ -990,10 +843,9 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( at::Tensor& output, xcclComm_t& comm, at::xpu::XPUStream& stream) { - auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclDataType = getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::allreduce( + ccl::allreduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), @@ -1001,9 +853,10 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( xcclReduceOp, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::COALESCED); + OpType::COALESCED, + "xccl:allreduce_coalesced"); } c10::intrusive_ptr ProcessGroupXCCL::broadcast( @@ -1011,9 +864,6 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( const BroadcastOptions& opts) { TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); - if (tensor.is_complex()) { - tensor = at::view_as_real(tensor); - } check_xpu_single_tensor(tensor); const auto root = opts.rootRank + opts.rootTensor; @@ -1026,17 +876,17 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); - ccl::event ret_evt; - ret_evt = ccl::broadcast( + ccl::broadcast( input.data_ptr(), (size_t)input.numel(), xcclDataType, root, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::BROADCAST); + OpType::BROADCAST, + "nccl:broadcast"); } c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( @@ -1057,33 +907,24 @@ c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = 
getXcclDataType(input.scalar_type()); - ccl::event ret_evt; - ret_evt = ccl::broadcast( + ccl::broadcast( input.data_ptr(), (size_t)input.numel(), xcclDataType, root, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::BROADCAST); + OpType::BROADCAST, + "xccl:_broadcast_oop"); } c10::intrusive_ptr ProcessGroupXCCL::reduce( std::vector& tensors, const ReduceOptions& opts) { TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); - // @lint-ignore CLANGTIDY auto tensor = tensors.back(); - if (tensor.is_complex()) { - TORCH_CHECK( - complexViewAsRealAllowed(opts.reduceOp), - "reduce does not support", - opts.reduceOp, - "on complex tensors"); - tensor = at::view_as_real(tensor); - } check_xpu_single_tensor(tensor); return collective( @@ -1094,10 +935,9 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce( xcclComm_t& comm, at::xpu::XPUStream& stream) { const int root = opts.rootRank + opts.rootTensor; - const auto xcclDataType = getXcclDataType(input.scalar_type()); + const auto xcclDataType = getXcclDataType(input.scalar_type(), true); const auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::reduce( + ccl::reduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), @@ -1106,20 +946,20 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce( root, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::REDUCE); + OpType::REDUCE, + "xccl:reduce"); } c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( at::Tensor& outputTensor, at::Tensor& inputTensor, const ReduceOptions& opts) { - if (outputTensor.numel() != inputTensor.numel()) { - C10_THROW_ERROR( - ValueError, - "Tensor input and output of _reduce_oop must have the same number of elements "); - } + TORCH_CHECK_WITH( + ValueError, + outputTensor.numel() != inputTensor.numel(), + "Tensor input and output of _reduce_oop must have the same number of elements"); return collective( inputTensor, outputTensor, @@ -1128,10 +968,9 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( xcclComm_t& comm, at::xpu::XPUStream& stream) { const int root = opts.rootRank + opts.rootTensor; - const auto xcclDataType = getXcclDataType(input.scalar_type()); + const auto xcclDataType = getXcclDataType(input.scalar_type(), true); const auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::reduce( + ccl::reduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), @@ -1140,9 +979,10 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( root, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::REDUCE); + OpType::REDUCE, + "xccl:_reduce_oop"); } c10::intrusive_ptr ProcessGroupXCCL::allgather( @@ -1171,16 +1011,14 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather( c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); auto xcclDataType = getXcclDataType(input.scalar_type()); - ccl::event ret_evt; - - ret_evt = ccl::allgather( + ccl::allgather( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), xcclDataType, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, [](at::xpu::XPUStream&, c10::intrusive_ptr& work) {}, @@ -1194,7 +1032,8 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather( outputTensors_[j].copy_(outputFlattened[j], true); } }, - OpType::ALLGATHER); + OpType::ALLGATHER, + "xccl:all_gather"); } else { const auto num_reduces = outputTensors_.size(); startCoalescing(); @@ -1217,16 +1056,14 @@ c10::intrusive_ptr 
ProcessGroupXCCL::_allgather_base( check_xpu_single_tensor(input_tensor); check_xpu_single_tensor(output_tensor); - if (input_tensor.dtype() != output_tensor.dtype()) { - C10_THROW_ERROR( - TypeError, "output tensor must have the same type as input tensor"); - } - - if (input_tensor.numel() * size_ != output_tensor.numel()) { - C10_THROW_ERROR( - ValueError, - "output tensor size must be equal to world_size times input tensor size"); - } + TORCH_CHECK_WITH( + TypeError, + input_tensor.dtype() != output_tensor.dtype(), + "output tensor must have the same type as input tensor"); + TORCH_CHECK_WITH( + ValueError, + input_tensor.numel() * size_ != output_tensor.numel(), + "output tensor size must be equal to world_size times input tensor size"); return collective( input_tensor, @@ -1238,17 +1075,17 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); auto xcclDataType = getXcclDataType(input.scalar_type()); - ccl::event ret_evt; - ret_evt = ccl::allgather( + ccl::allgather( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), xcclDataType, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::_ALLGATHER_BASE); + OpType::_ALLGATHER_BASE, + "xccl:_all_gather_base"); } c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( @@ -1263,17 +1100,17 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); - ccl::event ret_evt; - ret_evt = ccl::allgather( + ccl::allgather( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), xcclDataType, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::COALESCED); + OpType::COALESCED, + "xccl:all_gather_into_tensor_coalesced"); } c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( @@ -1286,9 +1123,6 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( check_xpu_single_tensor(outputTensor); // @lint-ignore CLANGTIDY auto inputTensors_ = inputTensors.back(); - TORCH_CHECK( - !isFloat8Type(outputTensor.scalar_type()), - "Float8 dtypes are not currenlty supported for XCCL reductions"); bool same_size = check_same_size(inputTensors_); if (same_size) { @@ -1303,10 +1137,9 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); - auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclDataType = getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::reduce_scatter( + ccl::reduce_scatter( input.data_ptr(), output.data_ptr(), (size_t)output.numel(), @@ -1314,7 +1147,7 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( xcclReduceOp, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, [&](at::xpu::XPUStream& Stream, c10::intrusive_ptr& work) { @@ -1328,7 +1161,8 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( }, [&](at::xpu::XPUStream&, c10::intrusive_ptr&) {}, - OpType::REDUCE_SCATTER); + OpType::REDUCE_SCATTER, + "xccl:reduce_scatter"); } else { const auto num_reduces = inputTensors_.size(); startCoalescing(); @@ -1351,22 +1185,14 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( at::Tensor& outputTensor, at::Tensor& inputTensor, const ReduceScatterOptions& opts) { - if (inputTensor.dtype() != outputTensor.dtype()) { - 
C10_THROW_ERROR( - TypeError, "input tensor must be the same type as the output tensor."); - } - - if (inputTensor.numel() != outputTensor.numel() * size_) { - C10_THROW_ERROR( - ValueError, - "input tensor must be the same size as output size times world size"); - } - - // @lint-ignore CLANGTIDY - const auto& tensor = outputTensor; - TORCH_CHECK( - !isFloat8Type(tensor.scalar_type()), - "Float8 dtypes are not currenlty supported for XCCL reductions"); + TORCH_CHECK_WITH( + TypeError, + inputTensor.dtype() != outputTensor.dtype(), + "output tensor must have the same type as input tensor"); + TORCH_CHECK_WITH( + ValueError, + inputTensor.numel() != outputTensor.numel() * size_, + "input tensor size must be equal to world_size times output tensor size"); return collective( inputTensor, @@ -1377,10 +1203,9 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); - auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclDataType = getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::reduce_scatter( + ccl::reduce_scatter( input.data_ptr(), output.data_ptr(), (size_t)output.numel(), @@ -1388,18 +1213,16 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( xcclReduceOp, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::_REDUCE_SCATTER_BASE); + OpType::_REDUCE_SCATTER_BASE, + "xccl:_reduce_scatter_base"); } c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( std::vector& outputs, std::vector& inputs, const ReduceScatterOptions& opts) { - TORCH_CHECK( - !isFloat8Type(inputs.back().scalar_type()), - "Float8 dtypes are not currenlty supported for XCCL reductions"); return collectiveCoalesced( inputs, outputs, @@ -1409,10 +1232,9 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); - auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclDataType = getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::reduce_scatter( + ccl::reduce_scatter( input.data_ptr(), output.data_ptr(), (size_t)output.numel(), @@ -1420,9 +1242,10 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( xcclReduceOp, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::COALESCED); + OpType::COALESCED, + "xccl:reduce_scatter_tensor_coalesced"); } c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { @@ -1441,6 +1264,7 @@ c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { static_cast(rank_ % at::detail::getXPUHooks().getNumGPUs()); } + // todo: use barrier instead of allreduce TORCH_CHECK_WITH( ValueError, barDevIdx >= 0, @@ -1484,17 +1308,17 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); auto xcclDataType = getXcclDataType(output.scalar_type()); - ccl::event ret_evt; - ret_evt = ccl::alltoall( + ccl::alltoall( input.data_ptr(), output.data_ptr(), (size_t)output.numel() / comm.size(), xcclDataType, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::ALLTOALL_BASE); + OpType::ALLTOALL_BASE, + "xccl:all_to_all"); } 
else { c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); @@ -1525,9 +1349,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( (outputSplitsEqual ? outLen : outputSplitSizes[i] * outLen); } auto xcclDataType = getXcclDataType(output.scalar_type()); - ccl::event ret_evt; - - ret_evt = ccl::alltoallv( + ccl::alltoallv( input.data_ptr(), sendCounts, output.data_ptr(), @@ -1535,9 +1357,10 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( xcclDataType, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::ALLTOALL_BASE); + OpType::ALLTOALL_BASE, + "xccl:all_to_all"); } } @@ -1607,15 +1430,11 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall( outputTensors[i].view({-1}).copy_(flatOutputSplits[i]); } } - stream.synchronize(); - return ret_evt; - }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + return; }, - OpType::ALLTOALL); + OpType::ALLTOALL, + "xccl:all_to_all"); } } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 0f2b2738a4b77c..c8fa11442c692e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -21,18 +21,12 @@ #include namespace c10d { -namespace { -struct AutoXcclGroup { - AutoXcclGroup(); - ~AutoXcclGroup() noexcept(false); -}; -} // namespace - static std::vector TORCH_XCCL_BLOCKING_WAIT = { "TORCH_XCCL_BLOCKING_WAIT", "XCCL_BLOCKING_WAIT"}; using xcclComm_t = ccl::communicator; +using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; class TORCH_API ProcessGroupXCCL : public Backend { @@ -129,28 +123,50 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, OpType opType, const char* profilingTitle = nullptr) { - auto inputs = std::vector{input}; - auto outputs = std::vector{output}; return collective( - inputs, - outputs, + input, + output, fn, [](at::xpu::XPUStream&, c10::intrusive_ptr&) {}, [](at::xpu::XPUStream&, c10::intrusive_ptr&) {}, - opType); + opType, + profilingTitle); } template c10::intrusive_ptr collective( - std::vector& inputs, - std::vector& outputs, + at::Tensor& input, + at::Tensor& output, Fn fn, PreProcess pre, PostProcess post, OpType opType, - const char* profilingTitle = nullptr); + const char* profilingTitle = nullptr) { + auto inputs = std::vector{input}; + auto outputs = std::vector{output}; + return collective(inputs, outputs, fn, pre, post, opType, profilingTitle); + } + + template + c10::intrusive_ptr collective( + std::vector& inputs, + std::vector& outputs, + Fn fn, + OpType opType, + const char* profilingTitle = nullptr) { + return collective( + inputs, + outputs, + fn, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) {}, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) {}, + opType, + profilingTitle); + } template c10::intrusive_ptr collective( @@ -159,30 +175,39 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, PreProcess pre, PostProcess post, - OpType opType); + OpType opType, + const char* profilingTitle = nullptr); template c10::intrusive_ptr collectiveCoalesced( std::vector& input, std::vector& output, Fn fn, - OpType opType); + OpType opType, + const char* profilingTitle = nullptr) { + return collective( + input, + output, + fn, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) { + ccl::group_start(); + }, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) { + ccl::group_end(); 
+ }, + opType, + profilingTitle); + } template - c10::intrusive_ptr pointToPoint( - at::Tensor& tensor, - Fn fn, - int peer, - OpType opType); - - template c10::intrusive_ptr pointToPoint( at::Tensor& tensor, Fn fn, int peer, OpType opType, - PreProcess pre, - PostProcess post); + const char* profilingTitle = nullptr); c10::intrusive_ptr allreduce_impl( at::Tensor& tensor, @@ -285,10 +310,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { const ScatterOptions& opts = ScatterOptions()) override; protected: - std::unordered_map xcclStreams_; - std::unordered_map xcclEvents_; - std::unordered_map> - inInitializationCommMap_; + std::unordered_map xcclStreamsMap_; + std::unordered_map xcclEventsMap_; std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; From 385c218f274509d36c6e3a8d1e6ece5511a5d13b Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 21 Oct 2024 01:54:13 +0000 Subject: [PATCH 85/96] refine code --- caffe2/CMakeLists.txt | 3 -- test/distributed/test_c10d_common.py | 13 +++--- torch/csrc/distributed/c10d/ProcessGroup.hpp | 17 ++++---- .../distributed/c10d/ProcessGroupXCCL.cpp | 39 +++++++++-------- .../distributed/c10d/ProcessGroupXCCL.hpp | 43 +++++++++++-------- torch/testing/_internal/common_distributed.py | 14 +++++- 6 files changed, 72 insertions(+), 57 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index b4ec018019f165..25bd7f700f68a2 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1376,9 +1376,6 @@ if(USE_DISTRIBUTED) endif() if(USE_XPU AND USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) - set_source_files_properties( - ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp - PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_ZE;CCL_ENABLE_SYCL") endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 903df26bba9f6f..d3cb65f7befb1d 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -31,6 +31,7 @@ from torch.testing._internal.common_distributed import ( MultiProcessTestCase, skip_if_lt_x_gpu, + get_device_count, ) from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, @@ -60,17 +61,13 @@ torch.backends.cuda.matmul.allow_tf32 = False -def gpus_for_rank(world_size): +def gpus_for_rank(world_size, backend): """Multigpu tests are designed to simulate the multi nodes with multi GPUs on each node. Nccl backend requires equal #GPUs in each process. On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. 
""" - device_count = ( - torch.xpu.device_count() - if torch.xpu.is_available() - else torch.cuda.device_count() - ) + device_count = get_device_count(backend) visible_devices = list(range(device_count)) gpus_per_process = device_count // world_size gpus_for_rank = [] @@ -833,7 +830,7 @@ def update_parameters(model): def _gpu_model_with_ddp_comm_hook( self, process_group, hook=None, gradient_as_bucket_view=False, state=None ): - device_id = gpus_for_rank(self.world_size)[self.rank][0] + device_id = gpus_for_rank(self.world_size, process_group.name())[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], @@ -850,7 +847,7 @@ def _gpu_model_with_ddp_comm_hook( def _gpu_model_with_builtin_ddp_comm_hook( self, process_group, hook=None, gradient_as_bucket_view=False ): - device_id = gpus_for_rank(self.world_size)[self.rank][0] + device_id = gpus_for_rank(self.world_size, process_group.name())[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 31c974a061e4a2..b3eac70e871bf7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -131,6 +131,13 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { return backendType_; }; + inline bool backendSupportsSequenceNumbers(BackendType backendType) { + if (backendType == BackendType::GLOO || backendType == BackendType::NCCL || + backendType == BackendType::XCCL || backendType == BackendType::UCC) + return true; + return false; + } + virtual void startCoalescing(c10::DeviceType deviceType) { // only nccl has implemented startCoalescing so only execute for nccl // backends @@ -508,10 +515,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { virtual void setSequenceNumberForGroup() { auto backendType = getBackendType(); // TODO: HACK for backend name to get sequence number for that backend. - if (backendType == ProcessGroup::BackendType::GLOO || - backendType == ProcessGroup::BackendType::NCCL || - backendType == ProcessGroup::BackendType::XCCL || - backendType == ProcessGroup::BackendType::UCC) { + if (backendSupportsSequenceNumbers(backendType)) { getDefaultBackend()->setSequenceNumberForGroup(); } else { TORCH_CHECK( @@ -530,10 +534,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { auto backendType = getBackendType(); // TODO: HACK for backend name to get sequence number for that backend. 
- if (backendType == ProcessGroup::BackendType::GLOO || - backendType == ProcessGroup::BackendType::NCCL || - backendType == ProcessGroup::BackendType::XCCL || - backendType == ProcessGroup::BackendType::UCC) { + if (backendSupportsSequenceNumbers(backendType)) { return getDefaultBackend()->getSequenceNumberForGroup(); } else { TORCH_CHECK( diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 90fb4c3f9cbd75..41e4e43436270a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,17 +1,8 @@ #ifdef USE_C10D_XCCL #include +#include #include -#include -#include -#include -#include -#include -#include -#include - -#include -#include namespace c10d { @@ -89,10 +80,13 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, OpType opType, + uint64_t seq, + const char* profilingTitle, const std::optional>& inputs) - : Work(rank, opType, "profilingTitle", inputs), + : Work(rank, opType, profilingTitle, inputs), device_(device), - workStartTime_(std::chrono::steady_clock::now()) { + workStartTime_(std::chrono::steady_clock::now()), + seq_(seq) { xcclEndEvent_ = std::make_shared(); } @@ -101,7 +95,8 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) device_(w.device_), xcclEndEvent_(w.xcclEndEvent_), blockingWait_(w.blockingWait_), - workStartTime_(w.workStartTime_) {} + workStartTime_(w.workStartTime_), + seq_(w.seq_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -156,10 +151,16 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( at::Device& device, int rank, OpType opType, + const char* profilingTitle, const std::vector& inputs, const std::vector& outputs) { auto r = c10::make_intrusive( - device, rank, opType, std::optional>(inputs)); + device, + rank, + opType, + seqCollective_, + profilingTitle, + std::optional>(inputs)); return r; } @@ -212,7 +213,10 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( Fn fn, PreProcess pre, PostProcess post, - OpType opType) { + OpType opType, + const char* profilingTitle) { + seqCollective_++; + auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); @@ -221,7 +225,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( syncStream(device, xcclEventsMap_[key], stream); c10::intrusive_ptr work; - work = initWork(device, rank_, opType); + work = initWork(device, rank_, opType, profilingTitle); work->outputs_ = std::make_shared>(outputs); at::xpu::OptionalXPUGuard gpuGuard(device); @@ -273,7 +277,8 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( ccl_stream); return; }, - OpType::ALLREDUCE); + OpType::ALLREDUCE, + "xccl:all_reduce"); } } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6e6eb16d62d620..f9761c652dc1a0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -1,33 +1,24 @@ #pragma once -#if defined(__linux__) -#include -#include -#include -#include -#endif - #ifdef USE_C10D_XCCL -#include +// We will define those flags in XCCL backend file instead of passing to gcc +// compiler. 
+#define CCL_ENABLE_ZE +#define CCL_ENABLE_SYCL + #include -#include #include -#include -#include - -#include -#include #include -#include #include #include -#include #include +#include +#include #include #include #include -#include +#include namespace c10d { static std::vector TORCH_XCCL_BLOCKING_WAIT = { @@ -45,6 +36,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Device& device, int rank, OpType opType, + uint64_t seq, + const char* profilingTitle = nullptr, const std::optional>& inputs = std::nullopt); WorkXCCL(const WorkXCCL& w); ~WorkXCCL() override; @@ -63,6 +56,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { return future_; } + uint64_t getSequencenumber() const override { + return seq_; + } + std::vector result() override { return *outputs_; } @@ -72,6 +69,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::shared_ptr xcclEndEvent_; bool blockingWait_ = false; std::chrono::time_point workStartTime_; + uint64_t seq_; private: void synchronizeInternal(std::chrono::milliseconds timeout); @@ -103,6 +101,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Device& device, int rank, OpType opType, + const char* profilingTitle = nullptr, const std::vector& inputs = {}, const std::vector& outputs = {}); @@ -111,7 +110,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Tensor& input, at::Tensor& output, Fn fn, - OpType opType) { + OpType opType, + const char* profilingTitle = nullptr) { auto inputs = std::vector{input}; auto outputs = std::vector{output}; return collective( @@ -132,13 +132,17 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, PreProcess pre, PostProcess post, - OpType opType); + OpType opType, + const char* profilingTitle = nullptr); c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; void setSequenceNumberForGroup() override {} + uint64_t getSequenceNumberForGroup() override { + return seqCollective_; + } protected: std::unordered_map xcclStreamsMap_; @@ -147,6 +151,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr store_; std::mutex mutex_; bool blockingWait_ = false; + uint64_t seqCollective_{0}; private: std::mutex kvs_mutex; diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 9ec38c9ca671c2..b0b506195b240f 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -93,8 +93,9 @@ class DistTestCases: # Sets showing that something is implemented backend_feature = {} - backend_feature["gpu"] = {"nccl", "gloo", "ucc"} + backend_feature["gpu"] = {"nccl", "gloo", "ucc", "xccl"} backend_feature["cuda"] = {"nccl", "gloo", "ucc"} + backend_feature["xpu"] = {"xccl"} backend_feature["ddp"] = {"nccl", "gloo", "ucc"} backend_feature["subgroup"] = {"nccl", "gloo", "ucc"} backend_feature["plugin"] = set() @@ -462,6 +463,15 @@ def compute_sum(fn, world_size: int): ] ] +# Returns the number of GPUs, currently only for CUDA and XPU. 
+def get_device_count(backend: str): + assert c10d.is_backend_available(backend) + if backend in DistTestCases.backend_feature.get("cuda", set()): + return torch.cuda.device_count() + elif backend in DistTestCases.backend_feature.get("xpu", set()): + return torch.xpu.device_count() + else: + raise ValueError(f"Unsupported backend: {backend}") # HELPER FOR MULTIGPU TESTS def init_multigpu_helper(world_size: int, backend: str): @@ -470,7 +480,7 @@ def init_multigpu_helper(world_size: int, backend: str): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. """ - nGPUs = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() + nGPUs = get_device_count(backend) visible_devices = range(nGPUs) # If rank is less than or equal to number of available GPU's From e36a99c977a8784e9e671a5bb0b661172d2ba35d Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 21 Oct 2024 07:51:25 +0000 Subject: [PATCH 86/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 89f302595c4ac7..0628d3f3612f01 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1060,7 +1060,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( TORCH_CHECK_WITH( TypeError, input_tensor.dtype() == output_tensor.dtype(), - "output tensor must have the same type as input tensor"); + "input tensor must be the same type as the output tensor."); TORCH_CHECK_WITH( ValueError, input_tensor.numel() * size_ == output_tensor.numel(), @@ -1189,7 +1189,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( TORCH_CHECK_WITH( TypeError, inputTensor.dtype() == outputTensor.dtype(), - "output tensor must have the same type as input tensor"); + "input tensor must be the same type as the output tensor."); TORCH_CHECK_WITH( ValueError, inputTensor.numel() == outputTensor.numel() * size_, From 5096354f792e4c96b4eeac7664c561c416268be4 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 21 Oct 2024 08:37:13 +0000 Subject: [PATCH 87/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 0628d3f3612f01..dcfa15a1a6af0b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1060,7 +1060,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( TORCH_CHECK_WITH( TypeError, input_tensor.dtype() == output_tensor.dtype(), - "input tensor must be the same type as the output tensor."); + "output tensor must have the same type as input tensor"); TORCH_CHECK_WITH( ValueError, input_tensor.numel() * size_ == output_tensor.numel(), From 9e6448b5326f8736ae529b36d286a0c61e654baa Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 22 Oct 2024 01:26:22 +0000 Subject: [PATCH 88/96] add RECORD_PARAM_COMMS_DATA --- .../distributed/c10d/ProcessGroupXCCL.cpp | 294 +++++++++++++++++- 1 file changed, 293 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index dcfa15a1a6af0b..44dc7360265b55 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -99,6 +99,7 @@ void 
check_xpu_single_tensor( } } } + int64_t check_xpu_tensors_same_device(const std::vector& tensors) { TORCH_CHECK_WITH( ValueError, tensors.size() == 0, "Tensor list must be nonempty"); @@ -317,6 +318,20 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, xccl_kvs); XCCLComm = std::make_shared(std::move(comms[0])); + RECORD_PARAM_COMMS( + 0, // seq + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + rank, // rank + "init", // collective name + 0, // inNelems + 0, // outNelems + at::kByte, // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + size_); // worldSize + std::lock_guard lock(mutex_); devXCCLCommMap_.emplace(deviceKey, XCCLComm); xcclStreamsMap_.emplace(deviceKey, std::move(stream)); @@ -530,6 +545,23 @@ c10::intrusive_ptr ProcessGroupXCCL::send( auto tensor = tensors.back(); check_xpu_single_tensor(tensor, true); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + dstRank, // dst rank + "send", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + auto ret = pointToPoint( tensor, [&](at::Tensor& input, @@ -561,6 +593,23 @@ c10::intrusive_ptr ProcessGroupXCCL::recv( auto tensor = tensors.back(); check_xpu_single_tensor(tensor, true); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + srcRank, // src rank + "recv", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + auto ret = pointToPoint( tensor, [&](at::Tensor& output, @@ -628,6 +677,23 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( outputs.emplace_back(); } + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensors, // inputTensors + outputTensors, // outputTensors + opts.rootRank, // root rank + "gather", // collective name + inputTensor.numel(), // inNelems + inputTensor.numel() * this->getSize(), // outNelems + inputTensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSize + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + auto inputs = std::vector{inputTensor}; return collective( inputs, @@ -722,6 +788,23 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( inputs.emplace_back(); } + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensors, // inputTensors + outputTensors, // outputTensors + opts.rootRank, // root rank + "scatter", // collective name + outputTensor.numel() * this->getSize(), // inNelems + outputTensor.numel(), // outNelems + outputTensor.scalar_type(), // dType + std::vector(), // 
inSplitSizes + std::vector(), // outSplitSize + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + const auto root = opts.rootRank; auto outputs = std::vector{outputTensor}; @@ -810,6 +893,24 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( auto tensor = tensors.back(); check_xpu_single_tensor(tensor); + // @lint-ignore CLANGTIDY + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + rank_, // rank + "allreduce", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + size_); // worldSize + return collective( tensor, tensor, @@ -836,7 +937,26 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { - check_xpu_tensors_same_device(tensors); + auto total_numel = check_xpu_tensors_same_device(tensors); + + // @lint-ignore CLANGTIDY + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + rank_, // rank + "allreduce_coalesced", // collective name + total_numel, // inNelems + total_numel, // outNelems + tensors[0].scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collectiveCoalesced( tensors, tensors, @@ -867,6 +987,24 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( auto tensor = tensors.back(); check_xpu_single_tensor(tensor); + // @lint-ignore CLANGTIDY + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + opts.rootRank, // root rank + "broadcast", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + const auto root = opts.rootRank + opts.rootTensor; return collective( @@ -928,6 +1066,23 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce( auto tensor = tensors.back(); check_xpu_single_tensor(tensor); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + opts.rootRank, // root rank + "reduce", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collective( tensor, tensor, @@ -997,6 +1152,24 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather( // @lint-ignore CLANGTIDY std::vector& outputTensors_ = outputTensors.back(); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + 
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensors, // inputTensors + outputTensors, // outputTensors + rank_, // rank + "all_gather", // collective name + inputTensor.numel(), // inNelems + inputTensor.numel() * // outNelems + this->getSize(), + inputTensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSize + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + bool same_size = check_same_size(outputTensors_); if (same_size) { // Flatten a vector of tensors into a single, stacked tensor. @@ -1066,6 +1239,23 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( input_tensor.numel() * size_ == output_tensor.numel(), "output tensor size must be equal to world_size times input tensor size"); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + input_tensor, // inputTensors + output_tensor, // outputTensors + rank_, // rank + "_allgather_base", // collective name + input_tensor.numel(), // inNelems + output_tensor.numel(), // outNelems + output_tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSize + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collective( input_tensor, output_tensor, @@ -1125,6 +1315,23 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( // @lint-ignore CLANGTIDY auto inputTensors_ = inputTensors.back(); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensors, // inputTensors + outputTensors, // outputTensors + rank_, // rank + "reduce_scatter", // collective name + outputTensor.numel() * this->getSize(), // inNelems + outputTensor.numel(), // outNelems + outputTensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + bool same_size = check_same_size(inputTensors_); if (same_size) { // Flatten a vector of tensors into a single, stacked tensor. 
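The "flatten a vector of tensors into a single, stacked tensor" step referenced in the hunk above relies on a standard equivalence: reduce-scattering a list of same-sized per-rank tensors gives the same result as stacking them into one contiguous buffer and using the single-tensor variant. A minimal Python sketch of that equivalence, not part of the patch itself; it assumes an already-initialized default process group with `world_size` ranks and one XPU per rank, and the helper name is illustrative only:

import torch
import torch.distributed as dist

def reduce_scatter_two_ways(rank: int, world_size: int) -> torch.Tensor:
    # Hypothetical helper for illustration; names are not from the patch.
    device = torch.device(f"xpu:{rank}")
    # One same-sized contribution per destination rank.
    inputs = [torch.full((4,), float(rank + dst), device=device) for dst in range(world_size)]

    list_out = torch.empty(4, device=device)
    dist.reduce_scatter(list_out, inputs)            # list-of-tensors path (same_size case)

    stacked = torch.stack(inputs)                    # [world_size, 4]: the "flattened" buffer
    tensor_out = torch.empty(4, device=device)
    dist.reduce_scatter_tensor(tensor_out, stacked)  # single-tensor path

    assert torch.equal(list_out, tensor_out)
    return list_out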
@@ -1195,6 +1402,23 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( inputTensor.numel() == outputTensor.numel() * size_, "input tensor must be the same size as output size times world size"); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensor, // inputTensor + outputTensor, // outputTensor + rank_, // rank + "_reduce_scatter_base", // collective name + inputTensor.numel(), // inNelems + outputTensor.numel(), // outNelems + outputTensor.scalar_type(), // dtype + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collective( inputTensor, outputTensor, @@ -1250,6 +1474,20 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( } c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { + RECORD_PARAM_COMMS( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + rank_, // rank + "barrier", // collective name + 0, // inNelems + 0, // outNelems + at::kByte, // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize // Device to use for barrier int barDevIdx = -1; @@ -1292,6 +1530,23 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( check_xpu_single_tensor(outputTensor, true); check_xpu_single_tensor(inputTensor, true); if (outputSplitSizes.size() == 0 && inputSplitSizes.size() == 0) { + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensor, // inputTensor + outputTensor, // outputTensor + rank_, // rank + "all_to_all", // collective name + inputTensor.numel(), // inNelems + outputTensor.numel(), // outNelems + inputTensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize TORCH_CHECK( outputTensor.numel() == inputTensor.numel() && outputTensor.scalar_type() == inputTensor.scalar_type(), @@ -1324,6 +1579,24 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensor, // inputTensor + outputTensor, // outputTensor + rank_, // rank + "all_to_allv", // collective name + inputTensor.numel(), // inNelems + outputTensor.numel(), // outNelems + inputTensor.scalar_type(), // dType + inputSplitSizes, // inSplitSizes + outputSplitSizes, // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collective( inputTensor, outputTensor, @@ -1370,6 +1643,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall( std::vector& inputTensors, const AllToAllOptions& /* unused */) { auto device = outputTensors[0].device(); + int64_t total_numel = 0; for (const auto r : c10::irange(outputTensors.size())) { check_xpu_single_tensor(outputTensors[r], true); check_xpu_single_tensor(inputTensors[r], true); @@ -1377,8 +1651,26 @@ c10::intrusive_ptr 
ProcessGroupXCCL::alltoall( device == outputTensors[r].device() && device == inputTensors[r].device(), "Tensors must be on the same device") + total_numel += inputTensors[r].numel(); } + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensors, // inputTensors + outputTensors, // outputTensors + rank_, // rank + "all_to_all", // collective name + total_numel, // inNelems + total_numel, // outNelems + inputTensors.front().scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collective( inputTensors, outputTensors, From 8d9c24e19143ac8aa9809a0fb2f2e92b5e473efd Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 25 Oct 2024 00:38:50 +0000 Subject: [PATCH 89/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index c8fa11442c692e..2f83fe8f248bd4 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -309,6 +309,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) override; + void setSequenceNumberForGroup() override; + + uint64_t getSequenceNumberForGroup() override; + protected: std::unordered_map xcclStreamsMap_; std::unordered_map xcclEventsMap_; @@ -322,6 +326,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { bool blockingWait_ = false; static thread_local uint64_t xcclActiveGroupCounter_; uint64_t seqCollective_{0}; + uint64_t seqP2P_{0}; private: std::mutex kvs_mutex; From e808b6c2857f8b8034ba0cf24d5cd047efa4851a Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 25 Oct 2024 00:40:08 +0000 Subject: [PATCH 90/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 44dc7360265b55..4081529d486d33 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -246,6 +246,12 @@ ProcessGroupXCCL::ProcessGroupXCCL( ProcessGroupXCCL::~ProcessGroupXCCL() = default; +void ProcessGroupXCCL::setSequenceNumberForGroup() {} + +uint64_t ProcessGroupXCCL::getSequenceNumberForGroup() { + return seqCollective_; +} + c10::intrusive_ptr ProcessGroupXCCL::initWork( at::Device& device, int rank, @@ -353,6 +359,11 @@ void ProcessGroupXCCL::groupEnd() { // TODO: wait p2p enable static constexpr int CoalActive = 0x01, CoalColl = 0x02, CoalP2P = 0x04; void ProcessGroupXCCL::startCoalescing() { + if (coalescing_state_ & CoalP2P) { + seqP2P_++; + } else { + seqCollective_++; + } coalescedDevice_.set_index(-1); coalescedComm_ = nullptr; coalescing_state_ |= CoalActive; @@ -402,6 +413,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( PostProcess post, OpType opType, const char* profilingTitle) { + seqCollective_++; auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); @@ -480,6 +492,9 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( p2pRank = rank_ <= peer ? 0 : 1; isSendRecvSelf = rank_ == peer; p2pTargetRank = isSendRecvSelf ? 
0 : 1 - p2pRank; + if (!coalescing_state_) { + seqP2P_++; + } } auto comm = getXCCLComm(key, device, opType, p2pRank, isSendRecvSelf); From 193d9463c1a0dec192f1f100f313dd02df4a8ca8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 28 Oct 2024 08:25:20 +0000 Subject: [PATCH 91/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 4081529d486d33..04c936cb02c31b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -102,7 +102,7 @@ void check_xpu_single_tensor( int64_t check_xpu_tensors_same_device(const std::vector& tensors) { TORCH_CHECK_WITH( - ValueError, tensors.size() == 0, "Tensor list must be nonempty"); + ValueError, tensors.size() != 0, "Tensor list must be nonempty"); const auto& first = tensors.front(); From eb447f2bffb775037a53f250f8b485f48a8b6c35 Mon Sep 17 00:00:00 2001 From: "Han, Chao1" Date: Thu, 31 Oct 2024 20:59:35 +0800 Subject: [PATCH 92/96] fix all_gather_v bug --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 04c936cb02c31b..b920895342dd91 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1063,6 +1063,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( auto xcclDataType = getXcclDataType(input.scalar_type()); ccl::broadcast( input.data_ptr(), + output.data_ptr(), (size_t)input.numel(), xcclDataType, root, From 20b60b1809572017a6449ef7bc9ac3a3f58c516a Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 11 Nov 2024 07:24:19 +0000 Subject: [PATCH 93/96] correct get kvs --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 3 +-- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 16 ++++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index b920895342dd91..1527de6fe3f284 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -292,7 +292,6 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( } std::shared_ptr XCCLComm; - XCCL_KVS kvs = get_kvs(rank_, *store_); bool batchP2P = xcclActiveGroupCounter_ > 0; bool singleP2POp = isP2POp(opType, batchP2P); @@ -320,7 +319,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( ccl::vector_class> devs_rank; devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - auto xccl_kvs = get_kvs(rank_, *store_); + auto xccl_kvs = get_kvs(rank_, *store_, singleP2POp, deviceKey, p2pRank); auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, xccl_kvs); XCCLComm = std::make_shared(std::move(comms[0])); diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 2f83fe8f248bd4..cbbd724f88c6bb 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -330,15 +330,19 @@ class TORCH_API ProcessGroupXCCL : public Backend { private: std::mutex kvs_mutex; - ccl::shared_ptr_class kvs; - ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store) { + ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store, + bool singleP2POp = false, const std::string& 
p2pKey = "", int p2pRank = 0) { std::lock_guard lock(kvs_mutex); - if (kvs) - return kvs; - std::string storeKey = "xccl_kvs"; + ccl::shared_ptr_class kvs; + std::string storeKey; + if (!singleP2POp) { + storeKey = "xccl_kvs"; + } else { + storeKey = p2pKey; + } // Rank 0 broadcast the bootstrap network information to other ranks - if (rank == 0) { + if (rank == 0 || (singleP2POp && p2pRank == 0)) { kvs = ccl::create_main_kvs(); ccl::kvs::address_type main_addr = kvs->get_address(); auto ccl_kvs_addr = From b442419da4f529083b418be8d6b5cd1769423390 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 12 Nov 2024 02:23:48 +0000 Subject: [PATCH 94/96] update kvs key --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 1527de6fe3f284..f202f8916f89fd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -239,7 +239,7 @@ ProcessGroupXCCL::ProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, int size) - : Backend(rank, size), store_(store) { + : Backend(rank, size), store_(store), xcclCommCounter_(0) { blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index cbbd724f88c6bb..c30ca603c7ba07 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -318,6 +318,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::unordered_map xcclEventsMap_; std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; + uint64_t xcclCommCounter_{0}; std::mutex mutex_; std::set usedDeviceIdxs_; int coalescing_state_ = 0; @@ -331,15 +332,19 @@ class TORCH_API ProcessGroupXCCL : public Backend { private: std::mutex kvs_mutex; - ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store, - bool singleP2POp = false, const std::string& p2pKey = "", int p2pRank = 0) { + ccl::shared_ptr_class get_kvs( + int rank, + c10d::Store& store, + bool singleP2POp = false, + const std::string& p2pKey = "", + int p2pRank = 0) { std::lock_guard lock(kvs_mutex); ccl::shared_ptr_class kvs; std::string storeKey; if (!singleP2POp) { - storeKey = "xccl_kvs"; + storeKey = std::to_string(xcclCommCounter_++); } else { - storeKey = p2pKey; + storeKey = p2pKey; } // Rank 0 broadcast the bootstrap network information to other ranks if (rank == 0 || (singleP2POp && p2pRank == 0)) { From 65e0d9d7946716a829c04777954b7ab134bdf472 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 14 Nov 2024 08:48:03 +0000 Subject: [PATCH 95/96] WA AVG reduction --- test/distributed/test_c10d_ops_xccl.py | 10 +++++ .../distributed/c10d/ProcessGroupXCCL.cpp | 44 +++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py index 279ec0eb03ecf8..9784cf3a5c0bea 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -155,6 +155,16 @@ def allreduce(tensors, op): tensors[0], ) + # Avg + tensors = [torch.tensor([self.rank + 1.0]).xpu(local_device_id)] + + allreduce(tensors, c10d.ReduceOp.AVG) + ndev = self.world_size + self.assertEqual( + torch.tensor([ndev * (ndev + 1.0) / (2.0 * ndev)]), + tensors[0], + ) + # Product tensors = 
[torch.tensor([self.rank + 1]).xpu(local_device_id)] diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index f202f8916f89fd..b2a900c92b8c0b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -147,6 +147,10 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { // Map sum to max for bool tensors to avoid overflow issues with sum. return ccl::reduction::max; } + // WA due to oneCCL not support AVG + if (reduceOp == ReduceOp::AVG) { + return ccl::reduction::sum; + } return xcclOps.at(reduceOp); } catch (const std::out_of_range&) { C10_THROW_ERROR( @@ -894,6 +898,11 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::ALLREDUCE, @@ -942,6 +951,11 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::ALLREDUCE, @@ -988,6 +1002,11 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::COALESCED, @@ -1117,6 +1136,11 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce( root, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG && getRank() == root) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::REDUCE, @@ -1150,6 +1174,11 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( root, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG && getRank() == root) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::REDUCE, @@ -1370,6 +1399,11 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, [&](at::xpu::XPUStream& Stream, @@ -1453,6 +1487,11 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::_REDUCE_SCATTER_BASE, @@ -1482,6 +1521,11 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::COALESCED, From 3e97e67847d6f5486a4fe58d06fc1fcb21f59d82 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 15 Nov 2024 01:04:43 +0000 Subject: [PATCH 96/96] update test case --- test/distributed/test_c10d_ops_xccl.py | 4 +- test/distributed/test_c10d_xccl.py | 1424 +++++++++++++++++++++++- 2 files changed, 1399 insertions(+), 29 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py 
b/test/distributed/test_c10d_ops_xccl.py index 9784cf3a5c0bea..6a600aa595f7e7 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -44,6 +44,7 @@ TEST_MULTIGPU = TEST_XPU and torch.xpu.device_count() >= 2 + class ProcessGroupXCCLOpTest(MultiProcContinousTest): @classmethod def backend_str(cls) -> str: @@ -256,7 +257,6 @@ def reduce(xs, rootRank, rootTensor, op=None): ): reduce(tensors, self.rank, rt, op) - @requires_xccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") def test_allgather_ops(self): @@ -710,7 +710,6 @@ def perm(n, k): expected = torch.tensor(prod_val) self.assertEqual(expected, output_tensor) - @requires_xccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") def test_reduce_scatter_base_ops(self): @@ -830,4 +829,3 @@ def test_send_recv_object_list(self): nprocs=world_size, args=(world_size, rdvz_file), ) - diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py index 704cdd414e554b..3503f6059f2825 100644 --- a/test/distributed/test_c10d_xccl.py +++ b/test/distributed/test_c10d_xccl.py @@ -1,14 +1,25 @@ # Owner(s): ["oncall: distributed"] +import copy import math import os +import random import sys import time from datetime import timedelta +from enum import auto, Enum +from itertools import product from unittest import mock +from test_c10d_common import DoubleGpuNet, gpus_for_rank, ModuleForDdpCommHook + import torch import torch.distributed as c10d +import torch.distributed.algorithms.ddp_comm_hooks.default_hooks as default +import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD +import torch.nn.functional as F +from torch import nn +from torch.nn.parallel import DistributedDataParallel if not c10d.is_available() or not c10d.is_xccl_available(): @@ -23,8 +34,11 @@ init_multigpu_helper, MultiProcessTestCase, requires_xccl, + skip_if_lt_x_gpu, ) from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, retry_on_connect_failures, run_tests, skip_but_pass_in_sandcastle_if, @@ -267,37 +281,1395 @@ def test_set_process_group_desc(self): pg_2 = c10d.new_group([0, 1]) self.assertEqual(pg_2.group_desc, "undefined") - def _test_allreduce_basics(self, fn): - pg = self._create_process_group_xccl() - device = torch.device("xpu:" + str(self.rank)) - # Single input tests - tests = simple_reduce_tests(self.rank, self.world_size) - for op, input, expected in tests: - opts = c10d.AllreduceOptions() - opts.reduceOp = op - tensor = fn(input.to(device)) - fut = pg.allreduce([tensor], opts).get_future() - fut.wait() - result = fut.value() - self.assertEqual(expected, result[0], exact_dtype=False) - x = fn(torch.tensor([self.rank + 1.0], device=device)) - fut = pg.allreduce(x).get_future() - fut.wait() - result = fut.value() - self.assertEqual( - torch.tensor([float(self.world_size * (self.world_size + 1) / 2)]), - result[0], +class DistributedDataParallelTest( + test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase +): + def setUp(self): + super().setUp() + self._spawn_processes() + + def _get_process_group(self): + store = self._get_store() + c10d.init_process_group( + "xccl", store=store, rank=self.rank, world_size=self.world_size + ) + return c10d.distributed_c10d._get_default_group() + + def _test_xccl_backend( + self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False + ): + process_group = self._get_process_group() + 
self._test_ddp_with_process_group( + process_group, devices, device_ids, multi_device, gradient_as_bucket_view ) @requires_xccl() - def test_allreduce_basics(self): - self._test_allreduce_basics(lambda t: t.clone()) + @skip_if_lt_x_gpu(2) + def test_xccl_backend_multi_device_ids_not_allowed(self): + int_devices = list(range(torch.xpu.device_count())) + devices = [torch.device("xpu:" + str(i)) for i in int_devices] + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." + ): + self._test_xccl_backend(devices, int_devices) + @requires_xccl() + @skip_if_lt_x_gpu(4) + def test_ddp_multi_device_module_config(self): + gpus = gpus_for_rank(self.world_size, "xccl")[self.rank] -if __name__ == "__main__": - assert ( - not torch.xpu._initialized - ), "test_distributed must not have initialized XPU context on main process" + self.assertTrue(len(gpus) >= 2, "expecting at least 2 gpus per process") + + process_group = self._get_process_group() + + gpus = gpus[:2] + model = DoubleGpuNet(gpus) + + with self.assertRaisesRegex( + ValueError, + "DistributedDataParallel device_ids and output_device arguments only work with " + "single-device/multiple-device GPU modules or CPU modules", + ): + ddp_model = DistributedDataParallel( + model, output_device=gpus[1], process_group=process_group + ) + + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." + ): + ddp_model = DistributedDataParallel( + model, device_ids=gpus, process_group=process_group + ) + + with self.assertRaisesRegex( + ValueError, "input module must be on the same type of devices" + ): + model.fc1 = model.fc1.cpu() + ddp_model = DistributedDataParallel(model, process_group=process_group) + + model = model.cpu() + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." + ): + ddp_model = DistributedDataParallel( + model, device_ids=gpus, process_group=process_group + ) + + def _test_fp16(self, gradient_as_bucket_view=False): + process_group = self._get_process_group() + + gpus = gpus_for_rank(self.world_size, "xccl")[self.rank] + model = nn.Linear(1, 1, bias=False).xpu(gpus[0]).half() + nn.init.constant_(model.weight, 1) + ddp_model = DistributedDataParallel( + model, + device_ids=[gpus[0]], + process_group=process_group, + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + # Input 2**15, so that the gradients will overflow with a + # world_size of 2, unless we normalize the gradient by the + # world_size before the reduction + input = torch.tensor([[2**15]]).xpu(gpus[0]).half() + + # Step model + ddp_model.train() + output = ddp_model(input) + loss = output.sum() + loss.backward() + + self.assertFalse(any(torch.isinf(p.grad).any() for p in ddp_model.parameters())) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_fp16(self): + self._test_fp16() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_fp16_grad_is_view(self): + self._test_fp16(gradient_as_bucket_view=True) + + def _test_arbitrary_forward_return_value(self, gradient_as_bucket_view=False): + """ + Note: this test can be sped up by only running it on a CPU module + once DistributedDataParallel supports them. 
+ """ + process_group = self._get_process_group() + + class ForwardReturnValueModule(nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.fc3 = nn.Linear(4, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x, fn): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + # The first softmax does NOT include fc3 in its autograd graph + # whereas the second softmax DOES. If we pass only the first + # tensor we see in the output to the reducer, it marks the + # gradient for fc3 as ready (because it doesn't show up). If + # downstream uses of this return value choose to differentiate + # against the second output tensor, it would still receive a + # gradient and a callback for this tensor, resulting in a crash. + return fn( + F.softmax(x, dim=1), + F.softmax(self.fc3(x), dim=1), + ) + + device_id = gpus_for_rank(self.world_size, "xccl")[self.rank][0] + model = DistributedDataParallel( + ForwardReturnValueModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + # Always run "backward" to ensure the reducer is called by autograd. + # If we don't correctly capture the output tensors from the return value, + # the reducer won't see a hook for the unused parameter, and throw an error. + # The correct capture is what we're testing in this function. + def test(box, unbox): + output = model(input, fn=box) + loss = criterion(unbox(output), target) + loss.backward() + + # Test with identity return value + test( + box=lambda x, y: (x, y), + unbox=lambda obj: obj[1], + ) + + # Test with list return value + test( + box=lambda x, y: ["foo", x, "bar", y], + unbox=lambda obj: obj[3], + ) + + # Test with tuple return value + test( + box=lambda x, y: ("foo", x, "bar", y), + unbox=lambda obj: obj[3], + ) + + # Test with dict return value + test( + box=lambda x, y: {"foo": "bar", "a": x, "b": y}, + unbox=lambda obj: obj["b"], + ) + + # Test with list with dict return value + test( + box=lambda x, y: ["foo", "bar", {"a": x, "b": y}], + unbox=lambda obj: obj[2]["b"], + ) + + # Test with dict with list return value + test( + box=lambda x, y: {"foo": "bar", "list": [0, x, 1, y]}, + unbox=lambda obj: obj["list"][3], + ) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_arbitrary_forward_return_value(self): + self._test_arbitrary_forward_return_value() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_arbitrary_forward_return_value_grad_is_view(self): + self._test_arbitrary_forward_return_value(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_with_lazy_parameters(self): + process_group = self._get_process_group() + with self.assertRaisesRegex( + RuntimeError, "Modules with uninitialized parameters" + ): + DistributedDataParallel( + torch.nn.LazyLinear(10), process_group=process_group + ) + + def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False): + """ + Note: this test can be sped up by only running it on a CPU module + once DistributedDataParallel supports them. 
+ """ + process_group = self._get_process_group() + + class MultipleOutputModule(nn.Module): + def __init__(self) -> None: + super().__init__() + + def define_module(): + return nn.Sequential( + nn.Linear(2, 10, bias=False), + nn.ReLU(), + nn.Linear(10, 4, bias=False), + nn.ReLU(), + ) + + self.module0 = define_module() + self.module1 = define_module() + + def forward(self, x): + return ( + F.softmax(self.module0(x), dim=1), + F.softmax(self.module1(x), dim=1), + ) + + device_id = gpus_for_rank(self.world_size, "xccl")[self.rank][0] + model = DistributedDataParallel( + MultipleOutputModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + # Compute loss and gradients for both outputs + output1, output2 = model(input) + loss1 = criterion(output1, target) + loss1.backward() + loss2 = criterion(output2, target) + loss2.backward() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_multiple_outputs_multiple_backward(self): + self._test_multiple_outputs_multiple_backward() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_multiple_outputs_multiple_backward_grad_is_view(self): + self._test_multiple_outputs_multiple_backward(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_no_grad(self): + """ + Note: this test can be sped up by only running it on a CPU module + once DistributedDataParallel supports them. + """ + process_group = self._get_process_group() + + class NoGradModule(nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + device_id = gpus_for_rank(self.world_size, "xccl")[self.rank][0] + model = DistributedDataParallel( + NoGradModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + ) + + batch_size = 4 + input = torch.rand([batch_size, 2], dtype=torch.float) + + def check_no_grads(): + for p in model.parameters(): + self.assertTrue(p.requires_grad) + self.assertIsNone(p.grad) + + # After initialization, no parameter has their gradient set. + check_no_grads() + + # Run `forward` function with torch.no_grad() + with torch.no_grad(): + output = model(input) + self.assertTrue(isinstance(output, torch.Tensor)) + + # No parameter should have their gradient set. + check_no_grads() + + def _test_accumulate_gradients_module(self, gradient_as_bucket_view=False): + # This is NOT the recommended way to implement accumulating grads, but + # we would like to make sure DDP does not mess up with the underlying + # module. 
+ int_devices = gpus_for_rank(self.world_size, "xccl")[self.rank][:1] + devices = [torch.device("xpu:" + str(i)) for i in int_devices] + process_group = self._get_process_group() + global_batch_size = self.world_size + + model, ddp_model, input, target = self._prepare_single_device_module( + process_group, devices, devices, global_batch_size, gradient_as_bucket_view + ) + + def step_model(model, input, target): + model.train() + output = model(input) + loss = F.mse_loss(output, target.to(output.device)) + loss.backward() + + # ensure accumulate grads works with no_grad + with torch.no_grad(): + ddp_model.train() + ddp_model.module(input) + + # Check two model parameters over 4 iterations. + # Use 4 iterations because we alternate between reducing and + # not reducing and want to make sure we switch both ways. + for iteration in range(4): + step_model(model, input, target) + + if iteration % 2 == 0: + # Skip gradients sync without calling prepare_for_backward + step_model( + ddp_model.module, + input[self.rank : (self.rank + 1)], + target[self.rank : (self.rank + 1)], + ) + for i, j in zip(model.parameters(), ddp_model.parameters()): + self.assertNotEqual(i.grad, j.grad) + else: + step_model( + ddp_model, + input[self.rank : (self.rank + 1)], + target[self.rank : (self.rank + 1)], + ) + for i, j in zip(model.parameters(), ddp_model.parameters()): + self.assertEqual(i.grad, j.grad, rtol=1.3e-06, atol=5e-5) + + # Shuffle the input so that DDP input is different + torch.manual_seed(1337 + iteration) + input = input[torch.randperm(global_batch_size)] + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_failure_recovery(self): + process_group = self._get_process_group() + + # need to create a separate file for the recovered FileStore, because + # the original one will be deleted when destructing the first FileStore. 
+ recovery_filename = self.file_name + "_recovery" + if self.rank == 0: + # the file will be deleted by the recovered FileStore + open(recovery_filename, "w").close() + + # not necessary to run barrier here, as DDP will synchronize + + class TestModel(nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + device_id = gpus_for_rank(self.world_size, "xccl")[self.rank][0] + model = TestModel().float().to(device_id) + ddp = DistributedDataParallel( + model, + device_ids=[device_id], + process_group=process_group, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + for _ in range(6): + output = ddp(input) + loss = criterion(output, target) + loss.backward() + + del ddp + c10d.destroy_process_group(process_group) + + store = c10d.FileStore(recovery_filename, self.world_size) + c10d.init_process_group( + "xccl", store=store, rank=self.rank, world_size=self.world_size + ) + process_group = c10d.distributed_c10d._get_default_group() + ddp = DistributedDataParallel( + model, + device_ids=[device_id], + process_group=process_group, + ) + + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + for _ in range(6): + output = ddp(input) + loss = criterion(output, target) + loss.backward() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_pass_default_pg(self): + dist.init_process_group( + "xccl", + init_method=f"file://{self.file_name}", + world_size=self.world_size, + rank=self.rank, + ) + + default_pg = c10d.distributed_c10d._get_default_group() + dist.destroy_process_group(default_pg) + self.assertFalse(dist.is_initialized()) + + def _gpu_model_with_ddp_comm_hook( + self, + process_group, + hook=None, + gradient_as_bucket_view=False, + state=None, + static_graph=False, + ): + device_id = gpus_for_rank(self.world_size, "xccl")[self.rank][0] + gpu_model = DistributedDataParallel( + ModuleForDdpCommHook().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + static_graph=static_graph, + ) + + # Register a DDP communication hook if any. + if hook is not None: + gpu_model.register_comm_hook(state, hook) + + return gpu_model + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_future_passing_gpu_xccl(self): + """ + This unit test verifies whether the Future object is passed properly using xccl backend. + The hook callback function creates a Future object and sets a value to it. + """ + process_group = self._get_process_group() + + # Get GPU model with simple_hook registered. + gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, self._simple_hook) + + # check whether the grads are equal to what simple_hook's then callback returns. + # without the comm_hook, result would be 0.25 * torch.ones(2, 2). 
+ self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2)) + + def _test_ddp_comm_hook_allreduce_hook_xccl( + self, gradient_as_bucket_view=False, static_graph=False + ): + """ + This unit test verifies whether a DDP communication hook that just calls + allreduce gives the same result with the case of no hook registered. + Without the then callback, the future_value in reducer is no longer + a PyObject, and this unit test verifies future_value is properly checked. + """ + process_group = self._get_process_group() + + def allreduce_hook( + state: object, bucket: dist.GradBucket + ) -> torch.futures.Future[torch.Tensor]: + tensors = [bucket.buffer() / self.world_size] + return ( + process_group.allreduce(tensors) + .get_future() + .then(lambda fut: fut.value()[0]) + ) + + # Get GPU model with allreduce_hook registered. + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, allreduce_hook, gradient_as_bucket_view, static_graph + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_default_ddp_comm_hooks_xccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether default Python DDP communication hooks ALLREDUCE, FP16_COMPRESS + and BF16_COMPRESS, can give the same result with the case of no hook registered. + """ + process_group = self._get_process_group() + + # For these default DDP comm hooks, the only state is process group. + state = process_group + hook_options = [default.allreduce_hook, default.fp16_compress_hook] + if c10d.is_xccl_available(): + hook_options.append(default.bf16_compress_hook) + for hook in hook_options: + # Get GPU model with the hook registered. + # The first arg 'process_group' is used for initializing the test environment, + # so it cannot be replaced by 'state', although they have the same value. + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, hook, gradient_as_bucket_view, state + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_fp16_compress_wrapper(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether wrapping the ALLREDUCE and POWER_SGD hooks with + the FP16_WRAPPER can give the same result as when there is no hook registered. + """ + process_group = self._get_process_group() + powerSGD_state = powerSGD.PowerSGDState(process_group=process_group) + + hook_args = [ + (powerSGD.powerSGD_hook, powerSGD_state), + (default.allreduce_hook, process_group), + ] + + for hook, state in hook_args: + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, + default.fp16_compress_wrapper(hook), + gradient_as_bucket_view, + state, + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_bf16_compress_wrapper(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether wrapping the ALLREDUCE and POWER_SGD hooks with + the BF16_WRAPPER can give the same result as when there is no hook registered. 
+ """ + process_group = self._get_process_group() + powerSGD_state = powerSGD.PowerSGDState(process_group=process_group) + + hook_args = [ + (powerSGD.powerSGD_hook, powerSGD_state), + (default.allreduce_hook, process_group), + ] + + for hook, state in hook_args: + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, + default.bf16_compress_wrapper(hook), + gradient_as_bucket_view, + state, + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_powerSGD_ddp_comm_hook_xccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether Python DDP communication hook POWER_SGD + can give the same result with the case of no hook registered. + """ + process_group = self._get_process_group() + + # Get GPU model with the hook registered. + # Test the hook with different algorithmic configs. + for use_error_feedback, warm_start, batch_tensors_with_same_shape in product( + [True, False], + [True, False], + [True, False], + ): + state = powerSGD.PowerSGDState( + process_group=process_group, + matrix_approximation_rank=1, + use_error_feedback=use_error_feedback, + warm_start=warm_start, + batch_tensors_with_same_shape=batch_tensors_with_same_shape, + ) + for hook in [powerSGD.powerSGD_hook, powerSGD.batched_powerSGD_hook]: + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, hook, gradient_as_bucket_view, state + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_builtin_ddp_comm_hooks_xccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether built-in C++ DDP communication hooks ALLREDUCE and FP16_COMPRESS + can give the same result with the case of no hook registered. + """ + process_group = self._get_process_group() + + for comm_hook_type in [ + dist.BuiltinCommHookType.ALLREDUCE, + dist.BuiltinCommHookType.FP16_COMPRESS, + ]: + # Get GPU model with the built-in communication hook. + gpu_model = self._gpu_model_with_builtin_ddp_comm_hook( + process_group, comm_hook_type, gradient_as_bucket_view + ) + + # check whether the grads are equal to what DDP without hook would return. 
+ self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_hook_xccl(self): + self._test_ddp_comm_hook_allreduce_hook_xccl() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_default_ddp_comm_hooks_xccl(self): + self._test_default_ddp_comm_hooks_xccl() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_fp16_compress_wrapper_xccl(self): + self._test_fp16_compress_wrapper() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_bf16_compress_wrapper_xccl(self): + self._test_bf16_compress_wrapper() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_builtin_ddp_comm_hooks_xccl(self): + self._test_builtin_ddp_comm_hooks_xccl() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_powerSGD_ddp_comm_hook_xccl(self): + self._test_powerSGD_ddp_comm_hook_xccl() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_hook_xccl_grad_is_view(self): + self._test_ddp_comm_hook_allreduce_hook_xccl(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_hook_xccl_static_graph(self): + self._test_ddp_comm_hook_allreduce_hook_xccl(static_graph=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_default_ddp_comm_hooks_xccl_is_view(self): + self._test_default_ddp_comm_hooks_xccl(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_fp16_compress_wrapper_is_view(self): + self._test_fp16_compress_wrapper(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_bf16_compress_wrapper_is_view(self): + self._test_bf16_compress_wrapper(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_builtin_ddp_comm_hooks_xccl_grad_is_view(self): + self._test_builtin_ddp_comm_hooks_xccl(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_powerSGD_ddp_comm_hook_xccl_grad_is_view(self): + self._test_powerSGD_ddp_comm_hook_xccl(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_with_then_hook_xccl(self): + """ + This unit test verifies whether a DDP communication hook that calls allreduce and then + multiplies the result by ten and divides by two gives the expected result. + """ + process_group = self._get_process_group() + + def allreduce_with_then_hook( + state: object, bucket: dist.GradBucket + ) -> torch.futures.Future[torch.Tensor]: + tensors = [bucket.buffer() / self.world_size] + fut = process_group.allreduce(tensors).get_future() + + def mult(fut): + # Multiply the result by 10. + return 10 * fut.value()[0] + + def div(fut): + # Divide the result by 2. + return 0.5 * fut.value() + + return fut.then(mult).then(div) + + # Get GPU model with allreduce_with_then_hook registered. + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, allreduce_with_then_hook + ) + + # check whether the grads are equal to what allreduce returns multiplied by 5. + # without the comm_hook, result would be still 0.25 * torch.ones(2, 2). 
+ self._run_and_verify_hook(gpu_model, 8, 1.25 * torch.ones(2, 2)) + + class AcceptsParam(torch.nn.Module): + def __init__(self, p, factor): + super().__init__() + self.a = p + self.f = factor + + def forward(self, input): + return input + self.a * self.f + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_weight_sharing(self): + process_group = self._get_process_group() + + size = 2048 * 2048 + dev = self.rank + world = self.world_size + + p = torch.nn.Parameter(torch.randn(size, requires_grad=True)) + + for try_set_to_none, use_bucket_view in product((False, True), (False, True)): + m = torch.nn.Sequential( + self.AcceptsParam(p, dev + 1), self.AcceptsParam(p, dev + 1) + ).xpu(dev) + + m = torch.nn.parallel.DistributedDataParallel( + m, + bucket_cap_mb=1, + gradient_as_bucket_view=use_bucket_view, + device_ids=[dev], + process_group=process_group, + ) + + for i in range(3): + m.zero_grad(set_to_none=try_set_to_none) + m(1).sum().backward() + + # Each param value is multiplied by "rank + 1" twice in forward, so the grad + # values produced by a particular rank should be 2. * (rank + 1). + # Summing these over ranks and dividing by world size gives the expected result: + analytic = torch.full_like( + p, 2.0 * (world * (world + 1.0) / 2.0) / world, device=dev + ) + for name, p in m.named_parameters(): + self.assertEqual( + p.grad, + analytic, + "mismatch at " + + name + + ".grad for " + + f"set_to_none = {try_set_to_none}, use_bucket_view = {use_bucket_view}", + ) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_packed_sequence(self): + """ + Tests that DDP with ``device_ids`` specified can run a forward and + backward pass with ``PackedSequence`` s with parity compared to a local + version of the model. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = dist.init_process_group( + "xccl", + world_size=self.world_size, + rank=self.rank, + store=store, + ) + seqs = ["sequence_sequence", "seq", "sequence"] + vocab = [""] + sorted({ch for seq in seqs for ch in seq}) + vectorized_seqs = [[vocab.index(tok) for tok in seq] for seq in seqs] + # Set the seed to make the embedding and LSTM deterministic (even + # across ranks since DDP broadcasts parameters from rank 0) + torch.manual_seed(0) + embed = nn.Embedding(len(vocab), 4) # keep on CPU + lstm = nn.LSTM(input_size=4, hidden_size=2, batch_first=True).to(self.rank) + lstm_ddp = DistributedDataParallel( + copy.deepcopy(lstm), + device_ids=[self.rank], + process_group=process_group, + ) + for p1, p2 in zip(lstm.parameters(), lstm_ddp.module.parameters()): + self.assertEqual(p1, p2) + seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs))) + seq_tensor = torch.Tensor( + torch.zeros((len(vectorized_seqs), seq_lengths.max())) + ).long() + for i, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)): + seq_tensor[i, :seq_len] = torch.LongTensor(seq) + seq_lengths, permutation_idx = seq_lengths.sort(0, descending=True) + seq_tensor = seq_tensor[permutation_idx] + embedded_seq_tensor = embed(seq_tensor) + packed_input = torch.nn.utils.rnn.pack_padded_sequence( + embedded_seq_tensor, + seq_lengths, + batch_first=True, + ) + packed_input_ddp = torch.nn.utils.rnn.pack_padded_sequence( + embedded_seq_tensor.detach().clone(), + seq_lengths, + batch_first=True, + ) + # Move the input to GPU explicitly for the local model + packed_output, (ht, ct) = lstm(packed_input.to(self.rank)) + # Let DDP move the input to GPU internally + packed_output_ddp, (ht_ddp, ct_ddp) = lstm_ddp(packed_input_ddp) + 
self.assertEqual(packed_output.data, packed_output_ddp.data) + self.assertEqual(ht, ht_ddp) + self.assertEqual(ct, ct_ddp) + packed_output.data.sum().backward() + packed_output_ddp.data.sum().backward() + for p1, p2 in zip(lstm.parameters(), lstm_ddp.parameters()): + self.assertEqual(p1.grad, p2.grad) + + # error: input dense tensor has to be contiguous + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_channels_last_contig(self): + process_group = self._get_process_group() + device = torch.device(f"xpu:{self.rank}") + tensor = torch.ones((2, 16, 768, 1152), dtype=torch.float32, device=device).to( + memory_format=torch.channels_last + ) + process_group.broadcast([tensor]).wait() + + +class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): + @property + def device(self): + return f"xpu:{self.rank}" + + def setUp(self): + super().setUp() + self._spawn_processes() + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + def _test_broadcast_coalesced(self, process_group, device, root_rank): + half = torch.float16 + + # No support for float16 for CPU tensors + if device == torch.device("cpu"): + half = torch.float32 + + target = torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) + target += torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float64, device=device).chunk(5) + target += torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) + + # The tensors to pass to broadcast are identical to the target + # only on the process that is the root of the broadcast. + if self.rank == root_rank: + tensors = [tensor.clone() for tensor in target] + else: + tensors = [torch.zeros_like(tensor) for tensor in target] + + if self.rank != root_rank: + self.assertNotEqual(tensors, target) + + c10d._broadcast_coalesced( + process_group, tensors, buffer_size=256, src=root_rank + ) + + if self.rank != root_rank: + self.assertEqual(tensors, target) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_broadcast_coalesced_xccl(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="xccl", store=store, rank=self.rank, world_size=self.world_size + ) + process_group = c10d.distributed_c10d._get_default_group() + device = torch.device("xpu:%d" % self.rank) + ranks = [0, 1] + for root_rank in ranks: + self._test_broadcast_coalesced(process_group, device, root_rank) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_all_reduce_coalesced_xccl(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="xccl", store=store, rank=self.rank, world_size=self.world_size + ) + process_group = c10d.distributed_c10d._get_default_group() + device = torch.device("xpu:%d" % self.rank) + tensors = [ + torch.full((60 + i,), self.rank + 1 + i, device=device, dtype=torch.float) + for i in range(5) + ] + torch.distributed.all_reduce_coalesced(tensors, group=process_group) + for i, t in enumerate(tensors): + self.assertEqual( + t, + torch.full_like( + t, self.world_size * (i + (self.world_size + 1.0) / 2.0) + ), + ) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_all_reduce_coalesced_manager_xccl(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="xccl", store=store, rank=self.rank, world_size=self.world_size + ) + process_group = 
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_all_reduce_coalesced_manager_xccl(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        c10d.init_process_group(
+            backend="xccl", store=store, rank=self.rank, world_size=self.world_size
+        )
+        process_group = c10d.distributed_c10d._get_default_group()
+        device = torch.device("xpu:%d" % self.rank)
+        tensors = [
+            torch.full((60 + i,), self.rank + 1 + i, device=device, dtype=torch.float)
+            for i in range(5)
+        ]
+        with torch.distributed._coalescing_manager(
+            group=process_group, device=device, async_ops=True
+        ) as cm:
+            for tensor in tensors:
+                torch.distributed.all_reduce(tensor)
+        self.assertEqual(len(cm.works), 1)
+        cm.wait()
+        for i, t in enumerate(tensors):
+            self.assertEqual(
+                t,
+                torch.full_like(
+                    t, self.world_size * (i + (self.world_size + 1.0) / 2.0)
+                ),
+            )
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_xccl_barrier(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        c10d.init_process_group(
+            backend="xccl", rank=self.rank, world_size=self.world_size, store=store
+        )
+
+        t = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+        c10d.all_reduce(t)
+        expected_tensor = torch.tensor([3] * 10).xpu(2 * self.rank)
+        self.assertEqual(expected_tensor, t)
+
+        # Test with new_group
+        pg = c10d.new_group([0, 1])
+        t = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+        pg.allreduce(t).wait()
+        self.assertEqual(expected_tensor, t)
+
+        pg = c10d.new_group([0])
+        if self.rank == 0:
+            t = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+            expected_tensor = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+            pg.allreduce(t).wait()
+            self.assertEqual(expected_tensor, t)
+
+        pg = c10d.new_group([1])
+        if self.rank == 1:
+            t = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+            expected_tensor = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+            pg.allreduce(t).wait()
+            self.assertEqual(expected_tensor, t)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_xccl_barrier_device_ids(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        c10d.init_process_group(
+            backend="xccl", rank=self.rank, world_size=self.world_size, store=store
+        )
+
+        c10d.barrier(device_ids=[self.rank])
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_xccl_barrier_device_ids_function_argument(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        c10d.init_process_group(
+            backend="xccl", rank=self.rank, world_size=self.world_size, store=store
+        )
+
+        with self.assertRaisesRegex(TypeError, "Invalid function argument"):
+            c10d.barrier(device_ids=self.rank)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_reduce_scatter_base_k(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "xccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        output_tensor = torch.zeros(2, dtype=torch.int64).to(self.rank)
+        input_tensors = torch.arange(self.world_size * 2, dtype=torch.int64).to(
+            self.rank
+        )
+        input_tensors = torch.reshape(input_tensors, (self.world_size, 2))
+        dist.reduce_scatter_tensor(output_tensor, input_tensors)
+        self.assertEqual(output_tensor, input_tensors[self.rank] * self.world_size)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_reduce_scatter_tensor_coalesced(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "xccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        output_tensors = torch.zeros(2, 2).to(self.rank)
+        input_tensors = [torch.ones(2, 2).to(self.rank) for _ in range(self.world_size)]
+        with dist._coalescing_manager():
+            for i in range(self.world_size):
+                dist.reduce_scatter_tensor(output_tensors[i], input_tensors[i])
+        self.assertEqual(output_tensors, input_tensors[self.rank] * self.world_size)
+
+
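+# The object-collective tests below are parametrized over two ways of selecting the
+# target XPU device: an explicit torch.xpu.set_device call, or the device= argument
+# of the collective itself.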
+class SetDeviceMethod(Enum):
+    TORCH_XPU_SET = auto()  # torch.xpu.set_device
+    COLLECTIVE_ARGUMENT = auto()  # broadcast_object_list(device=)
+
+
+class XCCLProcessGroupWithDispatchedCollectivesTests(
+    test_c10d_common.ProcessGroupWithDispatchedCollectivesTests
+):
+    @requires_xccl()
+    @skip_if_lt_x_gpu(1)
+    def test_collectives(self):
+        self._test_collectives(backend="xccl")
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(1)
+    def test_allreduce_coalesced(self):
+        self._test_allreduce_coalesced(backend="xccl")
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(1)
+    def test_all_to_all_single(self):
+        self._test_all_to_all_single(backend="xccl")
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(1)
+    def test_allgather_base(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "xccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        device = "xpu"
+        tensor = torch.ones(10, 10, device=torch.device(device))
+        output_tensor = torch.zeros(10, 10, device=torch.device(device))
+        dist.all_gather_into_tensor(output_tensor, tensor)
+        self.assertEqual(output_tensor, tensor)
+
+
+class LargeCommTest(test_c10d_common.AbstractLargeCommTest, MultiProcessTestCase):
+    def setUp(self):
+        super().setUp()
+        self._spawn_processes()
+
+    def tearDown(self):
+        super().tearDown()
+        try:
+            os.remove(self.file_name)
+        except OSError:
+            pass
+
+    @property
+    def device(self):
+        return self.rank
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_new_group_local_sync(self):
+        self._test_new_group_local_sync(backend="xccl")
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_new_group_local_sync_sanity_check(self):
+        self._test_new_group_local_sync_sanity_check(backend="xccl")
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_new_group_local_sync_duplicated_pg(self):
+        self._test_new_group_local_sync_duplicate_pg(backend="xccl")
+
+    def _init_two_pg2_subgroups(self, world_size: int = 4):
+        if world_size != 4:
+            raise NotImplementedError(
+                f"need world size of 4 to get 2 subgroup PGs, but got world size of {world_size}"
+            )
+        store = c10d.FileStore(self.file_name, world_size)
+        c10d.init_process_group(
+            backend="xccl", store=store, rank=self.rank, world_size=world_size
+        )
+        # every rank creates the same sub groups,
+        # including sub groups that the current rank is not a member of
+        a_group = c10d.new_group([0, 1])
+        b_group = c10d.new_group([2, 3])
+        return a_group if self.rank < 2 else b_group
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_gather_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            # easier to write the test for exactly 4 GPUs, even if this test
+            # class is later expanded to 8 GPUs
+            return
+
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        device = torch.device("xpu:%d" % self.rank)
+        input = torch.ones((10,), device=device) * self.rank
+        if self.rank == 0 or self.rank == 2:
+            gather_list = [torch.empty_like(input) for _ in range(subgroup.size())]
+            torch.distributed.gather(
+                input,
+                gather_list=gather_list,
+                dst=self.rank,
+                group=subgroup,
+                async_op=False,
+            )
+            for src in range(len(gather_list)):
+                expected = (torch.ones_like(input) * self.rank) + src
+                self.assertEqual(gather_list[src], expected)
+        else:
+            torch.distributed.gather(
+                input,
+                gather_list=None,
+                dst=self.rank - 1,
+                group=subgroup,
+                async_op=False,
+            )
+
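+    # Object (pickle-based) variant of the gather test above; see the discrepancy
+    # notes below for the current gather_object requirements.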
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_gather_object_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            # easier to write the test for exactly 4 GPUs, even if this test
+            # class is later expanded to 8 GPUs
+            return
+
+        subgroup = self._init_two_pg2_subgroups(world_size)
+
+        # discrepancy #1
+        # the device has to be set explicitly, otherwise gather_object picks the
+        # wrong device from `current_device = _get_pg_default_device(group)`
+        torch.xpu.set_device(self.rank)
+
+        input = {"rank": self.rank}
+        if self.rank == 0 or self.rank == 2:
+            # discrepancy #2
+            # gather_object requires the output list to be pre-populated with
+            # placeholder objects; passing an empty list raises an error
+            gather_list = [{}, {}]
+            torch.distributed.gather_object(
+                input, object_gather_list=gather_list, dst=self.rank, group=subgroup
+            )
+            for src in range(len(gather_list)):
+                self.assertEqual(gather_list[src]["rank"], self.rank + src)
+        else:
+            torch.distributed.gather_object(
+                input, object_gather_list=None, dst=self.rank - 1, group=subgroup
+            )
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_reduce_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        device = torch.device("xpu:%d" % self.rank)
+        x = torch.ones((10,), device=device) * self.rank
+        if self.rank == 0 or self.rank == 2:
+            expected = x + torch.ones((10,), device=device) * (self.rank + 1)
+            c10d.reduce(x, dst=self.rank, group=subgroup, async_op=False)
+            self.assertEqual(x, expected)
+        else:
+            c10d.reduce(x, dst=self.rank - 1, group=subgroup, async_op=False)
+
+    # error: RuntimeError: Point-to-point communication as the first call is not supported now
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    @parametrize("async_op", [True, False])
+    def test_send_recv_subgroup(self, async_op):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        device = torch.device("xpu:%d" % self.rank)
+        if self.rank == 0 or self.rank == 2:
+            x = torch.empty((10,), device=device)
+            if async_op:
+                c10d.irecv(x, src=self.rank + 1, group=subgroup).wait()
+            else:
+                c10d.recv(x, src=self.rank + 1, group=subgroup)
+            expected = torch.ones((10,), device=device) * (self.rank + 1)
+            self.assertEqual(x, expected)
+        else:
+            x = torch.ones((10,), device=device) * self.rank
+            if async_op:
+                c10d.isend(x, dst=self.rank - 1, group=subgroup).wait()
+            else:
+                c10d.send(x, dst=self.rank - 1, group=subgroup)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_broadcast_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        device = torch.device("xpu:%d" % self.rank)
+        if self.rank == 0 or self.rank == 2:
+            x = torch.empty((10,), device=device)
+            c10d.broadcast(x, src=self.rank + 1, group=subgroup)
+            expected = torch.ones((10,), device=device) * (self.rank + 1)
+            self.assertEqual(x, expected)
+        else:
+            x = torch.ones((10,), device=device) * self.rank
+            c10d.broadcast(x, src=self.rank, group=subgroup)
+
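+    # send/recv of picklable object lists over the 2-rank subgroups, parametrized
+    # over both device-selection methods in SetDeviceMethod.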
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    @parametrize(
+        "set_device",
+        [SetDeviceMethod.TORCH_XPU_SET, SetDeviceMethod.COLLECTIVE_ARGUMENT],
+    )
+    def test_send_recv_object_list_subgroup(self, set_device: SetDeviceMethod):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        if set_device == SetDeviceMethod.TORCH_XPU_SET:
+            torch.xpu.set_device(self.rank)
+            device = None
+        else:
+            device = torch.device("xpu:%d" % self.rank)
+        if self.rank == 0 or self.rank == 2:
+            x = [{}]
+            c10d.recv_object_list(x, src=self.rank + 1, group=subgroup, device=device)
+            expected = [{"rank": self.rank + 1}]
+            self.assertEqual(x, expected)
+        else:
+            x = [{"rank": self.rank}]
+            c10d.send_object_list(x, dst=self.rank - 1, group=subgroup, device=device)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    @parametrize(
+        "set_device",
+        [SetDeviceMethod.TORCH_XPU_SET, SetDeviceMethod.COLLECTIVE_ARGUMENT],
+    )
+    def test_broadcast_object_list_subgroup(self, set_device: SetDeviceMethod):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        if set_device == SetDeviceMethod.TORCH_XPU_SET:
+            torch.xpu.set_device(self.rank)
+            device = None
+        else:
+            device = torch.device("xpu:%d" % self.rank)
+        if self.rank == 0 or self.rank == 2:
+            x = [{}]
+            c10d.broadcast_object_list(
+                x, src=self.rank + 1, group=subgroup, device=device
+            )
+            expected = [{"rank": self.rank + 1}]
+            self.assertEqual(x, expected)
+        else:
+            x = [{"rank": self.rank}]
+            c10d.broadcast_object_list(x, src=self.rank, group=subgroup, device=device)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_scatter_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        device = torch.device("xpu:%d" % self.rank)
+        x = torch.empty((10,), device=device)
+        expected = torch.ones((10,), device=device) * self.rank
+        if self.rank == 0 or self.rank == 2:
+            c10d.scatter(x, scatter_list=None, src=self.rank + 1, group=subgroup)
+        else:
+            scatter_list = [
+                torch.ones((10,), device=device) * (self.rank - 1),
+                torch.ones((10,), device=device) * self.rank,
+            ]
+            c10d.scatter(x, scatter_list=scatter_list, src=self.rank, group=subgroup)
+        self.assertEqual(x, expected)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_scatter_object_list_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        torch.xpu.set_device(self.rank)
+        scatter_object_output_list = [None]
+        expected = [{"rank": self.rank}]
+        if self.rank == 0 or self.rank == 2:
+            c10d.scatter_object_list(
+                scatter_object_output_list=scatter_object_output_list,
+                scatter_object_input_list=None,
+                src=self.rank + 1,
+                group=subgroup,
+            )
+
+        else:
+            scatter_object_input_list = [
+                {"rank": self.rank - 1},
+                {"rank": self.rank},
+            ]
+            c10d.scatter_object_list(
+                scatter_object_output_list=scatter_object_output_list,
+                scatter_object_input_list=scatter_object_input_list,
+                src=self.rank,
+                group=subgroup,
+            )
+        self.assertEqual(scatter_object_output_list, expected)
+
+
+instantiate_parametrized_tests(LargeCommTest)
+
+if __name__ == "__main__":
+    run_tests()