From c0e1dc1c864840da352de3fac11b6fd717979479 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 29 Aug 2024 09:28:58 +0000 Subject: [PATCH 01/96] Happy Init --- CMakeLists.txt | 6 + caffe2/CMakeLists.txt | 11 + caffe2/core/macros.h.in | 1 + cmake/Dependencies.cmake | 16 + cmake/External/xccl.cmake | 13 + cmake/Modules/FindXCCL.cmake | 68 ++ cmake/Summary.cmake | 5 + setup.py | 4 + torch/CMakeLists.txt | 7 + .../distributed/c10d/ProcessGroupXCCL.cpp | 356 +++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 140 +++ torch/csrc/distributed/c10d/XCCLUtils.hpp | 334 +++++++ torch/csrc/xpu/xccl.cpp | 923 ++++++++++++++++++ torch/csrc/xpu/xccl.h | 112 +++ 14 files changed, 1996 insertions(+) create mode 100644 cmake/External/xccl.cmake create mode 100644 cmake/Modules/FindXCCL.cmake create mode 100644 torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp create mode 100644 torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp create mode 100644 torch/csrc/distributed/c10d/XCCLUtils.hpp create mode 100644 torch/csrc/xpu/xccl.cpp create mode 100644 torch/csrc/xpu/xccl.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5139c0a478e788..89ef59681bfff4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -275,6 +275,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) cmake_dependent_option(USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_XCCL "Use XCCL" ON + "USE_XPU;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" @@ -353,6 +355,8 @@ cmake_dependent_option(USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option(USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) +cmake_dependent_option(USE_C10D_XCCL "USE C10D XCCL" ON + "USE_DISTRIBUTED;USE_XCCL" OFF) cmake_dependent_option(USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( @@ -365,6 +369,8 @@ cmake_dependent_option( USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option( USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) +cmake_dependent_option( + USE_C10D_XCCL "USE C10D XCCL" ON "USE_DISTRIBUTED;USE_XCCL" OFF) cmake_dependent_option( USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 8ed93cdff0479c..2c4da5fd50f10c 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1057,6 +1057,10 @@ if(USE_XPU) # 2. Using add_custom_command in torch-xpu-ops to define sycl device sources # compilation. add_custom_command requires an explicit dependency. 
     list(APPEND ${Caffe2_XPU_INCLUDE} ${TORCH_XPU_OPS_DIR}/src/ATen/)
+    # if(USE_XCCL)
+    #   list(APPEND Caffe2_GPU_SRCS
+    #     ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp)
+    # endif()
     set(TORCH_XPU_OPS_PYTORCH_DEPS ATEN_CPU_FILES_GEN_TARGET)
     add_subdirectory(${TORCH_ROOT}/third_party/torch-xpu-ops
@@ -1065,6 +1069,10 @@ if(USE_XPU)
       message(WARNING "Failed to include ATen XPU implementation target")
     else()
       target_link_libraries(torch_xpu PRIVATE torch_xpu_ops)
+      if(USE_XCCL)
+        target_link_libraries(torch_xpu PRIVATE __caffe2_xccl)
+        target_compile_definitions(torch_xpu PRIVATE USE_XCCL)
+      endif()
       if(MSVC)
         # Windows
         target_link_libraries(torch_xpu PRIVATE
@@ -1365,6 +1373,9 @@ if(USE_DISTRIBUTED)
       target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
     endif()
   endif()
+  if(USE_C10D_XCCL)
+    target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL)
+  endif()
   if(USE_MPI AND USE_C10D_MPI)
     if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
       set_source_files_properties(
diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in
index 2929f105b31faa..e5398a83cad947 100644
--- a/caffe2/core/macros.h.in
+++ b/caffe2/core/macros.h.in
@@ -45,6 +45,7 @@
   {"USE_CUDNN", "${USE_CUDNN}"}, \
   {"CUDNN_VERSION", "${CUDNN_VERSION}"}, \
   {"USE_NCCL", "${USE_NCCL}"}, \
+  {"USE_XCCL", "${USE_XCCL}"}, \
   {"USE_MPI", "${USE_MPI}"}, \
   {"USE_GFLAGS", "${USE_GFLAGS}"}, \
   {"USE_GLOG", "${USE_GLOG}"}, \
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index ef33a3165340c1..49fb525afbf8a8 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1150,6 +1150,22 @@ if(USE_CUDA)
   include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
 endif()
 
+# ---[ XCCL
+if(USE_XCCL)
+  if(NOT USE_XPU)
+    message(WARNING
+      "Not using XPU, so disabling USE_XCCL. Suppress this warning with "
+      "-DUSE_XCCL=OFF.")
+    caffe2_update_option(USE_XCCL OFF)
+  elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    message(WARNING "USE_XCCL is currently only supported under Linux.")
+    caffe2_update_option(USE_XCCL OFF)
+  else()
+    include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake)
+    list(APPEND Caffe2_XPU_DEPENDENCY_LIBS __caffe2_xccl)
+  endif()
+endif()
+
 if(USE_DISTRIBUTED AND USE_TENSORPIPE)
   if(MSVC)
     message(WARNING "Tensorpipe cannot be used on Windows.")
diff --git a/cmake/External/xccl.cmake b/cmake/External/xccl.cmake
new file mode 100644
index 00000000000000..d1e8f33881b80b
--- /dev/null
+++ b/cmake/External/xccl.cmake
@@ -0,0 +1,13 @@
+if(NOT __XCCL_INCLUDED)
+  set(__XCCL_INCLUDED TRUE)
+
+  if(USE_XCCL)
+    # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake.
+    find_package(XCCL REQUIRED)
+    if(XCCL_FOUND)
+      add_library(__caffe2_xccl INTERFACE)
+      target_link_libraries(__caffe2_xccl INTERFACE ${XCCL_LIBRARY})
+      target_include_directories(__caffe2_xccl INTERFACE ${XCCL_INCLUDE_DIR})
+    endif()
+  endif()
+endif()
diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake
new file mode 100644
index 00000000000000..3f30e8cd23d6e7
--- /dev/null
+++ b/cmake/Modules/FindXCCL.cmake
@@ -0,0 +1,68 @@
+# This will define the following variables:
+# XCCL_FOUND : True if the system has the XCCL library.
+# XCCL_INCLUDE_DIR : Include directories needed to use XCCL.
+# XCCL_LIBRARY_DIR : The path to the XCCL library.
+# XCCL_LIBRARY : XCCL library full name.
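+#
+# A minimal usage sketch (illustration only; it assumes the oneAPI environment
+# script has been sourced so that the CCL_ROOT environment variable points at
+# the oneCCL installation, and "my_target" is a placeholder target name):
+#
+#   find_package(XCCL REQUIRED)
+#   target_include_directories(my_target PRIVATE ${XCCL_INCLUDE_DIR})
+#   target_link_libraries(my_target PRIVATE ${XCCL_LIBRARY})
+#
+# Only CCL_ROOT is consulted; default system paths are deliberately excluded
+# via NO_DEFAULT_PATH below.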
+ +include(FindPackageHandleStandardArgs) + +set(XCCL_ROOT "") +if(DEFINED ENV{CCL_ROOT}) + set(XCCL_ROOT $ENV{CCL_ROOT}) +endif() + +string(COMPARE EQUAL "${XCCL_ROOT}" "" nosyclfound) +if(nosyclfound) + set(XCCL_FOUND False) + set(XCCL_REASON_FAILURE "XCCL library not set!!") + set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}") + return() +endif() + +# Find include path from binary. +find_file( + XCCL_INCLUDE_DIR + NAMES include + HINTS ${XCCL_ROOT} + NO_DEFAULT_PATH +) + +# Find include/sycl path from include path. +find_file( + XCCL_INCLUDE_ONEAPI_DIR + NAMES oneapi + HINTS ${XCCL_ROOT}/include/ + NO_DEFAULT_PATH +) + +list(APPEND XCCL_INCLUDE_DIR ${XCCL_INCLUDE_ONEAPI_DIR}) + +# Find library directory from binary. +find_file( + XCCL_LIBRARY_DIR + NAMES lib + HINTS ${XCCL_ROOT} + NO_DEFAULT_PATH +) + +# Find XCCL library fullname. +find_library( + XCCL_LIBRARY + NAMES ccl + HINTS ${XCCL_LIBRARY_DIR} + NO_DEFAULT_PATH +) + +if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY)) + set(XCCL_FOUND False) + set(XCCL_REASON_FAILURE "XCCL library is incomplete!!") + set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}") + return() +endif() + +find_package_handle_standard_args( + XCCL + FOUND_VAR XCCL_FOUND + REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY + REASON_FAILURE_MESSAGE "${XCCL_REASON_FAILURE}" +) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index d51c451589c2c4..0b601cf2a6a329 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -153,6 +153,11 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}") endif() message(STATUS " USE_ITT : ${USE_ITT}") + message(STATUS " USE_XCCL : ${USE_XCCL}") + if(${USE_XCCL}) + message(STATUS " XCCL include path : ${XCCL_INCLUDE_DIR}") + message(STATUS " XCCL library : ${XCCL_LIBRARY}") + endif() message(STATUS " USE_NCCL : ${USE_NCCL}") if(${USE_NCCL}) message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}") diff --git a/setup.py b/setup.py index 92f1e2ddc7bcd3..e6191c0616db4a 100644 --- a/setup.py +++ b/setup.py @@ -645,6 +645,10 @@ def run(self): report("-- Building NCCL library") else: report("-- Not using NCCL") + if cmake_cache_vars["USE_XCCL"]: + report("-- Building XCCL library") + else: + report("-- Not using XCCL") if cmake_cache_vars["USE_DISTRIBUTED"]: if IS_WINDOWS: report("-- Building without distributed package") diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index bb949a081c95e9..8ab7d7aeb095b6 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -282,6 +282,9 @@ if(USE_DISTRIBUTED) if(USE_NCCL) list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) endif() + if(USE_XCCL) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_xccl) + endif() # Same for MPI. 
if(USE_MPI) list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX) @@ -345,6 +348,10 @@ if(BUILD_LIBTORCHLESS) target_compile_definitions(torch_python PRIVATE USE_C10D_NCCL) endif() + if(USE_XPU AND USE_C10D_XCCL) + target_compile_definitions(torch_python PRIVATE USE_C10D_XCCL) + endif() + if(USE_DISTRIBUTED) target_compile_definitions(torch_python PRIVATE USE_DISTRIBUTED) endif() diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp new file mode 100644 index 00000000000000..9466a0c091c99c --- /dev/null +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -0,0 +1,356 @@ +#include +#include +#include +#include + +#ifdef USE_C10D_XCCL +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10d { + +namespace { +std::map xcclOps = + { + {ReduceOp::MIN, ccl::reduction::min}, + {ReduceOp::MAX, ccl::reduction::max}, + {ReduceOp::SUM, ccl::reduction::sum}, + {ReduceOp::PRODUCT, ccl::reduction::prod}, + }; + +std::map xcclDatatypes = + { + {at::kByte, ccl::datatype::uint8}, + {at::kChar, ccl::datatype::int8}, + {at::kShort, ccl::datatype::int16}, + {at::kInt, ccl::datatype::int32}, + {at::kLong, ccl::datatype::int64}, + {at::kHalf, ccl::datatype::float16}, + {at::kFloat, ccl::datatype::float32}, + {at::kDouble, ccl::datatype::float64}, + {at::kBFloat16, ccl::datatype::bfloat16}, + {at::kBool, ccl::datatype::uint8}, + }; + +void check_gpu_single_tensor( + const at::Tensor& tensor +) { + if (!tensor.is_xpu() || tensor.is_sparse()) { + C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + } + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + } + } +} + +} // namespace + +namespace { + +ProcessGroupXCCL::WorkXCCL::WorkXCCL(std::vector> outputTensors, + int rank, + c10d::OpType opType, + const c10::optional>& inputTensors) + : Work(rank, opType, nullptr, inputTensors), + outputTensors_(std::move(outputTensors)), + future_(createFutureAsOutput(outputTensors) + ); + +c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL::getFuture() { + return future_; +} + +c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank, + int size) +{ + return c10::make_intrusive(store, rank, size); +} + +c10::intrusive_ptr ProcessGroupNCCL::initWork( + at::Device& device, + int rank, + OpType opType, + const std::vector& inputs, + const std::vector& outputs, + bool record) { + auto r = c10::make_intrusive( + device, + rank, + opType, + seqCollective_, + profilingTitle, + profilingTitle != nullptr ? std::optional>(inputs) + : std::nullopt, + desyncDebug_, + enableTiming_.load(), + dist_debug_level_); + if (record) { + bool isP2P = isP2POp(opType); + r->trace_id_ = NCCLTraceBuffer::get()->record( + local_id_, + std::make_tuple(pg_uid_, pg_desc_), + seqCollective_, + seqP2P_, + op_id_, + profilingTitle ? 
profilingTitle : "", + inputs, + outputs, + r->ncclStartEvent_.get(), + r->ncclEndEvent_.get(), + options_->timeout, + pgStatus_, + isP2P); + } + return r; +} + +ProcessGroupXCCL::~ProcessGroupXCCL() +{ +} + +std::shared_ptr ProcessGroupXCCL::getXCCLComm( + const std::string& deviceKey, + at::Device& device) { + + if (deviceKey.empty()) { + C10_THROW_ERROR( + DistBackendError, + "Not able to create/get the CCL Communicator since " + "the devices are empty "); + } + + { + std::lock_guard lock(mutex_); + if (devXCCLCommMap_.find(deviceKey) != devXCCLCommMap_.end()) { + return devXCCLCommMap_[deviceKey]; + } + } + + std::shared_ptr xcclComm; + + XCCL_KVS kvs = get_kvs(rank_, store_); + + int numRanks, rank; + numRanks = getSize(); + rank = getRank(); + + ccl::vector_class> devs_rank; + c10::impl::VirtualGuardImpl impl(device.type()); + c10::Stream stream = impl.getStream(device); + auto q = get_sycl_queue(stream); + auto ctx = ccl::create_context(q.get_context()); + devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); + auto xcclComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + + { + std::lock_guard lock(mutex_); + inInitializationCommMap_.emplace(deviceKey, ncclComm); + } + + auto it = inInitializationCommMap_.find(deviceKey); + if (it != inInitializationCommMap_.end()) { + devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); + inInitializationCommMap_.erase(deviceKey); + + ncclCommDevIdxMapMutex.lock(); + ncclCommDevIdxMap.emplace(ncclComm, device.index()); + ncclCommDevIdxMapMutex.unlock(); + } + + it = devXCCLCommMap_.find(deviceKey); + TORCH_INTERNAL_ASSERT( + it != devXCCLCommMap_.end(), "Communicators not populated in cache!"); + + return it->second; +} + +template +c10::intrusive_ptr ProcessGroupNCCL::collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType) { + + auto device = input.device(); + const auto key = std::to_string(device.index()); + auto ncclComm = getXCCLComm(key, device); + + std::vector inputs{input}; + std::vector outputs{output}; + + auto work = + initWork(device, rank_, opType, profilingTitle, inputs, outputs, enqueue); + + // Store references to outputs to be used by WorkNCCL::result and operator<<. + work->outputs_ = + std::make_shared>(std::move(outputs)); + + if (avoidRecordStreams) { + work->stashed_for_allocator_safety_ = + std::make_shared>(); + work->stashed_for_allocator_safety_->push_back(input); + } + + at::cuda::OptionalCUDAGuard gpuGuard; + + // Start event should only be recorded before the ncclGroupStart() + if (work->timingEnabled_) { + work->ncclStartEvent_->record(ncclStream); + } + + pre(ncclStream, work); + + ncclComm_t comm = ncclComm->getNcclComm(); + + // Both `inputs' and `outputs' are created on a worker stream and used in + // different ncclStreams. Hence, both must record the ncclStream to + // prevent being freed before the collective finishes. + // + // We only record `inputs' here, and leave recording `outputs' to `fn' for + // operations where `inputs' and `outputs' are not the same. + // + // See [Sync Streams]. 
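+  // NOTE: the allocator/stream bookkeeping below still uses the CUDA/NCCL
+  // helpers this function was adapted from; an XPU build would presumably
+  // substitute the XPU caching allocator and the XCCL stream here.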
+ if (!avoidRecordStreams) { + if (!input.is_sparse()) { + c10::cuda::CUDACachingAllocator::recordStream( + input.storage().data_ptr(), ncclStream); + } else { + // for sparse input case record streams on both index and value + // tensors + c10::cuda::CUDACachingAllocator::recordStream( + input.values().storage().data_ptr(), ncclStream); + c10::cuda::CUDACachingAllocator::recordStream( + input.indices().storage().data_ptr(), ncclStream); + } + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + C10D_NCCL_CHECK( + fn(input, output, comm, ncclStream), + ncclComm->getNcclCommFailureReason()); +#else + C10D_NCCL_CHECK_TIMEOUT( + fn(input, output, comm, ncclStream), + comm, + ncclComm->getNcclCommFailureReason()); +#endif + + post(ncclStream, work); + + // End event should only be recorded after the ncclGroupEnd() + if (!coalescing_state_) { + work->ncclEndEvent_->record(ncclStream); + } + work->ncclComm_ = ncclComm; + + { + c10::cuda::CUDAMultiStreamGuard streamGuard(ncclStream); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + + // Add a callback that runs profiling end callbacks. wrapCallback() in CUDA + // future blocks the stream this callback runs on the corresponding + // ncclEndEvents_ ensuring appropriate synchronization. + if (work->recordFunctionEndCallback_) { + work->future_->addCallback( + [work](at::ivalue::Future& /* unused */) { + work->recordFunctionEndCallback_(); + }, + // uses_future = false allows us to skip synchronization in + // ivalue::Future, but is only valid as long as the lambda doesn't use + // the "Future" argument. + /*uses_future=*/false); + } + work->future_->markCompleted(at::IValue(*work->outputs_)); + } + + // Set appropriate work parameters. + work->blockingWait_ = blockingWait_; + work->avoidRecordStreams_ = avoidRecordStreams; + work->opTimeout_ = options_->timeout; + work->store_ = store_; + // Record size info for debug. 
We only record the size on the first device as + // multi-device per process is deprecated + work->numelIn_ = input.numel(); + work->numelOut_ = output.numel(); + + // Notify graphs before we check the capture status preemptively + at::cuda::CUDAGraph::inc_pending_event_queries(); + if (enqueue) { + workEnqueue(work); + } else { + at::cuda::CUDAGraph::dec_pending_event_queries(); + } + + return work; +} + +c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( + at::Tensor& tensor, + const AllreduceOptions& opts) { + return collective( + tensor, + tensor, + [&](at::Tensor& input, + at::Tensor& output, + ncclComm_t comm, + at::cuda::CUDAStream& stream) { + auto ncclDataType = getNcclDataType(input.scalar_type()); + auto ncclReduceOp = + getNcclReduceOp(opts.reduceOp, input, ncclDataType, comm); + return ncclAllReduce( + input.data_ptr(), + output.data_ptr(), + input.numel(), + ncclDataType, + ncclReduceOp, + comm, + stream.stream()); + }, + OpType::ALLREDUCE, + "nccl:all_reduce"); +} + +c10::intrusive_ptr ProcessGroupXCCL::allreduce( + std::vector& tensors, + const AllreduceOptions& opts) +{ + TORCH_CHECK(tensors.size() == 1, "Expecting one tensor only but got multiple"); + auto tensor = tensors.back(); + check_gpu_single_tensor(tensor); + if (opts.reduceOp == ReduceOp::SUM) { + TORCH_CHECK(false, "Cannot use ReduceOp SUM with XPU") + } + return allreduce_impl(tensor, opts); +} + + +} + +} \ No newline at end of file diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp new file mode 100644 index 00000000000000..39f3c1a5e89964 --- /dev/null +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -0,0 +1,140 @@ +#pragma once + +#if defined(__linux__) +#include +#include +#include +#include +#endif + +#ifdef USE_C10D_XCCL + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace c10d { + +constexpr const char* XCCL_BACKEND_NAME = "xccl"; + +class ProcessGroupXCCL : public Backend { +public: + class WorkXCCL : public Work { + public: + WorkXCCL( + std::vector> outputTensors, + int rank = -1, + OpType opType = UNKNOWN, + const c10::optional>& inputTensors = c10::nullopt) + : outputTensors_(std::move(outputTensors)) {} + + WorkXCCL(const WorkXCCL& w) + : outputTensors_(w.outputTensors_), events_(w.events_) {} + + ~WorkXCCL() override { + // Ensures all events are properly handled before destruction + for (auto& event : events_) { + event.wait(); + } + } + + bool isCompleted() override { + for (const auto& event : events_) { + if (!event.test()) { + return false; + } + } + return true; + } + + bool isSuccess() const override { + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); + } + + void abort() override { + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); + } + + void synchronize() override { + for (auto& event : events_) { + event.wait(); + } + } + + void wait() override { + for (auto& event : events_) { + call_with_lock(globalMutex, [&]() { + CCL_CHECK(event.wait()); + }); + } + events_.clear(); + } + + c10::intrusive_ptr getFuture() override; + + std::vector result() override { + return outputTensors_.empty() ? 
std::vector() : outputTensors_[0]; + } + + protected: + friend class ProcessGroupXCCL; + std::vector events_; + const std::vector> outputTensors_; + c10::intrusive_ptr future_; + }; + + explicit ProcessGroupXCCL(const c10::intrusive_ptr& store, + int rank, + int size) + : store_(store), rank_(rank), size_(size) { + } + + virtual ~ProcessGroupXCCL(); + + const std::string getBackendName() const override { + return std::string(XCCL_BACKEND_NAME); + } + + c10::intrusive_ptr allreduce( + std::vector& tensors, + const AllreduceOptions& opts = AllreduceOptions()) override; + + c10::intrusive_ptr barrier( + const BarrierOptions& opts = BarrierOptions()) override; + + static c10::intrusive_ptr createProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank = -1, + int size = -1); + +private: + int rank_; + int size_; + +public: + std::unordered_map> + inInitializationCommMap_; + std::unordered_map> devXCCLCommMap_; + c10::intrusive_ptr store_; + std::mutex mutex_; +}; + +} // namespace c10d + +#endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/XCCLUtils.hpp b/torch/csrc/distributed/c10d/XCCLUtils.hpp new file mode 100644 index 00000000000000..d52f3df8ea466d --- /dev/null +++ b/torch/csrc/distributed/c10d/XCCLUtils.hpp @@ -0,0 +1,334 @@ +#pragma once + +#ifdef USE_C10D_XCCL + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +// RAII wrapper for NCCL communicator +class XCCLComm { + public: + explicit XCCLComm(ncclComm_t ncclComm) + : ncclComm_(ncclComm), + aborted_(false), + ncclAsyncErr_(ncclSuccess), + commFailureReason_(c10::nullopt), + initialized_(false) {} + + NCCLComm() : NCCLComm(nullptr) {} + + ~NCCLComm() noexcept { + // Add lock in this destructor, as aborted_ needs to be read after memory + // barrier here. + std::unique_lock lock(mutex_); + if (ncclComm_ && !aborted_) { +#ifdef ENABLE_NCCL_ERROR_CHECKING + // Use ncclCommAbort instead of ncclCommDestroy here since + // ncclCommDestroy could block forever waiting for work to complete on + // the communicator. 
+ C10D_NCCL_ASSERT(::ncclCommAbort(ncclComm_)); +#else + C10D_NCCL_ASSERT(::ncclCommDestroy(ncclComm_)); +#endif + } + } + + static std::shared_ptr create( + int numRanks, + int rank, + ncclUniqueId commId) { + auto comm = std::make_shared(); + C10D_NCCL_CHECK( + ncclCommInitRank(&(comm->ncclComm_), numRanks, commId, rank), + c10::nullopt); + comm->ncclId_ = commId; + comm->rank_ = rank; + comm->initialized_ = true; + return comm; + } + +#ifdef NCCL_HAS_COMM_NONBLOCKING + static std::shared_ptr create( + int numRanks, + int rank, + ncclUniqueId commId, + ncclConfig_t& config) { + auto comm = std::make_shared(); + bool isInitialized = false; + if (nccl_use_nonblocking()) { + config.blocking = 0; + LOG(INFO) << "Rank " << rank + << ": creating NCCL communicator in nonblocking mode"; + C10D_NCCL_CHECK_NONBLOCKING( + ncclCommInitRankConfig( + &(comm->ncclComm_), numRanks, commId, rank, &config), + c10::nullopt); + } else { + C10D_NCCL_CHECK( + ncclCommInitRankConfig( + &(comm->ncclComm_), numRanks, commId, rank, &config), + c10::nullopt); + // under blocking mode, comm is initialized after NCCL CHECK + isInitialized = true; + } + comm->ncclId_ = commId; + comm->rank_ = rank; + comm->initialized_ = isInitialized; + return comm; + } +#endif + +#ifdef NCCL_HAS_COMM_SPLIT + static std::shared_ptr split( + NCCLComm* source, + int color_id, + int rank, + ncclConfig_t& config) { + auto comm = std::make_shared(); + C10D_NCCL_CHECK( + ncclCommSplit( + source->ncclComm_, color_id, rank, &(comm->ncclComm_), &config), + c10::nullopt); + ++source->ncclCommSplitCounter_; + comm->rank_ = rank; + return comm; + } +#endif + +#if defined(IS_NCCL_EXP) && defined(NCCL_COMM_DUMP) + std::unordered_map ncclCommDump() { + std::unordered_map dump; + if (isAborted()) { + LOG(INFO) << "Communicator was aborted before trying to dump its state."; + return dump; + } + C10D_NCCL_CHECK(::ncclCommDump(ncclComm_, dump), c10::nullopt); + return dump; + } +#endif + + ncclUniqueId getNcclId() { + return ncclId_; + } + + // Must not be copyable + NCCLComm(const NCCLComm&) = delete; + NCCLComm& operator=(const NCCLComm&) = delete; + + // Do not support move assignment as there is no valid use case + NCCLComm& operator=(NCCLComm&& other) = delete; + + // Move constructable + NCCLComm(NCCLComm&& other) { + // Using other's lock, as it reads other's states + // Can not use this.mutex_, as this object is being constructed. + std::unique_lock lock(other.mutex_); + std::swap(ncclComm_, other.ncclComm_); + std::swap(aborted_, other.aborted_); + std::swap(ncclAsyncErr_, other.ncclAsyncErr_); + std::swap(initialized_, other.initialized_); + } + + ncclComm_t getNcclComm(); + + c10::optional getNcclCommFailureReason() const { + std::unique_lock lock(mutex_); + return commFailureReason_; + } + + void ncclCommAbort( + c10::optional commFailureReason = c10::nullopt) { + std::unique_lock lock(mutex_); +#ifdef ENABLE_NCCL_ERROR_CHECKING + if (aborted_) { + // Should not abort twice. + return; + } + +#ifdef NCCL_HAS_COMM_REGISTER + // Deregister all registered segments before aborting. + for (auto& it : registeredSegmentHandles_) { + void* handle = it.second; + C10D_NCCL_CHECK( + ::ncclCommDeregister(ncclComm_, handle), + c10::str( + "Failed to deregister segment handle ", + handle, + " on ncclComm_ ", + ncclComm_)); + } + registeredSegmentHandles_.clear(); +#endif + + // Set true failure reason if provided by ProcessGroupNCCL (e.g. 
work + // timeout) + commFailureReason_ = commFailureReason; + LOG(INFO) << "Aborting ncclComm_ " << ncclComm_ << " with reason: " + << (commFailureReason ? *commFailureReason + : "No abort reason provided."); +#ifndef NCCL_HAS_COMM_NONBLOCKING + C10D_NCCL_CHECK(::ncclCommAbort(ncclComm_), commFailureReason_); +#else + C10D_NCCL_CHECK_TIMEOUT( + ::ncclCommAbort(ncclComm_), ncclComm_, commFailureReason_); +#endif + aborted_ = true; + ncclComm_ = nullptr; + + // Set an appropriate error so that we avoid using the communicator. + if (ncclAsyncErr_ == ncclSuccess) { + ncclAsyncErr_ = ncclSystemError; + } +#else + // This is a NOOP, if error checks are disabled. + return; +#endif + } + + bool isAborted() const { + std::unique_lock lock(mutex_); + return aborted_; + } + + uint64_t getCommSplitCounter() const { + return ncclCommSplitCounter_; + } + + ncclResult_t checkForNcclError() { + std::unique_lock lock(mutex_); +#ifdef ENABLE_NCCL_ERROR_CHECKING + if (ncclAsyncErr_ != ncclSuccess) { + return ncclAsyncErr_; + } + C10D_NCCL_CHECK( + ncclCommGetAsyncError(ncclComm_, &ncclAsyncErr_), commFailureReason_); + return ncclAsyncErr_; +#else + // Always return success, if error checks are disabled. + return ncclSuccess; +#endif + } + + ncclResult_t registerSegment(void* ptr, size_t size) { + std::unique_lock lock(mutex_); +#ifdef NCCL_HAS_COMM_REGISTER + // We register only segments from cache allocator + // which are guaranteed to be with disjoint addr ranges. Thus, a ptr always + // maps to a unique handle and should not be registered before the current + // ptr is deregistered and freed. + TORCH_CHECK( + registeredSegmentHandles_.count(ptr) == 0, + "Segment with ptr ", + ptr, + " has already been registered on ncclComm_ ", + ncclComm_); + + void* handle; + C10D_NCCL_CHECK( + ncclCommRegister(ncclComm_, ptr, size, &handle), + c10::str( + "Failed to register segment with ptr ", + ptr, + ", size ", + size, + " on ncclComm_ ", + ncclComm_)); + registeredSegmentHandles_[ptr] = handle; + return ncclSuccess; +#else + return ncclInvalidUsage; +#endif + } + + ncclResult_t deregisterSegment(void* ptr) { + std::unique_lock lock(mutex_); +#ifdef NCCL_HAS_COMM_REGISTER + TORCH_CHECK( + registeredSegmentHandles_.count(ptr) == 1, + "Segment with ptr ", + ptr, + " is not registered on ncclComm_ ", + ncclComm_); + + void* handle = registeredSegmentHandles_[ptr]; + C10D_NCCL_CHECK( + ncclCommDeregister(ncclComm_, handle), + c10::str( + "Failed to deregister segment handle ", + handle, + ", with ptr ", + ptr, + " on ncclComm_ ", + ncclComm_)); + registeredSegmentHandles_.erase(ptr); + return ncclSuccess; +#else + return ncclInvalidUsage; +#endif + } + + friend class ProcessGroupNCCL; + + protected: + // a helper function to wait until the communicator is initialized; + void waitUntilInitialized(int timeoutSecs); + ncclComm_t ncclComm_; + // Unique nccl_id for this communicator. + ncclUniqueId ncclId_; + bool aborted_; + uint64_t ncclCommSplitCounter_{0}; + ncclResult_t ncclAsyncErr_; + mutable std::mutex mutex_; + // Rank that this communicator corresponds to. + int rank_; + // Optional reason for communicator failure, provided by ProcessGroupNCCL for + // better error messaging. + c10::optional commFailureReason_; + bool initialized_{false}; +#ifdef NCCL_HAS_COMM_REGISTER + // Stores handlers for tensors registered by NCCL + std::unordered_map registeredSegmentHandles_; +#endif +}; + +// Helper that automatically cleans up premul sums. 
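+// A premul-sum reduction op is obtained from ncclRedOpCreatePreMulSum and has
+// to be released with ncclRedOpDestroy on the same communicator once the
+// collective using it has been issued; this wrapper ties that cleanup to
+// scope exit.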
+struct ncclRedOpRAII { + ncclRedOpRAII() = default; + ncclRedOpRAII(ncclRedOp_t op) : op_(op) {} + ncclRedOpRAII(ncclRedOp_t op, ncclComm_t comm) + : op_(op), comm_(comm), premul_sum_(true) {} + ncclRedOpRAII(const ncclRedOpRAII&) = delete; + ncclRedOpRAII& operator=(const ncclRedOpRAII&) = delete; + ncclRedOpRAII(ncclRedOpRAII&& tmp) : ncclRedOpRAII() { + std::swap(tmp.op_, this->op_); + std::swap(tmp.comm_, this->comm_); + std::swap(tmp.premul_sum_, this->premul_sum_); + } +#if defined(ENABLE_NCCL_PREMUL_SUM_SUPPORT) + ~ncclRedOpRAII() { + if (premul_sum_) { + ncclRedOpDestroy(op_, comm_); + } + } +#endif + operator ncclRedOp_t() const { + return op_; + } + ncclRedOp_t op_; + ncclComm_t comm_; + bool premul_sum_ = false; +}; + +} // namespace c10d + +#endif // USE_C10D_NCCL + diff --git a/torch/csrc/xpu/xccl.cpp b/torch/csrc/xpu/xccl.cpp new file mode 100644 index 00000000000000..5304b43f57d410 --- /dev/null +++ b/torch/csrc/xpu/xccl.cpp @@ -0,0 +1,923 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +xcclComm_t* to_xccl_comm(torch::xpu::xccl::xcclComm_t* var) { + return reinterpret_cast(var); +} + +xcclComm_t to_xccl_comm(torch::xpu::xccl::xcclComm_t var) { + return reinterpret_cast(var); +} + + +xcclDataType_t to_nccl_data_type(c10::ScalarType type) { + switch (type) { + case at::kFloat: + return ccl::datatype::float32; + case at::kHalf: + return ccl::datatype::float16; + case at::kDouble: + return ccl::datatype::float64; + case at::kLong: + return ccl::datatype::int64; + case at::kInt: + return ccl::datatype::int32; + case at::kChar: + return ccl::datatype::int8; + case at::kByte: + return ccl::datatype::uint8; + case at::kBool: + return ccl::datatype::uint8; + case at::kBFloat16: + return ccl::datatype::bfloat16; + default: + TORCH_CHECK(false, "Unconvertible XCCL type ", type); + } +} + +ncclDataType_t to_xccl_data_type(const at::Tensor& t) { + if (!t.is_xpu()) { + TORCH_CHECK( + false, + "XCCL only supports XPU tensors, but got a tensor on ", + t.device()); + } + return to_xccl_data_type(t.scalar_type()); +} + +ccl::reduction to_xccl_red_op(int var) { + return (ccl::reduction)(var); +} + +namespace torch::xpu::xccl { + +XCCL_KVS get_kvs(int rank, c10d::Store& store) { + if (kvs) + return kvs; + // Each process group is with different store, so we use the unique key for + // broadcast the bootstrap network information. 
+ std::string storeKey = "ccl_kvs"; + + // Rank 0 broadcast the bootstrap network information to other ranks + if (rank == 0) { + kvs = ccl::create_main_kvs(); + ccl::kvs::address_type main_addr = kvs->get_address(); + auto ccl_kvs_addr = std::vector(main_addr.begin(), main_addr.end()); + store.set(storeKey, ccl_kvs_addr); + } + else { + auto ccl_kvs_addr = store.get(storeKey); + if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { + throw std::runtime_error( + "Unexpected ccl kvs addr from the store\n"); + } + ccl::kvs::address_type main_addr; + std::copy_n(std::make_move_iterator(ccl_kvs_addr.begin()), + ccl::kvs::address_max_size, + main_addr.begin()); + kvs = ccl::create_kvs(main_addr); + } + + return kvs; +} + + +using namespace at; + +namespace detail { + +void xcclCommInitAll(xcclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { + for(int i = 0; i < nranks; i++) { + newcomm[i] = ccl::create_communicator(nranks, i, get_kvs_addr) + } + c10::Stream dpcpp_stream = impl.getStream(devices[0]); + ccl::vector_class> devs_rank; + newcomm = ccl::create_communicators(nranks, devs_rank, ctx, ) +} + +struct XcclCommList { + std::unique_ptr comms; + int ndevices; + XcclCommList(const std::vector& devices) + : comms(new xcclComm_t[devices.size()]), ndevices(devices.size()) { + xcclCommInitAll( + to_xccl_comm(comms.get()), devices.size(), devices.data()); + } + NcclCommList(NcclCommList&& foo) = default; + ~NcclCommList() { + if (comms) { + for (const auto i : c10::irange(ndevices)) { + comm_destroy(comms[i]); + } + } + } + ArrayRef ref() const { + return ArrayRef(comms.get(), ndevices); + } +}; + +using device_list = std::vector; +// accesses to this object have to be guarded by THC's CudaFreeMutex +std::unordered_map> _communicators; +static std::unordered_map> + _communicators; + +ArrayRef get_communicators(TensorList inputs) { + static auto get_device = [](const at::Tensor& t) -> int { + return t.get_device(); + }; + device_list devices = fmap(inputs, get_device); + auto it = _communicators.find(devices); + if (it == _communicators.end()) { + it = _communicators.emplace(devices, devices).first; + } + return it->second; +} + +static inline void check_tensor( + const at::Tensor& input, + const std::optional& output, + int input_multiplier, + int output_multiplier, + int64_t ref_numel, + ScalarType ref_dtype) { + auto check_one = [&](const at::Tensor& tensor) { + if (!tensor.is_xpu() || tensor.is_sparse()) { + throw std::runtime_error( + "input and output elements have to be xpu dense Tensors"); + } + + if (ref_dtype != tensor.scalar_type()) { + throw std::runtime_error( + "all inputs and outputs must be of the same Tensor dtype"); + } + + if (!tensor.is_contiguous()) { + throw std::runtime_error("all inputs and outputs have to be contiguous"); + } + }; + + check_one(input); + + // all inputs must be same size + if (input.numel() != ref_numel) { + throw std::runtime_error( + "all inputs must have the same number of elements"); + } + + if (output) { + check_one(*output); + + // inputs and outputs must be on same device respectively + if (input.get_device() != output->get_device()) { + throw std::runtime_error("input and output must be on the same device"); + } + + if (output->numel() * output_multiplier != ref_numel * input_multiplier) { + throw std::runtime_error( + "output must be of size input_size * size_multiplier"); + } + } +} + +void check_inputs( + TensorList inputs, + TensorList outputs, + int input_multiplier, + int output_multiplier) { + // len(inputs) == len(outputs) + 
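+  // input_multiplier/output_multiplier describe the size relationship that
+  // check_tensor enforces: output.numel() * output_multiplier must equal
+  // input.numel() * input_multiplier (e.g. all_gather below passes
+  // input_multiplier = len, reduce_scatter passes output_multiplier = len).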
size_t len = inputs.size(); + + if (len <= 0) { + throw std::runtime_error("input sequence can't be empty"); + } + + if (len != outputs.size()) { + std::stringstream err; + err << "inputs and outputs sequences have to be of the same length, but got input of length " + << len << " and output of length " << outputs.size(); + throw std::runtime_error(err.str()); + } + + device_set devices; + int64_t numel = inputs[0].numel(); + auto dtype = inputs[0].scalar_type(); + + for (const auto i : c10::irange(len)) { + auto input = inputs[i]; + auto output = outputs[i]; + + check_tensor( + input, output, input_multiplier, output_multiplier, numel, dtype); + + auto input_device = input.get_device(); + // inputs must be on unique devices + if (devices.test(input_device)) { + throw std::runtime_error("inputs must be on unique devices"); + } + devices.set(input_device); + } +} + +void check_inputs( + TensorList inputs, + const at::Tensor& output, + int root, + int input_multiplier, + int output_multiplier) { + auto len = inputs.size(); + + if (len <= 0) { + throw std::runtime_error("input sequence can't be empty"); + } + + device_set devices; + int64_t numel = inputs[0].numel(); + auto dtype = inputs[0].scalar_type(); + + for (const auto i : c10::irange(len)) { + auto input = inputs[i]; + + check_tensor( + input, + i == static_cast>(root) + ? std::optional{output} + : std::nullopt, + input_multiplier, + output_multiplier, + numel, + dtype); + + auto input_device = input.get_device(); + // inputs must be on unique devices + if (devices.test(input_device)) { + throw std::runtime_error("inputs must be on unique devices"); + } + devices.set(input_device); + } +} + +} // namespace detail + +bool is_available(TensorList tensors) { +#ifdef USE_XCCL + device_set devices; + for (auto& tensor : tensors) { + if (!tensor.is_xpu() || tensor.is_sparse()) + return false; + if (!tensor.is_contiguous()) + return false; + auto device = tensor.get_device(); + if (devices[device]) + return false; + devices[device] = true; + } + return true; +#else + return false; +#endif +} + +std::uint64_t version() { +#if defined(NCCL_MAJOR) + constexpr std::uint64_t ver = (((uint64_t)NCCL_MAJOR) << 32) | + (((uint64_t)NCCL_MINOR) << 16) | ((uint64_t)NCCL_PATCH); + return ver; +#elif defined(USE_NCCL) + // return major version "1" + return ((uint64_t)1) << 32; +#else + return 0; +#endif +} + +ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank) { +#ifdef USE_XCCL + using namespace torch::xpu::xccl::detail; + xcclComm_t comm; + ncclUniqueId id = comm_id; + NCCL_CHECK(ncclCommInitRank( + to_nccl_comm(&comm), nranks, *(to_nccl_unique_id(&id)), rank)); + return comm; +#else + return nullptr; +#endif +} + + +namespace { +// NCCL changed the numerical type used for count between NCCL1 and NCCL2. +// So we use the following struct, which gets the type of the second argument +// of T, if T is a function type, with ncclBcast, to get that type statically +// and programmatically. + +template +struct GetSecondArgType; + +template +struct GetSecondArgType { + typedef typename std::decay::type type; +}; + +constexpr auto count_max = + std::numeric_limits::type>::max(); + +// Since NCCL 2.12.10, NCCL supports send/recv 0 byte: +// https://github.com/NVIDIA/nccl/issues/696. 
The issue of skipping send/recv +// is that it can cause deadlock when a rank send and recv 0 bytes so it's +// completely skipping the collective, causing mismatch across ranks +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR > 13))) +template +constexpr bool _nccl_should_send_recv(C10_UNUSED T _unused_) { + return true; +} +#else +// old NCCL uses 0 byte message for synchronization +// Avoid send/recv when message size is zero +template +inline bool _nccl_should_send_recv(T value) { + return value != 0; +} +#endif +} // namespace + +size_t get_max_count() { + return count_max; +} + +void broadcast( + TensorList tensors, + const stream_list& streams, + const comm_list& user_comms) { +#ifdef USE_NCCL + using namespace torch::cuda::nccl::detail; + check_inputs(tensors, tensors, 1, 1); + auto data_type = to_nccl_data_type(tensors[0]); + int64_t numel = tensors[0].numel(); + + const auto comms = user_comms.empty() ? get_communicators(tensors) + : ArrayRef(user_comms); + + AutoNcclGroup nccl_group_guard; + at::cuda::OptionalCUDAGuard device_guard; + for (size_t i = 0, num_tensors = tensors.size(); i < num_tensors; i++) { + auto device = tensors[i].get_device(); + device_guard.set_index(device); + // Default to the current stream + const auto stream = (streams.empty() || !streams[i]) + ? at::cuda::getCurrentCUDAStream(device).stream() + : streams[i]->stream(); + TORCH_CHECK( + static_cast(numel) <= static_cast(count_max), + "Broadcast tensor has ", + numel, + " elements, which exceeds the " + "maximum NCCL supports (", + count_max, + ")"); + ncclComm_t comm = comms[i]; + NCCL_CHECK(ncclBcast( + tensors[i].data_ptr(), + numel, + data_type, + 0, + to_nccl_comm(comm), + stream)); + } +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void reduce( + const std::vector& inputs, + at::Tensor& output, + int32_t root, + int32_t op, + const stream_list& streams, + const comm_list& user_comms) { +#ifdef USE_NCCL + using namespace torch::cuda::nccl::detail; + TORCH_CHECK( + root >= 0 && static_cast(root) < inputs.size(), "invalid root"); + + check_inputs(inputs, output, root, 1, 1); + const auto len = inputs.size(); + + auto data_type = to_nccl_data_type(inputs[0]); + + const auto count = inputs[0].numel(); + auto comms_ref = user_comms.empty() ? get_communicators(inputs) + : ArrayRef(user_comms); + + AutoNcclGroup nccl_group_guard; + at::cuda::OptionalCUDAGuard device_guard; + for (const auto i : c10::irange(len)) { + auto device = inputs[i].device().index(); + device_guard.set_index(device); + // Default to the current stream + const auto stream = (streams.empty() || !streams[i]) + ? at::cuda::getCurrentCUDAStream(device).stream() + : streams[i]->stream(); + + ncclComm_t comm = comms_ref[i]; + NCCL_CHECK(ncclReduce( + inputs[i].data_ptr(), + static_cast>(root) == i + ? 
output.data_ptr() + : nullptr, + count, + data_type, + to_nccl_red_op(op), + root, + to_nccl_comm(comm), + stream)); + } +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void reduce( + std::vector& inputs, + int32_t root, + int32_t op, + const stream_list& streams, + const comm_list& user_comms) { + reduce(inputs, /*output=*/inputs[root], root, op, streams, user_comms); +} + +void all_reduce( + const std::vector& inputs, + std::vector& outputs, + int32_t op, + const stream_list& streams, + const comm_list& user_comms) { +#ifdef USE_NCCL + using namespace torch::cuda::nccl::detail; + check_inputs(inputs, outputs, 1, 1); + const auto len = inputs.size(); + + auto data_type = to_nccl_data_type(inputs[0]); + + const auto count = inputs[0].numel(); + auto comms_ref = user_comms.empty() ? get_communicators(inputs) + : ArrayRef(user_comms); + + AutoNcclGroup nccl_group_guard; + at::cuda::OptionalCUDAGuard device_guard; + for (const auto i : c10::irange(len)) { + auto device = inputs[i].device().index(); + device_guard.set_index(device); + // Default to the current stream + const auto stream = (streams.empty() || !streams[i]) + ? at::cuda::getCurrentCUDAStream(device).stream() + : streams[i]->stream(); + + ncclComm_t comm = comms_ref[i]; + NCCL_CHECK(ncclAllReduce( + inputs[i].data_ptr(), + outputs[i].data_ptr(), + count, + data_type, + to_nccl_red_op(op), + to_nccl_comm(comm), + stream)); + } +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void reduce_scatter( + const std::vector& inputs, + std::vector& outputs, + int32_t op, + const stream_list& streams, + const comm_list& user_comms) { +#ifdef USE_NCCL + using namespace torch::cuda::nccl::detail; + const auto len = inputs.size(); + check_inputs(inputs, outputs, 1, len); + + auto data_type = to_nccl_data_type(inputs[0]); + + const auto count = inputs[0].numel() / len; + auto comms_ref = user_comms.empty() ? get_communicators(inputs) + : ArrayRef(user_comms); + + AutoNcclGroup nccl_group_guard; + at::cuda::OptionalCUDAGuard device_guard; + for (const auto i : c10::irange(len)) { + auto device = inputs[i].device().index(); + device_guard.set_index(device); + // Default to the current stream + const auto stream = (streams.empty() || !streams[i]) + ? at::cuda::getCurrentCUDAStream(device).stream() + : streams[i]->stream(); + + ncclComm_t comm = comms_ref[i]; + NCCL_CHECK(ncclReduceScatter( + inputs[i].data_ptr(), + outputs[i].data_ptr(), + count, + data_type, + to_nccl_red_op(op), + to_nccl_comm(comm), + stream)); + } +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void all_gather( + const std::vector& inputs, + std::vector& outputs, + const stream_list& streams, + const comm_list& user_comms) { +#ifdef USE_NCCL + using namespace torch::cuda::nccl::detail; + const auto len = inputs.size(); + check_inputs(inputs, outputs, len, 1); + + auto data_type = to_nccl_data_type(inputs[0]); + + const auto count = inputs[0].numel(); + auto comms_ref = user_comms.empty() ? get_communicators(inputs) + : ArrayRef(user_comms); + + AutoNcclGroup nccl_group_guard; + at::cuda::OptionalCUDAGuard device_guard; + for (const auto i : c10::irange(len)) { + auto device = inputs[i].device().index(); + device_guard.set_index(device); + // Default to the current stream + const auto stream = (streams.empty() || !streams[i]) + ? 
at::cuda::getCurrentCUDAStream(device).stream() + : streams[i]->stream(); + + ncclComm_t comm = comms_ref[i]; +#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) + NCCL_CHECK(ncclAllGather( + inputs[i].data_ptr(), + outputs[i].data_ptr(), + count, + data_type, + to_nccl_comm(comm), + stream)); +#else + NCCL_CHECK(ncclAllGather( + inputs[i].data_ptr(), + count, + data_type, + outputs[i].data_ptr(), + to_nccl_comm(comm), + stream)); +#endif + } +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void all2all_single_equal_split( + at::Tensor& input, + at::Tensor& output, + int size, + ncclComm_t _comm, + at::cuda::CUDAStream& stream) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; + + int numranks; + auto type = to_nccl_data_type(input); + size_t count = input.numel() / size; + size_t rankdiff = input.nbytes() / size; + const auto* sendbuff = reinterpret_cast(input.const_data_ptr()); + auto* recvbuff = reinterpret_cast(output.data_ptr()); + auto comm = to_nccl_comm(_comm); +#if defined(USE_ROCM) + NCCL_CHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream)); +#else + NCCL_CHECK(ncclCommCount(comm, &numranks)); + NCCL_CHECK(ncclGroupStart()); + for (const auto r : c10::irange(numranks)) { + if (_nccl_should_send_recv(count)) { + NCCL_CHECK( + ncclSend(sendbuff + r * rankdiff, count, type, r, comm, stream)); + NCCL_CHECK( + ncclRecv(recvbuff + r * rankdiff, count, type, r, comm, stream)); + } + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclGroupEnd()); +#else + NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); +#endif +#endif +#else + AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void all2all_single_unequal_split( + void* sendbuff, + const size_t* sendcounts, + const size_t* senddispls, + void* recvbuff, + const size_t* recvcounts, + const size_t* recvdispls, + size_t size, + c10::ScalarType _type, + ncclComm_t _comm, + at::cuda::CUDAStream& stream) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; + + auto type = to_nccl_data_type(_type); + auto comm = to_nccl_comm(_comm); + int numranks; + NCCL_CHECK(ncclCommCount(comm, &numranks)); + NCCL_CHECK(ncclGroupStart()); + for (const auto r : c10::irange(numranks)) { + if (_nccl_should_send_recv(sendcounts[r])) { + NCCL_CHECK(ncclSend( + ((char*)sendbuff) + senddispls[r] * size, + sendcounts[r], + type, + r, + comm, + stream)); + } + if (_nccl_should_send_recv(recvcounts[r])) { + NCCL_CHECK(ncclRecv( + ((char*)recvbuff) + recvdispls[r] * size, + recvcounts[r], + type, + r, + comm, + stream)); + } + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclGroupEnd()); +#else + NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); +#endif +#else + AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void all2all( + std::vector& outputTensors, + std::vector& inputTensors, + ncclComm_t _comm, + at::cuda::CUDAStream& stream) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; + auto comm = to_nccl_comm(_comm); + + NCCL_CHECK(ncclGroupStart()); + for (const auto r : c10::irange(outputTensors.size())) { + at::Tensor& input = 
inputTensors[r]; + at::Tensor& output = outputTensors[r]; + + if (_nccl_should_send_recv(input.numel())) { + NCCL_CHECK(ncclSend( + input.data_ptr(), + input.numel(), + to_nccl_data_type(input), + r, + comm, + stream.stream())); + } + if (_nccl_should_send_recv(output.numel())) { + NCCL_CHECK(ncclRecv( + output.data_ptr(), + output.numel(), + to_nccl_data_type(output), + r, + comm, + stream.stream())); + } + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclGroupEnd()); +#else + NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); +#endif +#else + AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void send( + const at::Tensor& input, + ncclComm_t comm, + at::cuda::CUDAStream stream, + int dst) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclSend( + input.data_ptr(), + input.numel(), + to_nccl_data_type(input), + dst, + to_nccl_comm(comm), + stream.stream())); +#else + NCCL_CHECK_TIMEOUT( + ncclSend( + input.data_ptr(), + input.numel(), + to_nccl_data_type(input), + dst, + to_nccl_comm(comm), + stream.stream()), + comm); +#endif +#else + AT_ERROR("Send is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void recv( + at::Tensor& output, + ncclComm_t comm, + at::cuda::CUDAStream stream, + int src) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclRecv( + output.data_ptr(), + output.numel(), + to_nccl_data_type(output), + src, + to_nccl_comm(comm), + stream.stream())); +#else + NCCL_CHECK_TIMEOUT( + ncclRecv( + output.data_ptr(), + output.numel(), + to_nccl_data_type(output), + src, + to_nccl_comm(comm), + stream.stream()), + comm); +#endif +#else + AT_ERROR("Recv is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void gather( + const at::Tensor& inputs, + std::vector& outputs, + ncclComm_t _comm, + at::cuda::CUDAStream& stream, + int32_t root) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; + + auto comm = to_nccl_comm(_comm); + int numranks, cur_rank; + NCCL_CHECK(ncclCommCount(comm, &numranks)); + NCCL_CHECK(ncclCommUserRank(comm, &cur_rank)); + + size_t count = inputs.numel(); + auto type = to_nccl_data_type(inputs); + const auto* sendbuff = reinterpret_cast(inputs.const_data_ptr()); + + NCCL_CHECK(ncclGroupStart()); + + if (cur_rank == root) { + for (const auto r : c10::irange(numranks)) { + if (r != root) { + auto* recvbuff = reinterpret_cast(outputs[r].data_ptr()); + NCCL_CHECK(ncclRecv(recvbuff, count, type, r, comm, stream)); + } else { + // on its own rank, simply copy from the input + outputs[r].copy_(inputs); + } + } + } else { + NCCL_CHECK(ncclSend(sendbuff, count, type, root, comm, stream)); + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclGroupEnd()); +#else + NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); +#endif + +#else + AT_ERROR("gather is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +void scatter( + const std::vector& 
inputs, + at::Tensor& outputs, + ncclComm_t _comm, + at::cuda::CUDAStream& stream, + int32_t root) { +#ifdef USE_NCCL +#if defined(NCCL_MAJOR) && \ + ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) + using namespace torch::cuda::nccl::detail; + + auto comm = to_nccl_comm(_comm); + int numranks, cur_rank; +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclCommCount(comm, &numranks)); + NCCL_CHECK(ncclCommUserRank(comm, &cur_rank)); +#else + NCCL_CHECK_TIMEOUT(ncclCommCount(comm, &numranks), _comm); + NCCL_CHECK_TIMEOUT(ncclCommUserRank(comm, &cur_rank), _comm); +#endif + NCCL_CHECK(ncclGroupStart()); + if (cur_rank == root) { + for (const auto r : c10::irange(numranks)) { + if (r != root) { + size_t send_count = inputs[r].numel(); + auto send_type = to_nccl_data_type(inputs[r]); + const auto* sendbuff = + reinterpret_cast(inputs[r].const_data_ptr()); + NCCL_CHECK(ncclSend(sendbuff, send_count, send_type, r, comm, stream)); + } else { + // on its own rank, simply copy it to the output + outputs.copy_(inputs[r]); + } + } + } else { + size_t recv_count = outputs.numel(); + auto recv_type = to_nccl_data_type(outputs); + auto* recvbuff = reinterpret_cast(outputs.data_ptr()); + NCCL_CHECK(ncclRecv(recvbuff, recv_count, recv_type, root, comm, stream)); + } +#ifndef NCCL_HAS_COMM_NONBLOCKING + NCCL_CHECK(ncclGroupEnd()); +#else + NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); +#endif +#else + AT_ERROR("scatter is only supported for NCCL lib version >= 2.7.0"); +#endif +#else + AT_ERROR("PyTorch built without NCCL support"); +#endif +} + +} // namespace torch::cuda::nccl + diff --git a/torch/csrc/xpu/xccl.h b/torch/csrc/xpu/xccl.h new file mode 100644 index 00000000000000..d844f166ec5ab1 --- /dev/null +++ b/torch/csrc/xpu/xccl.h @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace torch::xpu::xccl { + +using xcclComm_t = ccl::communicator; + +using XCCL_KVS = ccl::shared_ptr_class; + +ccl::shared_ptr_class kvs; +std::vector kvs_addr; + +XCCL_KVS get_kvs(int rank, c10d::Store& store) +class Comms { +public: + + explicit Comms(ccl::vector_class &comms) : + comms(std::move(comms)), streams{} {} + + explicit Comms(ccl::vector_class &comms, ccl::vector_class &streams, std::vector &torch_streams) : + comms(std::move(comms)), streams(std::move(streams)), torch_streams(std::move(torch_streams)) {} + + ~Comms() noexcept(false) {} + + Comms() = delete; + + Comms(const Comms &) = delete; + + Comms &operator=(const Comms &) = delete; + + Comms(Comms &&other) : comms(std::move(other.comms)), streams(std::move(other.streams)), + torch_streams(std::move(other.torch_streams)) {} + + Comms &operator=(Comms &&other) { + std::swap(comms, other.comms); + std::swap(streams, other.streams); + std::swap(torch_streams, other.torch_streams); + return *this; + } + +public: + // The Communicators used by XCCL + ccl::vector_class comms; + // The streams used by XCCL + ccl::vector_class streams; + // one to one mapping the torch streams to the ccl::stream. 
+ std::vector torch_streams; +}; + +enum class xcclRedOp { Sum = 0, Prod = 1, Max = 2, Min = 3}; + +enum class xcclDataType { + Int8 = 0, + Char = 0, + Uint8 = 1, + Int32 = 2, + Int = 2, + Uint32 = 3, + Int64 = 4, + Uint64 = 5, + Float16 = 6, + Half = 6, + Float32 = 7, + Float = 7, + Float64 = 8, + Double = 8, + Bfloat16 = 9, + NumTypes = 10 +}; + +namespace detail { + + at::ArrayRef get_communicators( + at::TensorList inputs); + void check_inputs( + at::TensorList inputs, + at::TensorList outputs, + int input_multiplier, + int output_multiplier); + void check_inputs( + at::TensorList inputs, + const at::Tensor& output, + int root, + int input_multiplier, + int output_multiplier); + +} // namespace detail + +using comm_list = std::vector; +using stream_list = std::vector>; + + std::uint64_t version(); + const char* version_suffix(); + +bool is_available(at::TensorList tensors); + +comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank); + void comm_destroy(ncclComm_t comm); + +void all_reduce( + const std::vector& inputs, + std::vector& outputs, + int32_t op = static_cast(xcclRedOp::Sum), + const stream_list& streams = {}, + const comm_list& user_comms = {}); +} // namespace torch::xpu::xccl + From 93a4bdb962b9c8e5bffa0f9b5716dbe6df05bda4 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 30 Aug 2024 07:44:51 +0000 Subject: [PATCH 02/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 323 +++---- .../distributed/c10d/ProcessGroupXCCL.hpp | 56 +- torch/csrc/distributed/c10d/XCCLUtils.hpp | 334 ------- torch/csrc/xpu/xccl.cpp | 850 +++--------------- torch/csrc/xpu/xccl.h | 56 +- 5 files changed, 295 insertions(+), 1324 deletions(-) delete mode 100644 torch/csrc/distributed/c10d/XCCLUtils.hpp diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 9466a0c091c99c..3325691c3a8531 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -14,9 +14,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -31,16 +28,14 @@ namespace c10d { namespace { -std::map xcclOps = - { +std::map xcclOps = { {ReduceOp::MIN, ccl::reduction::min}, {ReduceOp::MAX, ccl::reduction::max}, {ReduceOp::SUM, ccl::reduction::sum}, {ReduceOp::PRODUCT, ccl::reduction::prod}, - }; +}; -std::map xcclDatatypes = - { +std::map xcclDatatypes = { {at::kByte, ccl::datatype::uint8}, {at::kChar, ccl::datatype::int8}, {at::kShort, ccl::datatype::int16}, @@ -51,96 +46,89 @@ std::map xcclDatatypes = {at::kDouble, ccl::datatype::float64}, {at::kBFloat16, ccl::datatype::bfloat16}, {at::kBool, ccl::datatype::uint8}, - }; +}; -void check_gpu_single_tensor( - const at::Tensor& tensor -) { +void check_gpu_single_tensor(const at::Tensor& tensor) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); } if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); - } + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); } } - } // namespace +ccl::datatype getXcclDataType(at::ScalarType type) { + auto it = xcclDatatypes.find(type); + TORCH_CHECK_WITH( + TypeError, + it != xcclDatatypes.end(), + "Input tensor data type is not supported for XCCL process group: ", + type); + return it->second; +} + +} // namespace c10d + namespace { -ProcessGroupXCCL::WorkXCCL::WorkXCCL(std::vector> outputTensors, - int rank, - c10d::OpType opType, - const 
c10::optional>& inputTensors) - : Work(rank, opType, nullptr, inputTensors), - outputTensors_(std::move(outputTensors)), - future_(createFutureAsOutput(outputTensors) - ); +static std::mutex xcclCommDevIdxMapMutex; +static std::unordered_map, int> xcclCommDevIdxMap; + +template < + template + class WorkXCCL, + typename RunF, + typename CommType, + typename InputType, + typename OutputType, + typename attr_t> +c10::intrusive_ptr make_work_ccl( + const std::vector& inputs, + const std::vector& outputs, + RunF f, + CommType& comms, + attr_t& attr, + int rank, + c10d::OpType op_type) { + c10::intrusive_ptr> + ret_ptr = c10::make_intrusive< + WorkCCL>( + inputs, outputs, f, comms, attr, rank, op_type); + return ret_ptr; +} -c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL::getFuture() { +ProcessGroupXCCL::WorkXCCL::WorkXCCL( + std::vector> outputTensors, + int rank, + c10d::OpType opType, + const c10::optional>& inputTensors) + : Work(rank, opType, nullptr, inputTensors), + outputTensors_(std::move(outputTensors)), + future_(createFutureAsOutput(outputTensors)) {} + +c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL:: + getFuture() { return future_; } c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, - int size) -{ + int size) { return c10::make_intrusive(store, rank, size); } -c10::intrusive_ptr ProcessGroupNCCL::initWork( - at::Device& device, - int rank, - OpType opType, - const std::vector& inputs, - const std::vector& outputs, - bool record) { - auto r = c10::make_intrusive( - device, - rank, - opType, - seqCollective_, - profilingTitle, - profilingTitle != nullptr ? std::optional>(inputs) - : std::nullopt, - desyncDebug_, - enableTiming_.load(), - dist_debug_level_); - if (record) { - bool isP2P = isP2POp(opType); - r->trace_id_ = NCCLTraceBuffer::get()->record( - local_id_, - std::make_tuple(pg_uid_, pg_desc_), - seqCollective_, - seqP2P_, - op_id_, - profilingTitle ? 
profilingTitle : "", - inputs, - outputs, - r->ncclStartEvent_.get(), - r->ncclEndEvent_.get(), - options_->timeout, - pgStatus_, - isP2P); - } - return r; -} - -ProcessGroupXCCL::~ProcessGroupXCCL() -{ -} +ProcessGroupXCCL::~ProcessGroupXCCL() {} std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, at::Device& device) { - if (deviceKey.empty()) { C10_THROW_ERROR( DistBackendError, "Not able to create/get the CCL Communicator since " - "the devices are empty "); + "the devices are empty "); } { @@ -164,11 +152,11 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( auto q = get_sycl_queue(stream); auto ctx = ccl::create_context(q.get_context()); devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - auto xcclComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + xcclComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); { std::lock_guard lock(mutex_); - inInitializationCommMap_.emplace(deviceKey, ncclComm); + inInitializationCommMap_.emplace(deviceKey, xcclComm); } auto it = inInitializationCommMap_.find(deviceKey); @@ -176,9 +164,9 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); inInitializationCommMap_.erase(deviceKey); - ncclCommDevIdxMapMutex.lock(); - ncclCommDevIdxMap.emplace(ncclComm, device.index()); - ncclCommDevIdxMapMutex.unlock(); + xcclCommDevIdxMapMutex.lock(); + xcclCommDevIdxMap.emplace(xcclComm, device.index()); + xcclCommDevIdxMapMutex.unlock(); } it = devXCCLCommMap_.find(deviceKey); @@ -189,168 +177,87 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( } template -c10::intrusive_ptr ProcessGroupNCCL::collective( +c10::intrusive_ptr ProcessGroupXCCL::collective( at::Tensor& input, at::Tensor& output, Fn fn, PreProcess pre, PostProcess post, OpType opType) { + using traits = function_traits; + using attr_t = typename traits::template arg<2>::type; + attr_t attr = ccl::create_operation_attr(); auto device = input.device(); const auto key = std::to_string(device.index()); - auto ncclComm = getXCCLComm(key, device); + auto xcclComm = getXCCLComm(key, device); std::vector inputs{input}; std::vector outputs{output}; - auto work = - initWork(device, rank_, opType, profilingTitle, inputs, outputs, enqueue); - - // Store references to outputs to be used by WorkNCCL::result and operator<<. - work->outputs_ = - std::make_shared>(std::move(outputs)); - - if (avoidRecordStreams) { - work->stashed_for_allocator_safety_ = - std::make_shared>(); - work->stashed_for_allocator_safety_->push_back(input); - } + c10::intrusive_ptr work; + // work = + // initWork(device, rank_, opType, profilingTitle, inputs, outputs, + // enqueue); - at::cuda::OptionalCUDAGuard gpuGuard; - - // Start event should only be recorded before the ncclGroupStart() - if (work->timingEnabled_) { - work->ncclStartEvent_->record(ncclStream); - } - - pre(ncclStream, work); - - ncclComm_t comm = ncclComm->getNcclComm(); - - // Both `inputs' and `outputs' are created on a worker stream and used in - // different ncclStreams. Hence, both must record the ncclStream to - // prevent being freed before the collective finishes. - // - // We only record `inputs' here, and leave recording `outputs' to `fn' for - // operations where `inputs' and `outputs' are not the same. - // - // See [Sync Streams]. 
- if (!avoidRecordStreams) { - if (!input.is_sparse()) { - c10::cuda::CUDACachingAllocator::recordStream( - input.storage().data_ptr(), ncclStream); - } else { - // for sparse input case record streams on both index and value - // tensors - c10::cuda::CUDACachingAllocator::recordStream( - input.values().storage().data_ptr(), ncclStream); - c10::cuda::CUDACachingAllocator::recordStream( - input.indices().storage().data_ptr(), ncclStream); - } - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - C10D_NCCL_CHECK( - fn(input, output, comm, ncclStream), - ncclComm->getNcclCommFailureReason()); -#else - C10D_NCCL_CHECK_TIMEOUT( - fn(input, output, comm, ncclStream), - comm, - ncclComm->getNcclCommFailureReason()); -#endif - - post(ncclStream, work); - - // End event should only be recorded after the ncclGroupEnd() - if (!coalescing_state_) { - work->ncclEndEvent_->record(ncclStream); - } - work->ncclComm_ = ncclComm; - - { - c10::cuda::CUDAMultiStreamGuard streamGuard(ncclStream); - std::vector devices{device}; - work->future_ = c10::make_intrusive( - c10::ListType::create(c10::TensorType::get()), devices); - - // Add a callback that runs profiling end callbacks. wrapCallback() in CUDA - // future blocks the stream this callback runs on the corresponding - // ncclEndEvents_ ensuring appropriate synchronization. - if (work->recordFunctionEndCallback_) { - work->future_->addCallback( - [work](at::ivalue::Future& /* unused */) { - work->recordFunctionEndCallback_(); - }, - // uses_future = false allows us to skip synchronization in - // ivalue::Future, but is only valid as long as the lambda doesn't use - // the "Future" argument. - /*uses_future=*/false); - } - work->future_->markCompleted(at::IValue(*work->outputs_)); - } - - // Set appropriate work parameters. - work->blockingWait_ = blockingWait_; - work->avoidRecordStreams_ = avoidRecordStreams; - work->opTimeout_ = options_->timeout; - work->store_ = store_; - // Record size info for debug. 
We only record the size on the first device as - // multi-device per process is deprecated - work->numelIn_ = input.numel(); - work->numelOut_ = output.numel(); - - // Notify graphs before we check the capture status preemptively - at::cuda::CUDAGraph::inc_pending_event_queries(); - if (enqueue) { - workEnqueue(work); - } else { - at::cuda::CUDAGraph::dec_pending_event_queries(); - } + work = make_work_ccl( + inputs, outputs, fn, xcclComm, attr, rank_, op_type); + // pre(ncclStream, work); + // ncclComm_t comm = ncclComm->getNcclComm(); + // post(ncclStream, work); return work; } -c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( - at::Tensor& tensor, +template +c10::intrusive_ptr ProcessGroupNCCL::collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + OpType opType) { + return collective( + input, + output, + fn, + [](std::vector&) {}, + [](std::vector&) {}, + opType); +} + +c10::intrusive_ptr ProcessGroupXCCL::allreduce( + std::vector& tensors, const AllreduceOptions& opts) { + TORCH_CHECK( + tensors.size() == 1, "Expecting one tensor only but got multiple"); + auto tensor = tensors.back(); + check_gpu_single_tensor(tensor); + if (opts.reduceOp == ReduceOp::AVG) { + TORCH_CHECK(false, "Cannot use ReduceOp AVG with XPU") + } return collective( tensor, tensor, [&](at::Tensor& input, at::Tensor& output, - ncclComm_t comm, - at::cuda::CUDAStream& stream) { - auto ncclDataType = getNcclDataType(input.scalar_type()); - auto ncclReduceOp = - getNcclReduceOp(opts.reduceOp, input, ncclDataType, comm); - return ncclAllReduce( + ccl::allreduce_attr attr, + xcclComm_t comm, + ccl::stream& stream) { + ccl::event ret_evt; + ccl::datatype datatype = getXcclDataType(input.scalar_type()); + ret_evt = ccl::allreduce( input.data_ptr(), output.data_ptr(), - input.numel(), - ncclDataType, - ncclReduceOp, + (size_t)input.numel(), + getXcclDataType(input.scalar_type()), + xcclOp.at(opts.reduceOp), comm, - stream.stream()); + stream, + attr); + return ret_evt; }, - OpType::ALLREDUCE, - "nccl:all_reduce"); + OpType::ALLREDUCE); } -c10::intrusive_ptr ProcessGroupXCCL::allreduce( - std::vector& tensors, - const AllreduceOptions& opts) -{ - TORCH_CHECK(tensors.size() == 1, "Expecting one tensor only but got multiple"); - auto tensor = tensors.back(); - check_gpu_single_tensor(tensor); - if (opts.reduceOp == ReduceOp::SUM) { - TORCH_CHECK(false, "Cannot use ReduceOp SUM with XPU") - } - return allreduce_impl(tensor, opts); -} - - -} +} // namespace -} \ No newline at end of file +#endif // USE_C10D_XCCL \ No newline at end of file diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 39f3c1a5e89964..51801ed992edcc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -10,6 +10,7 @@ #ifdef USE_C10D_XCCL #include +#include #include #include #include @@ -34,24 +35,25 @@ namespace c10d { constexpr const char* XCCL_BACKEND_NAME = "xccl"; class ProcessGroupXCCL : public Backend { -public: + public: class WorkXCCL : public Work { - public: - WorkXCCL( + public: + WorkXCCL( std::vector> outputTensors, int rank = -1, OpType opType = UNKNOWN, - const c10::optional>& inputTensors = c10::nullopt) - : outputTensors_(std::move(outputTensors)) {} + const c10::optional>& inputTensors = + c10::nullopt) + : Work(rank, opType), outputTensors_(std::move(outputTensors)) {} WorkXCCL(const WorkXCCL& w) : outputTensors_(w.outputTensors_), events_(w.events_) {} ~WorkXCCL() override { - // Ensures 
all events are properly handled before destruction - for (auto& event : events_) { - event.wait(); - } + // Ensures all events are properly handled before destruction + for (auto& event : events_) { + event.wait(); + } } bool isCompleted() override { @@ -64,11 +66,12 @@ class ProcessGroupXCCL : public Backend { } bool isSuccess() const override { - TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); + TORCH_CHECK( + false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); } void abort() override { - TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); } void synchronize() override { @@ -78,34 +81,37 @@ class ProcessGroupXCCL : public Backend { } void wait() override { + std::lock_guard lock(mutex_); for (auto& event : events_) { - call_with_lock(globalMutex, [&]() { - CCL_CHECK(event.wait()); - }); + CCL_CHECK(event.wait()); } events_.clear(); } - c10::intrusive_ptr getFuture() override; + c10::intrusive_ptr getFuture() override { + TORCH_CHECK( + false, "ProcessGroupXCCL::WorkXCCL::getFuture not implemented"); + } std::vector result() override { - return outputTensors_.empty() ? std::vector() : outputTensors_[0]; + return outputTensors_.empty() ? std::vector() + : outputTensors_[0]; } - protected: + protected: friend class ProcessGroupXCCL; std::vector events_; const std::vector> outputTensors_; c10::intrusive_ptr future_; }; - explicit ProcessGroupXCCL(const c10::intrusive_ptr& store, - int rank, - int size) - : store_(store), rank_(rank), size_(size) { - } + explicit ProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank, + int size) + : store_(store), rank_(rank), size_(size) {} - virtual ~ProcessGroupXCCL(); + ProcessGroupXCCL::~ProcessGroupXCCL() = default; const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); @@ -123,11 +129,11 @@ class ProcessGroupXCCL : public Backend { int rank = -1, int size = -1); -private: + private: int rank_; int size_; -public: + public: std::unordered_map> inInitializationCommMap_; std::unordered_map> devXCCLCommMap_; diff --git a/torch/csrc/distributed/c10d/XCCLUtils.hpp b/torch/csrc/distributed/c10d/XCCLUtils.hpp deleted file mode 100644 index d52f3df8ea466d..00000000000000 --- a/torch/csrc/distributed/c10d/XCCLUtils.hpp +++ /dev/null @@ -1,334 +0,0 @@ -#pragma once - -#ifdef USE_C10D_XCCL - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -// RAII wrapper for NCCL communicator -class XCCLComm { - public: - explicit XCCLComm(ncclComm_t ncclComm) - : ncclComm_(ncclComm), - aborted_(false), - ncclAsyncErr_(ncclSuccess), - commFailureReason_(c10::nullopt), - initialized_(false) {} - - NCCLComm() : NCCLComm(nullptr) {} - - ~NCCLComm() noexcept { - // Add lock in this destructor, as aborted_ needs to be read after memory - // barrier here. - std::unique_lock lock(mutex_); - if (ncclComm_ && !aborted_) { -#ifdef ENABLE_NCCL_ERROR_CHECKING - // Use ncclCommAbort instead of ncclCommDestroy here since - // ncclCommDestroy could block forever waiting for work to complete on - // the communicator. 
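// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the committed diff): caller-side use of the
// WorkXCCL object defined above.  The driver is hypothetical; it relies only
// on the allreduce(), wait() and result() members introduced in this series
// and assumes `t` already lives on an XPU device.
// ---------------------------------------------------------------------------
#include <torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp>

void allreduce_and_wait(c10d::ProcessGroupXCCL& pg, at::Tensor& t) {
  std::vector<at::Tensor> tensors{t};
  c10::intrusive_ptr<c10d::Work> work = pg.allreduce(tensors);
  work->wait();                   // blocks until every recorded ccl::event completes
  auto outputs = work->result();  // outputTensors_[0], i.e. the reduced tensors
}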
- C10D_NCCL_ASSERT(::ncclCommAbort(ncclComm_)); -#else - C10D_NCCL_ASSERT(::ncclCommDestroy(ncclComm_)); -#endif - } - } - - static std::shared_ptr create( - int numRanks, - int rank, - ncclUniqueId commId) { - auto comm = std::make_shared(); - C10D_NCCL_CHECK( - ncclCommInitRank(&(comm->ncclComm_), numRanks, commId, rank), - c10::nullopt); - comm->ncclId_ = commId; - comm->rank_ = rank; - comm->initialized_ = true; - return comm; - } - -#ifdef NCCL_HAS_COMM_NONBLOCKING - static std::shared_ptr create( - int numRanks, - int rank, - ncclUniqueId commId, - ncclConfig_t& config) { - auto comm = std::make_shared(); - bool isInitialized = false; - if (nccl_use_nonblocking()) { - config.blocking = 0; - LOG(INFO) << "Rank " << rank - << ": creating NCCL communicator in nonblocking mode"; - C10D_NCCL_CHECK_NONBLOCKING( - ncclCommInitRankConfig( - &(comm->ncclComm_), numRanks, commId, rank, &config), - c10::nullopt); - } else { - C10D_NCCL_CHECK( - ncclCommInitRankConfig( - &(comm->ncclComm_), numRanks, commId, rank, &config), - c10::nullopt); - // under blocking mode, comm is initialized after NCCL CHECK - isInitialized = true; - } - comm->ncclId_ = commId; - comm->rank_ = rank; - comm->initialized_ = isInitialized; - return comm; - } -#endif - -#ifdef NCCL_HAS_COMM_SPLIT - static std::shared_ptr split( - NCCLComm* source, - int color_id, - int rank, - ncclConfig_t& config) { - auto comm = std::make_shared(); - C10D_NCCL_CHECK( - ncclCommSplit( - source->ncclComm_, color_id, rank, &(comm->ncclComm_), &config), - c10::nullopt); - ++source->ncclCommSplitCounter_; - comm->rank_ = rank; - return comm; - } -#endif - -#if defined(IS_NCCL_EXP) && defined(NCCL_COMM_DUMP) - std::unordered_map ncclCommDump() { - std::unordered_map dump; - if (isAborted()) { - LOG(INFO) << "Communicator was aborted before trying to dump its state."; - return dump; - } - C10D_NCCL_CHECK(::ncclCommDump(ncclComm_, dump), c10::nullopt); - return dump; - } -#endif - - ncclUniqueId getNcclId() { - return ncclId_; - } - - // Must not be copyable - NCCLComm(const NCCLComm&) = delete; - NCCLComm& operator=(const NCCLComm&) = delete; - - // Do not support move assignment as there is no valid use case - NCCLComm& operator=(NCCLComm&& other) = delete; - - // Move constructable - NCCLComm(NCCLComm&& other) { - // Using other's lock, as it reads other's states - // Can not use this.mutex_, as this object is being constructed. - std::unique_lock lock(other.mutex_); - std::swap(ncclComm_, other.ncclComm_); - std::swap(aborted_, other.aborted_); - std::swap(ncclAsyncErr_, other.ncclAsyncErr_); - std::swap(initialized_, other.initialized_); - } - - ncclComm_t getNcclComm(); - - c10::optional getNcclCommFailureReason() const { - std::unique_lock lock(mutex_); - return commFailureReason_; - } - - void ncclCommAbort( - c10::optional commFailureReason = c10::nullopt) { - std::unique_lock lock(mutex_); -#ifdef ENABLE_NCCL_ERROR_CHECKING - if (aborted_) { - // Should not abort twice. - return; - } - -#ifdef NCCL_HAS_COMM_REGISTER - // Deregister all registered segments before aborting. - for (auto& it : registeredSegmentHandles_) { - void* handle = it.second; - C10D_NCCL_CHECK( - ::ncclCommDeregister(ncclComm_, handle), - c10::str( - "Failed to deregister segment handle ", - handle, - " on ncclComm_ ", - ncclComm_)); - } - registeredSegmentHandles_.clear(); -#endif - - // Set true failure reason if provided by ProcessGroupNCCL (e.g. 
work - // timeout) - commFailureReason_ = commFailureReason; - LOG(INFO) << "Aborting ncclComm_ " << ncclComm_ << " with reason: " - << (commFailureReason ? *commFailureReason - : "No abort reason provided."); -#ifndef NCCL_HAS_COMM_NONBLOCKING - C10D_NCCL_CHECK(::ncclCommAbort(ncclComm_), commFailureReason_); -#else - C10D_NCCL_CHECK_TIMEOUT( - ::ncclCommAbort(ncclComm_), ncclComm_, commFailureReason_); -#endif - aborted_ = true; - ncclComm_ = nullptr; - - // Set an appropriate error so that we avoid using the communicator. - if (ncclAsyncErr_ == ncclSuccess) { - ncclAsyncErr_ = ncclSystemError; - } -#else - // This is a NOOP, if error checks are disabled. - return; -#endif - } - - bool isAborted() const { - std::unique_lock lock(mutex_); - return aborted_; - } - - uint64_t getCommSplitCounter() const { - return ncclCommSplitCounter_; - } - - ncclResult_t checkForNcclError() { - std::unique_lock lock(mutex_); -#ifdef ENABLE_NCCL_ERROR_CHECKING - if (ncclAsyncErr_ != ncclSuccess) { - return ncclAsyncErr_; - } - C10D_NCCL_CHECK( - ncclCommGetAsyncError(ncclComm_, &ncclAsyncErr_), commFailureReason_); - return ncclAsyncErr_; -#else - // Always return success, if error checks are disabled. - return ncclSuccess; -#endif - } - - ncclResult_t registerSegment(void* ptr, size_t size) { - std::unique_lock lock(mutex_); -#ifdef NCCL_HAS_COMM_REGISTER - // We register only segments from cache allocator - // which are guaranteed to be with disjoint addr ranges. Thus, a ptr always - // maps to a unique handle and should not be registered before the current - // ptr is deregistered and freed. - TORCH_CHECK( - registeredSegmentHandles_.count(ptr) == 0, - "Segment with ptr ", - ptr, - " has already been registered on ncclComm_ ", - ncclComm_); - - void* handle; - C10D_NCCL_CHECK( - ncclCommRegister(ncclComm_, ptr, size, &handle), - c10::str( - "Failed to register segment with ptr ", - ptr, - ", size ", - size, - " on ncclComm_ ", - ncclComm_)); - registeredSegmentHandles_[ptr] = handle; - return ncclSuccess; -#else - return ncclInvalidUsage; -#endif - } - - ncclResult_t deregisterSegment(void* ptr) { - std::unique_lock lock(mutex_); -#ifdef NCCL_HAS_COMM_REGISTER - TORCH_CHECK( - registeredSegmentHandles_.count(ptr) == 1, - "Segment with ptr ", - ptr, - " is not registered on ncclComm_ ", - ncclComm_); - - void* handle = registeredSegmentHandles_[ptr]; - C10D_NCCL_CHECK( - ncclCommDeregister(ncclComm_, handle), - c10::str( - "Failed to deregister segment handle ", - handle, - ", with ptr ", - ptr, - " on ncclComm_ ", - ncclComm_)); - registeredSegmentHandles_.erase(ptr); - return ncclSuccess; -#else - return ncclInvalidUsage; -#endif - } - - friend class ProcessGroupNCCL; - - protected: - // a helper function to wait until the communicator is initialized; - void waitUntilInitialized(int timeoutSecs); - ncclComm_t ncclComm_; - // Unique nccl_id for this communicator. - ncclUniqueId ncclId_; - bool aborted_; - uint64_t ncclCommSplitCounter_{0}; - ncclResult_t ncclAsyncErr_; - mutable std::mutex mutex_; - // Rank that this communicator corresponds to. - int rank_; - // Optional reason for communicator failure, provided by ProcessGroupNCCL for - // better error messaging. - c10::optional commFailureReason_; - bool initialized_{false}; -#ifdef NCCL_HAS_COMM_REGISTER - // Stores handlers for tensors registered by NCCL - std::unordered_map registeredSegmentHandles_; -#endif -}; - -// Helper that automatically cleans up premul sums. 
-struct ncclRedOpRAII { - ncclRedOpRAII() = default; - ncclRedOpRAII(ncclRedOp_t op) : op_(op) {} - ncclRedOpRAII(ncclRedOp_t op, ncclComm_t comm) - : op_(op), comm_(comm), premul_sum_(true) {} - ncclRedOpRAII(const ncclRedOpRAII&) = delete; - ncclRedOpRAII& operator=(const ncclRedOpRAII&) = delete; - ncclRedOpRAII(ncclRedOpRAII&& tmp) : ncclRedOpRAII() { - std::swap(tmp.op_, this->op_); - std::swap(tmp.comm_, this->comm_); - std::swap(tmp.premul_sum_, this->premul_sum_); - } -#if defined(ENABLE_NCCL_PREMUL_SUM_SUPPORT) - ~ncclRedOpRAII() { - if (premul_sum_) { - ncclRedOpDestroy(op_, comm_); - } - } -#endif - operator ncclRedOp_t() const { - return op_; - } - ncclRedOp_t op_; - ncclComm_t comm_; - bool premul_sum_ = false; -}; - -} // namespace c10d - -#endif // USE_C10D_NCCL - diff --git a/torch/csrc/xpu/xccl.cpp b/torch/csrc/xpu/xccl.cpp index 5304b43f57d410..747c5bf3eb1103 100644 --- a/torch/csrc/xpu/xccl.cpp +++ b/torch/csrc/xpu/xccl.cpp @@ -1,22 +1,17 @@ #include -#include -#include #include -#include -#include #include #include #include -#include +#include #include #include #include #include - xcclComm_t* to_xccl_comm(torch::xpu::xccl::xcclComm_t* var) { return reinterpret_cast(var); } @@ -25,8 +20,7 @@ xcclComm_t to_xccl_comm(torch::xpu::xccl::xcclComm_t var) { return reinterpret_cast(var); } - -xcclDataType_t to_nccl_data_type(c10::ScalarType type) { +ccl::datatype to_nccl_data_type(c10::ScalarType type) { switch (type) { case at::kFloat: return ccl::datatype::float32; @@ -78,77 +72,77 @@ XCCL_KVS get_kvs(int rank, c10d::Store& store) { if (rank == 0) { kvs = ccl::create_main_kvs(); ccl::kvs::address_type main_addr = kvs->get_address(); - auto ccl_kvs_addr = std::vector(main_addr.begin(), main_addr.end()); + auto ccl_kvs_addr = + std::vector(main_addr.begin(), main_addr.end()); store.set(storeKey, ccl_kvs_addr); - } - else { + } else { auto ccl_kvs_addr = store.get(storeKey); if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { - throw std::runtime_error( - "Unexpected ccl kvs addr from the store\n"); + throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); } ccl::kvs::address_type main_addr; - std::copy_n(std::make_move_iterator(ccl_kvs_addr.begin()), - ccl::kvs::address_max_size, - main_addr.begin()); + std::copy_n( + std::make_move_iterator(ccl_kvs_addr.begin()), + ccl::kvs::address_max_size, + main_addr.begin()); kvs = ccl::create_kvs(main_addr); } return kvs; } - using namespace at; namespace detail { -void xcclCommInitAll(xcclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { - for(int i = 0; i < nranks; i++) { - newcomm[i] = ccl::create_communicator(nranks, i, get_kvs_addr) - } - c10::Stream dpcpp_stream = impl.getStream(devices[0]); - ccl::vector_class> devs_rank; - newcomm = ccl::create_communicators(nranks, devs_rank, ctx, ) -} - -struct XcclCommList { - std::unique_ptr comms; - int ndevices; - XcclCommList(const std::vector& devices) - : comms(new xcclComm_t[devices.size()]), ndevices(devices.size()) { - xcclCommInitAll( - to_xccl_comm(comms.get()), devices.size(), devices.data()); - } - NcclCommList(NcclCommList&& foo) = default; - ~NcclCommList() { - if (comms) { - for (const auto i : c10::irange(ndevices)) { - comm_destroy(comms[i]); - } - } - } - ArrayRef ref() const { - return ArrayRef(comms.get(), ndevices); - } -}; - -using device_list = std::vector; -// accesses to this object have to be guarded by THC's CudaFreeMutex -std::unordered_map> _communicators; -static std::unordered_map> - _communicators; - -ArrayRef 
get_communicators(TensorList inputs) { - static auto get_device = [](const at::Tensor& t) -> int { - return t.get_device(); - }; - device_list devices = fmap(inputs, get_device); - auto it = _communicators.find(devices); - if (it == _communicators.end()) { - it = _communicators.emplace(devices, devices).first; - } - return it->second; -} +// void xcclCommInitAll(xcclComm_t* newcomm, int nranks, ncclUniqueId commId, +// int myrank) { +// for(int i = 0; i < nranks; i++) { +// newcomm[i] = ccl::create_communicator(nranks, i, get_kvs_addr) +// } +// c10::Stream dpcpp_stream = impl.getStream(devices[0]); +// ccl::vector_class> devs_rank; +// newcomm = ccl::create_communicators(nranks, devs_rank, ctx, ) +// } + +// struct XcclCommList { +// std::unique_ptr comms; +// int ndevices; +// XcclCommList(const std::vector& devices) +// : comms(new xcclComm_t[devices.size()]), ndevices(devices.size()) { +// xcclCommInitAll( +// to_xccl_comm(comms.get()), devices.size(), devices.data()); +// } +// NcclCommList(NcclCommList&& foo) = default; +// ~NcclCommList() { +// if (comms) { +// for (const auto i : c10::irange(ndevices)) { +// comm_destroy(comms[i]); +// } +// } +// } +// ArrayRef ref() const { +// return ArrayRef(comms.get(), ndevices); +// } +// }; + +// using device_list = std::vector; +// // accesses to this object have to be guarded by THC's CudaFreeMutex +// std::unordered_map> _communicators; +// static std::unordered_map> +// _communicators; + +// ArrayRef get_communicators(TensorList inputs) { +// static auto get_device = [](const at::Tensor& t) -> int { +// return t.get_device(); +// }; +// device_list devices = fmap(inputs, get_device); +// auto it = _communicators.find(devices); +// if (it == _communicators.end()) { +// it = _communicators.emplace(devices, devices).first; +// } +// return it->second; +// } static inline void check_tensor( const at::Tensor& input, @@ -275,649 +269,85 @@ void check_inputs( } // namespace detail -bool is_available(TensorList tensors) { -#ifdef USE_XCCL - device_set devices; - for (auto& tensor : tensors) { - if (!tensor.is_xpu() || tensor.is_sparse()) - return false; - if (!tensor.is_contiguous()) - return false; - auto device = tensor.get_device(); - if (devices[device]) - return false; - devices[device] = true; - } - return true; -#else - return false; -#endif -} - -std::uint64_t version() { -#if defined(NCCL_MAJOR) - constexpr std::uint64_t ver = (((uint64_t)NCCL_MAJOR) << 32) | - (((uint64_t)NCCL_MINOR) << 16) | ((uint64_t)NCCL_PATCH); - return ver; -#elif defined(USE_NCCL) - // return major version "1" - return ((uint64_t)1) << 32; -#else - return 0; -#endif -} - -ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank) { -#ifdef USE_XCCL - using namespace torch::xpu::xccl::detail; - xcclComm_t comm; - ncclUniqueId id = comm_id; - NCCL_CHECK(ncclCommInitRank( - to_nccl_comm(&comm), nranks, *(to_nccl_unique_id(&id)), rank)); - return comm; -#else - return nullptr; -#endif -} - - -namespace { -// NCCL changed the numerical type used for count between NCCL1 and NCCL2. -// So we use the following struct, which gets the type of the second argument -// of T, if T is a function type, with ncclBcast, to get that type statically -// and programmatically. 
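// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the committed diff): turning the KVS
// exchanged through the c10d store above into a per-rank oneCCL communicator,
// mirroring what ProcessGroupXCCL::getXCCLComm does later in the series.  The
// SYCL queue `q` and the world size are assumed to come from the caller, and
// the sketch uses oneCCL's create_communicators() overload (one communicator
// per local device), whereas the patch text calls a singular variant.
// ---------------------------------------------------------------------------
#include <sycl/sycl.hpp>
#include <oneapi/ccl.hpp>
#include <torch/csrc/distributed/c10d/Store.hpp>
#include <torch/csrc/xpu/xccl.h>

ccl::communicator make_comm_sketch(
    c10d::Store& store, int rank, int world_size, sycl::queue& q) {
  // Rank 0 publishes the oneCCL bootstrap address; other ranks read it back.
  auto kvs = torch::xpu::xccl::get_kvs(rank, store);
  auto ctx = ccl::create_context(q.get_context());
  ccl::vector_class<ccl::pair_class<int, ccl::device>> devs_rank;
  devs_rank.emplace_back(rank, ccl::create_device(q.get_device()));
  auto comms = ccl::create_communicators(world_size, devs_rank, ctx, kvs);
  return std::move(comms[0]);   // single device per process in this sketch
}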
- -template -struct GetSecondArgType; - -template -struct GetSecondArgType { - typedef typename std::decay::type type; -}; - -constexpr auto count_max = - std::numeric_limits::type>::max(); - -// Since NCCL 2.12.10, NCCL supports send/recv 0 byte: -// https://github.com/NVIDIA/nccl/issues/696. The issue of skipping send/recv -// is that it can cause deadlock when a rank send and recv 0 bytes so it's -// completely skipping the collective, causing mismatch across ranks -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR > 13))) -template -constexpr bool _nccl_should_send_recv(C10_UNUSED T _unused_) { - return true; -} -#else -// old NCCL uses 0 byte message for synchronization -// Avoid send/recv when message size is zero -template -inline bool _nccl_should_send_recv(T value) { - return value != 0; -} -#endif -} // namespace - -size_t get_max_count() { - return count_max; -} - -void broadcast( - TensorList tensors, - const stream_list& streams, - const comm_list& user_comms) { -#ifdef USE_NCCL - using namespace torch::cuda::nccl::detail; - check_inputs(tensors, tensors, 1, 1); - auto data_type = to_nccl_data_type(tensors[0]); - int64_t numel = tensors[0].numel(); - - const auto comms = user_comms.empty() ? get_communicators(tensors) - : ArrayRef(user_comms); - - AutoNcclGroup nccl_group_guard; - at::cuda::OptionalCUDAGuard device_guard; - for (size_t i = 0, num_tensors = tensors.size(); i < num_tensors; i++) { - auto device = tensors[i].get_device(); - device_guard.set_index(device); - // Default to the current stream - const auto stream = (streams.empty() || !streams[i]) - ? at::cuda::getCurrentCUDAStream(device).stream() - : streams[i]->stream(); - TORCH_CHECK( - static_cast(numel) <= static_cast(count_max), - "Broadcast tensor has ", - numel, - " elements, which exceeds the " - "maximum NCCL supports (", - count_max, - ")"); - ncclComm_t comm = comms[i]; - NCCL_CHECK(ncclBcast( - tensors[i].data_ptr(), - numel, - data_type, - 0, - to_nccl_comm(comm), - stream)); - } -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void reduce( - const std::vector& inputs, - at::Tensor& output, - int32_t root, - int32_t op, - const stream_list& streams, - const comm_list& user_comms) { -#ifdef USE_NCCL - using namespace torch::cuda::nccl::detail; - TORCH_CHECK( - root >= 0 && static_cast(root) < inputs.size(), "invalid root"); - - check_inputs(inputs, output, root, 1, 1); - const auto len = inputs.size(); - - auto data_type = to_nccl_data_type(inputs[0]); - - const auto count = inputs[0].numel(); - auto comms_ref = user_comms.empty() ? get_communicators(inputs) - : ArrayRef(user_comms); - - AutoNcclGroup nccl_group_guard; - at::cuda::OptionalCUDAGuard device_guard; - for (const auto i : c10::irange(len)) { - auto device = inputs[i].device().index(); - device_guard.set_index(device); - // Default to the current stream - const auto stream = (streams.empty() || !streams[i]) - ? at::cuda::getCurrentCUDAStream(device).stream() - : streams[i]->stream(); - - ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(ncclReduce( - inputs[i].data_ptr(), - static_cast>(root) == i - ? 
output.data_ptr() - : nullptr, - count, - data_type, - to_nccl_red_op(op), - root, - to_nccl_comm(comm), - stream)); - } -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void reduce( - std::vector& inputs, - int32_t root, - int32_t op, - const stream_list& streams, - const comm_list& user_comms) { - reduce(inputs, /*output=*/inputs[root], root, op, streams, user_comms); -} - -void all_reduce( - const std::vector& inputs, - std::vector& outputs, - int32_t op, - const stream_list& streams, - const comm_list& user_comms) { -#ifdef USE_NCCL - using namespace torch::cuda::nccl::detail; - check_inputs(inputs, outputs, 1, 1); - const auto len = inputs.size(); - - auto data_type = to_nccl_data_type(inputs[0]); - - const auto count = inputs[0].numel(); - auto comms_ref = user_comms.empty() ? get_communicators(inputs) - : ArrayRef(user_comms); - - AutoNcclGroup nccl_group_guard; - at::cuda::OptionalCUDAGuard device_guard; - for (const auto i : c10::irange(len)) { - auto device = inputs[i].device().index(); - device_guard.set_index(device); - // Default to the current stream - const auto stream = (streams.empty() || !streams[i]) - ? at::cuda::getCurrentCUDAStream(device).stream() - : streams[i]->stream(); - - ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(ncclAllReduce( - inputs[i].data_ptr(), - outputs[i].data_ptr(), - count, - data_type, - to_nccl_red_op(op), - to_nccl_comm(comm), - stream)); - } -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void reduce_scatter( - const std::vector& inputs, - std::vector& outputs, - int32_t op, - const stream_list& streams, - const comm_list& user_comms) { -#ifdef USE_NCCL - using namespace torch::cuda::nccl::detail; - const auto len = inputs.size(); - check_inputs(inputs, outputs, 1, len); - - auto data_type = to_nccl_data_type(inputs[0]); - - const auto count = inputs[0].numel() / len; - auto comms_ref = user_comms.empty() ? get_communicators(inputs) - : ArrayRef(user_comms); - - AutoNcclGroup nccl_group_guard; - at::cuda::OptionalCUDAGuard device_guard; - for (const auto i : c10::irange(len)) { - auto device = inputs[i].device().index(); - device_guard.set_index(device); - // Default to the current stream - const auto stream = (streams.empty() || !streams[i]) - ? at::cuda::getCurrentCUDAStream(device).stream() - : streams[i]->stream(); - - ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(ncclReduceScatter( - inputs[i].data_ptr(), - outputs[i].data_ptr(), - count, - data_type, - to_nccl_red_op(op), - to_nccl_comm(comm), - stream)); - } -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void all_gather( - const std::vector& inputs, - std::vector& outputs, - const stream_list& streams, - const comm_list& user_comms) { -#ifdef USE_NCCL - using namespace torch::cuda::nccl::detail; - const auto len = inputs.size(); - check_inputs(inputs, outputs, len, 1); - - auto data_type = to_nccl_data_type(inputs[0]); - - const auto count = inputs[0].numel(); - auto comms_ref = user_comms.empty() ? get_communicators(inputs) - : ArrayRef(user_comms); - - AutoNcclGroup nccl_group_guard; - at::cuda::OptionalCUDAGuard device_guard; - for (const auto i : c10::irange(len)) { - auto device = inputs[i].device().index(); - device_guard.set_index(device); - // Default to the current stream - const auto stream = (streams.empty() || !streams[i]) - ? 
at::cuda::getCurrentCUDAStream(device).stream() - : streams[i]->stream(); - - ncclComm_t comm = comms_ref[i]; -#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - NCCL_CHECK(ncclAllGather( - inputs[i].data_ptr(), - outputs[i].data_ptr(), - count, - data_type, - to_nccl_comm(comm), - stream)); -#else - NCCL_CHECK(ncclAllGather( - inputs[i].data_ptr(), - count, - data_type, - outputs[i].data_ptr(), - to_nccl_comm(comm), - stream)); -#endif - } -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void all2all_single_equal_split( - at::Tensor& input, - at::Tensor& output, - int size, - ncclComm_t _comm, - at::cuda::CUDAStream& stream) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; - - int numranks; - auto type = to_nccl_data_type(input); - size_t count = input.numel() / size; - size_t rankdiff = input.nbytes() / size; - const auto* sendbuff = reinterpret_cast(input.const_data_ptr()); - auto* recvbuff = reinterpret_cast(output.data_ptr()); - auto comm = to_nccl_comm(_comm); -#if defined(USE_ROCM) - NCCL_CHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream)); -#else - NCCL_CHECK(ncclCommCount(comm, &numranks)); - NCCL_CHECK(ncclGroupStart()); - for (const auto r : c10::irange(numranks)) { - if (_nccl_should_send_recv(count)) { - NCCL_CHECK( - ncclSend(sendbuff + r * rankdiff, count, type, r, comm, stream)); - NCCL_CHECK( - ncclRecv(recvbuff + r * rankdiff, count, type, r, comm, stream)); - } - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclGroupEnd()); -#else - NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); -#endif -#endif -#else - AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void all2all_single_unequal_split( - void* sendbuff, - const size_t* sendcounts, - const size_t* senddispls, - void* recvbuff, - const size_t* recvcounts, - const size_t* recvdispls, - size_t size, - c10::ScalarType _type, - ncclComm_t _comm, - at::cuda::CUDAStream& stream) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; - - auto type = to_nccl_data_type(_type); - auto comm = to_nccl_comm(_comm); - int numranks; - NCCL_CHECK(ncclCommCount(comm, &numranks)); - NCCL_CHECK(ncclGroupStart()); - for (const auto r : c10::irange(numranks)) { - if (_nccl_should_send_recv(sendcounts[r])) { - NCCL_CHECK(ncclSend( - ((char*)sendbuff) + senddispls[r] * size, - sendcounts[r], - type, - r, - comm, - stream)); - } - if (_nccl_should_send_recv(recvcounts[r])) { - NCCL_CHECK(ncclRecv( - ((char*)recvbuff) + recvdispls[r] * size, - recvcounts[r], - type, - r, - comm, - stream)); - } - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclGroupEnd()); -#else - NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); -#endif -#else - AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void all2all( - std::vector& outputTensors, - std::vector& inputTensors, - ncclComm_t _comm, - at::cuda::CUDAStream& stream) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; - auto comm = to_nccl_comm(_comm); - - NCCL_CHECK(ncclGroupStart()); - for (const auto r : c10::irange(outputTensors.size())) { - at::Tensor& input = 
inputTensors[r]; - at::Tensor& output = outputTensors[r]; - - if (_nccl_should_send_recv(input.numel())) { - NCCL_CHECK(ncclSend( - input.data_ptr(), - input.numel(), - to_nccl_data_type(input), - r, - comm, - stream.stream())); - } - if (_nccl_should_send_recv(output.numel())) { - NCCL_CHECK(ncclRecv( - output.data_ptr(), - output.numel(), - to_nccl_data_type(output), - r, - comm, - stream.stream())); - } - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclGroupEnd()); -#else - NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); -#endif -#else - AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void send( - const at::Tensor& input, - ncclComm_t comm, - at::cuda::CUDAStream stream, - int dst) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclSend( - input.data_ptr(), - input.numel(), - to_nccl_data_type(input), - dst, - to_nccl_comm(comm), - stream.stream())); -#else - NCCL_CHECK_TIMEOUT( - ncclSend( - input.data_ptr(), - input.numel(), - to_nccl_data_type(input), - dst, - to_nccl_comm(comm), - stream.stream()), - comm); -#endif -#else - AT_ERROR("Send is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void recv( - at::Tensor& output, - ncclComm_t comm, - at::cuda::CUDAStream stream, - int src) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclRecv( - output.data_ptr(), - output.numel(), - to_nccl_data_type(output), - src, - to_nccl_comm(comm), - stream.stream())); -#else - NCCL_CHECK_TIMEOUT( - ncclRecv( - output.data_ptr(), - output.numel(), - to_nccl_data_type(output), - src, - to_nccl_comm(comm), - stream.stream()), - comm); -#endif -#else - AT_ERROR("Recv is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void gather( - const at::Tensor& inputs, - std::vector& outputs, - ncclComm_t _comm, - at::cuda::CUDAStream& stream, - int32_t root) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; - - auto comm = to_nccl_comm(_comm); - int numranks, cur_rank; - NCCL_CHECK(ncclCommCount(comm, &numranks)); - NCCL_CHECK(ncclCommUserRank(comm, &cur_rank)); - - size_t count = inputs.numel(); - auto type = to_nccl_data_type(inputs); - const auto* sendbuff = reinterpret_cast(inputs.const_data_ptr()); - - NCCL_CHECK(ncclGroupStart()); - - if (cur_rank == root) { - for (const auto r : c10::irange(numranks)) { - if (r != root) { - auto* recvbuff = reinterpret_cast(outputs[r].data_ptr()); - NCCL_CHECK(ncclRecv(recvbuff, count, type, r, comm, stream)); - } else { - // on its own rank, simply copy from the input - outputs[r].copy_(inputs); - } - } - } else { - NCCL_CHECK(ncclSend(sendbuff, count, type, root, comm, stream)); - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclGroupEnd()); -#else - NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); -#endif - -#else - AT_ERROR("gather is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -void scatter( - const std::vector& 
inputs, - at::Tensor& outputs, - ncclComm_t _comm, - at::cuda::CUDAStream& stream, - int32_t root) { -#ifdef USE_NCCL -#if defined(NCCL_MAJOR) && \ - ((NCCL_MAJOR > 2) || ((NCCL_MAJOR == 2) && (NCCL_MINOR >= 7))) - using namespace torch::cuda::nccl::detail; - - auto comm = to_nccl_comm(_comm); - int numranks, cur_rank; -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclCommCount(comm, &numranks)); - NCCL_CHECK(ncclCommUserRank(comm, &cur_rank)); -#else - NCCL_CHECK_TIMEOUT(ncclCommCount(comm, &numranks), _comm); - NCCL_CHECK_TIMEOUT(ncclCommUserRank(comm, &cur_rank), _comm); -#endif - NCCL_CHECK(ncclGroupStart()); - if (cur_rank == root) { - for (const auto r : c10::irange(numranks)) { - if (r != root) { - size_t send_count = inputs[r].numel(); - auto send_type = to_nccl_data_type(inputs[r]); - const auto* sendbuff = - reinterpret_cast(inputs[r].const_data_ptr()); - NCCL_CHECK(ncclSend(sendbuff, send_count, send_type, r, comm, stream)); - } else { - // on its own rank, simply copy it to the output - outputs.copy_(inputs[r]); - } - } - } else { - size_t recv_count = outputs.numel(); - auto recv_type = to_nccl_data_type(outputs); - auto* recvbuff = reinterpret_cast(outputs.data_ptr()); - NCCL_CHECK(ncclRecv(recvbuff, recv_count, recv_type, root, comm, stream)); - } -#ifndef NCCL_HAS_COMM_NONBLOCKING - NCCL_CHECK(ncclGroupEnd()); -#else - NCCL_CHECK_TIMEOUT(ncclGroupEnd(), _comm); -#endif -#else - AT_ERROR("scatter is only supported for NCCL lib version >= 2.7.0"); -#endif -#else - AT_ERROR("PyTorch built without NCCL support"); -#endif -} - -} // namespace torch::cuda::nccl - +// std::uint64_t version() { +// #if defined(NCCL_MAJOR) +// constexpr std::uint64_t ver = (((uint64_t)NCCL_MAJOR) << 32) | +// (((uint64_t)NCCL_MINOR) << 16) | ((uint64_t)NCCL_PATCH); +// return ver; +// #elif defined(USE_NCCL) +// // return major version "1" +// return ((uint64_t)1) << 32; +// #else +// return 0; +// #endif +// } + +// ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank) +// { #ifdef USE_XCCL +// using namespace torch::xpu::xccl::detail; +// xcclComm_t comm; +// ncclUniqueId id = comm_id; +// NCCL_CHECK(ncclCommInitRank( +// to_nccl_comm(&comm), nranks, *(to_nccl_unique_id(&id)), rank)); +// return comm; +// #else +// return nullptr; +// #endif +// } + +// namespace { + +// ret_evt = torch::xpu::xccl::all_reduce( +// input, +// output, +// datatype, +// xcclOp.at(opts.reduceOp), +// comm, +// attr, +// stream, +// root); + +// void all_reduce( +// at::Tensor& input, +// at::Tensor& output, +// ccl::datatype datatype, +// ccl::reduction op, +// const stream_list& streams, +// const comm_list& user_comms) { +// #ifdef USE_XCCL +// using namespace torch::cuda::nccl::detail; +// check_inputs(inputs, outputs, 1, 1); +// const auto len = inputs.size(); + +// auto data_type = to_nccl_data_type(inputs[0]); + +// const auto count = inputs[0].numel(); +// auto comms_ref = user_comms.empty() ? get_communicators(inputs) +// : ArrayRef(user_comms); + +// AutoNcclGroup nccl_group_guard; +// at::cuda::OptionalCUDAGuard device_guard; +// for (const auto i : c10::irange(len)) { +// auto device = inputs[i].device().index(); +// device_guard.set_index(device); +// // Default to the current stream +// const auto stream = (streams.empty() || !streams[i]) +// ? 
at::cuda::getCurrentCUDAStream(device).stream() +// : streams[i]->stream(); + +// ncclComm_t comm = comms_ref[i]; +// NCCL_CHECK(ncclAllReduce( +// inputs[i].data_ptr(), +// outputs[i].data_ptr(), +// count, +// data_type, +// to_nccl_red_op(op), +// to_nccl_comm(comm), +// stream)); +// } +// #else +// AT_ERROR("PyTorch built without NCCL support"); +// #endif +// } + +} // namespace torch::xpu::xccl diff --git a/torch/csrc/xpu/xccl.h b/torch/csrc/xpu/xccl.h index d844f166ec5ab1..f0f2b57a1dc9f7 100644 --- a/torch/csrc/xpu/xccl.h +++ b/torch/csrc/xpu/xccl.h @@ -12,47 +12,11 @@ using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; -ccl::shared_ptr_class kvs; -std::vector kvs_addr; +XCCL_KVS kvs; -XCCL_KVS get_kvs(int rank, c10d::Store& store) -class Comms { -public: +XCCL_KVS get_kvs(int rank, c10d::Store& store); - explicit Comms(ccl::vector_class &comms) : - comms(std::move(comms)), streams{} {} - - explicit Comms(ccl::vector_class &comms, ccl::vector_class &streams, std::vector &torch_streams) : - comms(std::move(comms)), streams(std::move(streams)), torch_streams(std::move(torch_streams)) {} - - ~Comms() noexcept(false) {} - - Comms() = delete; - - Comms(const Comms &) = delete; - - Comms &operator=(const Comms &) = delete; - - Comms(Comms &&other) : comms(std::move(other.comms)), streams(std::move(other.streams)), - torch_streams(std::move(other.torch_streams)) {} - - Comms &operator=(Comms &&other) { - std::swap(comms, other.comms); - std::swap(streams, other.streams); - std::swap(torch_streams, other.torch_streams); - return *this; - } - -public: - // The Communicators used by XCCL - ccl::vector_class comms; - // The streams used by XCCL - ccl::vector_class streams; - // one to one mapping the torch streams to the ccl::stream. 
- std::vector torch_streams; -}; - -enum class xcclRedOp { Sum = 0, Prod = 1, Max = 2, Min = 3}; +enum class xcclRedOp { Sum = 0, Prod = 1, Max = 2, Min = 3 }; enum class xcclDataType { Int8 = 0, @@ -75,14 +39,13 @@ enum class xcclDataType { namespace detail { - at::ArrayRef get_communicators( - at::TensorList inputs); - void check_inputs( +at::ArrayRef get_communicators(at::TensorList inputs); +void check_inputs( at::TensorList inputs, at::TensorList outputs, int input_multiplier, int output_multiplier); - void check_inputs( +void check_inputs( at::TensorList inputs, const at::Tensor& output, int root, @@ -94,13 +57,13 @@ namespace detail { using comm_list = std::vector; using stream_list = std::vector>; - std::uint64_t version(); - const char* version_suffix(); +std::uint64_t version(); +const char* version_suffix(); bool is_available(at::TensorList tensors); comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank); - void comm_destroy(ncclComm_t comm); +void comm_destroy(ncclComm_t comm); void all_reduce( const std::vector& inputs, @@ -109,4 +72,3 @@ void all_reduce( const stream_list& streams = {}, const comm_list& user_comms = {}); } // namespace torch::xpu::xccl - From ba6c4b7f4d14bc7e8345125215246c5a5153b520 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 30 Aug 2024 08:11:14 +0000 Subject: [PATCH 03/96] update --- caffe2/CMakeLists.txt | 8 ++++---- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 3 --- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 2c4da5fd50f10c..28e7d0c96ba877 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,6 +1014,10 @@ elseif(USE_CUDA) endif() if(USE_XPU) + if(USE_XCCL) + list(APPEND Caffe2_XPU_SRCS + ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) + endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) @@ -1057,10 +1061,6 @@ if(USE_XPU) # 2. Using add_custom_command in torch-xpu-ops to define sycl device sources # compilation. add_custom_command requires an explicit dependency. 
list(APPEND ${Caffe2_XPU_INCLUDE} ${TORCH_XPU_OPS_DIR}/src/ATen/) - # if(USE_XCCL) - # list(APPEND Caffe2_GPU_SRCS - # ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) - # endif() set(TORCH_XPU_OPS_PYTORCH_DEPS ATEN_CPU_FILES_GEN_TARGET) add_subdirectory(${TORCH_ROOT}/third_party/torch-xpu-ops diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 3325691c3a8531..d901259f400c5e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -11,15 +11,12 @@ #include #include -#include -#include #include #include #include #include #include #include -#include #include #include #include diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 51801ed992edcc..c7e17b491ffce6 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -10,7 +10,7 @@ #ifdef USE_C10D_XCCL #include -#include +#include #include #include #include From 68a6aeecd90e3cf9f993a9df236121cd223102c8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 30 Aug 2024 09:07:54 +0000 Subject: [PATCH 04/96] update --- torch/csrc/xpu/xccl.cpp | 159 +++++++++++++++++++--------------------- torch/csrc/xpu/xccl.h | 23 +++--- 2 files changed, 89 insertions(+), 93 deletions(-) diff --git a/torch/csrc/xpu/xccl.cpp b/torch/csrc/xpu/xccl.cpp index 747c5bf3eb1103..6224b19254dbfe 100644 --- a/torch/csrc/xpu/xccl.cpp +++ b/torch/csrc/xpu/xccl.cpp @@ -5,22 +5,15 @@ #include #include -#include +#include #include #include #include #include -xcclComm_t* to_xccl_comm(torch::xpu::xccl::xcclComm_t* var) { - return reinterpret_cast(var); -} - -xcclComm_t to_xccl_comm(torch::xpu::xccl::xcclComm_t var) { - return reinterpret_cast(var); -} -ccl::datatype to_nccl_data_type(c10::ScalarType type) { +ccl::datatype to_xccl_data_type(c10::ScalarType type) { switch (type) { case at::kFloat: return ccl::datatype::float32; @@ -45,7 +38,7 @@ ccl::datatype to_nccl_data_type(c10::ScalarType type) { } } -ncclDataType_t to_xccl_data_type(const at::Tensor& t) { +ccl::datatype to_xccl_data_type(const at::Tensor& t) { if (!t.is_xpu()) { TORCH_CHECK( false, @@ -61,11 +54,13 @@ ccl::reduction to_xccl_red_op(int var) { namespace torch::xpu::xccl { +XCCL_KVS kvs; +std::mutex kvs_mutex; + XCCL_KVS get_kvs(int rank, c10d::Store& store) { + std::lock_guard lock(kvs_mutex); if (kvs) return kvs; - // Each process group is with different store, so we use the unique key for - // broadcast the bootstrap network information. 
std::string storeKey = "ccl_kvs"; // Rank 0 broadcast the bootstrap network information to other ranks @@ -82,9 +77,9 @@ XCCL_KVS get_kvs(int rank, c10d::Store& store) { } ccl::kvs::address_type main_addr; std::copy_n( - std::make_move_iterator(ccl_kvs_addr.begin()), - ccl::kvs::address_max_size, - main_addr.begin()); + ccl_kvs_addr.begin(), + ccl::kvs::address_max_size, + main_addr.begin()); kvs = ccl::create_kvs(main_addr); } @@ -190,82 +185,82 @@ static inline void check_tensor( } } -void check_inputs( - TensorList inputs, - TensorList outputs, - int input_multiplier, - int output_multiplier) { - // len(inputs) == len(outputs) - size_t len = inputs.size(); +// void check_inputs( +// TensorList inputs, +// TensorList outputs, +// int input_multiplier, +// int output_multiplier) { +// // len(inputs) == len(outputs) +// size_t len = inputs.size(); - if (len <= 0) { - throw std::runtime_error("input sequence can't be empty"); - } +// if (len <= 0) { +// throw std::runtime_error("input sequence can't be empty"); +// } - if (len != outputs.size()) { - std::stringstream err; - err << "inputs and outputs sequences have to be of the same length, but got input of length " - << len << " and output of length " << outputs.size(); - throw std::runtime_error(err.str()); - } +// if (len != outputs.size()) { +// std::stringstream err; +// err << "inputs and outputs sequences have to be of the same length, but got input of length " +// << len << " and output of length " << outputs.size(); +// throw std::runtime_error(err.str()); +// } - device_set devices; - int64_t numel = inputs[0].numel(); - auto dtype = inputs[0].scalar_type(); +// device_set devices; +// int64_t numel = inputs[0].numel(); +// auto dtype = inputs[0].scalar_type(); - for (const auto i : c10::irange(len)) { - auto input = inputs[i]; - auto output = outputs[i]; +// for (const auto i : c10::irange(len)) { +// auto input = inputs[i]; +// auto output = outputs[i]; - check_tensor( - input, output, input_multiplier, output_multiplier, numel, dtype); +// check_tensor( +// input, output, input_multiplier, output_multiplier, numel, dtype); - auto input_device = input.get_device(); - // inputs must be on unique devices - if (devices.test(input_device)) { - throw std::runtime_error("inputs must be on unique devices"); - } - devices.set(input_device); - } -} +// auto input_device = input.get_device(); +// // inputs must be on unique devices +// if (devices.test(input_device)) { +// throw std::runtime_error("inputs must be on unique devices"); +// } +// devices.set(input_device); +// } +// } -void check_inputs( - TensorList inputs, - const at::Tensor& output, - int root, - int input_multiplier, - int output_multiplier) { - auto len = inputs.size(); +// void check_inputs( +// TensorList inputs, +// const at::Tensor& output, +// int root, +// int input_multiplier, +// int output_multiplier) { +// auto len = inputs.size(); - if (len <= 0) { - throw std::runtime_error("input sequence can't be empty"); - } +// if (len <= 0) { +// throw std::runtime_error("input sequence can't be empty"); +// } - device_set devices; - int64_t numel = inputs[0].numel(); - auto dtype = inputs[0].scalar_type(); - - for (const auto i : c10::irange(len)) { - auto input = inputs[i]; - - check_tensor( - input, - i == static_cast>(root) - ? 
std::optional{output} - : std::nullopt, - input_multiplier, - output_multiplier, - numel, - dtype); - - auto input_device = input.get_device(); - // inputs must be on unique devices - if (devices.test(input_device)) { - throw std::runtime_error("inputs must be on unique devices"); - } - devices.set(input_device); - } -} +// device_set devices; +// int64_t numel = inputs[0].numel(); +// auto dtype = inputs[0].scalar_type(); + +// for (const auto i : c10::irange(len)) { +// auto input = inputs[i]; + +// check_tensor( +// input, +// i == static_cast>(root) +// ? std::optional{output} +// : std::nullopt, +// input_multiplier, +// output_multiplier, +// numel, +// dtype); + +// auto input_device = input.get_device(); +// // inputs must be on unique devices +// if (devices.test(input_device)) { +// throw std::runtime_error("inputs must be on unique devices"); +// } +// devices.set(input_device); +// } +// } } // namespace detail diff --git a/torch/csrc/xpu/xccl.h b/torch/csrc/xpu/xccl.h index f0f2b57a1dc9f7..31fc594e71cc0b 100644 --- a/torch/csrc/xpu/xccl.h +++ b/torch/csrc/xpu/xccl.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace torch::xpu::xccl { @@ -12,7 +13,7 @@ using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; -XCCL_KVS kvs; +extern XCCL_KVS kvs; XCCL_KVS get_kvs(int rank, c10d::Store& store); @@ -54,21 +55,21 @@ void check_inputs( } // namespace detail -using comm_list = std::vector; -using stream_list = std::vector>; +// using comm_list = std::vector; +// using stream_list = std::vector>; std::uint64_t version(); const char* version_suffix(); bool is_available(at::TensorList tensors); -comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank); -void comm_destroy(ncclComm_t comm); +// comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank); +// void comm_destroy(xcclComm_t comm); -void all_reduce( - const std::vector& inputs, - std::vector& outputs, - int32_t op = static_cast(xcclRedOp::Sum), - const stream_list& streams = {}, - const comm_list& user_comms = {}); +// void all_reduce( +// const std::vector& inputs, +// std::vector& outputs, +// int32_t op = static_cast(xcclRedOp::Sum), +// const stream_list& streams = {}, +// const comm_list& user_comms = {}); } // namespace torch::xpu::xccl From 31eeee95ff611e8c8ab88dce0b096e45db808f41 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 2 Sep 2024 03:30:54 +0000 Subject: [PATCH 05/96] register --- build_variables.bzl | 1 + torch/_C/_distributed_c10d.pyi | 9 ++++++ .../distributed/c10d/ProcessGroupXCCL.cpp | 16 +++++----- .../distributed/c10d/ProcessGroupXCCL.hpp | 29 +++++++------------ torch/csrc/distributed/c10d/init.cpp | 22 ++++++++++++++ 5 files changed, 51 insertions(+), 26 deletions(-) diff --git a/build_variables.bzl b/build_variables.bzl index e05c94bd83f577..80a575324aa8b3 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -542,6 +542,7 @@ libtorch_distributed_extra_sources = [ "torch/csrc/distributed/autograd/rpc_messages/rref_backward_req.cpp", "torch/csrc/distributed/autograd/rpc_messages/rref_backward_resp.cpp", "torch/csrc/distributed/c10d/HashStore.cpp", + "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", "torch/csrc/distributed/rpc/agent_utils.cpp", "torch/csrc/distributed/rpc/message.cpp", "torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp", diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 94e8578bbfff62..b2cba6905901f3 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ 
-697,3 +697,12 @@ class ProcessGroupCudaP2P(Backend): storage_offset: Optional[int] = 0, ) -> torch.Tensor: ... def _shutdown(self) -> None: ... + +class ProcessGroupXCC(Backend): + def __init__( + self, + store: Store, + rank: int, + size: int, + timeout: timedelta, + ): ... \ No newline at end of file diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index d901259f400c5e..1fe069575c7143 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -70,7 +70,7 @@ ccl::datatype getXcclDataType(at::ScalarType type) { namespace { static std::mutex xcclCommDevIdxMapMutex; -static std::unordered_map, int> xcclCommDevIdxMap; +static std::unordered_map, int> xcclCommDevIdxMap; template < template @@ -118,7 +118,7 @@ c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( ProcessGroupXCCL::~ProcessGroupXCCL() {} -std::shared_ptr ProcessGroupXCCL::getXCCLComm( +std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, at::Device& device) { if (deviceKey.empty()) { @@ -135,7 +135,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( } } - std::shared_ptr xcclComm; + std::shared_ptr xcclComm_t; XCCL_KVS kvs = get_kvs(rank_, store_); @@ -149,11 +149,11 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( auto q = get_sycl_queue(stream); auto ctx = ccl::create_context(q.get_context()); devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - xcclComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + xcclComm_t = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); { std::lock_guard lock(mutex_); - inInitializationCommMap_.emplace(deviceKey, xcclComm); + inInitializationCommMap_.emplace(deviceKey, xcclComm_t); } auto it = inInitializationCommMap_.find(deviceKey); @@ -162,7 +162,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( inInitializationCommMap_.erase(deviceKey); xcclCommDevIdxMapMutex.lock(); - xcclCommDevIdxMap.emplace(xcclComm, device.index()); + xcclCommDevIdxMap.emplace(xcclComm_t, device.index()); xcclCommDevIdxMapMutex.unlock(); } @@ -187,7 +187,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto device = input.device(); const auto key = std::to_string(device.index()); - auto xcclComm = getXCCLComm(key, device); + auto xcclComm_t = getXCCLComm(key, device); std::vector inputs{input}; std::vector outputs{output}; @@ -198,7 +198,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( // enqueue); work = make_work_ccl( - inputs, outputs, fn, xcclComm, attr, rank_, op_type); + inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); // pre(ncclStream, work); // ncclComm_t comm = ncclComm->getNcclComm(); // post(ncclStream, work); diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index c7e17b491ffce6..6c4a40f0a3ee77 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -25,14 +25,13 @@ #include #include -#include #include #include -#include namespace c10d { constexpr const char* XCCL_BACKEND_NAME = "xccl"; +using namespace torch::xpu::xccl; class ProcessGroupXCCL : public Backend { public: @@ -41,14 +40,11 @@ class ProcessGroupXCCL : public Backend { WorkXCCL( std::vector> outputTensors, int rank = -1, - OpType opType = UNKNOWN, + OpType opType = OpType::UNKNOWN, const c10::optional>& inputTensors = c10::nullopt) : Work(rank, opType), outputTensors_(std::move(outputTensors)) {} - WorkXCCL(const 
WorkXCCL& w) - : outputTensors_(w.outputTensors_), events_(w.events_) {} - ~WorkXCCL() override { // Ensures all events are properly handled before destruction for (auto& event : events_) { @@ -57,7 +53,7 @@ class ProcessGroupXCCL : public Backend { } bool isCompleted() override { - for (const auto& event : events_) { + for (auto& event : events_) { if (!event.test()) { return false; } @@ -80,14 +76,15 @@ class ProcessGroupXCCL : public Backend { } } - void wait() override { - std::lock_guard lock(mutex_); + void wait() { + std::unique_lock lock(mutex_); for (auto& event : events_) { - CCL_CHECK(event.wait()); + event.wait(); } events_.clear(); } + c10::intrusive_ptr getFuture() override { TORCH_CHECK( false, "ProcessGroupXCCL::WorkXCCL::getFuture not implemented"); @@ -109,9 +106,9 @@ class ProcessGroupXCCL : public Backend { const c10::intrusive_ptr& store, int rank, int size) - : store_(store), rank_(rank), size_(size) {} + : store_(store), Backend(rank, size) {} - ProcessGroupXCCL::~ProcessGroupXCCL() = default; + ~ProcessGroupXCCL() = default; const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); @@ -129,14 +126,10 @@ class ProcessGroupXCCL : public Backend { int rank = -1, int size = -1); - private: - int rank_; - int size_; - public: - std::unordered_map> + std::unordered_map> inInitializationCommMap_; - std::unordered_map> devXCCLCommMap_; + std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; }; diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index c8f9dff37f06e2..e12e96f9fe882f 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -37,6 +37,11 @@ #include #endif +#ifdef USE_C10D_XCCL +#include +#endif + + #include #include #include @@ -2877,6 +2882,23 @@ Example:: py::call_guard()); #endif +#ifdef USE_C10D_XCCL + auto processGroupXCCL = + intrusive_ptr_no_gil_destructor_class_<::c10d::ProcessGroupXCCL>( + module, "ProcessGroupXCCL", backend) + .def( + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, + int rank, + int size) { + return c10::make_intrusive<::c10d::ProcessGroupXCCL>( + store, rank, size); + }), + py::arg("store"), + py::arg("rank"), + py::arg("size"), + py::call_guard()); +#endif + py::enum_<::c10d::OpType>(module, "OpType") .value("BROADCAST", ::c10d::OpType::BROADCAST) .value("ALLREDUCE", ::c10d::OpType::ALLREDUCE) From b977abcad2464bff802df68318ea658014cad63e Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 2 Sep 2024 06:58:33 +0000 Subject: [PATCH 06/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 33 +++++++++---------- .../distributed/c10d/ProcessGroupXCCL.hpp | 10 +++--- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 1fe069575c7143..12f33316c08f86 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -65,10 +65,6 @@ ccl::datatype getXcclDataType(at::ScalarType type) { return it->second; } -} // namespace c10d - -namespace { - static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; @@ -116,7 +112,13 @@ c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( return c10::make_intrusive(store, rank, size); } -ProcessGroupXCCL::~ProcessGroupXCCL() {} +ProcessGroupXCCL::ProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank, + int size) + : Backend(rank, size), 
store_(store) {} + +ProcessGroupXCCL::~ProcessGroupXCCL() = default; std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, @@ -135,7 +137,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( } } - std::shared_ptr xcclComm_t; + std::shared_ptr XCCLComm; XCCL_KVS kvs = get_kvs(rank_, store_); @@ -149,11 +151,11 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( auto q = get_sycl_queue(stream); auto ctx = ccl::create_context(q.get_context()); devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - xcclComm_t = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); { std::lock_guard lock(mutex_); - inInitializationCommMap_.emplace(deviceKey, xcclComm_t); + inInitializationCommMap_.emplace(deviceKey, XCCLComm); } auto it = inInitializationCommMap_.find(deviceKey); @@ -162,7 +164,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( inInitializationCommMap_.erase(deviceKey); xcclCommDevIdxMapMutex.lock(); - xcclCommDevIdxMap.emplace(xcclComm_t, device.index()); + xcclCommDevIdxMap.emplace(XCCLComm, device.index()); xcclCommDevIdxMapMutex.unlock(); } @@ -193,15 +195,9 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( std::vector outputs{output}; c10::intrusive_ptr work; - // work = - // initWork(device, rank_, opType, profilingTitle, inputs, outputs, - // enqueue); work = make_work_ccl( inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); - // pre(ncclStream, work); - // ncclComm_t comm = ncclComm->getNcclComm(); - // post(ncclStream, work); return work; } @@ -255,6 +251,9 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( OpType::ALLREDUCE); } -} // namespace +// c10::intrusive_ptr barrier( +// const BarrierOptions& opts = BarrierOptions()) override; + +} // namespace c10d -#endif // USE_C10D_XCCL \ No newline at end of file +#endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6c4a40f0a3ee77..7e59180eb9b57e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -83,11 +83,9 @@ class ProcessGroupXCCL : public Backend { } events_.clear(); } - - + c10::intrusive_ptr getFuture() override { - TORCH_CHECK( - false, "ProcessGroupXCCL::WorkXCCL::getFuture not implemented"); + return future_; } std::vector result() override { @@ -118,8 +116,8 @@ class ProcessGroupXCCL : public Backend { std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - c10::intrusive_ptr barrier( - const BarrierOptions& opts = BarrierOptions()) override; + // c10::intrusive_ptr barrier( + // const BarrierOptions& opts = BarrierOptions()) override; static c10::intrusive_ptr createProcessGroupXCCL( const c10::intrusive_ptr& store, From 486b61a9f78bdc530da1185bdd4023098e987f78 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 3 Sep 2024 01:41:04 +0000 Subject: [PATCH 07/96] update --- torch/_C/_distributed_c10d.pyi | 3 +-- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 15 ++++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index b2cba6905901f3..0c97185519d28f 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -698,11 +698,10 @@ class ProcessGroupCudaP2P(Backend): ) -> torch.Tensor: ... def _shutdown(self) -> None: ... 
-class ProcessGroupXCC(Backend): +class ProcessGroupXCCL(Backend): def __init__( self, store: Store, rank: int, size: int, - timeout: timedelta, ): ... \ No newline at end of file diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 7e59180eb9b57e..0c5f4fa5aeccf7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -76,13 +76,14 @@ class ProcessGroupXCCL : public Backend { } } - void wait() { - std::unique_lock lock(mutex_); - for (auto& event : events_) { - event.wait(); - } - events_.clear(); - } + bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; + // void wait() { + // std::unique_lock lock(mutex_); + // for (auto& event : events_) { + // event.wait(); + // } + // events_.clear(); + // } c10::intrusive_ptr getFuture() override { return future_; From 6844932aeb7b4c8aff6e0d4bac5bf32ede5e0a5b Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 3 Sep 2024 02:25:24 +0000 Subject: [PATCH 08/96] fix typo and register frontend --- cmake/Dependencies.cmake | 2 +- cmake/Modules/FindXCCL.cmake | 2 +- torch/distributed/distributed_c10d.py | 29 ++++++++++++++++++++++++--- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 49fb525afbf8a8..cb204eada5f689 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1154,7 +1154,7 @@ endif() if(USE_XCCL) if(NOT USE_XPU) message(WARNING - "Not using XPU, so disabling USE_NUSE_XCCLCCL. Suppress this warning with " + "Not using XPU, so disabling USE_XCCL. Suppress this warning with " "-DUSE_XCCL=OFF.") caffe2_update_option(USE_XCCL OFF) elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Linux") diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake index 3f30e8cd23d6e7..56b7fc0f7dcf32 100644 --- a/cmake/Modules/FindXCCL.cmake +++ b/cmake/Modules/FindXCCL.cmake @@ -27,7 +27,7 @@ find_file( NO_DEFAULT_PATH ) -# Find include/sycl path from include path. +# Find include/oneapi path from include path. find_file( XCCL_INCLUDE_ONEAPI_DIR NAMES oneapi diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 45e096985143a3..d178f976c5682d 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -130,6 +130,7 @@ _NCCL_AVAILABLE = True _GLOO_AVAILABLE = True _UCC_AVAILABLE = True +_XCCL_AVAILABLE = True _pickler = pickle.Pickler _unpickler = pickle.Unpickler @@ -193,6 +194,14 @@ def _export_c_types() -> None: except ImportError: _UCC_AVAILABLE = False +try: + from torch._C._distributed_c10d import ProcessGroupXCCL + + ProcessGroupXCCL.__module__ = "torch.distributed.distributed_c10d" + __all__ += ["ProcessGroupXCCL"] +except ImportError: + _XCCL_AVAILABLE = False + logger = logging.getLogger(__name__) PG_WRAPPER_STORE_PREFIX = "pg_wrapper" @@ -222,7 +231,7 @@ class Backend(str): """ An enum-like class for backends. - Available backends: GLOO, NCCL, UCC, MPI, and other registered backends. + Available backends: GLOO, NCCL, UCC, MPI, XCCL, and other registered backends. The values of this class are lowercase strings, e.g., ``"gloo"``. They can be accessed as attributes, e.g., ``Backend.NCCL``. 
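For orientation, a minimal usage sketch of the frontend path this patch wires up; it is not part of the diff itself. It assumes a build with USE_XCCL/USE_C10D_XCCL enabled, an XPU-capable runtime, and that the "xccl" backend string resolves to ProcessGroupXCCL once the registration below lands; the rendezvous values and per-rank device mapping are placeholders.

import os
import torch
import torch.distributed as dist

# Placeholder single-node rendezvous; a real launcher such as torchrun sets these.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])

# "xccl" is the backend string this patch registers for XPU devices.
dist.init_process_group(backend="xccl", rank=rank, world_size=world_size)

t = torch.ones(16, device=f"xpu:{rank}")   # assumes one XPU per rank
dist.all_reduce(t)   # allreduce is the first collective ProcessGroupXCCL implements
dist.destroy_process_group()
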
@@ -242,6 +251,7 @@ class Backend(str): NCCL = "nccl" UCC = "ucc" MPI = "mpi" + XCCL = "XCCL" _BackendPlugin = namedtuple("_BackendPlugin", ["creator_fn", "extended_api"]) @@ -1097,6 +1107,9 @@ def is_ucc_available() -> bool: """Check if the UCC backend is available.""" return _UCC_AVAILABLE +def is_xccl_available() -> bool: + """Check if the XCCL backend is available.""" + return _XCCL_AVAILABLE def is_backend_available(backend: str) -> bool: """ @@ -1385,7 +1398,7 @@ def init_process_group( Args: backend (str or Backend, optional): The backend to use. Depending on - build-time configurations, valid values include ``mpi``, ``gloo``, + build-time configurations, valid values include ``mpi``, ``gloo``, ``xccl``, ``nccl``, and ``ucc``. If the backend is not provided, then both a ``gloo`` and ``nccl`` backend will be created, see notes below for how multiple backends are managed. This field can be given as a lowercase string @@ -1762,7 +1775,6 @@ def _new_process_group_helper( pg_options = ProcessGroupNCCL.Options() pg_options.is_high_priority_stream = False pg_options._timeout = timeout - if split_from: pg_options.split_from = split_from pg_options.split_color = _process_group_color(global_ranks_in_group) @@ -1781,6 +1793,17 @@ def _new_process_group_helper( backend_prefix_store, group_rank, group_size, timeout=timeout ) backend_type = ProcessGroup.BackendType.UCC + elif backend_str == Backend.XCCL: + if not is_xccl_available(): + raise RuntimeError("Distributed package doesn't have XCCL built in") + if pg_options is not None: + assert isinstance( + pg_options, ProcessGroupXCCL.Options + ), "Expected pg_options argument to be of type ProcessGroupXCCL.Options" + backend_class = ProcessGroupXCCL( + backend_prefix_store, group_rank, group_size + ) + backend_type = ProcessGroup.BackendType.XCCL else: assert ( backend_str.upper() in Backend._plugins From 7f6f8b96bb2bba92b8bc4e912e414da65d521f2d Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 3 Sep 2024 02:30:15 +0000 Subject: [PATCH 09/96] update --- torch/distributed/distributed_c10d.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index d178f976c5682d..26cb1cda1db8cb 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -87,6 +87,7 @@ "is_nccl_available", "is_torchelastic_launched", "is_ucc_available", + "is_xccl_available", "isend", "monitored_barrier", "new_group", From be683207a1e745f4973410b63da180f8d2a46578 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 3 Sep 2024 06:03:13 +0000 Subject: [PATCH 10/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 30 ++++++-- .../distributed/c10d/ProcessGroupXCCL.hpp | 74 +++++++++++-------- 2 files changed, 66 insertions(+), 38 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 12f33316c08f86..ef60d0546b0df8 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -53,7 +53,6 @@ void check_gpu_single_tensor(const at::Tensor& tensor) { C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); } } -} // namespace ccl::datatype getXcclDataType(at::ScalarType type) { auto it = xcclDatatypes.find(type); @@ -64,6 +63,9 @@ ccl::datatype getXcclDataType(at::ScalarType type) { type); return it->second; } +} // namespace + + static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; @@ -91,14 +93,26 @@ 
c10::intrusive_ptr make_work_ccl( return ret_ptr; } +// ProcessGroupXCCL::WorkXCCL::WorkXCCL( +// std::vector> outputTensors, +// int rank, +// c10d::OpType opType, +// const c10::optional>& inputTensors) +// : Work(rank, opType, nullptr, inputTensors), +// outputTensors_(std::move(outputTensors)), +// future_(createFutureAsOutput(outputTensors)) {} + ProcessGroupXCCL::WorkXCCL::WorkXCCL( - std::vector> outputTensors, + at::Device& device, int rank, - c10d::OpType opType, - const c10::optional>& inputTensors) - : Work(rank, opType, nullptr, inputTensors), - outputTensors_(std::move(outputTensors)), - future_(createFutureAsOutput(outputTensors)) {} + OpType opType, + const std::optional>& inputs) + : Work(rank, opType, "profilingTitle", inputs), device_(device) {} + +ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) + : Work(w.rank_, w.opType_), device_(w.device_) {} + +ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL:: getFuture() { @@ -198,7 +212,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work = make_work_ccl( inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); - + // work->events_.emplace_back(fn); return work; } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 0c5f4fa5aeccf7..0b3a50a4c1fffd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -38,27 +38,34 @@ class ProcessGroupXCCL : public Backend { class WorkXCCL : public Work { public: WorkXCCL( - std::vector> outputTensors, - int rank = -1, - OpType opType = OpType::UNKNOWN, - const c10::optional>& inputTensors = - c10::nullopt) - : Work(rank, opType), outputTensors_(std::move(outputTensors)) {} - - ~WorkXCCL() override { - // Ensures all events are properly handled before destruction - for (auto& event : events_) { - event.wait(); - } - } - + at::Device& device, + int rank, + OpType opType, + const std::optional>& inputs = std::nullopt); + // WorkXCCL( + // std::vector> outputTensors, + // int rank = -1, + // OpType opType = OpType::UNKNOWN, + // const c10::optional>& inputTensors = + // c10::nullopt) + // : Work(rank, opType), outputTensors_(std::move(outputTensors)) {} + WorkXCCL(const WorkXCCL& w); + // ~WorkXCCL() override { + // // Ensures all events are properly handled before destruction + // for (auto& event : events_) { + // event.wait(); + // } + // } + ~WorkXCCL() override; bool isCompleted() override { - for (auto& event : events_) { - if (!event.test()) { - return false; - } - } - return true; + TORCH_CHECK( + false, "ProcessGroupXCCL::WorkXCCL::isCompleted not implemented"); + // for (auto& event : events_) { + // if (!event.test()) { + // return false; + // } + // } + // return true; } bool isSuccess() const override { @@ -71,9 +78,11 @@ class ProcessGroupXCCL : public Backend { } void synchronize() override { - for (auto& event : events_) { - event.wait(); - } + TORCH_CHECK( + false, "ProcessGroupXCCL::WorkXCCL::synchronize not implemented"); + // for (auto& event : events_) { + // event.wait(); + // } } bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; @@ -84,28 +93,33 @@ class ProcessGroupXCCL : public Backend { // } // events_.clear(); // } - + c10::intrusive_ptr getFuture() override { return future_; } std::vector result() override { - return outputTensors_.empty() ? 
std::vector() - : outputTensors_[0]; + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); + // return outputTensors_.empty() ? std::vector() + // : outputTensors_[0]; } protected: - friend class ProcessGroupXCCL; - std::vector events_; - const std::vector> outputTensors_; + at::Device device_; + // std::vector events_; + // std::shared_ptr xcclComm_; + // const std::vector> outputTensors_; + private: + std::shared_ptr> outputs_; c10::intrusive_ptr future_; + friend class ProcessGroupXCCL; }; explicit ProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, int size) - : store_(store), Backend(rank, size) {} + : store_(store), Backend(rank, size) {} ~ProcessGroupXCCL() = default; From 2e21d4f21803175ef7e697d1a22fced2777deb39 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 3 Sep 2024 08:50:16 +0000 Subject: [PATCH 11/96] update --- torch/csrc/distributed/c10d/Ops.cpp | 1 + .../distributed/c10d/ProcessGroupXCCL.cpp | 107 ++++++++++++------ .../distributed/c10d/ProcessGroupXCCL.hpp | 14 +-- 3 files changed, 81 insertions(+), 41 deletions(-) diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index ae822ad3975049..03a5e42874594e 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -181,6 +181,7 @@ IMPL_BROADCAST(PrivateUse1) IMPL_ALLREDUCE(CPU) IMPL_ALLREDUCE(CUDA) +IMPL_ALLREDUCE(XPU) IMPL_ALLREDUCE(PrivateUse1) #define IMPL_ALLREDUCE_COALESCED(DEV) \ diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ef60d0546b0df8..5e2e179d32af37 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -45,7 +45,7 @@ std::map xcclDatatypes = { {at::kBool, ccl::datatype::uint8}, }; -void check_gpu_single_tensor(const at::Tensor& tensor) { +void check_xpu_single_tensor(const at::Tensor& tensor) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); } @@ -65,33 +65,31 @@ ccl::datatype getXcclDataType(at::ScalarType type) { } } // namespace - - static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; -template < - template - class WorkXCCL, - typename RunF, - typename CommType, - typename InputType, - typename OutputType, - typename attr_t> -c10::intrusive_ptr make_work_ccl( - const std::vector& inputs, - const std::vector& outputs, - RunF f, - CommType& comms, - attr_t& attr, - int rank, - c10d::OpType op_type) { - c10::intrusive_ptr> - ret_ptr = c10::make_intrusive< - WorkCCL>( - inputs, outputs, f, comms, attr, rank, op_type); - return ret_ptr; -} +// template < +// template +// class WorkXCCL, +// typename RunF, +// typename CommType, +// typename InputType, +// typename OutputType, +// typename attr_t> +// c10::intrusive_ptr make_work_ccl( +// const std::vector& inputs, +// const std::vector& outputs, +// RunF f, +// CommType& comms, +// attr_t& attr, +// int rank, +// c10d::OpType op_type) { +// c10::intrusive_ptr> +// ret_ptr = c10::make_intrusive< +// WorkCCL>( +// inputs, outputs, f, comms, attr, rank, op_type); +// return ret_ptr; +// } // ProcessGroupXCCL::WorkXCCL::WorkXCCL( // std::vector> outputTensors, @@ -107,10 +105,14 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( int rank, OpType opType, const std::optional>& inputs) - : Work(rank, opType, "profilingTitle", inputs), device_(device) {} + : Work(rank, opType, "profilingTitle", inputs), device_(device) { + unsigned char 
enable_timing = 0; + xcclEndEvent_ = std::make_shared(enable_timing); +} ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) - : Work(w.rank_, w.opType_), device_(w.device_) {} + : Work(w.rank_, w.opType_), device_(w.device_), + xcclEndEvent_(w.xcclEndEvent_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -119,6 +121,12 @@ c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL:: return future_; } +void ProcessGroupXCCL::WorkXCCL::synchronize() { + auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); + // Block the current stream on the XCCL stream + xcclEndEvent_->block(currentStream); +} + c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, @@ -134,6 +142,20 @@ ProcessGroupXCCL::ProcessGroupXCCL( ProcessGroupXCCL::~ProcessGroupXCCL() = default; +c10::intrusive_ptr ProcessGroupXCCL::initWork( + at::Device& device, + int rank, + OpType opType, + const std::vector& inputs, + const std::vector& outputs) { + auto r = c10::make_intrusive( + device, + rank, + opType, + std::optional>(inputs)); + return r; +} + std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, at::Device& device) { @@ -162,7 +184,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( ccl::vector_class> devs_rank; c10::impl::VirtualGuardImpl impl(device.type()); c10::Stream stream = impl.getStream(device); - auto q = get_sycl_queue(stream); + auto q = at::xpu::XPUStream(stream).queue(); auto ctx = ccl::create_context(q.get_context()); devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); @@ -172,6 +194,8 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( inInitializationCommMap_.emplace(deviceKey, XCCLComm); } + xcclStreams_.emplace(deviceKey, std::move(stream)); + auto it = inInitializationCommMap_.find(deviceKey); if (it != inInitializationCommMap_.end()) { devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); @@ -203,21 +227,38 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto device = input.device(); const auto key = std::to_string(device.index()); - auto xcclComm_t = getXCCLComm(key, device); + auto comm = getXCCLComm(key, device); + auto xcclStream = xcclStreams_.at(key); std::vector inputs{input}; std::vector outputs{output}; c10::intrusive_ptr work; - work = make_work_ccl( - inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); + work =initWork(device, rank_, op_type); + // work = make_work_ccl( + // inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); // work->events_.emplace_back(fn); + work->outputs_ = + std::make_shared>(std::move(outputs)); + c10::xpu::XPUCachingAllocator::recordStream( + input.storage().data_ptr(), xcclStream); + + auto ccl_stream = ccl::create_stream(at::xpu::XPUStream(xcclStream).queue()); + fn(input, output, attr, comm, ccl_stream); + + work->xcclEndEvent_->record(xcclStream); + c10::MultiStreamGuard streamGuard(xcclStream); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + work->future_->markCompleted(at::IValue(*work->outputs_)); + return work; } template -c10::intrusive_ptr ProcessGroupNCCL::collective( +c10::intrusive_ptr ProcessGroupXCCL::collective( at::Tensor& input, at::Tensor& output, Fn fn, @@ -237,7 +278,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( TORCH_CHECK( tensors.size() == 1, "Expecting one tensor only but got multiple"); auto tensor = tensors.back(); - check_gpu_single_tensor(tensor); + 
check_xpu_single_tensor(tensor); if (opts.reduceOp == ReduceOp::AVG) { TORCH_CHECK(false, "Cannot use ReduceOp AVG with XPU") } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 0b3a50a4c1fffd..02eddb7acb8ec0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include #include @@ -77,13 +79,7 @@ class ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); } - void synchronize() override { - TORCH_CHECK( - false, "ProcessGroupXCCL::WorkXCCL::synchronize not implemented"); - // for (auto& event : events_) { - // event.wait(); - // } - } + void synchronize() override; bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; // void wait() { @@ -106,6 +102,7 @@ class ProcessGroupXCCL : public Backend { protected: at::Device device_; + std::shared_ptr xcclEndEvent_; // std::vector events_; // std::shared_ptr xcclComm_; // const std::vector> outputTensors_; @@ -121,7 +118,7 @@ class ProcessGroupXCCL : public Backend { int size) : store_(store), Backend(rank, size) {} - ~ProcessGroupXCCL() = default; + ~ProcessGroupXCCL() override; const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); @@ -140,6 +137,7 @@ class ProcessGroupXCCL : public Backend { int size = -1); public: + std::unordered_map xcclStreams_; std::unordered_map> inInitializationCommMap_; std::unordered_map> devXCCLCommMap_; From c9ef78fdc5c8872246c74e5a1949d5a7c94726c5 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 4 Sep 2024 04:25:38 +0000 Subject: [PATCH 12/96] update --- build_variables.bzl | 1 + torch/csrc/distributed/c10d/ProcessGroup.hpp | 1 + .../distributed/c10d/ProcessGroupXCCL.cpp | 50 ++++--------------- .../distributed/c10d/ProcessGroupXCCL.hpp | 15 ++---- 4 files changed, 16 insertions(+), 51 deletions(-) diff --git a/build_variables.bzl b/build_variables.bzl index 80a575324aa8b3..55a3f0023b571f 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -786,6 +786,7 @@ libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [ ] libtorch_python_xpu_sources = [ + "torch/csrc/xpu/xccl.cpp", "torch/csrc/xpu/Event.cpp", "torch/csrc/xpu/Module.cpp", "torch/csrc/xpu/Stream.cpp", diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index acf8c9c354a76b..85142caf0ac7c7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -70,6 +70,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { UCC = 3, MPI = 4, CUSTOM = 5, + XCCL = 6, }; // Not used, set for backwards compatibility and only used for TypeDef in diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 5e2e179d32af37..8be7c6451fcdd0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -3,7 +3,7 @@ #include #include -#ifdef USE_C10D_XCCL +// #ifdef USE_C10D_XCCL #include #include #include @@ -68,38 +68,6 @@ ccl::datatype getXcclDataType(at::ScalarType type) { static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; -// template < -// template -// class WorkXCCL, -// typename RunF, -// typename CommType, -// typename InputType, -// typename OutputType, -// typename attr_t> -// 
c10::intrusive_ptr make_work_ccl( -// const std::vector& inputs, -// const std::vector& outputs, -// RunF f, -// CommType& comms, -// attr_t& attr, -// int rank, -// c10d::OpType op_type) { -// c10::intrusive_ptr> -// ret_ptr = c10::make_intrusive< -// WorkCCL>( -// inputs, outputs, f, comms, attr, rank, op_type); -// return ret_ptr; -// } - -// ProcessGroupXCCL::WorkXCCL::WorkXCCL( -// std::vector> outputTensors, -// int rank, -// c10d::OpType opType, -// const c10::optional>& inputTensors) -// : Work(rank, opType, nullptr, inputTensors), -// outputTensors_(std::move(outputTensors)), -// future_(createFutureAsOutput(outputTensors)) {} - ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, @@ -116,6 +84,11 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; +bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { + synchronize(); + return true; +} + c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL:: getFuture() { return future_; @@ -267,8 +240,10 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( input, output, fn, - [](std::vector&) {}, - [](std::vector&) {}, + [](at::xpu::XPUStream&, + c10::intrusive_ptr& work) {}, + [](at::xpu::XPUStream&, + c10::intrusive_ptr& work) {}, opType); } @@ -306,9 +281,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( OpType::ALLREDUCE); } -// c10::intrusive_ptr barrier( -// const BarrierOptions& opts = BarrierOptions()) override; - } // namespace c10d -#endif // USE_C10D_XCCL +// #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 02eddb7acb8ec0..d14d677205ecbb 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -7,7 +7,7 @@ #include #endif -#ifdef USE_C10D_XCCL +// #ifdef USE_C10D_XCCL #include #include @@ -35,7 +35,7 @@ namespace c10d { constexpr const char* XCCL_BACKEND_NAME = "xccl"; using namespace torch::xpu::xccl; -class ProcessGroupXCCL : public Backend { +class TORCH_XPU_API ProcessGroupXCCL : public Backend { public: class WorkXCCL : public Work { public: @@ -82,13 +82,6 @@ class ProcessGroupXCCL : public Backend { void synchronize() override; bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; - // void wait() { - // std::unique_lock lock(mutex_); - // for (auto& event : events_) { - // event.wait(); - // } - // events_.clear(); - // } c10::intrusive_ptr getFuture() override { return future_; @@ -96,8 +89,6 @@ class ProcessGroupXCCL : public Backend { std::vector result() override { TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); - // return outputTensors_.empty() ? 
std::vector() - // : outputTensors_[0]; } protected: @@ -147,4 +138,4 @@ class ProcessGroupXCCL : public Backend { } // namespace c10d -#endif // USE_C10D_XCCL +// #endif // USE_C10D_XCCL From 076db36d3da015427b53c473484d59a0b5ebcd21 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 4 Sep 2024 07:46:38 +0000 Subject: [PATCH 13/96] update --- build_variables.bzl | 2 +- caffe2/CMakeLists.txt | 5 ++--- cmake/Summary.cmake | 1 + torch/CMakeLists.txt | 5 +++++ torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 2 -- torch/csrc/xpu/xccl.h | 2 ++ 6 files changed, 11 insertions(+), 6 deletions(-) diff --git a/build_variables.bzl b/build_variables.bzl index 55a3f0023b571f..b903a55b17439b 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -542,7 +542,6 @@ libtorch_distributed_extra_sources = [ "torch/csrc/distributed/autograd/rpc_messages/rref_backward_req.cpp", "torch/csrc/distributed/autograd/rpc_messages/rref_backward_resp.cpp", "torch/csrc/distributed/c10d/HashStore.cpp", - "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", "torch/csrc/distributed/rpc/agent_utils.cpp", "torch/csrc/distributed/rpc/message.cpp", "torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp", @@ -787,6 +786,7 @@ libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [ libtorch_python_xpu_sources = [ "torch/csrc/xpu/xccl.cpp", + "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", "torch/csrc/xpu/Event.cpp", "torch/csrc/xpu/Module.cpp", "torch/csrc/xpu/Stream.cpp", diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 28e7d0c96ba877..01d280cb3fc7c4 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1015,13 +1015,12 @@ endif() if(USE_XPU) if(USE_XCCL) - list(APPEND Caffe2_XPU_SRCS - ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) + list(APPEND Caffe2_XPU_SRCS + ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) - # ATen XPU implementation set(TORCH_XPU_OPS_DIR ${TORCH_ROOT}/third_party/torch-xpu-ops) set(TORCH_XPU_OPS_REPO_URL https://github.com/intel/torch-xpu-ops.git) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 0b601cf2a6a329..229ff112ab3187 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -155,6 +155,7 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_ITT : ${USE_ITT}") message(STATUS " USE_XCCL : ${USE_XCCL}") if(${USE_XCCL}) + message(STATUS " USE_C10D_XCCL : ${USE_C10D_XCCL}") message(STATUS " XCCL include path : ${XCCL_INCLUDE_DIR}") message(STATUS " XCCL library : ${XCCL_LIBRARY}") endif() diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 8ab7d7aeb095b6..f50ae4e02c3386 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -165,6 +165,9 @@ if(USE_XPU) append_filelist("libtorch_python_xpu_sources" TORCH_PYTHON_SRCS) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_XPU) + # if(USE_XCCL) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::xpurt) + # endif() endif() if(USE_CUDNN OR USE_ROCM) @@ -419,6 +422,8 @@ endif() target_compile_definitions(torch_python PRIVATE "-DTHP_BUILD_MAIN_LIB") target_link_libraries(torch_python PRIVATE ${TORCH_LIB} ${TORCH_PYTHON_LINK_LIBRARIES}) +target_link_libraries(torch_python PRIVATE torch::xpurt) +target_link_libraries(torch_python PRIVATE c10_xpu) target_compile_definitions(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS}) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp 
b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index d14d677205ecbb..01a5966b811069 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -11,8 +11,6 @@ #include #include -#include -#include #include #include #include diff --git a/torch/csrc/xpu/xccl.h b/torch/csrc/xpu/xccl.h index 31fc594e71cc0b..c7a67975bb286c 100644 --- a/torch/csrc/xpu/xccl.h +++ b/torch/csrc/xpu/xccl.h @@ -6,6 +6,8 @@ #include #include #include +#include +#include namespace torch::xpu::xccl { From 8d739aca40d4f6f59458478093af304c2f327b86 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 4 Sep 2024 10:09:43 +0000 Subject: [PATCH 14/96] update --- build_variables.bzl | 1 - caffe2/CMakeLists.txt | 8 +- .../distributed/c10d/ProcessGroupXCCL.cpp | 61 +-- .../distributed/c10d/ProcessGroupXCCL.hpp | 40 +- torch/csrc/xpu/xccl.cpp | 348 ------------------ torch/csrc/xpu/xccl.h | 77 ---- 6 files changed, 80 insertions(+), 455 deletions(-) delete mode 100644 torch/csrc/xpu/xccl.cpp delete mode 100644 torch/csrc/xpu/xccl.h diff --git a/build_variables.bzl b/build_variables.bzl index b903a55b17439b..cff70d00320b0e 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -785,7 +785,6 @@ libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [ ] libtorch_python_xpu_sources = [ - "torch/csrc/xpu/xccl.cpp", "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", "torch/csrc/xpu/Event.cpp", "torch/csrc/xpu/Module.cpp", diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 01d280cb3fc7c4..55339880a82a37 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,10 +1014,10 @@ elseif(USE_CUDA) endif() if(USE_XPU) - if(USE_XCCL) - list(APPEND Caffe2_XPU_SRCS - ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) - endif() + # if(USE_XCCL) + # list(APPEND Caffe2_XPU_SRCS + # ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) + # endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 8be7c6451fcdd0..ffd566f10f854a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -45,6 +46,36 @@ std::map xcclDatatypes = { {at::kBool, ccl::datatype::uint8}, }; +XCCL_KVS kvs; +std::mutex kvs_mutex; + +XCCL_KVS get_kvs(int rank, c10d::Store& store) { + std::lock_guard lock(kvs_mutex); + if (kvs) + return kvs; + std::string storeKey = "ccl_kvs"; + + // Rank 0 broadcast the bootstrap network information to other ranks + if (rank == 0) { + kvs = ccl::create_main_kvs(); + ccl::kvs::address_type main_addr = kvs->get_address(); + auto ccl_kvs_addr = + std::vector(main_addr.begin(), main_addr.end()); + store.set(storeKey, ccl_kvs_addr); + } else { + auto ccl_kvs_addr = store.get(storeKey); + if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { + throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); + } + ccl::kvs::address_type main_addr; + std::copy_n( + ccl_kvs_addr.begin(), ccl::kvs::address_max_size, main_addr.begin()); + kvs = ccl::create_kvs(main_addr); + } + + return kvs; +} + void check_xpu_single_tensor(const at::Tensor& tensor) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); @@ -89,11 +120,6 @@ bool 
ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { return true; } -c10::intrusive_ptr ProcessGroupXCCL::WorkXCCL:: - getFuture() { - return future_; -} - void ProcessGroupXCCL::WorkXCCL::synchronize() { auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); // Block the current stream on the XCCL stream @@ -107,12 +133,6 @@ c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( return c10::make_intrusive(store, rank, size); } -ProcessGroupXCCL::ProcessGroupXCCL( - const c10::intrusive_ptr& store, - int rank, - int size) - : Backend(rank, size), store_(store) {} - ProcessGroupXCCL::~ProcessGroupXCCL() = default; c10::intrusive_ptr ProcessGroupXCCL::initWork( @@ -148,7 +168,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( std::shared_ptr XCCLComm; - XCCL_KVS kvs = get_kvs(rank_, store_); + XCCL_KVS kvs = get_kvs(rank_, *store_); int numRanks, rank; numRanks = getSize(); @@ -157,7 +177,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( ccl::vector_class> devs_rank; c10::impl::VirtualGuardImpl impl(device.type()); c10::Stream stream = impl.getStream(device); - auto q = at::xpu::XPUStream(stream).queue(); + sycl::queue& q = c10::xpu::XPUStream(stream).queue(); auto ctx = ccl::create_context(q.get_context()); devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); @@ -208,20 +228,20 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( c10::intrusive_ptr work; - work =initWork(device, rank_, op_type); - // work = make_work_ccl( - // inputs, outputs, fn, xcclComm_t, attr, rank_, op_type); - // work->events_.emplace_back(fn); + work = initWork(device, rank_, opType); + work->outputs_ = std::make_shared>(std::move(outputs)); c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), xcclStream); - auto ccl_stream = ccl::create_stream(at::xpu::XPUStream(xcclStream).queue()); + auto ccl_stream = ccl::create_stream(xcclStream.queue()); fn(input, output, attr, comm, ccl_stream); work->xcclEndEvent_->record(xcclStream); - c10::MultiStreamGuard streamGuard(xcclStream); + + std::vector streams = {xcclStream.unwrap()}; + c10::MultiStreamGuard streamGuard(streams); std::vector devices{device}; work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); @@ -266,13 +286,12 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( xcclComm_t comm, ccl::stream& stream) { ccl::event ret_evt; - ccl::datatype datatype = getXcclDataType(input.scalar_type()); ret_evt = ccl::allreduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), getXcclDataType(input.scalar_type()), - xcclOp.at(opts.reduceOp), + xcclOps.at(opts.reduceOp), comm, stream, attr); diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 01a5966b811069..b43403f52f31ab 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -8,9 +8,11 @@ #endif // #ifdef USE_C10D_XCCL - #include -#include +#include +#include +#include +// #include #include #include #include @@ -24,14 +26,17 @@ #include #include +#include +#include #include #include #include - namespace c10d { +using xcclComm_t = ccl::communicator; +using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; -using namespace torch::xpu::xccl; +// using namespace torch::xpu::xccl; class TORCH_XPU_API ProcessGroupXCCL : public Backend { public: @@ -113,6 +118,33 @@ class TORCH_XPU_API 
ProcessGroupXCCL : public Backend { return std::string(XCCL_BACKEND_NAME); } + std::shared_ptr getXCCLComm( + const std::string& deviceKey, + at::Device& device); + + virtual c10::intrusive_ptr initWork( + at::Device& device, + int rank, + OpType opType, + const std::vector& inputs = {}, + const std::vector& outputs = {}); + + template + c10::intrusive_ptr collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + OpType opType); + + template + c10::intrusive_ptr collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType); + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; diff --git a/torch/csrc/xpu/xccl.cpp b/torch/csrc/xpu/xccl.cpp deleted file mode 100644 index 6224b19254dbfe..00000000000000 --- a/torch/csrc/xpu/xccl.cpp +++ /dev/null @@ -1,348 +0,0 @@ -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - - -ccl::datatype to_xccl_data_type(c10::ScalarType type) { - switch (type) { - case at::kFloat: - return ccl::datatype::float32; - case at::kHalf: - return ccl::datatype::float16; - case at::kDouble: - return ccl::datatype::float64; - case at::kLong: - return ccl::datatype::int64; - case at::kInt: - return ccl::datatype::int32; - case at::kChar: - return ccl::datatype::int8; - case at::kByte: - return ccl::datatype::uint8; - case at::kBool: - return ccl::datatype::uint8; - case at::kBFloat16: - return ccl::datatype::bfloat16; - default: - TORCH_CHECK(false, "Unconvertible XCCL type ", type); - } -} - -ccl::datatype to_xccl_data_type(const at::Tensor& t) { - if (!t.is_xpu()) { - TORCH_CHECK( - false, - "XCCL only supports XPU tensors, but got a tensor on ", - t.device()); - } - return to_xccl_data_type(t.scalar_type()); -} - -ccl::reduction to_xccl_red_op(int var) { - return (ccl::reduction)(var); -} - -namespace torch::xpu::xccl { - -XCCL_KVS kvs; -std::mutex kvs_mutex; - -XCCL_KVS get_kvs(int rank, c10d::Store& store) { - std::lock_guard lock(kvs_mutex); - if (kvs) - return kvs; - std::string storeKey = "ccl_kvs"; - - // Rank 0 broadcast the bootstrap network information to other ranks - if (rank == 0) { - kvs = ccl::create_main_kvs(); - ccl::kvs::address_type main_addr = kvs->get_address(); - auto ccl_kvs_addr = - std::vector(main_addr.begin(), main_addr.end()); - store.set(storeKey, ccl_kvs_addr); - } else { - auto ccl_kvs_addr = store.get(storeKey); - if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { - throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); - } - ccl::kvs::address_type main_addr; - std::copy_n( - ccl_kvs_addr.begin(), - ccl::kvs::address_max_size, - main_addr.begin()); - kvs = ccl::create_kvs(main_addr); - } - - return kvs; -} - -using namespace at; - -namespace detail { - -// void xcclCommInitAll(xcclComm_t* newcomm, int nranks, ncclUniqueId commId, -// int myrank) { -// for(int i = 0; i < nranks; i++) { -// newcomm[i] = ccl::create_communicator(nranks, i, get_kvs_addr) -// } -// c10::Stream dpcpp_stream = impl.getStream(devices[0]); -// ccl::vector_class> devs_rank; -// newcomm = ccl::create_communicators(nranks, devs_rank, ctx, ) -// } - -// struct XcclCommList { -// std::unique_ptr comms; -// int ndevices; -// XcclCommList(const std::vector& devices) -// : comms(new xcclComm_t[devices.size()]), ndevices(devices.size()) { -// xcclCommInitAll( -// to_xccl_comm(comms.get()), devices.size(), devices.data()); -// } -// NcclCommList(NcclCommList&& foo) = default; 
-// ~NcclCommList() { -// if (comms) { -// for (const auto i : c10::irange(ndevices)) { -// comm_destroy(comms[i]); -// } -// } -// } -// ArrayRef ref() const { -// return ArrayRef(comms.get(), ndevices); -// } -// }; - -// using device_list = std::vector; -// // accesses to this object have to be guarded by THC's CudaFreeMutex -// std::unordered_map> _communicators; -// static std::unordered_map> -// _communicators; - -// ArrayRef get_communicators(TensorList inputs) { -// static auto get_device = [](const at::Tensor& t) -> int { -// return t.get_device(); -// }; -// device_list devices = fmap(inputs, get_device); -// auto it = _communicators.find(devices); -// if (it == _communicators.end()) { -// it = _communicators.emplace(devices, devices).first; -// } -// return it->second; -// } - -static inline void check_tensor( - const at::Tensor& input, - const std::optional& output, - int input_multiplier, - int output_multiplier, - int64_t ref_numel, - ScalarType ref_dtype) { - auto check_one = [&](const at::Tensor& tensor) { - if (!tensor.is_xpu() || tensor.is_sparse()) { - throw std::runtime_error( - "input and output elements have to be xpu dense Tensors"); - } - - if (ref_dtype != tensor.scalar_type()) { - throw std::runtime_error( - "all inputs and outputs must be of the same Tensor dtype"); - } - - if (!tensor.is_contiguous()) { - throw std::runtime_error("all inputs and outputs have to be contiguous"); - } - }; - - check_one(input); - - // all inputs must be same size - if (input.numel() != ref_numel) { - throw std::runtime_error( - "all inputs must have the same number of elements"); - } - - if (output) { - check_one(*output); - - // inputs and outputs must be on same device respectively - if (input.get_device() != output->get_device()) { - throw std::runtime_error("input and output must be on the same device"); - } - - if (output->numel() * output_multiplier != ref_numel * input_multiplier) { - throw std::runtime_error( - "output must be of size input_size * size_multiplier"); - } - } -} - -// void check_inputs( -// TensorList inputs, -// TensorList outputs, -// int input_multiplier, -// int output_multiplier) { -// // len(inputs) == len(outputs) -// size_t len = inputs.size(); - -// if (len <= 0) { -// throw std::runtime_error("input sequence can't be empty"); -// } - -// if (len != outputs.size()) { -// std::stringstream err; -// err << "inputs and outputs sequences have to be of the same length, but got input of length " -// << len << " and output of length " << outputs.size(); -// throw std::runtime_error(err.str()); -// } - -// device_set devices; -// int64_t numel = inputs[0].numel(); -// auto dtype = inputs[0].scalar_type(); - -// for (const auto i : c10::irange(len)) { -// auto input = inputs[i]; -// auto output = outputs[i]; - -// check_tensor( -// input, output, input_multiplier, output_multiplier, numel, dtype); - -// auto input_device = input.get_device(); -// // inputs must be on unique devices -// if (devices.test(input_device)) { -// throw std::runtime_error("inputs must be on unique devices"); -// } -// devices.set(input_device); -// } -// } - -// void check_inputs( -// TensorList inputs, -// const at::Tensor& output, -// int root, -// int input_multiplier, -// int output_multiplier) { -// auto len = inputs.size(); - -// if (len <= 0) { -// throw std::runtime_error("input sequence can't be empty"); -// } - -// device_set devices; -// int64_t numel = inputs[0].numel(); -// auto dtype = inputs[0].scalar_type(); - -// for (const auto i : c10::irange(len)) { -// auto input 
= inputs[i]; - -// check_tensor( -// input, -// i == static_cast>(root) -// ? std::optional{output} -// : std::nullopt, -// input_multiplier, -// output_multiplier, -// numel, -// dtype); - -// auto input_device = input.get_device(); -// // inputs must be on unique devices -// if (devices.test(input_device)) { -// throw std::runtime_error("inputs must be on unique devices"); -// } -// devices.set(input_device); -// } -// } - -} // namespace detail - -// std::uint64_t version() { -// #if defined(NCCL_MAJOR) -// constexpr std::uint64_t ver = (((uint64_t)NCCL_MAJOR) << 32) | -// (((uint64_t)NCCL_MINOR) << 16) | ((uint64_t)NCCL_PATCH); -// return ver; -// #elif defined(USE_NCCL) -// // return major version "1" -// return ((uint64_t)1) << 32; -// #else -// return 0; -// #endif -// } - -// ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank) -// { #ifdef USE_XCCL -// using namespace torch::xpu::xccl::detail; -// xcclComm_t comm; -// ncclUniqueId id = comm_id; -// NCCL_CHECK(ncclCommInitRank( -// to_nccl_comm(&comm), nranks, *(to_nccl_unique_id(&id)), rank)); -// return comm; -// #else -// return nullptr; -// #endif -// } - -// namespace { - -// ret_evt = torch::xpu::xccl::all_reduce( -// input, -// output, -// datatype, -// xcclOp.at(opts.reduceOp), -// comm, -// attr, -// stream, -// root); - -// void all_reduce( -// at::Tensor& input, -// at::Tensor& output, -// ccl::datatype datatype, -// ccl::reduction op, -// const stream_list& streams, -// const comm_list& user_comms) { -// #ifdef USE_XCCL -// using namespace torch::cuda::nccl::detail; -// check_inputs(inputs, outputs, 1, 1); -// const auto len = inputs.size(); - -// auto data_type = to_nccl_data_type(inputs[0]); - -// const auto count = inputs[0].numel(); -// auto comms_ref = user_comms.empty() ? get_communicators(inputs) -// : ArrayRef(user_comms); - -// AutoNcclGroup nccl_group_guard; -// at::cuda::OptionalCUDAGuard device_guard; -// for (const auto i : c10::irange(len)) { -// auto device = inputs[i].device().index(); -// device_guard.set_index(device); -// // Default to the current stream -// const auto stream = (streams.empty() || !streams[i]) -// ? 
at::cuda::getCurrentCUDAStream(device).stream() -// : streams[i]->stream(); - -// ncclComm_t comm = comms_ref[i]; -// NCCL_CHECK(ncclAllReduce( -// inputs[i].data_ptr(), -// outputs[i].data_ptr(), -// count, -// data_type, -// to_nccl_red_op(op), -// to_nccl_comm(comm), -// stream)); -// } -// #else -// AT_ERROR("PyTorch built without NCCL support"); -// #endif -// } - -} // namespace torch::xpu::xccl diff --git a/torch/csrc/xpu/xccl.h b/torch/csrc/xpu/xccl.h deleted file mode 100644 index c7a67975bb286c..00000000000000 --- a/torch/csrc/xpu/xccl.h +++ /dev/null @@ -1,77 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace torch::xpu::xccl { - -using xcclComm_t = ccl::communicator; - -using XCCL_KVS = ccl::shared_ptr_class; - -extern XCCL_KVS kvs; - -XCCL_KVS get_kvs(int rank, c10d::Store& store); - -enum class xcclRedOp { Sum = 0, Prod = 1, Max = 2, Min = 3 }; - -enum class xcclDataType { - Int8 = 0, - Char = 0, - Uint8 = 1, - Int32 = 2, - Int = 2, - Uint32 = 3, - Int64 = 4, - Uint64 = 5, - Float16 = 6, - Half = 6, - Float32 = 7, - Float = 7, - Float64 = 8, - Double = 8, - Bfloat16 = 9, - NumTypes = 10 -}; - -namespace detail { - -at::ArrayRef get_communicators(at::TensorList inputs); -void check_inputs( - at::TensorList inputs, - at::TensorList outputs, - int input_multiplier, - int output_multiplier); -void check_inputs( - at::TensorList inputs, - const at::Tensor& output, - int root, - int input_multiplier, - int output_multiplier); - -} // namespace detail - -// using comm_list = std::vector; -// using stream_list = std::vector>; - -std::uint64_t version(); -const char* version_suffix(); - -bool is_available(at::TensorList tensors); - -// comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank); -// void comm_destroy(xcclComm_t comm); - -// void all_reduce( -// const std::vector& inputs, -// std::vector& outputs, -// int32_t op = static_cast(xcclRedOp::Sum), -// const stream_list& streams = {}, -// const comm_list& user_comms = {}); -} // namespace torch::xpu::xccl From fb9746bd8c15ebaeef525926cc7b5e112e51dddd Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 5 Sep 2024 05:56:21 +0000 Subject: [PATCH 15/96] register again --- torch/_C/_distributed_c10d.pyi | 1 + torch/csrc/distributed/c10d/ProcessGroup.cpp | 2 ++ torch/csrc/distributed/c10d/ProcessGroup.hpp | 2 ++ .../distributed/c10d/ProcessGroupXCCL.cpp | 30 ++++++++++++------- torch/csrc/distributed/c10d/init.cpp | 1 + torch/distributed/distributed_c10d.py | 11 +++++-- 6 files changed, 35 insertions(+), 12 deletions(-) diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 0c97185519d28f..53011cde6b178a 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -309,6 +309,7 @@ class ProcessGroup: UNDEFINED = ... GLOO = ... NCCL = ... + XCCL = ... UCC = ... MPI = ... CUSTOM = ... 
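The diffs below register XCCL as a ProcessGroup backend type and move communicator creation onto the host-rendezvous overload of ccl::create_communicator. For orientation, a condensed standalone sketch of the store-backed KVS handshake this series relies on; it is distilled from the patch rather than added by it, the oneCCL and c10d header paths are assumed, and error handling is omitted.

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

#include <oneapi/ccl.hpp>
#include <torch/csrc/distributed/c10d/Store.hpp>

// Rank 0 creates the main KVS and publishes its bootstrap address through the
// c10d store; every other rank reads the address back and attaches to it.
// ccl::init() is expected to have been called once per process before this point.
static ccl::shared_ptr_class<ccl::kvs> rendezvous(int rank, c10d::Store& store) {
  const std::string key = "ccl_kvs";
  if (rank == 0) {
    auto kvs = ccl::create_main_kvs();
    ccl::kvs::address_type addr = kvs->get_address();
    store.set(key, std::vector<uint8_t>(addr.begin(), addr.end()));
    return kvs;
  }
  auto bytes = store.get(key);
  ccl::kvs::address_type addr{};
  std::copy_n(bytes.begin(), ccl::kvs::address_max_size, addr.begin());
  return ccl::create_kvs(addr);
}

// With the shared KVS in hand, each rank builds its communicator:
//   ccl::communicator comm =
//       ccl::create_communicator(worldSize, rank, rendezvous(rank, store));
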
diff --git a/torch/csrc/distributed/c10d/ProcessGroup.cpp b/torch/csrc/distributed/c10d/ProcessGroup.cpp index 75635bc68aed4f..70356b3bf382ce 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.cpp @@ -21,6 +21,8 @@ static ProcessGroup::BackendType strToBackendType(std::string_view backend) { return ProcessGroup::BackendType::GLOO; } else if (backend == "nccl") { return ProcessGroup::BackendType::NCCL; + } else if (backend == "xccl") { + return ProcessGroup::BackendType::XCCL; } else if (backend == "ucc") { return ProcessGroup::BackendType::UCC; } else if (backend == "mpi") { diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 85142caf0ac7c7..73fc2bda701327 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -490,6 +490,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // TODO: HACK for backend name to get sequence number for that backend. if (backendType == ProcessGroup::BackendType::GLOO || backendType == ProcessGroup::BackendType::NCCL || + backendType == ProcessGroup::BackendType::XCCL || backendType == ProcessGroup::BackendType::UCC) { getDefaultBackend()->setSequenceNumberForGroup(); } else { @@ -511,6 +512,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // TODO: HACK for backend name to get sequence number for that backend. if (backendType == ProcessGroup::BackendType::GLOO || backendType == ProcessGroup::BackendType::NCCL || + backendType == ProcessGroup::BackendType::XCCL || backendType == ProcessGroup::BackendType::UCC) { return getDefaultBackend()->getSequenceNumberForGroup(); } else { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ffd566f10f854a..e21be88ef83d16 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -178,9 +178,16 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( c10::impl::VirtualGuardImpl impl(device.type()); c10::Stream stream = impl.getStream(device); sycl::queue& q = c10::xpu::XPUStream(stream).queue(); - auto ctx = ccl::create_context(q.get_context()); - devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + // const sycl::context& sycl_ctx = q.get_context(); + // sycl::context sycl_ctx = q.get_context(); + // ccl::generic_context_type ccl_ctx(sycl_ctx); + // auto ctx = ccl::create_context(ccl_ctx.get()); + + // auto ctx = ccl::create_context(q.get_context()); + // devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); + // XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); + XCCLComm = std::make_shared(ccl::create_communicator(numRanks, rank, kvs)); + { std::lock_guard lock(mutex_); @@ -222,7 +229,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); - auto xcclStream = xcclStreams_.at(key); + auto stream = xcclStreams_.at(key); std::vector inputs{input}; std::vector outputs{output}; @@ -233,14 +240,17 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->outputs_ = std::make_shared>(std::move(outputs)); c10::xpu::XPUCachingAllocator::recordStream( - input.storage().data_ptr(), xcclStream); + input.storage().data_ptr(), stream); - auto ccl_stream = ccl::create_stream(xcclStream.queue()); - fn(input, output, attr, 
comm, ccl_stream); + // auto ccl_stream = ccl::create_stream(stream.queue()); + auto ccl_stream = ccl::create_stream(); + + fn(input, output, attr, *comm, ccl_stream); + // fn(input, output, attr, comm, ccl_stream); - work->xcclEndEvent_->record(xcclStream); + work->xcclEndEvent_->record(stream); - std::vector streams = {xcclStream.unwrap()}; + std::vector streams = {stream.unwrap()}; c10::MultiStreamGuard streamGuard(streams); std::vector devices{device}; work->future_ = c10::make_intrusive( @@ -283,7 +293,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( [&](at::Tensor& input, at::Tensor& output, ccl::allreduce_attr attr, - xcclComm_t comm, + xcclComm_t& comm, ccl::stream& stream) { ccl::event ret_evt; ret_evt = ccl::allreduce( diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index e12e96f9fe882f..5d200bb6eeb9cf 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -2237,6 +2237,7 @@ The hook must have the following signature: .value("UNDEFINED", ::c10d::ProcessGroup::BackendType::UNDEFINED) .value("GLOO", ::c10d::ProcessGroup::BackendType::GLOO) .value("NCCL", ::c10d::ProcessGroup::BackendType::NCCL) + .value("XCCL", ::c10d::ProcessGroup::BackendType::XCCL) .value("UCC", ::c10d::ProcessGroup::BackendType::UCC) .value("MPI", ::c10d::ProcessGroup::BackendType::MPI) .value("CUSTOM", ::c10d::ProcessGroup::BackendType::CUSTOM) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 26cb1cda1db8cb..3f68609905bb5a 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -252,22 +252,24 @@ class Backend(str): NCCL = "nccl" UCC = "ucc" MPI = "mpi" - XCCL = "XCCL" + XCCL = "xccl" _BackendPlugin = namedtuple("_BackendPlugin", ["creator_fn", "extended_api"]) _plugins: Dict[str, _BackendPlugin] = {} - backend_list = [UNDEFINED, GLOO, NCCL, UCC, MPI] + backend_list = [UNDEFINED, GLOO, NCCL, XCCL, UCC, MPI] default_device_backend_map: Dict[str, str] = { "cpu": GLOO, "cuda": NCCL, + "xpu": XCCL, } backend_capability: Dict[str, List[str]] = { GLOO: ["cpu", "cuda"], NCCL: ["cuda"], + XCCL: ["xpu"], UCC: ["cpu", "cuda"], MPI: ["cpu", "cuda"], } @@ -276,6 +278,7 @@ class Backend(str): UNDEFINED: ProcessGroup.BackendType.UNDEFINED, GLOO: ProcessGroup.BackendType.GLOO, NCCL: ProcessGroup.BackendType.NCCL, + XCCL: ProcessGroup.BackendType.XCCL, UCC: ProcessGroup.BackendType.UCC, } @@ -1364,6 +1367,10 @@ def _set_pg_timeout(timeout: timedelta, group: Optional[ProcessGroup] = None) -> backends.add(backend) # type: ignore[arg-type] elif is_gloo_available() and isinstance(backend, ProcessGroupGloo): backends.add(backend) # type: ignore[arg-type] + if torch.device("xpu") in devices and is_xpu_available(): + backend = group._get_backend(torch.device("xpu")) + if isinstance(backend, ProcessGroupXCCL): + backends.add(backend) # type: ignore[arg-type] if len(backends) == 0: warnings.warn("Set timeout is now only supported for either nccl or gloo.") for backend in backends: From 4f73180371f3560acbb4750d9e366c3dc3feea40 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 5 Sep 2024 07:42:13 +0000 Subject: [PATCH 16/96] update --- torch/csrc/distributed/c10d/Ops.cpp | 1 + torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index 03a5e42874594e..4979c57384fcb4 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ 
b/torch/csrc/distributed/c10d/Ops.cpp @@ -516,6 +516,7 @@ REGISTER_C10D_OP(alltoall_) REGISTER_C10D_OP(alltoall_base_) REGISTER_C10D_OP(barrier) +REGISTER_C10D_OP1(allreduce_, XPU) // The following ops are specialized, register them separately TORCH_LIBRARY_IMPL(c10d, CPU, m) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index b43403f52f31ab..9ad20797afcb6d 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -38,7 +38,7 @@ using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; // using namespace torch::xpu::xccl; -class TORCH_XPU_API ProcessGroupXCCL : public Backend { +class TORCH_API ProcessGroupXCCL : public Backend { public: class WorkXCCL : public Work { public: From 7c2f0180b16ecc681836708474ec4c79b09e12fa Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 6 Sep 2024 10:10:50 +0000 Subject: [PATCH 17/96] update --- caffe2/CMakeLists.txt | 11 ++++--- .../distributed/c10d/ProcessGroupXCCL.cpp | 23 ++++++-------- .../distributed/c10d/ProcessGroupXCCL.hpp | 30 ++----------------- 3 files changed, 19 insertions(+), 45 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 55339880a82a37..2119dd19328000 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,10 +1014,6 @@ elseif(USE_CUDA) endif() if(USE_XPU) - # if(USE_XCCL) - # list(APPEND Caffe2_XPU_SRCS - # ${TORCH_SRC_DIR}/csrc/xpu/xccl.cpp) - # endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) @@ -1373,7 +1369,14 @@ if(USE_DISTRIBUTED) endif() endif() if(USE_C10D_XCCL) + # if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # set_source_files_properties( + # ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp + # PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_SYCL") + # target_sources(torch_xpu PRIVATE ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp) + # endif() target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) + target_compile_definitions(torch_xpu PUBLIC CCL_ENABLE_SYCL) endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e21be88ef83d16..cabdb9f61433bc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -3,7 +3,7 @@ #include #include -// #ifdef USE_C10D_XCCL +#ifdef USE_C10D_XCCL #include #include #include @@ -174,20 +174,16 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( numRanks = getSize(); rank = getRank(); - ccl::vector_class> devs_rank; c10::impl::VirtualGuardImpl impl(device.type()); c10::Stream stream = impl.getStream(device); sycl::queue& q = c10::xpu::XPUStream(stream).queue(); - // const sycl::context& sycl_ctx = q.get_context(); - // sycl::context sycl_ctx = q.get_context(); - // ccl::generic_context_type ccl_ctx(sycl_ctx); - // auto ctx = ccl::create_context(ccl_ctx.get()); - // auto ctx = ccl::create_context(q.get_context()); - // devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - // XCCLComm = ccl::create_communicator(numRanks, devs_rank, ctx, kvs); - XCCLComm = std::make_shared(ccl::create_communicator(numRanks, rank, kvs)); + auto ctx = 
ccl::create_context(q.get_context()); + ccl::vector_class> devs_rank; + devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); + auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, kvs); + XCCLComm = std::make_shared(std::move(comms[0])); { std::lock_guard lock(mutex_); @@ -242,11 +238,10 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), stream); - // auto ccl_stream = ccl::create_stream(stream.queue()); - auto ccl_stream = ccl::create_stream(); + auto ccl_stream = ccl::create_stream(stream.queue()); + // auto ccl_stream = ccl::create_stream(); fn(input, output, attr, *comm, ccl_stream); - // fn(input, output, attr, comm, ccl_stream); work->xcclEndEvent_->record(stream); @@ -312,4 +307,4 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( } // namespace c10d -// #endif // USE_C10D_XCCL +#endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 9ad20797afcb6d..f8b9d15bd65484 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -7,12 +7,11 @@ #include #endif -// #ifdef USE_C10D_XCCL +#ifdef USE_C10D_XCCL #include #include #include #include -// #include #include #include #include @@ -36,7 +35,6 @@ namespace c10d { using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; -// using namespace torch::xpu::xccl; class TORCH_API ProcessGroupXCCL : public Backend { public: @@ -47,30 +45,11 @@ class TORCH_API ProcessGroupXCCL : public Backend { int rank, OpType opType, const std::optional>& inputs = std::nullopt); - // WorkXCCL( - // std::vector> outputTensors, - // int rank = -1, - // OpType opType = OpType::UNKNOWN, - // const c10::optional>& inputTensors = - // c10::nullopt) - // : Work(rank, opType), outputTensors_(std::move(outputTensors)) {} WorkXCCL(const WorkXCCL& w); - // ~WorkXCCL() override { - // // Ensures all events are properly handled before destruction - // for (auto& event : events_) { - // event.wait(); - // } - // } ~WorkXCCL() override; bool isCompleted() override { TORCH_CHECK( false, "ProcessGroupXCCL::WorkXCCL::isCompleted not implemented"); - // for (auto& event : events_) { - // if (!event.test()) { - // return false; - // } - // } - // return true; } bool isSuccess() const override { @@ -97,9 +76,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { protected: at::Device device_; std::shared_ptr xcclEndEvent_; - // std::vector events_; - // std::shared_ptr xcclComm_; - // const std::vector> outputTensors_; private: std::shared_ptr> outputs_; c10::intrusive_ptr future_; @@ -110,7 +86,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { const c10::intrusive_ptr& store, int rank, int size) - : store_(store), Backend(rank, size) {} + : Backend(rank, size), store_(store) {} ~ProcessGroupXCCL() override; @@ -168,4 +144,4 @@ class TORCH_API ProcessGroupXCCL : public Backend { } // namespace c10d -// #endif // USE_C10D_XCCL +#endif // USE_C10D_XCCL From 229a80ac530ff1e976b7670826e953b73ac6f5b8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 9 Sep 2024 08:12:16 +0000 Subject: [PATCH 18/96] refine cmake --- build_variables.bzl | 5 ++++- caffe2/CMakeLists.txt | 19 +++++++++---------- cmake/Dependencies.cmake | 2 +- cmake/External/xccl.cmake | 10 +++++++--- torch/CMakeLists.txt | 6 ++---- .../distributed/c10d/ProcessGroupXCCL.hpp | 3 +-- 6 files changed, 24 
insertions(+), 21 deletions(-) diff --git a/build_variables.bzl b/build_variables.bzl index cff70d00320b0e..98b721617b609c 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -700,6 +700,10 @@ libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_s "torch/csrc/cuda/nccl.cpp", ] +libtorch_xpu_distributed_extra_sources = [ + "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", +] + torch_cpp_srcs = [ "torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA "torch/csrc/api/src/data/datasets/mnist.cpp", @@ -785,7 +789,6 @@ libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [ ] libtorch_python_xpu_sources = [ - "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", "torch/csrc/xpu/Event.cpp", "torch/csrc/xpu/Module.cpp", "torch/csrc/xpu/Stream.cpp", diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 2119dd19328000..9f242febb94711 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,9 +1014,14 @@ elseif(USE_CUDA) endif() if(USE_XPU) + if(USE_DISTRIBUTED) + append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS) + # message(FATAL_ERROR ${Caffe2_XPU_SRCS}) + endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) + # ATen XPU implementation set(TORCH_XPU_OPS_DIR ${TORCH_ROOT}/third_party/torch-xpu-ops) set(TORCH_XPU_OPS_REPO_URL https://github.com/intel/torch-xpu-ops.git) @@ -1064,10 +1069,6 @@ if(USE_XPU) message(WARNING "Failed to include ATen XPU implementation target") else() target_link_libraries(torch_xpu PRIVATE torch_xpu_ops) - if(USE_XCCL) - target_link_libraries(torch_xpu PRIVATE __caffe2_xccl) - target_compile_definitions(torch_xpu PRIVATE USE_XCCL) - endif() if(MSVC) # Windows target_link_libraries(torch_xpu PRIVATE @@ -1082,6 +1083,10 @@ if(USE_XPU) include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS}) endif() + if(USE_XCCL) + target_link_libraries(torch_xpu PRIVATE torch::xccl) + target_compile_definitions(torch_xpu PRIVATE USE_XCCL) + endif() endif() if(NOT MSVC AND USE_XNNPACK) @@ -1369,12 +1374,6 @@ if(USE_DISTRIBUTED) endif() endif() if(USE_C10D_XCCL) - # if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - # set_source_files_properties( - # ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp - # PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_SYCL") - # target_sources(torch_xpu PRIVATE ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp) - # endif() target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC CCL_ENABLE_SYCL) endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index cb204eada5f689..8abea841fcf61c 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1162,7 +1162,7 @@ if(USE_XCCL) caffe2_update_option(USE_XCCL OFF) else() include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake) - list(APPEND Caffe2_XPU_DEPENDENCY_LIBS __caffe2_xccl) + list(APPEND Caffe2_XPU_DEPENDENCY_LIBS torch::xccl) endif() endif() diff --git a/cmake/External/xccl.cmake b/cmake/External/xccl.cmake index d1e8f33881b80b..56205b381b1324 100644 --- a/cmake/External/xccl.cmake +++ b/cmake/External/xccl.cmake @@ -5,9 +5,13 @@ if(NOT __XCCL_INCLUDED) # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake. 
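  # Note on the change just below, as read from this diff: the plain __caffe2_xccl
  # INTERFACE helper becomes a namespaced IMPORTED target, torch::xccl, matching the
  # torch:: naming already used for other external dependencies such as torch::xpurt.
  # The usage requirements stay the same (INTERFACE_INCLUDE_DIRECTORIES carries
  # XCCL_INCLUDE_DIR, INTERFACE_LINK_LIBRARIES carries XCCL_LIBRARY), so consumers
  # such as torch_xpu simply link torch::xccl and inherit both.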
find_package(XCCL REQUIRED) if(XCCL_FOUND) - add_library(__caffe2_xccl INTERFACE) - target_link_libraries(__caffe2_xccl INTERFACE ${XCCL_LIBRARY}) - target_include_directories(__caffe2_xccl INTERFACE ${XCCL_INCLUDE_DIR}) + add_library(torch::xccl INTERFACE IMPORTED) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${XCCL_INCLUDE_DIR}) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES + ${XCCL_LIBRARY}) endif() endif() endif() diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index f50ae4e02c3386..5bca5ac72452ec 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -163,11 +163,9 @@ endif() if(USE_XPU) include(${TORCH_ROOT}/cmake/public/xpu.cmake) append_filelist("libtorch_python_xpu_sources" TORCH_PYTHON_SRCS) - + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_XPU) - # if(USE_XCCL) list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::xpurt) - # endif() endif() if(USE_CUDNN OR USE_ROCM) @@ -286,7 +284,7 @@ if(USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) endif() if(USE_XCCL) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_xccl) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::xccl) endif() # Same for MPI. if(USE_MPI) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index f8b9d15bd65484..829e07816589fc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -10,8 +10,7 @@ #ifdef USE_C10D_XCCL #include #include -#include -#include +#include #include #include #include From 746007b5fc6edb8ad98f6e7e1811ffe7623240f5 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 9 Sep 2024 09:46:48 +0000 Subject: [PATCH 19/96] register all dist op and enable getXcclReduceOp --- caffe2/CMakeLists.txt | 8 +- torch/csrc/distributed/c10d/Ops.cpp | 20 ++- .../distributed/c10d/ProcessGroupXCCL.cpp | 50 +++++-- .../distributed/c10d/ProcessGroupXCCL.hpp | 141 +++++++++++++++++- 4 files changed, 198 insertions(+), 21 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 9f242febb94711..ae183e32d17e7d 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,9 +1014,8 @@ elseif(USE_CUDA) endif() if(USE_XPU) - if(USE_DISTRIBUTED) + if(USE_XCCL) append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS) - # message(FATAL_ERROR ${Caffe2_XPU_SRCS}) endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake @@ -1375,7 +1374,10 @@ if(USE_DISTRIBUTED) endif() if(USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) - target_compile_definitions(torch_xpu PUBLIC CCL_ENABLE_SYCL) + # target_compile_definitions(torch_xpu PUBLIC CCL_ENABLE_SYCL) + set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp + PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_SYCL") endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index 4979c57384fcb4..48d2b3ed1bf69a 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -79,6 +79,7 @@ namespace { } IMPL_SEND(CPU) +IMPL_SEND(XPU) IMPL_SEND(CUDA) IMPL_SEND(PrivateUse1) @@ -94,6 +95,7 @@ IMPL_SEND(PrivateUse1) } IMPL_RECV(CPU) +IMPL_RECV(XPU) IMPL_RECV(CUDA) IMPL_RECV(PrivateUse1) @@ -108,6 +110,7 @@ IMPL_RECV(PrivateUse1) } 
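Note on the reduction mapping introduced later in this patch (the getXcclReduceOp helper in ProcessGroupXCCL.cpp): bool tensors are stored as uint8, so summing them across many ranks could overflow, and ReduceOp.SUM is therefore remapped to max, which over values in {0, 1} is exactly a logical OR. A small sketch of the observable behaviour, assuming the xccl process group from the earlier sketch:

    flag = torch.tensor([dist.get_rank() == 0], dtype=torch.bool, device="xpu")
    dist.all_reduce(flag, op=dist.ReduceOp.SUM)   # runs as max over {0, 1}, i.e. a logical OR
    # flag ends up True on every rank, with no uint8 overflow at any world size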
IMPL_RECV_ANY_SOURCE(CPU) +IMPL_RECV_ANY_SOURCE(XPU) IMPL_RECV_ANY_SOURCE(CUDA) IMPL_RECV_ANY_SOURCE(PrivateUse1) @@ -131,6 +134,7 @@ IMPL_RECV_ANY_SOURCE(PrivateUse1) } IMPL_REDUCE(CPU) +IMPL_REDUCE(XPU) IMPL_REDUCE(CUDA) IMPL_REDUCE(PrivateUse1) @@ -156,6 +160,7 @@ IMPL_REDUCE(PrivateUse1) } IMPL_BROADCAST(CPU) +IMPL_BROADCAST(XPU) IMPL_BROADCAST(CUDA) IMPL_BROADCAST(PrivateUse1) @@ -199,6 +204,7 @@ IMPL_ALLREDUCE(PrivateUse1) } IMPL_ALLREDUCE_COALESCED(CPU) +IMPL_ALLREDUCE_COALESCED(XPU) IMPL_ALLREDUCE_COALESCED(CUDA) IMPL_ALLREDUCE_COALESCED(PrivateUse1) @@ -223,6 +229,7 @@ IMPL_ALLREDUCE_COALESCED(PrivateUse1) // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast) IMPL_ALLGATHER(CPU) +IMPL_ALLGATHER(XPU) IMPL_ALLGATHER(CUDA) IMPL_ALLGATHER(PrivateUse1) @@ -243,6 +250,7 @@ IMPL_ALLGATHER(PrivateUse1) } IMPL__ALLGATHER_BASE(CPU) +IMPL__ALLGATHER_BASE(XPU) IMPL__ALLGATHER_BASE(CUDA) IMPL__ALLGATHER_BASE(PrivateUse1) @@ -259,6 +267,7 @@ IMPL__ALLGATHER_BASE(PrivateUse1) } IMPL_ALLGATHER_COALESCED(CPU) +IMPL_ALLGATHER_COALESCED(XPU) IMPL_ALLGATHER_COALESCED(CUDA) IMPL_ALLGATHER_COALESCED(PrivateUse1) @@ -274,6 +283,7 @@ IMPL_ALLGATHER_COALESCED(PrivateUse1) } IMPL_ALLGATHER_INTO_TENSOR_COALESCED(CPU) +IMPL_ALLGATHER_INTO_TENSOR_COALESCED(XPU) IMPL_ALLGATHER_INTO_TENSOR_COALESCED(CUDA) IMPL_ALLGATHER_INTO_TENSOR_COALESCED(PrivateUse1) @@ -297,6 +307,7 @@ IMPL_ALLGATHER_INTO_TENSOR_COALESCED(PrivateUse1) } IMPL_REDUCE_SCATTER(CPU) +IMPL_REDUCE_SCATTER(XPU) IMPL_REDUCE_SCATTER(CUDA) IMPL_REDUCE_SCATTER(PrivateUse1) @@ -321,6 +332,7 @@ IMPL_REDUCE_SCATTER(PrivateUse1) } IMPL__REDUCE_SCATTER_BASE(CPU) +IMPL__REDUCE_SCATTER_BASE(XPU) IMPL__REDUCE_SCATTER_BASE(CUDA) IMPL__REDUCE_SCATTER_BASE(PrivateUse1) @@ -342,6 +354,7 @@ IMPL__REDUCE_SCATTER_BASE(PrivateUse1) } IMPL_REDUCE_SCATTER_TENSOR_COALESCED(CPU) +IMPL_REDUCE_SCATTER_TENSOR_COALESCED(XPU) IMPL_REDUCE_SCATTER_TENSOR_COALESCED(CUDA) IMPL_REDUCE_SCATTER_TENSOR_COALESCED(PrivateUse1) @@ -361,6 +374,7 @@ IMPL_REDUCE_SCATTER_TENSOR_COALESCED(PrivateUse1) } IMPL_GATHER(CPU) +IMPL_GATHER(XPU) IMPL_GATHER(CUDA) IMPL_GATHER(PrivateUse1) @@ -383,6 +397,7 @@ IMPL_GATHER(PrivateUse1) } IMPL_SCATTER(CPU) +IMPL_SCATTER(XPU) IMPL_SCATTER(CUDA) IMPL_SCATTER(PrivateUse1) @@ -404,6 +419,7 @@ IMPL_SCATTER(PrivateUse1) } IMPL_ALLTOALL(CPU) +IMPL_ALLTOALL(XPU) IMPL_ALLTOALL(CUDA) IMPL_ALLTOALL(PrivateUse1) @@ -425,6 +441,7 @@ IMPL_ALLTOALL(PrivateUse1) } IMPL_ALLTOALL_BASE(CPU) +IMPL_ALLTOALL_BASE(XPU) IMPL_ALLTOALL_BASE(CUDA) IMPL_ALLTOALL_BASE(PrivateUse1) @@ -440,6 +457,7 @@ IMPL_ALLTOALL_BASE(PrivateUse1) } IMPL_BARRIER(CPU) +IMPL_BARRIER(XPU) IMPL_BARRIER(CUDA) IMPL_BARRIER(PrivateUse1) // NOLINTEND(cppcoreguidelines-pro-type-const-cast) @@ -492,6 +510,7 @@ namespace { #define REGISTER_C10D_OP(FUNC) \ REGISTER_C10D_OP1(FUNC, CPU) \ REGISTER_C10D_OP1(FUNC, CUDA) \ + REGISTER_C10D_OP1(FUNC, XPU) \ REGISTER_C10D_OP1(FUNC, PrivateUse1) // Now we start to register ops with the three device keys @@ -516,7 +535,6 @@ REGISTER_C10D_OP(alltoall_) REGISTER_C10D_OP(alltoall_base_) REGISTER_C10D_OP(barrier) -REGISTER_C10D_OP1(allreduce_, XPU) // The following ops are specialized, register them separately TORCH_LIBRARY_IMPL(c10d, CPU, m) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index cabdb9f61433bc..f6ef0ae0a6ebee 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -94,6 +94,39 @@ ccl::datatype getXcclDataType(at::ScalarType 
type) { type); return it->second; } + +ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { + try { + if (input.scalar_type() == at::kBool) { + if (reduceOp == ReduceOp::SUM) { + // For bool tensors, map sum to max, which both represent a bitwise or. + // This is to prevent overflow issues with sum, since we use uint8 to + // represent a bool (see xcclDatatypes mapping align with cuda). + return ccl::reduction::max; + } + } + return xcclOps.at(reduceOp); + } catch (const std::out_of_range&) { + switch (reduceOp) { + case ReduceOp::AVG: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp AVG with XCCL"); + break; + case ReduceOp::BAND: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BAND with XCCL"); + break; + case ReduceOp::BOR: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BOR with XCCL"); + break; + case ReduceOp::BXOR: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BXOR with XCCL"); + break; + default: + C10_THROW_ERROR(ValueError, "Unhandled ReduceOp"); + break; + } + } +} + } // namespace static std::mutex xcclCommDevIdxMapMutex; @@ -110,7 +143,8 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( } ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) - : Work(w.rank_, w.opType_), device_(w.device_), + : Work(w.rank_, w.opType_), + device_(w.device_), xcclEndEvent_(w.xcclEndEvent_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -142,10 +176,7 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( const std::vector& inputs, const std::vector& outputs) { auto r = c10::make_intrusive( - device, - rank, - opType, - std::optional>(inputs)); + device, rank, opType, std::optional>(inputs)); return r; } @@ -237,9 +268,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( std::make_shared>(std::move(outputs)); c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), stream); - + auto ccl_stream = ccl::create_stream(stream.queue()); - // auto ccl_stream = ccl::create_stream(); fn(input, output, attr, *comm, ccl_stream); @@ -290,13 +320,15 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( ccl::allreduce_attr attr, xcclComm_t& comm, ccl::stream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); ccl::event ret_evt; ret_evt = ccl::allreduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), - getXcclDataType(input.scalar_type()), - xcclOps.at(opts.reduceOp), + xcclDataType, + xcclReduceOp, comm, stream, attr); diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 829e07816589fc..bd74ca745ed644 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -8,9 +8,9 @@ #endif #ifdef USE_C10D_XCCL +#include #include #include -#include #include #include #include @@ -75,6 +75,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { protected: at::Device device_; std::shared_ptr xcclEndEvent_; + private: std::shared_ptr> outputs_; c10::intrusive_ptr future_; @@ -89,6 +90,11 @@ class TORCH_API ProcessGroupXCCL : public Backend { ~ProcessGroupXCCL() override; + static c10::intrusive_ptr createProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank = -1, + int size = -1); + const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); } @@ -124,13 +130,133 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - // 
c10::intrusive_ptr barrier( - // const BarrierOptions& opts = BarrierOptions()) override; + c10::intrusive_ptr allreduce_coalesced( + std::vector& tensors, + const AllreduceCoalescedOptions& opts = + AllreduceCoalescedOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_coalesced not implemented"); + } - static c10::intrusive_ptr createProcessGroupXCCL( - const c10::intrusive_ptr& store, - int rank = -1, - int size = -1); + c10::intrusive_ptr reduce( + std::vector& tensors, + const ReduceOptions& opts = ReduceOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::reduce not implemented"); + } + + c10::intrusive_ptr broadcast( + std::vector& tensors, + const BroadcastOptions& opts = BroadcastOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::broadcast not implemented"); + } + + c10::intrusive_ptr allreduce_sparse( + std::vector& tensors, + const AllreduceOptions& opts = AllreduceOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_sparse not implemented"); + } + + c10::intrusive_ptr allgather( + std::vector>& outputTensors, + std::vector& inputTensors, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allgather not implemented"); + } + + c10::intrusive_ptr _allgather_base( + at::Tensor& outputbuffer, + at::Tensor& inputbuffer, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::_allgather_base not implemented"); + } + + c10::intrusive_ptr allgather_coalesced( + std::vector>& outputTensorLists, + std::vector& inputTensors, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allgather_coalesced not implemented"); + } + + c10::intrusive_ptr allgather_into_tensor_coalesced( + std::vector& outputs, + std::vector& inputs, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK( + false, + "ProcessGroupXCCL::allgather_into_tensor_coalesced not implemented"); + } + + c10::intrusive_ptr reduce_scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::reduce_scatter not implemented"); + } + + c10::intrusive_ptr _reduce_scatter_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK( + false, "ProcessGroupXCCL::_reduce_scatter_base not implemented"); + } + + c10::intrusive_ptr reduce_scatter_tensor_coalesced( + std::vector& outputs, + std::vector& inputs, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK( + false, + "ProcessGroupXCCL::reduce_scatter_tensor_coalesced not implemented"); + } + + c10::intrusive_ptr barrier( + const BarrierOptions& opts = BarrierOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::barrier not implemented"); + } + + c10::intrusive_ptr alltoall_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + std::vector& outputSplitSizes, + std::vector& inputSplitSizes, + const AllToAllOptions& opts = AllToAllOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::alltoall_base not implemented"); + } + + c10::intrusive_ptr alltoall( + std::vector& outputTensors, + std::vector& inputTensors, + const AllToAllOptions& opts = AllToAllOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::alltoall not implemented"); + } + + c10::intrusive_ptr send( + std::vector& tensors, + int 
dstRank, + int tag) override { + TORCH_CHECK(false, "ProcessGroupXCCL::send not implemented"); + } + + c10::intrusive_ptr recv( + std::vector& tensors, + int srcRank, + int tag) override { + TORCH_CHECK(false, "ProcessGroupXCCL::recv not implemented"); + } + + c10::intrusive_ptr gather( + std::vector>& outputTensors, + std::vector& inputTensors, + const GatherOptions& opts = GatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::gather not implemented"); + } + + c10::intrusive_ptr scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ScatterOptions& opts = ScatterOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::scatter not implemented"); + } public: std::unordered_map xcclStreams_; @@ -140,7 +266,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr store_; std::mutex mutex_; }; - } // namespace c10d #endif // USE_C10D_XCCL From 5195f523342e7176ef5912ce17bd73598c13b8d6 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 9 Sep 2024 09:50:10 +0000 Subject: [PATCH 20/96] update --- caffe2/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index ae183e32d17e7d..a51b2938c0ff73 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1374,7 +1374,6 @@ if(USE_DISTRIBUTED) endif() if(USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) - # target_compile_definitions(torch_xpu PUBLIC CCL_ENABLE_SYCL) set_source_files_properties( ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_SYCL") From 2eb044620774692913cc083e58b53717dc22b004 Mon Sep 17 00:00:00 2001 From: "Han, Chao1" Date: Tue, 10 Sep 2024 17:19:54 +0800 Subject: [PATCH 21/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index f6ef0ae0a6ebee..4e0b7db3592093 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -309,9 +309,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( tensors.size() == 1, "Expecting one tensor only but got multiple"); auto tensor = tensors.back(); check_xpu_single_tensor(tensor); - if (opts.reduceOp == ReduceOp::AVG) { - TORCH_CHECK(false, "Cannot use ReduceOp AVG with XPU") - } return collective( tensor, tensor, From 0f6176270833f9f5866c491e6ed58f57822724a5 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 10 Sep 2024 02:36:59 +0000 Subject: [PATCH 22/96] update flag --- caffe2/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index a51b2938c0ff73..d44a8da210462f 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1376,7 +1376,7 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) set_source_files_properties( ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp - PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_SYCL") + PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_ZE;CCL_ENABLE_SYCL") endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") From 227e98decb633a108faaf50ab34641d446aa7774 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 10 Sep 2024 02:59:13 +0000 Subject: [PATCH 23/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index bd74ca745ed644..2f16df6450fe62 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -149,12 +149,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::broadcast not implemented"); } - c10::intrusive_ptr allreduce_sparse( - std::vector& tensors, - const AllreduceOptions& opts = AllreduceOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_sparse not implemented"); - } - c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, From df81919f64c2b2bf42ca732b51a372679035fc20 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 11 Sep 2024 04:43:01 +0000 Subject: [PATCH 24/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 4e0b7db3592093..790c02675b03bf 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -265,7 +265,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work = initWork(device, rank_, opType); work->outputs_ = - std::make_shared>(std::move(outputs)); + std::make_shared>(outputs); c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), stream); From b0c05928c607cf80b3883c935c57721abca935e5 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 11 Sep 2024 04:49:00 +0000 Subject: [PATCH 25/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 790c02675b03bf..e690cc1f57aa43 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -264,8 +264,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work = initWork(device, rank_, opType); - work->outputs_ = - std::make_shared>(outputs); + work->outputs_ = std::make_shared>(outputs); c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), stream); From 366d20849aeea8197ffc94cdb4851b054d3c2c07 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 11 Sep 2024 06:23:46 +0000 Subject: [PATCH 26/96] rm redundance code --- torch/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 5bca5ac72452ec..af678d11e7f325 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -165,7 +165,6 @@ if(USE_XPU) append_filelist("libtorch_python_xpu_sources" TORCH_PYTHON_SRCS) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_XPU) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::xpurt) endif() if(USE_CUDNN OR USE_ROCM) @@ -420,8 +419,6 @@ endif() target_compile_definitions(torch_python PRIVATE "-DTHP_BUILD_MAIN_LIB") target_link_libraries(torch_python PRIVATE ${TORCH_LIB} ${TORCH_PYTHON_LINK_LIBRARIES}) -target_link_libraries(torch_python PRIVATE torch::xpurt) -target_link_libraries(torch_python PRIVATE c10_xpu) target_compile_definitions(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS}) From 3530e43f74742f60ca2f121be920916e1e6a4e14 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 11 Sep 2024 09:57:00 +0000 Subject: [PATCH 27/96] enable timeout --- .../distributed/c10d/ProcessGroupXCCL.cpp | 80 +++++++++++++++++-- 
.../distributed/c10d/ProcessGroupXCCL.hpp | 42 +++++++--- 2 files changed, 102 insertions(+), 20 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e690cc1f57aa43..421336b4872a5a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -131,13 +131,16 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; +constexpr int64_t kSynchronizeBusyWaitMillis = 10; ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, OpType opType, const std::optional>& inputs) - : Work(rank, opType, "profilingTitle", inputs), device_(device) { + : Work(rank, opType, "profilingTitle", inputs), + device_(device), + workStartTime_(std::chrono::steady_clock::now()) { unsigned char enable_timing = 0; xcclEndEvent_ = std::make_shared(enable_timing); } @@ -145,26 +148,85 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) : Work(w.rank_, w.opType_), device_(w.device_), + blockingWait_(w.blockingWait_), + workStartTime_(w.workStartTime_), xcclEndEvent_(w.xcclEndEvent_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; -bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { - synchronize(); +bool ProcessGroupXCCL::WorkXCCL::checkTimeout( + std::optional timeout) { + auto currentTimepoint = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast( + currentTimepoint - workStartTime_); + std::chrono::milliseconds opTimeout = std::chrono::milliseconds(60000); + + auto workTimeout = timeout ? *timeout : opTimeout; + + if (timeElapsed < workTimeout) + return false; + return true; +} + +bool ProcessGroupXCCL::WorkXCCL::isCompleted() { + for (auto& ret : rets) { + bool flag; + try { + TORCH_CHECK(flag = ret.test()); + } catch (...) { + finishAWorkXCCLError(std::current_exception()); + return true; + } + if (!flag) { + return false; + } + } return true; } void ProcessGroupXCCL::WorkXCCL::synchronize() { + synchronizeInternal(kNoTimeout); +} + +void ProcessGroupXCCL::WorkXCCL::synchronizeStream() { auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); // Block the current stream on the XCCL stream xcclEndEvent_->block(currentStream); } -c10::intrusive_ptr ProcessGroupXCCL::createProcessGroupXCCL( +void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( + std::chrono::milliseconds timeout) { + synchronizeStream(); + + if (blockingWait_) { + while (!isCompleted()) { + bool timedOut = checkTimeout( + timeout == kNoTimeout ? 
std::nullopt : std::make_optional(timeout)); + if (timedOut) { + break; + } + std::this_thread::sleep_for( + std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); + } + } +} + +bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { + synchronizeInternal(timeout); + for (auto& event : rets) { + event.wait(); + } + rets.clear(); + return true; +} + +ProcessGroupXCCL::ProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, - int size) { - return c10::make_intrusive(store, rank, size); + int size) + : Backend(rank, size), store_(store) { + blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); + init(); } ProcessGroupXCCL::~ProcessGroupXCCL() = default; @@ -264,13 +326,14 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work = initWork(device, rank_, opType); - work->outputs_ = std::make_shared>(outputs); + work->outputs_ = + std::make_shared>(std::move(outputs)); c10::xpu::XPUCachingAllocator::recordStream( input.storage().data_ptr(), stream); auto ccl_stream = ccl::create_stream(stream.queue()); - fn(input, output, attr, *comm, ccl_stream); + work->addResult(fn(input, output, attr, *comm, ccl_stream)); work->xcclEndEvent_->record(stream); @@ -280,6 +343,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); + work->blockingWait_ = blockingWait_; return work; } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 2f16df6450fe62..7bb3a14d6e1446 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -31,6 +31,10 @@ #include namespace c10d { +static std::vector TORCH_XCCL_BLOCKING_WAIT = { + "TORCH_XCCL_BLOCKING_WAIT", + "XCCL_BLOCKING_WAIT"}; + using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; @@ -46,11 +50,13 @@ class TORCH_API ProcessGroupXCCL : public Backend { const std::optional>& inputs = std::nullopt); WorkXCCL(const WorkXCCL& w); ~WorkXCCL() override; - bool isCompleted() override { - TORCH_CHECK( - false, "ProcessGroupXCCL::WorkXCCL::isCompleted not implemented"); + + void addResult(ccl::event&& result) { + rets.push_back(std::move(result)); } + bool isCompleted() override; + bool isSuccess() const override { TORCH_CHECK( false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); @@ -62,6 +68,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { void synchronize() override; + void synchronizeStream(); + bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; c10::intrusive_ptr getFuture() override { @@ -72,29 +80,38 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); } + bool checkTimeout( + std::optional timeout = std::nullopt); + protected: at::Device device_; std::shared_ptr xcclEndEvent_; + bool blockingWait_ = false; + std::chrono::time_point workStartTime_; + std::vector rets; private: + void finishAWorkXCCLError(std::exception_ptr eptr) { + future_->setError(eptr); + finish(eptr); + } + void synchronizeInternal(std::chrono::milliseconds timeout); std::shared_ptr> outputs_; c10::intrusive_ptr future_; friend class ProcessGroupXCCL; }; - explicit ProcessGroupXCCL( + ProcessGroupXCCL(const c10::intrusive_ptr& store, int rank, int size); + + C10_DEPRECATED ProcessGroupXCCL( const 
c10::intrusive_ptr& store, int rank, - int size) - : Backend(rank, size), store_(store) {} + int size, + const std::string& groupName) + : ProcessGroupXCCL(store, rank, size) {} ~ProcessGroupXCCL() override; - static c10::intrusive_ptr createProcessGroupXCCL( - const c10::intrusive_ptr& store, - int rank = -1, - int size = -1); - const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); } @@ -252,13 +269,14 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::scatter not implemented"); } - public: + protected: std::unordered_map xcclStreams_; std::unordered_map> inInitializationCommMap_; std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; + bool blockingWait_ = false; }; } // namespace c10d From 485ae8b9015bec2afee97b1653d9362e440fd11c Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 01:02:50 +0000 Subject: [PATCH 28/96] add oneccl env --- .../distributed/c10d/ProcessGroupXCCL.cpp | 12 +++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 26 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 421336b4872a5a..e008669ca8ad79 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -227,6 +227,18 @@ ProcessGroupXCCL::ProcessGroupXCCL( : Backend(rank, size), store_(store) { blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); + + { + int local_rank = getXCCLEnvVar("LOCAL_RANK"); + int local_world_size = getXCCLEnvVar("LOCAL_WORLD_SIZE"); + if (local_rank == -1 || local_world_size == -1) { + local_rank = rank; + local_world_size = size; + } + setXCCLEnvVar("CCL_PROCESS_LAUNCHER", "none"); + setXCCLEnvVar("CCL_LOCAL_RANK", local_rank); + setXCCLEnvVar("CCL_LOCAL_SIZE", local_world_size); + } } ProcessGroupXCCL::~ProcessGroupXCCL() = default; diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 7bb3a14d6e1446..eca66a33922d55 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -31,6 +31,32 @@ #include namespace c10d { +namespace { +int getXCCLEnvVar(std::string envVarName) { + char* stringValue = std::getenv(envVarName.c_str()); + if (stringValue != nullptr) { + try { + int val = std::stoi(stringValue); + return val; + } catch (std::exception& e) { + TORCH_CHECK( + false, + "Invalid value for environment variable: " + std::string(envVarName)); + } + } else { + return -1; + } +} + +void setXCCLEnvVar(std::string envVarName, int val) { + setenv(envVarName.c_str(), std::to_string(val).c_str(), val); +} + +void setXCCLEnvVar(std::string envVarName, std::string val) { + setenv(envVarName.c_str(), val.c_str(), 1); +} +} // namespace + static std::vector TORCH_XCCL_BLOCKING_WAIT = { "TORCH_XCCL_BLOCKING_WAIT", "XCCL_BLOCKING_WAIT"}; From 0cfd224d34a357c0586ad0da4c0e19def4e36d47 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 01:06:22 +0000 Subject: [PATCH 29/96] update --- torch/CMakeLists.txt | 2 +- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index af678d11e7f325..9a91b26d54cfb4 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -163,7 +163,7 @@ endif() if(USE_XPU) include(${TORCH_ROOT}/cmake/public/xpu.cmake) 
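Note on the oneCCL environment handling added in the previous patch (add oneccl env): at construction time the process group disables oneCCL's own process launcher and forwards the per-node rank and size that launchers such as torchrun export, falling back to the group's global rank and size when LOCAL_RANK / LOCAL_WORLD_SIZE are absent. Roughly the following, written here as a Python sketch of that C++ logic (rank and world_size stand in for the group's values); relatedly, setting TORCH_XCCL_BLOCKING_WAIT (or XCCL_BLOCKING_WAIT) opts into the blocking wait path from the enable-timeout patch:

    import os

    local_rank = int(os.environ.get("LOCAL_RANK", rank))
    local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", world_size))
    os.environ["CCL_PROCESS_LAUNCHER"] = "none"
    os.environ["CCL_LOCAL_RANK"] = str(local_rank)
    os.environ["CCL_LOCAL_SIZE"] = str(local_world_size)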
append_filelist("libtorch_python_xpu_sources" TORCH_PYTHON_SRCS) - + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_XPU) endif() diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e008669ca8ad79..e550225e19cb79 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -260,7 +260,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( if (deviceKey.empty()) { C10_THROW_ERROR( DistBackendError, - "Not able to create/get the CCL Communicator since " + "Not able to create/get the XCCL Communicator since " "the devices are empty "); } From b99fd8cf26fd0dae68826be199d09f39ac2af01d Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 03:20:47 +0000 Subject: [PATCH 30/96] Add simple test --- test/distributed/test_c10d_xccl.py | 221 ++++++++++++++++++ test/run_test.py | 1 + torch/testing/_internal/common_distributed.py | 6 + 3 files changed, 228 insertions(+) create mode 100644 test/distributed/test_c10d_xccl.py diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py new file mode 100644 index 00000000000000..33a2f196c3b5d1 --- /dev/null +++ b/test/distributed/test_c10d_xccl.py @@ -0,0 +1,221 @@ +# Owner(s): ["oncall: distributed"] + +import copy +import logging +import math +import operator +import os +import random +import sys +import tempfile +from functools import reduce +from unittest import mock, SkipTest + +import torch +import torch.distributed as c10d + + +if not c10d.is_available() or not c10d.is_xccl_available(): + print("c10d XCCL not available, skipping tests", file=sys.stderr) + sys.exit(0) + +import test_c10d_common + +import torch.distributed as dist +import torch.nn.functional as F +import torch.testing._internal.common_utils as common +from torch import nn +from torch.nn.parallel import DistributedDataParallel +from torch.testing._internal.common_distributed import ( + MultiProcessTestCase, + requires_xccl, +) +from torch.testing._internal.common_utils import ( + retry_on_connect_failures, + run_tests, + TestCase, +) + +def simple_reduce_tests(rank, world_size): + tests = [ + ( + c10d.ReduceOp.SUM, + torch.tensor([rank + 1.0]), + torch.tensor([float(world_size * (world_size + 1) / 2)]), + ), + ( + c10d.ReduceOp.PRODUCT, + torch.tensor([rank + 1.0]), + torch.tensor([float(math.factorial(world_size))]), + ), + ( + c10d.ReduceOp.MIN, + torch.tensor([rank + 1.0]), + torch.tensor([1.0]), + ), + ( + c10d.ReduceOp.MAX, + torch.tensor([rank + 1.0]), + torch.tensor([world_size]), + ), + ] + + return tests + + +class RendezvousEnvTest(TestCase): + @retry_on_connect_failures + @requires_xccl() + def test_common_errors(self): + vars = { + "WORLD_SIZE": "1", + "RANK": "0", + "MASTER_ADDR": "127.0.0.1", + "MASTER_PORT": str(common.find_free_port()), + } + + class Env: + def __init__(self, vars): + self.env_patcher = mock.patch.dict(os.environ, vars, clear=True) + + def __enter__(self): + self.env_patcher.start() + + def __exit__(self, type, value, traceback): + self.env_patcher.stop() + + def without(d, key): + d = d.copy() + d.pop(key) + return d + + def withouts(d, keys): + d = d.copy() + for key in keys: + d.pop(key) + return d + + with Env(without(vars, "WORLD_SIZE")): + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + with self.assertRaisesRegex(ValueError, "WORLD_SIZE expected"): + gen = c10d.rendezvous("env://") + next(gen) + c10d.init_process_group(backend="xccl", world_size=1) + self.assertEqual(c10d.get_rank(), 0) 
+ self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(without(vars, "RANK")): + self.assertEqual(None, os.environ.get("RANK")) + with self.assertRaisesRegex(ValueError, "RANK expected"): + gen = c10d.rendezvous("env://") + next(gen) + c10d.init_process_group(backend="xccl", rank=0) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): + self.assertEqual(None, os.environ.get("RANK")) + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + c10d.init_process_group(backend="xccl", rank=0, world_size=1) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(vars): + c10d.init_process_group(backend="xccl") + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(without(vars, "MASTER_ADDR")): + self.assertEqual(None, os.environ.get("MASTER_ADDR")) + with self.assertRaisesRegex(ValueError, "MASTER_ADDR expected"): + gen = c10d.rendezvous("env://") + next(gen) + + with Env(without(vars, "MASTER_PORT")): + self.assertEqual(None, os.environ.get("MASTER_PORT")) + with self.assertRaisesRegex(ValueError, "MASTER_PORT expected"): + gen = c10d.rendezvous("env://") + next(gen) + + with Env(without(vars, "WORLD_SIZE")): + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + gen = c10d.rendezvous(f"env://?world_size={1}") + _, _, size = next(gen) + self.assertEqual(size, 1) + + with Env(without(vars, "RANK")): + self.assertEqual(None, os.environ.get("RANK")) + gen = c10d.rendezvous(f"env://?rank={0}") + _, rank, _ = next(gen) + self.assertEqual(rank, 0) + + with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): + self.assertEqual(None, os.environ.get("RANK")) + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + gen = c10d.rendezvous(f"env://?rank={0}&world_size={1}") + _, rank, size = next(gen) + self.assertEqual(rank, 0) + self.assertEqual(size, 1) + +class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): + @requires_xccl() + @retry_on_connect_failures + def test_default_store_timeout_nccl(self): + self._test_default_store_timeout("xccl") + +class ProcessGroupXCCLTest(MultiProcessTestCase): + def _create_process_group_xccl(self): + store = c10d.FileStore(self.file_name, self.world_size) + return c10d.ProcessGroupXCCL(store, self.rank, self.world_size) + + def setUp(self): + super().setUp() + self._spawn_processes() + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + def _test_allreduce_basics(self, fn): + pg = self._create_process_group_xccl() + device = torch.device("xpu:" + str(self.rank)) + # Single input tests + tests = simple_reduce_tests(self.rank, self.world_size) + for op, input, expected in tests: + opts = c10d.AllreduceOptions() + opts.reduceOp = op + tensor = fn(input.to(device)) + fut = pg.allreduce([tensor], opts).get_future() + fut.wait() + result = fut.value() + self.assertEqual(expected, result[0], exact_dtype=False) + + x = fn(torch.tensor([self.rank + 1.0], device = device)) + fut = pg.allreduce(x).get_future() + fut.wait() + result = fut.value() + self.assertEqual( + torch.tensor([float(self.world_size * (self.world_size + 1) / 2)]), + result[0], + ) + + @requires_xccl() + def test_allreduce_basics(self): + self._test_allreduce_basics(lambda t: t.clone()) + + + +if __name__ == "__main__": + assert ( + not torch.xpu._initialized + 
), "test_distributed must not have initialized XPU context on main process" + + run_tests() + diff --git a/test/run_test.py b/test/run_test.py index 80a724e129a7a2..02a37ffee07375 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -1105,6 +1105,7 @@ def run_ci_sanity_check(test: ShardedTest, test_directory, options): "distributed/test_c10d_nccl": run_test_with_subprocess, "distributed/test_c10d_gloo": run_test_with_subprocess, "distributed/test_c10d_ucc": run_test_with_subprocess, + "distributed/test_c10d_xccl": run_test_with_subprocess, "distributed/test_c10d_common": run_test_with_subprocess, "distributed/test_c10d_spawn_gloo": run_test_with_subprocess, "distributed/test_c10d_spawn_nccl": run_test_with_subprocess, diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index d59102232f7db7..ff83bc8ab66666 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -320,6 +320,12 @@ def requires_nccl(): "c10d was not compiled with the NCCL backend", ) +def requires_xccl(): + return skip_but_pass_in_sandcastle_if( + not c10d.is_xccl_available(), + "c10d was not compiled with the XCCL backend", + ) + def requires_ucc(): return skip_but_pass_in_sandcastle_if( not c10d.is_ucc_available(), From dc41d6adf4b831029432c9cb9f10eacbedd85278 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 03:27:54 +0000 Subject: [PATCH 31/96] update --- test/run_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index 02a37ffee07375..80a724e129a7a2 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -1105,7 +1105,6 @@ def run_ci_sanity_check(test: ShardedTest, test_directory, options): "distributed/test_c10d_nccl": run_test_with_subprocess, "distributed/test_c10d_gloo": run_test_with_subprocess, "distributed/test_c10d_ucc": run_test_with_subprocess, - "distributed/test_c10d_xccl": run_test_with_subprocess, "distributed/test_c10d_common": run_test_with_subprocess, "distributed/test_c10d_spawn_gloo": run_test_with_subprocess, "distributed/test_c10d_spawn_nccl": run_test_with_subprocess, From 4c3f49f2cffcab2718f5169d52f42d7b6ee36f0d Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 10 Sep 2024 05:46:06 +0000 Subject: [PATCH 32/96] enable coalese --- .../distributed/c10d/ProcessGroupXCCL.cpp | 137 ++++++++++++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 25 +++- 2 files changed, 159 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e550225e19cb79..3c73e7547b50aa 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -26,6 +26,22 @@ namespace c10d { namespace { + +// wait nonblocking implement +AutoXcclGroup::AutoXcclGroup() { + comm_ = nullptr; + ccl::group_start(); +} + +AutoNcclGroup::AutoNcclGroup(xcclComm_t comm) { + comm_ = comm; + ccl::group_start(); +} + +AutoNcclGroup::~AutoNcclGroup() noexcept(false) { + ccl::group_end(); +} + std::map xcclOps = { {ReduceOp::MIN, ccl::reduction::min}, {ReduceOp::MAX, ccl::reduction::max}, @@ -85,6 +101,34 @@ void check_xpu_single_tensor(const at::Tensor& tensor) { } } +int64_t check_xpu_tensors_same_device(const std::vector& tensors) { + if (tensors.size() == 0) { + C10_THROW_ERROR(ValueError, "Tensor list must be nonempty"); + } + + const auto& first = tensors.front(); + + int64_t total_numel = 0; + for (const auto& t : tensors) { + if 
(!t.is_xpu() || t.is_sparse()) { + C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + } + if (t.scalar_type() != first.scalar_type()) { + C10_THROW_ERROR(TypeError, "Tensors must have identical type"); + } + if (!t.is_non_overlapping_and_dense()) { + C10_THROW_ERROR(ValueError, "Tensors must be non-overlapping and dense"); + } + TORCH_CHECK_WITH( + ValueError, + t.get_device() == tensors[0].get_device(), + "Expected list of tensors on the same device"); + total_numel += t.numel(); + } + + return total_numel; +} + ccl::datatype getXcclDataType(at::ScalarType type) { auto it = xcclDatatypes.find(type); TORCH_CHECK_WITH( @@ -133,6 +177,9 @@ static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; constexpr int64_t kSynchronizeBusyWaitMillis = 10; +// Before implementing send/recv, the xcclActiveGroupCounter_ variable has no effect. +thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; + ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, @@ -314,6 +361,16 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( return it->second; } +void ProcessGroupXCCL::groupStart() { + ccl::group_start(); + ++xcclActiveGroupCounter_; +} + +void ProcessGroupXCCL::groupEnd() { + ccl::group_end(); + --xcclActiveGroupCounter_; +} + template c10::intrusive_ptr ProcessGroupXCCL::collective( at::Tensor& input, @@ -377,6 +434,53 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( opType); } +template +c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( + std::vector& inputs, + std::vector& outputs, + Fn fn, + OpType opType) { + + using traits = function_traits; + using attr_t = typename traits::template arg<2>::type; + attr_t attr = ccl::create_operation_attr(); + + auto device = inputs[0].device(); + const auto key = std::to_string(device.index()); + auto comm = getXCCLComm(key, device); + + auto stream = xcclStreams_.at(key); + auto ccl_stream = ccl::create_stream(stream.queue()); + + c10::intrusive_ptr work; + + work = initWork(device, rank_, opType); + + work->outputs_ = + std::make_shared>(std::move(outputs)); + + { + AutoXcclGroup xccl_group_guard(comm); + for (const auto i : c10::irange(inputs.size())) { + c10::xpu::XPUCachingAllocator::recordStream( + inputs[i].storage().data_ptr(), stream); + fn(inputs[i], outputs[i], attr, *comm, ccl_stream); + } + } + + work->xcclEndEvent_->record(stream); + + std::vector streams = {stream.unwrap()}; + c10::MultiStreamGuard streamGuard(streams); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + work->future_->markCompleted(at::IValue(*work->outputs_)); + + return work; + +} + c10::intrusive_ptr ProcessGroupXCCL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { @@ -409,6 +513,39 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( OpType::ALLREDUCE); } +c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( + std::vector& tensors, + const AllreduceCoalescedOptions& opts = + AllreduceCoalescedOptions()) { + check_xpu_tensors_same_device(tensors); + TORCH_CHECK( + !isFloat8Type(tensors.back().scalar_type()), + "Float8 dtypes are not currenlty supported for XCCL reductions"); + + return collectiveCoalesced( + tensors, + tensors, + [&](at::Tensor& input, + at::Tensor& output, + xcclComm_t& comm, + ccl::stream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::allreduce( + 
input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + xcclReduceOp, + comm, + stream, + attr); + return ret_evt; + }, + OpType::COALESCED); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index eca66a33922d55..6fa066e83b976c 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -65,6 +65,15 @@ using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; +namespace { +struct AutoXcclGroup { + AutoXcclGroup(); + AutoXcclGroup(xcclComm_t comm); + ~AutoXcclGroup() noexcept(false); + xcclComm_t comm_; +}; +} // namespace + class TORCH_API ProcessGroupXCCL : public Backend { public: class WorkXCCL : public Work { @@ -169,6 +178,13 @@ class TORCH_API ProcessGroupXCCL : public Backend { PostProcess post, OpType opType); + template + c10::intrusive_ptr collectiveCoalesced( + std::vector& input, + std::vector& output, + Fn fn, + OpType opType); + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; @@ -176,9 +192,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = - AllreduceCoalescedOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_coalesced not implemented"); - } + AllreduceCoalescedOptions()) override; c10::intrusive_ptr reduce( std::vector& tensors, @@ -281,6 +295,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::recv not implemented"); } + void groupStart(); + + void groupEnd(); + c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, @@ -303,6 +321,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr store_; std::mutex mutex_; bool blockingWait_ = false; + static thread_local uint64_t xcclActiveGroupCounter_; }; } // namespace c10d From afa2adc754130feedc55440500a5bc413c42965c Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 11 Sep 2024 05:34:46 +0000 Subject: [PATCH 33/96] Support broadcast --- .../distributed/c10d/ProcessGroupXCCL.cpp | 36 +++++++++++++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 3c73e7547b50aa..74a102ddf0ad3a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -546,6 +546,42 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( OpType::COALESCED); } +c10::intrusive_ptr ProcessGroupXCCL::broadcast( + std::vector& tensors, + const BroadcastOptions& opts) { + TORCH_CHECK( + tensors.size() == 1, "Expecting one tensor only but got multiple"); + auto tensor = tensors.back(); + if (tensor.is_complex()) { + tensor = at::view_as_real(tensor); + } + check_xpu_single_tensor(tensor); + + const auto root = opts.rootRank + opts.rootTensor; + + return collective( + tensor, + tensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::broadcast_attr attr, + xcclComm_t& comm, + ccl::stream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type()); + ccl::event ret_evt; + ret_evt = ccl::broadcast( + input.data_ptr(), + (size_t)input.numel(), + xcclDataType, + root, + comm, + 
stream, + attr); + return ret_evt; + }, + OpType::BROADCAST); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6fa066e83b976c..75f2d944bf72a7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -202,9 +202,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr broadcast( std::vector& tensors, - const BroadcastOptions& opts = BroadcastOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::broadcast not implemented"); - } + const BroadcastOptions& opts = BroadcastOptions()) override; c10::intrusive_ptr allgather( std::vector>& outputTensors, From 8efb5d0d397a8171f04347570aec4b6d2d5a810b Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 04:52:38 +0000 Subject: [PATCH 34/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 74a102ddf0ad3a..ac36fddf80ac4a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -457,14 +457,14 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( work = initWork(device, rank_, opType); work->outputs_ = - std::make_shared>(std::move(outputs)); + std::make_shared>(outputs); { AutoXcclGroup xccl_group_guard(comm); for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( inputs[i].storage().data_ptr(), stream); - fn(inputs[i], outputs[i], attr, *comm, ccl_stream); + work->addResult(fn(inputs[i], outputs[i], attr, *comm, ccl_stream)); } } @@ -476,6 +476,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); + work->blockingWait_ = blockingWait_; return work; From e85c26816e3dacf7244cc0d4f5abe1914f79fe66 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 04:54:33 +0000 Subject: [PATCH 35/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 5 ----- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 2 -- 2 files changed, 7 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ac36fddf80ac4a..ce482a97952c34 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -33,11 +33,6 @@ AutoXcclGroup::AutoXcclGroup() { ccl::group_start(); } -AutoNcclGroup::AutoNcclGroup(xcclComm_t comm) { - comm_ = comm; - ccl::group_start(); -} - AutoNcclGroup::~AutoNcclGroup() noexcept(false) { ccl::group_end(); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 75f2d944bf72a7..25f3a1653a0c45 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -68,9 +68,7 @@ constexpr const char* XCCL_BACKEND_NAME = "xccl"; namespace { struct AutoXcclGroup { AutoXcclGroup(); - AutoXcclGroup(xcclComm_t comm); ~AutoXcclGroup() noexcept(false); - xcclComm_t comm_; }; } // namespace From 7488dbd780dd703f39a49e81a8d488040fab4572 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 05:01:19 +0000 Subject: [PATCH 36/96] update --- 
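Reviewer note: a minimal usage sketch of the collectives wired up in the patches above (allreduce, allreduce_coalesced, broadcast). This is only an illustration, not part of any commit; it assumes a torchrun-style launch that sets RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT, and that this build was configured with USE_XCCL/USE_C10D_XCCL so the "xccl" backend is registered for XPU tensors.

    # usage sketch only -- assumes torchrun provides the env:// rendezvous variables
    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="xccl")
    rank = dist.get_rank()
    device = torch.device(f"xpu:{rank}")

    t = torch.ones(4096, device=device)
    dist.all_reduce(t)                 # dispatches to ProcessGroupXCCL::allreduce
    dist.broadcast(t, src=0)           # dispatches to ProcessGroupXCCL::broadcast

    # Coalesced path: several allreduces issued inside one group_start()/group_end().
    bufs = [torch.full((1024,), float(rank + 1), device=device) for _ in range(3)]
    dist.all_reduce_coalesced(bufs)    # dispatches to ProcessGroupXCCL::allreduce_coalesced

    dist.destroy_process_group()

The coalesced call above is what exercises the AutoXcclGroup guard introduced earlier in this series; each tensor in the list is reduced inside a single oneCCL group so the device sees one fused submission instead of three independent collectives.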
.../distributed/c10d/ProcessGroupXCCL.cpp | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ce482a97952c34..ea5220e3dac77b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -29,11 +29,23 @@ namespace { // wait nonblocking implement AutoXcclGroup::AutoXcclGroup() { +<<<<<<< HEAD + ccl::group_start(); +} + +AutoXcclGroup::AutoXcclGroup(std::shared_ptr comm) { + comm_ = std::move(comm); + ccl::group_start(); +} + +AutoXcclGroup::~AutoXcclGroup() noexcept(false) { +======= comm_ = nullptr; ccl::group_start(); } AutoNcclGroup::~AutoNcclGroup() noexcept(false) { +>>>>>>> e85c26816e3dacf7244cc0d4f5abe1914f79fe66 ccl::group_end(); } @@ -175,6 +187,10 @@ constexpr int64_t kSynchronizeBusyWaitMillis = 10; // Before implementing send/recv, the xcclActiveGroupCounter_ variable has no effect. thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; +// Before implementing send/recv, the xcclActiveGroupCounter_ variable has no +// effect. +thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; + ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, @@ -435,7 +451,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( std::vector& outputs, Fn fn, OpType opType) { - using traits = function_traits; using attr_t = typename traits::template arg<2>::type; attr_t attr = ccl::create_operation_attr(); @@ -511,8 +526,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( std::vector& tensors, - const AllreduceCoalescedOptions& opts = - AllreduceCoalescedOptions()) { + const AllreduceCoalescedOptions& opts) { check_xpu_tensors_same_device(tensors); TORCH_CHECK( !isFloat8Type(tensors.back().scalar_type()), @@ -523,6 +537,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( tensors, [&](at::Tensor& input, at::Tensor& output, + ccl::allreduce_attr attr, xcclComm_t& comm, ccl::stream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); From e5d6f3728c58fbf62ef9f5f864041730455df2d3 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 05:09:16 +0000 Subject: [PATCH 37/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 24 +++---------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ea5220e3dac77b..03ba5824baf2e5 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -29,23 +29,10 @@ namespace { // wait nonblocking implement AutoXcclGroup::AutoXcclGroup() { -<<<<<<< HEAD - ccl::group_start(); -} - -AutoXcclGroup::AutoXcclGroup(std::shared_ptr comm) { - comm_ = std::move(comm); ccl::group_start(); } AutoXcclGroup::~AutoXcclGroup() noexcept(false) { -======= - comm_ = nullptr; - ccl::group_start(); -} - -AutoNcclGroup::~AutoNcclGroup() noexcept(false) { ->>>>>>> e85c26816e3dacf7244cc0d4f5abe1914f79fe66 ccl::group_end(); } @@ -184,9 +171,6 @@ static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; constexpr int64_t kSynchronizeBusyWaitMillis = 10; -// Before implementing send/recv, the xcclActiveGroupCounter_ variable has no effect. 
-thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; - // Before implementing send/recv, the xcclActiveGroupCounter_ variable has no // effect. thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; @@ -466,11 +450,10 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( work = initWork(device, rank_, opType); - work->outputs_ = - std::make_shared>(outputs); - + work->outputs_ = std::make_shared>(outputs); + { - AutoXcclGroup xccl_group_guard(comm); + AutoXcclGroup xccl_group_guard; for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( inputs[i].storage().data_ptr(), stream); @@ -489,7 +472,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( work->blockingWait_ = blockingWait_; return work; - } c10::intrusive_ptr ProcessGroupXCCL::allreduce( From 0da5e777f3792331f373918ff54514acab824ee9 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 12 Sep 2024 09:21:05 +0000 Subject: [PATCH 38/96] add allgather --- .../distributed/c10d/ProcessGroupXCCL.cpp | 259 ++++++++++++++++-- .../distributed/c10d/ProcessGroupXCCL.hpp | 39 ++- 2 files changed, 257 insertions(+), 41 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 03ba5824baf2e5..79d67eb8fdb809 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,15 @@ XCCL_KVS get_kvs(int rank, c10d::Store& store) { return kvs; } +bool check_same_size(const std::vector& input_tensors) { + for (const auto& input_tensor : input_tensors) { + if (!input_tensors[0].is_same_size(input_tensor)) { + return false; + } + } + return true; +} + void check_xpu_single_tensor(const at::Tensor& tensor) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); @@ -190,9 +200,9 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) : Work(w.rank_, w.opType_), device_(w.device_), + xcclEndEvent_(w.xcclEndEvent_), blockingWait_(w.blockingWait_), - workStartTime_(w.workStartTime_), - xcclEndEvent_(w.xcclEndEvent_) {} + workStartTime_(w.workStartTime_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -366,10 +376,17 @@ void ProcessGroupXCCL::groupEnd() { --xcclActiveGroupCounter_; } -template +// align with good design single-device style, input_t and output_t due to +// allgatherv need vector output +template < + typename Fn, + typename input_t, + typename output_t, + typename PreProcess, + typename PostProcess> c10::intrusive_ptr ProcessGroupXCCL::collective( - at::Tensor& input, - at::Tensor& output, + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, @@ -378,26 +395,50 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( using attr_t = typename traits::template arg<2>::type; attr_t attr = ccl::create_operation_attr(); - auto device = input.device(); + auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); auto stream = xcclStreams_.at(key); - std::vector inputs{input}; - std::vector outputs{output}; c10::intrusive_ptr work; work = initWork(device, rank_, opType); - work->outputs_ = - std::make_shared>(std::move(outputs)); - c10::xpu::XPUCachingAllocator::recordStream( - input.storage().data_ptr(), stream); + { // Do we need to store the result of the operation? 
+ std::variant, std::vector>> + outputs; + std::visit( + [&work](auto&& outputData) { + using T = std::decay_t; + + if constexpr (std::is_same_v>) { + work->outputs_ = std::make_shared>( + std::move(outputData)); + } else if constexpr (std::is_same_v< + T, + std::vector>>) { + std::vector flattened; + for (auto& vec : outputData) { + flattened.insert(flattened.end(), vec.begin(), vec.end()); + } + work->outputs_ = + std::make_shared>(std::move(flattened)); + } + }, + outputs); + } + + pre(stream, work); + + for (const auto& input : inputs) { + c10::xpu::XPUCachingAllocator::recordStream( + input.storage().data_ptr(), stream); + } - auto ccl_stream = ccl::create_stream(stream.queue()); + work->addResult(fn(inputs[0], outputs[0], attr, *comm, stream)); - work->addResult(fn(input, output, attr, *comm, ccl_stream)); + post(stream, work); work->xcclEndEvent_->record(stream); @@ -412,20 +453,38 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( return work; } -template +template < + typename Fn, + typename input_t, + typename output_t, + typename PreProcess, + typename PostProcess> +c10::intrusive_ptr ProcessGroupXCCL::collective( + input_t& input, + output_t& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType) { + auto inputs = std::vector{input}; + auto outputs = std::vector{output}; + return collective(inputs, outputs, fn, pre, post, opType); +} + +template c10::intrusive_ptr ProcessGroupXCCL::collective( - at::Tensor& input, - at::Tensor& output, + input_t& input, + output_t& output, Fn fn, OpType opType) { return collective( input, output, fn, - [](at::xpu::XPUStream&, - c10::intrusive_ptr& work) {}, - [](at::xpu::XPUStream&, - c10::intrusive_ptr& work) {}, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, opType); } @@ -444,7 +503,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( auto comm = getXCCLComm(key, device); auto stream = xcclStreams_.at(key); - auto ccl_stream = ccl::create_stream(stream.queue()); c10::intrusive_ptr work; @@ -457,7 +515,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( inputs[i].storage().data_ptr(), stream); - work->addResult(fn(inputs[i], outputs[i], attr, *comm, ccl_stream)); + work->addResult(fn(inputs[i], outputs[i], attr, *comm, stream)); } } @@ -488,7 +546,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( at::Tensor& output, ccl::allreduce_attr attr, xcclComm_t& comm, - ccl::stream& stream) { + at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); ccl::event ret_evt; @@ -499,7 +557,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( xcclDataType, xcclReduceOp, comm, - stream, + ccl::create_stream(stream.queue()), attr); return ret_evt; }, @@ -521,7 +579,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( at::Tensor& output, ccl::allreduce_attr attr, xcclComm_t& comm, - ccl::stream& stream) { + at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); ccl::event ret_evt; @@ -532,7 +590,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( xcclDataType, xcclReduceOp, comm, - stream, + ccl::create_stream(stream.queue()), attr); return ret_evt; }, @@ -559,7 +617,7 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( at::Tensor& output, ccl::broadcast_attr attr, 
xcclComm_t& comm, - ccl::stream& stream) { + at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); ccl::event ret_evt; ret_evt = ccl::broadcast( @@ -568,13 +626,156 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( xcclDataType, root, comm, - stream, + ccl::create_stream(stream.queue()), attr); return ret_evt; }, OpType::BROADCAST); } +c10::intrusive_ptr ProcessGroupXCCL::allgather( + std::vector>& outputTensors, + std::vector& inputTensors, + const AllgatherOptions& opts) { + TORCH_CHECK( + inputTensors.size() == 1, "Expecting one tensor only but got multiple"); + // @lint-ignore CLANGTIDY + auto inputTensor = inputTensors.back(); + check_xpu_single_tensor(inputTensor); + // @lint-ignore CLANGTIDY + std::vector& outputTensors_ = outputTensors.back(); + + bool same_size = check_same_size(outputTensors_); + if (same_size) { + // Flatten a vector of tensors into a single, stacked tensor. + at::Tensor outputFlattened = newLikeFlat(outputTensors_); + + return collective( + inputTensor, + outputFlattened, + [&](at::Tensor& input, + at::Tensor& output, + ccl::allgather_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(input.scalar_type()); + ccl::event ret_evt; + + ret_evt = ccl::allgather( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + [](at::xpu::XPUStream&, + c10::intrusive_ptr& work) {}, + [&](at::xpu::XPUStream& Stream, + c10::intrusive_ptr& work) { + // Copy the flattened output tensors to the outputs. + c10::StreamGuard guard(Stream); + for (const auto j : c10::irange(outputTensors_.size())) { + c10::xpu::XPUCachingAllocator::recordStream( + outputTensors_[j].storage().data_ptr(), Stream); + outputTensors_[j].copy_(outputFlattened[j], true); + } + }, + OpType::ALLGATHER); + } else { + // xccl implemented allgatherv, so broadcast_oop not needed + return collective( + inputTensor, + outputTensors_, + [=](at::Tensor& input, + const std::vector& outputs, + ccl::allgatherv_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + ccl::event ret_evt; + auto xcclDataType = getXcclDataType(input.scalar_type()); + + std::vector recvCounts(outputs.size(), 0); + std::transform( + outputs.begin(), + outputs.end(), + recvCounts.begin(), + [](const at::Tensor& t) { return t.numel(); }); + + TORCH_CHECK( + (size_t)input.numel() == recvCounts[rank_], + "allgather: send and recv count doesn't match"); + + std::vector recvBufs(outputs.size(), nullptr); + std::transform( + outputs.begin(), + outputs.end(), + recvBufs.begin(), + [](const at::Tensor& t) { return t.data_ptr(); }); + + ret_evt = ccl::allgatherv( + input.data_ptr(), + (size_t)input.numel(), + recvBufs, + recvCounts, + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + c10d::OpType::ALLGATHER); + } +} + +c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( + at::Tensor& output_tensor, + at::Tensor& input_tensor, + const AllgatherOptions& opts) { + check_xpu_single_tensor(input_tensor); + check_xpu_single_tensor(output_tensor); + + if (input_tensor.dtype() != output_tensor.dtype()) { + C10_THROW_ERROR( + TypeError, "output tensor must have the same type as input tensor"); + } + + if (input_tensor.numel() * size_ != output_tensor.numel()) { + C10_THROW_ERROR( + ValueError, + "output tensor size must be equal to 
world_size times input tensor size"); + } + + return collective( + input_tensor, + output_tensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::allgather_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(input.scalar_type()); + ccl::event ret_evt; + + ret_evt = ccl::allgather( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + OpType::_ALLGATHER_BASE); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 25f3a1653a0c45..71d6a7ec653152 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -160,17 +160,36 @@ class TORCH_API ProcessGroupXCCL : public Backend { const std::vector& inputs = {}, const std::vector& outputs = {}); - template + template c10::intrusive_ptr collective( - at::Tensor& input, - at::Tensor& output, + input_t& input, + output_t& output, Fn fn, OpType opType); - template + template < + typename Fn, + typename input_t, + typename output_t, + typename PreProcess, + typename PostProcess> c10::intrusive_ptr collective( - at::Tensor& input, - at::Tensor& output, + input_t& input, + output_t& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType); + + template < + typename Fn, + typename input_t, + typename output_t, + typename PreProcess, + typename PostProcess> + c10::intrusive_ptr collective( + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, @@ -205,16 +224,12 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allgather not implemented"); - } + const AllgatherOptions& opts = AllgatherOptions()) override; c10::intrusive_ptr _allgather_base( at::Tensor& outputbuffer, at::Tensor& inputbuffer, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::_allgather_base not implemented"); - } + const AllgatherOptions& opts = AllgatherOptions()) override; c10::intrusive_ptr allgather_coalesced( std::vector>& outputTensorLists, From 0ad5677130743821757939de262c58d934afd19c Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 02:43:53 +0000 Subject: [PATCH 39/96] support allgather_into_tensor_coalesced --- .../distributed/c10d/ProcessGroupXCCL.cpp | 28 ++++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 6 +--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 79d67eb8fdb809..e78bc4e49871bd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -762,7 +762,6 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( output.storage().data_ptr(), stream); auto xcclDataType = getXcclDataType(input.scalar_type()); ccl::event ret_evt; - ret_evt = ccl::allgather( input.data_ptr(), output.data_ptr(), @@ -776,6 +775,33 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( OpType::_ALLGATHER_BASE); } +c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( + 
std::vector& outputs, + std::vector& inputs, + const AllgatherOptions& opts) { + return collectiveCoalesced( + inputs, + outputs, + [&](at::Tensor& input, + at::Tensor& output, + ccl::allgather_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type()); + ccl::event ret_evt; + ret_evt = ccl::allgather( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + OpType::COALESCED); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 71d6a7ec653152..94ee71ab0190cb 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -241,11 +241,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr allgather_into_tensor_coalesced( std::vector& outputs, std::vector& inputs, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK( - false, - "ProcessGroupXCCL::allgather_into_tensor_coalesced not implemented"); - } + const AllgatherOptions& opts = AllgatherOptions()) override; c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, From 009e334af7ee713d015907c1103027282e74f3ef Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 05:30:33 +0000 Subject: [PATCH 40/96] support reduce_scatter --- .../distributed/c10d/ProcessGroupXCCL.cpp | 198 +++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 18 +- 2 files changed, 206 insertions(+), 10 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e78bc4e49871bd..4792ba86682f03 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -272,6 +272,13 @@ bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { return true; } +constexpr const char* MULTI_DEVICE_ERROR_MSG = + "Expecting one tensor only but got multiple. You are probably using multiple " + "devices under one thread. The support for such usage has been deprecated. " + "For details, please refer to " + "https://pytorch.org/docs/stable/distributed.html#multi-gpu-collective-functions. " + "ProcessGroupXCCL continues supporting multi-process and multi-thread modes."; + ProcessGroupXCCL::ProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, @@ -376,7 +383,50 @@ void ProcessGroupXCCL::groupEnd() { --xcclActiveGroupCounter_; } -// align with good design single-device style, input_t and output_t due to +// TODO: wait p2p enable +static constexpr int CoalActive = 0x01, CoalColl = 0x02; +void ProcessGroupXCCL::startCoalescing() { + coalescedDevice_.set_index(-1); + coalescedComm_ = nullptr; + coalescing_state_ |= CoalActive; + groupStart(); +} + +c10::intrusive_ptr ProcessGroupXCCL::endCoalescing(OpType optype) { + if (coalescedComm_ == nullptr) { + // There is no actual work being coalesced, return here + groupEnd(); + coalescing_state_ = 0; + return nullptr; + } + TORCH_CHECK( + coalescedDevice_.index() >= 0, + "Somthing went wrong. 
Did you call end_coalescing before start_coalescing?"); + + auto comm = coalescedComm_; + auto device = coalescedDevice_; + + const auto key = std::to_string(device.index()); + auto stream = xcclStreams_.at(key); + + auto work = initWork(device, rank_, optype); + work->blockingWait_ = blockingWait_; + + groupEnd(); + + work->xcclEndEvent_->record(stream); + + coalescing_state_ = 0; + coalescedComm_ = nullptr; + return work; +} + +c10::intrusive_ptr ProcessGroupXCCL::endCoalescing() { + // Default OpType to COALESCED if not specified + return endCoalescing(OpType::COALESCED); +} + +// align with single-device style, input_t and output_t due to // allgatherv need vector output template < typename Fn, @@ -399,6 +449,21 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); + if (coalescing_state_ & CoalActive) { + coalescing_state_ |= CoalColl; + if (coalescedDevice_.index() < 0) { + coalescedDevice_ = device; + } else { + TORCH_CHECK( + coalescedDevice_.index() == device.index(), MULTI_DEVICE_ERROR_MSG); + } + if (coalescedComm_ == nullptr) { + coalescedComm_ = comm; + } else { + TORCH_CHECK(coalescedComm_ == comm, MULTI_DEVICE_ERROR_MSG); + } + } + auto stream = xcclStreams_.at(key); c10::intrusive_ptr work; @@ -502,6 +567,21 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); + if (coalescing_state_ & CoalActive) { + coalescing_state_ |= CoalColl; + if (coalescedDevice_.index() < 0) { + coalescedDevice_ = device; + } else { + TORCH_CHECK( + coalescedDevice_.index() == device.index(), MULTI_DEVICE_ERROR_MSG); + } + if (coalescedComm_ == nullptr) { + coalescedComm_ = comm; + } else { + TORCH_CHECK(coalescedComm_ == comm, MULTI_DEVICE_ERROR_MSG); + } + } + auto stream = xcclStreams_.at(key); c10::intrusive_ptr work; @@ -535,8 +615,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( c10::intrusive_ptr ProcessGroupXCCL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { - TORCH_CHECK( - tensors.size() == 1, "Expecting one tensor only but got multiple"); + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); check_xpu_single_tensor(tensor); return collective( @@ -600,8 +679,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( c10::intrusive_ptr ProcessGroupXCCL::broadcast( std::vector& tensors, const BroadcastOptions& opts) { - TORCH_CHECK( - tensors.size() == 1, "Expecting one tensor only but got multiple"); + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); if (tensor.is_complex()) { tensor = at::view_as_real(tensor); @@ -633,12 +711,46 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( OpType::BROADCAST); } +c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const ReduceOptions& opts) { + if (outputTensor.numel() != inputTensor.numel()) { + C10_THROW_ERROR( + ValueError, + "Tensor input and output of _reduce_oop must have the same number of elements "); + } + return collective( + inputTensor, + outputTensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::reduce_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + const int root = opts.rootRank + opts.rootTensor; + const auto xcclDataType = getXcclDataType(input.scalar_type()); + const auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = 
ccl::reduce( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + xcclReduceOp, + root, + comm, + ccl::create_stream(stream.queue())); + return ret_evt; + }, + OpType::REDUCE); +} + c10::intrusive_ptr ProcessGroupXCCL::allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts) { - TORCH_CHECK( - inputTensors.size() == 1, "Expecting one tensor only but got multiple"); + TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG); // @lint-ignore CLANGTIDY auto inputTensor = inputTensors.back(); check_xpu_single_tensor(inputTensor); @@ -802,6 +914,78 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( OpType::COALESCED); } +c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ReduceScatterOptions& opts) { + TORCH_CHECK(outputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + // @lint-ignore CLANGTIDY + auto outputTensor = outputTensors.back(); + check_xpu_single_tensor(outputTensor); + // @lint-ignore CLANGTIDY + auto inputTensors_ = inputTensors.back(); + TORCH_CHECK( + !isFloat8Type(outputTensor.scalar_type()), + "Float8 dtypes are not currenlty supported for NCCL reductions"); + + bool same_size = check_same_size(inputTensors_); + if (same_size) { + // Flatten a vector of tensors into a single, stacked tensor. + at::Tensor inputFlattened = newLikeFlat(inputTensors_); + return collective( + inputFlattened, + outputTensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::reduce_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::reduce_scatter( + input.data_ptr(), + output.data_ptr(), + (size_t)output.numel(), + xcclDataType, + xcclReduceOp, + comm, + ccl::create_stream(stream.queue())); + return ret_evt; + }, + [&](at::xpu::XPUStream& Stream, + c10::intrusive_ptr& work) { + // Copy the input tensors to the flattened inputs. + c10::StreamGuard guard(Stream); + for (const auto j : c10::irange(inputTensors_.size())) { + c10::xpu::XPUCachingAllocator::recordStream( + inputTensors_[j].storage().data_ptr(), Stream); + inputFlattened[j].copy_(inputTensors_[j], true); + } + }, + [&](at::xpu::XPUStream&, + c10::intrusive_ptr&) {}, + OpType::REDUCE_SCATTER); + } else { + const auto num_reduces = inputTensors_.size(); + startCoalescing(); + for (const int i : c10::irange(num_reduces)) { + auto& input = inputTensors_[i]; + auto& output = (i == rank_) ? 
outputTensor : input; + auto reduceOpts = ReduceOptions{ + opts.reduceOp, + static_cast(i), + static_cast(0), + opts.timeout}; + _reduce_oop(output, input, reduceOpts); + } + auto work = endCoalescing(OpType::REDUCE_SCATTER); + return work; + } +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 94ee71ab0190cb..3b72fd4261f5cb 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -149,6 +149,12 @@ class TORCH_API ProcessGroupXCCL : public Backend { return std::string(XCCL_BACKEND_NAME); } + void startCoalescing() override; + + c10::intrusive_ptr endCoalescing() override; + + c10::intrusive_ptr endCoalescing(OpType optype); + std::shared_ptr getXCCLComm( const std::string& deviceKey, at::Device& device); @@ -221,6 +227,11 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector& tensors, const BroadcastOptions& opts = BroadcastOptions()) override; + c10::intrusive_ptr _reduce_oop( + at::Tensor& outputTensors, + at::Tensor& inputTensors, + const ReduceOptions& opts = ReduceOptions()); + c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, @@ -246,9 +257,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::reduce_scatter not implemented"); - } + const ReduceScatterOptions& opts = ReduceScatterOptions()) override; c10::intrusive_ptr _reduce_scatter_base( at::Tensor& outputTensor, @@ -327,6 +336,9 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; + int coalescing_state_ = 0; + at::Device coalescedDevice_ = at::Device("xpu"); + std::shared_ptr coalescedComm_ = nullptr; bool blockingWait_ = false; static thread_local uint64_t xcclActiveGroupCounter_; }; From ecbd9894c4dcca31d8b10746231c3a0d2d155d85 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 06:26:27 +0000 Subject: [PATCH 41/96] refine test cases --- test/distributed/test_c10d_common.py | 5 +- test/distributed/test_c10d_xccl.py | 168 +++++++++++++++++- torch/distributed/distributed_c10d.py | 4 +- torch/testing/_internal/common_distributed.py | 5 +- 4 files changed, 173 insertions(+), 9 deletions(-) diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 6a0621f3f49913..0c1426d0e29c21 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -66,8 +66,9 @@ def gpus_for_rank(world_size): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. 
""" - visible_devices = list(range(torch.cuda.device_count())) - gpus_per_process = torch.cuda.device_count() // world_size + device_count = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() + visible_devices = list(range(device_count)) + gpus_per_process = device_count // world_size gpus_for_rank = [] for rank in range(world_size): gpus_for_rank.append( diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py index 33a2f196c3b5d1..a998af7b16ef98 100644 --- a/test/distributed/test_c10d_xccl.py +++ b/test/distributed/test_c10d_xccl.py @@ -7,7 +7,10 @@ import os import random import sys + +import time import tempfile +from datetime import timedelta from functools import reduce from unittest import mock, SkipTest @@ -20,6 +23,7 @@ sys.exit(0) import test_c10d_common +from test_c10d_common import DoubleGpuNet, gpus_for_rank, ModuleForDdpCommHook import torch.distributed as dist import torch.nn.functional as F @@ -29,8 +33,12 @@ from torch.testing._internal.common_distributed import ( MultiProcessTestCase, requires_xccl, + init_multigpu_helper, + skip_if_lt_x_gpu, ) from torch.testing._internal.common_utils import ( + skip_but_pass_in_sandcastle_if, + TEST_XPU, retry_on_connect_failures, run_tests, TestCase, @@ -62,10 +70,12 @@ def simple_reduce_tests(rank, world_size): return tests +TEST_MULTIXPU = torch.xpu.device_count() > 1 class RendezvousEnvTest(TestCase): @retry_on_connect_failures @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test") def test_common_errors(self): vars = { "WORLD_SIZE": "1", @@ -164,13 +174,23 @@ def withouts(d, keys): class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): @requires_xccl() @retry_on_connect_failures + @skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test") def test_default_store_timeout_nccl(self): self._test_default_store_timeout("xccl") class ProcessGroupXCCLTest(MultiProcessTestCase): - def _create_process_group_xccl(self): + def _create_process_group_xccl(self, timeout=timedelta(seconds=600), device_id=None): store = c10d.FileStore(self.file_name, self.world_size) - return c10d.ProcessGroupXCCL(store, self.rank, self.world_size) + c10d.init_process_group( + "xccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timeout, + device_id=device_id, + ) + pg = c10d.distributed_c10d._get_default_group() + return pg def setUp(self): super().setUp() @@ -182,7 +202,76 @@ def tearDown(self): os.remove(self.file_name) except OSError: pass - + + @property + def world_size(self): + return 2 + + @property + def rank_to_GPU(self): + # return rank to GPU map + return init_multigpu_helper(self.world_size, "xccl") + + @requires_xccl() + @skip_but_pass_in_sandcastle_if( + torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs" + ) + def test_close_multi_pg_unordered(self): + pg = self._create_process_group_xccl() + device = self.rank_to_GPU[self.rank][0] + t = torch.rand(10, 10, device=device) + # First allreduce to initialize default PG's communicator. 
+ pg.allreduce(t).wait() + new_pg1 = c10d.new_group([0, 1]) + new_pg2 = c10d.new_group([0, 1]) + if self.rank == 0 or self.rank == 1: + t1 = torch.rand(10, 10, device=device) + t2 = torch.rand(10, 10, device=device) + new_pg1.allreduce(t1).wait() + new_pg2.allreduce(t2).wait() + if self.rank == 0: + dist.destroy_process_group(new_pg2) + # force destruction of pg2 first + del new_pg2 + dist.destroy_process_group(new_pg1) + del new_pg1 + if self.rank == 1: + c10d.destroy_process_group(new_pg1) + # force destruction of pg1 first + del new_pg1 + dist.destroy_process_group(new_pg2) + del new_pg2 + dist.destroy_process_group() + + @requires_xccl() + @skip_but_pass_in_sandcastle_if( + torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs" + ) + def test_file_store_check(self): + # self.file_name is created using "delete=False" + # e.g., self.file_name = tempfile.NamedTemporaryFile(delete=False).name + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group( + backend="xccl", rank=self.rank, world_size=self.world_size, store=store + ) + pg = dist.distributed_c10d._get_default_group() + self.assertEqual(pg.rank(), self.rank) + self.assertEqual(pg.size(), self.world_size) + # give enough time for check() to be executed multiple times + time.sleep(2) + dist.destroy_process_group() + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIXPU, "XCCL test requires 2+ GPUs") + def test_set_process_group_desc(self): + device = torch.device(f"xpu:{self.rank}") + pg_default = self._create_process_group_xccl(device_id=device) + self.assertEqual(pg_default.group_desc, "default_pg") + pg_1 = c10d.new_group([0, 1], group_desc="test_purpose") + self.assertEqual(pg_1.group_desc, "test_purpose") + pg_2 = c10d.new_group([0, 1]) + self.assertEqual(pg_2.group_desc, "undefined") + def _test_allreduce_basics(self, fn): pg = self._create_process_group_xccl() device = torch.device("xpu:" + str(self.rank)) @@ -210,6 +299,79 @@ def _test_allreduce_basics(self, fn): def test_allreduce_basics(self): self._test_allreduce_basics(lambda t: t.clone()) +class DistributedDataParallelTest( + test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase +): + def setUp(self): + super().setUp() + self._spawn_processes() + + def _get_process_group(self): + store = self._get_store() + c10d.init_process_group( + "xccl", store=store, rank=self.rank, world_size=self.world_size + ) + return c10d.distributed_c10d._get_default_group() + + def _test_xccl_backend( + self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False + ): + process_group = self._get_process_group() + self._test_ddp_with_process_group( + process_group, devices, device_ids, multi_device, gradient_as_bucket_view + ) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_xccl_backend_multi_device_ids_not_allowed(self): + int_devices = list(range(torch.xpu.device_count())) + devices = [torch.device("xpu:" + str(i)) for i in int_devices] + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." 
+ ): + self._test_xccl_backend(devices, int_devices) + + @requires_xccl() + @skip_if_lt_x_gpu(4) + def test_ddp_multi_device_module_config(self): + gpus = gpus_for_rank(self.world_size)[self.rank] + + self.assertTrue(len(gpus) >= 2, "expecting at least 2 gpus per process") + + process_group = self._get_process_group() + + gpus = gpus[:2] + model = DoubleGpuNet(gpus) + + with self.assertRaisesRegex( + ValueError, + "DistributedDataParallel device_ids and output_device arguments only work with " + "single-device/multiple-device GPU modules or CPU modules", + ): + ddp_model = DistributedDataParallel( + model, output_device=gpus[1], process_group=process_group + ) + + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." + ): + ddp_model = DistributedDataParallel( + model, device_ids=gpus, process_group=process_group + ) + + with self.assertRaisesRegex( + ValueError, "input module must be on the same type of devices" + ): + model.fc1 = model.fc1.cpu() + ddp_model = DistributedDataParallel(model, process_group=process_group) + + model = model.cpu() + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." + ): + ddp_model = DistributedDataParallel( + model, device_ids=gpus, process_group=process_group + ) if __name__ == "__main__": diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 3f68609905bb5a..d0781765c090ff 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1672,10 +1672,10 @@ def _new_process_group_helper( "created, please use a different group name" ) - if device_id is not None and (device_id.index is None or device_id.type != "cuda"): + if device_id is not None and (device_id.index is None or (device_id.type != "cuda" and device_id.type != "xpu")): raise ValueError( "init_process_group device_id parameter must be a cuda device with an " - "id, e.g. cuda:0, not just cuda or cpu" + "id, e.g. cuda:0, xpu, not just cuda or xpu or cpu" ) # Note: _new_process_group_helper is only called from init_process_group, which always provides a timeout value diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index ff83bc8ab66666..554114b7bbcb1c 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -180,7 +180,8 @@ def skip_if_lt_x_gpu(x): def decorator(func): @wraps(func) def wrapper(*args, **kwargs): - if torch.cuda.is_available() and torch.cuda.device_count() >= x: + if (torch.cuda.is_available() and torch.cuda.device_count() >= x) or \ + (torch.xpu.is_available() and torch.xpu.device_count() >= x): return func(*args, **kwargs) sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) @@ -469,7 +470,7 @@ def init_multigpu_helper(world_size: int, backend: str): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. 
""" - nGPUs = torch.cuda.device_count() + nGPUs = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() visible_devices = range(nGPUs) # If rank is less than or equal to number of available GPU's From a23ffb2f6bd537359570701a63820ce7e0ab52dc Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 06:34:06 +0000 Subject: [PATCH 42/96] update ut --- test/distributed/test_c10d_xccl.py | 73 ------------------------------ 1 file changed, 73 deletions(-) diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py index a998af7b16ef98..3843a695f766c9 100644 --- a/test/distributed/test_c10d_xccl.py +++ b/test/distributed/test_c10d_xccl.py @@ -299,79 +299,6 @@ def _test_allreduce_basics(self, fn): def test_allreduce_basics(self): self._test_allreduce_basics(lambda t: t.clone()) -class DistributedDataParallelTest( - test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase -): - def setUp(self): - super().setUp() - self._spawn_processes() - - def _get_process_group(self): - store = self._get_store() - c10d.init_process_group( - "xccl", store=store, rank=self.rank, world_size=self.world_size - ) - return c10d.distributed_c10d._get_default_group() - - def _test_xccl_backend( - self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False - ): - process_group = self._get_process_group() - self._test_ddp_with_process_group( - process_group, devices, device_ids, multi_device, gradient_as_bucket_view - ) - - @requires_xccl() - @skip_if_lt_x_gpu(2) - def test_xccl_backend_multi_device_ids_not_allowed(self): - int_devices = list(range(torch.xpu.device_count())) - devices = [torch.device("xpu:" + str(i)) for i in int_devices] - with self.assertRaisesRegex( - ValueError, "device_ids can only be None or contain a single element." - ): - self._test_xccl_backend(devices, int_devices) - - @requires_xccl() - @skip_if_lt_x_gpu(4) - def test_ddp_multi_device_module_config(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - - self.assertTrue(len(gpus) >= 2, "expecting at least 2 gpus per process") - - process_group = self._get_process_group() - - gpus = gpus[:2] - model = DoubleGpuNet(gpus) - - with self.assertRaisesRegex( - ValueError, - "DistributedDataParallel device_ids and output_device arguments only work with " - "single-device/multiple-device GPU modules or CPU modules", - ): - ddp_model = DistributedDataParallel( - model, output_device=gpus[1], process_group=process_group - ) - - with self.assertRaisesRegex( - ValueError, "device_ids can only be None or contain a single element." - ): - ddp_model = DistributedDataParallel( - model, device_ids=gpus, process_group=process_group - ) - - with self.assertRaisesRegex( - ValueError, "input module must be on the same type of devices" - ): - model.fc1 = model.fc1.cpu() - ddp_model = DistributedDataParallel(model, process_group=process_group) - - model = model.cpu() - with self.assertRaisesRegex( - ValueError, "device_ids can only be None or contain a single element." 
- ): - ddp_model = DistributedDataParallel( - model, device_ids=gpus, process_group=process_group - ) if __name__ == "__main__": From 1d02dfe83b8c6546cd4200cc53b19aed38d025b9 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 06:47:42 +0000 Subject: [PATCH 43/96] add mpi check --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index e550225e19cb79..b51d299b47d8de 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -228,7 +228,7 @@ ProcessGroupXCCL::ProcessGroupXCCL( blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); - { + if (!with_mpirun()) { int local_rank = getXCCLEnvVar("LOCAL_RANK"); int local_world_size = getXCCLEnvVar("LOCAL_WORLD_SIZE"); if (local_rank == -1 || local_world_size == -1) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index eca66a33922d55..6d946acbea804b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -55,6 +55,13 @@ void setXCCLEnvVar(std::string envVarName, int val) { void setXCCLEnvVar(std::string envVarName, std::string val) { setenv(envVarName.c_str(), val.c_str(), 1); } + +bool with_mpirun() { + return (getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || + getenv("PMI_RANK") || getenv("PMI_SIZE") || getenv("PMIX_RANK")) + ? true + : false; +} } // namespace static std::vector TORCH_XCCL_BLOCKING_WAIT = { From c485bd82da66537b3323110c97bbb5d01c57ed67 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 06:56:23 +0000 Subject: [PATCH 44/96] update datatype map --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 8 ++++---- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 4 ---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index b51d299b47d8de..fd02226a1dd772 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -36,7 +36,6 @@ std::map xcclOps = { std::map xcclDatatypes = { {at::kByte, ccl::datatype::uint8}, {at::kChar, ccl::datatype::int8}, - {at::kShort, ccl::datatype::int16}, {at::kInt, ccl::datatype::int32}, {at::kLong, ccl::datatype::int64}, {at::kHalf, ccl::datatype::float16}, @@ -148,9 +147,9 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) : Work(w.rank_, w.opType_), device_(w.device_), + xcclEndEvent_(w.xcclEndEvent_), blockingWait_(w.blockingWait_), - workStartTime_(w.workStartTime_), - xcclEndEvent_(w.xcclEndEvent_) {} + workStartTime_(w.workStartTime_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -174,7 +173,8 @@ bool ProcessGroupXCCL::WorkXCCL::isCompleted() { try { TORCH_CHECK(flag = ret.test()); } catch (...) 
{ - finishAWorkXCCLError(std::current_exception()); + future_->setError(std::current_exception()); + finish(std::current_exception()); return true; } if (!flag) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6d946acbea804b..6d4cc5097ebbc5 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -124,10 +124,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector rets; private: - void finishAWorkXCCLError(std::exception_ptr eptr) { - future_->setError(eptr); - finish(eptr); - } void synchronizeInternal(std::chrono::milliseconds timeout); std::shared_ptr> outputs_; c10::intrusive_ptr future_; From 2d1ae87592eeab86bd5ef20706c5e373073c5ff2 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 07:22:25 +0000 Subject: [PATCH 45/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 9 +++++++-- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index fd02226a1dd772..8f689ec80eb12a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -167,14 +167,19 @@ bool ProcessGroupXCCL::WorkXCCL::checkTimeout( return true; } +void ProcessGroupXCCL::WorkXCCL::finishWorkXcclError( + const std::exception_ptr& eptr) { + future_->setError(eptr); + finish(eptr); +} + bool ProcessGroupXCCL::WorkXCCL::isCompleted() { for (auto& ret : rets) { bool flag; try { TORCH_CHECK(flag = ret.test()); } catch (...) { - future_->setError(std::current_exception()); - finish(std::current_exception()); + finishWorkXcclError(std::current_exception()); return true; } if (!flag) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6d4cc5097ebbc5..37e36047a63c16 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -88,6 +88,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { rets.push_back(std::move(result)); } + void finishWorkXcclError(const std::exception_ptr& eptr); + bool isCompleted() override; bool isSuccess() const override { From 61842614f089399f77991278e84e2d0d29e71a44 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 07:47:05 +0000 Subject: [PATCH 46/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 8f59b7a13d4d3e..cc6b9d36869b58 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -62,6 +62,11 @@ bool with_mpirun() { ? 
true : false; } + +struct AutoXcclGroup { + AutoXcclGroup(); + ~AutoXcclGroup() noexcept(false); +}; } // namespace static std::vector TORCH_XCCL_BLOCKING_WAIT = { @@ -72,13 +77,6 @@ using xcclComm_t = ccl::communicator; using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; -namespace { -struct AutoXcclGroup { - AutoXcclGroup(); - ~AutoXcclGroup() noexcept(false); -}; -} // namespace - class TORCH_API ProcessGroupXCCL : public Backend { public: class WorkXCCL : public Work { From 91d26d94f27289af653ba7cebba301c20607ae61 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 13 Sep 2024 08:50:44 +0000 Subject: [PATCH 47/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 29 ++++--------------- .../distributed/c10d/ProcessGroupXCCL.hpp | 7 ----- 2 files changed, 6 insertions(+), 30 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 8f689ec80eb12a..fbca2c7f470247 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -167,26 +167,11 @@ bool ProcessGroupXCCL::WorkXCCL::checkTimeout( return true; } -void ProcessGroupXCCL::WorkXCCL::finishWorkXcclError( - const std::exception_ptr& eptr) { - future_->setError(eptr); - finish(eptr); -} - bool ProcessGroupXCCL::WorkXCCL::isCompleted() { - for (auto& ret : rets) { - bool flag; - try { - TORCH_CHECK(flag = ret.test()); - } catch (...) { - finishWorkXcclError(std::current_exception()); - return true; - } - if (!flag) { - return false; - } + if (xcclEndEvent_ && xcclEndEvent_->query()) { + return true; } - return true; + return false; } void ProcessGroupXCCL::WorkXCCL::synchronize() { @@ -218,10 +203,6 @@ void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { synchronizeInternal(timeout); - for (auto& event : rets) { - event.wait(); - } - rets.clear(); return true; } @@ -233,6 +214,8 @@ ProcessGroupXCCL::ProcessGroupXCCL( blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); + // Intel oneCCL requires passing CCL_LOCAL_RANK and CCL_LOCAL_SIZE for non-MPI + // launchers. 
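+  // When the process was started by mpirun/mpiexec, the MPI/PMI environment
+  // (MPI_LOCALRANKID, PMI_RANK, PMIX_RANK, ...) already provides this
+  // information, so with_mpirun() lets the explicit export below be skipped.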
if (!with_mpirun()) { int local_rank = getXCCLEnvVar("LOCAL_RANK"); int local_world_size = getXCCLEnvVar("LOCAL_WORLD_SIZE"); @@ -350,7 +333,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto ccl_stream = ccl::create_stream(stream.queue()); - work->addResult(fn(input, output, attr, *comm, ccl_stream)); + fn(input, output, attr, *comm, ccl_stream); work->xcclEndEvent_->record(stream); diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 37e36047a63c16..96f7e46e7c378d 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -84,12 +84,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { WorkXCCL(const WorkXCCL& w); ~WorkXCCL() override; - void addResult(ccl::event&& result) { - rets.push_back(std::move(result)); - } - - void finishWorkXcclError(const std::exception_ptr& eptr); - bool isCompleted() override; bool isSuccess() const override { @@ -123,7 +117,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::shared_ptr xcclEndEvent_; bool blockingWait_ = false; std::chrono::time_point workStartTime_; - std::vector rets; private: void synchronizeInternal(std::chrono::milliseconds timeout); From 2a83d68e7562bb40b3e21ed15d15303189864070 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 01:25:16 +0000 Subject: [PATCH 48/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index fbca2c7f470247..7b0336960b7285 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -319,8 +319,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto comm = getXCCLComm(key, device); auto stream = xcclStreams_.at(key); - std::vector inputs{input}; - std::vector outputs{output}; c10::intrusive_ptr work; From 7f62b869297f4e199ae0bda426e4964b3257cd2a Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 01:26:18 +0000 Subject: [PATCH 49/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 7b0336960b7285..6b57a6c5471b36 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -319,6 +319,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto comm = getXCCLComm(key, device); auto stream = xcclStreams_.at(key); + std::vector outputs{output}; c10::intrusive_ptr work; From c48f5eb3c00d50f24abbc037e94fe0724c41f8b6 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 02:12:36 +0000 Subject: [PATCH 50/96] Support reduce_scatter_base --- .../distributed/c10d/ProcessGroupXCCL.cpp | 52 +++++++++++++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 7 +-- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 1a854853117615..f2dde28c4e3b94 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -54,6 +54,11 @@ std::map xcclDatatypes = { {at::kDouble, ccl::datatype::float64}, {at::kBFloat16, ccl::datatype::bfloat16}, {at::kBool, ccl::datatype::uint8}, + // use for allgather + {at::kFloat8_e5m2, ccl::datatype::uint8}, + 
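+    // The float8 entries below are mapped onto uint8 so the data can be moved
+    // (e.g. by allgather) without a native ccl float8 type; reductions on
+    // float8 inputs are rejected separately via the isFloat8Type checks.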
{at::kFloat8_e4m3fn, ccl::datatype::uint8}, + {at::kFloat8_e4m3fnuz, ccl::datatype::uint8}, + {at::kFloat8_e5m2fnuz, ccl::datatype::uint8}, }; XCCL_KVS kvs; @@ -991,6 +996,53 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( } } +c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const ReduceScatterOptions& opts) { + if (inputTensor.dtype() != outputTensor.dtype()) { + C10_THROW_ERROR( + TypeError, "input tensor must be the same type as the output tensor."); + } + + if (inputTensor.numel() != outputTensor.numel() * size_) { + C10_THROW_ERROR( + ValueError, + "input tensor must be the same size as output size times world size"); + } + + // @lint-ignore CLANGTIDY + const auto& tensor = outputTensor; + TORCH_CHECK( + !isFloat8Type(tensor.scalar_type()), + "Float8 dtypes are not currenlty supported for NCCL reductions"); + + return collective( + inputTensor, + outputTensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::reduce_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::reduce_scatter( + input.data_ptr(), + output.data_ptr(), + (size_t)output.numel(), + xcclDataType, + xcclReduceOp, + comm, + ccl::create_stream(stream.queue())); + return ret_evt; + }, + OpType::_REDUCE_SCATTER_BASE); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index cc6b9d36869b58..42b26740d49055 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -117,7 +117,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { } std::vector result() override { - TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); + return *outputs_; } bool checkTimeout( @@ -265,10 +265,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr _reduce_scatter_base( at::Tensor& outputTensor, at::Tensor& inputTensor, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK( - false, "ProcessGroupXCCL::_reduce_scatter_base not implemented"); - } + const ReduceScatterOptions& opts = ReduceScatterOptions()) override; c10::intrusive_ptr reduce_scatter_tensor_coalesced( std::vector& outputs, From 9b17dc4911f7b4074db79b6dc573aefbb74bbcb2 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 02:53:06 +0000 Subject: [PATCH 51/96] Support reduce_scatter_tensor_coalesced --- .../distributed/c10d/ProcessGroupXCCL.cpp | 37 ++++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 6 +-- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index f2dde28c4e3b94..dfd5d78a5b970b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -936,7 +936,7 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( auto inputTensors_ = inputTensors.back(); TORCH_CHECK( !isFloat8Type(outputTensor.scalar_type()), - "Float8 dtypes are not currenlty supported for NCCL reductions"); + "Float8 dtypes are not currenlty supported for XCCL reductions"); bool same_size = check_same_size(inputTensors_); if (same_size) { @@ 
-1015,7 +1015,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( const auto& tensor = outputTensor; TORCH_CHECK( !isFloat8Type(tensor.scalar_type()), - "Float8 dtypes are not currenlty supported for NCCL reductions"); + "Float8 dtypes are not currenlty supported for XCCL reductions"); return collective( inputTensor, @@ -1043,6 +1043,39 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( OpType::_REDUCE_SCATTER_BASE); } +c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( + std::vector& outputs, + std::vector& inputs, + const ReduceScatterOptions& opts) { + TORCH_CHECK( + !isFloat8Type(inputs.back().scalar_type()), + "Float8 dtypes are not currenlty supported for XCCL reductions"); + return collectiveCoalesced( + inputs, + outputs, + [&](at::Tensor& input, + at::Tensor& output, + ccl::reduce_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::reduce_scatter( + input.data_ptr(), + output.data_ptr(), + (size_t)output.numel(), + xcclDataType, + xcclReduceOp, + comm, + ccl::create_stream(stream.queue())); + return ret_evt; + }, + OpType::COALESCED); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 42b26740d49055..2357aad73bb512 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -270,11 +270,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr reduce_scatter_tensor_coalesced( std::vector& outputs, std::vector& inputs, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK( - false, - "ProcessGroupXCCL::reduce_scatter_tensor_coalesced not implemented"); - } + const ReduceScatterOptions& opts = ReduceScatterOptions()) override; c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override { From 6cb32272695f76a3f8522473db999f8036f4c771 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 03:33:48 +0000 Subject: [PATCH 52/96] support barrier --- .../distributed/c10d/ProcessGroupXCCL.cpp | 59 +++++++++++++++++-- .../distributed/c10d/ProcessGroupXCCL.hpp | 10 +++- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index dfd5d78a5b970b..670fc343c04cbd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -271,6 +271,10 @@ void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); } } + if (barrierTensor_.defined()) { + auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); + currentStream.synchronize(); + } } bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { @@ -333,6 +337,8 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( "the devices are empty "); } + usedDeviceIdxs_.insert(device.index()); + { std::lock_guard lock(mutex_); if (devXCCLCommMap_.find(deviceKey) != devXCCLCommMap_.end()) { @@ -622,12 +628,9 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( return work; } -c10::intrusive_ptr ProcessGroupXCCL::allreduce( - std::vector& tensors, +c10::intrusive_ptr 
ProcessGroupXCCL::allreduce_impl( + at::Tensor& tensor, const AllreduceOptions& opts) { - TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); - auto tensor = tensors.back(); - check_xpu_single_tensor(tensor); return collective( tensor, tensor, @@ -653,6 +656,19 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( OpType::ALLREDUCE); } +c10::intrusive_ptr ProcessGroupXCCL::allreduce( + std::vector& tensors, + const AllreduceOptions& opts) { + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + auto tensor = tensors.back(); + check_xpu_single_tensor(tensor); + TORCH_CHECK( + !isFloat8Type(tensor.scalar_type()), + "Float8 dtypes are not currenlty supported for XCCL reductions"); + + return allreduce_impl(tensor, opts); +} + c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { @@ -1076,6 +1092,39 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( OpType::COALESCED); } +c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { + // Device to use for barrier + int barDevIdx = -1; + + // See nccl barrier comments + if (!opts.device_ids.empty()) { + barDevIdx = opts.device_ids[0]; + } else if (getBoundDeviceId()) { + barDevIdx = (*getBoundDeviceId()).index(); + } else if (!usedDeviceIdxs_.empty()) { + barDevIdx = *usedDeviceIdxs_.begin(); + } else { + barDevIdx = + static_cast(rank_ % at::detail::getXPUHooks().getNumGPUs()); + } + + TORCH_CHECK_WITH( + ValueError, + barDevIdx >= 0, + "Failed to infer a GPU device id to perform barrier. "); + auto barDevice = at::Device(at::DeviceType::XPU, barDevIdx); + + at::Tensor barrierTensor = + at::zeros({1}, at::TensorOptions().device(barDevice).dtype(at::kFloat)); + + auto work = allreduce_impl(barrierTensor); + + auto xcclWork = dynamic_cast(work.get()); + TORCH_CHECK(xcclWork); + xcclWork->barrierTensor_ = std::move(barrierTensor); + return work; +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 2357aad73bb512..80bf9a3dc5749f 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -126,6 +126,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { protected: at::Device device_; std::shared_ptr xcclEndEvent_; + at::Tensor barrierTensor_; bool blockingWait_ = false; std::chrono::time_point workStartTime_; std::vector rets; @@ -211,6 +212,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, OpType opType); + c10::intrusive_ptr allreduce_impl( + at::Tensor& tensor, + const AllreduceOptions& opts = AllreduceOptions()); + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; @@ -273,9 +278,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { const ReduceScatterOptions& opts = ReduceScatterOptions()) override; c10::intrusive_ptr barrier( - const BarrierOptions& opts = BarrierOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::barrier not implemented"); - } + const BarrierOptions& opts = BarrierOptions()) override; c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, @@ -332,6 +335,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; + std::set usedDeviceIdxs_; int coalescing_state_ = 0; at::Device coalescedDevice_ = at::Device("xpu"); std::shared_ptr coalescedComm_ = nullptr; From 
d858c81606962b0e439578dbb6e11247f278ce80 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 06:10:35 +0000 Subject: [PATCH 53/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index c45e995ebd12c0..df1510ce9162d9 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -464,7 +464,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( } auto stream = xcclStreams_.at(key); - std::vector outputs{output}; c10::intrusive_ptr work; From fea20f5081463691700954986458eefa8c07df17 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 06:13:00 +0000 Subject: [PATCH 54/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index df1510ce9162d9..c7d9a10d9bf706 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -594,7 +594,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( inputs[i].storage().data_ptr(), stream); - work->addResult(fn(inputs[i], outputs[i], attr, *comm, stream)); + fn(inputs[i], outputs[i], attr, *comm, stream); } } From e0e27f3d72454bbae0f10113362b12068bdfeaa8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 14 Sep 2024 08:04:20 +0000 Subject: [PATCH 55/96] update --- torch/csrc/distributed/c10d/ProcessGroup.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 73fc2bda701327..f5e87c9be999ea 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -590,6 +590,11 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { tensor = at::empty( {1}, at::TensorOptions().device(at::DeviceType::CUDA).dtype(at::kByte)); + } else if (backendType_ == c10d::ProcessGroup::BackendType::XCCL) { + // set xpu tensor for override cpu dispatch + tensor = at::empty( + {1}, + at::TensorOptions().device(at::DeviceType::XPU).dtype(at::kByte)); } else { // Default to using cpu implementation tensor = at::empty( From 029026d547055b17679846a024f53131c1cf7bdf Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 18 Sep 2024 01:55:53 +0000 Subject: [PATCH 56/96] add ut --- test/distributed/test_c10d_ops_xccl.py | 852 +++++++++++++++++++++++++ 1 file changed, 852 insertions(+) create mode 100644 test/distributed/test_c10d_ops_xccl.py diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py new file mode 100644 index 00000000000000..5d041058ead41b --- /dev/null +++ b/test/distributed/test_c10d_ops_xccl.py @@ -0,0 +1,852 @@ +# Owner(s): ["oncall: distributed"] +# This test file contains positive tests for c10d with XCCL backend. +# During the test, it is expected that ProcessGroup will not be aborted, destroyed or incur fatal error. +# Please be mindful of this when adding tests here. +# If you need to add tests for group creation, abort or destroy, please add tests in test_c10d_xccl.py. + +# There are two ways to launch tests in this file: +# 1. Run this file directly with `python test_c10d_ops_xccl.py` +# 2. 
Use multi-process launcher, e.g. `torchrun --standalone --nproc-per-node 2 test_c10d_ops_xccl.py` + +import math +import os +import sys +import tempfile + +import torch +import torch.distributed as c10d + + +if not c10d.is_available() or not c10d.is_xccl_available(): + print("c10d XCCL not available, skipping tests", file=sys.stderr) + sys.exit(0) + + +import torch.distributed as dist +from torch.testing._internal.common_distributed import ( + init_multigpu_helper, + MultiProcContinousTest, + requires_xccl, +) +from torch.testing._internal.common_utils import ( + skip_but_pass_in_sandcastle_if, + skipIfRocm, + TEST_WITH_DEV_DBG_ASAN, + TEST_XPU, +) + + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr + ) + sys.exit(0) + +TEST_MULTIGPU = TEST_XPU and torch.xpu.device_count() >= 2 + +class ProcessGroupXCCLOpTest(MultiProcContinousTest): + @classmethod + def backend_str(cls) -> str: + return "xccl" + + # @classmethod + # def opts(cls): + # opts = c10d.ProcessGroupXCCL.Options() + # return opts + + @property + def rank_to_GPU(self): + # return rank to GPU map + return init_multigpu_helper(self.world_size, "xccl") + + # TODO: wait reduce + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_empty_tensors(self): + # pg = self.pg + # local_device_idx = self.rank_to_GPU[self.rank][0] + + # xs = [torch.FloatTensor([]).xpu(local_device_idx)] + # pg.broadcast(xs).wait() + # self.assertEqual(0, xs[0].numel()) + + # pg.allreduce(xs).wait() + # self.assertEqual(0, xs[0].numel()) + + # pg.reduce(xs).wait() + # self.assertEqual(0, xs[0].numel()) + + # ys = [ + # [ + # torch.FloatTensor([]).xpu(local_device_idx) + # for _ in range(self.world_size) + # ] + # ] + # pg.allgather(ys, xs).wait() + # for y in ys[0]: + # self.assertEqual(0, y.numel()) + + # ys = [torch.FloatTensor([]).xpu(local_device_idx)] + # xs = [ + # [ + # torch.FloatTensor([]).xpu(local_device_idx) + # for _ in range(self.world_size) + # ] + # ] + # pg.reduce_scatter(ys, xs).wait() + # self.assertEqual(0, ys[0].numel()) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_broadcast_ops(self): + pg = self.pg + + def broadcast(xs, rootRank, rootTensor): + opts = c10d.BroadcastOptions() + opts.rootRank = rootRank + opts.rootTensor = rootTensor + work = pg.broadcast(xs, opts) + work.wait() + return xs + + # Every rank is root once + for i in range(self.world_size): + # Run with 1 input tensor + x = torch.tensor([self.rank]).xpu(self.rank_to_GPU[self.rank][0]) + output = broadcast([x], i, 0) + self.assertEqual(torch.tensor([i]), output[0]) + + expected_tensor = torch.empty([i + 1, i + 1]).fill_(i + 1) + xs = [ + torch.empty([i + 1, i + 1]).fill_(-1).xpu(device=device_idx) + for device_idx in self.rank_to_GPU[self.rank] + ] + + # test with multiple input tensors (multiple gpu in one rank) + for j in range(len(xs)): + if self.rank == i: + xs[j] = expected_tensor.xpu(device=self.rank_to_GPU[self.rank][j]) + + broadcast(xs, i, j) + + for tensor in xs: + self.assertEqual(tensor, expected_tensor) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_allreduce_ops(self): + device_count = torch.xpu.device_count() + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] + + def allreduce(tensors, op): + opts = c10d.AllreduceOptions() + opts.reduceOp = op + work = pg.allreduce(tensors, opts) + 
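+            # wait() funnels into WorkXCCL::synchronizeInternal(); with
+            # TORCH_XCCL_BLOCKING_WAIT=1 the host additionally polls for
+            # completion and enforces the op timeout.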
work.wait() + + # Sum + tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + + allreduce(tensors, c10d.ReduceOp.SUM) + + ndev = self.world_size + self.assertEqual( + torch.tensor([ndev * (ndev + 1) // 2]), + tensors[0], + ) + + # Product + tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + + allreduce(tensors, c10d.ReduceOp.PRODUCT) + self.assertEqual(torch.tensor([math.factorial(self.world_size)]), tensors[0]) + + # Min + tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + + allreduce(tensors, c10d.ReduceOp.MIN) + self.assertEqual(torch.tensor([1]), tensors[0]) + + # Max + tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + + allreduce(tensors, c10d.ReduceOp.MAX) + self.assertEqual(torch.tensor([self.world_size]), tensors[0]) + + for op, err in zip( + (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR), + ("ReduceOp.BAND", "ReduceOp.BOR", "ReduceOp.BXOR"), + ): + with self.assertRaisesRegex(ValueError, "Cannot use " + err + " with XCCL"): + allreduce(tensors, op) + + # TODO: wait all2all + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_alltoall_ops_with_xpufree_race(self): + # pg = self.pg + # opts = c10d.AllToAllOptions() + # local_device = f"xpu:{self.rank_to_GPU[self.rank][0]}" + # torch.xpu.set_device(local_device) + # input = torch.rand(1000, 1000, device=local_device) + # output = torch.rand(1000, 1000, device=local_device) + # race_tensors = [] + # # create some tensors to race with alltoall collective + # for _ in range(10): + # tmp = [] + # for i in range(5): + # tmp.append(torch.rand(10 ** (3 + i), device=local_device)) + # race_tensors.append(tmp) + + # for i in range(10): + # race_tensors.pop() + # work = pg.alltoall_base(output, input, [], [], opts) + # # this triggers xpuFree + # torch.xpu.empty_cache() + # work.wait() + # torch.xpu.synchronize(device=local_device) + + # TODO: wait reduce + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_reduce_ops(self): + # pg = self.pg + # local_device_id = self.rank_to_GPU[self.rank][0] + + # def reduce(xs, rootRank, rootTensor, op=None): + # opts = c10d.ReduceOptions() + # opts.rootRank = rootRank + # opts.rootTensor = rootTensor + # if op: + # opts.reduceOp = op + # work = pg.reduce(xs, opts) + # work.wait() + + # # for every root tensor + # for rt in range(self.world_size): + # tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + + # reduce(tensors, rt, 0) + + # if self.rank == rt: + # self.assertEqual( + # torch.tensor([self.world_size * (self.world_size + 1) // 2]), + # tensors[0], + # ) + # else: + # self.assertEqual( + # torch.tensor([self.rank + 1]), + # tensors[0], + # ) + + # for op, err in zip( + # (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR), + # ("ReduceOp.BAND", "ReduceOp.BOR", "ReduceOp.BXOR"), + # ): + # with self.assertRaisesRegex( + # ValueError, "Cannot use " + err + " with XCCL" + # ): + # reduce(tensors, self.rank, rt, op) + + # # Premul sum + # if torch.xpu.xccl.version() >= (2, 11, 1): + # for factor in (3.0, torch.tensor([5.0], device=local_device_id)): + # if isinstance(factor, torch.Tensor): + # factor_ref = factor.cpu().item() + # else: + # factor_ref = factor + # float_tensors = [ + # torch.tensor( + # [self.rank + 1.0], device=f"xpu:{local_device_id}" + # ) + # ] + # float_tensors_ref = [ + # torch.tensor( + # [(self.rank + 1.0) * factor_ref], + # device=f"xpu:{local_device_id}", + # ) + 
# ] + + # reduce(float_tensors_ref, rt, 0) + # reduce(float_tensors, rt, 0, c10d._make_xccl_premul_sum(factor)) + # if self.rank == rt: + # self.assertEqual(float_tensors_ref[0], float_tensors[0]) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_allgather_ops(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + + def allgather(output_ts, input_ts): + work = pg.allgather(output_ts, input_ts) + return work.wait() + + tensors = [torch.empty(2, 2).fill_(2).xpu(device=i) for i in local_device_ids] + output_tensors = [] + expected_output = [] + + output_per_gpu = ( + [torch.empty(2, 2).fill_(-1)] * len(local_device_ids) * self.world_size + ) + expected_per_gpu = ( + [torch.empty(2, 2).fill_(2)] * len(local_device_ids) * self.world_size + ) + + for gpu in local_device_ids: + output_tensors.append([t.xpu(device=gpu) for t in output_per_gpu]) + expected_output.append([t.xpu(device=gpu) for t in expected_per_gpu]) + + result = allgather(output_tensors, tensors) + + # Verification + self.assertEqual(output_tensors, expected_output) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_allgather_base_ops(self): + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] + + def allgather_base(output_t, input_t): + work = pg._allgather_base(output_t, input_t) + work.wait() + + # allgather_base is GPU number agnostic. + # Each rank contribute one tensor regardless of GPU counts + tensor = torch.tensor([self.rank]).xpu(local_device_id) + output_t = torch.empty((self.world_size), dtype=tensor.dtype).xpu( + local_device_id + ) + + allgather_base(output_t, tensor) + + # Verification + self.assertEqual(torch.arange(self.world_size), output_t) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_allgather_base_basics(self): + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] + + def allgather_base(output_t, input_t): + work = pg._allgather_base(output_t, input_t) + work.wait() + + # anticipate an error + with self.assertRaisesRegex( + ValueError, + "output tensor size must be equal to world_size times input tensor size", + ): + tensor = torch.tensor([self.rank]).xpu(local_device_id) + output_t = torch.empty((self.world_size + 1), dtype=tensor.dtype).xpu( + local_device_id + ) + # fails the check because output_t is not correctly sized + allgather_base(output_t, tensor) + + # anticipate an error + with self.assertRaisesRegex( + TypeError, "output tensor must have the same type as input tensor" + ): + tensor = torch.tensor([self.rank], dtype=torch.float).xpu(local_device_id) + output_t = torch.empty((self.world_size + 1), dtype=torch.long).xpu( + local_device_id + ) + # fails the check because the dtype is different + allgather_base(output_t, tensor) + + # TODO: wait gather + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_gather_ops(self): + # pg = self.pg + # local_device_ids = self.rank_to_GPU[self.rank] + # num_gpus = len(local_device_ids) + + # def gather(output_t, input_t, rootRank): + # opts = c10d.GatherOptions() + # opts.rootRank = rootRank + # if rootRank == self.rank: + # work = pg.gather(output_t, input_t, opts) + # else: + # work = pg.gather([], input_t, opts) + # work.wait() + + # # init input + # tensors = [] + # for device_id in local_device_ids: + # 
tensors.append(torch.tensor([self.rank]).xpu(device_id)) + + # # init output + # output_ts = [] + # for idx in range(num_gpus): + # gpu_idx = local_device_ids[idx] + # output_ts.append([]) + # for rank in range(self.world_size): + # output_ts[idx].append(torch.tensor([-1]).xpu(gpu_idx)) + + # expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] + # for rank in range(self.world_size): + # gather(output_ts, tensors, rank) + # if rank == self.rank: + # self.assertEqual(expected, output_ts) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_gather_stress(self): + # pg = self.pg + # local_device_ids = self.rank_to_GPU[self.rank] + # num_gpus = len(local_device_ids) + + # def gather(output_t, input_t, rootRank): + # opts = c10d.GatherOptions() + # opts.rootRank = rootRank + # if rootRank == self.rank: + # work = pg.gather(output_t, input_t, opts) + # else: + # work = pg.gather([], input_t, opts) + # work.wait() + + # stress_length = 1000 + + # # init input + # tensors = [] + # for i in range(stress_length): + # tensors.append([]) + # for device_id in local_device_ids: + # tensors[i].append(torch.tensor([self.rank]).xpu(device_id)) + + # # init output + # output_ts = [] + # for i in range(stress_length): + # output_ts.append([[] for _ in range(num_gpus)]) + # for idx, ls in enumerate(output_ts[i]): + # gpu_idx = local_device_ids[idx] + # for _ in range(self.world_size): + # ls.append(torch.tensor([-1]).xpu(gpu_idx)) + + # expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] + # for i in range(stress_length): + # for rank in range(self.world_size): + # gather(output_ts[i], tensors[i], rank) + # # Verification + # if rank == self.rank: + # self.assertEqual(output_ts[i], expected) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_gather_checks(self): + # pg = self.pg + # device_id = self.rank_to_GPU[self.rank][0] + + # # init input + # tensor = torch.tensor([self.rank]).xpu(device_id) + + # # init output + # output_ts = [] + # for rank in range(self.world_size): + # output_ts.append(torch.tensor([-1]).xpu(device_id)) + + # with self.assertRaisesRegex(ValueError, "invalid root rank"): + # opts = c10d.GatherOptions() + # opts.rootRank = -1 + # pg.gather([output_ts], [tensor], opts) + + # with self.assertRaisesRegex(TypeError, "incompatible function arguments"): + # pg.gather([output_ts], [tensor], 0) + + # with self.assertRaisesRegex(ValueError, "invalid root rank"): + # opts = c10d.GatherOptions() + # opts.rootRank = self.world_size + # pg.gather([output_ts], [tensor], opts) + + # with self.assertRaisesRegex( + # # throws error message from dispatcher + # RuntimeError, + # "There were no tensor arguments to this function", + # ): + # opts = c10d.GatherOptions() + # opts.rootRank = 0 + # pg.gather([output_ts], [], opts) + + # TODO: wait scatter + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_scatter_ops(self): + # pg = self.pg + # local_device_ids = self.rank_to_GPU[self.rank] + # num_gpus = len(local_device_ids) + + # def scatter(output_t, input_t, rootRank): + # opts = c10d.ScatterOptions() + # opts.rootRank = rootRank + # if rootRank == self.rank: + # work = pg.scatter(output_t, input_t, opts) + # else: + # work = pg.scatter(output_t, [], opts) + # work.wait() + + # # init output + # tensors = [] + # for device_id in local_device_ids: + # 
tensors.append(torch.tensor([-1]).xpu(device_id)) + + # # init input + # scatter_list = [] + # for idx in range(num_gpus): + # gpu_idx = local_device_ids[idx] + # scatter_list.append([]) + # for rank in range(self.world_size): + # scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) + + # # test each rank to scatter + # expected = [torch.tensor([self.rank])] + # for rank in range(self.world_size): + # scatter(tensors, scatter_list, rank) + # self.assertEqual(expected, tensors) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_scatter_stress(self): + # pg = self.pg + # local_device_ids = self.rank_to_GPU[self.rank] + # num_gpus = len(local_device_ids) + + # def scatter(output_t, input_t, rootRank): + # opts = c10d.ScatterOptions() + # opts.rootRank = rootRank + # if rootRank == self.rank: + # work = pg.scatter(output_t, input_t, opts) + # else: + # work = pg.scatter(output_t, [], opts) + # work.wait() + + # stress_length = 1000 + + # # init output + # tensors = [] + # for i in range(stress_length): + # tensors.append([]) + # for device_id in local_device_ids: + # tensors[i].append(torch.tensor([-1]).xpu(device_id)) + + # # init input + # scatter_list = [] + # for i in range(stress_length): + # scatter_list.append([[] for _ in range(num_gpus)]) + # for idx, ls in enumerate(scatter_list[i]): + # gpu_idx = local_device_ids[idx] + # for rank in range(self.world_size): + # ls.append(torch.tensor([rank]).xpu(gpu_idx)) + + # # test each rank to scatter + # expected = [torch.tensor([self.rank])] + # for i in range(stress_length): + # for rank in range(self.world_size): + # scatter(tensors[i], scatter_list[i], rank) + # # Verification + # self.assertEqual(tensors[i], expected) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_scatter_checks(self): + # pg = self.pg + # local_device_ids = self.rank_to_GPU[self.rank] + # num_gpus = len(local_device_ids) + + # # init output + # tensors = [] + # for device_id in local_device_ids: + # tensors.append(torch.tensor([-1]).xpu(device_id)) + + # # init input + # scatter_list = [] + # for idx in range(num_gpus): + # gpu_idx = local_device_ids[idx] + # scatter_list.append([]) + # for rank in range(self.world_size): + # scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) + + # with self.assertRaisesRegex(ValueError, "invalid root rank"): + # opts = c10d.ScatterOptions() + # opts.rootRank = -1 + # pg.scatter(tensors, scatter_list, opts) + + # with self.assertRaisesRegex(TypeError, "incompatible function arguments"): + # pg.scatter(tensors, scatter_list, 0) + + # with self.assertRaisesRegex(ValueError, "invalid root rank"): + # opts = c10d.ScatterOptions() + # opts.rootRank = self.world_size + # pg.scatter(tensors, scatter_list, opts) + + # with self.assertRaisesRegex( + # # throws error message from dispatcher + # RuntimeError, + # "There were no tensor arguments to this function", + # ): + # opts = c10d.ScatterOptions() + # opts.rootRank = 0 + # pg.scatter([], scatter_list, opts) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_reduce_scatter_base_basics(self): + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] + + def reduce_scatter_base(output_t, input_t): + work = pg._reduce_scatter_base(output_t, input_t) + work.wait() + + # anticipate an error + with self.assertRaisesRegex( + ValueError, + "input tensor must be the same size 
as output size times world size", + ): + input_t = torch.tensor([self.rank]).xpu(local_device_id) + output_t = torch.empty((self.world_size + 1), dtype=input_t.dtype).xpu( + local_device_id + ) + # fails the check because output_t is not correctly sized + reduce_scatter_base(output_t, input_t) + + # anticipate an error + with self.assertRaisesRegex( + TypeError, "input tensor must be the same type as the output tensor." + ): + tensor = torch.tensor([self.rank], dtype=torch.float).xpu(local_device_id) + output_t = torch.empty((self.world_size + 1), dtype=torch.long).xpu( + local_device_id + ) + # fails the check because the dtype is different + reduce_scatter_base(output_t, tensor) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_reduce_scatter_ops(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) + + def reduce_scatter(outputs, input_lists, op): + opts = c10d.ReduceScatterOptions() + opts.reduceOp = op + work = pg.reduce_scatter(outputs, input_lists, opts) + work.wait() + + output = [torch.tensor([0]).xpu(i) for i in local_device_ids] + + # GPU/rank + # 0 [1], [2], [3], [4] + # 1 [2], [3], [4], [5] + # 2 [3], [4], [5], [6] + # 3 [4], [5], [6], [7] + + # Sum + tensor_lists = [] + input_per_gpu = [] + + for i in range(self.world_size): + input_per_gpu.append(torch.tensor([self.rank + i + 1])) + + for gpu in local_device_ids: + tensor_lists.append([t.xpu(device=gpu) for t in input_per_gpu]) + + reduce_scatter(output, tensor_lists, c10d.ReduceOp.SUM) + + for i in range(num_gpus): + expected = torch.tensor( + [ + (1 + self.world_size) * self.world_size // 2 + + self.world_size * self.rank + ] + ) + + self.assertEqual(expected, output[i]) + + # Min + reduce_scatter(output, tensor_lists, c10d.ReduceOp.MIN) + + for i in range(num_gpus): + expected = torch.tensor([self.rank + 1 + i]) + self.assertEqual(expected, output[i]) + + # Max + reduce_scatter(output, tensor_lists, c10d.ReduceOp.MAX) + + for i in range(num_gpus): + expected = torch.tensor([self.rank + self.world_size + i]) + self.assertEqual(expected, output[i]) + + # Product + reduce_scatter(output, tensor_lists, c10d.ReduceOp.PRODUCT) + + # math package don't have math.perm until python 3.8, so + # we implement a naive version here. + def perm(n, k): + prod_val = n + for val in range(n - k + 1, n): + prod_val *= val + return prod_val + + for i in range(num_gpus): + prod_val = perm(self.rank + self.world_size, self.world_size) + + expected = torch.tensor([prod_val]) + self.assertEqual(expected, output[i]) + + # Test the input params overridden scenarios, aka, when the input is + # a list and output is just one tensor. 
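+        # (The single-tensor-output form below is expected to produce the same
+        #  reductions as the list variant above; only the call signature differs.)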
+ # Sum + output_tensor = torch.empty_like(input_per_gpu[0][0]).xpu(self.rank) + input_list = [tensor[0].xpu(self.rank) for tensor in input_per_gpu] + pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.SUM).wait() + expected = torch.tensor( + (1 + self.world_size) * self.world_size // 2 + self.world_size * self.rank + ) + self.assertEqual(expected, output_tensor) + + # Min + pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.MIN).wait() + expected = torch.tensor(self.rank + 1) + self.assertEqual(expected, output_tensor) + + # Max + pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.MAX).wait() + expected = torch.tensor(self.rank + self.world_size) + self.assertEqual(expected, output_tensor) + + # Product + pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.PRODUCT).wait() + prod_val = self.rank + 1 + for k in range(1, self.world_size): + prod_val = prod_val * (self.rank + 1 + k) + expected = torch.tensor(prod_val) + self.assertEqual(expected, output_tensor) + + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_reduce_scatter_base_ops(self): + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] + + def reduce_scatter_base(output_t, input_t): + work = pg._reduce_scatter_base(output_t, input_t) + work.wait() + + # reduce_scatter_base is GPU number agnostic. + # Each rank contribute one tensor regardless of GPU counts + output_t = torch.empty([1]).xpu(local_device_id) + tensor = torch.arange(self.world_size, dtype=output_t.dtype).xpu( + local_device_id + ) + + reduce_scatter_base(output_t, tensor) + + # Verification + self.assertEqual(output_t[0], self.rank * self.world_size) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_barrier(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + + def allreduce(tensors): + opts = c10d.AllreduceOptions() + work = pg.allreduce(tensors, opts) + return work + + # Making the collective to operate on + # 1, 2, 3, 4, .... 
len(local_device_ids) GPUs + tensors_list = [[] for _ in range(len(local_device_ids))] + + for i in range(1, len(local_device_ids) + 1): + for j in range(i): + tensors_list[i - 1].append( + torch.tensor([j + 1]).xpu(local_device_ids[j]) + ) + + works = [] + for tensors in tensors_list: + work = allreduce(tensors) + works.append(work) + + # Barrier will ensure that all previous work is completed + pg.barrier().wait() + + for i in range(1, len(local_device_ids) + 1): + for j in range(i): + self.assertEqual( + torch.tensor([(j + 1) * self.world_size]), tensors_list[i - 1][j] + ) + + # TODO: wait send/recv + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_send_recv(self): + # pg = self.pg + # device = self.rank_to_GPU[self.rank][0] + + # # Generate the same random tensor + # torch.manual_seed(0) + # send_tensor = torch.rand(10, 10, device=device) + # if self.rank == 0: + # dist.send(send_tensor, 1) + # if self.rank == 1: + # recv_tensor = torch.rand(10, 10, device=device) + # dist.recv(recv_tensor, 0) + # self.assertEqual(send_tensor, recv_tensor) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_send_recv_complex(self): + # pg = self.pg + # device = self.rank_to_GPU[self.rank][0] + + # # Generate the same random tensor + # torch.manual_seed(0) + # send_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) + # if self.rank == 0: + # dist.send(send_tensor, 1) + # if self.rank == 1: + # recv_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) + # dist.recv(recv_tensor, 0) + # self.assertEqual(send_tensor, recv_tensor) + + # @requires_xccl() + # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + # def test_send_recv_object_list(self): + # device = self.rank_to_GPU[self.rank][0] + + # val = 99 if self.rank == 0 else None + # object_list = [val] * self.world_size + # if self.rank == 0: + # dist.send_object_list(object_list, 1, device=device) + # if self.rank == 1: + # dist.recv_object_list(object_list, 0, device=device) + # self.assertEqual(object_list[0], 99) + + +if __name__ == "__main__": + rank = int(os.getenv("RANK", -1)) + world_size = int(os.getenv("WORLD_SIZE", 2)) + + if rank != -1: + # Launched with torchrun or other multi-proc launchers. Directly run the test. + ProcessGroupXCCLOpTest.run_rank(rank, world_size) + else: + # Launched as a single process. Spawn subprocess to run the tests. + # Also need a rendezvous file for `init_process_group` purpose. 
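+        # The spawned ranks are expected to rendezvous through rdvz_file; for a
+        # real multi-GPU run, the `torchrun --standalone --nproc-per-node 2`
+        # invocation documented at the top of this file is the simpler path.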
+ rdvz_file = tempfile.NamedTemporaryFile(delete=False).name + torch.multiprocessing.spawn( + ProcessGroupXCCLOpTest.run_rank, + nprocs=world_size, + args=(world_size, rdvz_file), + ) + From 682f40fe87c2fa2f725eae3550f557fdc19c60ef Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 18 Sep 2024 06:16:31 +0000 Subject: [PATCH 57/96] Support all2all_base --- .../distributed/c10d/ProcessGroupXCCL.cpp | 102 +++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +- 2 files changed, 101 insertions(+), 5 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index c7d9a10d9bf706..7f3e3719cdd307 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -100,12 +100,23 @@ bool check_same_size(const std::vector& input_tensors) { return true; } -void check_xpu_single_tensor(const at::Tensor& tensor) { +void check_xpu_single_tensor( + const at::Tensor& tensor, + const bool p2p = false // whether operation is a P2P operation +) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); } + // Skip the following requirements for P2P operations if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + if (p2p) { + TORCH_WARN_ONCE( + "Detected non-contiguous tensor in P2P operations. It is user " + "responsibility to guarantee that source and destination tensors have " + "the same contiguity format."); + } else { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + } } } @@ -1108,6 +1119,93 @@ c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { return work; } +c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + std::vector& outputSplitSizes, + std::vector& inputSplitSizes, + const AllToAllOptions& /* unused */) { + check_xpu_single_tensor(outputTensor, true); + check_xpu_single_tensor(inputTensor, true); + if (outputSplitSizes.size() == 0 && inputSplitSizes.size() == 0) { + TORCH_CHECK( + outputTensor.numel() == inputTensor.numel() && + outputTensor.scalar_type() == inputTensor.scalar_type(), + "xpu_alltoall_base: tensors are not equal in size or data type"); + TORCH_CHECK( + outputTensor.size(0) % size_ == 0, + "xpu_alltoall_base: tensor's dim 0 does not divide equally across group size"); + return collective( + inputTensor, + outputTensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::alltoall_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + auto xcclDataType = getXcclDataType(output.scalar_type()); + ccl::event ret_evt; + ret_evt = ccl::alltoall( + input.data_ptr(), + output.data_ptr(), + (size_t)output.numel() / comm.size(), + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + OpType::ALLTOALL_BASE); + } else { + c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); + c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); + + return collective( + inputTensor, + outputTensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::alltoall_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + std::vector sendCounts(size_); + std::vector recvCounts(size_); + bool inputSplitsEqual = inputSplitSizes.size() == 0; + bool outputSplitsEqual = outputSplitSizes.size() == 0; + + size_t inLen = 
input.numel(); + size_t outLen = output.numel(); + if (inLen) + inLen /= (inputSplitsEqual ? size_ : input.size(0)); + if (outLen) + outLen /= (outputSplitsEqual ? size_ : output.size(0)); + + for (int i = 0; i < size_; i++) { + sendCounts[i] = + (inputSplitsEqual ? inLen : inputSplitSizes[i] * inLen); + recvCounts[i] = + (outputSplitsEqual ? outLen : outputSplitSizes[i] * outLen); + } + auto xcclDataType = getXcclDataType(output.scalar_type()); + ccl::event ret_evt; + + ret_evt = ccl::alltoallv( + input.data_ptr(), + sendCounts, + output.data_ptr(), + recvCounts, + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + OpType::ALLTOALL_BASE); + } +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 8e17435a4ce1b6..0147ef3744384e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -278,9 +278,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Tensor& inputTensor, std::vector& outputSplitSizes, std::vector& inputSplitSizes, - const AllToAllOptions& opts = AllToAllOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::alltoall_base not implemented"); - } + const AllToAllOptions& opts = AllToAllOptions()) override; c10::intrusive_ptr alltoall( std::vector& outputTensors, From 2694617e6c4027875360e050051fa5128bcd7261 Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 18 Sep 2024 06:53:24 +0000 Subject: [PATCH 58/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 7f3e3719cdd307..16766b1190b072 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1158,7 +1158,8 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( return ret_evt; }, OpType::ALLTOALL_BASE); - } else { + } + else { c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); @@ -1167,7 +1168,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::alltoall_attr attr, + ccl::alltoallv_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { std::vector sendCounts(size_); From 612df4271896bde6e0d30cb0eb2a07a2bae346cf Mon Sep 17 00:00:00 2001 From: hanchao Date: Wed, 18 Sep 2024 08:57:33 +0000 Subject: [PATCH 59/96] support all2all --- test/distributed/test_c10d_ops_xccl.py | 49 ++++---- .../distributed/c10d/ProcessGroupXCCL.cpp | 116 +++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +- 3 files changed, 139 insertions(+), 30 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py index 5d041058ead41b..a59d03a1750e1c 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -181,31 +181,30 @@ def allreduce(tensors, op): with self.assertRaisesRegex(ValueError, "Cannot use " + err + " with XCCL"): allreduce(tensors, op) - # TODO: wait all2all - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_alltoall_ops_with_xpufree_race(self): - # pg = self.pg - # opts = c10d.AllToAllOptions() - # local_device = f"xpu:{self.rank_to_GPU[self.rank][0]}" - # 
torch.xpu.set_device(local_device) - # input = torch.rand(1000, 1000, device=local_device) - # output = torch.rand(1000, 1000, device=local_device) - # race_tensors = [] - # # create some tensors to race with alltoall collective - # for _ in range(10): - # tmp = [] - # for i in range(5): - # tmp.append(torch.rand(10 ** (3 + i), device=local_device)) - # race_tensors.append(tmp) - - # for i in range(10): - # race_tensors.pop() - # work = pg.alltoall_base(output, input, [], [], opts) - # # this triggers xpuFree - # torch.xpu.empty_cache() - # work.wait() - # torch.xpu.synchronize(device=local_device) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_alltoall_ops_with_xpufree_race(self): + pg = self.pg + opts = c10d.AllToAllOptions() + local_device = f"xpu:{self.rank_to_GPU[self.rank][0]}" + torch.xpu.set_device(local_device) + input = torch.rand(1000, 1000, device=local_device) + output = torch.rand(1000, 1000, device=local_device) + race_tensors = [] + # create some tensors to race with alltoall collective + for _ in range(10): + tmp = [] + for i in range(5): + tmp.append(torch.rand(10 ** (3 + i), device=local_device)) + race_tensors.append(tmp) + + for i in range(10): + race_tensors.pop() + work = pg.alltoall_base(output, input, [], [], opts) + # this triggers xpuFree + torch.xpu.empty_cache() + work.wait() + torch.xpu.synchronize(device=local_device) # TODO: wait reduce # @requires_xccl() diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 16766b1190b072..5d43694def146c 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -91,6 +91,44 @@ XCCL_KVS get_kvs(int rank, c10d::Store& store) { return kvs; } +bool computeLengthsAndCheckAndGetFlat( + const std::vector& tensors, + std::vector& lengths, + at::Tensor& flatTensor, + int64_t& flatLength) { + int64_t groupSize = tensors.size(); + auto firstTensor = tensors[0]; + int64_t totalSize = 0; + bool isFlat = true; + + auto storage = firstTensor.storage(); + int64_t firstStorageOffset = firstTensor.storage_offset(); + + for (int i = 0; i < groupSize; i++) { + auto& curTensor = tensors[i]; + int64_t length = curTensor.numel(); + lengths[i] = length; + totalSize += length; + + if (isFlat && + (!storage.is_alias_of(curTensor.storage()) || + curTensor.storage_offset() != + firstStorageOffset + totalSize - length)) { + isFlat = false; + } + } + + flatLength = totalSize; + + if (isFlat) { + flatTensor = firstTensor; + } else { + flatTensor = at::empty({totalSize}, firstTensor.options()); + } + + return isFlat; +} + bool check_same_size(const std::vector& input_tensors) { for (const auto& input_tensor : input_tensors) { if (!input_tensors[0].is_same_size(input_tensor)) { @@ -1158,8 +1196,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( return ret_evt; }, OpType::ALLTOALL_BASE); - } - else { + } else { c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); @@ -1207,6 +1244,81 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( } } +c10::intrusive_ptr ProcessGroupXCCL::alltoall( + std::vector& outputTensors, + std::vector& inputTensors, + const AllToAllOptions& /* unused */) { + auto device = outputTensors[0].device(); + for (const auto r : c10::irange(outputTensors.size())) { + check_xpu_single_tensor(outputTensors[r], true); + check_xpu_single_tensor(inputTensors[r], true); + 
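+    // p2p=true only relaxes the contiguity requirement to a warning (see
+    // check_xpu_single_tensor); tensors must still be dense XPU tensors on a
+    // single device, which the check below enforces.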
TORCH_CHECK( + device == outputTensors[r].device() && + device == inputTensors[r].device(), + "Tensors must be on the same device") + } + + return collective( + inputTensors, + outputTensors, + [&](at::Tensor& /* unused */, + at::Tensor& /* unused */, + ccl::alltoallv_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + c10::OptionalStreamGuard stream_guard(stream.unwrap()); + at::Tensor flatInput; + at::Tensor flatOutput; + + std::vector sendCounts(size_); + std::vector recvCounts(size_); + + int64_t flatSendCount; + int64_t flatRecvCount; + + bool isInputFlat = computeLengthsAndCheckAndGetFlat( + inputTensors, sendCounts, flatInput, flatSendCount); + bool isOutputFlat = computeLengthsAndCheckAndGetFlat( + outputTensors, recvCounts, flatOutput, flatRecvCount); + if (!isInputFlat) { + auto flatInputSplits = flatInput.split_with_sizes( + c10::IntArrayRef((int64_t*)sendCounts.data(), sendCounts.size()), + 0); + + for (int i = 0; i < size_; i++) { + flatInputSplits[i].copy_(inputTensors[i].view({-1})); + } + } + + auto xcclDataType = getXcclDataType(flatOutput.scalar_type()); + ccl::event ret_evt; + ret_evt = ccl::alltoallv( + flatInput.data_ptr(), + sendCounts, + flatOutput.data_ptr(), + recvCounts, + xcclDataType, + comm, + ccl::create_stream(stream.queue()), + attr); + + if (!isOutputFlat) { + ret_evt.wait(); + auto flatOutputSplits = flatOutput.split_with_sizes( + c10::IntArrayRef((int64_t*)recvCounts.data(), recvCounts.size()), + 0); + + for (int i = 0; i < size_; i++) { + outputTensors[i].view({-1}).copy_(flatOutputSplits[i]); + } + } + + stream.synchronize(); + return ret_evt; + }, + OpType::ALLTOALL); +} + } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 0147ef3744384e..cfef4ace195f26 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -283,9 +283,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr alltoall( std::vector& outputTensors, std::vector& inputTensors, - const AllToAllOptions& opts = AllToAllOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::alltoall not implemented"); - } + const AllToAllOptions& opts = AllToAllOptions()) override; c10::intrusive_ptr send( std::vector& tensors, From 001dac2fafb6d658326d579bfb86216ef04e6077 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 19 Sep 2024 03:33:32 +0000 Subject: [PATCH 60/96] use lintrunner format code --- setup.py | 2 +- test/distributed/test_c10d_common.py | 6 +++- test/distributed/test_c10d_xccl.py | 33 ++++++++----------- torch/_C/_distributed_c10d.pyi | 2 +- torch/csrc/distributed/c10d/Ops.cpp | 2 +- torch/csrc/distributed/c10d/init.cpp | 1 - torch/distributed/distributed_c10d.py | 9 +++-- torch/testing/_internal/common_distributed.py | 2 +- 8 files changed, 29 insertions(+), 28 deletions(-) diff --git a/setup.py b/setup.py index e6191c0616db4a..ad48f4b0108633 100644 --- a/setup.py +++ b/setup.py @@ -648,7 +648,7 @@ def run(self): if cmake_cache_vars["USE_XCCL"]: report("-- Building XCCL library") else: - report("-- Not using XCCL") + report("-- Not using XCCL") if cmake_cache_vars["USE_DISTRIBUTED"]: if IS_WINDOWS: report("-- Building without distributed package") diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 0c1426d0e29c21..3e5538d57e38ae 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -66,7 +66,11 @@ 
def gpus_for_rank(world_size): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. """ - device_count = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() + device_count = ( + torch.xpu.device_count() + if torch.xpu.is_available() + else torch.cuda.device_count() + ) visible_devices = list(range(device_count)) gpus_per_process = device_count // world_size gpus_for_rank = [] diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py index 3843a695f766c9..704cdd414e554b 100644 --- a/test/distributed/test_c10d_xccl.py +++ b/test/distributed/test_c10d_xccl.py @@ -1,18 +1,11 @@ # Owner(s): ["oncall: distributed"] -import copy -import logging import math -import operator import os -import random import sys - import time -import tempfile from datetime import timedelta -from functools import reduce -from unittest import mock, SkipTest +from unittest import mock import torch import torch.distributed as c10d @@ -23,27 +16,23 @@ sys.exit(0) import test_c10d_common -from test_c10d_common import DoubleGpuNet, gpus_for_rank, ModuleForDdpCommHook import torch.distributed as dist -import torch.nn.functional as F import torch.testing._internal.common_utils as common -from torch import nn -from torch.nn.parallel import DistributedDataParallel from torch.testing._internal.common_distributed import ( + init_multigpu_helper, MultiProcessTestCase, requires_xccl, - init_multigpu_helper, - skip_if_lt_x_gpu, ) from torch.testing._internal.common_utils import ( - skip_but_pass_in_sandcastle_if, - TEST_XPU, retry_on_connect_failures, run_tests, + skip_but_pass_in_sandcastle_if, + TEST_XPU, TestCase, ) + def simple_reduce_tests(rank, world_size): tests = [ ( @@ -70,8 +59,10 @@ def simple_reduce_tests(rank, world_size): return tests + TEST_MULTIXPU = torch.xpu.device_count() > 1 + class RendezvousEnvTest(TestCase): @retry_on_connect_failures @requires_xccl() @@ -171,6 +162,7 @@ def withouts(d, keys): self.assertEqual(rank, 0) self.assertEqual(size, 1) + class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): @requires_xccl() @retry_on_connect_failures @@ -178,8 +170,11 @@ class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): def test_default_store_timeout_nccl(self): self._test_default_store_timeout("xccl") + class ProcessGroupXCCLTest(MultiProcessTestCase): - def _create_process_group_xccl(self, timeout=timedelta(seconds=600), device_id=None): + def _create_process_group_xccl( + self, timeout=timedelta(seconds=600), device_id=None + ): store = c10d.FileStore(self.file_name, self.world_size) c10d.init_process_group( "xccl", @@ -286,7 +281,7 @@ def _test_allreduce_basics(self, fn): result = fut.value() self.assertEqual(expected, result[0], exact_dtype=False) - x = fn(torch.tensor([self.rank + 1.0], device = device)) + x = fn(torch.tensor([self.rank + 1.0], device=device)) fut = pg.allreduce(x).get_future() fut.wait() result = fut.value() @@ -300,11 +295,9 @@ def test_allreduce_basics(self): self._test_allreduce_basics(lambda t: t.clone()) - if __name__ == "__main__": assert ( not torch.xpu._initialized ), "test_distributed must not have initialized XPU context on main process" run_tests() - diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 53011cde6b178a..6033d969925972 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -705,4 +705,4 @@ class ProcessGroupXCCL(Backend): store: Store, rank: int, size: int, - ): ... 
\ No newline at end of file + ): ... diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index 48d2b3ed1bf69a..699c54236f6412 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -510,7 +510,7 @@ namespace { #define REGISTER_C10D_OP(FUNC) \ REGISTER_C10D_OP1(FUNC, CPU) \ REGISTER_C10D_OP1(FUNC, CUDA) \ - REGISTER_C10D_OP1(FUNC, XPU) \ + REGISTER_C10D_OP1(FUNC, XPU) \ REGISTER_C10D_OP1(FUNC, PrivateUse1) // Now we start to register ops with the three device keys diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 5d200bb6eeb9cf..e3ed6d6bd4bcb4 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -41,7 +41,6 @@ #include #endif - #include #include #include diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index d0781765c090ff..9fa3224873c9fc 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1111,10 +1111,12 @@ def is_ucc_available() -> bool: """Check if the UCC backend is available.""" return _UCC_AVAILABLE + def is_xccl_available() -> bool: """Check if the XCCL backend is available.""" return _XCCL_AVAILABLE + def is_backend_available(backend: str) -> bool: """ Check backend availability. @@ -1367,7 +1369,7 @@ def _set_pg_timeout(timeout: timedelta, group: Optional[ProcessGroup] = None) -> backends.add(backend) # type: ignore[arg-type] elif is_gloo_available() and isinstance(backend, ProcessGroupGloo): backends.add(backend) # type: ignore[arg-type] - if torch.device("xpu") in devices and is_xpu_available(): + if torch.device("xpu") in devices and is_xccl_available(): backend = group._get_backend(torch.device("xpu")) if isinstance(backend, ProcessGroupXCCL): backends.add(backend) # type: ignore[arg-type] @@ -1672,7 +1674,10 @@ def _new_process_group_helper( "created, please use a different group name" ) - if device_id is not None and (device_id.index is None or (device_id.type != "cuda" and device_id.type != "xpu")): + if device_id is not None and ( + device_id.index is None + or (device_id.type != "cuda" and device_id.type != "xpu") + ): raise ValueError( "init_process_group device_id parameter must be a cuda device with an " "id, e.g. 
cuda:0, xpu, not just cuda or xpu or cpu" diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 554114b7bbcb1c..26bdcce6103120 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -326,7 +326,7 @@ def requires_xccl(): not c10d.is_xccl_available(), "c10d was not compiled with the XCCL backend", ) - + def requires_ucc(): return skip_but_pass_in_sandcastle_if( not c10d.is_ucc_available(), From f13b44908b4b6366f8af7bb2bdbfd4d1a2e3758c Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 19 Sep 2024 05:35:15 +0000 Subject: [PATCH 61/96] rm allgatherv align with nccl --- .../distributed/c10d/ProcessGroupXCCL.cpp | 126 ++++++++---------- .../distributed/c10d/ProcessGroupXCCL.hpp | 41 +++--- 2 files changed, 74 insertions(+), 93 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index c7d9a10d9bf706..25181d2b9d2498 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -268,11 +268,7 @@ bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { } constexpr const char* MULTI_DEVICE_ERROR_MSG = - "Expecting one tensor only but got multiple. You are probably using multiple " - "devices under one thread. The support for such usage has been deprecated. " - "For details, please refer to " - "https://pytorch.org/docs/stable/distributed.html#multi-gpu-collective-functions. " - "ProcessGroupXCCL continues supporting multi-process and multi-thread modes."; + "Expecting one tensor only but got multiple"; ProcessGroupXCCL::ProcessGroupXCCL( const c10::intrusive_ptr& store, @@ -425,17 +421,10 @@ c10::intrusive_ptr ProcessGroupXCCL::endCoalescing() { return endCoalescing(OpType::COALESCED); } -// align with single-device style, input_t and output_t due to -// allgatherv need vector output -template < - typename Fn, - typename input_t, - typename output_t, - typename PreProcess, - typename PostProcess> +template c10::intrusive_ptr ProcessGroupXCCL::collective( - std::vector& inputs, - std::vector& outputs, + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, @@ -517,28 +506,23 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( return work; } -template < - typename Fn, - typename input_t, - typename output_t, - typename PreProcess, - typename PostProcess> +template c10::intrusive_ptr ProcessGroupXCCL::collective( - input_t& input, - output_t& output, + at::Tensor& input, + at::Tensor& output, Fn fn, PreProcess pre, PostProcess post, OpType opType) { - auto inputs = std::vector{input}; - auto outputs = std::vector{output}; + auto inputs = std::vector{input}; + auto outputs = std::vector{output}; return collective(inputs, outputs, fn, pre, post, opType); } -template +template c10::intrusive_ptr ProcessGroupXCCL::collective( - input_t& input, - output_t& output, + at::Tensor& input, + at::Tensor& output, Fn fn, OpType opType) { return collective( @@ -720,6 +704,39 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( OpType::BROADCAST); } +c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const BroadcastOptions& opts) { + if (outputTensor.numel() != inputTensor.numel()) { + C10_THROW_ERROR( + ValueError, + "Tensor input and output of _broadcast_oop must have the same number of elements "); + } + const auto root = opts.rootRank + opts.rootTensor; + return 
collective(
+      inputTensor,
+      outputTensor,
+      [&](at::Tensor& input,
+          at::Tensor& output,
+          ccl::broadcast_attr attr,
+          xcclComm_t& comm,
+          at::xpu::XPUStream& stream) {
+        auto xcclDataType = getXcclDataType(input.scalar_type());
+        ccl::event ret_evt;
+        ret_evt = ccl::broadcast(
+            input.data_ptr(),
+            (size_t)input.numel(),
+            xcclDataType,
+            root,
+            comm,
+            ccl::create_stream(stream.queue()),
+            attr);
+        return ret_evt;
+      },
+      OpType::BROADCAST);
+}
+
 c10::intrusive_ptr<Work> ProcessGroupXCCL::_reduce_oop(
     at::Tensor& outputTensor,
     at::Tensor& inputTensor,
@@ -808,48 +825,17 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allgather(
       },
       OpType::ALLGATHER);
   } else {
-    // xccl implemented allgatherv, so broadcast_oop not needed
-    return collective(
-        inputTensor,
-        outputTensors_,
-        [=](at::Tensor& input,
-            const std::vector<at::Tensor>& outputs,
-            ccl::allgatherv_attr attr,
-            xcclComm_t& comm,
-            at::xpu::XPUStream& stream) {
-          ccl::event ret_evt;
-          auto xcclDataType = getXcclDataType(input.scalar_type());
-
-          std::vector<size_t> recvCounts(outputs.size(), 0);
-          std::transform(
-              outputs.begin(),
-              outputs.end(),
-              recvCounts.begin(),
-              [](const at::Tensor& t) { return t.numel(); });
-
-          TORCH_CHECK(
-              (size_t)input.numel() == recvCounts[rank_],
-              "allgather: send and recv count doesn't match");
-
-          std::vector<void*> recvBufs(outputs.size(), nullptr);
-          std::transform(
-              outputs.begin(),
-              outputs.end(),
-              recvBufs.begin(),
-              [](const at::Tensor& t) { return t.data_ptr(); });
-
-          ret_evt = ccl::allgatherv(
-              input.data_ptr(),
-              (size_t)input.numel(),
-              recvBufs,
-              recvCounts,
-              xcclDataType,
-              comm,
-              ccl::create_stream(stream.queue()),
-              attr);
-          return ret_evt;
-        },
-        c10d::OpType::ALLGATHER);
+    const auto num_reduces = outputTensors_.size();
+    startCoalescing();
+    for (const int i : c10::irange(num_reduces)) {
+      auto& output = outputTensors_[i];
+      auto& input = (i == rank_) ? 
inputTensor : output; + auto broadcastOpts = BroadcastOptions{ + static_cast(i), static_cast(0), opts.timeout}; + _broadcast_oop(output, input, broadcastOpts); + } + auto work = endCoalescing(OpType::ALLGATHER); + return work; } } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 8e17435a4ce1b6..f50c2bbc4dc4bd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -163,36 +163,26 @@ class TORCH_API ProcessGroupXCCL : public Backend { const std::vector& inputs = {}, const std::vector& outputs = {}); - template + template c10::intrusive_ptr collective( - input_t& input, - output_t& output, + at::Tensor& input, + at::Tensor& output, Fn fn, OpType opType); - template < - typename Fn, - typename input_t, - typename output_t, - typename PreProcess, - typename PostProcess> + template c10::intrusive_ptr collective( - input_t& input, - output_t& output, + at::Tensor& input, + at::Tensor& output, Fn fn, PreProcess pre, PostProcess post, OpType opType); - template < - typename Fn, - typename input_t, - typename output_t, - typename PreProcess, - typename PostProcess> + template c10::intrusive_ptr collective( - std::vector& inputs, - std::vector& outputs, + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, @@ -224,15 +214,20 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::reduce not implemented"); } - c10::intrusive_ptr broadcast( - std::vector& tensors, - const BroadcastOptions& opts = BroadcastOptions()) override; - c10::intrusive_ptr _reduce_oop( at::Tensor& outputTensors, at::Tensor& inputTensors, const ReduceOptions& opts = ReduceOptions()); + c10::intrusive_ptr broadcast( + std::vector& tensors, + const BroadcastOptions& opts = BroadcastOptions()) override; + + c10::intrusive_ptr _broadcast_oop( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const BroadcastOptions& opts); + c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, From af29d9650d9be5d99ea3f0b62d62c8ac1107e994 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 19 Sep 2024 06:03:24 +0000 Subject: [PATCH 62/96] Support reduce --- test/distributed/test_c10d_ops_xccl.py | 169 ++++++++---------- .../distributed/c10d/ProcessGroupXCCL.cpp | 62 +++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +- 3 files changed, 135 insertions(+), 100 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py index a59d03a1750e1c..0e278b0e2deab4 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -59,42 +59,41 @@ def rank_to_GPU(self): # return rank to GPU map return init_multigpu_helper(self.world_size, "xccl") - # TODO: wait reduce - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_empty_tensors(self): - # pg = self.pg - # local_device_idx = self.rank_to_GPU[self.rank][0] - - # xs = [torch.FloatTensor([]).xpu(local_device_idx)] - # pg.broadcast(xs).wait() - # self.assertEqual(0, xs[0].numel()) - - # pg.allreduce(xs).wait() - # self.assertEqual(0, xs[0].numel()) - - # pg.reduce(xs).wait() - # self.assertEqual(0, xs[0].numel()) - - # ys = [ - # [ - # torch.FloatTensor([]).xpu(local_device_idx) - # for _ in range(self.world_size) - # ] - # ] - # pg.allgather(ys, xs).wait() - # for y in ys[0]: - # self.assertEqual(0, y.numel()) - - # ys = 
[torch.FloatTensor([]).xpu(local_device_idx)] - # xs = [ - # [ - # torch.FloatTensor([]).xpu(local_device_idx) - # for _ in range(self.world_size) - # ] - # ] - # pg.reduce_scatter(ys, xs).wait() - # self.assertEqual(0, ys[0].numel()) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_empty_tensors(self): + pg = self.pg + local_device_idx = self.rank_to_GPU[self.rank][0] + + xs = [torch.FloatTensor([]).xpu(local_device_idx)] + pg.broadcast(xs).wait() + self.assertEqual(0, xs[0].numel()) + + pg.allreduce(xs).wait() + self.assertEqual(0, xs[0].numel()) + + pg.reduce(xs).wait() + self.assertEqual(0, xs[0].numel()) + + ys = [ + [ + torch.FloatTensor([]).xpu(local_device_idx) + for _ in range(self.world_size) + ] + ] + pg.allgather(ys, xs).wait() + for y in ys[0]: + self.assertEqual(0, y.numel()) + + ys = [torch.FloatTensor([]).xpu(local_device_idx)] + xs = [ + [ + torch.FloatTensor([]).xpu(local_device_idx) + for _ in range(self.world_size) + ] + ] + pg.reduce_scatter(ys, xs).wait() + self.assertEqual(0, ys[0].numel()) @requires_xccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") @@ -206,71 +205,47 @@ def test_alltoall_ops_with_xpufree_race(self): work.wait() torch.xpu.synchronize(device=local_device) - # TODO: wait reduce - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_reduce_ops(self): - # pg = self.pg - # local_device_id = self.rank_to_GPU[self.rank][0] + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_reduce_ops(self): + pg = self.pg + local_device_id = self.rank_to_GPU[self.rank][0] - # def reduce(xs, rootRank, rootTensor, op=None): - # opts = c10d.ReduceOptions() - # opts.rootRank = rootRank - # opts.rootTensor = rootTensor - # if op: - # opts.reduceOp = op - # work = pg.reduce(xs, opts) - # work.wait() + def reduce(xs, rootRank, rootTensor, op=None): + opts = c10d.ReduceOptions() + opts.rootRank = rootRank + opts.rootTensor = rootTensor + if op: + opts.reduceOp = op + work = pg.reduce(xs, opts) + work.wait() - # # for every root tensor - # for rt in range(self.world_size): - # tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] + # for every root tensor + for rt in range(self.world_size): + tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)] - # reduce(tensors, rt, 0) + reduce(tensors, rt, 0) + + if self.rank == rt: + self.assertEqual( + torch.tensor([self.world_size * (self.world_size + 1) // 2]), + tensors[0], + ) + else: + self.assertEqual( + torch.tensor([self.rank + 1]), + tensors[0], + ) + + for op, err in zip( + (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR), + ("ReduceOp.BAND", "ReduceOp.BOR", "ReduceOp.BXOR"), + ): + with self.assertRaisesRegex( + ValueError, "Cannot use " + err + " with XCCL" + ): + reduce(tensors, self.rank, rt, op) - # if self.rank == rt: - # self.assertEqual( - # torch.tensor([self.world_size * (self.world_size + 1) // 2]), - # tensors[0], - # ) - # else: - # self.assertEqual( - # torch.tensor([self.rank + 1]), - # tensors[0], - # ) - - # for op, err in zip( - # (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR), - # ("ReduceOp.BAND", "ReduceOp.BOR", "ReduceOp.BXOR"), - # ): - # with self.assertRaisesRegex( - # ValueError, "Cannot use " + err + " with XCCL" - # ): - # reduce(tensors, self.rank, rt, op) - - # # Premul sum - # if torch.xpu.xccl.version() >= (2, 11, 1): - # for factor in 
(3.0, torch.tensor([5.0], device=local_device_id)): - # if isinstance(factor, torch.Tensor): - # factor_ref = factor.cpu().item() - # else: - # factor_ref = factor - # float_tensors = [ - # torch.tensor( - # [self.rank + 1.0], device=f"xpu:{local_device_id}" - # ) - # ] - # float_tensors_ref = [ - # torch.tensor( - # [(self.rank + 1.0) * factor_ref], - # device=f"xpu:{local_device_id}", - # ) - # ] - - # reduce(float_tensors_ref, rt, 0) - # reduce(float_tensors, rt, 0, c10d._make_xccl_premul_sum(factor)) - # if self.rank == rt: - # self.assertEqual(float_tensors_ref[0], float_tensors[0]) @requires_xccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 5d43694def146c..1ba775b9239879 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -228,6 +228,18 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { } } +bool complexViewAsRealAllowed(const ReduceOp reduceOp) { + switch (reduceOp) { + case ReduceOp::SUM: + return true; + case ReduceOp::UNUSED: + return true; + default: + return false; + } + return false; +} + } // namespace static std::mutex xcclCommDevIdxMapMutex; @@ -693,6 +705,14 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( const AllreduceOptions& opts) { TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); + if (tensor.is_complex()) { + TORCH_CHECK( + complexViewAsRealAllowed(opts.reduceOp), + "all_reduce does not support", + opts.reduceOp, + "on complex tensors"); + tensor = at::view_as_real(tensor); + } check_xpu_single_tensor(tensor); TORCH_CHECK( !isFloat8Type(tensor.scalar_type()), @@ -769,6 +789,48 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( OpType::BROADCAST); } +c10::intrusive_ptr ProcessGroupXCCL::reduce( + std::vector& tensors, + const ReduceOptions& opts) { + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + // @lint-ignore CLANGTIDY + auto tensor = tensors.back(); + if (tensor.is_complex()) { + TORCH_CHECK( + complexViewAsRealAllowed(opts.reduceOp), + "reduce does not support", + opts.reduceOp, + "on complex tensors"); + tensor = at::view_as_real(tensor); + } + check_xpu_single_tensor(tensor); + + return collective( + tensor, + tensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::reduce_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + const int root = opts.rootRank + opts.rootTensor; + const auto xcclDataType = getXcclDataType(input.scalar_type()); + const auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::reduce( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + xcclReduceOp, + root, + comm, + ccl::create_stream(stream.queue())); + return ret_evt; + }, + OpType::REDUCE); +} + c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( at::Tensor& outputTensor, at::Tensor& inputTensor, diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index cfef4ace195f26..f7b946aab603f0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -220,9 +220,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr reduce( std::vector& tensors, - const ReduceOptions& opts = ReduceOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::reduce not 
implemented"); - } + const ReduceOptions& opts = ReduceOptions()) override; c10::intrusive_ptr broadcast( std::vector& tensors, From 20b118822d797a5dd15c1cb6745336fcbb1e5aa7 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 19 Sep 2024 06:43:18 +0000 Subject: [PATCH 63/96] Support gather --- test/distributed/test_c10d_ops_xccl.py | 201 +++++++++--------- .../distributed/c10d/ProcessGroupXCCL.cpp | 96 +++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +- 3 files changed, 197 insertions(+), 104 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py index 0e278b0e2deab4..3076444f2e4786 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -332,119 +332,118 @@ def allgather_base(output_t, input_t): # fails the check because the dtype is different allgather_base(output_t, tensor) - # TODO: wait gather - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_gather_ops(self): - # pg = self.pg - # local_device_ids = self.rank_to_GPU[self.rank] - # num_gpus = len(local_device_ids) - - # def gather(output_t, input_t, rootRank): - # opts = c10d.GatherOptions() - # opts.rootRank = rootRank - # if rootRank == self.rank: - # work = pg.gather(output_t, input_t, opts) - # else: - # work = pg.gather([], input_t, opts) - # work.wait() - - # # init input - # tensors = [] - # for device_id in local_device_ids: - # tensors.append(torch.tensor([self.rank]).xpu(device_id)) - - # # init output - # output_ts = [] - # for idx in range(num_gpus): - # gpu_idx = local_device_ids[idx] - # output_ts.append([]) - # for rank in range(self.world_size): - # output_ts[idx].append(torch.tensor([-1]).xpu(gpu_idx)) - - # expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] - # for rank in range(self.world_size): - # gather(output_ts, tensors, rank) - # if rank == self.rank: - # self.assertEqual(expected, output_ts) - - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_gather_stress(self): - # pg = self.pg - # local_device_ids = self.rank_to_GPU[self.rank] - # num_gpus = len(local_device_ids) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_gather_ops(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) - # def gather(output_t, input_t, rootRank): - # opts = c10d.GatherOptions() - # opts.rootRank = rootRank - # if rootRank == self.rank: - # work = pg.gather(output_t, input_t, opts) - # else: - # work = pg.gather([], input_t, opts) - # work.wait() + def gather(output_t, input_t, rootRank): + opts = c10d.GatherOptions() + opts.rootRank = rootRank + if rootRank == self.rank: + work = pg.gather(output_t, input_t, opts) + else: + work = pg.gather([], input_t, opts) + work.wait() - # stress_length = 1000 + # init input + tensors = [] + for device_id in local_device_ids: + tensors.append(torch.tensor([self.rank]).xpu(device_id)) + + # init output + output_ts = [] + for idx in range(num_gpus): + gpu_idx = local_device_ids[idx] + output_ts.append([]) + for rank in range(self.world_size): + output_ts[idx].append(torch.tensor([-1]).xpu(gpu_idx)) + + expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] + for rank in range(self.world_size): + gather(output_ts, tensors, rank) + if rank == self.rank: + self.assertEqual(expected, output_ts) - # # init input - # 
tensors = [] - # for i in range(stress_length): - # tensors.append([]) - # for device_id in local_device_ids: - # tensors[i].append(torch.tensor([self.rank]).xpu(device_id)) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_gather_stress(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) - # # init output - # output_ts = [] - # for i in range(stress_length): - # output_ts.append([[] for _ in range(num_gpus)]) - # for idx, ls in enumerate(output_ts[i]): - # gpu_idx = local_device_ids[idx] - # for _ in range(self.world_size): - # ls.append(torch.tensor([-1]).xpu(gpu_idx)) + def gather(output_t, input_t, rootRank): + opts = c10d.GatherOptions() + opts.rootRank = rootRank + if rootRank == self.rank: + work = pg.gather(output_t, input_t, opts) + else: + work = pg.gather([], input_t, opts) + work.wait() - # expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] - # for i in range(stress_length): - # for rank in range(self.world_size): - # gather(output_ts[i], tensors[i], rank) - # # Verification - # if rank == self.rank: - # self.assertEqual(output_ts[i], expected) + stress_length = 1000 + + # init input + tensors = [] + for i in range(stress_length): + tensors.append([]) + for device_id in local_device_ids: + tensors[i].append(torch.tensor([self.rank]).xpu(device_id)) + + # init output + output_ts = [] + for i in range(stress_length): + output_ts.append([[] for _ in range(num_gpus)]) + for idx, ls in enumerate(output_ts[i]): + gpu_idx = local_device_ids[idx] + for _ in range(self.world_size): + ls.append(torch.tensor([-1]).xpu(gpu_idx)) + + expected = [[torch.tensor([rank]) for rank in range(self.world_size)]] + for i in range(stress_length): + for rank in range(self.world_size): + gather(output_ts[i], tensors[i], rank) + # Verification + if rank == self.rank: + self.assertEqual(output_ts[i], expected) - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_gather_checks(self): - # pg = self.pg - # device_id = self.rank_to_GPU[self.rank][0] + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_gather_checks(self): + pg = self.pg + device_id = self.rank_to_GPU[self.rank][0] - # # init input - # tensor = torch.tensor([self.rank]).xpu(device_id) + # init input + tensor = torch.tensor([self.rank]).xpu(device_id) - # # init output - # output_ts = [] - # for rank in range(self.world_size): - # output_ts.append(torch.tensor([-1]).xpu(device_id)) + # init output + output_ts = [] + for rank in range(self.world_size): + output_ts.append(torch.tensor([-1]).xpu(device_id)) - # with self.assertRaisesRegex(ValueError, "invalid root rank"): - # opts = c10d.GatherOptions() - # opts.rootRank = -1 - # pg.gather([output_ts], [tensor], opts) + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.GatherOptions() + opts.rootRank = -1 + pg.gather([output_ts], [tensor], opts) - # with self.assertRaisesRegex(TypeError, "incompatible function arguments"): - # pg.gather([output_ts], [tensor], 0) + with self.assertRaisesRegex(TypeError, "incompatible function arguments"): + pg.gather([output_ts], [tensor], 0) - # with self.assertRaisesRegex(ValueError, "invalid root rank"): - # opts = c10d.GatherOptions() - # opts.rootRank = self.world_size - # pg.gather([output_ts], [tensor], opts) + with self.assertRaisesRegex(ValueError, "invalid root rank"): + 
opts = c10d.GatherOptions() + opts.rootRank = self.world_size + pg.gather([output_ts], [tensor], opts) - # with self.assertRaisesRegex( - # # throws error message from dispatcher - # RuntimeError, - # "There were no tensor arguments to this function", - # ): - # opts = c10d.GatherOptions() - # opts.rootRank = 0 - # pg.gather([output_ts], [], opts) + with self.assertRaisesRegex( + # throws error message from dispatcher + RuntimeError, + "There were no tensor arguments to this function", + ): + opts = c10d.GatherOptions() + opts.rootRank = 0 + pg.gather([output_ts], [], opts) # TODO: wait scatter # @requires_xccl() diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 1ba775b9239879..c34583d14c2017 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -672,6 +672,102 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( return work; } +c10::intrusive_ptr ProcessGroupXCCL::gather( + std::vector>& outputTensors, + std::vector& inputTensors, + const GatherOptions& opts) { + static auto invalidArgument = [](const std::string& msg) { + C10_THROW_ERROR(ValueError, "ProcessGroupXCCL::gather: " + msg); + }; + + assertRootRank(invalidArgument, opts.rootRank, size_); + + TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + // @lint-ignore CLANGTIDY + auto inputTensor = inputTensors.back(); + + std::vector outputs; + + if (getRank() == opts.rootRank) { + if (outputTensors.size() != 1) { + std::stringstream ss; + ss << "requires a single-element output list containing a list with " + << getSize() << " tensors."; + invalidArgument(ss.str()); + } else if (outputTensors[0].size() != static_cast(getSize())) { + std::stringstream ss; + ss << "Incorrect output list size " << outputTensors[0].size() + << ". 
Output list size should be " << getSize() + << ", same as size of the process group."; + invalidArgument(ss.str()); + } + + const auto& options = inputTensor.options(); + const auto& sizes = inputTensor.sizes(); + assertTypeAndSizesMatch(invalidArgument, outputTensors[0], options, sizes); + outputs = outputTensors[0]; + } else { + // if not in the root rank, initialize outputs as empty list + if (outputTensors.size() != 0) { + invalidArgument("requires empty output on non-root"); + } + outputs = {}; + // append a empty tensor to the list, we don't use it but the + // `collective` template function requires it to invoke its function + outputs.emplace_back(); + } + + auto inputs = std::vector{inputTensor}; + return collective( + inputs, + outputs, // just to fit the collective interface + [&](at::Tensor& /* unused */, + at::Tensor& /* unused */, + ccl::allgather_attr attr, // just to fit interface + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + const auto root = opts.rootRank; + if (getRank() == root) { + for (auto output : outputs) { + c10::xpu::XPUCachingAllocator::recordStream( + output.storage().data_ptr(), stream); + } + } + { + ccl::event ret_evt; + auto xcclDataType = getXcclDataType(inputTensor.scalar_type()); + if (rank_ == root) { + for (const auto r : c10::irange(size_)) { + if (r != root) { + // do receive + ret_evt = ccl::recv( + outputs[r].data_ptr(), + (size_t)inputTensor.numel(), + xcclDataType, + r, + comm, + ccl::create_stream(stream.queue())); + } else { + // on its own rank, simply copy from the input + outputs[r].copy_(inputTensor); + } + } + } else { + // do send + ret_evt = ccl::send( + inputTensor.data_ptr(), + (size_t)inputTensor.numel(), + xcclDataType, + root, + comm, + ccl::create_stream(stream.queue())); + } + return ret_evt; + } + }, + OpType::GATHER); +} + c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( at::Tensor& tensor, const AllreduceOptions& opts) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index f7b946aab603f0..ec0e1b805f579f 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -304,9 +304,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, - const GatherOptions& opts = GatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::gather not implemented"); - } + const GatherOptions& opts = GatherOptions()) override; c10::intrusive_ptr scatter( std::vector& outputTensors, From 1463eca58ba59dd98a39ebc425fc9c7bd93ef164 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 19 Sep 2024 07:14:25 +0000 Subject: [PATCH 64/96] Support scatter --- test/distributed/test_c10d_ops_xccl.py | 233 +++++++++--------- .../distributed/c10d/ProcessGroupXCCL.cpp | 101 ++++++++ .../distributed/c10d/ProcessGroupXCCL.hpp | 4 +- 3 files changed, 218 insertions(+), 120 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py index 3076444f2e4786..8cfce2be164d9f 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -445,125 +445,124 @@ def test_gather_checks(self): opts.rootRank = 0 pg.gather([output_ts], [], opts) - # TODO: wait scatter - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_scatter_ops(self): - # pg = self.pg - # local_device_ids = self.rank_to_GPU[self.rank] - # num_gpus = 
len(local_device_ids) - - # def scatter(output_t, input_t, rootRank): - # opts = c10d.ScatterOptions() - # opts.rootRank = rootRank - # if rootRank == self.rank: - # work = pg.scatter(output_t, input_t, opts) - # else: - # work = pg.scatter(output_t, [], opts) - # work.wait() - - # # init output - # tensors = [] - # for device_id in local_device_ids: - # tensors.append(torch.tensor([-1]).xpu(device_id)) - - # # init input - # scatter_list = [] - # for idx in range(num_gpus): - # gpu_idx = local_device_ids[idx] - # scatter_list.append([]) - # for rank in range(self.world_size): - # scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) - - # # test each rank to scatter - # expected = [torch.tensor([self.rank])] - # for rank in range(self.world_size): - # scatter(tensors, scatter_list, rank) - # self.assertEqual(expected, tensors) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_scatter_ops(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_scatter_stress(self): - # pg = self.pg - # local_device_ids = self.rank_to_GPU[self.rank] - # num_gpus = len(local_device_ids) - - # def scatter(output_t, input_t, rootRank): - # opts = c10d.ScatterOptions() - # opts.rootRank = rootRank - # if rootRank == self.rank: - # work = pg.scatter(output_t, input_t, opts) - # else: - # work = pg.scatter(output_t, [], opts) - # work.wait() - - # stress_length = 1000 - - # # init output - # tensors = [] - # for i in range(stress_length): - # tensors.append([]) - # for device_id in local_device_ids: - # tensors[i].append(torch.tensor([-1]).xpu(device_id)) - - # # init input - # scatter_list = [] - # for i in range(stress_length): - # scatter_list.append([[] for _ in range(num_gpus)]) - # for idx, ls in enumerate(scatter_list[i]): - # gpu_idx = local_device_ids[idx] - # for rank in range(self.world_size): - # ls.append(torch.tensor([rank]).xpu(gpu_idx)) - - # # test each rank to scatter - # expected = [torch.tensor([self.rank])] - # for i in range(stress_length): - # for rank in range(self.world_size): - # scatter(tensors[i], scatter_list[i], rank) - # # Verification - # self.assertEqual(tensors[i], expected) + def scatter(output_t, input_t, rootRank): + opts = c10d.ScatterOptions() + opts.rootRank = rootRank + if rootRank == self.rank: + work = pg.scatter(output_t, input_t, opts) + else: + work = pg.scatter(output_t, [], opts) + work.wait() - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_scatter_checks(self): - # pg = self.pg - # local_device_ids = self.rank_to_GPU[self.rank] - # num_gpus = len(local_device_ids) - - # # init output - # tensors = [] - # for device_id in local_device_ids: - # tensors.append(torch.tensor([-1]).xpu(device_id)) - - # # init input - # scatter_list = [] - # for idx in range(num_gpus): - # gpu_idx = local_device_ids[idx] - # scatter_list.append([]) - # for rank in range(self.world_size): - # scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) - - # with self.assertRaisesRegex(ValueError, "invalid root rank"): - # opts = c10d.ScatterOptions() - # opts.rootRank = -1 - # pg.scatter(tensors, scatter_list, opts) - - # with self.assertRaisesRegex(TypeError, "incompatible function arguments"): - # pg.scatter(tensors, scatter_list, 0) - - # with self.assertRaisesRegex(ValueError, 
"invalid root rank"): - # opts = c10d.ScatterOptions() - # opts.rootRank = self.world_size - # pg.scatter(tensors, scatter_list, opts) - - # with self.assertRaisesRegex( - # # throws error message from dispatcher - # RuntimeError, - # "There were no tensor arguments to this function", - # ): - # opts = c10d.ScatterOptions() - # opts.rootRank = 0 - # pg.scatter([], scatter_list, opts) + # init output + tensors = [] + for device_id in local_device_ids: + tensors.append(torch.tensor([-1]).xpu(device_id)) + + # init input + scatter_list = [] + for idx in range(num_gpus): + gpu_idx = local_device_ids[idx] + scatter_list.append([]) + for rank in range(self.world_size): + scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) + + # test each rank to scatter + expected = [torch.tensor([self.rank])] + for rank in range(self.world_size): + scatter(tensors, scatter_list, rank) + self.assertEqual(expected, tensors) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_scatter_stress(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) + + def scatter(output_t, input_t, rootRank): + opts = c10d.ScatterOptions() + opts.rootRank = rootRank + if rootRank == self.rank: + work = pg.scatter(output_t, input_t, opts) + else: + work = pg.scatter(output_t, [], opts) + work.wait() + + stress_length = 1000 + + # init output + tensors = [] + for i in range(stress_length): + tensors.append([]) + for device_id in local_device_ids: + tensors[i].append(torch.tensor([-1]).xpu(device_id)) + + # init input + scatter_list = [] + for i in range(stress_length): + scatter_list.append([[] for _ in range(num_gpus)]) + for idx, ls in enumerate(scatter_list[i]): + gpu_idx = local_device_ids[idx] + for rank in range(self.world_size): + ls.append(torch.tensor([rank]).xpu(gpu_idx)) + + # test each rank to scatter + expected = [torch.tensor([self.rank])] + for i in range(stress_length): + for rank in range(self.world_size): + scatter(tensors[i], scatter_list[i], rank) + # Verification + self.assertEqual(tensors[i], expected) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_scatter_checks(self): + pg = self.pg + local_device_ids = self.rank_to_GPU[self.rank] + num_gpus = len(local_device_ids) + + # init output + tensors = [] + for device_id in local_device_ids: + tensors.append(torch.tensor([-1]).xpu(device_id)) + + # init input + scatter_list = [] + for idx in range(num_gpus): + gpu_idx = local_device_ids[idx] + scatter_list.append([]) + for rank in range(self.world_size): + scatter_list[idx].append(torch.tensor([rank]).xpu(gpu_idx)) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.ScatterOptions() + opts.rootRank = -1 + pg.scatter(tensors, scatter_list, opts) + + with self.assertRaisesRegex(TypeError, "incompatible function arguments"): + pg.scatter(tensors, scatter_list, 0) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.ScatterOptions() + opts.rootRank = self.world_size + pg.scatter(tensors, scatter_list, opts) + + with self.assertRaisesRegex( + # throws error message from dispatcher + RuntimeError, + "There were no tensor arguments to this function", + ): + opts = c10d.ScatterOptions() + opts.rootRank = 0 + pg.scatter([], scatter_list, opts) @requires_xccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") diff --git 
a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index c34583d14c2017..638b969fb2b5e7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -768,6 +768,107 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( OpType::GATHER); } +c10::intrusive_ptr ProcessGroupXCCL::scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ScatterOptions& opts) { + static auto invalidArgument = [](const std::string& msg) { + C10_THROW_ERROR(ValueError, "ProcessGroupXCCL::scatter: " + msg); + }; + + assertRootRank(invalidArgument, opts.rootRank, size_); + + TORCH_CHECK(outputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + auto outputTensor = outputTensors.back(); + + std::vector inputs; + + if (getRank() == opts.rootRank) { + if (inputTensors.size() != 1) { + std::stringstream ss; + ss << "requires a single-element input list containing a list with " + << getSize() << " tensors."; + invalidArgument(ss.str()); + } else if (inputTensors[0].size() != static_cast(getSize())) { + std::stringstream ss; + ss << "Incorrect input list size " << inputTensors[0].size() + << ". Input list size should be " << getSize() + << ", same as size of the process group."; + invalidArgument(ss.str()); + } + + const auto& options = outputTensor.options(); + const auto& sizes = outputTensor.sizes(); + assertTypeAndSizesMatch(invalidArgument, inputTensors[0], options, sizes); + inputs = inputTensors[0]; + } else { + // if not in the root rank, initialize inputTensors as empty place holder + // with an empty list + if (inputTensors.size() != 0) { + invalidArgument("requires empty input on non-root"); + } + inputs = {}; + // append a empty tensor to the list, we don't use it but the + // `collective` template function requires it to invoke its function + inputs.emplace_back(); + } + + const auto root = opts.rootRank; + + auto outputs = std::vector{outputTensor}; + return collective( + outputs, + inputs, // just to fit the collective interface + [&](at::Tensor& /* unused */, + at::Tensor& /* unused */, + ccl::allgather_attr attr, // just to fit interface + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + if (getRank() == root) { + for (auto input : inputs) { + c10::xpu::XPUCachingAllocator::recordStream( + input.storage().data_ptr(), stream); + } + } + { + ccl::event ret_evt; + if (rank_ == root) { + for (const auto r : c10::irange(size_)) { + if (r != root) { + // do send + size_t send_count = inputs[r].numel(); + auto send_type = getXcclDataType(inputs[r].scalar_type()); + ret_evt = ccl::send( + inputs[r].data_ptr(), + send_count, + send_type, + r, + comm, + ccl::create_stream(stream.queue())); + } else { + // on its own rank, simply copy from the input + outputTensor.copy_(inputs[r]); + } + } + } else { + // do receive + size_t recv_count = outputTensor.numel(); + auto recv_type = getXcclDataType(outputTensor.scalar_type()); + ret_evt = ccl::recv( + outputTensor.data_ptr(), + recv_count, + recv_type, + root, + comm, + ccl::create_stream(stream.queue())); + } + + return ret_evt; + } + }, + OpType::SCATTER); +} + c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( at::Tensor& tensor, const AllreduceOptions& opts) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index ec0e1b805f579f..690aec54e8cc0b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -309,9 +309,7 @@ 
class TORCH_API ProcessGroupXCCL : public Backend {
   c10::intrusive_ptr<Work> scatter(
       std::vector<at::Tensor>& outputTensors,
       std::vector<std::vector<at::Tensor>>& inputTensors,
-      const ScatterOptions& opts = ScatterOptions()) override {
-    TORCH_CHECK(false, "ProcessGroupXCCL::scatter not implemented");
-  }
+      const ScatterOptions& opts = ScatterOptions()) override;
 
  protected:
   std::unordered_map<std::string, at::xpu::XPUStream> xcclStreams_;

From 156c2ac9bccb6159cd538de3e54ec484d2399787 Mon Sep 17 00:00:00 2001
From: hanchao
Date: Thu, 19 Sep 2024 09:21:27 +0000
Subject: [PATCH 65/96] update

---
 torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp |  2 +-
 torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp
index 6b57a6c5471b36..5aeeb62bee1ece 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp
@@ -52,7 +52,7 @@ XCCL_KVS get_kvs(int rank, c10d::Store& store) {
   std::lock_guard<std::mutex> lock(kvs_mutex);
   if (kvs)
     return kvs;
-  std::string storeKey = "ccl_kvs";
+  std::string storeKey = "xccl_kvs";
   // Rank 0 broadcast the bootstrap network information to other ranks
   if (rank == 0) {
diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp
index 96f7e46e7c378d..14a9f398a8cbe7 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp
@@ -48,12 +48,13 @@ int getXCCLEnvVar(std::string envVarName) {
   }
 }
 
-void setXCCLEnvVar(std::string envVarName, int val) {
-  setenv(envVarName.c_str(), std::to_string(val).c_str(), val);
-}
-
-void setXCCLEnvVar(std::string envVarName, std::string val) {
-  setenv(envVarName.c_str(), val.c_str(), 1);
+template <typename T>
+void setXCCLEnvVar(const std::string& envVarName, T val) {
+  if constexpr (std::is_same_v<T, int>) {
+    setenv(envVarName.c_str(), std::to_string(val).c_str(), 1);
+  } else if constexpr (std::is_same_v<T, std::string>) {
+    setenv(envVarName.c_str(), val.c_str(), 1);
+  }
 }
 
 bool with_mpirun() {

From 652da01588ecb64e354707bc52496c28c01f07ce Mon Sep 17 00:00:00 2001
From: hanchao
Date: Thu, 29 Aug 2024 09:28:58 +0000
Subject: [PATCH 66/96] Xccl process group for Pytorch

---
 CMakeLists.txt                                |   6 +
 build_variables.bzl                           |   4 +
 caffe2/CMakeLists.txt                         |  13 +
 caffe2/core/macros.h.in                       |   1 +
 cmake/Dependencies.cmake                      |  16 +
 cmake/External/xccl.cmake                     |  17 +
 cmake/Modules/FindXCCL.cmake                  |  68 +++
 cmake/Summary.cmake                           |   6 +
 setup.py                                      |   4 +
 test/distributed/test_c10d_common.py          |   9 +-
 test/distributed/test_c10d_xccl.py            | 303 +++++++++++++
 torch/CMakeLists.txt                          |   7 +
 torch/_C/_distributed_c10d.pyi                |   9 +
 torch/csrc/distributed/c10d/Ops.cpp           |  20 +
 torch/csrc/distributed/c10d/ProcessGroup.cpp  |   2 +
 torch/csrc/distributed/c10d/ProcessGroup.hpp  |   3 +
 .../distributed/c10d/ProcessGroupXCCL.cpp     | 401 ++++++++++++++++++
 .../distributed/c10d/ProcessGroupXCCL.hpp     | 308 ++++++++++++++
 torch/csrc/distributed/c10d/init.cpp          |  22 +
 torch/distributed/distributed_c10d.py         |  48 ++-
 torch/testing/_internal/common_distributed.py |  11 +-
 21 files changed, 1268 insertions(+), 10 deletions(-)
 create mode 100644 cmake/External/xccl.cmake
 create mode 100644 cmake/Modules/FindXCCL.cmake
 create mode 100644 test/distributed/test_c10d_xccl.py
 create mode 100644 torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp
 create mode 100644 torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5139c0a478e788..89ef59681bfff4 100644
--- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -275,6 +275,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) cmake_dependent_option(USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_XCCL "Use XCCL" ON + "USE_XPU;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" @@ -353,6 +355,8 @@ cmake_dependent_option(USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option(USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) +cmake_dependent_option(USE_C10D_XCCL "USE C10D XCCL" ON + "USE_DISTRIBUTED;USE_XCCL" OFF) cmake_dependent_option(USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( @@ -365,6 +369,8 @@ cmake_dependent_option( USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option( USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) +cmake_dependent_option( + USE_C10D_XCCL "USE C10D XCCL" ON "USE_DISTRIBUTED;USE_XCCL" OFF) cmake_dependent_option( USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( diff --git a/build_variables.bzl b/build_variables.bzl index e05c94bd83f577..98b721617b609c 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -700,6 +700,10 @@ libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_s "torch/csrc/cuda/nccl.cpp", ] +libtorch_xpu_distributed_extra_sources = [ + "torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp", +] + torch_cpp_srcs = [ "torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA "torch/csrc/api/src/data/datasets/mnist.cpp", diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 8ed93cdff0479c..d44a8da210462f 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1014,6 +1014,9 @@ elseif(USE_CUDA) endif() if(USE_XPU) + if(USE_XCCL) + append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS) + endif() add_library(torch_xpu ${Caffe2_XPU_SRCS}) torch_compile_options(torch_xpu) # see cmake/public/utils.cmake target_compile_definitions(torch_xpu PRIVATE USE_XPU) @@ -1079,6 +1082,10 @@ if(USE_XPU) include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS}) endif() + if(USE_XCCL) + target_link_libraries(torch_xpu PRIVATE torch::xccl) + target_compile_definitions(torch_xpu PRIVATE USE_XCCL) + endif() endif() if(NOT MSVC AND USE_XNNPACK) @@ -1365,6 +1372,12 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL) endif() endif() + if(USE_C10D_XCCL) + target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) + set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp + PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_ZE;CCL_ENABLE_SYCL") + endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set_source_files_properties( diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in index 2929f105b31faa..e5398a83cad947 100644 --- a/caffe2/core/macros.h.in +++ b/caffe2/core/macros.h.in @@ -45,6 +45,7 @@ {"USE_CUDNN", "${USE_CUDNN}"}, \ {"CUDNN_VERSION", "${CUDNN_VERSION}"}, \ {"USE_NCCL", "${USE_NCCL}"}, \ + {"USE_XCCL", "${USE_XCCL}"}, \ {"USE_MPI", "${USE_MPI}"}, \ {"USE_GFLAGS", "${USE_GFLAGS}"}, \ {"USE_GLOG", 
"${USE_GLOG}"}, \ diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ef33a3165340c1..8abea841fcf61c 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1150,6 +1150,22 @@ if(USE_CUDA) include_directories(SYSTEM ${CUB_INCLUDE_DIRS}) endif() +# ---[ XCCL +if(USE_XCCL) + if(NOT USE_XPU) + message(WARNING + "Not using XPU, so disabling USE_XCCL. Suppress this warning with " + "-DUSE_XCCL=OFF.") + caffe2_update_option(USE_XCCL OFF) + elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Linux") + message(WARNING "USE_XCCL is currently only supported under Linux.") + caffe2_update_option(USE_XCCL OFF) + else() + include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake) + list(APPEND Caffe2_XPU_DEPENDENCY_LIBS torch::xccl) + endif() +endif() + if(USE_DISTRIBUTED AND USE_TENSORPIPE) if(MSVC) message(WARNING "Tensorpipe cannot be used on Windows.") diff --git a/cmake/External/xccl.cmake b/cmake/External/xccl.cmake new file mode 100644 index 00000000000000..56205b381b1324 --- /dev/null +++ b/cmake/External/xccl.cmake @@ -0,0 +1,17 @@ +if(NOT __XCCL_INCLUDED) + set(__XCCL_INCLUDED TRUE) + + if(USE_XCCL) + # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake. + find_package(XCCL REQUIRED) + if(XCCL_FOUND) + add_library(torch::xccl INTERFACE IMPORTED) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${XCCL_INCLUDE_DIR}) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES + ${XCCL_LIBRARY}) + endif() + endif() +endif() diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake new file mode 100644 index 00000000000000..56b7fc0f7dcf32 --- /dev/null +++ b/cmake/Modules/FindXCCL.cmake @@ -0,0 +1,68 @@ +# This will define the following variables: +# XCCL_FOUND : True if the system has the XCCL library. +# XCCL_INCLUDE_DIR : Include directories needed to use XCCL. +# XCCL_LIBRARY_DIR :The path to the XCCL library. +# XCCL_LIBRARY : XCCL library fullname. + +include(FindPackageHandleStandardArgs) + +set(XCCL_ROOT "") +if(DEFINED ENV{CCL_ROOT}) + set(XCCL_ROOT $ENV{CCL_ROOT}) +endif() + +string(COMPARE EQUAL "${XCCL_ROOT}" "" nosyclfound) +if(nosyclfound) + set(XCCL_FOUND False) + set(XCCL_REASON_FAILURE "XCCL library not set!!") + set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}") + return() +endif() + +# Find include path from binary. +find_file( + XCCL_INCLUDE_DIR + NAMES include + HINTS ${XCCL_ROOT} + NO_DEFAULT_PATH +) + +# Find include/oneapi path from include path. +find_file( + XCCL_INCLUDE_ONEAPI_DIR + NAMES oneapi + HINTS ${XCCL_ROOT}/include/ + NO_DEFAULT_PATH +) + +list(APPEND XCCL_INCLUDE_DIR ${XCCL_INCLUDE_ONEAPI_DIR}) + +# Find library directory from binary. +find_file( + XCCL_LIBRARY_DIR + NAMES lib + HINTS ${XCCL_ROOT} + NO_DEFAULT_PATH +) + +# Find XCCL library fullname. 
+find_library( + XCCL_LIBRARY + NAMES ccl + HINTS ${XCCL_LIBRARY_DIR} + NO_DEFAULT_PATH +) + +if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY)) + set(XCCL_FOUND False) + set(XCCL_REASON_FAILURE "XCCL library is incomplete!!") + set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}") + return() +endif() + +find_package_handle_standard_args( + XCCL + FOUND_VAR XCCL_FOUND + REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY + REASON_FAILURE_MESSAGE "${XCCL_REASON_FAILURE}" +) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index d51c451589c2c4..229ff112ab3187 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -153,6 +153,12 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}") endif() message(STATUS " USE_ITT : ${USE_ITT}") + message(STATUS " USE_XCCL : ${USE_XCCL}") + if(${USE_XCCL}) + message(STATUS " USE_C10D_XCCL : ${USE_C10D_XCCL}") + message(STATUS " XCCL include path : ${XCCL_INCLUDE_DIR}") + message(STATUS " XCCL library : ${XCCL_LIBRARY}") + endif() message(STATUS " USE_NCCL : ${USE_NCCL}") if(${USE_NCCL}) message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}") diff --git a/setup.py b/setup.py index 92f1e2ddc7bcd3..ad48f4b0108633 100644 --- a/setup.py +++ b/setup.py @@ -645,6 +645,10 @@ def run(self): report("-- Building NCCL library") else: report("-- Not using NCCL") + if cmake_cache_vars["USE_XCCL"]: + report("-- Building XCCL library") + else: + report("-- Not using XCCL") if cmake_cache_vars["USE_DISTRIBUTED"]: if IS_WINDOWS: report("-- Building without distributed package") diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 6a0621f3f49913..3e5538d57e38ae 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -66,8 +66,13 @@ def gpus_for_rank(world_size): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. 
""" - visible_devices = list(range(torch.cuda.device_count())) - gpus_per_process = torch.cuda.device_count() // world_size + device_count = ( + torch.xpu.device_count() + if torch.xpu.is_available() + else torch.cuda.device_count() + ) + visible_devices = list(range(device_count)) + gpus_per_process = device_count // world_size gpus_for_rank = [] for rank in range(world_size): gpus_for_rank.append( diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py new file mode 100644 index 00000000000000..704cdd414e554b --- /dev/null +++ b/test/distributed/test_c10d_xccl.py @@ -0,0 +1,303 @@ +# Owner(s): ["oncall: distributed"] + +import math +import os +import sys +import time +from datetime import timedelta +from unittest import mock + +import torch +import torch.distributed as c10d + + +if not c10d.is_available() or not c10d.is_xccl_available(): + print("c10d XCCL not available, skipping tests", file=sys.stderr) + sys.exit(0) + +import test_c10d_common + +import torch.distributed as dist +import torch.testing._internal.common_utils as common +from torch.testing._internal.common_distributed import ( + init_multigpu_helper, + MultiProcessTestCase, + requires_xccl, +) +from torch.testing._internal.common_utils import ( + retry_on_connect_failures, + run_tests, + skip_but_pass_in_sandcastle_if, + TEST_XPU, + TestCase, +) + + +def simple_reduce_tests(rank, world_size): + tests = [ + ( + c10d.ReduceOp.SUM, + torch.tensor([rank + 1.0]), + torch.tensor([float(world_size * (world_size + 1) / 2)]), + ), + ( + c10d.ReduceOp.PRODUCT, + torch.tensor([rank + 1.0]), + torch.tensor([float(math.factorial(world_size))]), + ), + ( + c10d.ReduceOp.MIN, + torch.tensor([rank + 1.0]), + torch.tensor([1.0]), + ), + ( + c10d.ReduceOp.MAX, + torch.tensor([rank + 1.0]), + torch.tensor([world_size]), + ), + ] + + return tests + + +TEST_MULTIXPU = torch.xpu.device_count() > 1 + + +class RendezvousEnvTest(TestCase): + @retry_on_connect_failures + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test") + def test_common_errors(self): + vars = { + "WORLD_SIZE": "1", + "RANK": "0", + "MASTER_ADDR": "127.0.0.1", + "MASTER_PORT": str(common.find_free_port()), + } + + class Env: + def __init__(self, vars): + self.env_patcher = mock.patch.dict(os.environ, vars, clear=True) + + def __enter__(self): + self.env_patcher.start() + + def __exit__(self, type, value, traceback): + self.env_patcher.stop() + + def without(d, key): + d = d.copy() + d.pop(key) + return d + + def withouts(d, keys): + d = d.copy() + for key in keys: + d.pop(key) + return d + + with Env(without(vars, "WORLD_SIZE")): + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + with self.assertRaisesRegex(ValueError, "WORLD_SIZE expected"): + gen = c10d.rendezvous("env://") + next(gen) + c10d.init_process_group(backend="xccl", world_size=1) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(without(vars, "RANK")): + self.assertEqual(None, os.environ.get("RANK")) + with self.assertRaisesRegex(ValueError, "RANK expected"): + gen = c10d.rendezvous("env://") + next(gen) + c10d.init_process_group(backend="xccl", rank=0) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): + self.assertEqual(None, os.environ.get("RANK")) + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + 
c10d.init_process_group(backend="xccl", rank=0, world_size=1) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(vars): + c10d.init_process_group(backend="xccl") + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(without(vars, "MASTER_ADDR")): + self.assertEqual(None, os.environ.get("MASTER_ADDR")) + with self.assertRaisesRegex(ValueError, "MASTER_ADDR expected"): + gen = c10d.rendezvous("env://") + next(gen) + + with Env(without(vars, "MASTER_PORT")): + self.assertEqual(None, os.environ.get("MASTER_PORT")) + with self.assertRaisesRegex(ValueError, "MASTER_PORT expected"): + gen = c10d.rendezvous("env://") + next(gen) + + with Env(without(vars, "WORLD_SIZE")): + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + gen = c10d.rendezvous(f"env://?world_size={1}") + _, _, size = next(gen) + self.assertEqual(size, 1) + + with Env(without(vars, "RANK")): + self.assertEqual(None, os.environ.get("RANK")) + gen = c10d.rendezvous(f"env://?rank={0}") + _, rank, _ = next(gen) + self.assertEqual(rank, 0) + + with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): + self.assertEqual(None, os.environ.get("RANK")) + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + gen = c10d.rendezvous(f"env://?rank={0}&world_size={1}") + _, rank, size = next(gen) + self.assertEqual(rank, 0) + self.assertEqual(size, 1) + + +class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): + @requires_xccl() + @retry_on_connect_failures + @skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test") + def test_default_store_timeout_nccl(self): + self._test_default_store_timeout("xccl") + + +class ProcessGroupXCCLTest(MultiProcessTestCase): + def _create_process_group_xccl( + self, timeout=timedelta(seconds=600), device_id=None + ): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "xccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timeout, + device_id=device_id, + ) + pg = c10d.distributed_c10d._get_default_group() + return pg + + def setUp(self): + super().setUp() + self._spawn_processes() + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + @property + def rank_to_GPU(self): + # return rank to GPU map + return init_multigpu_helper(self.world_size, "xccl") + + @requires_xccl() + @skip_but_pass_in_sandcastle_if( + torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs" + ) + def test_close_multi_pg_unordered(self): + pg = self._create_process_group_xccl() + device = self.rank_to_GPU[self.rank][0] + t = torch.rand(10, 10, device=device) + # First allreduce to initialize default PG's communicator. 
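+        # (XCCL communicators are created lazily, on the first collective
+        # issued through a process group.)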
+ pg.allreduce(t).wait() + new_pg1 = c10d.new_group([0, 1]) + new_pg2 = c10d.new_group([0, 1]) + if self.rank == 0 or self.rank == 1: + t1 = torch.rand(10, 10, device=device) + t2 = torch.rand(10, 10, device=device) + new_pg1.allreduce(t1).wait() + new_pg2.allreduce(t2).wait() + if self.rank == 0: + dist.destroy_process_group(new_pg2) + # force destruction of pg2 first + del new_pg2 + dist.destroy_process_group(new_pg1) + del new_pg1 + if self.rank == 1: + c10d.destroy_process_group(new_pg1) + # force destruction of pg1 first + del new_pg1 + dist.destroy_process_group(new_pg2) + del new_pg2 + dist.destroy_process_group() + + @requires_xccl() + @skip_but_pass_in_sandcastle_if( + torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs" + ) + def test_file_store_check(self): + # self.file_name is created using "delete=False" + # e.g., self.file_name = tempfile.NamedTemporaryFile(delete=False).name + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group( + backend="xccl", rank=self.rank, world_size=self.world_size, store=store + ) + pg = dist.distributed_c10d._get_default_group() + self.assertEqual(pg.rank(), self.rank) + self.assertEqual(pg.size(), self.world_size) + # give enough time for check() to be executed multiple times + time.sleep(2) + dist.destroy_process_group() + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIXPU, "XCCL test requires 2+ GPUs") + def test_set_process_group_desc(self): + device = torch.device(f"xpu:{self.rank}") + pg_default = self._create_process_group_xccl(device_id=device) + self.assertEqual(pg_default.group_desc, "default_pg") + pg_1 = c10d.new_group([0, 1], group_desc="test_purpose") + self.assertEqual(pg_1.group_desc, "test_purpose") + pg_2 = c10d.new_group([0, 1]) + self.assertEqual(pg_2.group_desc, "undefined") + + def _test_allreduce_basics(self, fn): + pg = self._create_process_group_xccl() + device = torch.device("xpu:" + str(self.rank)) + # Single input tests + tests = simple_reduce_tests(self.rank, self.world_size) + for op, input, expected in tests: + opts = c10d.AllreduceOptions() + opts.reduceOp = op + tensor = fn(input.to(device)) + fut = pg.allreduce([tensor], opts).get_future() + fut.wait() + result = fut.value() + self.assertEqual(expected, result[0], exact_dtype=False) + + x = fn(torch.tensor([self.rank + 1.0], device=device)) + fut = pg.allreduce(x).get_future() + fut.wait() + result = fut.value() + self.assertEqual( + torch.tensor([float(self.world_size * (self.world_size + 1) / 2)]), + result[0], + ) + + @requires_xccl() + def test_allreduce_basics(self): + self._test_allreduce_basics(lambda t: t.clone()) + + +if __name__ == "__main__": + assert ( + not torch.xpu._initialized + ), "test_distributed must not have initialized XPU context on main process" + + run_tests() diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index bb949a081c95e9..9a91b26d54cfb4 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -282,6 +282,9 @@ if(USE_DISTRIBUTED) if(USE_NCCL) list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) endif() + if(USE_XCCL) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::xccl) + endif() # Same for MPI. 
if(USE_MPI) list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX) @@ -345,6 +348,10 @@ if(BUILD_LIBTORCHLESS) target_compile_definitions(torch_python PRIVATE USE_C10D_NCCL) endif() + if(USE_XPU AND USE_C10D_XCCL) + target_compile_definitions(torch_python PRIVATE USE_C10D_XCCL) + endif() + if(USE_DISTRIBUTED) target_compile_definitions(torch_python PRIVATE USE_DISTRIBUTED) endif() diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 94e8578bbfff62..6033d969925972 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -309,6 +309,7 @@ class ProcessGroup: UNDEFINED = ... GLOO = ... NCCL = ... + XCCL = ... UCC = ... MPI = ... CUSTOM = ... @@ -697,3 +698,11 @@ class ProcessGroupCudaP2P(Backend): storage_offset: Optional[int] = 0, ) -> torch.Tensor: ... def _shutdown(self) -> None: ... + +class ProcessGroupXCCL(Backend): + def __init__( + self, + store: Store, + rank: int, + size: int, + ): ... diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index ae822ad3975049..699c54236f6412 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -79,6 +79,7 @@ namespace { } IMPL_SEND(CPU) +IMPL_SEND(XPU) IMPL_SEND(CUDA) IMPL_SEND(PrivateUse1) @@ -94,6 +95,7 @@ IMPL_SEND(PrivateUse1) } IMPL_RECV(CPU) +IMPL_RECV(XPU) IMPL_RECV(CUDA) IMPL_RECV(PrivateUse1) @@ -108,6 +110,7 @@ IMPL_RECV(PrivateUse1) } IMPL_RECV_ANY_SOURCE(CPU) +IMPL_RECV_ANY_SOURCE(XPU) IMPL_RECV_ANY_SOURCE(CUDA) IMPL_RECV_ANY_SOURCE(PrivateUse1) @@ -131,6 +134,7 @@ IMPL_RECV_ANY_SOURCE(PrivateUse1) } IMPL_REDUCE(CPU) +IMPL_REDUCE(XPU) IMPL_REDUCE(CUDA) IMPL_REDUCE(PrivateUse1) @@ -156,6 +160,7 @@ IMPL_REDUCE(PrivateUse1) } IMPL_BROADCAST(CPU) +IMPL_BROADCAST(XPU) IMPL_BROADCAST(CUDA) IMPL_BROADCAST(PrivateUse1) @@ -181,6 +186,7 @@ IMPL_BROADCAST(PrivateUse1) IMPL_ALLREDUCE(CPU) IMPL_ALLREDUCE(CUDA) +IMPL_ALLREDUCE(XPU) IMPL_ALLREDUCE(PrivateUse1) #define IMPL_ALLREDUCE_COALESCED(DEV) \ @@ -198,6 +204,7 @@ IMPL_ALLREDUCE(PrivateUse1) } IMPL_ALLREDUCE_COALESCED(CPU) +IMPL_ALLREDUCE_COALESCED(XPU) IMPL_ALLREDUCE_COALESCED(CUDA) IMPL_ALLREDUCE_COALESCED(PrivateUse1) @@ -222,6 +229,7 @@ IMPL_ALLREDUCE_COALESCED(PrivateUse1) // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast) IMPL_ALLGATHER(CPU) +IMPL_ALLGATHER(XPU) IMPL_ALLGATHER(CUDA) IMPL_ALLGATHER(PrivateUse1) @@ -242,6 +250,7 @@ IMPL_ALLGATHER(PrivateUse1) } IMPL__ALLGATHER_BASE(CPU) +IMPL__ALLGATHER_BASE(XPU) IMPL__ALLGATHER_BASE(CUDA) IMPL__ALLGATHER_BASE(PrivateUse1) @@ -258,6 +267,7 @@ IMPL__ALLGATHER_BASE(PrivateUse1) } IMPL_ALLGATHER_COALESCED(CPU) +IMPL_ALLGATHER_COALESCED(XPU) IMPL_ALLGATHER_COALESCED(CUDA) IMPL_ALLGATHER_COALESCED(PrivateUse1) @@ -273,6 +283,7 @@ IMPL_ALLGATHER_COALESCED(PrivateUse1) } IMPL_ALLGATHER_INTO_TENSOR_COALESCED(CPU) +IMPL_ALLGATHER_INTO_TENSOR_COALESCED(XPU) IMPL_ALLGATHER_INTO_TENSOR_COALESCED(CUDA) IMPL_ALLGATHER_INTO_TENSOR_COALESCED(PrivateUse1) @@ -296,6 +307,7 @@ IMPL_ALLGATHER_INTO_TENSOR_COALESCED(PrivateUse1) } IMPL_REDUCE_SCATTER(CPU) +IMPL_REDUCE_SCATTER(XPU) IMPL_REDUCE_SCATTER(CUDA) IMPL_REDUCE_SCATTER(PrivateUse1) @@ -320,6 +332,7 @@ IMPL_REDUCE_SCATTER(PrivateUse1) } IMPL__REDUCE_SCATTER_BASE(CPU) +IMPL__REDUCE_SCATTER_BASE(XPU) IMPL__REDUCE_SCATTER_BASE(CUDA) IMPL__REDUCE_SCATTER_BASE(PrivateUse1) @@ -341,6 +354,7 @@ IMPL__REDUCE_SCATTER_BASE(PrivateUse1) } IMPL_REDUCE_SCATTER_TENSOR_COALESCED(CPU) +IMPL_REDUCE_SCATTER_TENSOR_COALESCED(XPU) IMPL_REDUCE_SCATTER_TENSOR_COALESCED(CUDA) 
IMPL_REDUCE_SCATTER_TENSOR_COALESCED(PrivateUse1) @@ -360,6 +374,7 @@ IMPL_REDUCE_SCATTER_TENSOR_COALESCED(PrivateUse1) } IMPL_GATHER(CPU) +IMPL_GATHER(XPU) IMPL_GATHER(CUDA) IMPL_GATHER(PrivateUse1) @@ -382,6 +397,7 @@ IMPL_GATHER(PrivateUse1) } IMPL_SCATTER(CPU) +IMPL_SCATTER(XPU) IMPL_SCATTER(CUDA) IMPL_SCATTER(PrivateUse1) @@ -403,6 +419,7 @@ IMPL_SCATTER(PrivateUse1) } IMPL_ALLTOALL(CPU) +IMPL_ALLTOALL(XPU) IMPL_ALLTOALL(CUDA) IMPL_ALLTOALL(PrivateUse1) @@ -424,6 +441,7 @@ IMPL_ALLTOALL(PrivateUse1) } IMPL_ALLTOALL_BASE(CPU) +IMPL_ALLTOALL_BASE(XPU) IMPL_ALLTOALL_BASE(CUDA) IMPL_ALLTOALL_BASE(PrivateUse1) @@ -439,6 +457,7 @@ IMPL_ALLTOALL_BASE(PrivateUse1) } IMPL_BARRIER(CPU) +IMPL_BARRIER(XPU) IMPL_BARRIER(CUDA) IMPL_BARRIER(PrivateUse1) // NOLINTEND(cppcoreguidelines-pro-type-const-cast) @@ -491,6 +510,7 @@ namespace { #define REGISTER_C10D_OP(FUNC) \ REGISTER_C10D_OP1(FUNC, CPU) \ REGISTER_C10D_OP1(FUNC, CUDA) \ + REGISTER_C10D_OP1(FUNC, XPU) \ REGISTER_C10D_OP1(FUNC, PrivateUse1) // Now we start to register ops with the three device keys diff --git a/torch/csrc/distributed/c10d/ProcessGroup.cpp b/torch/csrc/distributed/c10d/ProcessGroup.cpp index 75635bc68aed4f..70356b3bf382ce 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.cpp @@ -21,6 +21,8 @@ static ProcessGroup::BackendType strToBackendType(std::string_view backend) { return ProcessGroup::BackendType::GLOO; } else if (backend == "nccl") { return ProcessGroup::BackendType::NCCL; + } else if (backend == "xccl") { + return ProcessGroup::BackendType::XCCL; } else if (backend == "ucc") { return ProcessGroup::BackendType::UCC; } else if (backend == "mpi") { diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index acf8c9c354a76b..73fc2bda701327 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -70,6 +70,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { UCC = 3, MPI = 4, CUSTOM = 5, + XCCL = 6, }; // Not used, set for backwards compatibility and only used for TypeDef in @@ -489,6 +490,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // TODO: HACK for backend name to get sequence number for that backend. if (backendType == ProcessGroup::BackendType::GLOO || backendType == ProcessGroup::BackendType::NCCL || + backendType == ProcessGroup::BackendType::XCCL || backendType == ProcessGroup::BackendType::UCC) { getDefaultBackend()->setSequenceNumberForGroup(); } else { @@ -510,6 +512,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // TODO: HACK for backend name to get sequence number for that backend. 
if (backendType == ProcessGroup::BackendType::GLOO || backendType == ProcessGroup::BackendType::NCCL || + backendType == ProcessGroup::BackendType::XCCL || backendType == ProcessGroup::BackendType::UCC) { return getDefaultBackend()->getSequenceNumberForGroup(); } else { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp new file mode 100644 index 00000000000000..5aeeb62bee1ece --- /dev/null +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -0,0 +1,401 @@ +#include +#include +#include +#include + +#ifdef USE_C10D_XCCL +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10d { + +namespace { +std::map xcclOps = { + {ReduceOp::MIN, ccl::reduction::min}, + {ReduceOp::MAX, ccl::reduction::max}, + {ReduceOp::SUM, ccl::reduction::sum}, + {ReduceOp::PRODUCT, ccl::reduction::prod}, +}; + +std::map xcclDatatypes = { + {at::kByte, ccl::datatype::uint8}, + {at::kChar, ccl::datatype::int8}, + {at::kInt, ccl::datatype::int32}, + {at::kLong, ccl::datatype::int64}, + {at::kHalf, ccl::datatype::float16}, + {at::kFloat, ccl::datatype::float32}, + {at::kDouble, ccl::datatype::float64}, + {at::kBFloat16, ccl::datatype::bfloat16}, + {at::kBool, ccl::datatype::uint8}, +}; + +XCCL_KVS kvs; +std::mutex kvs_mutex; + +XCCL_KVS get_kvs(int rank, c10d::Store& store) { + std::lock_guard lock(kvs_mutex); + if (kvs) + return kvs; + std::string storeKey = "xccl_kvs"; + + // Rank 0 broadcast the bootstrap network information to other ranks + if (rank == 0) { + kvs = ccl::create_main_kvs(); + ccl::kvs::address_type main_addr = kvs->get_address(); + auto ccl_kvs_addr = + std::vector(main_addr.begin(), main_addr.end()); + store.set(storeKey, ccl_kvs_addr); + } else { + auto ccl_kvs_addr = store.get(storeKey); + if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { + throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); + } + ccl::kvs::address_type main_addr; + std::copy_n( + ccl_kvs_addr.begin(), ccl::kvs::address_max_size, main_addr.begin()); + kvs = ccl::create_kvs(main_addr); + } + + return kvs; +} + +void check_xpu_single_tensor(const at::Tensor& tensor) { + if (!tensor.is_xpu() || tensor.is_sparse()) { + C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + } + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + } +} + +ccl::datatype getXcclDataType(at::ScalarType type) { + auto it = xcclDatatypes.find(type); + TORCH_CHECK_WITH( + TypeError, + it != xcclDatatypes.end(), + "Input tensor data type is not supported for XCCL process group: ", + type); + return it->second; +} + +ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { + try { + if (input.scalar_type() == at::kBool) { + if (reduceOp == ReduceOp::SUM) { + // For bool tensors, map sum to max, which both represent a bitwise or. + // This is to prevent overflow issues with sum, since we use uint8 to + // represent a bool (see xcclDatatypes mapping align with cuda). 
+ return ccl::reduction::max; + } + } + return xcclOps.at(reduceOp); + } catch (const std::out_of_range&) { + switch (reduceOp) { + case ReduceOp::AVG: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp AVG with XCCL"); + break; + case ReduceOp::BAND: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BAND with XCCL"); + break; + case ReduceOp::BOR: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BOR with XCCL"); + break; + case ReduceOp::BXOR: + C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BXOR with XCCL"); + break; + default: + C10_THROW_ERROR(ValueError, "Unhandled ReduceOp"); + break; + } + } +} + +} // namespace + +static std::mutex xcclCommDevIdxMapMutex; +static std::unordered_map, int> xcclCommDevIdxMap; +constexpr int64_t kSynchronizeBusyWaitMillis = 10; + +ProcessGroupXCCL::WorkXCCL::WorkXCCL( + at::Device& device, + int rank, + OpType opType, + const std::optional>& inputs) + : Work(rank, opType, "profilingTitle", inputs), + device_(device), + workStartTime_(std::chrono::steady_clock::now()) { + unsigned char enable_timing = 0; + xcclEndEvent_ = std::make_shared(enable_timing); +} + +ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) + : Work(w.rank_, w.opType_), + device_(w.device_), + xcclEndEvent_(w.xcclEndEvent_), + blockingWait_(w.blockingWait_), + workStartTime_(w.workStartTime_) {} + +ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; + +bool ProcessGroupXCCL::WorkXCCL::checkTimeout( + std::optional timeout) { + auto currentTimepoint = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast( + currentTimepoint - workStartTime_); + std::chrono::milliseconds opTimeout = std::chrono::milliseconds(60000); + + auto workTimeout = timeout ? *timeout : opTimeout; + + if (timeElapsed < workTimeout) + return false; + return true; +} + +bool ProcessGroupXCCL::WorkXCCL::isCompleted() { + if (xcclEndEvent_ && xcclEndEvent_->query()) { + return true; + } + return false; +} + +void ProcessGroupXCCL::WorkXCCL::synchronize() { + synchronizeInternal(kNoTimeout); +} + +void ProcessGroupXCCL::WorkXCCL::synchronizeStream() { + auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); + // Block the current stream on the XCCL stream + xcclEndEvent_->block(currentStream); +} + +void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( + std::chrono::milliseconds timeout) { + synchronizeStream(); + + if (blockingWait_) { + while (!isCompleted()) { + bool timedOut = checkTimeout( + timeout == kNoTimeout ? std::nullopt : std::make_optional(timeout)); + if (timedOut) { + break; + } + std::this_thread::sleep_for( + std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); + } + } +} + +bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { + synchronizeInternal(timeout); + return true; +} + +ProcessGroupXCCL::ProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank, + int size) + : Backend(rank, size), store_(store) { + blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); + init(); + + // Intel oneCCL requires passing CCL_LOCAL_RANK and CCL_LOCAL_SIZE for non-MPI + // launchers. 
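+  // For non-MPI launches, CCL_PROCESS_LAUNCHER is set to "none" and the local
+  // rank/size are taken from LOCAL_RANK / LOCAL_WORLD_SIZE, falling back to
+  // this process group's rank/size when those variables are not exported.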
+ if (!with_mpirun()) { + int local_rank = getXCCLEnvVar("LOCAL_RANK"); + int local_world_size = getXCCLEnvVar("LOCAL_WORLD_SIZE"); + if (local_rank == -1 || local_world_size == -1) { + local_rank = rank; + local_world_size = size; + } + setXCCLEnvVar("CCL_PROCESS_LAUNCHER", "none"); + setXCCLEnvVar("CCL_LOCAL_RANK", local_rank); + setXCCLEnvVar("CCL_LOCAL_SIZE", local_world_size); + } +} + +ProcessGroupXCCL::~ProcessGroupXCCL() = default; + +c10::intrusive_ptr ProcessGroupXCCL::initWork( + at::Device& device, + int rank, + OpType opType, + const std::vector& inputs, + const std::vector& outputs) { + auto r = c10::make_intrusive( + device, rank, opType, std::optional>(inputs)); + return r; +} + +std::shared_ptr ProcessGroupXCCL::getXCCLComm( + const std::string& deviceKey, + at::Device& device) { + if (deviceKey.empty()) { + C10_THROW_ERROR( + DistBackendError, + "Not able to create/get the XCCL Communicator since " + "the devices are empty "); + } + + { + std::lock_guard lock(mutex_); + if (devXCCLCommMap_.find(deviceKey) != devXCCLCommMap_.end()) { + return devXCCLCommMap_[deviceKey]; + } + } + + std::shared_ptr XCCLComm; + + XCCL_KVS kvs = get_kvs(rank_, *store_); + + int numRanks, rank; + numRanks = getSize(); + rank = getRank(); + + c10::impl::VirtualGuardImpl impl(device.type()); + c10::Stream stream = impl.getStream(device); + sycl::queue& q = c10::xpu::XPUStream(stream).queue(); + + auto ctx = ccl::create_context(q.get_context()); + ccl::vector_class> devs_rank; + devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); + + auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, kvs); + XCCLComm = std::make_shared(std::move(comms[0])); + + { + std::lock_guard lock(mutex_); + inInitializationCommMap_.emplace(deviceKey, XCCLComm); + } + + xcclStreams_.emplace(deviceKey, std::move(stream)); + + auto it = inInitializationCommMap_.find(deviceKey); + if (it != inInitializationCommMap_.end()) { + devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); + inInitializationCommMap_.erase(deviceKey); + + xcclCommDevIdxMapMutex.lock(); + xcclCommDevIdxMap.emplace(XCCLComm, device.index()); + xcclCommDevIdxMapMutex.unlock(); + } + + it = devXCCLCommMap_.find(deviceKey); + TORCH_INTERNAL_ASSERT( + it != devXCCLCommMap_.end(), "Communicators not populated in cache!"); + + return it->second; +} + +template +c10::intrusive_ptr ProcessGroupXCCL::collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType) { + using traits = function_traits; + using attr_t = typename traits::template arg<2>::type; + attr_t attr = ccl::create_operation_attr(); + + auto device = input.device(); + const auto key = std::to_string(device.index()); + auto comm = getXCCLComm(key, device); + + auto stream = xcclStreams_.at(key); + std::vector outputs{output}; + + c10::intrusive_ptr work; + + work = initWork(device, rank_, opType); + + work->outputs_ = + std::make_shared>(std::move(outputs)); + c10::xpu::XPUCachingAllocator::recordStream( + input.storage().data_ptr(), stream); + + auto ccl_stream = ccl::create_stream(stream.queue()); + + fn(input, output, attr, *comm, ccl_stream); + + work->xcclEndEvent_->record(stream); + + std::vector streams = {stream.unwrap()}; + c10::MultiStreamGuard streamGuard(streams); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + work->future_->markCompleted(at::IValue(*work->outputs_)); + work->blockingWait_ = blockingWait_; + + return 
work; +} + +template +c10::intrusive_ptr ProcessGroupXCCL::collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + OpType opType) { + return collective( + input, + output, + fn, + [](at::xpu::XPUStream&, + c10::intrusive_ptr& work) {}, + [](at::xpu::XPUStream&, + c10::intrusive_ptr& work) {}, + opType); +} + +c10::intrusive_ptr ProcessGroupXCCL::allreduce( + std::vector& tensors, + const AllreduceOptions& opts) { + TORCH_CHECK( + tensors.size() == 1, "Expecting one tensor only but got multiple"); + auto tensor = tensors.back(); + check_xpu_single_tensor(tensor); + return collective( + tensor, + tensor, + [&](at::Tensor& input, + at::Tensor& output, + ccl::allreduce_attr attr, + xcclComm_t& comm, + ccl::stream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::event ret_evt; + ret_evt = ccl::allreduce( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + xcclReduceOp, + comm, + stream, + attr); + return ret_evt; + }, + OpType::ALLREDUCE); +} + +} // namespace c10d + +#endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp new file mode 100644 index 00000000000000..14a9f398a8cbe7 --- /dev/null +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -0,0 +1,308 @@ +#pragma once + +#if defined(__linux__) +#include +#include +#include +#include +#endif + +#ifdef USE_C10D_XCCL +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +namespace c10d { + +namespace { +int getXCCLEnvVar(std::string envVarName) { + char* stringValue = std::getenv(envVarName.c_str()); + if (stringValue != nullptr) { + try { + int val = std::stoi(stringValue); + return val; + } catch (std::exception& e) { + TORCH_CHECK( + false, + "Invalid value for environment variable: " + std::string(envVarName)); + } + } else { + return -1; + } +} + +template +void setXCCLEnvVar(const std::string& envVarName, T val) { + if constexpr (std::is_same_v) { + setenv(envVarName.c_str(), std::to_string(val).c_str(), 1); + } else if constexpr (std::is_same_v) { + setenv(envVarName.c_str(), val.c_str(), 1); + } +} + +bool with_mpirun() { + return (getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || + getenv("PMI_RANK") || getenv("PMI_SIZE") || getenv("PMIX_RANK")) + ? 
true + : false; +} +} // namespace + +static std::vector TORCH_XCCL_BLOCKING_WAIT = { + "TORCH_XCCL_BLOCKING_WAIT", + "XCCL_BLOCKING_WAIT"}; + +using xcclComm_t = ccl::communicator; +using XCCL_KVS = ccl::shared_ptr_class; +constexpr const char* XCCL_BACKEND_NAME = "xccl"; + +class TORCH_API ProcessGroupXCCL : public Backend { + public: + class WorkXCCL : public Work { + public: + WorkXCCL( + at::Device& device, + int rank, + OpType opType, + const std::optional>& inputs = std::nullopt); + WorkXCCL(const WorkXCCL& w); + ~WorkXCCL() override; + + bool isCompleted() override; + + bool isSuccess() const override { + TORCH_CHECK( + false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); + } + + void abort() override { + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); + } + + void synchronize() override; + + void synchronizeStream(); + + bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; + + c10::intrusive_ptr getFuture() override { + return future_; + } + + std::vector result() override { + TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); + } + + bool checkTimeout( + std::optional timeout = std::nullopt); + + protected: + at::Device device_; + std::shared_ptr xcclEndEvent_; + bool blockingWait_ = false; + std::chrono::time_point workStartTime_; + + private: + void synchronizeInternal(std::chrono::milliseconds timeout); + std::shared_ptr> outputs_; + c10::intrusive_ptr future_; + friend class ProcessGroupXCCL; + }; + + ProcessGroupXCCL(const c10::intrusive_ptr& store, int rank, int size); + + C10_DEPRECATED ProcessGroupXCCL( + const c10::intrusive_ptr& store, + int rank, + int size, + const std::string& groupName) + : ProcessGroupXCCL(store, rank, size) {} + + ~ProcessGroupXCCL() override; + + const std::string getBackendName() const override { + return std::string(XCCL_BACKEND_NAME); + } + + std::shared_ptr getXCCLComm( + const std::string& deviceKey, + at::Device& device); + + virtual c10::intrusive_ptr initWork( + at::Device& device, + int rank, + OpType opType, + const std::vector& inputs = {}, + const std::vector& outputs = {}); + + template + c10::intrusive_ptr collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + OpType opType); + + template + c10::intrusive_ptr collective( + at::Tensor& input, + at::Tensor& output, + Fn fn, + PreProcess pre, + PostProcess post, + OpType opType); + + c10::intrusive_ptr allreduce( + std::vector& tensors, + const AllreduceOptions& opts = AllreduceOptions()) override; + + c10::intrusive_ptr allreduce_coalesced( + std::vector& tensors, + const AllreduceCoalescedOptions& opts = + AllreduceCoalescedOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_coalesced not implemented"); + } + + c10::intrusive_ptr reduce( + std::vector& tensors, + const ReduceOptions& opts = ReduceOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::reduce not implemented"); + } + + c10::intrusive_ptr broadcast( + std::vector& tensors, + const BroadcastOptions& opts = BroadcastOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::broadcast not implemented"); + } + + c10::intrusive_ptr allgather( + std::vector>& outputTensors, + std::vector& inputTensors, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allgather not implemented"); + } + + c10::intrusive_ptr _allgather_base( + at::Tensor& outputbuffer, + at::Tensor& inputbuffer, + const AllgatherOptions& opts = AllgatherOptions()) override { + 
TORCH_CHECK(false, "ProcessGroupXCCL::_allgather_base not implemented"); + } + + c10::intrusive_ptr allgather_coalesced( + std::vector>& outputTensorLists, + std::vector& inputTensors, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::allgather_coalesced not implemented"); + } + + c10::intrusive_ptr allgather_into_tensor_coalesced( + std::vector& outputs, + std::vector& inputs, + const AllgatherOptions& opts = AllgatherOptions()) override { + TORCH_CHECK( + false, + "ProcessGroupXCCL::allgather_into_tensor_coalesced not implemented"); + } + + c10::intrusive_ptr reduce_scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::reduce_scatter not implemented"); + } + + c10::intrusive_ptr _reduce_scatter_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK( + false, "ProcessGroupXCCL::_reduce_scatter_base not implemented"); + } + + c10::intrusive_ptr reduce_scatter_tensor_coalesced( + std::vector& outputs, + std::vector& inputs, + const ReduceScatterOptions& opts = ReduceScatterOptions()) override { + TORCH_CHECK( + false, + "ProcessGroupXCCL::reduce_scatter_tensor_coalesced not implemented"); + } + + c10::intrusive_ptr barrier( + const BarrierOptions& opts = BarrierOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::barrier not implemented"); + } + + c10::intrusive_ptr alltoall_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + std::vector& outputSplitSizes, + std::vector& inputSplitSizes, + const AllToAllOptions& opts = AllToAllOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::alltoall_base not implemented"); + } + + c10::intrusive_ptr alltoall( + std::vector& outputTensors, + std::vector& inputTensors, + const AllToAllOptions& opts = AllToAllOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::alltoall not implemented"); + } + + c10::intrusive_ptr send( + std::vector& tensors, + int dstRank, + int tag) override { + TORCH_CHECK(false, "ProcessGroupXCCL::send not implemented"); + } + + c10::intrusive_ptr recv( + std::vector& tensors, + int srcRank, + int tag) override { + TORCH_CHECK(false, "ProcessGroupXCCL::recv not implemented"); + } + + c10::intrusive_ptr gather( + std::vector>& outputTensors, + std::vector& inputTensors, + const GatherOptions& opts = GatherOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::gather not implemented"); + } + + c10::intrusive_ptr scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const ScatterOptions& opts = ScatterOptions()) override { + TORCH_CHECK(false, "ProcessGroupXCCL::scatter not implemented"); + } + + protected: + std::unordered_map xcclStreams_; + std::unordered_map> + inInitializationCommMap_; + std::unordered_map> devXCCLCommMap_; + c10::intrusive_ptr store_; + std::mutex mutex_; + bool blockingWait_ = false; +}; +} // namespace c10d + +#endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index c8f9dff37f06e2..e3ed6d6bd4bcb4 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -37,6 +37,10 @@ #include #endif +#ifdef USE_C10D_XCCL +#include +#endif + #include #include #include @@ -2232,6 +2236,7 @@ The hook must have the following signature: .value("UNDEFINED", ::c10d::ProcessGroup::BackendType::UNDEFINED) 
.value("GLOO", ::c10d::ProcessGroup::BackendType::GLOO) .value("NCCL", ::c10d::ProcessGroup::BackendType::NCCL) + .value("XCCL", ::c10d::ProcessGroup::BackendType::XCCL) .value("UCC", ::c10d::ProcessGroup::BackendType::UCC) .value("MPI", ::c10d::ProcessGroup::BackendType::MPI) .value("CUSTOM", ::c10d::ProcessGroup::BackendType::CUSTOM) @@ -2877,6 +2882,23 @@ Example:: py::call_guard()); #endif +#ifdef USE_C10D_XCCL + auto processGroupXCCL = + intrusive_ptr_no_gil_destructor_class_<::c10d::ProcessGroupXCCL>( + module, "ProcessGroupXCCL", backend) + .def( + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, + int rank, + int size) { + return c10::make_intrusive<::c10d::ProcessGroupXCCL>( + store, rank, size); + }), + py::arg("store"), + py::arg("rank"), + py::arg("size"), + py::call_guard()); +#endif + py::enum_<::c10d::OpType>(module, "OpType") .value("BROADCAST", ::c10d::OpType::BROADCAST) .value("ALLREDUCE", ::c10d::OpType::ALLREDUCE) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 45e096985143a3..9fa3224873c9fc 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -87,6 +87,7 @@ "is_nccl_available", "is_torchelastic_launched", "is_ucc_available", + "is_xccl_available", "isend", "monitored_barrier", "new_group", @@ -130,6 +131,7 @@ _NCCL_AVAILABLE = True _GLOO_AVAILABLE = True _UCC_AVAILABLE = True +_XCCL_AVAILABLE = True _pickler = pickle.Pickler _unpickler = pickle.Unpickler @@ -193,6 +195,14 @@ def _export_c_types() -> None: except ImportError: _UCC_AVAILABLE = False +try: + from torch._C._distributed_c10d import ProcessGroupXCCL + + ProcessGroupXCCL.__module__ = "torch.distributed.distributed_c10d" + __all__ += ["ProcessGroupXCCL"] +except ImportError: + _XCCL_AVAILABLE = False + logger = logging.getLogger(__name__) PG_WRAPPER_STORE_PREFIX = "pg_wrapper" @@ -222,7 +232,7 @@ class Backend(str): """ An enum-like class for backends. - Available backends: GLOO, NCCL, UCC, MPI, and other registered backends. + Available backends: GLOO, NCCL, UCC, MPI, XCCL, and other registered backends. The values of this class are lowercase strings, e.g., ``"gloo"``. They can be accessed as attributes, e.g., ``Backend.NCCL``. @@ -242,21 +252,24 @@ class Backend(str): NCCL = "nccl" UCC = "ucc" MPI = "mpi" + XCCL = "xccl" _BackendPlugin = namedtuple("_BackendPlugin", ["creator_fn", "extended_api"]) _plugins: Dict[str, _BackendPlugin] = {} - backend_list = [UNDEFINED, GLOO, NCCL, UCC, MPI] + backend_list = [UNDEFINED, GLOO, NCCL, XCCL, UCC, MPI] default_device_backend_map: Dict[str, str] = { "cpu": GLOO, "cuda": NCCL, + "xpu": XCCL, } backend_capability: Dict[str, List[str]] = { GLOO: ["cpu", "cuda"], NCCL: ["cuda"], + XCCL: ["xpu"], UCC: ["cpu", "cuda"], MPI: ["cpu", "cuda"], } @@ -265,6 +278,7 @@ class Backend(str): UNDEFINED: ProcessGroup.BackendType.UNDEFINED, GLOO: ProcessGroup.BackendType.GLOO, NCCL: ProcessGroup.BackendType.NCCL, + XCCL: ProcessGroup.BackendType.XCCL, UCC: ProcessGroup.BackendType.UCC, } @@ -1098,6 +1112,11 @@ def is_ucc_available() -> bool: return _UCC_AVAILABLE +def is_xccl_available() -> bool: + """Check if the XCCL backend is available.""" + return _XCCL_AVAILABLE + + def is_backend_available(backend: str) -> bool: """ Check backend availability. 
@@ -1350,6 +1369,10 @@ def _set_pg_timeout(timeout: timedelta, group: Optional[ProcessGroup] = None) -> backends.add(backend) # type: ignore[arg-type] elif is_gloo_available() and isinstance(backend, ProcessGroupGloo): backends.add(backend) # type: ignore[arg-type] + if torch.device("xpu") in devices and is_xccl_available(): + backend = group._get_backend(torch.device("xpu")) + if isinstance(backend, ProcessGroupXCCL): + backends.add(backend) # type: ignore[arg-type] if len(backends) == 0: warnings.warn("Set timeout is now only supported for either nccl or gloo.") for backend in backends: @@ -1385,7 +1408,7 @@ def init_process_group( Args: backend (str or Backend, optional): The backend to use. Depending on - build-time configurations, valid values include ``mpi``, ``gloo``, + build-time configurations, valid values include ``mpi``, ``gloo``, ``xccl``, ``nccl``, and ``ucc``. If the backend is not provided, then both a ``gloo`` and ``nccl`` backend will be created, see notes below for how multiple backends are managed. This field can be given as a lowercase string @@ -1651,10 +1674,13 @@ def _new_process_group_helper( "created, please use a different group name" ) - if device_id is not None and (device_id.index is None or device_id.type != "cuda"): + if device_id is not None and ( + device_id.index is None + or (device_id.type != "cuda" and device_id.type != "xpu") + ): raise ValueError( "init_process_group device_id parameter must be a cuda device with an " - "id, e.g. cuda:0, not just cuda or cpu" + "id, e.g. cuda:0, xpu, not just cuda or xpu or cpu" ) # Note: _new_process_group_helper is only called from init_process_group, which always provides a timeout value @@ -1762,7 +1788,6 @@ def _new_process_group_helper( pg_options = ProcessGroupNCCL.Options() pg_options.is_high_priority_stream = False pg_options._timeout = timeout - if split_from: pg_options.split_from = split_from pg_options.split_color = _process_group_color(global_ranks_in_group) @@ -1781,6 +1806,17 @@ def _new_process_group_helper( backend_prefix_store, group_rank, group_size, timeout=timeout ) backend_type = ProcessGroup.BackendType.UCC + elif backend_str == Backend.XCCL: + if not is_xccl_available(): + raise RuntimeError("Distributed package doesn't have XCCL built in") + if pg_options is not None: + assert isinstance( + pg_options, ProcessGroupXCCL.Options + ), "Expected pg_options argument to be of type ProcessGroupXCCL.Options" + backend_class = ProcessGroupXCCL( + backend_prefix_store, group_rank, group_size + ) + backend_type = ProcessGroup.BackendType.XCCL else: assert ( backend_str.upper() in Backend._plugins diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index d59102232f7db7..26bdcce6103120 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -180,7 +180,8 @@ def skip_if_lt_x_gpu(x): def decorator(func): @wraps(func) def wrapper(*args, **kwargs): - if torch.cuda.is_available() and torch.cuda.device_count() >= x: + if (torch.cuda.is_available() and torch.cuda.device_count() >= x) or \ + (torch.xpu.is_available() and torch.xpu.device_count() >= x): return func(*args, **kwargs) sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code) @@ -320,6 +321,12 @@ def requires_nccl(): "c10d was not compiled with the NCCL backend", ) +def requires_xccl(): + return skip_but_pass_in_sandcastle_if( + not c10d.is_xccl_available(), + "c10d was not compiled with the XCCL backend", + ) + def requires_ucc(): return 
skip_but_pass_in_sandcastle_if( not c10d.is_ucc_available(), @@ -463,7 +470,7 @@ def init_multigpu_helper(world_size: int, backend: str): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. """ - nGPUs = torch.cuda.device_count() + nGPUs = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() visible_devices = range(nGPUs) # If rank is less than or equal to number of available GPU's From a71d69a50684d8e6c6edd2ddc285f7589a44914d Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 20 Sep 2024 03:39:00 +0000 Subject: [PATCH 67/96] Align latest --- torch/distributed/distributed_c10d.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 2d9357bbd15a44..4bbb1c41011231 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1819,10 +1819,10 @@ def _new_process_group_helper( elif backend_str == Backend.XCCL: if not is_xccl_available(): raise RuntimeError("Distributed package doesn't have XCCL built in") - if pg_options is not None: + if backend_options is not None: assert isinstance( - pg_options, ProcessGroupXCCL.Options - ), "Expected pg_options argument to be of type ProcessGroupXCCL.Options" + backend_options, ProcessGroupXCCL.Options + ), "Expected backend_options argument to be of type ProcessGroupXCCL.Options" backend_class = ProcessGroupXCCL( backend_prefix_store, group_rank, group_size ) From 4bf448dd02db537095d0e0ec275116246d0bac92 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 20 Sep 2024 05:21:28 +0000 Subject: [PATCH 68/96] update --- torch/distributed/distributed_c10d.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index dfae588345c726..4bbb1c41011231 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1819,17 +1819,10 @@ def _new_process_group_helper( elif backend_str == Backend.XCCL: if not is_xccl_available(): raise RuntimeError("Distributed package doesn't have XCCL built in") -<<<<<<< HEAD - if pg_options is not None: - assert isinstance( - pg_options, ProcessGroupXCCL.Options - ), "Expected pg_options argument to be of type ProcessGroupXCCL.Options" -======= if backend_options is not None: assert isinstance( backend_options, ProcessGroupXCCL.Options ), "Expected backend_options argument to be of type ProcessGroupXCCL.Options" ->>>>>>> xccl-bak backend_class = ProcessGroupXCCL( backend_prefix_store, group_rank, group_size ) From 1f83fbf89397b132db708279369dbe1940b527a6 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 20 Sep 2024 05:48:39 +0000 Subject: [PATCH 69/96] update --- .../distributed/c10d/ProcessGroupXCCL.cpp | 25 +------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index acdcae2eea4cde..32030f45e73ae3 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -458,29 +457,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work = initWork(device, rank_, opType); - { // Do we need to store the result of the operation? 
- std::variant, std::vector>> - outputs; - std::visit( - [&work](auto&& outputData) { - using T = std::decay_t; - - if constexpr (std::is_same_v>) { - work->outputs_ = std::make_shared>( - std::move(outputData)); - } else if constexpr (std::is_same_v< - T, - std::vector>>) { - std::vector flattened; - for (auto& vec : outputData) { - flattened.insert(flattened.end(), vec.begin(), vec.end()); - } - work->outputs_ = - std::make_shared>(std::move(flattened)); - } - }, - outputs); - } + work->outputs_ = std::make_shared>(outputs); pre(stream, work); From 4f4ecf476317fc0a12b768dc77f7b109ba01020e Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 20 Sep 2024 07:34:29 +0000 Subject: [PATCH 70/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 32030f45e73ae3..e973ce110ab0bc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -4,6 +4,7 @@ #include #ifdef USE_C10D_XCCL +#include #include #include #include @@ -459,6 +460,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->outputs_ = std::make_shared>(outputs); + at::xpu::OptionalXPUGuard gpuGuard(device); + pre(stream, work); for (const auto& input : inputs) { @@ -470,7 +473,9 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( post(stream, work); - work->xcclEndEvent_->record(stream); + if (!coalescing_state_) { + work->xcclEndEvent_->record(stream); + } std::vector streams = {stream.unwrap()}; c10::MultiStreamGuard streamGuard(streams); @@ -550,6 +555,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( work->outputs_ = std::make_shared>(outputs); + at::xpu::OptionalXPUGuard gpuGuard(device); + { AutoXcclGroup xccl_group_guard; for (const auto i : c10::irange(inputs.size())) { From b6bc4a82376d23ccaf9ad951ff3a016677bfc9f8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 20 Sep 2024 08:01:50 +0000 Subject: [PATCH 71/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 034f201dc75bd7..59cd218b7f1c25 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -733,6 +733,10 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( return ret_evt; } }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, OpType::GATHER); } @@ -834,6 +838,10 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( return ret_evt; } }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, OpType::SCATTER); } @@ -1545,6 +1553,10 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall( stream.synchronize(); return ret_evt; }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, OpType::ALLTOALL); } From 1fbb7edfa494196a90a2445d74aaea7e0966ca7b Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 23 Sep 2024 07:15:42 +0000 Subject: [PATCH 72/96] support p2p --- test/distributed/test_c10d_ops_xccl.py | 89 ++++--- .../distributed/c10d/ProcessGroupXCCL.cpp | 226 +++++++++++++++++- .../distributed/c10d/ProcessGroupXCCL.hpp | 30 ++- 3 files changed, 281 insertions(+), 64 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py 
b/test/distributed/test_c10d_ops_xccl.py index 8cfce2be164d9f..279ec0eb03ecf8 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -758,51 +758,50 @@ def allreduce(tensors): torch.tensor([(j + 1) * self.world_size]), tensors_list[i - 1][j] ) - # TODO: wait send/recv - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_send_recv(self): - # pg = self.pg - # device = self.rank_to_GPU[self.rank][0] - - # # Generate the same random tensor - # torch.manual_seed(0) - # send_tensor = torch.rand(10, 10, device=device) - # if self.rank == 0: - # dist.send(send_tensor, 1) - # if self.rank == 1: - # recv_tensor = torch.rand(10, 10, device=device) - # dist.recv(recv_tensor, 0) - # self.assertEqual(send_tensor, recv_tensor) - - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_send_recv_complex(self): - # pg = self.pg - # device = self.rank_to_GPU[self.rank][0] - - # # Generate the same random tensor - # torch.manual_seed(0) - # send_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) - # if self.rank == 0: - # dist.send(send_tensor, 1) - # if self.rank == 1: - # recv_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) - # dist.recv(recv_tensor, 0) - # self.assertEqual(send_tensor, recv_tensor) - - # @requires_xccl() - # @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") - # def test_send_recv_object_list(self): - # device = self.rank_to_GPU[self.rank][0] - - # val = 99 if self.rank == 0 else None - # object_list = [val] * self.world_size - # if self.rank == 0: - # dist.send_object_list(object_list, 1, device=device) - # if self.rank == 1: - # dist.recv_object_list(object_list, 0, device=device) - # self.assertEqual(object_list[0], 99) + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_send_recv(self): + pg = self.pg + device = self.rank_to_GPU[self.rank][0] + + # Generate the same random tensor + torch.manual_seed(0) + send_tensor = torch.rand(10, 10, device=device) + if self.rank == 0: + dist.send(send_tensor, 1) + if self.rank == 1: + recv_tensor = torch.rand(10, 10, device=device) + dist.recv(recv_tensor, 0) + self.assertEqual(send_tensor, recv_tensor) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_send_recv_complex(self): + pg = self.pg + device = self.rank_to_GPU[self.rank][0] + + # Generate the same random tensor + torch.manual_seed(0) + send_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) + if self.rank == 0: + dist.send(send_tensor, 1) + if self.rank == 1: + recv_tensor = torch.rand(10, 10, dtype=torch.cfloat, device=device) + dist.recv(recv_tensor, 0) + self.assertEqual(send_tensor, recv_tensor) + + @requires_xccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") + def test_send_recv_object_list(self): + device = self.rank_to_GPU[self.rank][0] + + val = 99 if self.rank == 0 else None + object_list = [val] * self.world_size + if self.rank == 0: + dist.send_object_list(object_list, 1, device=device) + if self.rank == 1: + dist.recv_object_list(object_list, 0, device=device) + self.assertEqual(object_list[0], 99) if __name__ == "__main__": diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 59cd218b7f1c25..27a11506f5e309 
100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -228,6 +228,14 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { } } +void syncStream( + at::Device& device, + at::xpu::XPUEvent& xcclEvent, + at::xpu::XPUStream& xcclStream) { + xcclEvent.record(at::xpu::getCurrentXPUStream(device.index())); + xcclEvent.block(xcclStream); +} + bool complexViewAsRealAllowed(const ReduceOp reduceOp) { switch (reduceOp) { case ReduceOp::SUM: @@ -245,9 +253,6 @@ bool complexViewAsRealAllowed(const ReduceOp reduceOp) { static std::mutex xcclCommDevIdxMapMutex; static std::unordered_map, int> xcclCommDevIdxMap; constexpr int64_t kSynchronizeBusyWaitMillis = 10; - -// Before implementing send/recv, the xcclActiveGroupCounter_ variable has no -// effect. thread_local uint64_t ProcessGroupXCCL::xcclActiveGroupCounter_ = 0; ProcessGroupXCCL::WorkXCCL::WorkXCCL( @@ -369,7 +374,10 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, - at::Device& device) { + at::Device& device, + OpType opType, + int p2pRank, + bool isSendRecvSelf) { if (deviceKey.empty()) { C10_THROW_ERROR( DistBackendError, @@ -387,12 +395,29 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( } std::shared_ptr XCCLComm; - XCCL_KVS kvs = get_kvs(rank_, *store_); + bool batchP2P = xcclActiveGroupCounter_ > 0; + bool singleP2POp = isP2POp(opType, batchP2P); + + at::xpu::OptionalXPUGuard gpuGuard(device); + + for (const auto i : c10::irange(xcclActiveGroupCounter_)) { + (void)i; + ccl::group_end(); + } + int numRanks, rank; - numRanks = getSize(); - rank = getRank(); + if (!singleP2POp) { + numRanks = getSize(); + rank = getRank(); + } else if (isSendRecvSelf) { + numRanks = 1; + rank = 0; + } else { + numRanks = 2; + rank = p2pRank; + } c10::impl::VirtualGuardImpl impl(device.type()); c10::Stream stream = impl.getStream(device); @@ -410,7 +435,13 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( inInitializationCommMap_.emplace(deviceKey, XCCLComm); } + for (const auto i : c10::irange(xcclActiveGroupCounter_)) { + (void)i; + ccl::group_start(); + } + xcclStreams_.emplace(deviceKey, std::move(stream)); + xcclEvents_.emplace(deviceKey, at::xpu::XPUEvent()); auto it = inInitializationCommMap_.find(deviceKey); if (it != inInitializationCommMap_.end()) { @@ -440,7 +471,7 @@ void ProcessGroupXCCL::groupEnd() { } // TODO: wait p2p enable -static constexpr int CoalActive = 0x01, CoalColl = 0x02; +static constexpr int CoalActive = 0x01, CoalColl = 0x02, CoalP2P = 0x04; void ProcessGroupXCCL::startCoalescing() { coalescedDevice_.set_index(-1); coalescedComm_ = nullptr; @@ -496,7 +527,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( auto device = inputs[0].device(); const auto key = std::to_string(device.index()); - auto comm = getXCCLComm(key, device); + auto comm = getXCCLComm(key, device, opType); if (coalescing_state_ & CoalActive) { coalescing_state_ |= CoalColl; @@ -514,9 +545,9 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( } auto stream = xcclStreams_.at(key); + syncStream(device, xcclEvents_[key], stream); c10::intrusive_ptr work; - work = initWork(device, rank_, opType); work->outputs_ = std::make_shared>(outputs); @@ -591,7 +622,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( auto device = inputs[0].device(); const auto key = std::to_string(device.index()); - auto comm = getXCCLComm(key, device); + auto comm = getXCCLComm(key, device, opType); if 
(coalescing_state_ & CoalActive) { coalescing_state_ |= CoalColl; @@ -609,9 +640,9 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( } auto stream = xcclStreams_.at(key); + syncStream(device, xcclEvents_[key], stream); c10::intrusive_ptr work; - work = initWork(device, rank_, opType); work->outputs_ = std::make_shared>(outputs); @@ -640,6 +671,177 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( return work; } +template +c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( + at::Tensor& tensor, + Fn fn, + int peer, + OpType opType, + PreProcess pre, + PostProcess post) { + using traits = function_traits; + using attr_t = typename traits::template arg<1>::type; + attr_t attr = ccl::create_operation_attr(); + + auto device = tensor.device(); + std::string key; + int p2pRank = 0, p2pTargetRank = 0; + bool isSendRecvSelf = false; + + bool batchP2P = xcclActiveGroupCounter_ > 0; + if (batchP2P) { + key = std::to_string(device.index()); + p2pRank = rank_; + p2pTargetRank = peer; + } else { + int lowRank = rank_ < peer ? rank_ : peer; + int highRank = rank_ < peer ? peer : rank_; + key = std::to_string(lowRank) + ":" + std::to_string(highRank); + p2pRank = rank_ <= peer ? 0 : 1; + isSendRecvSelf = rank_ == peer; + p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank; + } + + auto comm = getXCCLComm(key, device, opType, p2pRank, isSendRecvSelf); + + if (coalescing_state_ & CoalActive) { + coalescing_state_ |= CoalP2P; + if (coalescedDevice_.index() < 0) { + coalescedDevice_ = device; + } else { + TORCH_CHECK( + coalescedDevice_.index() == device.index(), MULTI_DEVICE_ERROR_MSG); + } + if (coalescedComm_ == nullptr) { + coalescedComm_ = comm; + } else { + TORCH_CHECK(coalescedComm_ == comm, MULTI_DEVICE_ERROR_MSG); + } + } + + auto stream = xcclStreams_.at(key); + syncStream(device, xcclEvents_[key], stream); + + c10::intrusive_ptr work; + if (!coalescing_state_) { + work = initWork(device, rank_, opType); + work->outputs_ = std::make_shared>(); + work->outputs_->push_back(tensor); + } + + at::xpu::OptionalXPUGuard gpuGuard(device); + + if (!coalescing_state_) { + pre(stream, work); + } + + c10::xpu::XPUCachingAllocator::recordStream( + tensor.storage().data_ptr(), stream); + + fn(tensor, attr, *comm, stream, p2pTargetRank); + + if (!coalescing_state_) { + post(stream); + + work->xcclEndEvent_->record(stream); + work->blockingWait_ = blockingWait_; + + { + std::vector streams = {stream.unwrap()}; + c10::MultiStreamGuard streamGuard(streams); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + work->future_->markCompleted(at::IValue(*work->outputs_)); + } + return work; + } else { + return nullptr; + } +} + +template +c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( + at::Tensor& tensor, + Fn fn, + int peer, + OpType opType) { + return pointToPoint( + tensor, + fn, + peer, + opType, + [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + }, + [](at::xpu::XPUStream&) {}); +} + +c10::intrusive_ptr ProcessGroupXCCL::send( + std::vector& tensors, + int dstRank, + int /* unused */) { + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + // @lint-ignore CLANGTIDY + auto tensor = tensors.back(); + check_xpu_single_tensor(tensor, true); + + auto ret = pointToPoint( + tensor, + [&](at::Tensor& input, + ccl::pt2pt_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream, + int dst) { + ccl::event ret_evt; + auto xcclDataType = getXcclDataType(input.scalar_type()); + ret_evt = ccl::send( + input.data_ptr(), 
+ (size_t)input.numel(), + xcclDataType, + dst, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + dstRank, + OpType::SEND); + return ret; +} + +c10::intrusive_ptr ProcessGroupXCCL::recv( + std::vector& tensors, + int srcRank, + int /* unused */) { + TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); + // @lint-ignore CLANGTIDY + auto tensor = tensors.back(); + check_xpu_single_tensor(tensor, true); + + auto ret = pointToPoint( + tensor, + [&](at::Tensor& output, + ccl::pt2pt_attr attr, + xcclComm_t& comm, + at::xpu::XPUStream& stream, + int src) { + ccl::event ret_evt; + auto xcclDataType = getXcclDataType(output.scalar_type()); + ret_evt = ccl::recv( + output.data_ptr(), + (size_t)output.numel(), + xcclDataType, + src, + comm, + ccl::create_stream(stream.queue()), + attr); + return ret_evt; + }, + srcRank, + OpType::RECV); + return ret; +} + c10::intrusive_ptr ProcessGroupXCCL::gather( std::vector>& outputTensors, std::vector& inputTensors, diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index bc69ec992e8649..033396e87fdedc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -155,7 +155,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::shared_ptr getXCCLComm( const std::string& deviceKey, - at::Device& device); + at::Device& device, + OpType opType, + int p2pRank = 0, + bool isSendRecvSelf = false); virtual c10::intrusive_ptr initWork( at::Device& device, @@ -196,6 +199,22 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, OpType opType); + template + c10::intrusive_ptr pointToPoint( + at::Tensor& tensor, + Fn fn, + int peer, + OpType opType); + + template + c10::intrusive_ptr pointToPoint( + at::Tensor& tensor, + Fn fn, + int peer, + OpType opType, + PreProcess pre, + PostProcess post); + c10::intrusive_ptr allreduce_impl( at::Tensor& tensor, const AllreduceOptions& opts = AllreduceOptions()); @@ -282,16 +301,12 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr send( std::vector& tensors, int dstRank, - int tag) override { - TORCH_CHECK(false, "ProcessGroupXCCL::send not implemented"); - } + int tag) override; c10::intrusive_ptr recv( std::vector& tensors, int srcRank, - int tag) override { - TORCH_CHECK(false, "ProcessGroupXCCL::recv not implemented"); - } + int tag) override; void groupStart(); @@ -309,6 +324,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { protected: std::unordered_map xcclStreams_; + std::unordered_map xcclEvents_; std::unordered_map> inInitializationCommMap_; std::unordered_map> devXCCLCommMap_; From 88bea257af5931dc9e083f1cf7027e802d587f8d Mon Sep 17 00:00:00 2001 From: hanchao Date: Sun, 29 Sep 2024 09:05:17 +0000 Subject: [PATCH 73/96] refine findccl code --- cmake/Modules/FindXCCL.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake index 56b7fc0f7dcf32..a717ad1dafc653 100644 --- a/cmake/Modules/FindXCCL.cmake +++ b/cmake/Modules/FindXCCL.cmake @@ -11,10 +11,10 @@ if(DEFINED ENV{CCL_ROOT}) set(XCCL_ROOT $ENV{CCL_ROOT}) endif() -string(COMPARE EQUAL "${XCCL_ROOT}" "" nosyclfound) -if(nosyclfound) +string(COMPARE EQUAL "${XCCL_ROOT}" "" nocclfound) +if(nocclfound) set(XCCL_FOUND False) - set(XCCL_REASON_FAILURE "XCCL library not set!!") + set(XCCL_REASON_FAILURE "OneCCL library not found!!") set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}") return() 
 endif()
@@ -55,7 +55,7 @@ find_library(
 if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY))
     set(XCCL_FOUND False)
-    set(XCCL_REASON_FAILURE "XCCL library is incomplete!!")
+    set(XCCL_REASON_FAILURE "OneCCL library not found!!")
     set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
     return()
 endif()

From f6ea93450c1b8bd10b709ee5b4076ac25c6413a2 Mon Sep 17 00:00:00 2001
From: hanchao
Date: Mon, 30 Sep 2024 04:41:54 +0000
Subject: [PATCH 74/96] Add comments for build xccl

---
 caffe2/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 2160399a3ea296..54ec7db0cad87e 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1013,6 +1013,10 @@ elseif(USE_CUDA)
 endif()
 if(USE_XPU)
+  # If the SYCL runtime and the oneCCL runtime are both installed on the system,
+  # the build flags default to USE_XPU=ON, USE_XCCL=ON and USE_C10D_XCCL=ON,
+  # and the XCCL backend will be built into libtorch_xpu.
+  # Manually set `USE_XCCL=OFF` to disable building the XCCL backend.
   if(USE_XCCL)
     append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS)
   endif()

From 31d092d72303b08dcdfa0b2fd8b4e4ae45d3dffd Mon Sep 17 00:00:00 2001
From: hanchao
Date: Wed, 9 Oct 2024 08:18:35 +0000
Subject: [PATCH 75/96] minor fix

---
 caffe2/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 54ec7db0cad87e..16a8834225915c 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1085,7 +1085,7 @@ if(USE_XPU)
     include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS})
   endif()
-  if(USE_XCCL)
+  if(USE_C10D_XCCL)
     target_link_libraries(torch_xpu PRIVATE torch::xccl)
     target_compile_definitions(torch_xpu PRIVATE USE_XCCL)
   endif()

From cbea299190dcc0e90796fac38cb7b1adb2a34e1a Mon Sep 17 00:00:00 2001
From: hanchao
Date: Wed, 9 Oct 2024 10:31:25 +0000
Subject: [PATCH 76/96] rm duplicate code and refine cmake

---
 CMakeLists.txt            | 2 --
 caffe2/CMakeLists.txt     | 4 ++--
 cmake/Dependencies.cmake  | 1 -
 cmake/External/xccl.cmake | 3 +++
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0318fcb4d1ec04..60fc8aae14173b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -369,8 +369,6 @@ cmake_dependent_option(
   USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
 cmake_dependent_option(
   USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF)
-cmake_dependent_option(
-  USE_C10D_XCCL "USE C10D XCCL" ON "USE_DISTRIBUTED;USE_XCCL" OFF)
 cmake_dependent_option(
   USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
 cmake_dependent_option(
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 16a8834225915c..b4ec018019f165 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1085,7 +1085,7 @@ if(USE_XPU)
     include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS})
   endif()
-  if(USE_C10D_XCCL)
+  if(USE_XCCL)
     target_link_libraries(torch_xpu PRIVATE torch::xccl)
     target_compile_definitions(torch_xpu PRIVATE USE_XCCL)
   endif()
@@ -1374,7 +1374,7 @@ if(USE_DISTRIBUTED)
       target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
     endif()
   endif()
-  if(USE_C10D_XCCL)
+  if(USE_XPU AND USE_C10D_XCCL)
     target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL)
     set_source_files_properties(
       ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 3e59b813d31381..ee38f19773af81 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1163,7
+1163,6 @@ if(USE_XCCL) caffe2_update_option(USE_XCCL OFF) else() include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake) - list(APPEND Caffe2_XPU_DEPENDENCY_LIBS torch::xccl) endif() endif() diff --git a/cmake/External/xccl.cmake b/cmake/External/xccl.cmake index 56205b381b1324..467bb830e0b6cf 100644 --- a/cmake/External/xccl.cmake +++ b/cmake/External/xccl.cmake @@ -12,6 +12,9 @@ if(NOT __XCCL_INCLUDED) set_property( TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES ${XCCL_LIBRARY}) + else() + set(USE_XCCL OFF) + set(USE_C10D_XCCL OFF) endif() endif() endif() From ef261c6f3de6d9cd25c12dbf149fca83482996e2 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 10 Oct 2024 00:30:33 +0000 Subject: [PATCH 77/96] update cmake --- cmake/Dependencies.cmake | 3 +++ cmake/External/xccl.cmake | 25 ++++++++++--------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ee38f19773af81..f90846e89c7549 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1163,6 +1163,9 @@ if(USE_XCCL) caffe2_update_option(USE_XCCL OFF) else() include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake) + if(NOT XCCL_FOUND) + caffe2_update_option(USE_XCCL OFF) + endif() endif() endif() diff --git a/cmake/External/xccl.cmake b/cmake/External/xccl.cmake index 467bb830e0b6cf..acb7cee87593e1 100644 --- a/cmake/External/xccl.cmake +++ b/cmake/External/xccl.cmake @@ -1,20 +1,15 @@ if(NOT __XCCL_INCLUDED) set(__XCCL_INCLUDED TRUE) - if(USE_XCCL) - # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake. - find_package(XCCL REQUIRED) - if(XCCL_FOUND) - add_library(torch::xccl INTERFACE IMPORTED) - set_property( - TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES - ${XCCL_INCLUDE_DIR}) - set_property( - TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES - ${XCCL_LIBRARY}) - else() - set(USE_XCCL OFF) - set(USE_C10D_XCCL OFF) - endif() + # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake. 
+ find_package(XCCL REQUIRED) + if(XCCL_FOUND) + add_library(torch::xccl INTERFACE IMPORTED) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${XCCL_INCLUDE_DIR}) + set_property( + TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES + ${XCCL_LIBRARY}) endif() endif() From 6c648cdbc1260f55256eebff0e0a0d6981b66694 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 24 Sep 2024 05:19:22 +0000 Subject: [PATCH 78/96] hidden xccl specific --- .../distributed/c10d/ProcessGroupXCCL.cpp | 104 +++--------------- .../distributed/c10d/ProcessGroupXCCL.hpp | 103 ++++++++++------- torch/csrc/distributed/c10d/Utils.hpp | 25 +++++ 3 files changed, 106 insertions(+), 126 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 5aeeb62bee1ece..d26d25ae03e39a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,11 +1,9 @@ +#ifdef USE_C10D_XCCL + #include #include -#include -#include - -#ifdef USE_C10D_XCCL -#include #include +#include #include #include #include @@ -13,15 +11,7 @@ #include #include -#include -#include -#include #include -#include -#include -#include -#include -#include namespace c10d { @@ -45,36 +35,6 @@ std::map xcclDatatypes = { {at::kBool, ccl::datatype::uint8}, }; -XCCL_KVS kvs; -std::mutex kvs_mutex; - -XCCL_KVS get_kvs(int rank, c10d::Store& store) { - std::lock_guard lock(kvs_mutex); - if (kvs) - return kvs; - std::string storeKey = "xccl_kvs"; - - // Rank 0 broadcast the bootstrap network information to other ranks - if (rank == 0) { - kvs = ccl::create_main_kvs(); - ccl::kvs::address_type main_addr = kvs->get_address(); - auto ccl_kvs_addr = - std::vector(main_addr.begin(), main_addr.end()); - store.set(storeKey, ccl_kvs_addr); - } else { - auto ccl_kvs_addr = store.get(storeKey); - if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { - throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); - } - ccl::kvs::address_type main_addr; - std::copy_n( - ccl_kvs_addr.begin(), ccl::kvs::address_max_size, main_addr.begin()); - kvs = ccl::create_kvs(main_addr); - } - - return kvs; -} - void check_xpu_single_tensor(const at::Tensor& tensor) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); @@ -106,23 +66,9 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { } return xcclOps.at(reduceOp); } catch (const std::out_of_range&) { - switch (reduceOp) { - case ReduceOp::AVG: - C10_THROW_ERROR(ValueError, "Cannot use ReduceOp AVG with XCCL"); - break; - case ReduceOp::BAND: - C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BAND with XCCL"); - break; - case ReduceOp::BOR: - C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BOR with XCCL"); - break; - case ReduceOp::BXOR: - C10_THROW_ERROR(ValueError, "Cannot use ReduceOp.BXOR with XCCL"); - break; - default: - C10_THROW_ERROR(ValueError, "Unhandled ReduceOp"); - break; - } + C10_THROW_ERROR( + ValueError, + "Cannot use ReduceOp." 
+ reduce_op_to_string(reduceOp) + " with XCCL"); } } @@ -153,20 +99,6 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; -bool ProcessGroupXCCL::WorkXCCL::checkTimeout( - std::optional timeout) { - auto currentTimepoint = std::chrono::steady_clock::now(); - auto timeElapsed = std::chrono::duration_cast( - currentTimepoint - workStartTime_); - std::chrono::milliseconds opTimeout = std::chrono::milliseconds(60000); - - auto workTimeout = timeout ? *timeout : opTimeout; - - if (timeElapsed < workTimeout) - return false; - return true; -} - bool ProcessGroupXCCL::WorkXCCL::isCompleted() { if (xcclEndEvent_ && xcclEndEvent_->query()) { return true; @@ -178,23 +110,23 @@ void ProcessGroupXCCL::WorkXCCL::synchronize() { synchronizeInternal(kNoTimeout); } -void ProcessGroupXCCL::WorkXCCL::synchronizeStream() { - auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); - // Block the current stream on the XCCL stream - xcclEndEvent_->block(currentStream); -} - void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( std::chrono::milliseconds timeout) { - synchronizeStream(); - + auto currentStream = at::xpu::getCurrentXPUStream(device_.index()); + xcclEndEvent_->block(currentStream); if (blockingWait_) { while (!isCompleted()) { - bool timedOut = checkTimeout( - timeout == kNoTimeout ? std::nullopt : std::make_optional(timeout)); - if (timedOut) { - break; + auto currentTimepoint = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast( + currentTimepoint - workStartTime_); + if (timeElapsed >= timeout) { + std::string exceptionMsg = c10::str( + "Work ran for ", + timeElapsed.count(), + " milliseconds before timing out."); + TORCH_CHECK(false, exceptionMsg) } + std::this_thread::sleep_for( std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 14a9f398a8cbe7..99b815f2138b4e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -28,43 +28,8 @@ #include #include #include -#include namespace c10d { -namespace { -int getXCCLEnvVar(std::string envVarName) { - char* stringValue = std::getenv(envVarName.c_str()); - if (stringValue != nullptr) { - try { - int val = std::stoi(stringValue); - return val; - } catch (std::exception& e) { - TORCH_CHECK( - false, - "Invalid value for environment variable: " + std::string(envVarName)); - } - } else { - return -1; - } -} - -template -void setXCCLEnvVar(const std::string& envVarName, T val) { - if constexpr (std::is_same_v) { - setenv(envVarName.c_str(), std::to_string(val).c_str(), 1); - } else if constexpr (std::is_same_v) { - setenv(envVarName.c_str(), val.c_str(), 1); - } -} - -bool with_mpirun() { - return (getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || - getenv("PMI_RANK") || getenv("PMI_SIZE") || getenv("PMIX_RANK")) - ? 
true - : false; -} -} // namespace - static std::vector TORCH_XCCL_BLOCKING_WAIT = { "TORCH_XCCL_BLOCKING_WAIT", "XCCL_BLOCKING_WAIT"}; @@ -98,8 +63,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { void synchronize() override; - void synchronizeStream(); - bool wait(std::chrono::milliseconds timeout = kNoTimeout) override; c10::intrusive_ptr getFuture() override { @@ -110,9 +73,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); } - bool checkTimeout( - std::optional timeout = std::nullopt); - protected: at::Device device_; std::shared_ptr xcclEndEvent_; @@ -302,7 +262,70 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr store_; std::mutex mutex_; bool blockingWait_ = false; + + private: + XCCL_KVS kvs; + std::mutex kvs_mutex; + XCCL_KVS get_kvs(int rank, c10d::Store& store) { + std::lock_guard lock(kvs_mutex); + if (kvs) + return kvs; + std::string storeKey = "xccl_kvs"; + // Rank 0 broadcast the bootstrap network information to other ranks + if (rank == 0) { + kvs = ccl::create_main_kvs(); + ccl::kvs::address_type main_addr = kvs->get_address(); + auto ccl_kvs_addr = + std::vector(main_addr.begin(), main_addr.end()); + store.set(storeKey, ccl_kvs_addr); + } else { + auto ccl_kvs_addr = store.get(storeKey); + if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { + throw std::runtime_error("Unexpected ccl kvs addr from the store\n"); + } + ccl::kvs::address_type main_addr; + std::copy_n( + ccl_kvs_addr.begin(), ccl::kvs::address_max_size, main_addr.begin()); + kvs = ccl::create_kvs(main_addr); + } + return kvs; + } }; + +namespace { +int getXCCLEnvVar(std::string envVarName) { + char* stringValue = std::getenv(envVarName.c_str()); + if (stringValue != nullptr) { + try { + int val = std::stoi(stringValue); + return val; + } catch (std::exception& e) { + TORCH_CHECK( + false, + "Invalid value for environment variable: " + std::string(envVarName)); + } + } else { + return -1; + } +} + +template +void setXCCLEnvVar(const std::string& envVarName, T val) { + if constexpr (std::is_same_v) { + setenv(envVarName.c_str(), std::to_string(val).c_str(), 1); + } else if constexpr (std::is_same_v) { + setenv(envVarName.c_str(), val.c_str(), 1); + } +} + +bool with_mpirun() { + return (getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || + getenv("PMI_RANK") || getenv("PMI_SIZE") || getenv("PMIX_RANK")) + ? 
true + : false; +} + +} // namespace } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp index ea4a4653bc35fc..73e37e0437c459 100644 --- a/torch/csrc/distributed/c10d/Utils.hpp +++ b/torch/csrc/distributed/c10d/Utils.hpp @@ -557,6 +557,31 @@ size_t computeLengthsAndOffsets( return offset; } +inline std::string reduce_op_to_string(c10d::ReduceOp op) { + switch (op) { + case c10d::ReduceOp::SUM: + return "SUM"; + case c10d::ReduceOp::PRODUCT: + return "PRODUCT"; + case c10d::ReduceOp::MIN: + return "MIN"; + case c10d::ReduceOp::MAX: + return "MAX"; + case c10d::ReduceOp::BAND: + return "BAND"; + case c10d::ReduceOp::BOR: + return "BOR"; + case c10d::ReduceOp::BXOR: + return "BXOR"; + case c10d::ReduceOp::AVG: + return "AVG"; + case c10d::ReduceOp::PREMUL_SUM: + return "PREMUL_SUM"; + default: + return "UNKNOWN"; + } +} + using RankType = uint32_t; using SizeType = uint64_t; From e621fe6010382c3c4e614df4ace6a861f598442d Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 11 Oct 2024 01:55:18 +0000 Subject: [PATCH 79/96] fix ci fail --- test/distributed/test_c10d_common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index d96abb1ca82675..903df26bba9f6f 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -1836,6 +1836,9 @@ def test_init_process_group_for_all_backends(self): elif backend == dist.Backend.UCC: if not dist.is_ucc_available(): continue + elif backend == dist.Backend.XCCL: + if not dist.is_xccl_available(): + continue # Multi-threaded PG is defined as a pure python class. # Its pg.name() does not going through Pybind, so its backend name # is still "threaded" instead of "custom". 
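For context, the backend check added above ends up exercising ordinary c10d usage of the new backend. A minimal usage sketch (not part of the patch series), assuming the backend is registered under the name "xccl", an XPU device is visible, and a single-process env:// rendezvous:

import os
import torch
import torch.distributed as dist

# Placeholder rendezvous settings for a single-process run.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

dist.init_process_group(backend="xccl", rank=0, world_size=1)
t = torch.ones(8, device="xpu")
dist.all_reduce(t)  # routed to ProcessGroupXCCL::allreduce
dist.destroy_process_group()
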
From f85a8451003bf14b6bc72ddd7799c7bc239bd8b4 Mon Sep 17 00:00:00 2001 From: hanchao Date: Sat, 12 Oct 2024 02:35:00 +0000 Subject: [PATCH 80/96] rm ccl attr --- .../distributed/c10d/ProcessGroupXCCL.cpp | 78 ++++--------------- 1 file changed, 17 insertions(+), 61 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 3a795c817bec22..365640d1377781 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,9 +1,9 @@ #ifdef USE_C10D_XCCL -#include -#include #include +#include #include +#include #include #include #include @@ -11,7 +11,6 @@ #include #include -#include #include #include @@ -454,10 +453,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( PreProcess pre, PostProcess post, OpType opType) { - using traits = function_traits; - using attr_t = typename traits::template arg<2>::type; - attr_t attr = ccl::create_operation_attr(); - auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); @@ -494,7 +489,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( input.storage().data_ptr(), stream); } - fn(inputs[0], outputs[0], attr, *comm, stream); + fn(inputs[0], outputs[0], *comm, stream); post(stream, work); @@ -549,10 +544,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( std::vector& outputs, Fn fn, OpType opType) { - using traits = function_traits; - using attr_t = typename traits::template arg<2>::type; - attr_t attr = ccl::create_operation_attr(); - auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); @@ -587,7 +578,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( inputs[i].storage().data_ptr(), stream); - fn(inputs[i], outputs[i], attr, *comm, stream); + fn(inputs[i], outputs[i], *comm, stream); } } @@ -612,10 +603,6 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( OpType opType, PreProcess pre, PostProcess post) { - using traits = function_traits; - using attr_t = typename traits::template arg<1>::type; - attr_t attr = ccl::create_operation_attr(); - auto device = tensor.device(); std::string key; int p2pRank = 0, p2pTargetRank = 0; @@ -671,7 +658,7 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( c10::xpu::XPUCachingAllocator::recordStream( tensor.storage().data_ptr(), stream); - fn(tensor, attr, *comm, stream, p2pTargetRank); + fn(tensor, *comm, stream, p2pTargetRank); if (!coalescing_state_) { post(stream); @@ -721,7 +708,6 @@ c10::intrusive_ptr ProcessGroupXCCL::send( auto ret = pointToPoint( tensor, [&](at::Tensor& input, - ccl::pt2pt_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream, int dst) { @@ -733,8 +719,7 @@ c10::intrusive_ptr ProcessGroupXCCL::send( xcclDataType, dst, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, dstRank, @@ -754,7 +739,6 @@ c10::intrusive_ptr ProcessGroupXCCL::recv( auto ret = pointToPoint( tensor, [&](at::Tensor& output, - ccl::pt2pt_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream, int src) { @@ -766,8 +750,7 @@ c10::intrusive_ptr ProcessGroupXCCL::recv( xcclDataType, src, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, srcRank, @@ -826,7 +809,6 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( 
outputs, // just to fit the collective interface [&](at::Tensor& /* unused */, at::Tensor& /* unused */, - ccl::allgather_attr attr, // just to fit interface xcclComm_t& comm, at::xpu::XPUStream& stream) { const auto root = opts.rootRank; @@ -928,7 +910,6 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( inputs, // just to fit the collective interface [&](at::Tensor& /* unused */, at::Tensor& /* unused */, - ccl::allgather_attr attr, // just to fit interface xcclComm_t& comm, at::xpu::XPUStream& stream) { if (getRank() == root) { @@ -988,7 +969,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( tensor, [&](at::Tensor& input, at::Tensor& output, - ccl::allreduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); @@ -1001,8 +981,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( xcclDataType, xcclReduceOp, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::ALLREDUCE); @@ -1042,7 +1021,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( tensors, [&](at::Tensor& input, at::Tensor& output, - ccl::allreduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); @@ -1055,8 +1033,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( xcclDataType, xcclReduceOp, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::COALESCED); @@ -1079,7 +1056,6 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( tensor, [&](at::Tensor& input, at::Tensor& output, - ccl::broadcast_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); @@ -1090,8 +1066,7 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( xcclDataType, root, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::BROADCAST); @@ -1112,7 +1087,6 @@ c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::broadcast_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); @@ -1123,8 +1097,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( xcclDataType, root, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::BROADCAST); @@ -1151,7 +1124,6 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce( tensor, [&](at::Tensor& input, at::Tensor& output, - ccl::reduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { const int root = opts.rootRank + opts.rootTensor; @@ -1186,7 +1158,6 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::reduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { const int root = opts.rootRank + opts.rootTensor; @@ -1228,7 +1199,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather( outputFlattened, [&](at::Tensor& input, at::Tensor& output, - ccl::allgather_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1242,8 +1212,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather( (size_t)input.numel(), xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, [](at::xpu::XPUStream&, @@ -1297,7 +1266,6 @@ c10::intrusive_ptr 
ProcessGroupXCCL::_allgather_base( output_tensor, [&](at::Tensor& input, at::Tensor& output, - ccl::allgather_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1310,8 +1278,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( (size_t)input.numel(), xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::_ALLGATHER_BASE); @@ -1326,7 +1293,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( outputs, [&](at::Tensor& input, at::Tensor& output, - ccl::allgather_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); @@ -1337,8 +1303,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( (size_t)input.numel(), xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::COALESCED); @@ -1367,7 +1332,6 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::reduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1442,7 +1406,6 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::reduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1475,7 +1438,6 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( outputs, [&](at::Tensor& input, at::Tensor& output, - ccl::reduce_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1550,7 +1512,6 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::alltoall_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( @@ -1563,8 +1524,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( (size_t)output.numel() / comm.size(), xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::ALLTOALL_BASE); @@ -1577,7 +1537,6 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( outputTensor, [&](at::Tensor& input, at::Tensor& output, - ccl::alltoallv_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { std::vector sendCounts(size_); @@ -1608,8 +1567,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( recvCounts, xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); return ret_evt; }, OpType::ALLTOALL_BASE); @@ -1635,7 +1593,6 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall( outputTensors, [&](at::Tensor& /* unused */, at::Tensor& /* unused */, - ccl::alltoallv_attr attr, xcclComm_t& comm, at::xpu::XPUStream& stream) { c10::OptionalStreamGuard stream_guard(stream.unwrap()); @@ -1671,8 +1628,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall( recvCounts, xcclDataType, comm, - ccl::create_stream(stream.queue()), - attr); + ccl::create_stream(stream.queue())); if (!isOutputFlat) { ret_evt.wait(); From 56a5e7ff6ca99025855e8da554967f6362287ba5 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 17 Oct 2024 00:25:04 +0000 Subject: [PATCH 81/96] Refine specific code --- cmake/Modules/FindXCCL.cmake | 7 +- .../distributed/c10d/ProcessGroupXCCL.cpp | 209 ++++++++---------- 
.../distributed/c10d/ProcessGroupXCCL.hpp | 194 ++-------------- torch/csrc/distributed/c10d/Utils.hpp | 2 +- 4 files changed, 115 insertions(+), 297 deletions(-) diff --git a/cmake/Modules/FindXCCL.cmake b/cmake/Modules/FindXCCL.cmake index a717ad1dafc653..18f7ac642d54e9 100644 --- a/cmake/Modules/FindXCCL.cmake +++ b/cmake/Modules/FindXCCL.cmake @@ -6,9 +6,10 @@ include(FindPackageHandleStandardArgs) -set(XCCL_ROOT "") -if(DEFINED ENV{CCL_ROOT}) - set(XCCL_ROOT $ENV{CCL_ROOT}) +set(XCCL_ROOT "/opt/intel/oneapi/ccl/latest") +if (NOT EXISTS "${XCCL_ROOT}") + message(STATUS "Default OneCCL not found, using current environment OneAPI") + set(XCCL_ROOT $ENV{ONEAPI_ROOT}/ccl/latest) endif() string(COMPARE EQUAL "${XCCL_ROOT}" "" nocclfound) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index d26d25ae03e39a..ef007825a118ed 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,5 +1,6 @@ #ifdef USE_C10D_XCCL +#include #include #include #include @@ -9,21 +10,20 @@ #include #include -#include #include #include namespace c10d { namespace { -std::map xcclOps = { +const std::map xcclOps = { {ReduceOp::MIN, ccl::reduction::min}, {ReduceOp::MAX, ccl::reduction::max}, {ReduceOp::SUM, ccl::reduction::sum}, {ReduceOp::PRODUCT, ccl::reduction::prod}, }; -std::map xcclDatatypes = { +const std::map xcclDatatypes = { {at::kByte, ccl::datatype::uint8}, {at::kChar, ccl::datatype::int8}, {at::kInt, ccl::datatype::int32}, @@ -35,16 +35,22 @@ std::map xcclDatatypes = { {at::kBool, ccl::datatype::uint8}, }; -void check_xpu_single_tensor(const at::Tensor& tensor) { - if (!tensor.is_xpu() || tensor.is_sparse()) { - C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); - } - if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); +void checkXPUTensor(at::Tensor& tensor) { + if (!tensor.is_xpu() || tensor.is_sparse() || tensor.is_complex()) { + C10_THROW_ERROR( + ValueError, "Tensors must be XPU and dense and non-complex"); + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + } } } -ccl::datatype getXcclDataType(at::ScalarType type) { +ccl::datatype getXcclDataType( + at::ScalarType type, + bool is_reduction_op = false) { + TORCH_CHECK( + !isFloat8Type(type) && is_reduction_op, + "Float8 dtypes are not currenlty supported for XCCL reductions"); auto it = xcclDatatypes.find(type); TORCH_CHECK_WITH( TypeError, @@ -56,26 +62,27 @@ ccl::datatype getXcclDataType(at::ScalarType type) { ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { try { - if (input.scalar_type() == at::kBool) { - if (reduceOp == ReduceOp::SUM) { - // For bool tensors, map sum to max, which both represent a bitwise or. - // This is to prevent overflow issues with sum, since we use uint8 to - // represent a bool (see xcclDatatypes mapping align with cuda). - return ccl::reduction::max; - } + if (input.scalar_type() == at::kBool && reduceOp == ReduceOp::SUM) { + // Map sum to max for bool tensors to avoid overflow issues with sum. + return ccl::reduction::max; } return xcclOps.at(reduceOp); } catch (const std::out_of_range&) { C10_THROW_ERROR( ValueError, - "Cannot use ReduceOp." + reduce_op_to_string(reduceOp) + " with XCCL"); + "Cannot use ReduceOp." 
+ reduceOpToString(reduceOp) + " with XCCL"); } } +void syncStream( + at::Device& device, + at::xpu::XPUEvent& xcclEvent, + at::xpu::XPUStream& xcclStream) { + xcclEvent.record(at::xpu::getCurrentXPUStream(device.index())); + xcclEvent.block(xcclStream); +} } // namespace -static std::mutex xcclCommDevIdxMapMutex; -static std::unordered_map, int> xcclCommDevIdxMap; constexpr int64_t kSynchronizeBusyWaitMillis = 10; ProcessGroupXCCL::WorkXCCL::WorkXCCL( @@ -86,8 +93,7 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( : Work(rank, opType, "profilingTitle", inputs), device_(device), workStartTime_(std::chrono::steady_clock::now()) { - unsigned char enable_timing = 0; - xcclEndEvent_ = std::make_shared(enable_timing); + xcclEndEvent_ = std::make_shared(); } ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) @@ -121,12 +127,9 @@ void ProcessGroupXCCL::WorkXCCL::synchronizeInternal( currentTimepoint - workStartTime_); if (timeElapsed >= timeout) { std::string exceptionMsg = c10::str( - "Work ran for ", - timeElapsed.count(), - " milliseconds before timing out."); + "Work ran time out after ", timeElapsed.count(), " milliseconds."); TORCH_CHECK(false, exceptionMsg) } - std::this_thread::sleep_for( std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); } @@ -145,20 +148,6 @@ ProcessGroupXCCL::ProcessGroupXCCL( : Backend(rank, size), store_(store) { blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); - - // Intel oneCCL requires passing CCL_LOCAL_RANK and CCL_LOCAL_SIZE for non-MPI - // launchers. - if (!with_mpirun()) { - int local_rank = getXCCLEnvVar("LOCAL_RANK"); - int local_world_size = getXCCLEnvVar("LOCAL_WORLD_SIZE"); - if (local_rank == -1 || local_world_size == -1) { - local_rank = rank; - local_world_size = size; - } - setXCCLEnvVar("CCL_PROCESS_LAUNCHER", "none"); - setXCCLEnvVar("CCL_LOCAL_RANK", local_rank); - setXCCLEnvVar("CCL_LOCAL_SIZE", local_world_size); - } } ProcessGroupXCCL::~ProcessGroupXCCL() = default; @@ -177,97 +166,74 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( std::shared_ptr ProcessGroupXCCL::getXCCLComm( const std::string& deviceKey, at::Device& device) { - if (deviceKey.empty()) { - C10_THROW_ERROR( - DistBackendError, - "Not able to create/get the XCCL Communicator since " - "the devices are empty "); - } - + TORCH_CHECK_WITH( + DistBackendError, + !deviceKey.empty(), + "Not able to create/get " + "XCCL Communicator since the devices are empty "); { + // todo: why do we need mutex here? 
std::lock_guard lock(mutex_); if (devXCCLCommMap_.find(deviceKey) != devXCCLCommMap_.end()) { return devXCCLCommMap_[deviceKey]; } } - std::shared_ptr XCCLComm; - - XCCL_KVS kvs = get_kvs(rank_, *store_); - int numRanks, rank; numRanks = getSize(); rank = getRank(); c10::impl::VirtualGuardImpl impl(device.type()); - c10::Stream stream = impl.getStream(device); + c10::Stream stream = + impl.getStreamFromGlobalPool(device, /*isHighPriority=*/false); sycl::queue& q = c10::xpu::XPUStream(stream).queue(); auto ctx = ccl::create_context(q.get_context()); ccl::vector_class> devs_rank; devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, kvs); - XCCLComm = std::make_shared(std::move(comms[0])); + auto xccl_kvs = get_kvs(rank_, *store_); + auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, xccl_kvs); + std::shared_ptr XCCLComm = + std::make_shared(std::move(comms[0])); - { - std::lock_guard lock(mutex_); - inInitializationCommMap_.emplace(deviceKey, XCCLComm); - } + std::lock_guard lock(mutex_); + devXCCLCommMap_.emplace(deviceKey, XCCLComm); + xcclStreamsMap_.emplace(deviceKey, std::move(stream)); + xcclEventsMap_.emplace(deviceKey, at::xpu::XPUEvent()); - xcclStreams_.emplace(deviceKey, std::move(stream)); - - auto it = inInitializationCommMap_.find(deviceKey); - if (it != inInitializationCommMap_.end()) { - devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); - inInitializationCommMap_.erase(deviceKey); - - xcclCommDevIdxMapMutex.lock(); - xcclCommDevIdxMap.emplace(XCCLComm, device.index()); - xcclCommDevIdxMapMutex.unlock(); - } - - it = devXCCLCommMap_.find(deviceKey); - TORCH_INTERNAL_ASSERT( - it != devXCCLCommMap_.end(), "Communicators not populated in cache!"); - - return it->second; + return XCCLComm; } template c10::intrusive_ptr ProcessGroupXCCL::collective( - at::Tensor& input, - at::Tensor& output, + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, OpType opType) { - using traits = function_traits; - using attr_t = typename traits::template arg<2>::type; - attr_t attr = ccl::create_operation_attr(); - - auto device = input.device(); + auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); - auto stream = xcclStreams_.at(key); - std::vector outputs{output}; + auto stream = xcclStreamsMap_.at(key); + syncStream(device, xcclEventsMap_[key], stream); c10::intrusive_ptr work; - work = initWork(device, rank_, opType); - - work->outputs_ = - std::make_shared>(std::move(outputs)); - c10::xpu::XPUCachingAllocator::recordStream( - input.storage().data_ptr(), stream); - - auto ccl_stream = ccl::create_stream(stream.queue()); - - fn(input, output, attr, *comm, ccl_stream); + work->outputs_ = std::make_shared>(outputs); + + at::xpu::OptionalXPUGuard gpuGuard(device); + pre(stream, work); + for (const auto i : c10::irange(inputs.size())) { + c10::xpu::XPUCachingAllocator::recordStream( + inputs[i].storage().data_ptr(), stream); + fn(inputs[i], outputs[i], *comm, stream); + } + post(stream, work); work->xcclEndEvent_->record(stream); - std::vector streams = {stream.unwrap()}; c10::MultiStreamGuard streamGuard(streams); std::vector devices{device}; @@ -279,51 +245,52 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( return work; } -template -c10::intrusive_ptr ProcessGroupXCCL::collective( - at::Tensor& input, - at::Tensor& output, - Fn fn, - OpType opType) { - return collective( - input, - output, 
- fn, - [](at::xpu::XPUStream&, - c10::intrusive_ptr& work) {}, - [](at::xpu::XPUStream&, - c10::intrusive_ptr& work) {}, - opType); -} - c10::intrusive_ptr ProcessGroupXCCL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { TORCH_CHECK( tensors.size() == 1, "Expecting one tensor only but got multiple"); auto tensor = tensors.back(); - check_xpu_single_tensor(tensor); + checkXPUTensor(tensor); + + RECORD_PARAM_COMMS_DATA( + // static_cast( + // this->getSequenceNumberForGroup() + 1), // seq + 1 to match + // collective + 1, + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + rank_, // rank + "allreduce", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + 0, // globalRankStart + 1, // globalRankStride + this->getSize()); // worldSize + return collective( tensor, tensor, [&](at::Tensor& input, at::Tensor& output, - ccl::allreduce_attr attr, xcclComm_t& comm, - ccl::stream& stream) { - auto xcclDataType = getXcclDataType(input.scalar_type()); + at::xpu::XPUStream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::allreduce( + auto ccl_stream = ccl::create_stream(stream.queue()); + ccl::allreduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), xcclDataType, xcclReduceOp, comm, - stream, - attr); - return ret_evt; + ccl_stream); + return; }, OpType::ALLREDUCE); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 99b815f2138b4e..5dc003e3dba6b2 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -35,7 +35,6 @@ static std::vector TORCH_XCCL_BLOCKING_WAIT = { "XCCL_BLOCKING_WAIT"}; using xcclComm_t = ccl::communicator; -using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; class TORCH_API ProcessGroupXCCL : public Backend { @@ -52,11 +51,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { bool isCompleted() override; - bool isSuccess() const override { - TORCH_CHECK( - false, "ProcessGroupXCCL::WorkXCCL::isSuccess not implemented"); - } - void abort() override { TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::abort not implemented"); } @@ -70,7 +64,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { } std::vector result() override { - TORCH_CHECK(false, "ProcessGroupXCCL::WorkXCCL::result not implemented"); + return *outputs_; } protected: @@ -117,12 +111,24 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Tensor& input, at::Tensor& output, Fn fn, - OpType opType); + OpType opType) { + auto inputs = std::vector{input}; + auto outputs = std::vector{output}; + return collective( + inputs, + outputs, + fn, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) {}, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) {}, + opType); + } template c10::intrusive_ptr collective( - at::Tensor& input, - at::Tensor& output, + std::vector& inputs, + std::vector& outputs, Fn fn, PreProcess pre, PostProcess post, @@ -132,141 +138,20 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - c10::intrusive_ptr allreduce_coalesced( - std::vector& tensors, - const AllreduceCoalescedOptions& opts = - 
AllreduceCoalescedOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allreduce_coalesced not implemented"); - } - - c10::intrusive_ptr reduce( - std::vector& tensors, - const ReduceOptions& opts = ReduceOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::reduce not implemented"); - } - - c10::intrusive_ptr broadcast( - std::vector& tensors, - const BroadcastOptions& opts = BroadcastOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::broadcast not implemented"); - } - - c10::intrusive_ptr allgather( - std::vector>& outputTensors, - std::vector& inputTensors, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allgather not implemented"); - } - - c10::intrusive_ptr _allgather_base( - at::Tensor& outputbuffer, - at::Tensor& inputbuffer, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::_allgather_base not implemented"); - } - - c10::intrusive_ptr allgather_coalesced( - std::vector>& outputTensorLists, - std::vector& inputTensors, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::allgather_coalesced not implemented"); - } - - c10::intrusive_ptr allgather_into_tensor_coalesced( - std::vector& outputs, - std::vector& inputs, - const AllgatherOptions& opts = AllgatherOptions()) override { - TORCH_CHECK( - false, - "ProcessGroupXCCL::allgather_into_tensor_coalesced not implemented"); - } - - c10::intrusive_ptr reduce_scatter( - std::vector& outputTensors, - std::vector>& inputTensors, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::reduce_scatter not implemented"); - } - - c10::intrusive_ptr _reduce_scatter_base( - at::Tensor& outputTensor, - at::Tensor& inputTensor, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK( - false, "ProcessGroupXCCL::_reduce_scatter_base not implemented"); - } - - c10::intrusive_ptr reduce_scatter_tensor_coalesced( - std::vector& outputs, - std::vector& inputs, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override { - TORCH_CHECK( - false, - "ProcessGroupXCCL::reduce_scatter_tensor_coalesced not implemented"); - } - - c10::intrusive_ptr barrier( - const BarrierOptions& opts = BarrierOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::barrier not implemented"); - } - - c10::intrusive_ptr alltoall_base( - at::Tensor& outputTensor, - at::Tensor& inputTensor, - std::vector& outputSplitSizes, - std::vector& inputSplitSizes, - const AllToAllOptions& opts = AllToAllOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::alltoall_base not implemented"); - } - - c10::intrusive_ptr alltoall( - std::vector& outputTensors, - std::vector& inputTensors, - const AllToAllOptions& opts = AllToAllOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::alltoall not implemented"); - } - - c10::intrusive_ptr send( - std::vector& tensors, - int dstRank, - int tag) override { - TORCH_CHECK(false, "ProcessGroupXCCL::send not implemented"); - } - - c10::intrusive_ptr recv( - std::vector& tensors, - int srcRank, - int tag) override { - TORCH_CHECK(false, "ProcessGroupXCCL::recv not implemented"); - } - - c10::intrusive_ptr gather( - std::vector>& outputTensors, - std::vector& inputTensors, - const GatherOptions& opts = GatherOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::gather not implemented"); - } - - c10::intrusive_ptr scatter( - std::vector& 
outputTensors, - std::vector>& inputTensors, - const ScatterOptions& opts = ScatterOptions()) override { - TORCH_CHECK(false, "ProcessGroupXCCL::scatter not implemented"); - } - protected: - std::unordered_map xcclStreams_; - std::unordered_map> - inInitializationCommMap_; + std::unordered_map xcclStreamsMap_; + std::unordered_map xcclEventsMap_; std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; bool blockingWait_ = false; private: - XCCL_KVS kvs; std::mutex kvs_mutex; - XCCL_KVS get_kvs(int rank, c10d::Store& store) { + ccl::shared_ptr_class kvs; + + ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store) { + // todo: why do we need the mutex here? std::lock_guard lock(kvs_mutex); if (kvs) return kvs; @@ -291,41 +176,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { return kvs; } }; - -namespace { -int getXCCLEnvVar(std::string envVarName) { - char* stringValue = std::getenv(envVarName.c_str()); - if (stringValue != nullptr) { - try { - int val = std::stoi(stringValue); - return val; - } catch (std::exception& e) { - TORCH_CHECK( - false, - "Invalid value for environment variable: " + std::string(envVarName)); - } - } else { - return -1; - } -} - -template -void setXCCLEnvVar(const std::string& envVarName, T val) { - if constexpr (std::is_same_v) { - setenv(envVarName.c_str(), std::to_string(val).c_str(), 1); - } else if constexpr (std::is_same_v) { - setenv(envVarName.c_str(), val.c_str(), 1); - } -} - -bool with_mpirun() { - return (getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || - getenv("PMI_RANK") || getenv("PMI_SIZE") || getenv("PMIX_RANK")) - ? true - : false; -} - -} // namespace } // namespace c10d #endif // USE_C10D_XCCL diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp index 73e37e0437c459..e27ec363ba1cc9 100644 --- a/torch/csrc/distributed/c10d/Utils.hpp +++ b/torch/csrc/distributed/c10d/Utils.hpp @@ -557,7 +557,7 @@ size_t computeLengthsAndOffsets( return offset; } -inline std::string reduce_op_to_string(c10d::ReduceOp op) { +inline std::string reduceOpToString(c10d::ReduceOp op) { switch (op) { case c10d::ReduceOp::SUM: return "SUM"; From a062f9f8bfc2cb81c4f082515f324b7c5f65dbb8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 17 Oct 2024 00:59:38 +0000 Subject: [PATCH 82/96] accept comments --- torch/csrc/distributed/c10d/ProcessGroup.hpp | 4 ++-- .../distributed/c10d/ProcessGroupXCCL.cpp | 19 ------------------- .../distributed/c10d/ProcessGroupXCCL.hpp | 3 ++- torch/distributed/distributed_c10d.py | 8 ++------ 4 files changed, 6 insertions(+), 28 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 83d2729fc43d43..31c974a061e4a2 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -51,8 +51,8 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { NCCL = 2, UCC = 3, MPI = 4, - CUSTOM = 5, - XCCL = 6, + XCCL = 5, + CUSTOM = 6, }; static std::string backendTypeToString(const BackendType& type) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index ef007825a118ed..90fb4c3f9cbd75 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -253,25 +253,6 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( auto tensor = tensors.back(); checkXPUTensor(tensor); - RECORD_PARAM_COMMS_DATA( - // static_cast( - // 
this->getSequenceNumberForGroup() + 1), // seq + 1 to match - // collective - 1, - std::make_tuple(pg_uid_, pg_desc_), // PG name tuple - tensors, // inputTensors - tensors, // outputTensors - rank_, // rank - "allreduce", // collective name - tensor.numel(), // inNelems - tensor.numel(), // outNelems - tensor.scalar_type(), // dType - std::vector(), // inSplitSizes - std::vector(), // outSplitSizes - 0, // globalRankStart - 1, // globalRankStride - this->getSize()); // worldSize - return collective( tensor, tensor, diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 5dc003e3dba6b2..6e6eb16d62d620 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -138,6 +138,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; + void setSequenceNumberForGroup() override {} + protected: std::unordered_map xcclStreamsMap_; std::unordered_map xcclEventsMap_; @@ -151,7 +153,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { ccl::shared_ptr_class kvs; ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store) { - // todo: why do we need the mutex here? std::lock_guard lock(kvs_mutex); if (kvs) return kvs; diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 4bbb1c41011231..fc4ca55dbd0237 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1675,13 +1675,9 @@ def _new_process_group_helper( "created, please use a different group name" ) - if device_id is not None and ( - device_id.index is None - or (device_id.type != "cuda" and device_id.type != "xpu") - ): + if device_id is not None and device_id.index is None: raise ValueError( - "init_process_group device_id parameter must be a cuda device with an " - "id, e.g. 
cuda:0, xpu, not just cuda or xpu or cpu" + "init_process_group device_id parameter must be a device with an index" ) # Note: _new_process_group_helper is only called from init_process_group, which always provides a timeout value From 86b66c3b0ff8c731e54453c210e0b0eb321c3e89 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 21 Oct 2024 01:54:13 +0000 Subject: [PATCH 83/96] refine code --- caffe2/CMakeLists.txt | 3 -- test/distributed/test_c10d_common.py | 13 +++--- torch/csrc/distributed/c10d/ProcessGroup.hpp | 17 ++++---- .../distributed/c10d/ProcessGroupXCCL.cpp | 39 +++++++++-------- .../distributed/c10d/ProcessGroupXCCL.hpp | 43 +++++++++++-------- torch/testing/_internal/common_distributed.py | 14 +++++- 6 files changed, 72 insertions(+), 57 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index b4ec018019f165..25bd7f700f68a2 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1376,9 +1376,6 @@ if(USE_DISTRIBUTED) endif() if(USE_XPU AND USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) - set_source_files_properties( - ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp - PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_ZE;CCL_ENABLE_SYCL") endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 903df26bba9f6f..d3cb65f7befb1d 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -31,6 +31,7 @@ from torch.testing._internal.common_distributed import ( MultiProcessTestCase, skip_if_lt_x_gpu, + get_device_count, ) from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, @@ -60,17 +61,13 @@ torch.backends.cuda.matmul.allow_tf32 = False -def gpus_for_rank(world_size): +def gpus_for_rank(world_size, backend): """Multigpu tests are designed to simulate the multi nodes with multi GPUs on each node. Nccl backend requires equal #GPUs in each process. On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. 
""" - device_count = ( - torch.xpu.device_count() - if torch.xpu.is_available() - else torch.cuda.device_count() - ) + device_count = get_device_count(backend) visible_devices = list(range(device_count)) gpus_per_process = device_count // world_size gpus_for_rank = [] @@ -833,7 +830,7 @@ def update_parameters(model): def _gpu_model_with_ddp_comm_hook( self, process_group, hook=None, gradient_as_bucket_view=False, state=None ): - device_id = gpus_for_rank(self.world_size)[self.rank][0] + device_id = gpus_for_rank(self.world_size, process_group.name())[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], @@ -850,7 +847,7 @@ def _gpu_model_with_ddp_comm_hook( def _gpu_model_with_builtin_ddp_comm_hook( self, process_group, hook=None, gradient_as_bucket_view=False ): - device_id = gpus_for_rank(self.world_size)[self.rank][0] + device_id = gpus_for_rank(self.world_size, process_group.name())[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 31c974a061e4a2..b3eac70e871bf7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -131,6 +131,13 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { return backendType_; }; + inline bool backendSupportsSequenceNumbers(BackendType backendType) { + if (backendType == BackendType::GLOO || backendType == BackendType::NCCL || + backendType == BackendType::XCCL || backendType == BackendType::UCC) + return true; + return false; + } + virtual void startCoalescing(c10::DeviceType deviceType) { // only nccl has implemented startCoalescing so only execute for nccl // backends @@ -508,10 +515,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { virtual void setSequenceNumberForGroup() { auto backendType = getBackendType(); // TODO: HACK for backend name to get sequence number for that backend. - if (backendType == ProcessGroup::BackendType::GLOO || - backendType == ProcessGroup::BackendType::NCCL || - backendType == ProcessGroup::BackendType::XCCL || - backendType == ProcessGroup::BackendType::UCC) { + if (backendSupportsSequenceNumbers(backendType)) { getDefaultBackend()->setSequenceNumberForGroup(); } else { TORCH_CHECK( @@ -530,10 +534,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { auto backendType = getBackendType(); // TODO: HACK for backend name to get sequence number for that backend. 
- if (backendType == ProcessGroup::BackendType::GLOO || - backendType == ProcessGroup::BackendType::NCCL || - backendType == ProcessGroup::BackendType::XCCL || - backendType == ProcessGroup::BackendType::UCC) { + if (backendSupportsSequenceNumbers(backendType)) { return getDefaultBackend()->getSequenceNumberForGroup(); } else { TORCH_CHECK( diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 90fb4c3f9cbd75..41e4e43436270a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,17 +1,8 @@ #ifdef USE_C10D_XCCL #include +#include #include -#include -#include -#include -#include -#include -#include -#include - -#include -#include namespace c10d { @@ -89,10 +80,13 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, OpType opType, + uint64_t seq, + const char* profilingTitle, const std::optional>& inputs) - : Work(rank, opType, "profilingTitle", inputs), + : Work(rank, opType, profilingTitle, inputs), device_(device), - workStartTime_(std::chrono::steady_clock::now()) { + workStartTime_(std::chrono::steady_clock::now()), + seq_(seq) { xcclEndEvent_ = std::make_shared(); } @@ -101,7 +95,8 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) device_(w.device_), xcclEndEvent_(w.xcclEndEvent_), blockingWait_(w.blockingWait_), - workStartTime_(w.workStartTime_) {} + workStartTime_(w.workStartTime_), + seq_(w.seq_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -156,10 +151,16 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( at::Device& device, int rank, OpType opType, + const char* profilingTitle, const std::vector& inputs, const std::vector& outputs) { auto r = c10::make_intrusive( - device, rank, opType, std::optional>(inputs)); + device, + rank, + opType, + seqCollective_, + profilingTitle, + std::optional>(inputs)); return r; } @@ -212,7 +213,10 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( Fn fn, PreProcess pre, PostProcess post, - OpType opType) { + OpType opType, + const char* profilingTitle) { + seqCollective_++; + auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); @@ -221,7 +225,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( syncStream(device, xcclEventsMap_[key], stream); c10::intrusive_ptr work; - work = initWork(device, rank_, opType); + work = initWork(device, rank_, opType, profilingTitle); work->outputs_ = std::make_shared>(outputs); at::xpu::OptionalXPUGuard gpuGuard(device); @@ -273,7 +277,8 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( ccl_stream); return; }, - OpType::ALLREDUCE); + OpType::ALLREDUCE, + "xccl:all_reduce"); } } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6e6eb16d62d620..f9761c652dc1a0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -1,33 +1,24 @@ #pragma once -#if defined(__linux__) -#include -#include -#include -#include -#endif - #ifdef USE_C10D_XCCL -#include +// We will define those flags in XCCL backend file instead of passing to gcc +// compiler. 
+#define CCL_ENABLE_ZE +#define CCL_ENABLE_SYCL + #include -#include #include -#include -#include - -#include -#include #include -#include #include #include -#include #include +#include +#include #include #include #include -#include +#include namespace c10d { static std::vector TORCH_XCCL_BLOCKING_WAIT = { @@ -45,6 +36,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Device& device, int rank, OpType opType, + uint64_t seq, + const char* profilingTitle = nullptr, const std::optional>& inputs = std::nullopt); WorkXCCL(const WorkXCCL& w); ~WorkXCCL() override; @@ -63,6 +56,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { return future_; } + uint64_t getSequencenumber() const override { + return seq_; + } + std::vector result() override { return *outputs_; } @@ -72,6 +69,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::shared_ptr xcclEndEvent_; bool blockingWait_ = false; std::chrono::time_point workStartTime_; + uint64_t seq_; private: void synchronizeInternal(std::chrono::milliseconds timeout); @@ -103,6 +101,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Device& device, int rank, OpType opType, + const char* profilingTitle = nullptr, const std::vector& inputs = {}, const std::vector& outputs = {}); @@ -111,7 +110,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Tensor& input, at::Tensor& output, Fn fn, - OpType opType) { + OpType opType, + const char* profilingTitle = nullptr) { auto inputs = std::vector{input}; auto outputs = std::vector{output}; return collective( @@ -132,13 +132,17 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, PreProcess pre, PostProcess post, - OpType opType); + OpType opType, + const char* profilingTitle = nullptr); c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; void setSequenceNumberForGroup() override {} + uint64_t getSequenceNumberForGroup() override { + return seqCollective_; + } protected: std::unordered_map xcclStreamsMap_; @@ -147,6 +151,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr store_; std::mutex mutex_; bool blockingWait_ = false; + uint64_t seqCollective_{0}; private: std::mutex kvs_mutex; diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 9ec38c9ca671c2..3e1664690b7132 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -93,8 +93,9 @@ class DistTestCases: # Sets showing that something is implemented backend_feature = {} - backend_feature["gpu"] = {"nccl", "gloo", "ucc"} + backend_feature["gpu"] = {"nccl", "gloo", "ucc", "xccl"} backend_feature["cuda"] = {"nccl", "gloo", "ucc"} + backend_feature["cuda"] = {"xccl"} backend_feature["ddp"] = {"nccl", "gloo", "ucc"} backend_feature["subgroup"] = {"nccl", "gloo", "ucc"} backend_feature["plugin"] = set() @@ -462,6 +463,15 @@ def compute_sum(fn, world_size: int): ] ] +# Returns the number of GPUs, currently only for CUDA and XPU. 
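To make the intent of the test-helper changes concrete, here is a small illustrative sketch (not part of the patch) of how the test utilities divide the visible devices evenly across ranks once the device count has been obtained for the chosen backend; the function name and the example numbers below are made up for illustration only.

# Illustrative only: mirrors how gpus_for_rank()/init_multigpu_helper()
# split the visible devices evenly across ranks in this patch series.
def split_devices(world_size: int, device_count: int):
    per_rank = device_count // world_size
    visible = list(range(device_count))
    return [visible[r * per_rank:(r + 1) * per_rank] for r in range(world_size)]

# split_devices(world_size=2, device_count=8) -> [[0, 1, 2, 3], [4, 5, 6, 7]]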
+def get_device_count(backend: str): + assert c10d.is_backend_available(backend) + if backend in backend_feature.get("cuda", set()): + return torch.cuda.device_count() + elif backend in backend_feature.get("xpu", set()): + return torch.xpu.device_count() + else: + raise ValueError(f"Unsupported backend: {backend}") # HELPER FOR MULTIGPU TESTS def init_multigpu_helper(world_size: int, backend: str): @@ -470,7 +480,7 @@ def init_multigpu_helper(world_size: int, backend: str): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. """ - nGPUs = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() + nGPUs = get_device_count(backend) visible_devices = range(nGPUs) # If rank is less than or equal to number of available GPU's From d9ce6368c51d64a4379efdfa26804888026185f4 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 21 Oct 2024 06:11:52 +0000 Subject: [PATCH 84/96] align to latest --- .../distributed/c10d/ProcessGroupXCCL.cpp | 531 ++++++------------ .../distributed/c10d/ProcessGroupXCCL.hpp | 83 ++- 2 files changed, 228 insertions(+), 386 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index a89a7c48a01ffb..956e80482af28d 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -82,40 +82,38 @@ void check_xpu_single_tensor( const at::Tensor& tensor, const bool p2p = false // whether operation is a P2P operation ) { - if (!tensor.is_xpu() || tensor.is_sparse()) { - C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); - } - // Skip the following requirements for P2P operations - if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - if (p2p) { - TORCH_WARN_ONCE( - "Detected non-contiguous tensor in P2P operations. It is user " - "responsibility to guarantee that source and destination tensors have " - "the same contiguity format."); - } else { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + if (!tensor.is_xpu() || tensor.is_sparse() || tensor.is_complex()) { + C10_THROW_ERROR( + ValueError, "Tensors must be XPU and dense and non-complex"); + + // Skip the following requirements for P2P operations + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + if (p2p) { + TORCH_WARN_ONCE( + "Detected non-contiguous tensor in P2P operations. 
It is user " + "responsibility to guarantee that source and destination tensors have " + "the same contiguity format."); + } else { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); + } } } } - int64_t check_xpu_tensors_same_device(const std::vector& tensors) { - if (tensors.size() == 0) { - C10_THROW_ERROR(ValueError, "Tensor list must be nonempty"); - } + TORCH_CHECK_WITH( + ValueError, tensors.size() != 0, "Tensor list must be nonempty"); const auto& first = tensors.front(); int64_t total_numel = 0; for (const auto& t : tensors) { - if (!t.is_xpu() || t.is_sparse()) { - C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + if (!t.is_xpu() || t.is_sparse() || t.is_complex()) { + C10_THROW_ERROR( + ValueError, "Tensors must be XPU and dense and non-complex"); } if (t.scalar_type() != first.scalar_type()) { C10_THROW_ERROR(TypeError, "Tensors must have identical type"); } - if (!t.is_non_overlapping_and_dense()) { - C10_THROW_ERROR(ValueError, "Tensors must be non-overlapping and dense"); - } TORCH_CHECK_WITH( ValueError, t.get_device() == tensors[0].get_device(), @@ -126,7 +124,12 @@ int64_t check_xpu_tensors_same_device(const std::vector& tensors) { return total_numel; } -ccl::datatype getXcclDataType(at::ScalarType type) { +ccl::datatype getXcclDataType( + at::ScalarType type, + bool is_reduction_op = false) { + TORCH_CHECK( + !(isFloat8Type(type) && is_reduction_op), + "Float8 dtypes are not currently supported for XCCL reductions"); auto it = xcclDatatypes.find(type); TORCH_CHECK_WITH( TypeError, @@ -158,18 +161,6 @@ void syncStream( xcclEvent.block(xcclStream); } -bool complexViewAsRealAllowed(const ReduceOp reduceOp) { - switch (reduceOp) { - case ReduceOp::SUM: - return true; - case ReduceOp::UNUSED: - return true; - default: - return false; - } - return false; -} - } // namespace constexpr int64_t kSynchronizeBusyWaitMillis = 10; @@ -286,7 +277,6 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( usedDeviceIdxs_.insert(device.index()); { - // todo: why do we need mutex here?
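getXCCLComm() above caches one communicator per device-index key, so the backend expects each rank to pin a single XPU before issuing collectives. A minimal end-to-end sketch of that usage, assuming a build with USE_XCCL/USE_C10D_XCCL, an available XPU runtime, and placeholder rendezvous settings:

import os
import torch
import torch.distributed as dist

def run(rank: int, world_size: int) -> None:
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")  # placeholder rendezvous address
    os.environ.setdefault("MASTER_PORT", "29500")      # placeholder rendezvous port
    torch.xpu.set_device(rank % torch.xpu.device_count())  # one device per rank
    dist.init_process_group("xccl", rank=rank, world_size=world_size)
    t = torch.ones(8, device="xpu")
    dist.all_reduce(t)  # routed to ProcessGroupXCCL::allreduce
    dist.destroy_process_group()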
std::lock_guard lock(mutex_); if (devXCCLCommMap_.find(deviceKey) != devXCCLCommMap_.end()) { return devXCCLCommMap_[deviceKey]; @@ -301,11 +291,6 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( at::xpu::OptionalXPUGuard gpuGuard(device); - for (const auto i : c10::irange(xcclActiveGroupCounter_)) { - (void)i; - ccl::group_end(); - } - int numRanks, rank; if (!singleP2POp) { numRanks = getSize(); @@ -329,37 +314,14 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( auto xccl_kvs = get_kvs(rank_, *store_); auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, xccl_kvs); - std::shared_ptr XCCLComm = - std::make_shared(std::move(comms[0])); + XCCLComm = std::make_shared(std::move(comms[0])); std::lock_guard lock(mutex_); devXCCLCommMap_.emplace(deviceKey, XCCLComm); xcclStreamsMap_.emplace(deviceKey, std::move(stream)); xcclEventsMap_.emplace(deviceKey, at::xpu::XPUEvent()); - for (const auto i : c10::irange(xcclActiveGroupCounter_)) { - (void)i; - ccl::group_start(); - } - - xcclStreams_.emplace(deviceKey, std::move(stream)); - xcclEvents_.emplace(deviceKey, at::xpu::XPUEvent()); - - auto it = inInitializationCommMap_.find(deviceKey); - if (it != inInitializationCommMap_.end()) { - devXCCLCommMap_.emplace(deviceKey, std::move(it->second)); - inInitializationCommMap_.erase(deviceKey); - - xcclCommDevIdxMapMutex.lock(); - xcclCommDevIdxMap.emplace(XCCLComm, device.index()); - xcclCommDevIdxMapMutex.unlock(); - } - - it = devXCCLCommMap_.find(deviceKey); - TORCH_INTERNAL_ASSERT( - it != devXCCLCommMap_.end(), "Communicators not populated in cache!"); - - return it->second; + return XCCLComm; } void ProcessGroupXCCL::groupStart() { @@ -396,7 +358,7 @@ c10::intrusive_ptr ProcessGroupXCCL::endCoalescing(OpType optype) { auto device = coalescedDevice_; const auto key = std::to_string(device.index()); - auto stream = xcclStreams_.at(key); + auto stream = xcclStreamsMap_.at(key); auto work = initWork(device, rank_, optype); work->blockingWait_ = blockingWait_; @@ -422,7 +384,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( Fn fn, PreProcess pre, PostProcess post, - OpType opType) { + OpType opType, + const char* profilingTitle) { auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); @@ -442,8 +405,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( } } - auto stream = xcclStreams_.at(key); - syncStream(device, xcclEvents_[key], stream); + auto stream = xcclStreamsMap_.at(key); + syncStream(device, xcclEventsMap_[key], stream); c10::intrusive_ptr work; work = initWork(device, rank_, opType); @@ -454,13 +417,12 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( pre(stream, work); - for (const auto& input : inputs) { + for (const auto i : c10::irange(inputs.size())) { c10::xpu::XPUCachingAllocator::recordStream( - input.storage().data_ptr(), stream); + inputs[i].storage().data_ptr(), stream); + fn(inputs[i], outputs[i], *comm, stream); } - fn(inputs[0], outputs[0], *comm, stream); - post(stream, work); if (!coalescing_state_) { @@ -478,97 +440,13 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( return work; } -template -c10::intrusive_ptr ProcessGroupXCCL::collective( - std::vector& inputs, - std::vector& outputs, - Fn fn, - PreProcess pre, - PostProcess post, - OpType opType) { - auto inputs = std::vector{input}; - auto outputs = std::vector{output}; - return collective(inputs, outputs, fn, pre, post, opType); -} - template -c10::intrusive_ptr ProcessGroupXCCL::collective( - at::Tensor& input, - 
at::Tensor& output, - Fn fn, - OpType opType) { - return collective( - input, - output, - fn, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - opType); -} - -template -c10::intrusive_ptr ProcessGroupXCCL::collectiveCoalesced( - std::vector& inputs, - std::vector& outputs, - Fn fn, - OpType opType) { - auto device = inputs[0].device(); - const auto key = std::to_string(device.index()); - auto comm = getXCCLComm(key, device, opType); - - if (coalescing_state_ & CoalActive) { - coalescing_state_ |= CoalColl; - if (coalescedDevice_.index() < 0) { - coalescedDevice_ = device; - } else { - TORCH_CHECK( - coalescedDevice_.index() == device.index(), MULTI_DEVICE_ERROR_MSG); - } - if (coalescedComm_ == nullptr) { - coalescedComm_ = comm; - } else { - TORCH_CHECK(coalescedComm_ == comm, MULTI_DEVICE_ERROR_MSG); - } - } - - auto stream = xcclStreamsMap_.at(key); - syncStream(device, xcclEventsMap_[key], stream); - - c10::intrusive_ptr work; - work = initWork(device, rank_, opType, profilingTitle); - work->outputs_ = std::make_shared>(outputs); - - at::xpu::OptionalXPUGuard gpuGuard(device); - pre(stream, work); - for (const auto i : c10::irange(inputs.size())) { - c10::xpu::XPUCachingAllocator::recordStream( - inputs[i].storage().data_ptr(), stream); - fn(inputs[i], outputs[i], *comm, stream); - } - post(stream, work); - - work->xcclEndEvent_->record(stream); - std::vector streams = {stream.unwrap()}; - c10::MultiStreamGuard streamGuard(streams); - std::vector devices{device}; - work->future_ = c10::make_intrusive( - c10::ListType::create(c10::TensorType::get()), devices); - work->future_->markCompleted(at::IValue(*work->outputs_)); - work->blockingWait_ = blockingWait_; - - return work; -} - -template c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( at::Tensor& tensor, Fn fn, int peer, OpType opType, - PreProcess pre, - PostProcess post) { + const char* profilingTitle) { auto device = tensor.device(); std::string key; int p2pRank = 0, p2pTargetRank = 0; @@ -605,63 +483,43 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( } } - auto stream = xcclStreams_.at(key); - syncStream(device, xcclEvents_[key], stream); + auto stream = xcclStreamsMap_.at(key); + syncStream(device, xcclEventsMap_[key], stream); - c10::intrusive_ptr work; if (!coalescing_state_) { + c10::intrusive_ptr work; work = initWork(device, rank_, opType); work->outputs_ = std::make_shared>(); work->outputs_->push_back(tensor); - } - - at::xpu::OptionalXPUGuard gpuGuard(device); - - if (!coalescing_state_) { - pre(stream, work); - } - c10::xpu::XPUCachingAllocator::recordStream( - tensor.storage().data_ptr(), stream); + at::xpu::OptionalXPUGuard gpuGuard(device); - fn(tensor, *comm, stream, p2pTargetRank); + c10::xpu::XPUCachingAllocator::recordStream( + tensor.storage().data_ptr(), stream); - if (!coalescing_state_) { - post(stream); + fn(tensor, *comm, stream, p2pTargetRank); work->xcclEndEvent_->record(stream); work->blockingWait_ = blockingWait_; - - { - std::vector streams = {stream.unwrap()}; - c10::MultiStreamGuard streamGuard(streams); - std::vector devices{device}; - work->future_ = c10::make_intrusive( - c10::ListType::create(c10::TensorType::get()), devices); - work->future_->markCompleted(at::IValue(*work->outputs_)); - } + std::vector streams = {stream.unwrap()}; + c10::MultiStreamGuard streamGuard(streams); + std::vector devices{device}; + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get()), devices); + 
work->future_->markCompleted(at::IValue(*work->outputs_)); return work; } else { + at::xpu::OptionalXPUGuard gpuGuard(device); + + c10::xpu::XPUCachingAllocator::recordStream( + tensor.storage().data_ptr(), stream); + + fn(tensor, *comm, stream, p2pTargetRank); + return nullptr; } } -template -c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( - at::Tensor& tensor, - Fn fn, - int peer, - OpType opType) { - return pointToPoint( - tensor, - fn, - peer, - opType, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - [](at::xpu::XPUStream&) {}); -} - c10::intrusive_ptr ProcessGroupXCCL::send( std::vector& tensors, int dstRank, @@ -677,19 +535,19 @@ c10::intrusive_ptr ProcessGroupXCCL::send( xcclComm_t& comm, at::xpu::XPUStream& stream, int dst) { - ccl::event ret_evt; auto xcclDataType = getXcclDataType(input.scalar_type()); - ret_evt = ccl::send( + ccl::send( input.data_ptr(), (size_t)input.numel(), xcclDataType, dst, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, dstRank, - OpType::SEND); + OpType::SEND, + c10::str("xccl:send ", rank_, "->", dstRank).c_str()); return ret; } @@ -708,19 +566,19 @@ c10::intrusive_ptr ProcessGroupXCCL::recv( xcclComm_t& comm, at::xpu::XPUStream& stream, int src) { - ccl::event ret_evt; auto xcclDataType = getXcclDataType(output.scalar_type()); - ret_evt = ccl::recv( + ccl::recv( output.data_ptr(), (size_t)output.numel(), xcclDataType, src, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, srcRank, - OpType::RECV); + OpType::RECV, + c10::str("xccl:recv ", rank_, "<-", srcRank).c_str()); return ret; } @@ -785,13 +643,12 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( } } { - ccl::event ret_evt; auto xcclDataType = getXcclDataType(inputTensor.scalar_type()); if (rank_ == root) { for (const auto r : c10::irange(size_)) { if (r != root) { // do receive - ret_evt = ccl::recv( + ccl::recv( outputs[r].data_ptr(), (size_t)inputTensor.numel(), xcclDataType, @@ -805,7 +662,7 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( } } else { // do send - ret_evt = ccl::send( + ccl::send( inputTensor.data_ptr(), (size_t)inputTensor.numel(), xcclDataType, @@ -813,13 +670,9 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( comm, ccl::create_stream(stream.queue())); } - return ret_evt; + return; } }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, OpType::GATHER); } @@ -885,14 +738,13 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( } } { - ccl::event ret_evt; if (rank_ == root) { for (const auto r : c10::irange(size_)) { if (r != root) { // do send size_t send_count = inputs[r].numel(); auto send_type = getXcclDataType(inputs[r].scalar_type()); - ret_evt = ccl::send( + ccl::send( inputs[r].data_ptr(), send_count, send_type, @@ -908,7 +760,7 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( // do receive size_t recv_count = outputTensor.numel(); auto recv_type = getXcclDataType(outputTensor.scalar_type()); - ret_evt = ccl::recv( + ccl::recv( outputTensor.data_ptr(), recv_count, recv_type, @@ -917,13 +769,9 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( ccl::create_stream(stream.queue())); } - return ret_evt; + return; } }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, OpType::SCATTER); } @@ -937,7 +785,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( at::Tensor& output, xcclComm_t& comm, at::xpu::XPUStream& stream) { - auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclDataType = 
getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); auto ccl_stream = ccl::create_stream(stream.queue()); ccl::allreduce( @@ -948,7 +796,7 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( xcclReduceOp, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, OpType::ALLREDUCE, "xccl:all_reduce"); @@ -959,30 +807,35 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( const AllreduceOptions& opts) { TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); - if (tensor.is_complex()) { - TORCH_CHECK( - complexViewAsRealAllowed(opts.reduceOp), - "all_reduce does not support", - opts.reduceOp, - "on complex tensors"); - tensor = at::view_as_real(tensor); - } check_xpu_single_tensor(tensor); - TORCH_CHECK( - !isFloat8Type(tensor.scalar_type()), - "Float8 dtypes are not currenlty supported for XCCL reductions"); - return allreduce_impl(tensor, opts); + return collective( + tensor, + tensor, + [&](at::Tensor& input, + at::Tensor& output, + xcclComm_t& comm, + at::xpu::XPUStream& stream) { + auto xcclDataType = getXcclDataType(input.scalar_type(), true); + auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); + ccl::allreduce( + input.data_ptr(), + output.data_ptr(), + (size_t)input.numel(), + xcclDataType, + xcclReduceOp, + comm, + ccl::create_stream(stream.queue())); + return; + }, + OpType::ALLREDUCE, + "xccl:all_reduce"); } c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { check_xpu_tensors_same_device(tensors); - TORCH_CHECK( - !isFloat8Type(tensors.back().scalar_type()), - "Float8 dtypes are not currenlty supported for XCCL reductions"); - return collectiveCoalesced( tensors, tensors, @@ -990,10 +843,9 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( at::Tensor& output, xcclComm_t& comm, at::xpu::XPUStream& stream) { - auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclDataType = getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::allreduce( + ccl::allreduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), @@ -1001,9 +853,10 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( xcclReduceOp, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::COALESCED); + OpType::COALESCED, + "xccl:allreduce_coalesced"); } c10::intrusive_ptr ProcessGroupXCCL::broadcast( @@ -1011,9 +864,6 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( const BroadcastOptions& opts) { TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); - if (tensor.is_complex()) { - tensor = at::view_as_real(tensor); - } check_xpu_single_tensor(tensor); const auto root = opts.rootRank + opts.rootTensor; @@ -1026,17 +876,17 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); - ccl::event ret_evt; - ret_evt = ccl::broadcast( + ccl::broadcast( input.data_ptr(), (size_t)input.numel(), xcclDataType, root, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::BROADCAST); + OpType::BROADCAST, + "nccl:broadcast"); } c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( @@ -1057,33 +907,24 @@ c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = 
getXcclDataType(input.scalar_type()); - ccl::event ret_evt; - ret_evt = ccl::broadcast( + ccl::broadcast( input.data_ptr(), (size_t)input.numel(), xcclDataType, root, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::BROADCAST); + OpType::BROADCAST, + "xccl:_broadcast_oop"); } c10::intrusive_ptr ProcessGroupXCCL::reduce( std::vector& tensors, const ReduceOptions& opts) { TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); - // @lint-ignore CLANGTIDY auto tensor = tensors.back(); - if (tensor.is_complex()) { - TORCH_CHECK( - complexViewAsRealAllowed(opts.reduceOp), - "reduce does not support", - opts.reduceOp, - "on complex tensors"); - tensor = at::view_as_real(tensor); - } check_xpu_single_tensor(tensor); return collective( @@ -1094,10 +935,9 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce( xcclComm_t& comm, at::xpu::XPUStream& stream) { const int root = opts.rootRank + opts.rootTensor; - const auto xcclDataType = getXcclDataType(input.scalar_type()); + const auto xcclDataType = getXcclDataType(input.scalar_type(), true); const auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::reduce( + ccl::reduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), @@ -1106,20 +946,20 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce( root, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::REDUCE); + OpType::REDUCE, + "xccl:reduce"); } c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( at::Tensor& outputTensor, at::Tensor& inputTensor, const ReduceOptions& opts) { - if (outputTensor.numel() != inputTensor.numel()) { - C10_THROW_ERROR( - ValueError, - "Tensor input and output of _reduce_oop must have the same number of elements "); - } + TORCH_CHECK_WITH( + ValueError, + outputTensor.numel() != inputTensor.numel(), + "Tensor input and output of _reduce_oop must have the same number of elements"); return collective( inputTensor, outputTensor, @@ -1128,10 +968,9 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( xcclComm_t& comm, at::xpu::XPUStream& stream) { const int root = opts.rootRank + opts.rootTensor; - const auto xcclDataType = getXcclDataType(input.scalar_type()); + const auto xcclDataType = getXcclDataType(input.scalar_type(), true); const auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::reduce( + ccl::reduce( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), @@ -1140,9 +979,10 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( root, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::REDUCE); + OpType::REDUCE, + "xccl:_reduce_oop"); } c10::intrusive_ptr ProcessGroupXCCL::allgather( @@ -1171,16 +1011,14 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather( c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); auto xcclDataType = getXcclDataType(input.scalar_type()); - ccl::event ret_evt; - - ret_evt = ccl::allgather( + ccl::allgather( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), xcclDataType, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, [](at::xpu::XPUStream&, c10::intrusive_ptr& work) {}, @@ -1194,7 +1032,8 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather( outputTensors_[j].copy_(outputFlattened[j], true); } }, - OpType::ALLGATHER); + OpType::ALLGATHER, + "xccl:all_gather"); } else { const auto num_reduces = outputTensors_.size(); startCoalescing(); @@ -1217,16 +1056,14 @@ c10::intrusive_ptr 
ProcessGroupXCCL::_allgather_base( check_xpu_single_tensor(input_tensor); check_xpu_single_tensor(output_tensor); - if (input_tensor.dtype() != output_tensor.dtype()) { - C10_THROW_ERROR( - TypeError, "output tensor must have the same type as input tensor"); - } - - if (input_tensor.numel() * size_ != output_tensor.numel()) { - C10_THROW_ERROR( - ValueError, - "output tensor size must be equal to world_size times input tensor size"); - } + TORCH_CHECK_WITH( + TypeError, + input_tensor.dtype() != output_tensor.dtype(), + "output tensor must have the same type as input tensor"); + TORCH_CHECK_WITH( + ValueError, + input_tensor.numel() * size_ != output_tensor.numel(), + "output tensor size must be equal to world_size times input tensor size"); return collective( input_tensor, @@ -1238,17 +1075,17 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); auto xcclDataType = getXcclDataType(input.scalar_type()); - ccl::event ret_evt; - ret_evt = ccl::allgather( + ccl::allgather( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), xcclDataType, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::_ALLGATHER_BASE); + OpType::_ALLGATHER_BASE, + "xccl:_all_gather_base"); } c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( @@ -1263,17 +1100,17 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather_into_tensor_coalesced( xcclComm_t& comm, at::xpu::XPUStream& stream) { auto xcclDataType = getXcclDataType(input.scalar_type()); - ccl::event ret_evt; - ret_evt = ccl::allgather( + ccl::allgather( input.data_ptr(), output.data_ptr(), (size_t)input.numel(), xcclDataType, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::COALESCED); + OpType::COALESCED, + "xccl:all_gather_into_tensor_coalesced"); } c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( @@ -1286,9 +1123,6 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( check_xpu_single_tensor(outputTensor); // @lint-ignore CLANGTIDY auto inputTensors_ = inputTensors.back(); - TORCH_CHECK( - !isFloat8Type(outputTensor.scalar_type()), - "Float8 dtypes are not currenlty supported for XCCL reductions"); bool same_size = check_same_size(inputTensors_); if (same_size) { @@ -1303,10 +1137,9 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); - auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclDataType = getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::reduce_scatter( + ccl::reduce_scatter( input.data_ptr(), output.data_ptr(), (size_t)output.numel(), @@ -1314,7 +1147,7 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( xcclReduceOp, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, [&](at::xpu::XPUStream& Stream, c10::intrusive_ptr& work) { @@ -1328,7 +1161,8 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( }, [&](at::xpu::XPUStream&, c10::intrusive_ptr&) {}, - OpType::REDUCE_SCATTER); + OpType::REDUCE_SCATTER, + "xccl:reduce_scatter"); } else { const auto num_reduces = inputTensors_.size(); startCoalescing(); @@ -1351,22 +1185,14 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( at::Tensor& outputTensor, at::Tensor& inputTensor, const ReduceScatterOptions& opts) { - if (inputTensor.dtype() != outputTensor.dtype()) { - 
C10_THROW_ERROR( - TypeError, "input tensor must be the same type as the output tensor."); - } - - if (inputTensor.numel() != outputTensor.numel() * size_) { - C10_THROW_ERROR( - ValueError, - "input tensor must be the same size as output size times world size"); - } - - // @lint-ignore CLANGTIDY - const auto& tensor = outputTensor; - TORCH_CHECK( - !isFloat8Type(tensor.scalar_type()), - "Float8 dtypes are not currenlty supported for XCCL reductions"); + TORCH_CHECK_WITH( + TypeError, + inputTensor.dtype() != outputTensor.dtype(), + "output tensor must have the same type as input tensor"); + TORCH_CHECK_WITH( + ValueError, + inputTensor.numel() != outputTensor.numel() * size_, + "input tensor size must be equal to world_size times output tensor size"); return collective( inputTensor, @@ -1377,10 +1203,9 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); - auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclDataType = getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::reduce_scatter( + ccl::reduce_scatter( input.data_ptr(), output.data_ptr(), (size_t)output.numel(), @@ -1388,18 +1213,16 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( xcclReduceOp, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::_REDUCE_SCATTER_BASE); + OpType::_REDUCE_SCATTER_BASE, + "xccl:_reduce_scatter_base"); } c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( std::vector& outputs, std::vector& inputs, const ReduceScatterOptions& opts) { - TORCH_CHECK( - !isFloat8Type(inputs.back().scalar_type()), - "Float8 dtypes are not currenlty supported for XCCL reductions"); return collectiveCoalesced( inputs, outputs, @@ -1409,10 +1232,9 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( at::xpu::XPUStream& stream) { c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); - auto xcclDataType = getXcclDataType(input.scalar_type()); + auto xcclDataType = getXcclDataType(input.scalar_type(), true); auto xcclReduceOp = getXcclReduceOp(opts.reduceOp, input); - ccl::event ret_evt; - ret_evt = ccl::reduce_scatter( + ccl::reduce_scatter( input.data_ptr(), output.data_ptr(), (size_t)output.numel(), @@ -1420,9 +1242,10 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( xcclReduceOp, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::COALESCED); + OpType::COALESCED, + "xccl:reduce_scatter_tensor_coalesced"); } c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { @@ -1441,6 +1264,7 @@ c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { static_cast(rank_ % at::detail::getXPUHooks().getNumGPUs()); } + // todo: use barrier instead of allreduce TORCH_CHECK_WITH( ValueError, barDevIdx >= 0, @@ -1484,17 +1308,17 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( c10::xpu::XPUCachingAllocator::recordStream( output.storage().data_ptr(), stream); auto xcclDataType = getXcclDataType(output.scalar_type()); - ccl::event ret_evt; - ret_evt = ccl::alltoall( + ccl::alltoall( input.data_ptr(), output.data_ptr(), (size_t)output.numel() / comm.size(), xcclDataType, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::ALLTOALL_BASE); + OpType::ALLTOALL_BASE, + "xccl:all_to_all"); } 
else { c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); @@ -1525,9 +1349,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( (outputSplitsEqual ? outLen : outputSplitSizes[i] * outLen); } auto xcclDataType = getXcclDataType(output.scalar_type()); - ccl::event ret_evt; - - ret_evt = ccl::alltoallv( + ccl::alltoallv( input.data_ptr(), sendCounts, output.data_ptr(), @@ -1535,9 +1357,10 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( xcclDataType, comm, ccl::create_stream(stream.queue())); - return ret_evt; + return; }, - OpType::ALLTOALL_BASE); + OpType::ALLTOALL_BASE, + "xccl:all_to_all"); } } @@ -1607,15 +1430,11 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall( outputTensors[i].view({-1}).copy_(flatOutputSplits[i]); } } - stream.synchronize(); - return ret_evt; - }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { - }, - [](at::xpu::XPUStream&, c10::intrusive_ptr&) { + return; }, - OpType::ALLTOALL); + OpType::ALLTOALL, + "xccl:all_to_all"); } } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 0f2b2738a4b77c..c8fa11442c692e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -21,18 +21,12 @@ #include namespace c10d { -namespace { -struct AutoXcclGroup { - AutoXcclGroup(); - ~AutoXcclGroup() noexcept(false); -}; -} // namespace - static std::vector TORCH_XCCL_BLOCKING_WAIT = { "TORCH_XCCL_BLOCKING_WAIT", "XCCL_BLOCKING_WAIT"}; using xcclComm_t = ccl::communicator; +using XCCL_KVS = ccl::shared_ptr_class; constexpr const char* XCCL_BACKEND_NAME = "xccl"; class TORCH_API ProcessGroupXCCL : public Backend { @@ -129,28 +123,50 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, OpType opType, const char* profilingTitle = nullptr) { - auto inputs = std::vector{input}; - auto outputs = std::vector{output}; return collective( - inputs, - outputs, + input, + output, fn, [](at::xpu::XPUStream&, c10::intrusive_ptr&) {}, [](at::xpu::XPUStream&, c10::intrusive_ptr&) {}, - opType); + opType, + profilingTitle); } template c10::intrusive_ptr collective( - std::vector& inputs, - std::vector& outputs, + at::Tensor& input, + at::Tensor& output, Fn fn, PreProcess pre, PostProcess post, OpType opType, - const char* profilingTitle = nullptr); + const char* profilingTitle = nullptr) { + auto inputs = std::vector{input}; + auto outputs = std::vector{output}; + return collective(inputs, outputs, fn, pre, post, opType, profilingTitle); + } + + template + c10::intrusive_ptr collective( + std::vector& inputs, + std::vector& outputs, + Fn fn, + OpType opType, + const char* profilingTitle = nullptr) { + return collective( + inputs, + outputs, + fn, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) {}, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) {}, + opType, + profilingTitle); + } template c10::intrusive_ptr collective( @@ -159,30 +175,39 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, PreProcess pre, PostProcess post, - OpType opType); + OpType opType, + const char* profilingTitle = nullptr); template c10::intrusive_ptr collectiveCoalesced( std::vector& input, std::vector& output, Fn fn, - OpType opType); + OpType opType, + const char* profilingTitle = nullptr) { + return collective( + input, + output, + fn, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) { + ccl::group_start(); + }, + [](at::xpu::XPUStream&, + c10::intrusive_ptr&) { + ccl::group_end(); 
+ }, + opType, + profilingTitle); + } template - c10::intrusive_ptr pointToPoint( - at::Tensor& tensor, - Fn fn, - int peer, - OpType opType); - - template c10::intrusive_ptr pointToPoint( at::Tensor& tensor, Fn fn, int peer, OpType opType, - PreProcess pre, - PostProcess post); + const char* profilingTitle = nullptr); c10::intrusive_ptr allreduce_impl( at::Tensor& tensor, @@ -285,10 +310,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { const ScatterOptions& opts = ScatterOptions()) override; protected: - std::unordered_map xcclStreams_; - std::unordered_map xcclEvents_; - std::unordered_map> - inInitializationCommMap_; + std::unordered_map xcclStreamsMap_; + std::unordered_map xcclEventsMap_; std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; std::mutex mutex_; From 385c218f274509d36c6e3a8d1e6ece5511a5d13b Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 21 Oct 2024 01:54:13 +0000 Subject: [PATCH 85/96] refine code --- caffe2/CMakeLists.txt | 3 -- test/distributed/test_c10d_common.py | 13 +++--- torch/csrc/distributed/c10d/ProcessGroup.hpp | 17 ++++---- .../distributed/c10d/ProcessGroupXCCL.cpp | 39 +++++++++-------- .../distributed/c10d/ProcessGroupXCCL.hpp | 43 +++++++++++-------- torch/testing/_internal/common_distributed.py | 14 +++++- 6 files changed, 72 insertions(+), 57 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index b4ec018019f165..25bd7f700f68a2 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1376,9 +1376,6 @@ if(USE_DISTRIBUTED) endif() if(USE_XPU AND USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) - set_source_files_properties( - ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp - PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_ZE;CCL_ENABLE_SYCL") endif() if(USE_MPI AND USE_C10D_MPI) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 903df26bba9f6f..d3cb65f7befb1d 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -31,6 +31,7 @@ from torch.testing._internal.common_distributed import ( MultiProcessTestCase, skip_if_lt_x_gpu, + get_device_count, ) from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, @@ -60,17 +61,13 @@ torch.backends.cuda.matmul.allow_tf32 = False -def gpus_for_rank(world_size): +def gpus_for_rank(world_size, backend): """Multigpu tests are designed to simulate the multi nodes with multi GPUs on each node. Nccl backend requires equal #GPUs in each process. On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. 
""" - device_count = ( - torch.xpu.device_count() - if torch.xpu.is_available() - else torch.cuda.device_count() - ) + device_count = get_device_count(backend) visible_devices = list(range(device_count)) gpus_per_process = device_count // world_size gpus_for_rank = [] @@ -833,7 +830,7 @@ def update_parameters(model): def _gpu_model_with_ddp_comm_hook( self, process_group, hook=None, gradient_as_bucket_view=False, state=None ): - device_id = gpus_for_rank(self.world_size)[self.rank][0] + device_id = gpus_for_rank(self.world_size, process_group.name())[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], @@ -850,7 +847,7 @@ def _gpu_model_with_ddp_comm_hook( def _gpu_model_with_builtin_ddp_comm_hook( self, process_group, hook=None, gradient_as_bucket_view=False ): - device_id = gpus_for_rank(self.world_size)[self.rank][0] + device_id = gpus_for_rank(self.world_size, process_group.name())[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 31c974a061e4a2..b3eac70e871bf7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -131,6 +131,13 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { return backendType_; }; + inline bool backendSupportsSequenceNumbers(BackendType backendType) { + if (backendType == BackendType::GLOO || backendType == BackendType::NCCL || + backendType == BackendType::XCCL || backendType == BackendType::UCC) + return true; + return false; + } + virtual void startCoalescing(c10::DeviceType deviceType) { // only nccl has implemented startCoalescing so only execute for nccl // backends @@ -508,10 +515,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { virtual void setSequenceNumberForGroup() { auto backendType = getBackendType(); // TODO: HACK for backend name to get sequence number for that backend. - if (backendType == ProcessGroup::BackendType::GLOO || - backendType == ProcessGroup::BackendType::NCCL || - backendType == ProcessGroup::BackendType::XCCL || - backendType == ProcessGroup::BackendType::UCC) { + if (backendSupportsSequenceNumbers(backendType)) { getDefaultBackend()->setSequenceNumberForGroup(); } else { TORCH_CHECK( @@ -530,10 +534,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { auto backendType = getBackendType(); // TODO: HACK for backend name to get sequence number for that backend. 
- if (backendType == ProcessGroup::BackendType::GLOO || - backendType == ProcessGroup::BackendType::NCCL || - backendType == ProcessGroup::BackendType::XCCL || - backendType == ProcessGroup::BackendType::UCC) { + if (backendSupportsSequenceNumbers(backendType)) { return getDefaultBackend()->getSequenceNumberForGroup(); } else { TORCH_CHECK( diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 90fb4c3f9cbd75..41e4e43436270a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,17 +1,8 @@ #ifdef USE_C10D_XCCL #include +#include #include -#include -#include -#include -#include -#include -#include -#include - -#include -#include namespace c10d { @@ -89,10 +80,13 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL( at::Device& device, int rank, OpType opType, + uint64_t seq, + const char* profilingTitle, const std::optional>& inputs) - : Work(rank, opType, "profilingTitle", inputs), + : Work(rank, opType, profilingTitle, inputs), device_(device), - workStartTime_(std::chrono::steady_clock::now()) { + workStartTime_(std::chrono::steady_clock::now()), + seq_(seq) { xcclEndEvent_ = std::make_shared(); } @@ -101,7 +95,8 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL(const WorkXCCL& w) device_(w.device_), xcclEndEvent_(w.xcclEndEvent_), blockingWait_(w.blockingWait_), - workStartTime_(w.workStartTime_) {} + workStartTime_(w.workStartTime_), + seq_(w.seq_) {} ProcessGroupXCCL::WorkXCCL::~WorkXCCL() = default; @@ -156,10 +151,16 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( at::Device& device, int rank, OpType opType, + const char* profilingTitle, const std::vector& inputs, const std::vector& outputs) { auto r = c10::make_intrusive( - device, rank, opType, std::optional>(inputs)); + device, + rank, + opType, + seqCollective_, + profilingTitle, + std::optional>(inputs)); return r; } @@ -212,7 +213,10 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( Fn fn, PreProcess pre, PostProcess post, - OpType opType) { + OpType opType, + const char* profilingTitle) { + seqCollective_++; + auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device); @@ -221,7 +225,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( syncStream(device, xcclEventsMap_[key], stream); c10::intrusive_ptr work; - work = initWork(device, rank_, opType); + work = initWork(device, rank_, opType, profilingTitle); work->outputs_ = std::make_shared>(outputs); at::xpu::OptionalXPUGuard gpuGuard(device); @@ -273,7 +277,8 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( ccl_stream); return; }, - OpType::ALLREDUCE); + OpType::ALLREDUCE, + "xccl:all_reduce"); } } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 6e6eb16d62d620..f9761c652dc1a0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -1,33 +1,24 @@ #pragma once -#if defined(__linux__) -#include -#include -#include -#include -#endif - #ifdef USE_C10D_XCCL -#include +// We will define those flags in XCCL backend file instead of passing to gcc +// compiler. 
+#define CCL_ENABLE_ZE +#define CCL_ENABLE_SYCL + #include -#include #include -#include -#include - -#include -#include #include -#include #include #include -#include #include +#include +#include #include #include #include -#include +#include namespace c10d { static std::vector TORCH_XCCL_BLOCKING_WAIT = { @@ -45,6 +36,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Device& device, int rank, OpType opType, + uint64_t seq, + const char* profilingTitle = nullptr, const std::optional>& inputs = std::nullopt); WorkXCCL(const WorkXCCL& w); ~WorkXCCL() override; @@ -63,6 +56,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { return future_; } + uint64_t getSequencenumber() const override { + return seq_; + } + std::vector result() override { return *outputs_; } @@ -72,6 +69,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::shared_ptr xcclEndEvent_; bool blockingWait_ = false; std::chrono::time_point workStartTime_; + uint64_t seq_; private: void synchronizeInternal(std::chrono::milliseconds timeout); @@ -103,6 +101,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Device& device, int rank, OpType opType, + const char* profilingTitle = nullptr, const std::vector& inputs = {}, const std::vector& outputs = {}); @@ -111,7 +110,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { at::Tensor& input, at::Tensor& output, Fn fn, - OpType opType) { + OpType opType, + const char* profilingTitle = nullptr) { auto inputs = std::vector{input}; auto outputs = std::vector{output}; return collective( @@ -132,13 +132,17 @@ class TORCH_API ProcessGroupXCCL : public Backend { Fn fn, PreProcess pre, PostProcess post, - OpType opType); + OpType opType, + const char* profilingTitle = nullptr); c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; void setSequenceNumberForGroup() override {} + uint64_t getSequenceNumberForGroup() override { + return seqCollective_; + } protected: std::unordered_map xcclStreamsMap_; @@ -147,6 +151,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::intrusive_ptr store_; std::mutex mutex_; bool blockingWait_ = false; + uint64_t seqCollective_{0}; private: std::mutex kvs_mutex; diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 9ec38c9ca671c2..b0b506195b240f 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -93,8 +93,9 @@ class DistTestCases: # Sets showing that something is implemented backend_feature = {} - backend_feature["gpu"] = {"nccl", "gloo", "ucc"} + backend_feature["gpu"] = {"nccl", "gloo", "ucc", "xccl"} backend_feature["cuda"] = {"nccl", "gloo", "ucc"} + backend_feature["xpu"] = {"xccl"} backend_feature["ddp"] = {"nccl", "gloo", "ucc"} backend_feature["subgroup"] = {"nccl", "gloo", "ucc"} backend_feature["plugin"] = set() @@ -462,6 +463,15 @@ def compute_sum(fn, world_size: int): ] ] +# Returns the number of GPUs, currently only for CUDA and XPU. 
+def get_device_count(backend: str): + assert c10d.is_backend_available(backend) + if backend in DistTestCases.backend_feature.get("cuda", set()): + return torch.cuda.device_count() + elif backend in DistTestCases.backend_feature.get("xpu", set()): + return torch.xpu.device_count() + else: + raise ValueError(f"Unsupported backend: {backend}") # HELPER FOR MULTIGPU TESTS def init_multigpu_helper(world_size: int, backend: str): @@ -470,7 +480,7 @@ def init_multigpu_helper(world_size: int, backend: str): On a single node, all visible GPUs are evenly divided to subsets, each process only uses a subset. """ - nGPUs = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count() + nGPUs = get_device_count(backend) visible_devices = range(nGPUs) # If rank is less than or equal to number of available GPU's From e36a99c977a8784e9e671a5bb0b661172d2ba35d Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 21 Oct 2024 07:51:25 +0000 Subject: [PATCH 86/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 89f302595c4ac7..0628d3f3612f01 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1060,7 +1060,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( TORCH_CHECK_WITH( TypeError, input_tensor.dtype() == output_tensor.dtype(), - "output tensor must have the same type as input tensor"); + "input tensor must be the same type as the output tensor."); TORCH_CHECK_WITH( ValueError, input_tensor.numel() * size_ == output_tensor.numel(), @@ -1189,7 +1189,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( TORCH_CHECK_WITH( TypeError, inputTensor.dtype() == outputTensor.dtype(), - "output tensor must have the same type as input tensor"); + "input tensor must be the same type as the output tensor."); TORCH_CHECK_WITH( ValueError, inputTensor.numel() == outputTensor.numel() * size_, From 5096354f792e4c96b4eeac7664c561c416268be4 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 21 Oct 2024 08:37:13 +0000 Subject: [PATCH 87/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 0628d3f3612f01..dcfa15a1a6af0b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1060,7 +1060,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( TORCH_CHECK_WITH( TypeError, input_tensor.dtype() == output_tensor.dtype(), - "input tensor must be the same type as the output tensor."); + "output tensor must have the same type as input tensor"); TORCH_CHECK_WITH( ValueError, input_tensor.numel() * size_ == output_tensor.numel(), From 9e6448b5326f8736ae529b36d286a0c61e654baa Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 22 Oct 2024 01:26:22 +0000 Subject: [PATCH 88/96] add RECORD_PARAM_COMMS_DATA --- .../distributed/c10d/ProcessGroupXCCL.cpp | 294 +++++++++++++++++- 1 file changed, 293 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index dcfa15a1a6af0b..44dc7360265b55 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -99,6 +99,7 @@ void 
check_xpu_single_tensor( } } } + int64_t check_xpu_tensors_same_device(const std::vector& tensors) { TORCH_CHECK_WITH( ValueError, tensors.size() == 0, "Tensor list must be nonempty"); @@ -317,6 +318,20 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, xccl_kvs); XCCLComm = std::make_shared(std::move(comms[0])); + RECORD_PARAM_COMMS( + 0, // seq + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + rank, // rank + "init", // collective name + 0, // inNelems + 0, // outNelems + at::kByte, // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + size_); // worldSize + std::lock_guard lock(mutex_); devXCCLCommMap_.emplace(deviceKey, XCCLComm); xcclStreamsMap_.emplace(deviceKey, std::move(stream)); @@ -530,6 +545,23 @@ c10::intrusive_ptr ProcessGroupXCCL::send( auto tensor = tensors.back(); check_xpu_single_tensor(tensor, true); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + dstRank, // dst rank + "send", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + auto ret = pointToPoint( tensor, [&](at::Tensor& input, @@ -561,6 +593,23 @@ c10::intrusive_ptr ProcessGroupXCCL::recv( auto tensor = tensors.back(); check_xpu_single_tensor(tensor, true); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + srcRank, // src rank + "recv", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + auto ret = pointToPoint( tensor, [&](at::Tensor& output, @@ -628,6 +677,23 @@ c10::intrusive_ptr ProcessGroupXCCL::gather( outputs.emplace_back(); } + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensors, // inputTensors + outputTensors, // outputTensors + opts.rootRank, // root rank + "gather", // collective name + inputTensor.numel(), // inNelems + inputTensor.numel() * this->getSize(), // outNelems + inputTensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSize + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + auto inputs = std::vector{inputTensor}; return collective( inputs, @@ -722,6 +788,23 @@ c10::intrusive_ptr ProcessGroupXCCL::scatter( inputs.emplace_back(); } + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensors, // inputTensors + outputTensors, // outputTensors + opts.rootRank, // root rank + "scatter", // collective name + outputTensor.numel() * this->getSize(), // inNelems + outputTensor.numel(), // outNelems + outputTensor.scalar_type(), // dType + std::vector(), // 
inSplitSizes + std::vector(), // outSplitSize + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + const auto root = opts.rootRank; auto outputs = std::vector{outputTensor}; @@ -810,6 +893,24 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( auto tensor = tensors.back(); check_xpu_single_tensor(tensor); + // @lint-ignore CLANGTIDY + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + rank_, // rank + "allreduce", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + size_); // worldSize + return collective( tensor, tensor, @@ -836,7 +937,26 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { - check_xpu_tensors_same_device(tensors); + auto total_numel = check_xpu_tensors_same_device(tensors); + + // @lint-ignore CLANGTIDY + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + rank_, // rank + "allreduce_coalesced", // collective name + total_numel, // inNelems + total_numel, // outNelems + tensors[0].scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collectiveCoalesced( tensors, tensors, @@ -867,6 +987,24 @@ c10::intrusive_ptr ProcessGroupXCCL::broadcast( auto tensor = tensors.back(); check_xpu_single_tensor(tensor); + // @lint-ignore CLANGTIDY + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + opts.rootRank, // root rank + "broadcast", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + const auto root = opts.rootRank + opts.rootTensor; return collective( @@ -928,6 +1066,23 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce( auto tensor = tensors.back(); check_xpu_single_tensor(tensor); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + opts.rootRank, // root rank + "reduce", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collective( tensor, tensor, @@ -997,6 +1152,24 @@ c10::intrusive_ptr ProcessGroupXCCL::allgather( // @lint-ignore CLANGTIDY std::vector& outputTensors_ = outputTensors.back(); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + 
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensors, // inputTensors + outputTensors, // outputTensors + rank_, // rank + "all_gather", // collective name + inputTensor.numel(), // inNelems + inputTensor.numel() * // outNelems + this->getSize(), + inputTensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSize + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + bool same_size = check_same_size(outputTensors_); if (same_size) { // Flatten a vector of tensors into a single, stacked tensor. @@ -1066,6 +1239,23 @@ c10::intrusive_ptr ProcessGroupXCCL::_allgather_base( input_tensor.numel() * size_ == output_tensor.numel(), "output tensor size must be equal to world_size times input tensor size"); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + input_tensor, // inputTensors + output_tensor, // outputTensors + rank_, // rank + "_allgather_base", // collective name + input_tensor.numel(), // inNelems + output_tensor.numel(), // outNelems + output_tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSize + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collective( input_tensor, output_tensor, @@ -1125,6 +1315,23 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( // @lint-ignore CLANGTIDY auto inputTensors_ = inputTensors.back(); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensors, // inputTensors + outputTensors, // outputTensors + rank_, // rank + "reduce_scatter", // collective name + outputTensor.numel() * this->getSize(), // inNelems + outputTensor.numel(), // outNelems + outputTensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + bool same_size = check_same_size(inputTensors_); if (same_size) { // Flatten a vector of tensors into a single, stacked tensor. 
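The "flatten a vector of tensors into a single, stacked tensor" step referenced in the hunk above relies on a standard equivalence: reduce-scattering a list of same-sized per-rank tensors gives the same result as stacking them into one contiguous buffer and using the single-tensor variant. A minimal Python sketch of that equivalence, not part of the patch itself; it assumes an already-initialized default process group with `world_size` ranks and one XPU per rank, and the helper name is illustrative only:

import torch
import torch.distributed as dist

def reduce_scatter_two_ways(rank: int, world_size: int) -> torch.Tensor:
    # Hypothetical helper for illustration; names are not from the patch.
    device = torch.device(f"xpu:{rank}")
    # One same-sized contribution per destination rank.
    inputs = [torch.full((4,), float(rank + dst), device=device) for dst in range(world_size)]

    list_out = torch.empty(4, device=device)
    dist.reduce_scatter(list_out, inputs)            # list-of-tensors path (same_size case)

    stacked = torch.stack(inputs)                    # [world_size, 4]: the "flattened" buffer
    tensor_out = torch.empty(4, device=device)
    dist.reduce_scatter_tensor(tensor_out, stacked)  # single-tensor path

    assert torch.equal(list_out, tensor_out)
    return list_out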
@@ -1195,6 +1402,23 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( inputTensor.numel() == outputTensor.numel() * size_, "input tensor must be the same size as output size times world size"); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensor, // inputTensor + outputTensor, // outputTensor + rank_, // rank + "_reduce_scatter_base", // collective name + inputTensor.numel(), // inNelems + outputTensor.numel(), // outNelems + outputTensor.scalar_type(), // dtype + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collective( inputTensor, outputTensor, @@ -1250,6 +1474,20 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( } c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { + RECORD_PARAM_COMMS( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + rank_, // rank + "barrier", // collective name + 0, // inNelems + 0, // outNelems + at::kByte, // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize // Device to use for barrier int barDevIdx = -1; @@ -1292,6 +1530,23 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( check_xpu_single_tensor(outputTensor, true); check_xpu_single_tensor(inputTensor, true); if (outputSplitSizes.size() == 0 && inputSplitSizes.size() == 0) { + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensor, // inputTensor + outputTensor, // outputTensor + rank_, // rank + "all_to_all", // collective name + inputTensor.numel(), // inNelems + outputTensor.numel(), // outNelems + inputTensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize TORCH_CHECK( outputTensor.numel() == inputTensor.numel() && outputTensor.scalar_type() == inputTensor.scalar_type(), @@ -1324,6 +1579,24 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall_base( c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensor, // inputTensor + outputTensor, // outputTensor + rank_, // rank + "all_to_allv", // collective name + inputTensor.numel(), // inNelems + outputTensor.numel(), // outNelems + inputTensor.scalar_type(), // dType + inputSplitSizes, // inSplitSizes + outputSplitSizes, // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collective( inputTensor, outputTensor, @@ -1370,6 +1643,7 @@ c10::intrusive_ptr ProcessGroupXCCL::alltoall( std::vector& inputTensors, const AllToAllOptions& /* unused */) { auto device = outputTensors[0].device(); + int64_t total_numel = 0; for (const auto r : c10::irange(outputTensors.size())) { check_xpu_single_tensor(outputTensors[r], true); check_xpu_single_tensor(inputTensors[r], true); @@ -1377,8 +1651,26 @@ c10::intrusive_ptr 
ProcessGroupXCCL::alltoall( device == outputTensors[r].device() && device == inputTensors[r].device(), "Tensors must be on the same device") + total_numel += inputTensors[r].numel(); } + RECORD_PARAM_COMMS_DATA( + static_cast( + this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + inputTensors, // inputTensors + outputTensors, // outputTensors + rank_, // rank + "all_to_all", // collective name + total_numel, // inNelems + total_numel, // outNelems + inputTensors.front().scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + -1, // globalRankStart + -1, // globalRankStride + this->getSize()); // worldSize + return collective( inputTensors, outputTensors, From 8d9c24e19143ac8aa9809a0fb2f2e92b5e473efd Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 25 Oct 2024 00:38:50 +0000 Subject: [PATCH 89/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index c8fa11442c692e..2f83fe8f248bd4 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -309,6 +309,10 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) override; + void setSequenceNumberForGroup() override; + + uint64_t getSequenceNumberForGroup() override; + protected: std::unordered_map xcclStreamsMap_; std::unordered_map xcclEventsMap_; @@ -322,6 +326,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { bool blockingWait_ = false; static thread_local uint64_t xcclActiveGroupCounter_; uint64_t seqCollective_{0}; + uint64_t seqP2P_{0}; private: std::mutex kvs_mutex; From e808b6c2857f8b8034ba0cf24d5cd047efa4851a Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 25 Oct 2024 00:40:08 +0000 Subject: [PATCH 90/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 44dc7360265b55..4081529d486d33 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -246,6 +246,12 @@ ProcessGroupXCCL::ProcessGroupXCCL( ProcessGroupXCCL::~ProcessGroupXCCL() = default; +void ProcessGroupXCCL::setSequenceNumberForGroup() {} + +uint64_t ProcessGroupXCCL::getSequenceNumberForGroup() { + return seqCollective_; +} + c10::intrusive_ptr ProcessGroupXCCL::initWork( at::Device& device, int rank, @@ -353,6 +359,11 @@ void ProcessGroupXCCL::groupEnd() { // TODO: wait p2p enable static constexpr int CoalActive = 0x01, CoalColl = 0x02, CoalP2P = 0x04; void ProcessGroupXCCL::startCoalescing() { + if (coalescing_state_ & CoalP2P) { + seqP2P_++; + } else { + seqCollective_++; + } coalescedDevice_.set_index(-1); coalescedComm_ = nullptr; coalescing_state_ |= CoalActive; @@ -402,6 +413,7 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( PostProcess post, OpType opType, const char* profilingTitle) { + seqCollective_++; auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); @@ -480,6 +492,9 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( p2pRank = rank_ <= peer ? 0 : 1; isSendRecvSelf = rank_ == peer; p2pTargetRank = isSendRecvSelf ? 
0 : 1 - p2pRank; + if (!coalescing_state_) { + seqP2P_++; + } } auto comm = getXCCLComm(key, device, opType, p2pRank, isSendRecvSelf); From 193d9463c1a0dec192f1f100f313dd02df4a8ca8 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 28 Oct 2024 08:25:20 +0000 Subject: [PATCH 91/96] update --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 4081529d486d33..04c936cb02c31b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -102,7 +102,7 @@ void check_xpu_single_tensor( int64_t check_xpu_tensors_same_device(const std::vector& tensors) { TORCH_CHECK_WITH( - ValueError, tensors.size() == 0, "Tensor list must be nonempty"); + ValueError, tensors.size() != 0, "Tensor list must be nonempty"); const auto& first = tensors.front(); From eb447f2bffb775037a53f250f8b485f48a8b6c35 Mon Sep 17 00:00:00 2001 From: "Han, Chao1" Date: Thu, 31 Oct 2024 20:59:35 +0800 Subject: [PATCH 92/96] fix all_gather_v bug --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 04c936cb02c31b..b920895342dd91 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1063,6 +1063,7 @@ c10::intrusive_ptr ProcessGroupXCCL::_broadcast_oop( auto xcclDataType = getXcclDataType(input.scalar_type()); ccl::broadcast( input.data_ptr(), + output.data_ptr(), (size_t)input.numel(), xcclDataType, root, From 20b60b1809572017a6449ef7bc9ac3a3f58c516a Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 11 Nov 2024 07:24:19 +0000 Subject: [PATCH 93/96] correct get kvs --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 3 +-- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 16 ++++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index b920895342dd91..1527de6fe3f284 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -292,7 +292,6 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( } std::shared_ptr XCCLComm; - XCCL_KVS kvs = get_kvs(rank_, *store_); bool batchP2P = xcclActiveGroupCounter_ > 0; bool singleP2POp = isP2POp(opType, batchP2P); @@ -320,7 +319,7 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( ccl::vector_class> devs_rank; devs_rank.emplace_back(rank, ccl::create_device(q.get_device())); - auto xccl_kvs = get_kvs(rank_, *store_); + auto xccl_kvs = get_kvs(rank_, *store_, singleP2POp, deviceKey, p2pRank); auto comms = ccl::create_communicators(numRanks, devs_rank, ctx, xccl_kvs); XCCLComm = std::make_shared(std::move(comms[0])); diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index 2f83fe8f248bd4..cbbd724f88c6bb 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -330,15 +330,19 @@ class TORCH_API ProcessGroupXCCL : public Backend { private: std::mutex kvs_mutex; - ccl::shared_ptr_class kvs; - ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store) { + ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store, + bool singleP2POp = false, const std::string& 
p2pKey = "", int p2pRank = 0) { std::lock_guard lock(kvs_mutex); - if (kvs) - return kvs; - std::string storeKey = "xccl_kvs"; + ccl::shared_ptr_class kvs; + std::string storeKey; + if (!singleP2POp) { + storeKey = "xccl_kvs"; + } else { + storeKey = p2pKey; + } // Rank 0 broadcast the bootstrap network information to other ranks - if (rank == 0) { + if (rank == 0 || (singleP2POp && p2pRank == 0)) { kvs = ccl::create_main_kvs(); ccl::kvs::address_type main_addr = kvs->get_address(); auto ccl_kvs_addr = From b442419da4f529083b418be8d6b5cd1769423390 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 12 Nov 2024 02:23:48 +0000 Subject: [PATCH 94/96] update kvs key --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 1527de6fe3f284..f202f8916f89fd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -239,7 +239,7 @@ ProcessGroupXCCL::ProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, int size) - : Backend(rank, size), store_(store) { + : Backend(rank, size), store_(store), xcclCommCounter_(0) { blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index cbbd724f88c6bb..c30ca603c7ba07 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -318,6 +318,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::unordered_map xcclEventsMap_; std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; + uint64_t xcclCommCounter_{0}; std::mutex mutex_; std::set usedDeviceIdxs_; int coalescing_state_ = 0; @@ -331,15 +332,19 @@ class TORCH_API ProcessGroupXCCL : public Backend { private: std::mutex kvs_mutex; - ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store, - bool singleP2POp = false, const std::string& p2pKey = "", int p2pRank = 0) { + ccl::shared_ptr_class get_kvs( + int rank, + c10d::Store& store, + bool singleP2POp = false, + const std::string& p2pKey = "", + int p2pRank = 0) { std::lock_guard lock(kvs_mutex); ccl::shared_ptr_class kvs; std::string storeKey; if (!singleP2POp) { - storeKey = "xccl_kvs"; + storeKey = std::to_string(xcclCommCounter_++); } else { - storeKey = p2pKey; + storeKey = p2pKey; } // Rank 0 broadcast the bootstrap network information to other ranks if (rank == 0 || (singleP2POp && p2pRank == 0)) { From 65e0d9d7946716a829c04777954b7ab134bdf472 Mon Sep 17 00:00:00 2001 From: hanchao Date: Thu, 14 Nov 2024 08:48:03 +0000 Subject: [PATCH 95/96] WA AVG reduction --- test/distributed/test_c10d_ops_xccl.py | 10 +++++ .../distributed/c10d/ProcessGroupXCCL.cpp | 44 +++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/test/distributed/test_c10d_ops_xccl.py b/test/distributed/test_c10d_ops_xccl.py index 279ec0eb03ecf8..9784cf3a5c0bea 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -155,6 +155,16 @@ def allreduce(tensors, op): tensors[0], ) + # Avg + tensors = [torch.tensor([self.rank + 1.0]).xpu(local_device_id)] + + allreduce(tensors, c10d.ReduceOp.AVG) + ndev = self.world_size + self.assertEqual( + torch.tensor([ndev * (ndev + 1.0) / (2.0 * ndev)]), + tensors[0], + ) + # Product tensors = 
[torch.tensor([self.rank + 1]).xpu(local_device_id)] diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index f202f8916f89fd..b2a900c92b8c0b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -147,6 +147,10 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) { // Map sum to max for bool tensors to avoid overflow issues with sum. return ccl::reduction::max; } + // WA due to oneCCL not support AVG + if (reduceOp == ReduceOp::AVG) { + return ccl::reduction::sum; + } return xcclOps.at(reduceOp); } catch (const std::out_of_range&) { C10_THROW_ERROR( @@ -894,6 +898,11 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_impl( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::ALLREDUCE, @@ -942,6 +951,11 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::ALLREDUCE, @@ -988,6 +1002,11 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce_coalesced( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::COALESCED, @@ -1117,6 +1136,11 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce( root, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG && getRank() == root) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::REDUCE, @@ -1150,6 +1174,11 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_oop( root, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG && getRank() == root) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::REDUCE, @@ -1370,6 +1399,11 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, [&](at::xpu::XPUStream& Stream, @@ -1453,6 +1487,11 @@ c10::intrusive_ptr ProcessGroupXCCL::_reduce_scatter_base( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::_REDUCE_SCATTER_BASE, @@ -1482,6 +1521,11 @@ c10::intrusive_ptr ProcessGroupXCCL::reduce_scatter_tensor_coalesced( xcclReduceOp, comm, ccl::create_stream(stream.queue())); + // WA due to oneCCL not support AVG + if (opts.reduceOp == ReduceOp::AVG) { + auto divisor = getSize(); + output.div_(divisor); + } return; }, OpType::COALESCED, From 3e97e67847d6f5486a4fe58d06fc1fcb21f59d82 Mon Sep 17 00:00:00 2001 From: hanchao Date: Fri, 15 Nov 2024 01:04:43 +0000 Subject: [PATCH 96/96] update test case --- test/distributed/test_c10d_ops_xccl.py | 4 +- test/distributed/test_c10d_xccl.py | 1424 +++++++++++++++++++++++- 2 files changed, 1399 insertions(+), 29 deletions(-) diff --git a/test/distributed/test_c10d_ops_xccl.py 
b/test/distributed/test_c10d_ops_xccl.py index 9784cf3a5c0bea..6a600aa595f7e7 100644 --- a/test/distributed/test_c10d_ops_xccl.py +++ b/test/distributed/test_c10d_ops_xccl.py @@ -44,6 +44,7 @@ TEST_MULTIGPU = TEST_XPU and torch.xpu.device_count() >= 2 + class ProcessGroupXCCLOpTest(MultiProcContinousTest): @classmethod def backend_str(cls) -> str: @@ -256,7 +257,6 @@ def reduce(xs, rootRank, rootTensor, op=None): ): reduce(tensors, self.rank, rt, op) - @requires_xccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") def test_allgather_ops(self): @@ -710,7 +710,6 @@ def perm(n, k): expected = torch.tensor(prod_val) self.assertEqual(expected, output_tensor) - @requires_xccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs") def test_reduce_scatter_base_ops(self): @@ -830,4 +829,3 @@ def test_send_recv_object_list(self): nprocs=world_size, args=(world_size, rdvz_file), ) - diff --git a/test/distributed/test_c10d_xccl.py b/test/distributed/test_c10d_xccl.py index 704cdd414e554b..3503f6059f2825 100644 --- a/test/distributed/test_c10d_xccl.py +++ b/test/distributed/test_c10d_xccl.py @@ -1,14 +1,25 @@ # Owner(s): ["oncall: distributed"] +import copy import math import os +import random import sys import time from datetime import timedelta +from enum import auto, Enum +from itertools import product from unittest import mock +from test_c10d_common import DoubleGpuNet, gpus_for_rank, ModuleForDdpCommHook + import torch import torch.distributed as c10d +import torch.distributed.algorithms.ddp_comm_hooks.default_hooks as default +import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD +import torch.nn.functional as F +from torch import nn +from torch.nn.parallel import DistributedDataParallel if not c10d.is_available() or not c10d.is_xccl_available(): @@ -23,8 +34,11 @@ init_multigpu_helper, MultiProcessTestCase, requires_xccl, + skip_if_lt_x_gpu, ) from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, retry_on_connect_failures, run_tests, skip_but_pass_in_sandcastle_if, @@ -267,37 +281,1395 @@ def test_set_process_group_desc(self): pg_2 = c10d.new_group([0, 1]) self.assertEqual(pg_2.group_desc, "undefined") - def _test_allreduce_basics(self, fn): - pg = self._create_process_group_xccl() - device = torch.device("xpu:" + str(self.rank)) - # Single input tests - tests = simple_reduce_tests(self.rank, self.world_size) - for op, input, expected in tests: - opts = c10d.AllreduceOptions() - opts.reduceOp = op - tensor = fn(input.to(device)) - fut = pg.allreduce([tensor], opts).get_future() - fut.wait() - result = fut.value() - self.assertEqual(expected, result[0], exact_dtype=False) - x = fn(torch.tensor([self.rank + 1.0], device=device)) - fut = pg.allreduce(x).get_future() - fut.wait() - result = fut.value() - self.assertEqual( - torch.tensor([float(self.world_size * (self.world_size + 1) / 2)]), - result[0], +class DistributedDataParallelTest( + test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase +): + def setUp(self): + super().setUp() + self._spawn_processes() + + def _get_process_group(self): + store = self._get_store() + c10d.init_process_group( + "xccl", store=store, rank=self.rank, world_size=self.world_size + ) + return c10d.distributed_c10d._get_default_group() + + def _test_xccl_backend( + self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False + ): + process_group = self._get_process_group() + 
self._test_ddp_with_process_group( + process_group, devices, device_ids, multi_device, gradient_as_bucket_view ) @requires_xccl() - def test_allreduce_basics(self): - self._test_allreduce_basics(lambda t: t.clone()) + @skip_if_lt_x_gpu(2) + def test_xccl_backend_multi_device_ids_not_allowed(self): + int_devices = list(range(torch.xpu.device_count())) + devices = [torch.device("xpu:" + str(i)) for i in int_devices] + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." + ): + self._test_xccl_backend(devices, int_devices) + @requires_xccl() + @skip_if_lt_x_gpu(4) + def test_ddp_multi_device_module_config(self): + gpus = gpus_for_rank(self.world_size, "xccl")[self.rank] -if __name__ == "__main__": - assert ( - not torch.xpu._initialized - ), "test_distributed must not have initialized XPU context on main process" + self.assertTrue(len(gpus) >= 2, "expecting at least 2 gpus per process") + + process_group = self._get_process_group() + + gpus = gpus[:2] + model = DoubleGpuNet(gpus) + + with self.assertRaisesRegex( + ValueError, + "DistributedDataParallel device_ids and output_device arguments only work with " + "single-device/multiple-device GPU modules or CPU modules", + ): + ddp_model = DistributedDataParallel( + model, output_device=gpus[1], process_group=process_group + ) + + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." + ): + ddp_model = DistributedDataParallel( + model, device_ids=gpus, process_group=process_group + ) + + with self.assertRaisesRegex( + ValueError, "input module must be on the same type of devices" + ): + model.fc1 = model.fc1.cpu() + ddp_model = DistributedDataParallel(model, process_group=process_group) + + model = model.cpu() + with self.assertRaisesRegex( + ValueError, "device_ids can only be None or contain a single element." + ): + ddp_model = DistributedDataParallel( + model, device_ids=gpus, process_group=process_group + ) + + def _test_fp16(self, gradient_as_bucket_view=False): + process_group = self._get_process_group() + + gpus = gpus_for_rank(self.world_size, "xccl")[self.rank] + model = nn.Linear(1, 1, bias=False).xpu(gpus[0]).half() + nn.init.constant_(model.weight, 1) + ddp_model = DistributedDataParallel( + model, + device_ids=[gpus[0]], + process_group=process_group, + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + # Input 2**15, so that the gradients will overflow with a + # world_size of 2, unless we normalize the gradient by the + # world_size before the reduction + input = torch.tensor([[2**15]]).xpu(gpus[0]).half() + + # Step model + ddp_model.train() + output = ddp_model(input) + loss = output.sum() + loss.backward() + + self.assertFalse(any(torch.isinf(p.grad).any() for p in ddp_model.parameters())) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_fp16(self): + self._test_fp16() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_fp16_grad_is_view(self): + self._test_fp16(gradient_as_bucket_view=True) + + def _test_arbitrary_forward_return_value(self, gradient_as_bucket_view=False): + """ + Note: this test can be sped up by only running it on a CPU module + once DistributedDataParallel supports them. 
+ """ + process_group = self._get_process_group() + + class ForwardReturnValueModule(nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.fc3 = nn.Linear(4, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x, fn): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + # The first softmax does NOT include fc3 in its autograd graph + # whereas the second softmax DOES. If we pass only the first + # tensor we see in the output to the reducer, it marks the + # gradient for fc3 as ready (because it doesn't show up). If + # downstream uses of this return value choose to differentiate + # against the second output tensor, it would still receive a + # gradient and a callback for this tensor, resulting in a crash. + return fn( + F.softmax(x, dim=1), + F.softmax(self.fc3(x), dim=1), + ) + + device_id = gpus_for_rank(self.world_size, "xccl")[self.rank][0] + model = DistributedDataParallel( + ForwardReturnValueModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + # Always run "backward" to ensure the reducer is called by autograd. + # If we don't correctly capture the output tensors from the return value, + # the reducer won't see a hook for the unused parameter, and throw an error. + # The correct capture is what we're testing in this function. + def test(box, unbox): + output = model(input, fn=box) + loss = criterion(unbox(output), target) + loss.backward() + + # Test with identity return value + test( + box=lambda x, y: (x, y), + unbox=lambda obj: obj[1], + ) + + # Test with list return value + test( + box=lambda x, y: ["foo", x, "bar", y], + unbox=lambda obj: obj[3], + ) + + # Test with tuple return value + test( + box=lambda x, y: ("foo", x, "bar", y), + unbox=lambda obj: obj[3], + ) + + # Test with dict return value + test( + box=lambda x, y: {"foo": "bar", "a": x, "b": y}, + unbox=lambda obj: obj["b"], + ) + + # Test with list with dict return value + test( + box=lambda x, y: ["foo", "bar", {"a": x, "b": y}], + unbox=lambda obj: obj[2]["b"], + ) + + # Test with dict with list return value + test( + box=lambda x, y: {"foo": "bar", "list": [0, x, 1, y]}, + unbox=lambda obj: obj["list"][3], + ) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_arbitrary_forward_return_value(self): + self._test_arbitrary_forward_return_value() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_arbitrary_forward_return_value_grad_is_view(self): + self._test_arbitrary_forward_return_value(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_with_lazy_parameters(self): + process_group = self._get_process_group() + with self.assertRaisesRegex( + RuntimeError, "Modules with uninitialized parameters" + ): + DistributedDataParallel( + torch.nn.LazyLinear(10), process_group=process_group + ) + + def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False): + """ + Note: this test can be sped up by only running it on a CPU module + once DistributedDataParallel supports them. 
+ """ + process_group = self._get_process_group() + + class MultipleOutputModule(nn.Module): + def __init__(self) -> None: + super().__init__() + + def define_module(): + return nn.Sequential( + nn.Linear(2, 10, bias=False), + nn.ReLU(), + nn.Linear(10, 4, bias=False), + nn.ReLU(), + ) + + self.module0 = define_module() + self.module1 = define_module() + + def forward(self, x): + return ( + F.softmax(self.module0(x), dim=1), + F.softmax(self.module1(x), dim=1), + ) + + device_id = gpus_for_rank(self.world_size, "xccl")[self.rank][0] + model = DistributedDataParallel( + MultipleOutputModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + # Compute loss and gradients for both outputs + output1, output2 = model(input) + loss1 = criterion(output1, target) + loss1.backward() + loss2 = criterion(output2, target) + loss2.backward() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_multiple_outputs_multiple_backward(self): + self._test_multiple_outputs_multiple_backward() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_multiple_outputs_multiple_backward_grad_is_view(self): + self._test_multiple_outputs_multiple_backward(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_no_grad(self): + """ + Note: this test can be sped up by only running it on a CPU module + once DistributedDataParallel supports them. + """ + process_group = self._get_process_group() + + class NoGradModule(nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + device_id = gpus_for_rank(self.world_size, "xccl")[self.rank][0] + model = DistributedDataParallel( + NoGradModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + ) + + batch_size = 4 + input = torch.rand([batch_size, 2], dtype=torch.float) + + def check_no_grads(): + for p in model.parameters(): + self.assertTrue(p.requires_grad) + self.assertIsNone(p.grad) + + # After initialization, no parameter has their gradient set. + check_no_grads() + + # Run `forward` function with torch.no_grad() + with torch.no_grad(): + output = model(input) + self.assertTrue(isinstance(output, torch.Tensor)) + + # No parameter should have their gradient set. + check_no_grads() + + def _test_accumulate_gradients_module(self, gradient_as_bucket_view=False): + # This is NOT the recommended way to implement accumulating grads, but + # we would like to make sure DDP does not mess up with the underlying + # module. 
+ int_devices = gpus_for_rank(self.world_size, "xccl")[self.rank][:1] + devices = [torch.device("xpu:" + str(i)) for i in int_devices] + process_group = self._get_process_group() + global_batch_size = self.world_size + + model, ddp_model, input, target = self._prepare_single_device_module( + process_group, devices, devices, global_batch_size, gradient_as_bucket_view + ) + + def step_model(model, input, target): + model.train() + output = model(input) + loss = F.mse_loss(output, target.to(output.device)) + loss.backward() + + # ensure accumulate grads works with no_grad + with torch.no_grad(): + ddp_model.train() + ddp_model.module(input) + + # Check two model parameters over 4 iterations. + # Use 4 iterations because we alternate between reducing and + # not reducing and want to make sure we switch both ways. + for iteration in range(4): + step_model(model, input, target) + + if iteration % 2 == 0: + # Skip gradients sync without calling prepare_for_backward + step_model( + ddp_model.module, + input[self.rank : (self.rank + 1)], + target[self.rank : (self.rank + 1)], + ) + for i, j in zip(model.parameters(), ddp_model.parameters()): + self.assertNotEqual(i.grad, j.grad) + else: + step_model( + ddp_model, + input[self.rank : (self.rank + 1)], + target[self.rank : (self.rank + 1)], + ) + for i, j in zip(model.parameters(), ddp_model.parameters()): + self.assertEqual(i.grad, j.grad, rtol=1.3e-06, atol=5e-5) + + # Shuffle the input so that DDP input is different + torch.manual_seed(1337 + iteration) + input = input[torch.randperm(global_batch_size)] + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_failure_recovery(self): + process_group = self._get_process_group() + + # need to create a separate file for the recovered FileStore, because + # the original one will be deleted when destructing the first FileStore. 
+ recovery_filename = self.file_name + "_recovery" + if self.rank == 0: + # the file will be deleted by the recovered FileStore + open(recovery_filename, "w").close() + + # not necessary to run barrier here, as DDP will synchronize + + class TestModel(nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + device_id = gpus_for_rank(self.world_size, "xccl")[self.rank][0] + model = TestModel().float().to(device_id) + ddp = DistributedDataParallel( + model, + device_ids=[device_id], + process_group=process_group, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + for _ in range(6): + output = ddp(input) + loss = criterion(output, target) + loss.backward() + + del ddp + c10d.destroy_process_group(process_group) + + store = c10d.FileStore(recovery_filename, self.world_size) + c10d.init_process_group( + "xccl", store=store, rank=self.rank, world_size=self.world_size + ) + process_group = c10d.distributed_c10d._get_default_group() + ddp = DistributedDataParallel( + model, + device_ids=[device_id], + process_group=process_group, + ) + + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + for _ in range(6): + output = ddp(input) + loss = criterion(output, target) + loss.backward() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_pass_default_pg(self): + dist.init_process_group( + "xccl", + init_method=f"file://{self.file_name}", + world_size=self.world_size, + rank=self.rank, + ) + + default_pg = c10d.distributed_c10d._get_default_group() + dist.destroy_process_group(default_pg) + self.assertFalse(dist.is_initialized()) + + def _gpu_model_with_ddp_comm_hook( + self, + process_group, + hook=None, + gradient_as_bucket_view=False, + state=None, + static_graph=False, + ): + device_id = gpus_for_rank(self.world_size, "xccl")[self.rank][0] + gpu_model = DistributedDataParallel( + ModuleForDdpCommHook().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + static_graph=static_graph, + ) + + # Register a DDP communication hook if any. + if hook is not None: + gpu_model.register_comm_hook(state, hook) + + return gpu_model + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_future_passing_gpu_xccl(self): + """ + This unit test verifies whether the Future object is passed properly using xccl backend. + The hook callback function creates a Future object and sets a value to it. + """ + process_group = self._get_process_group() + + # Get GPU model with simple_hook registered. + gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, self._simple_hook) + + # check whether the grads are equal to what simple_hook's then callback returns. + # without the comm_hook, result would be 0.25 * torch.ones(2, 2). 
+ self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2)) + + def _test_ddp_comm_hook_allreduce_hook_xccl( + self, gradient_as_bucket_view=False, static_graph=False + ): + """ + This unit test verifies whether a DDP communication hook that just calls + allreduce gives the same result with the case of no hook registered. + Without the then callback, the future_value in reducer is no longer + a PyObject, and this unit test verifies future_value is properly checked. + """ + process_group = self._get_process_group() + + def allreduce_hook( + state: object, bucket: dist.GradBucket + ) -> torch.futures.Future[torch.Tensor]: + tensors = [bucket.buffer() / self.world_size] + return ( + process_group.allreduce(tensors) + .get_future() + .then(lambda fut: fut.value()[0]) + ) + + # Get GPU model with allreduce_hook registered. + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, allreduce_hook, gradient_as_bucket_view, static_graph + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_default_ddp_comm_hooks_xccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether default Python DDP communication hooks ALLREDUCE, FP16_COMPRESS + and BF16_COMPRESS, can give the same result with the case of no hook registered. + """ + process_group = self._get_process_group() + + # For these default DDP comm hooks, the only state is process group. + state = process_group + hook_options = [default.allreduce_hook, default.fp16_compress_hook] + if c10d.is_xccl_available(): + hook_options.append(default.bf16_compress_hook) + for hook in hook_options: + # Get GPU model with the hook registered. + # The first arg 'process_group' is used for initializing the test environment, + # so it cannot be replaced by 'state', although they have the same value. + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, hook, gradient_as_bucket_view, state + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_fp16_compress_wrapper(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether wrapping the ALLREDUCE and POWER_SGD hooks with + the FP16_WRAPPER can give the same result as when there is no hook registered. + """ + process_group = self._get_process_group() + powerSGD_state = powerSGD.PowerSGDState(process_group=process_group) + + hook_args = [ + (powerSGD.powerSGD_hook, powerSGD_state), + (default.allreduce_hook, process_group), + ] + + for hook, state in hook_args: + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, + default.fp16_compress_wrapper(hook), + gradient_as_bucket_view, + state, + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_bf16_compress_wrapper(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether wrapping the ALLREDUCE and POWER_SGD hooks with + the BF16_WRAPPER can give the same result as when there is no hook registered. 
+ """ + process_group = self._get_process_group() + powerSGD_state = powerSGD.PowerSGDState(process_group=process_group) + + hook_args = [ + (powerSGD.powerSGD_hook, powerSGD_state), + (default.allreduce_hook, process_group), + ] + + for hook, state in hook_args: + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, + default.bf16_compress_wrapper(hook), + gradient_as_bucket_view, + state, + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_powerSGD_ddp_comm_hook_xccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether Python DDP communication hook POWER_SGD + can give the same result with the case of no hook registered. + """ + process_group = self._get_process_group() + + # Get GPU model with the hook registered. + # Test the hook with different algorithmic configs. + for use_error_feedback, warm_start, batch_tensors_with_same_shape in product( + [True, False], + [True, False], + [True, False], + ): + state = powerSGD.PowerSGDState( + process_group=process_group, + matrix_approximation_rank=1, + use_error_feedback=use_error_feedback, + warm_start=warm_start, + batch_tensors_with_same_shape=batch_tensors_with_same_shape, + ) + for hook in [powerSGD.powerSGD_hook, powerSGD.batched_powerSGD_hook]: + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, hook, gradient_as_bucket_view, state + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_builtin_ddp_comm_hooks_xccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether built-in C++ DDP communication hooks ALLREDUCE and FP16_COMPRESS + can give the same result with the case of no hook registered. + """ + process_group = self._get_process_group() + + for comm_hook_type in [ + dist.BuiltinCommHookType.ALLREDUCE, + dist.BuiltinCommHookType.FP16_COMPRESS, + ]: + # Get GPU model with the built-in communication hook. + gpu_model = self._gpu_model_with_builtin_ddp_comm_hook( + process_group, comm_hook_type, gradient_as_bucket_view + ) + + # check whether the grads are equal to what DDP without hook would return. 
+ self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_hook_xccl(self): + self._test_ddp_comm_hook_allreduce_hook_xccl() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_default_ddp_comm_hooks_xccl(self): + self._test_default_ddp_comm_hooks_xccl() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_fp16_compress_wrapper_xccl(self): + self._test_fp16_compress_wrapper() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_bf16_compress_wrapper_xccl(self): + self._test_bf16_compress_wrapper() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_builtin_ddp_comm_hooks_xccl(self): + self._test_builtin_ddp_comm_hooks_xccl() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_powerSGD_ddp_comm_hook_xccl(self): + self._test_powerSGD_ddp_comm_hook_xccl() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_hook_xccl_grad_is_view(self): + self._test_ddp_comm_hook_allreduce_hook_xccl(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_hook_xccl_static_graph(self): + self._test_ddp_comm_hook_allreduce_hook_xccl(static_graph=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_default_ddp_comm_hooks_xccl_is_view(self): + self._test_default_ddp_comm_hooks_xccl(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_fp16_compress_wrapper_is_view(self): + self._test_fp16_compress_wrapper(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_bf16_compress_wrapper_is_view(self): + self._test_bf16_compress_wrapper(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_builtin_ddp_comm_hooks_xccl_grad_is_view(self): + self._test_builtin_ddp_comm_hooks_xccl(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_powerSGD_ddp_comm_hook_xccl_grad_is_view(self): + self._test_powerSGD_ddp_comm_hook_xccl(gradient_as_bucket_view=True) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_with_then_hook_xccl(self): + """ + This unit test verifies whether a DDP communication hook that calls allreduce and then + multiplies the result by ten and divides by two gives the expected result. + """ + process_group = self._get_process_group() + + def allreduce_with_then_hook( + state: object, bucket: dist.GradBucket + ) -> torch.futures.Future[torch.Tensor]: + tensors = [bucket.buffer() / self.world_size] + fut = process_group.allreduce(tensors).get_future() + + def mult(fut): + # Multiply the result by 10. + return 10 * fut.value()[0] + + def div(fut): + # Divide the result by 2. + return 0.5 * fut.value() + + return fut.then(mult).then(div) + + # Get GPU model with allreduce_with_then_hook registered. + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, allreduce_with_then_hook + ) + + # check whether the grads are equal to what allreduce returns multiplied by 5. + # without the comm_hook, result would be still 0.25 * torch.ones(2, 2). 
+ self._run_and_verify_hook(gpu_model, 8, 1.25 * torch.ones(2, 2)) + + class AcceptsParam(torch.nn.Module): + def __init__(self, p, factor): + super().__init__() + self.a = p + self.f = factor + + def forward(self, input): + return input + self.a * self.f + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_weight_sharing(self): + process_group = self._get_process_group() + + size = 2048 * 2048 + dev = self.rank + world = self.world_size + + p = torch.nn.Parameter(torch.randn(size, requires_grad=True)) + + for try_set_to_none, use_bucket_view in product((False, True), (False, True)): + m = torch.nn.Sequential( + self.AcceptsParam(p, dev + 1), self.AcceptsParam(p, dev + 1) + ).xpu(dev) + + m = torch.nn.parallel.DistributedDataParallel( + m, + bucket_cap_mb=1, + gradient_as_bucket_view=use_bucket_view, + device_ids=[dev], + process_group=process_group, + ) + + for i in range(3): + m.zero_grad(set_to_none=try_set_to_none) + m(1).sum().backward() + + # Each param value is multiplied by "rank + 1" twice in forward, so the grad + # values produced by a particular rank should be 2. * (rank + 1). + # Summing these over ranks and dividing by world size gives the expected result: + analytic = torch.full_like( + p, 2.0 * (world * (world + 1.0) / 2.0) / world, device=dev + ) + for name, p in m.named_parameters(): + self.assertEqual( + p.grad, + analytic, + "mismatch at " + + name + + ".grad for " + + f"set_to_none = {try_set_to_none}, use_bucket_view = {use_bucket_view}", + ) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_ddp_packed_sequence(self): + """ + Tests that DDP with ``device_ids`` specified can run a forward and + backward pass with ``PackedSequence`` s with parity compared to a local + version of the model. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = dist.init_process_group( + "xccl", + world_size=self.world_size, + rank=self.rank, + store=store, + ) + seqs = ["sequence_sequence", "seq", "sequence"] + vocab = [""] + sorted({ch for seq in seqs for ch in seq}) + vectorized_seqs = [[vocab.index(tok) for tok in seq] for seq in seqs] + # Set the seed to make the embedding and LSTM deterministic (even + # across ranks since DDP broadcasts parameters from rank 0) + torch.manual_seed(0) + embed = nn.Embedding(len(vocab), 4) # keep on CPU + lstm = nn.LSTM(input_size=4, hidden_size=2, batch_first=True).to(self.rank) + lstm_ddp = DistributedDataParallel( + copy.deepcopy(lstm), + device_ids=[self.rank], + process_group=process_group, + ) + for p1, p2 in zip(lstm.parameters(), lstm_ddp.module.parameters()): + self.assertEqual(p1, p2) + seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs))) + seq_tensor = torch.Tensor( + torch.zeros((len(vectorized_seqs), seq_lengths.max())) + ).long() + for i, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)): + seq_tensor[i, :seq_len] = torch.LongTensor(seq) + seq_lengths, permutation_idx = seq_lengths.sort(0, descending=True) + seq_tensor = seq_tensor[permutation_idx] + embedded_seq_tensor = embed(seq_tensor) + packed_input = torch.nn.utils.rnn.pack_padded_sequence( + embedded_seq_tensor, + seq_lengths, + batch_first=True, + ) + packed_input_ddp = torch.nn.utils.rnn.pack_padded_sequence( + embedded_seq_tensor.detach().clone(), + seq_lengths, + batch_first=True, + ) + # Move the input to GPU explicitly for the local model + packed_output, (ht, ct) = lstm(packed_input.to(self.rank)) + # Let DDP move the input to GPU internally + packed_output_ddp, (ht_ddp, ct_ddp) = lstm_ddp(packed_input_ddp) + 
self.assertEqual(packed_output.data, packed_output_ddp.data) + self.assertEqual(ht, ht_ddp) + self.assertEqual(ct, ct_ddp) + packed_output.data.sum().backward() + packed_output_ddp.data.sum().backward() + for p1, p2 in zip(lstm.parameters(), lstm_ddp.parameters()): + self.assertEqual(p1.grad, p2.grad) + + # error: input dense tensor has to be contiguous + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_channels_last_contig(self): + process_group = self._get_process_group() + device = torch.device(f"xpu:{self.rank}") + tensor = torch.ones((2, 16, 768, 1152), dtype=torch.float32, device=device).to( + memory_format=torch.channels_last + ) + process_group.broadcast([tensor]).wait() + + +class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): + @property + def device(self): + return f"xpu:{self.rank}" + + def setUp(self): + super().setUp() + self._spawn_processes() + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + def _test_broadcast_coalesced(self, process_group, device, root_rank): + half = torch.float16 + + # No support for float16 for CPU tensors + if device == torch.device("cpu"): + half = torch.float32 + + target = torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) + target += torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float64, device=device).chunk(5) + target += torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) + + # The tensors to pass to broadcast are identical to the target + # only on the process that is the root of the broadcast. + if self.rank == root_rank: + tensors = [tensor.clone() for tensor in target] + else: + tensors = [torch.zeros_like(tensor) for tensor in target] + + if self.rank != root_rank: + self.assertNotEqual(tensors, target) + + c10d._broadcast_coalesced( + process_group, tensors, buffer_size=256, src=root_rank + ) + + if self.rank != root_rank: + self.assertEqual(tensors, target) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_broadcast_coalesced_xccl(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="xccl", store=store, rank=self.rank, world_size=self.world_size + ) + process_group = c10d.distributed_c10d._get_default_group() + device = torch.device("xpu:%d" % self.rank) + ranks = [0, 1] + for root_rank in ranks: + self._test_broadcast_coalesced(process_group, device, root_rank) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_all_reduce_coalesced_xccl(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="xccl", store=store, rank=self.rank, world_size=self.world_size + ) + process_group = c10d.distributed_c10d._get_default_group() + device = torch.device("xpu:%d" % self.rank) + tensors = [ + torch.full((60 + i,), self.rank + 1 + i, device=device, dtype=torch.float) + for i in range(5) + ] + torch.distributed.all_reduce_coalesced(tensors, group=process_group) + for i, t in enumerate(tensors): + self.assertEqual( + t, + torch.full_like( + t, self.world_size * (i + (self.world_size + 1.0) / 2.0) + ), + ) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_all_reduce_coalesced_manager_xccl(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="xccl", store=store, rank=self.rank, world_size=self.world_size + ) + process_group = 
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_all_reduce_coalesced_manager_xccl(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        c10d.init_process_group(
+            backend="xccl", store=store, rank=self.rank, world_size=self.world_size
+        )
+        process_group = c10d.distributed_c10d._get_default_group()
+        device = torch.device("xpu:%d" % self.rank)
+        tensors = [
+            torch.full((60 + i,), self.rank + 1 + i, device=device, dtype=torch.float)
+            for i in range(5)
+        ]
+        with torch.distributed._coalescing_manager(
+            group=process_group, device=device, async_ops=True
+        ) as cm:
+            for tensor in tensors:
+                torch.distributed.all_reduce(tensor)
+        self.assertEqual(len(cm.works), 1)
+        cm.wait()
+        for i, t in enumerate(tensors):
+            self.assertEqual(
+                t,
+                torch.full_like(
+                    t, self.world_size * (i + (self.world_size + 1.0) / 2.0)
+                ),
+            )
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_xccl_barrier(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        c10d.init_process_group(
+            backend="xccl", rank=self.rank, world_size=self.world_size, store=store
+        )
+
+        t = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+        c10d.all_reduce(t)
+        expected_tensor = torch.tensor([3] * 10).xpu(2 * self.rank)
+        self.assertEqual(expected_tensor, t)
+
+        # Test with new_group
+        pg = c10d.new_group([0, 1])
+        t = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+        pg.allreduce(t).wait()
+        self.assertEqual(expected_tensor, t)
+
+        pg = c10d.new_group([0])
+        if self.rank == 0:
+            t = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+            expected_tensor = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+            pg.allreduce(t).wait()
+            self.assertEqual(expected_tensor, t)
+
+        pg = c10d.new_group([1])
+        if self.rank == 1:
+            t = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+            expected_tensor = torch.tensor([self.rank + 1] * 10).xpu(2 * self.rank)
+            pg.allreduce(t).wait()
+            self.assertEqual(expected_tensor, t)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_xccl_barrier_device_ids(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        c10d.init_process_group(
+            backend="xccl", rank=self.rank, world_size=self.world_size, store=store
+        )
+
+        c10d.barrier(device_ids=[self.rank])
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_xccl_barrier_device_ids_function_argument(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        c10d.init_process_group(
+            backend="xccl", rank=self.rank, world_size=self.world_size, store=store
+        )
+
+        with self.assertRaisesRegex(TypeError, "Invalid function argument"):
+            c10d.barrier(device_ids=self.rank)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_reduce_scatter_base_k(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "xccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        output_tensor = torch.zeros(2, dtype=torch.int64).to(self.rank)
+        input_tensors = torch.arange(self.world_size * 2, dtype=torch.int64).to(
+            self.rank
+        )
+        input_tensors = torch.reshape(input_tensors, (self.world_size, 2))
+        dist.reduce_scatter_tensor(output_tensor, input_tensors)
+        self.assertEqual(output_tensor, input_tensors[self.rank] * self.world_size)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_reduce_scatter_tensor_coalesced(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "xccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        output_tensors = torch.zeros(2, 2).to(self.rank)
+        input_tensors = [torch.ones(2, 2).to(self.rank) for _ in range(self.world_size)]
+        with dist._coalescing_manager():
+            for i in range(self.world_size):
+                dist.reduce_scatter_tensor(output_tensors[i], input_tensors[i])
+        self.assertEqual(output_tensors, input_tensors[self.rank] * self.world_size)
+
+
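+# The object-collective tests below are parametrized over two ways of selecting the
+# target XPU device: an explicit torch.xpu.set_device call, or the device= argument
+# of the collective itself.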
+class SetDeviceMethod(Enum):
+    TORCH_XPU_SET = auto()  # torch.xpu.set_device
+    COLLECTIVE_ARGUMENT = auto()  # broadcast_object_list(device=)
+
+
+class XCCLProcessGroupWithDispatchedCollectivesTests(
+    test_c10d_common.ProcessGroupWithDispatchedCollectivesTests
+):
+    @requires_xccl()
+    @skip_if_lt_x_gpu(1)
+    def test_collectives(self):
+        self._test_collectives(backend="xccl")
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(1)
+    def test_allreduce_coalesced(self):
+        self._test_allreduce_coalesced(backend="xccl")
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(1)
+    def test_all_to_all_single(self):
+        self._test_all_to_all_single(backend="xccl")
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(1)
+    def test_allgather_base(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "xccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        device = "xpu"
+        tensor = torch.ones(10, 10, device=torch.device(device))
+        output_tensor = torch.zeros(10, 10, device=torch.device(device))
+        dist.all_gather_into_tensor(output_tensor, tensor)
+        self.assertEqual(output_tensor, tensor)
+
+
+class LargeCommTest(test_c10d_common.AbstractLargeCommTest, MultiProcessTestCase):
+    def setUp(self):
+        super().setUp()
+        self._spawn_processes()
+
+    def tearDown(self):
+        super().tearDown()
+        try:
+            os.remove(self.file_name)
+        except OSError:
+            pass
+
+    @property
+    def device(self):
+        return self.rank
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_new_group_local_sync(self):
+        self._test_new_group_local_sync(backend="xccl")
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_new_group_local_sync_sanity_check(self):
+        self._test_new_group_local_sync_sanity_check(backend="xccl")
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_new_group_local_sync_duplicated_pg(self):
+        self._test_new_group_local_sync_duplicate_pg(backend="xccl")
+
+    def _init_two_pg2_subgroups(self, world_size: int = 4):
+        if world_size != 4:
+            raise NotImplementedError(
+                f"need world size of 4 to get 2 subgroup PGs, but got world size of {world_size}"
+            )
+        store = c10d.FileStore(self.file_name, world_size)
+        c10d.init_process_group(
+            backend="xccl", store=store, rank=self.rank, world_size=world_size
+        )
+        # every rank creates the same sub groups,
+        # including sub groups that the current rank is not a member of
+        a_group = c10d.new_group([0, 1])
+        b_group = c10d.new_group([2, 3])
+        return a_group if self.rank < 2 else b_group
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_gather_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            # easier to write the test for exactly 4 GPUs, even if this test
+            # class is later expanded to 8 GPUs
+            return
+
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        device = torch.device("xpu:%d" % self.rank)
+        input = torch.ones((10,), device=device) * self.rank
+        if self.rank == 0 or self.rank == 2:
+            gather_list = [torch.empty_like(input) for _ in range(subgroup.size())]
+            torch.distributed.gather(
+                input,
+                gather_list=gather_list,
+                dst=self.rank,
+                group=subgroup,
+                async_op=False,
+            )
+            for src in range(len(gather_list)):
+                expected = (torch.ones_like(input) * self.rank) + src
+                self.assertEqual(gather_list[src], expected)
+        else:
+            torch.distributed.gather(
+                input,
+                gather_list=None,
+                dst=self.rank - 1,
+                group=subgroup,
+                async_op=False,
+            )
+
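+    # Object (pickle-based) variant of the gather test above; see the discrepancy
+    # notes below for the current gather_object requirements.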
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_gather_object_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            # easier to write the test for exactly 4 GPUs, even if this test
+            # class is later expanded to 8 GPUs
+            return
+
+        subgroup = self._init_two_pg2_subgroups(world_size)
+
+        # discrepancy #1
+        # the device has to be set explicitly, otherwise gather_object picks the
+        # wrong device from `current_device = _get_pg_default_device(group)`
+        torch.xpu.set_device(self.rank)
+
+        input = {"rank": self.rank}
+        if self.rank == 0 or self.rank == 2:
+            # discrepancy #2
+            # gather_object requires the output list to be pre-populated with
+            # placeholder objects; passing an empty list raises an error
+            gather_list = [{}, {}]
+            torch.distributed.gather_object(
+                input, object_gather_list=gather_list, dst=self.rank, group=subgroup
+            )
+            for src in range(len(gather_list)):
+                self.assertEqual(gather_list[src]["rank"], self.rank + src)
+        else:
+            torch.distributed.gather_object(
+                input, object_gather_list=None, dst=self.rank - 1, group=subgroup
+            )
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_reduce_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        device = torch.device("xpu:%d" % self.rank)
+        x = torch.ones((10,), device=device) * self.rank
+        if self.rank == 0 or self.rank == 2:
+            expected = x + torch.ones((10,), device=device) * (self.rank + 1)
+            c10d.reduce(x, dst=self.rank, group=subgroup, async_op=False)
+            self.assertEqual(x, expected)
+        else:
+            c10d.reduce(x, dst=self.rank - 1, group=subgroup, async_op=False)
+
+    # error: RuntimeError: Point-to-point communication as the first call is not supported now
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    @parametrize("async_op", [True, False])
+    def test_send_recv_subgroup(self, async_op):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        device = torch.device("xpu:%d" % self.rank)
+        if self.rank == 0 or self.rank == 2:
+            x = torch.empty((10,), device=device)
+            if async_op:
+                c10d.irecv(x, src=self.rank + 1, group=subgroup).wait()
+            else:
+                c10d.recv(x, src=self.rank + 1, group=subgroup)
+            expected = torch.ones((10,), device=device) * (self.rank + 1)
+            self.assertEqual(x, expected)
+        else:
+            x = torch.ones((10,), device=device) * self.rank
+            if async_op:
+                c10d.isend(x, dst=self.rank - 1, group=subgroup).wait()
+            else:
+                c10d.send(x, dst=self.rank - 1, group=subgroup)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_broadcast_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        device = torch.device("xpu:%d" % self.rank)
+        if self.rank == 0 or self.rank == 2:
+            x = torch.empty((10,), device=device)
+            c10d.broadcast(x, src=self.rank + 1, group=subgroup)
+            expected = torch.ones((10,), device=device) * (self.rank + 1)
+            self.assertEqual(x, expected)
+        else:
+            x = torch.ones((10,), device=device) * self.rank
+            c10d.broadcast(x, src=self.rank, group=subgroup)
+
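+    # send/recv of picklable object lists over the 2-rank subgroups, parametrized
+    # over both device-selection methods in SetDeviceMethod.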
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    @parametrize(
+        "set_device",
+        [SetDeviceMethod.TORCH_XPU_SET, SetDeviceMethod.COLLECTIVE_ARGUMENT],
+    )
+    def test_send_recv_object_list_subgroup(self, set_device: SetDeviceMethod):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        if set_device == SetDeviceMethod.TORCH_XPU_SET:
+            torch.xpu.set_device(self.rank)
+            device = None
+        else:
+            device = torch.device("xpu:%d" % self.rank)
+        if self.rank == 0 or self.rank == 2:
+            x = [{}]
+            c10d.recv_object_list(x, src=self.rank + 1, group=subgroup, device=device)
+            expected = [{"rank": self.rank + 1}]
+            self.assertEqual(x, expected)
+        else:
+            x = [{"rank": self.rank}]
+            c10d.send_object_list(x, dst=self.rank - 1, group=subgroup, device=device)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    @parametrize(
+        "set_device",
+        [SetDeviceMethod.TORCH_XPU_SET, SetDeviceMethod.COLLECTIVE_ARGUMENT],
+    )
+    def test_broadcast_object_list_subgroup(self, set_device: SetDeviceMethod):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        if set_device == SetDeviceMethod.TORCH_XPU_SET:
+            torch.xpu.set_device(self.rank)
+            device = None
+        else:
+            device = torch.device("xpu:%d" % self.rank)
+        if self.rank == 0 or self.rank == 2:
+            x = [{}]
+            c10d.broadcast_object_list(
+                x, src=self.rank + 1, group=subgroup, device=device
+            )
+            expected = [{"rank": self.rank + 1}]
+            self.assertEqual(x, expected)
+        else:
+            x = [{"rank": self.rank}]
+            c10d.broadcast_object_list(x, src=self.rank, group=subgroup, device=device)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_scatter_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        device = torch.device("xpu:%d" % self.rank)
+        x = torch.empty((10,), device=device)
+        expected = torch.ones((10,), device=device) * self.rank
+        if self.rank == 0 or self.rank == 2:
+            c10d.scatter(x, scatter_list=None, src=self.rank + 1, group=subgroup)
+        else:
+            scatter_list = [
+                torch.ones((10,), device=device) * (self.rank - 1),
+                torch.ones((10,), device=device) * self.rank,
+            ]
+            c10d.scatter(x, scatter_list=scatter_list, src=self.rank, group=subgroup)
+        self.assertEqual(x, expected)
+
+    @requires_xccl()
+    @skip_if_lt_x_gpu(4)
+    def test_scatter_object_list_subgroup(self):
+        world_size = 4
+        if self.rank >= world_size:
+            return
+        subgroup = self._init_two_pg2_subgroups(world_size)
+        torch.xpu.set_device(self.rank)
+        scatter_object_output_list = [None]
+        expected = [{"rank": self.rank}]
+        if self.rank == 0 or self.rank == 2:
+            c10d.scatter_object_list(
+                scatter_object_output_list=scatter_object_output_list,
+                scatter_object_input_list=None,
+                src=self.rank + 1,
+                group=subgroup,
+            )
+
+        else:
+            scatter_object_input_list = [
+                {"rank": self.rank - 1},
+                {"rank": self.rank},
+            ]
+            c10d.scatter_object_list(
+                scatter_object_output_list=scatter_object_output_list,
+                scatter_object_input_list=scatter_object_input_list,
+                src=self.rank,
+                group=subgroup,
+            )
+        self.assertEqual(scatter_object_output_list, expected)
+
+
+instantiate_parametrized_tests(LargeCommTest)
+
+if __name__ == "__main__":
+    run_tests()