Skip to content

Commit adafc0f

Browse files
authored
Add MSCCL Support (#658)
* Add MSCCL support
* Add alignment and message size checking
* Fix nRanks checking, in-place and out-of-place tests, and group call handling
* Fix hipGraph unit test
* Change MSCCL init warning to INFO
* Revise license info
1 parent b953544 commit adafc0f

38 files changed

+40923
-11
lines changed

CMakeLists.txt

+17-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
2+
# Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
23

34
cmake_minimum_required(VERSION 3.5)
45
INCLUDE(CheckIncludeFiles)
@@ -138,7 +139,8 @@ if (BUILD_ALLREDUCE_ONLY)
138139
set(CU_SOURCES
139140
src/collectives/device/all_reduce.cu
140141
src/collectives/device/sendrecv.cu
141-
src/collectives/device/functions.cu)
142+
src/collectives/device/functions.cu
143+
src/collectives/device/msccl_kernel.cu)
142144
else()
143145
set(CU_SOURCES
144146
src/collectives/device/all_reduce.cu
@@ -149,7 +151,8 @@ else()
149151
src/collectives/device/reduce_scatter.cu
150152
src/collectives/device/sendrecv.cu
151153
src/collectives/device/onerank_reduce.cu
152-
src/collectives/device/functions.cu)
154+
src/collectives/device/functions.cu
155+
src/collectives/device/msccl_kernel.cu)
153156
endif()
154157

155158
set(CPP_SOURCES)
@@ -223,6 +226,12 @@ set(HEADER_SOURCES
223226
src/include/nvtx3/nvToolsExtCudaRt.h
224227
src/include/nvtx3/nvToolsExtCuda.h
225228
src/include/nvtx3/nvToolsExtOpenCL.h
229+
src/include/msccl/msccl_kernel.h
230+
src/include/msccl/msccl_lifecycle.h
231+
src/include/msccl/msccl_parser.h
232+
src/include/msccl/msccl_setup.h
233+
src/include/msccl/msccl_status.h
234+
src/include/msccl/msccl_struct.h
226235
src/graph/rings.h
227236
src/graph/rome_models.h
228237
src/graph/topo.h
@@ -242,6 +251,7 @@ set(API_SOURCES
242251
src/collectives/scatter.cc
243252
src/collectives/gather.cc
244253
src/collectives/sendrecv.cc
254+
src/collectives/msccl.cc
245255
src/net.cc)
246256
foreach(filename ${API_SOURCES})
247257
string(REPLACE ".cc"
@@ -278,6 +288,10 @@ set(CC_SOURCES
278288
src/misc/param.cc
279289
src/misc/rocmwrap.cc
280290
src/misc/strongstream.cc
291+
src/misc/msccl/msccl_lifecycle.cc
292+
src/misc/msccl/msccl_parser.cc
293+
src/misc/msccl/msccl_setup.cc
294+
src/misc/msccl/msccl_status.cc
281295
src/transport/coll_net.cc
282296
src/transport/net.cc
283297
src/transport/net_ib.cc
@@ -314,6 +328,7 @@ set(HIPIFY_SOURCES
314328
src/collectives/reduce_scatter_api.cpp
315329
src/collectives/scatter_api.cpp
316330
src/collectives/sendrecv_api.cpp
331+
src/collectives/msccl_api.cpp
317332
src/debug.cpp
318333
src/enqueue.cpp
319334
src/graph/xml.cpp

src/collectives/all_gather.cc

+10
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,27 @@
11
/*************************************************************************
22
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
3+
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
34
*
45
* See LICENSE.txt for license information
56
************************************************************************/
67

78
#include "enqueue.h"
89
#include "collectives.h"
910

11+
#include "msccl/msccl_lifecycle.h"
12+
1013
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
1114
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
1215
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
1316
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
1417
NVTX3_FUNC_RANGE_IN(nccl_domain);
18+
19+
if (mscclAvailable() && !mscclIsCaller()) {
20+
return mscclEnqueueCheck(
21+
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
22+
sendcount, datatype, 0, 0, ncclSum, mscclFuncAllGather, comm, stream);
23+
}
24+
1525
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
1626
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
1727
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };

src/collectives/all_reduce.cc

+10
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,26 @@
11
/*************************************************************************
22
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
3+
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
34
*
45
* See LICENSE.txt for license information
56
************************************************************************/
67

78
#include "enqueue.h"
89

10+
#include "msccl/msccl_lifecycle.h"
11+
912
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
1013
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
1114
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
1215
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
1316
NVTX3_FUNC_RANGE_IN(nccl_domain);
17+
18+
if (mscclAvailable() && !mscclIsCaller()) {
19+
return mscclEnqueueCheck(
20+
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
21+
count, datatype, 0, 0, op, mscclFuncAllReduce, comm, stream);
22+
}
23+
1424
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
1525
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
1626
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };

src/collectives/all_to_all.cc

+9
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/*************************************************************************
22
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
4+
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
45
*
56
* See LICENSE.txt for license information
67
************************************************************************/
@@ -9,10 +10,18 @@
910
#include "collectives.h"
1011
#include "graph/topo.h"
1112

13+
#include "msccl/msccl_lifecycle.h"
14+
1215
NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
1316
ncclComm_t comm, hipStream_t stream);
1417
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
1518
ncclComm_t comm, hipStream_t stream) {
19+
if (mscclAvailable() && !mscclIsCaller()) {
20+
return mscclEnqueueCheck(
21+
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
22+
count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
23+
}
24+
1625
size_t rankOffset = count * ncclTypeSize(datatype);
1726
size_t rankAlign = rankOffset & ((~rankOffset) + 1);
1827
// Determine Pivot A2A support now that we know number of channels

src/collectives/all_to_allv.cc

+9
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,28 @@
11
/*************************************************************************
22
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
4+
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
45
*
56
* See LICENSE.txt for license information
67
************************************************************************/
78

89
#include "enqueue.h"
910
#include "collectives.h"
1011

12+
#include "msccl/msccl_lifecycle.h"
13+
1114
NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
1215
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
1316
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
1417
ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
1518
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
1619
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
20+
if (mscclAvailable() && !mscclIsCaller()) {
21+
return mscclEnqueueCheck(
22+
sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
23+
0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
24+
}
25+
1726
int nRanks;
1827
NCCLCHECK(ncclCommCount(comm, &nRanks));
1928
NCCLCHECK(ncclGroupStart());

src/collectives/broadcast.cc

+9
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,26 @@
11
/*************************************************************************
22
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
3+
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
34
*
45
* See LICENSE.txt for license information
56
************************************************************************/
67

78
#include "enqueue.h"
89
#include "collectives.h"
910

11+
#include "msccl/msccl_lifecycle.h"
12+
1013
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
1114
ncclComm_t comm, cudaStream_t stream);
1215
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
1316
ncclComm_t comm, cudaStream_t stream) {
1417
NVTX3_FUNC_RANGE_IN(nccl_domain);
18+
if (mscclAvailable() && !mscclIsCaller()) {
19+
return mscclEnqueueCheck(
20+
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
21+
count, datatype, root, 0, ncclSum, mscclFuncBroadcast, comm, stream);
22+
}
23+
1524
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
1625
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
1726
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };

0 commit comments

Comments
 (0)