Skip to content

Commit 8250092

Browse files
UnitTest: add test cases for 2.14 API (ncclCommInitRankConfig and ncclCommFinalize for non-blocking communicator) (#662)
1 parent adafc0f commit 8250092

6 files changed

+146
-9
lines changed

test/AllReduce_NonBlockingConf.cpp

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*************************************************************************
2+
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
#include "TestBed.hpp"
7+
namespace RcclUnitTesting
8+
{
9+
TEST(AllReduce, NonBlocking)
10+
{
11+
TestBed testBed;
12+
// Configuration
13+
ncclFunc_t const funcType = ncclCollAllReduce;
14+
std::vector<ncclDataType_t> const& dataTypes = {ncclFloat};
15+
std::vector<ncclRedOp_t> const& redOps = {ncclSum};
16+
std::vector<int> const numElements = {1048576, 1024};
17+
bool const inPlace = false;
18+
bool const useManagedMem = false;
19+
bool const useBlocking = false;
20+
21+
OptionalColArgs options;
22+
// Terminate the test as soon as first failure occurs
23+
bool isCorrect = true;
24+
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
25+
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
26+
{
27+
if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue;
28+
29+
// Test either single process all GPUs, or 1 process per GPU
30+
int const numProcesses = isMultiProcess ? totalRanks : 1;
31+
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1, useBlocking);
32+
33+
for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx)
34+
{
35+
options.redOp = redOps[redOpIdx];
36+
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
37+
{
38+
if (testBed.ev.showNames)
39+
INFO("%s %d-ranks AllReduce %s Blocking Config (%s-%s)\n",
40+
isMultiProcess ? "MP" : "SP",
41+
totalRanks, useBlocking ? "true" : "false",
42+
ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
43+
44+
45+
for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
46+
{
47+
testBed.SetCollectiveArgs(funcType,
48+
dataTypes[dataIdx],
49+
numElements[numIdx],
50+
numElements[numIdx],
51+
options);
52+
}
53+
testBed.AllocateMem(inPlace, useManagedMem);
54+
testBed.PrepareData();
55+
testBed.ExecuteCollectives();
56+
testBed.ValidateResults(isCorrect);
57+
testBed.DeallocateMem();
58+
}
59+
}
60+
testBed.DestroyComms();
61+
}
62+
testBed.Finalize();
63+
}
64+
}

test/CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ if(BUILD_TESTS)
4646
set(TEST_SOURCE_FILES
4747
AllReduce_Clique.cpp
4848
AllReduce_GroupCall.cpp
49+
AllReduce_NonBlockingConf.cpp
4950
AllReduce_InPlace.cpp
5051
AllReduce_ManagedMem.cpp
5152
AllReduce_OutOfPlace.cpp
@@ -57,6 +58,7 @@ if(BUILD_TESTS)
5758
#AllReduce
5859
AllReduce_Clique.cpp
5960
AllReduce_GroupCall.cpp
61+
AllReduce_NonBlockingConf.cpp
6062
AllReduce_InPlace.cpp
6163
AllReduce_ManagedMem.cpp
6264
AllReduce_OutOfPlace.cpp

test/common/TestBed.cpp

+7-3
Original file line numberDiff line numberDiff line change
@@ -85,12 +85,13 @@ namespace RcclUnitTesting
8585
}
8686

8787
void TestBed::InitComms(std::vector<std::vector<int>> const& deviceIdsPerProcess,
88-
int const numCollectivesInGroup)
88+
int const numCollectivesInGroup, bool const useBlocking)
8989
{
9090
// Count up the total number of GPUs to use and track child/deviceId per rank
9191
this->numActiveChildren = deviceIdsPerProcess.size();
9292
this->numActiveRanks = 0;
9393
this->numCollectivesInGroup = numCollectivesInGroup;
94+
this->useBlocking = useBlocking;
9495
this->rankToChildMap.clear();
9596
this->rankToDeviceMap.clear();
9697
if (ev.verbose) INFO("Setting up %d active child processes\n", this->numActiveChildren);
@@ -139,6 +140,9 @@ namespace RcclUnitTesting
139140
// Send the number of collectives to be run per group call
140141
PIPE_WRITE(childId, numCollectivesInGroup);
141142

143+
// Send whether the RCCL communicators should be created as blocking or non-blocking
144+
PIPE_WRITE(childId, useBlocking);
145+
142146
// Send whether to use MultiRank interfaces or not.
143147
PIPE_WRITE(childId, useMulti);
144148

@@ -159,9 +163,9 @@ namespace RcclUnitTesting
159163
}
160164
}
161165

162-
void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup)
166+
void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup, bool const useBlocking)
163167
{
164-
InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup);
168+
InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup, useBlocking);
165169
}
166170

167171
void TestBed::SetCollectiveArgs(ncclFunc_t const funcType,

test/common/TestBed.hpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -25,19 +25,19 @@ namespace RcclUnitTesting
2525
int numActiveChildren; // List of active children (with usable RCCL comms)
2626
int numActiveRanks; // Current # of ranks in use
2727
int numCollectivesInGroup; // # of collectives to execute per group call
28-
28+
bool useBlocking; // RCCL communication with blocking or non-blocking option
2929
EnvVars ev; // Environment variables
3030

3131
// Constructor - Creates one child process per detected GPU device that waits for further commands
3232
TestBed();
3333

3434
// Prepare TestBed for use with GPUs across multiple child processes
3535
void InitComms(std::vector<std::vector<int>> const& deviceIdsPerChild,
36-
int const numCollectivesInGroup = 1);
37-
36+
int const numCollectivesInGroup = 1, bool const useBlocking = true);
37+
3838
// Prepare TestBed for use with GPUs on a single child process
3939
void InitComms(int const numGpus,
40-
int const numCollectivesInGroup = 1);
40+
int const numCollectivesInGroup = 1, bool const useBlocking = true);
4141

4242
// Set collectives arguments for specified collective / rank
4343
// Setting scalarsPerRank to non-null will create custom reduction operator

test/common/TestBedChild.cpp

+67-2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,26 @@
2020
} \
2121
}
2222

23+
// Waits for the pending non-blocking operation on every communicator in
// this->comms to leave the ncclInProgress state.
//
// For each communicator, ncclCommGetAsyncError is polled until it reports a
// state other than ncclInProgress, or until the poll counter reaches
// MAX_LOOP_COUNTER. If the final state is not ncclSuccess (including a timeout
// that is still ncclInProgress, or a failure of the query itself), an error
// naming `msg` is logged and the enclosing function returns TEST_FAIL.
//
// Locals carry a trailing underscore so they cannot shadow variables (such as a
// loop index `i`) at the expansion site.
#define CHILD_NCCL_CALL_NON_BLOCKING(msg)                                                         \
  {                                                                                               \
    for (size_t commIdx_ = 0; commIdx_ < this->comms.size(); ++commIdx_)                          \
    {                                                                                             \
      /* Start from "in progress" so the state is never read uninitialized */                     \
      ncclResult_t asyncState_ = ncclInProgress;                                                  \
      for (int pollCount_ = 1;                                                                    \
           pollCount_ < MAX_LOOP_COUNTER && asyncState_ == ncclInProgress;                        \
           ++pollCount_)                                                                          \
      {                                                                                           \
        ncclResult_t const queryResult_ =                                                         \
          ncclCommGetAsyncError(this->comms[commIdx_], &asyncState_);                             \
        if (queryResult_ != ncclSuccess)                                                          \
        {                                                                                         \
          /* The query itself failed: report that code instead of a stale state */                \
          asyncState_ = queryResult_;                                                             \
          break;                                                                                  \
        }                                                                                         \
      }                                                                                           \
      if (asyncState_ != ncclSuccess)                                                             \
      {                                                                                           \
        ERROR("Child process %d fails NCCL call %s with code %d\n", this->childId, msg, asyncState_); \
        return TEST_FAIL;                                                                         \
      }                                                                                           \
    }                                                                                             \
  }
42+
2343
#define PIPE_READ(val) \
2444
if (read(childReadFd, &val, sizeof(val)) != sizeof(val)) return TEST_FAIL;
2545

@@ -126,6 +146,7 @@ namespace RcclUnitTesting
126146
PIPE_READ(this->totalRanks);
127147
PIPE_READ(this->rankOffset);
128148
PIPE_READ(this->numCollectivesInGroup);
149+
PIPE_READ(this->useBlocking);
129150
bool useMultiRankPerGpu;
130151
PIPE_READ(useMultiRankPerGpu);
131152

@@ -177,6 +198,18 @@ namespace RcclUnitTesting
177198
break;
178199
}
179200
}
201+
else if (this->useBlocking == false)
202+
{
203+
// When non-blocking communicator is desired call ncclCommInitRankConfig with appropriate flag
204+
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
205+
config.blocking = 0;
206+
if (ncclCommInitRankConfig(&this->comms[localRank], this->totalRanks, id, globalRank, &config) != ncclSuccess)
207+
{
208+
ERROR("Rank %d on child %d unable to call ncclCommInitRankConfig\n", globalRank, this->childId);
209+
status = TEST_FAIL;
210+
break;
211+
}
212+
}
180213
else
181214
{
182215
if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess)
@@ -187,10 +220,26 @@ namespace RcclUnitTesting
187220
}
188221
}
189222
}
190-
if (status == TEST_SUCCESS)
223+
if (this->useBlocking == false)
191224
{
192-
CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupStart");
225+
CHILD_NCCL_CALL_NON_BLOCKING("ncclCommGetAsyncErrorInitRankConfig");
193226
}
227+
if (status == TEST_SUCCESS)
228+
{
229+
// Check if the communicator is non-blocking
230+
if (this->useBlocking == false)
231+
{
232+
// handle the ncclGroupEnd in case of non-blocking communication
233+
ncclResult_t Group_End_state = ncclGroupEnd();
234+
if (Group_End_state != ncclSuccess) CHILD_NCCL_CALL_NON_BLOCKING("ncclCommGetAsyncErrorGroup");
235+
}
236+
else
237+
{
238+
// In case of blocking communication just call ncclGroupEnd
239+
CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupEnd");
240+
}
241+
}
242+
194243
if (this->verbose) INFO("Child %d finishes InitComms() [%s]\n",
195244
this->childId, status == TEST_SUCCESS ? "SUCCESS" : "FAIL");
196245
return status;
@@ -680,6 +729,22 @@ namespace RcclUnitTesting
680729
if (this->verbose) INFO("Child %d begins DestroyComms\n", this->childId);
681730

682731
// Release comms
732+
for (int i = 0; i < this->comms.size(); ++i)
733+
{
734+
// Check if the communicator is non-blocking
735+
if (this->useBlocking == false)
736+
{
737+
// handle the non-blocking case
738+
ncclCommFinalize(this->comms[i]);
739+
CHILD_NCCL_CALL_NON_BLOCKING("ncclCommGetAsyncErrorCommFinalize");
740+
}
741+
else
742+
{
743+
// In case of blocking just call Finalize
744+
CHILD_NCCL_CALL(ncclCommFinalize(this->comms[i]), "ncclCommFinalize");
745+
}
746+
}
747+
683748
for (int i = 0; i < this->comms.size(); ++i)
684749
{
685750
CHILD_NCCL_CALL(ncclCommDestroy(this->comms[i]), "ncclCommDestroy");

test/common/TestBedChild.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "rccl/rccl.h"
1313

1414
#define MAX_RANKS 32
15+
// Upper bound on the polling counter used by CHILD_NCCL_CALL_NON_BLOCKING while a communicator still reports ncclInProgress
#define MAX_LOOP_COUNTER 1000000000
1516
namespace RcclUnitTesting
1617
{
1718
class TestBedChild
@@ -63,6 +64,7 @@ namespace RcclUnitTesting
6364
int totalRanks; // Total ranks
6465
int rankOffset; // Global rank offset for this child
6566
int numCollectivesInGroup; // # of collectives to run per group call
67+
bool useBlocking; // RCCL communication with blocking or non-blocking option
6668
std::vector<ncclComm_t> comms; // RCCL communicators for each rank
6769
std::vector<int> deviceIds; // Device IDs for each rank
6870
std::vector<hipStream_t> streams; // Streams for executing collectives

0 commit comments

Comments
 (0)