diff --git a/src/all_gather.cu b/src/all_gather.cu
index 1eaafdd..97c9765 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -19,10 +19,12 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index 9c65f25..116759c 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -19,10 +19,12 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/alltoall.cu b/src/alltoall.cu
index 0eae1b0..04d70bc 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -19,11 +19,13 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
 
   for (int i=0; i<args->nGpus; i++) {
     char* str = getenv("NCCL_TESTS_DEVICE");
     int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 40dcb5d..cae7744 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -18,10 +18,12 @@ void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par
 testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/common.cu b/src/common.cu
index 8fe9258..fcf25be 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -55,6 +55,7 @@ thread_local int is_main_thread = 0;
 // Command line parameter defaults
 static int nThreads = 1;
 static int nGpus = 1;
+static int nGpusVisible;
 static size_t minBytes = 32*1024*1024;
 static size_t maxBytes = 32*1024*1024;
 static size_t stepBytes = 1*1024*1024;
@@ -223,7 +224,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     int device;
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
-    CUDACHECK(cudaSetDevice(device));
+    CUDACHECK(cudaSetDevice(device % nGpusVisible));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
     TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
 
@@ -587,7 +588,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
   int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
-  CUDACHECK(cudaSetDevice(gpuid));
+  CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
   return testSuccess;
 }
@@ -604,7 +605,7 @@ testResult_t threadInit(struct threadArgs* args) {
   for (int i=0; i<args->nGpus; i++) {
     int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
   }
   NCCLCHECK(ncclGroupEnd());
@@ -685,6 +686,8 @@ int main(int argc, char* argv[]) {
     {}
   };
 
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
+
   while(1) {
     int c;
     c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex);
@@ -698,6 +701,10 @@ int main(int argc, char* argv[]) {
       break;
     case 'g':
       nGpus = strtol(optarg, NULL, 0);
+      if (nGpus > nGpusVisible) {
+        fprintf(stderr, "invalid number of GPUs specified (%d), only %d GPUs exist\n", nGpus, nGpusVisible);
+        return -1;
+      }
       break;
     case 'b':
       parsed = parsesize(optarg);
@@ -843,7 +850,7 @@ testResult_t run() {
     int cudaDev = localRank*nThreads*nGpus+i;
     int rank = proc*nThreads*nGpus+i;
     cudaDeviceProp prop;
-    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
+    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev % nGpusVisible));
     len += snprintf(line+len, MAX_LINE-len, "#  Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
                     rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
     maxMem = std::min(maxMem, prop.totalGlobalMem);
@@ -887,7 +894,7 @@ testResult_t run() {
   ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads);
 
   for (int i=0; i<nGpus*nThreads; i++) {
-    CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
+    CUDACHECK(cudaSetDevice((localRank*nThreads*nGpus+i) % nGpusVisible));
     TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
     CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
   }
diff --git a/src/gather.cu b/src/gather.cu
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -19,10 +19,12 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
diff --git a/src/hypercube.cu b/src/hypercube.cu
index ae9fbd0..208208d 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -22,10 +22,12 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
diff --git a/src/reduce.cu b/src/reduce.cu
index c2707c7..8f56dbd 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -19,10 +19,12 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index e4a59dc..cbb5799 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -19,10 +19,12 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/scatter.cu b/src/scatter.cu
index d244b2b..f8dd0c8 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -18,10 +18,12 @@ void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *param
 testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index e73a92b..c28d40a 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -19,10 +19,12 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
 
   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];