Skip to content

Commit

Permalink
Merge remote-tracking branch 'nccl/master' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
BertanDogancay authored and corey-derochie-amd committed Jan 23, 2025
2 parents b6377e0 + 178b6b7 commit 36343be
Show file tree
Hide file tree
Showing 122 changed files with 8,931 additions and 4,624 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@

Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)

## Unreleased
## Unreleased - RCCL 2.22.3 for ROCm 6.4.0

### Added

* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters

### Changed

* Compatibility with NCCL 2.22.3

## RCCL 2.21.5 for ROCm 6.3.1

### Added
Expand Down
34 changes: 23 additions & 11 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -425,12 +425,13 @@ set(SRC_FILES
src/graph/tuning.cc
src/graph/xml.cc
src/graph/xml.h
src/include/align.h
src/include/alloc.h
src/include/alt_rsmi.h
src/include/archinfo.h
src/include/api_trace.h
src/include/argcheck.h
src/include/BfdBacktrace.hpp
src/include/bitops.h
src/include/bootstrap.h
src/include/channel.h
src/include/checks.h
Expand Down Expand Up @@ -489,30 +490,40 @@ set(SRC_FILES
src/include/npkit/npkit.h
src/include/npkit/npkit_event.h
src/include/npkit/npkit_struct.h
src/include/nvtx3/nvToolsExt.h
src/include/nvtx3/nvToolsExtCounters.h
src/include/nvtx3/nvToolsExtCuda.h
src/include/nvtx3/nvToolsExtCudaRt.h
src/include/nvtx3/nvToolsExt.h
src/include/nvtx3/nvToolsExtMem.h
src/include/nvtx3/nvToolsExtMemCudaRt.h
src/include/nvtx3/nvToolsExtOpenCL.h
src/include/nvtx3/nvToolsExtPayload.h
src/include/nvtx3/nvToolsExtPayloadHelper.h
src/include/nvtx3/nvToolsExtSemanticsCounters.h
src/include/nvtx3/nvToolsExtSemanticsScope.h
src/include/nvtx3/nvToolsExtSync.h
src/include/nvtx3/nvtx3.hpp
src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h
src/include/nvtx3/nvtxDetail/nvtxExtImpl.h
src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtInit.h
src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h
src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h
src/include/nvtx3/nvtxDetail/nvtxExtTypes.h
src/include/nvtx3/nvtxDetail/nvtxImpl.h
src/include/nvtx3/nvtxDetail/nvtxImplCore.h
src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h
src/include/nvtx3/nvtxDetail/nvtxImpl.h
src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h
src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h
src/include/nvtx3/nvtxDetail/nvtxInit.h
src/include/nvtx3/nvtxDetail/nvtxInitDecls.h
src/include/nvtx3/nvtxDetail/nvtxInitDefs.h
src/include/nvtx3/nvtxDetail/nvtxInit.h
src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h
src/include/nvtx3/nvtxDetail/nvtxTypes.h
src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h
src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h
src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h
src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h
src/include/alt_rsmi.h
src/misc/alt_rsmi.cc
src/misc/archinfo.cc
src/misc/argcheck.cc
Expand Down Expand Up @@ -542,6 +553,7 @@ set(SRC_FILES
src/misc/msccl/msccl_setup.cc
src/misc/msccl/msccl_status.cc
src/transport/coll_net.cc
src/transport/generic.cc
src/transport/net.cc
src/transport/net_ib.cc
src/transport/net_socket.cc
Expand Down
4 changes: 2 additions & 2 deletions NOTICES.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Notices and Licenses file
_______________________________________________________________

Dependencies on nvidia-nccl v2.21.5-1 (BSD3)
Dependencies on nvidia-nccl v2.22.3-1 (BSD3)

Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.

Redistribution and use in source and binary forms, with or without
Expand Down
4 changes: 2 additions & 2 deletions cmake/scripts/add_unroll.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
sed -i "s/\\(runRing<T[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/runTreeUpDown<T, RedOp, ProtoSimple<1, 1, COLL_UNROLL>>/runTreeUpDown<T, RedOp, ProtoSimple<1, 1, COLL_UNROLL>, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/\\(runTreeSplit<T[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/\\(struct RunWorkElement<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/\\(struct RunWork<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/\\(struct RunWorkColl<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"

echo "Added COLL_UNROLL template argument to $HIP_FILE"
fi
11 changes: 11 additions & 0 deletions docs/how-to/using-nccl.rst
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,17 @@ Initialization
supports ``dmabuf``, it should set ``ptrSupport`` to ``NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF`` and
provide a ``regMrDmaBuf`` function.
* The ``regIsGlobal`` field allows NCCL to register buffers in advance, for example, using a loopback connection.
It also lets NCCL expect that a later registration of a buffer contained within a previously registered
buffer will complete nearly immediately, because the buffer is already known by the network adapter. A typical
implementation maintains a registration cache, with the call to ``ncclCommRegister`` creating the
initial entry in the cache using ``regMr()`` on a loopback connection. Any later NCCL operation
can then call ``regMr()`` again on the real connection, with the real buffer (which could, for example,
be at a different offset within the original buffer or have a smaller size), and call
``deregMr()`` immediately afterwards.
The ``ncclCommDeregister`` call should issue the final call to ``deregMr()`` and effectively remove the mapping
on the network adapter.
* The ``speed`` field indicates the speed of the network port in Mbps (10^6 bits per second).
This ensures proper optimization of flows within the node.
Expand Down
10 changes: 10 additions & 0 deletions ext-net/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,16 @@ set to `NCCL_PTR_HOST|NCCL_PTR_CUDA`, otherwise it should be set to `NCCL_PTR_HO
supports `dmabuf`, it should set `ptrSupport` to `NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF` and
provide a `regMrDmaBuf` function.

The `regIsGlobal` field allows NCCL to register buffers in advance using e.g. a loopback connection
and later on, expect that another registration on a buffer contained within a previous registration
will be nearly immediate, as the buffer is already known by the network adapter. A typical
implementation would maintain a registration cache; the call to `ncclCommRegister` will create the
initial entry in the cache using `regMr()` on a loopback connection. Any later call to NCCL
operations will call `regMr()` again on the real connection, with the real buffer (could be at a
different offset within the original buffer, with a smaller size, etc.), then `deregMr()` right after.
The call to `ncclCommDeregister` should call the final `deregMr()` and effectively remove the mapping
on the network adapter.

The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is
important to ensure proper optimization of flows within the node.

Expand Down
15 changes: 15 additions & 0 deletions ext-net/example/nccl/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

#ifndef COMMON_H_
#define COMMON_H_

/* Log levels accepted as the first argument of the debug logger callback. */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_VERSION = 1,
  NCCL_LOG_WARN    = 2,
  NCCL_LOG_INFO    = 3,
  NCCL_LOG_ABORT   = 4,
  NCCL_LOG_TRACE   = 5
} ncclDebugLogLevel;

/*
 * Subsystem tags. Every named subsystem is a single distinct bit
 * (1 through 8192); NCCL_ALL is ~0, i.e. every bit set.
 */
typedef enum {
  NCCL_INIT      = 1,
  NCCL_COLL      = 2,
  NCCL_P2P       = 4,
  NCCL_SHM       = 8,
  NCCL_NET       = 16,
  NCCL_GRAPH     = 32,
  NCCL_TUNING    = 64,
  NCCL_ENV       = 128,
  NCCL_ALLOC     = 256,
  NCCL_CALL      = 512,
  NCCL_PROXY     = 1024,
  NCCL_NVLS      = 2048,
  NCCL_BOOTSTRAP = 4096,
  NCCL_REG       = 8192,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;

/*
 * Pointer to a printf-style variadic logging function.
 *   level - severity of the message
 *   flags - presumably a mask of ncclDebugLogSubSys bits; confirm against the
 *           logger implementation
 *   file  - source file name of the call site
 *   line  - source line of the call site
 *   fmt   - format string followed by its variadic arguments
 */
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level,
                                  unsigned long flags,
                                  const char *file,
                                  int line,
                                  const char *fmt, ...);

#endif
1 change: 1 addition & 0 deletions ext-net/example/nccl/err.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ typedef enum { ncclSuccess = 0,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6 } ncclResult_t;

#endif
6 changes: 1 addition & 5 deletions ext-net/example/nccl/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <stdint.h>
#include <stdlib.h>

#include "common.h"
#include "err.h"

#define NCCL_NET_HANDLE_MAXSIZE 128
Expand All @@ -19,11 +20,6 @@
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 32

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#include "net_v8.h"
#include "net_v7.h"
#include "net_v6.h"
Expand Down
4 changes: 2 additions & 2 deletions ext-net/example/nccl/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_

/* Data types */
typedef enum { ncclInt8 = 0, ncclChar = 0,
Expand Down
15 changes: 15 additions & 0 deletions ext-tuner/example/nccl/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

#ifndef COMMON_H_
#define COMMON_H_

/* Log levels accepted as the first argument of the debug logger callback. */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_VERSION = 1,
  NCCL_LOG_WARN    = 2,
  NCCL_LOG_INFO    = 3,
  NCCL_LOG_ABORT   = 4,
  NCCL_LOG_TRACE   = 5
} ncclDebugLogLevel;

/*
 * Subsystem tags. Every named subsystem is a single distinct bit
 * (1 through 8192); NCCL_ALL is ~0, i.e. every bit set.
 */
typedef enum {
  NCCL_INIT      = 1,
  NCCL_COLL      = 2,
  NCCL_P2P       = 4,
  NCCL_SHM       = 8,
  NCCL_NET       = 16,
  NCCL_GRAPH     = 32,
  NCCL_TUNING    = 64,
  NCCL_ENV       = 128,
  NCCL_ALLOC     = 256,
  NCCL_CALL      = 512,
  NCCL_PROXY     = 1024,
  NCCL_NVLS      = 2048,
  NCCL_BOOTSTRAP = 4096,
  NCCL_REG       = 8192,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;

/*
 * Pointer to a printf-style variadic logging function.
 *   level - severity of the message
 *   flags - presumably a mask of ncclDebugLogSubSys bits; confirm against the
 *           logger implementation
 *   file  - source file name of the call site
 *   line  - source line of the call site
 *   fmt   - format string followed by its variadic arguments
 */
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level,
                                  unsigned long flags,
                                  const char *file,
                                  int line,
                                  const char *fmt, ...);

#endif
17 changes: 17 additions & 0 deletions ext-tuner/example/nccl/err.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_

/* Result/error codes for plugins; every enumerator has an explicitly assigned value. */
typedef enum {
  ncclSuccess            = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError        = 2,
  ncclInternalError      = 3,
  ncclInvalidArgument    = 4,
  ncclInvalidUsage       = 5,
  ncclRemoteError        = 6
} ncclResult_t;

#endif
43 changes: 28 additions & 15 deletions ext-tuner/example/nccl/tuner.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,24 @@
#ifndef NCCL_TUNER_H_
#define NCCL_TUNER_H_

#include "nccl.h"
#include <stdint.h>
#include <stdlib.h>

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#include "common.h"
#include "err.h"

#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
/*
 * Collective / point-to-point function identifiers.
 * Values are pinned explicitly; ncclNumFuncs (8) equals the number of
 * identifiers listed before it.
 */
typedef enum {
  ncclFuncBroadcast     = 0,
  ncclFuncReduce        = 1,
  ncclFuncAllGather     = 2,
  ncclFuncReduceScatter = 3,
  ncclFuncAllReduce     = 4,
  ncclFuncSendRecv      = 5,
  ncclFuncSend          = 6,
  ncclFuncRecv          = 7,
  ncclNumFuncs          = 8
} ncclFunc_t;

#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_UNDEF -1
Expand All @@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2

#define NCCL_ALGO_PROTO_IGNORE -1.0

// API to be implemented by external tuner
typedef struct {
// Name of the tuner
Expand All @@ -52,31 +63,33 @@ typedef struct {
// - context: tuner context object
// - collType: collective type, e.g., allreduce, allgather…
// - nBytes: collective size in bytes
// - collNetSupport: whether collnet supports this type
// - nvlsSupport: whether nvlink sharp supports this time
// - numPipeOps: number of operations in the group
// - numAlgo: number of algorithms in collCostTable
// - numProto: number of protocols in collCostTable
//
// Outputs:
// - algorithm: selected algorithm to be used for the given collective
// - protocol: selected protocol to be used for the given collective
// - nChannels: number of channels (hence SMs) to be used.
//
// InOut:
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
//
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
// default tuning for the given collective.
// Also, the plugin is allowed to not set any output, or set only the
// algorithm and protocol, but not only the algorithm or only the protocol.
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels);
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
int* nChannels);

// Terminates the plugin and cleans up any resources that the plugin allocated.
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v2_t;
} ncclTuner_v3_t;

typedef ncclTuner_v2_t ncclTuner_t;
typedef ncclTuner_v3_t ncclTuner_t;

#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"

#endif
4 changes: 2 additions & 2 deletions ext-tuner/example/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ ncclResult_t ncclTopoGetAlgoTime_Tuner(ncclFunc_t collType, int algorithm, int p
return ncclSuccess;
}

__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) {
if (nRanks <= 1) return ncclSuccess;
int compCapIndex = HOPPER_COMPCAP_IDX;
int index2 = nNodes <= 2 ? nNodes-1 : 2;
Expand Down Expand Up @@ -226,7 +226,7 @@ __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }

#define PLUGIN_NAME "Example"

const ncclTuner_v2_t ncclTunerPlugin_v2 = {
const ncclTuner_v3_t ncclTunerPlugin_v3 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.getCollInfo = pluginGetCollInfo,
Expand Down
4 changes: 2 additions & 2 deletions makefiles/version.mk
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 21
NCCL_PATCH := 5
NCCL_MINOR := 22
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
5 changes: 3 additions & 2 deletions src/bootstrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,6 @@ ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFrom

ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
memset(handle, 0, sizeof(ncclBootstrapHandle));
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));

const char* env = ncclGetEnv("NCCL_COMM_ID");
if (env) {
Expand All @@ -211,7 +210,9 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
handle->magic = NCCL_MAGIC;
} else {
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
NCCLCHECK(bootstrapCreateRoot(handle, false));
}
Expand Down Expand Up @@ -631,7 +632,7 @@ ncclResult_t bootstrapClose(void* commState) {
struct bootstrapState* state = (struct bootstrapState*)commState;
if (state->unexpectedConnections != NULL) {
unexpectedFree(state);
if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) {
WARN("Unexpected connections are not empty");
return ncclInternalError;
}
Expand Down
Loading

0 comments on commit 36343be

Please sign in to comment.