Commit

add nccl topology
shiyu1994 committed Nov 9, 2023
1 parent 985780f commit 35b0ca1
Showing 20 changed files with 604 additions and 410 deletions.
24 changes: 16 additions & 8 deletions docs/Parameters.rst
@@ -1280,31 +1280,39 @@ GPU Parameters

- ``gpu_platform_id`` :raw-html:`<a id="gpu_platform_id" title="Permalink to this parameter" href="#gpu_platform_id">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int

- OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform
- OpenCL platform ID with device_type=gpu. Usually each GPU vendor exposes one OpenCL platform

- ``-1`` means the system-wide default platform

- **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details

- ``gpu_device_id`` :raw-html:`<a id="gpu_device_id" title="Permalink to this parameter" href="#gpu_device_id">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int

- OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID
- Master CUDA device ID when device_type=cuda, or OpenCL device ID in the specified platform when device_type=gpu

- Each GPU in the selected platform has a unique device ID

- ``-1`` means the default device in the selected platform

- **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details

- ``gpu_use_dp`` :raw-html:`<a id="gpu_use_dp" title="Permalink to this parameter" href="#gpu_use_dp">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
- ``num_gpus`` :raw-html:`<a id="num_gpus" title="Permalink to this parameter" href="#num_gpus">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int

- set this to ``true`` to use double precision math on GPU (by default single precision is used)
- Number of GPUs to use for training, used with device_type=cuda

- **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported
- When <= 0, only 1 GPU will be used

- ``gpu_device_id_list`` :raw-html:`<a id="gpu_device_id_list" title="Permalink to this parameter" href="#gpu_device_id_list">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string

- ``num_gpu`` :raw-html:`<a id="num_gpu" title="Permalink to this parameter" href="#num_gpu">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, constraints: ``num_gpu > 0``
- List of CUDA device IDs used when device_type=cuda

- number of GPUs
- When empty, the devices with the smallest IDs will be used

- **Note**: can be used only in CUDA implementation
- ``gpu_use_dp`` :raw-html:`<a id="gpu_use_dp" title="Permalink to this parameter" href="#gpu_use_dp">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

- set this to ``true`` to use double precision math on GPU (by default single precision is used)

- **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported

.. end params list
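
For orientation, here is a minimal, self-contained sketch of how the new multi-GPU parameters relate to each other. The trimmed `Config` struct and the `main` function are illustrative stand-ins (field names and defaults are taken from the config.h hunk further down; nothing else here is LightGBM API):

```cpp
#include <string>

// Trimmed stand-in for LightGBM::Config, multi-GPU fields only.
struct Config {
  int gpu_platform_id = -1;             // OpenCL only (device_type=gpu)
  int gpu_device_id = -1;               // master CUDA device when device_type=cuda
  int num_gpus = 1;                     // CUDA only; values <= 0 fall back to 1 GPU
  std::string gpu_device_id_list = "";  // e.g. "0,2,3"; empty = smallest device IDs
  bool gpu_use_dp = false;              // OpenCL only; CUDA currently always uses double precision
};

int main() {
  Config cfg;
  cfg.gpu_device_id = 0;           // device 0 acts as the local master GPU
  cfg.num_gpus = 2;                // train on two GPUs via NCCL
  cfg.gpu_device_id_list = "0,1";  // optional explicit list; if gpu_device_id is absent,
                                   // the first listed device becomes the master
  return 0;
}
```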
4 changes: 3 additions & 1 deletion include/LightGBM/boosting.h
@@ -309,9 +309,11 @@ class LIGHTGBM_EXPORT Boosting {
* \param format Format of model
* \param config config for boosting
* \param filename name of model file, if existing will continue to train from this model
* \param device_type type of device, can be cpu, gpu or cuda
* \param num_gpu number of GPUs to use
* \return The boosting object
*/
static Boosting* CreateBoosting(const std::string& type, const char* filename);
static Boosting* CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpu);

virtual std::string GetLoadedParam() const = 0;

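A hypothetical call site under the extended factory signature, for illustration only (the argument values are invented and error handling is omitted):

```cpp
#include <LightGBM/boosting.h>

#include <memory>

int main() {
  // Create a fresh GBDT model for the CUDA backend on 2 GPUs; passing nullptr
  // for the filename means no existing model file is loaded.
  std::unique_ptr<LightGBM::Boosting> booster(
      LightGBM::Boosting::CreateBoosting("gbdt", nullptr, "cuda", 2));
  return 0;
}
```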
18 changes: 11 additions & 7 deletions include/LightGBM/config.h
@@ -1091,25 +1091,29 @@ struct Config {
#pragma region GPU Parameters
#endif // __NVCC__

// desc = OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform
// desc = OpenCL platform ID with device_type=gpu. Usually each GPU vendor exposes one OpenCL platform
// desc = ``-1`` means the system-wide default platform
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
int gpu_platform_id = -1;

// desc = OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID
// desc = Master CUDA device ID when device_type=cuda, or OpenCL device ID in the specified platform when device_type=gpu
// desc = Each GPU in the selected platform has a unique device ID
// desc = ``-1`` means the default device in the selected platform
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
int gpu_device_id = -1;

// desc = Number of GPUs to use for training, used with device_type=cuda
// desc = When <= 0, only 1 GPU will be used
int num_gpus = 1;

// desc = List of CUDA device IDs used when device_type=cuda
// desc = When empty, the devices with the smallest IDs will be used
std::string gpu_device_id_list = "";

// desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
// desc = **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported
bool gpu_use_dp = false;

// check = >0
// desc = number of GPUs
// desc = **Note**: can be used only in CUDA implementation
int num_gpu = 1;

#ifndef __NVCC__
#pragma endregion

225 changes: 225 additions & 0 deletions include/LightGBM/cuda/cuda_nccl_topology.hpp
@@ -0,0 +1,225 @@
/*!
* Copyright (c) 2023 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_CUDA_CUDA_NCCL_TOPOLOGY_HPP_
#define LIGHTGBM_CUDA_CUDA_NCCL_TOPOLOGY_HPP_

#include <nccl.h>

#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/network.h>
#include <LightGBM/utils/common.h>

#include <functional>
#include <set>
#include <string>
#include <thread>
#include <vector>

namespace LightGBM {

class NCCLTopology {
public:
NCCLTopology(const int master_gpu_device_id, const int num_gpus, const std::string& gpu_device_id_list, const data_size_t global_num_data) {
num_gpus_ = num_gpus;
master_gpu_device_id_ = master_gpu_device_id;
global_num_data_ = global_num_data;
int max_num_gpu = 0;
CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&max_num_gpu));
if (gpu_device_id_list != std::string("")) {
std::set<int> gpu_id_set;
std::vector<std::string> gpu_list_str = Common::Split(gpu_device_id_list.c_str(), ",");
for (const auto& gpu_str : gpu_list_str) {
int gpu_id = 0;
Common::Atoi<int>(gpu_str.c_str(), &gpu_id);
if (gpu_id < 0 || gpu_id >= max_num_gpu) {
Log::Warning("Invalid GPU device ID %d in gpu_device_id_list is ignored.", gpu_id);
} else {
gpu_id_set.insert(gpu_id);
}
}
for (const int gpu_id : gpu_id_set) {
gpu_list_.push_back(gpu_id);
}
}
if (!gpu_list_.empty() && num_gpus_ != static_cast<int>(gpu_list_.size())) {
Log::Warning("num_gpus = %d is different from the number of valid device IDs in gpu_device_id_list (%d), using %d GPUs instead.",
             num_gpus_, static_cast<int>(gpu_list_.size()), static_cast<int>(gpu_list_.size()));
num_gpus_ = static_cast<int>(gpu_list_.size());
}

if (!gpu_list_.empty()) {
bool check_master_gpu = false;
for (int i = 0; i < static_cast<int>(gpu_list_.size()); ++i) {
const int gpu_id = gpu_list_[i];
if (gpu_id == master_gpu_device_id_) {
check_master_gpu = true;
master_gpu_index_ = i;
break;
}
}
if (!check_master_gpu) {
Log::Warning("Master GPU device ID %d is not in gpu_device_id_list. Using %d as the master GPU instead.", master_gpu_device_id_, gpu_list_[0]);
master_gpu_device_id_ = gpu_list_[0];
master_gpu_index_ = 0;
}
} else {
if (num_gpus_ <= 0) {
num_gpus_ = 1;
} else if (num_gpus_ > max_num_gpu) {
Log::Warning("Only %d GPUs available, using num_gpus = %d.", max_num_gpu, max_num_gpu);
num_gpus_ = max_num_gpu;
}
if (master_gpu_device_id_ < 0 || master_gpu_device_id_ >= num_gpus_) {
Log::Warning("Invalid gpu_device_id = %d for master GPU index, using gpu_device_id = 0 instead.", master_gpu_device_id_);
master_gpu_device_id_ = 0;
master_gpu_index_ = 0;
}
for (int i = 0; i < num_gpus_; ++i) {
gpu_list_.push_back(i);
}
}

Log::Info("Using GPU devices %s, and local master GPU device %d.", Common::Join<int>(gpu_list_, ",").c_str(), master_gpu_device_id_);

const int num_threads = OMP_NUM_THREADS();
if (num_gpus_ > num_threads) {
Log::Fatal("Number of GPUs %d is greater than the number of threads %d. Please use more threads.", num_gpus_, num_threads);
}

host_threads_.resize(num_gpus_);
}

void InitNCCL() {
nccl_gpu_rank_.resize(num_gpus_, -1);
nccl_communicators_.resize(num_gpus_);
ncclUniqueId nccl_unique_id;
if (Network::num_machines() == 1 || Network::rank() == 0) {
NCCLCHECK(ncclGetUniqueId(&nccl_unique_id));
}
if (Network::num_machines() > 1) {
std::vector<ncclUniqueId> output_buffer(Network::num_machines());
Network::Allgather(
reinterpret_cast<char*>(&nccl_unique_id),
sizeof(ncclUniqueId) / sizeof(char),
reinterpret_cast<char*>(output_buffer.data()));
if (Network::rank() > 0) {
nccl_unique_id = output_buffer[0];
}
}

if (Network::num_machines() > 1) {
node_rank_offset_.resize(Network::num_machines() + 1, 0);
Network::Allgather(
reinterpret_cast<char*>(&num_gpus_),
sizeof(int) / sizeof(char),
reinterpret_cast<char*>(node_rank_offset_.data() + 1));
for (int rank = 1; rank < Network::num_machines() + 1; ++rank) {
node_rank_offset_[rank] += node_rank_offset_[rank - 1];
}
CHECK_EQ(node_rank_offset_[Network::rank() + 1] - node_rank_offset_[Network::rank()], num_gpus_);
NCCLCHECK(ncclGroupStart());
for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) {
SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__);
nccl_gpu_rank_[gpu_index] = gpu_index + node_rank_offset_[Network::rank()];
NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], node_rank_offset_.back(), nccl_unique_id, nccl_gpu_rank_[gpu_index]));
}
NCCLCHECK(ncclGroupEnd());
} else {
NCCLCHECK(ncclGroupStart());
for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) {
SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__);
nccl_gpu_rank_[gpu_index] = gpu_index;
NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpus_, nccl_unique_id, gpu_index));
}
NCCLCHECK(ncclGroupEnd());
}

// return to master gpu device
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
}
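
To make the rank bookkeeping in `InitNCCL` concrete, the standalone sketch below reproduces the exclusive prefix sum over per-node GPU counts and prints the resulting global NCCL ranks. The node and GPU counts are invented, and no NCCL or network calls are involved:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical cluster: 3 machines contributing 2, 4, and 2 GPUs.
  std::vector<int> gpus_per_node = {2, 4, 2};

  // node_rank_offset_ in InitNCCL(): exclusive prefix sum, with a trailing
  // entry equal to the world size passed to ncclCommInitRank.
  std::vector<int> node_rank_offset(gpus_per_node.size() + 1, 0);
  for (size_t r = 0; r < gpus_per_node.size(); ++r) {
    node_rank_offset[r + 1] = node_rank_offset[r] + gpus_per_node[r];
  }

  // Global NCCL rank of local GPU index g on node r.
  for (size_t r = 0; r < gpus_per_node.size(); ++r) {
    for (int g = 0; g < gpus_per_node[r]; ++g) {
      std::printf("node %zu, local GPU %d -> global rank %d of %d\n",
                  r, g, node_rank_offset[r] + g, node_rank_offset.back());
    }
  }
  return 0;
}
```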

template <typename ARG_T, typename RET_T>
void RunPerDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<RET_T(ARG_T*)>& func) {
#pragma omp parallel for schedule(static) num_threads(num_gpus_)
for (int i = 0; i < num_gpus_; ++i) {
CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
func(objs[i].get());
}
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
}

template <typename RET_T>
void InitPerDevice(std::vector<std::unique_ptr<RET_T>>& vec) {
vec.resize(num_gpus_);
#pragma omp parallel for schedule(static) num_threads(num_gpus_)
for (int i = 0; i < num_gpus_; ++i) {
CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
RET_T* nccl_info = new RET_T();
nccl_info->SetNCCLInfo(nccl_communicators_[i], nccl_gpu_rank_[i], i, gpu_list_[i], global_num_data_);
vec[i].reset(nccl_info);
}
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
}

template <typename ARG_T>
void DispatchPerDevice(std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<void(ARG_T*)>& func) {
for (int i = 0; i < num_gpus_; ++i) {
host_threads_[i] = std::thread([this, i, &func, &objs] () {
CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
func(objs[i].get());
});
}
for (int i = 0; i < num_gpus_; ++i) {
host_threads_[i].join();
}
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
}

template <typename ARG_T, typename RET_T>
void RunOnMasterDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<RET_T(ARG_T*)>& func) {
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
func(objs[master_gpu_index_].get());
}

template <typename ARG_T, typename RET_T>
void RunOnNonMasterDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<RET_T(ARG_T*)>& func) {
for (int i = 0; i < num_gpus_; ++i) {
if (i != master_gpu_index_) {
CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
func(objs[i].get());
}
}
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
}

int num_gpus() const { return num_gpus_; }

int master_gpu_index() const { return master_gpu_index_; }

int master_gpu_device_id() const { return master_gpu_device_id_; }

const std::vector<int>& gpu_list() const { return gpu_list_; }

private:
int num_gpus_;
int master_gpu_index_;
int master_gpu_device_id_;
std::vector<int> gpu_list_;
data_size_t global_num_data_;

ncclUniqueId nccl_unique_id_;
std::vector<int> node_rank_offset_;
std::vector<int> nccl_gpu_rank_;
std::vector<ncclComm_t> nccl_communicators_;
std::vector<std::thread> host_threads_;
};

}  // namespace LightGBM

#endif  // LIGHTGBM_CUDA_CUDA_NCCL_TOPOLOGY_HPP_
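
Putting the pieces together, a caller might drive `NCCLTopology` roughly as follows. The `PerDeviceWorker` type, its `BoostOneIter` method, and all parameter values are illustrative placeholders; the only assumption about the real API is what the header above defines (in particular, anything passed to `InitPerDevice` needs a `SetNCCLInfo` method, which it can get by deriving from `NCCLInfo`):

```cpp
#include <LightGBM/cuda/cuda_nccl_topology.hpp>

#include <memory>
#include <vector>

using LightGBM::data_size_t;

// Illustrative per-device worker; InitPerDevice calls SetNCCLInfo on each
// instance, so deriving from NCCLInfo provides the required hook.
class PerDeviceWorker : public LightGBM::NCCLInfo {
 public:
  void BoostOneIter() {
    // Per-device training step; would use nccl_communicator_ for all-reduces.
  }
};

int main() {
  const data_size_t num_data = 100000;  // made-up dataset size
  // gpu_device_id = 0, num_gpus = 2, explicit device list "0,1".
  LightGBM::NCCLTopology topology(0, 2, "0,1", num_data);
  topology.InitNCCL();

  std::vector<std::unique_ptr<PerDeviceWorker>> workers;
  topology.InitPerDevice<PerDeviceWorker>(workers);

  // One host thread per GPU runs the functor with the right device set.
  topology.DispatchPerDevice<PerDeviceWorker>(
      workers, [](PerDeviceWorker* worker) { worker->BoostOneIter(); });
  return 0;
}
```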
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_objective_function.hpp
@@ -19,7 +19,7 @@
namespace LightGBM {

template <typename HOST_OBJECTIVE>
class CUDAObjectiveInterface: public HOST_OBJECTIVE {
class CUDAObjectiveInterface: public HOST_OBJECTIVE, NCCLInfo {
public:
explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) {
const int gpu_device_id = config.gpu_device_id >= 0 ? config.gpu_device_id : 0;
37 changes: 37 additions & 0 deletions include/LightGBM/cuda/cuda_utils.hu
@@ -11,8 +11,10 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <nccl.h>

#include <LightGBM/utils/log.h>
#include <LightGBM/meta.h>

#include <algorithm>
#include <vector>
@@ -32,6 +34,15 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort =

#define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); }

#define NCCLCHECK(cmd) do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
printf("Failed, NCCL error %s:%d '%s'\n", \
__FILE__, __LINE__, ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while (0)
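
As a sanity check of the macro in isolation, the sketch below wraps a few NCCL calls in `NCCLCHECK` using a single-process, single-device communicator. It is independent of the LightGBM classes in this commit and assumes a machine with at least one CUDA device and NCCL installed:

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

#include <cstdio>
#include <cstdlib>

#define NCCLCHECK(cmd) do { \
  ncclResult_t r = cmd; \
  if (r != ncclSuccess) { \
    printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(r)); \
    exit(EXIT_FAILURE); \
  } \
} while (0)

int main() {
  // Single-device communicator: world size 1, device 0.
  int device = 0;
  ncclComm_t comm;
  NCCLCHECK(ncclCommInitAll(&comm, 1, &device));

  // All-reduce a small buffer in place (a no-op with one rank, but it
  // exercises the macro and the call sequence).
  const int n = 4;
  float host[n] = {1.f, 2.f, 3.f, 4.f};
  float* dev_buf = nullptr;
  cudaMalloc(&dev_buf, n * sizeof(float));
  cudaMemcpy(dev_buf, host, n * sizeof(float), cudaMemcpyHostToDevice);

  NCCLCHECK(ncclAllReduce(dev_buf, dev_buf, n, ncclFloat, ncclSum, comm, /*stream=*/nullptr));
  cudaDeviceSynchronize();

  cudaMemcpy(host, dev_buf, n * sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("sum over 1 rank: %f %f %f %f\n", host[0], host[1], host[2], host[3]);

  cudaFree(dev_buf);
  NCCLCHECK(ncclCommDestroy(comm));
  return 0;
}
```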

void SetCUDADevice(int gpu_device_id, const char* file, int line);

int GetCUDADevice(const char* file, int line);
@@ -205,6 +216,32 @@ static __device__ T SafeLog(T x) {
}
}

class NCCLInfo {
public:
NCCLInfo() {}

void SetNCCLInfo(
ncclComm_t nccl_communicator,
int nccl_gpu_rank,
int local_gpu_rank,
int gpu_device_id,
data_size_t global_num_data) {
nccl_communicator_ = nccl_communicator;
nccl_gpu_rank_ = nccl_gpu_rank;
local_gpu_rank_ = local_gpu_rank;
gpu_device_id_ = gpu_device_id;
global_num_data_ = global_num_data;
}

protected:
ncclComm_t nccl_communicator_ = nullptr;
int nccl_gpu_rank_ = -1;
int local_gpu_rank_ = -1;
int gpu_device_id_ = -1;
int num_gpu_in_node_ = 0;
data_size_t global_num_data_ = 0;
};

} // namespace LightGBM

#endif // USE_CUDA
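
To illustrate what `NCCLInfo` is for, here is a hypothetical per-device component that sums a scalar across all ranks through the communicator it received via `SetNCCLInfo`. The class and the reduction it performs are invented for this sketch, and a CUDA-enabled build (USE_CUDA defined) is assumed; in the commit itself the analogous consumer is `CUDAObjectiveInterface`, which mixes in `NCCLInfo` the same way:

```cpp
#include <LightGBM/cuda/cuda_utils.hu>

#include <cuda_runtime.h>
#include <nccl.h>

namespace LightGBM {

// Hypothetical per-device statistic kept consistent across GPUs. The real
// consumers in this commit (e.g. CUDAObjectiveInterface) mix in NCCLInfo and
// use nccl_communicator_ for their own reductions in the same way.
class GlobalSumCounter : public NCCLInfo {
 public:
  // `value` must point to a single double in device memory on this object's GPU.
  void AllReduceInPlace(double* value, cudaStream_t stream) {
    CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id_));
    NCCLCHECK(ncclAllReduce(value, value, 1, ncclDouble, ncclSum,
                            nccl_communicator_, stream));
  }
};

}  // namespace LightGBM
```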
