From 35b0ca147b011b9453130c55b42aad197bc0bc88 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 9 Nov 2023 09:25:22 +0000 Subject: [PATCH] add nccl topology --- docs/Parameters.rst | 24 +- include/LightGBM/boosting.h | 4 +- include/LightGBM/config.h | 18 +- include/LightGBM/cuda/cuda_nccl_topology.hpp | 225 ++++++++++ .../LightGBM/cuda/cuda_objective_function.hpp | 2 +- include/LightGBM/cuda/cuda_utils.hu | 37 ++ include/LightGBM/dataset.h | 4 + include/LightGBM/objective_function.h | 8 + src/application/application.cpp | 6 +- src/boosting/boosting.cpp | 16 +- src/boosting/cuda/nccl_gbdt.cpp | 383 ++++-------------- src/boosting/cuda/nccl_gbdt.hpp | 39 +- src/boosting/cuda/nccl_gbdt_component.hpp | 95 +++++ src/boosting/gbdt.h | 2 +- src/c_api.cpp | 4 +- src/io/config_auto.cpp | 19 +- src/io/dataset.cpp | 35 +- src/objective/objective_function.cpp | 89 ++-- .../cuda/cuda_single_gpu_tree_learner.hpp | 2 +- src/treelearner/tree_learner.cpp | 2 +- 20 files changed, 604 insertions(+), 410 deletions(-) create mode 100644 include/LightGBM/cuda/cuda_nccl_topology.hpp create mode 100644 src/boosting/cuda/nccl_gbdt_component.hpp diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 86104ba5be55..4d6e92e03988 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1280,7 +1280,7 @@ GPU Parameters - ``gpu_platform_id`` :raw-html:`🔗︎`, default = ``-1``, type = int - - OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform + - OpenCL platform ID with device_type=gpu. Usually each GPU vendor exposes one OpenCL platform - ``-1`` means the system-wide default platform @@ -1288,23 +1288,31 @@ GPU Parameters - ``gpu_device_id`` :raw-html:`🔗︎`, default = ``-1``, type = int - - OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID + - Master CUDA device ID with device_type=cuda or OpenCL device ID in the specified platform with device_type=gpu. + + - Each GPU in the selected platform has a unique device ID - ``-1`` means the default device in the selected platform - **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details -- ``gpu_use_dp`` :raw-html:`🔗︎`, default = ``false``, type = bool +- ``num_gpus`` :raw-html:`🔗︎`, default = ``1``, type = int - - set this to ``true`` to use double precision math on GPU (by default single precision is used) + - Number of GPUs to use for training, used with device_type=cuda - - **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported + - When <= 0, only 1 GPU will be used + +- ``gpu_device_id_list`` :raw-html:`🔗︎`, default = ``""``, type = string -- ``num_gpu`` :raw-html:`🔗︎`, default = ``1``, type = int, constraints: ``num_gpu > 0`` + - List of CUDA device IDs used when device_type=cuda - - number of GPUs + - When empty, the devices with the smallest IDs will be used - - **Note**: can be used only in CUDA implementation +- ``gpu_use_dp`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - set this to ``true`` to use double precision math on GPU (by default single precision is used) + + - **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported .. 
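For clarity, a minimal CUDA multi-GPU configuration using the new parameters might look like the following (illustrative only; the exact device-selection and master-device fallback behavior is defined by the ``NCCLTopology`` constructor added in this patch)::

    device_type = cuda
    num_gpus = 4
    gpu_device_id_list = 0,1,2,3
    gpu_device_id = 0
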
end params list diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index 1bfc18b4470b..8e694aca29da 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -309,9 +309,11 @@ class LIGHTGBM_EXPORT Boosting { * \param format Format of model * \param config config for boosting * \param filename name of model file, if existing will continue to train from this model + * \param device_type type of device, can be cpu, gpu or cuda + * \param num_gpu number of GPUs to use * \return The boosting object */ - static Boosting* CreateBoosting(const std::string& type, const char* filename); + static Boosting* CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpu); virtual std::string GetLoadedParam() const = 0; diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 6d61bc764924..09a6836f56c7 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -1091,25 +1091,29 @@ struct Config { #pragma region GPU Parameters #endif // __NVCC__ - // desc = OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform + // desc = OpenCL platform ID with device_type=gpu. Usually each GPU vendor exposes one OpenCL platform // desc = ``-1`` means the system-wide default platform // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details int gpu_platform_id = -1; - // desc = OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID + // desc = Master CUDA device ID with device_type=cuda or OpenCL device ID in the specified platform with device_type=gpu. + // desc = Each GPU in the selected platform has a unique device ID // desc = ``-1`` means the default device in the selected platform // desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details int gpu_device_id = -1; + // desc = Number of GPUs to use for training, used with device_type=cuda + // desc = When <= 0, only 1 GPU will be used + int num_gpus = 1; + + // desc = List of CUDA device IDs used when device_type=cuda + // desc = When empty, the devices with the smallest IDs will be used + std::string gpu_device_id_list = ""; + // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) // desc = **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported bool gpu_use_dp = false; - // check = >0 - // desc = number of GPUs - // desc = **Note**: can be used only in CUDA implementation - int num_gpu = 1; - #ifndef __NVCC__ #pragma endregion diff --git a/include/LightGBM/cuda/cuda_nccl_topology.hpp b/include/LightGBM/cuda/cuda_nccl_topology.hpp new file mode 100644 index 000000000000..98f485ba97d8 --- /dev/null +++ b/include/LightGBM/cuda/cuda_nccl_topology.hpp @@ -0,0 +1,225 @@ +/*! + * Copyright (c) 2023 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace LightGBM {
+
+class NCCLTopology {
+ public:
+  NCCLTopology(const int master_gpu_device_id, const int num_gpus, const std::string& gpu_device_id_list, const data_size_t global_num_data) {
+    num_gpus_ = num_gpus;
+    master_gpu_device_id_ = master_gpu_device_id;
+    global_num_data_ = global_num_data;
+    int max_num_gpu = 0;
+    CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&max_num_gpu));
+    if (gpu_device_id_list != std::string("")) {
+      std::set<int> gpu_id_set;
+      std::vector<std::string> gpu_list_str = Common::Split(gpu_device_id_list.c_str(), ",");
+      for (const auto& gpu_str : gpu_list_str) {
+        int gpu_id = 0;
+        Common::Atoi(gpu_str.c_str(), &gpu_id);
+        if (gpu_id < 0 || gpu_id >= max_num_gpu) {
+          Log::Warning("Invalid GPU device ID %d in gpu_device_id_list is ignored.", gpu_id);
+        } else {
+          gpu_id_set.insert(gpu_id);
+        }
+      }
+      for (const int gpu_id : gpu_id_set) {
+        gpu_list_.push_back(gpu_id);
+      }
+    }
+    if (!gpu_list_.empty() && num_gpus_ != static_cast<int>(gpu_list_.size())) {
+      Log::Warning("num_gpus = %d is different from the number of valid device IDs in gpu_device_id_list (%d), using %d GPUs instead.",
+        num_gpus_, static_cast<int>(gpu_list_.size()), static_cast<int>(gpu_list_.size()));
+      num_gpus_ = static_cast<int>(gpu_list_.size());
+    }
+
+    if (!gpu_list_.empty()) {
+      bool check_master_gpu = false;
+      for (int i = 0; i < static_cast<int>(gpu_list_.size()); ++i) {
+        const int gpu_id = gpu_list_[i];
+        if (gpu_id == master_gpu_device_id_) {
+          check_master_gpu = true;
+          master_gpu_index_ = i;
+          break;
+        }
+      }
+      if (!check_master_gpu) {
+        Log::Warning("Master GPU device ID is not in gpu_device_id_list. Using %d as the master GPU instead.", gpu_list_[0]);
+        master_gpu_device_id_ = gpu_list_[0];
+        master_gpu_index_ = 0;
+      }
+    } else {
+      if (num_gpus_ <= 0) {
+        num_gpus_ = 1;
+      } else if (num_gpus_ > max_num_gpu) {
+        Log::Warning("Only %d GPUs are available, using num_gpus = %d.", max_num_gpu, max_num_gpu);
+        num_gpus_ = max_num_gpu;
+      }
+      if (master_gpu_device_id_ < 0 || master_gpu_device_id_ >= num_gpus_) {
+        Log::Warning("Invalid gpu_device_id = %d for the master GPU, using gpu_device_id = 0 instead.", master_gpu_device_id_);
+        master_gpu_device_id_ = 0;
+        master_gpu_index_ = 0;
+      }
+      for (int i = 0; i < num_gpus_; ++i) {
+        gpu_list_.push_back(i);
+      }
+    }
+
+    Log::Info("Using GPU devices %s, and local master GPU device %d.", Common::Join(gpu_list_, ",").c_str(), master_gpu_device_id_);
+
+    const int num_threads = OMP_NUM_THREADS();
+    if (num_gpus_ > num_threads) {
+      Log::Fatal("Number of GPUs %d is greater than the number of threads %d. 
Please use more threads.", num_gpus_, num_threads); + } + + host_threads_.resize(num_gpus_); + } + + void InitNCCL() { + nccl_gpu_rank_.resize(num_gpus_, -1); + nccl_communicators_.resize(num_gpus_); + ncclUniqueId nccl_unique_id; + if (Network::num_machines() == 1 || Network::rank() == 0) { + NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); + } + if (Network::num_machines() > 1) { + std::vector output_buffer(Network::num_machines()); + Network::Allgather( + reinterpret_cast(&nccl_unique_id), + sizeof(ncclUniqueId) / sizeof(char), + reinterpret_cast(output_buffer.data())); + if (Network::rank() > 0) { + nccl_unique_id = output_buffer[0]; + } + } + + if (Network::num_machines() > 1) { + node_rank_offset_.resize(Network::num_machines() + 1, 0); + Network::Allgather( + reinterpret_cast(&num_gpus_), + sizeof(int) / sizeof(char), + reinterpret_cast(node_rank_offset_.data() + 1)); + for (int rank = 1; rank < Network::num_machines() + 1; ++rank) { + node_rank_offset_[rank] += node_rank_offset_[rank - 1]; + } + CHECK_EQ(node_rank_offset_[Network::rank() + 1] - node_rank_offset_[Network::rank()], num_gpus_); + NCCLCHECK(ncclGroupStart()); + for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) { + SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__); + nccl_gpu_rank_[gpu_index] = gpu_index + node_rank_offset_[Network::rank()]; + NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], node_rank_offset_.back(), nccl_unique_id, nccl_gpu_rank_[gpu_index])); + } + NCCLCHECK(ncclGroupEnd()); + } else { + NCCLCHECK(ncclGroupStart()); + for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) { + SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__); + nccl_gpu_rank_[gpu_index] = gpu_index; + NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpus_, nccl_unique_id, gpu_index)); + } + NCCLCHECK(ncclGroupEnd()); + } + + // return to master gpu device + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + template + void RunPerDevice(const std::vector>& objs, const std::function& func) { + #pragma omp parallel for schedule(static) num_threads(num_gpus_) + for (int i = 0; i < num_gpus_; ++i) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i])); + func(objs[i].get()); + } + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + template + void InitPerDevice(std::vector>& vec) { + vec.resize(num_gpus_); + #pragma omp parallel for schedule(static) num_threads(num_gpus_) + for (int i = 0; i < num_gpus_; ++i) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i])); + RET_T* nccl_info = new RET_T(); + nccl_info->SetNCCLInfo(nccl_communicators_[i], nccl_gpu_rank_[i], i, gpu_list_[i], global_num_data_); + } + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + template + void DispatchPerDevice(std::vector>& objs, const std::function& func) { + for (int i = 0; i < num_gpus_; ++i) { + host_threads_[i] = std::thread([this, i, &func, &objs] () { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i])) + func(objs[i].get()); + }); + // if (pthread_create(&host_threads_[i], nullptr, [this, i, &func] (void* ptr) { + // CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i])) + // func(reinterpret_cast(ptr)); + // return reinterpret_cast(nullptr); + // }, + // reinterpret_cast(objs[i].get()))) { + // Log::Fatal("Error in creating boosting threads."); + // } + } + for (int i = 0; i < num_gpus_; ++i) { + // if (pthread_join(host_threads_[i], nullptr)) { + // Log::Fatal("Error in joining boosting threads."); + // } + host_threads_[i].join(); + } + 
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + template + void RunOnMasterDevice(const std::vector>& objs, const std::function& func) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + func(objs[master_gpu_index_].get()); + } + + template + void RunOnNonMasterDevice(const std::vector>& objs, const std::function& func) { + for (int i = 0; i < num_gpus_; ++i) { + if (i != master_gpu_index_) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i])); + func(objs[i].get()); + } + } + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + int num_gpus() const { return num_gpus_; } + + int master_gpu_index() const { return master_gpu_index_; } + + int master_gpu_device_id() const { return master_gpu_device_id_; } + + const std::vector& gpu_list() const { return gpu_list_; } + + private: + int num_gpus_; + int master_gpu_index_; + int master_gpu_device_id_; + std::vector gpu_list_; + data_size_t global_num_data_; + + ncclUniqueId nccl_unique_id_; + std::vector node_rank_offset_; + std::vector nccl_gpu_rank_; + std::vector nccl_communicators_; + std::vector host_threads_; +}; + +} // namespace LightGBM diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index 465ed334156c..3b4ce46a38be 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -19,7 +19,7 @@ namespace LightGBM { template -class CUDAObjectiveInterface: public HOST_OBJECTIVE { +class CUDAObjectiveInterface: public HOST_OBJECTIVE, NCCLInfo { public: explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) { const int gpu_device_id = config.gpu_device_id >= 0 ? config.gpu_device_id : 0; diff --git a/include/LightGBM/cuda/cuda_utils.hu b/include/LightGBM/cuda/cuda_utils.hu index 4bd84aeb264d..7fbb482d06a4 100644 --- a/include/LightGBM/cuda/cuda_utils.hu +++ b/include/LightGBM/cuda/cuda_utils.hu @@ -11,8 +11,10 @@ #include #include #include +#include #include +#include #include #include @@ -32,6 +34,15 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = #define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); } +#define NCCLCHECK(cmd) do { \ + ncclResult_t r = cmd; \ + if (r!= ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", \ + __FILE__,__LINE__,ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + void SetCUDADevice(int gpu_device_id, const char* file, int line); int GetCUDADevice(const char* file, int line); @@ -205,6 +216,32 @@ static __device__ T SafeLog(T x) { } } +class NCCLInfo { + public: + NCCLInfo() {} + + void SetNCCLInfo( + ncclComm_t nccl_communicator, + int nccl_gpu_rank, + int local_gpu_rank, + int gpu_device_id, + data_size_t global_num_data) { + nccl_communicator_ = nccl_communicator; + nccl_gpu_rank_ = nccl_gpu_rank; + local_gpu_rank_ = local_gpu_rank; + gpu_device_id_ = gpu_device_id; + global_num_data_ = global_num_data; + } + + protected: + ncclComm_t nccl_communicator_ = nullptr; + int nccl_gpu_rank_ = -1; + int local_gpu_rank_ = -1; + int gpu_device_id_ = -1; + int num_gpu_in_node_ = 0; + data_size_t global_num_data_ = 0; +}; + } // namespace LightGBM #endif // USE_CUDA diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 56bc7b841dc3..9e41d35e73cd 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -647,6 +647,8 @@ class Dataset { void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, 
data_size_t num_used_indices, bool need_meta_data); + void CopySubrowToDevice(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data, int gpu_device_id); + MultiValBin* GetMultiBinFromSparseFeatures(const std::vector& offsets) const; MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets) const; @@ -991,6 +993,8 @@ class Dataset { void CreateCUDAColumnData(); + void CopySubrowHostPart(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data); + std::string data_filename_; /*! \brief Store used features */ std::vector> feature_groups_; diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index ad188dc39676..fb3dc3ef2977 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -107,6 +107,14 @@ class ObjectiveFunction { virtual bool NeedConvertOutputCUDA () const { return false; } + /*! + * \brief Create object of objective function on CUDA + * \param type Specific type of objective function + * \param config Config for objective function + */ + LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunctionCUDA(const std::string& type, + const Config& config); + #endif // USE_CUDA }; diff --git a/src/application/application.cpp b/src/application/application.cpp index 0bb9eca13bf2..926d1d4608d1 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -181,7 +181,7 @@ void Application::InitTrain() { // create boosting boosting_.reset( Boosting::CreateBoosting(config_.boosting, - config_.input_model.c_str())); + config_.input_model.c_str(), config_.device_type, config_.num_gpus)); // create objective function objective_fun_.reset( ObjectiveFunction::CreateObjectiveFunction(config_.objective, @@ -261,13 +261,13 @@ void Application::Predict() { void Application::InitPredict() { boosting_.reset( - Boosting::CreateBoosting("gbdt", config_.input_model.c_str())); + Boosting::CreateBoosting("gbdt", config_.input_model.c_str(), config_.device_type, config_.num_gpus)); Log::Info("Finished initializing prediction, total used %d iterations", boosting_->GetCurrentIteration()); } void Application::ConvertModel() { boosting_.reset( - Boosting::CreateBoosting(config_.boosting, config_.input_model.c_str())); + Boosting::CreateBoosting(config_.boosting, config_.input_model.c_str(), config_.device_type, config_.num_gpus)); boosting_->SaveModelToIfElse(-1, config_.convert_model.c_str()); } diff --git a/src/boosting/boosting.cpp b/src/boosting/boosting.cpp index 98f2554b1388..7c5fcde1b655 100644 --- a/src/boosting/boosting.cpp +++ b/src/boosting/boosting.cpp @@ -8,6 +8,10 @@ #include "gbdt.h" #include "rf.hpp" +#ifdef USE_CUDA +#include "cuda/nccl_gbdt.hpp" +#endif // USE_CUDA + namespace LightGBM { std::string GetBoostingTypeFromModelFile(const char* filename) { @@ -31,7 +35,7 @@ bool Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) { return true; } -Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename) { +Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpus) { if (filename == nullptr || filename[0] == '\0') { if (type == std::string("gbdt")) { return new GBDT(); @@ -48,7 +52,15 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename std::unique_ptr ret; if (GetBoostingTypeFromModelFile(filename) == std::string("tree")) { if (type == 
std::string("gbdt")) { - ret.reset(new GBDT()); + #ifdef USE_CUDA + if (device_type == std::string("cuda") && num_gpus > 1) { + return new NCCLGBDT(); + } else { + #endif // USE_CUDA + return new GBDT(); + #ifdef USE_CUDA + } + #endif // USE_CUDA } else if (type == std::string("dart")) { ret.reset(new DART()); } else if (type == std::string("goss")) { diff --git a/src/boosting/cuda/nccl_gbdt.cpp b/src/boosting/cuda/nccl_gbdt.cpp index 47672a53679c..f06c5b92611b 100644 --- a/src/boosting/cuda/nccl_gbdt.cpp +++ b/src/boosting/cuda/nccl_gbdt.cpp @@ -4,8 +4,11 @@ */ #include "nccl_gbdt.hpp" +#include "nccl_gbdt_component.hpp" #include +#include + #ifdef USE_CUDA namespace LightGBM { @@ -22,182 +25,26 @@ void NCCLGBDT::Init( const ObjectiveFunction* objective_function, const std::vector& training_metrics) { GBDT_T::Init(gbdt_config, train_data, objective_function, training_metrics); - int max_num_gpu = 0; - CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&max_num_gpu)); - num_gpu_ = this->config_->num_gpu; - if (num_gpu_ > max_num_gpu) { - Log::Warning("Specifying %d GPUs, but only %d available.", num_gpu_, max_num_gpu); - num_gpu_ = max_num_gpu; - } - int gpu_device_id = this->config_->gpu_device_id; - if (this->config_->gpu_device_list == std::string("")) { - if (gpu_device_id < 0 || gpu_device_id >= num_gpu_) { - Log::Warning("Master GPU Device ID %d is not in the valid range [%d, %d], will use GPU 0 as master.", gpu_device_id, 0, max_num_gpu); - gpu_device_id = 0; - } - } - master_gpu_device_id_ = gpu_device_id; - master_gpu_index_ = master_gpu_device_id_; - - if (this->config_->gpu_device_list != std::string("")) { - std::vector gpu_list_str = Common::Split(this->config_->gpu_device_list.c_str(), ","); - for (const auto& gpu_str : gpu_list_str) { - int gpu_id = 0; - Common::Atoi(gpu_str.c_str(), &gpu_id); - gpu_list_.emplace_back(gpu_id); - } - bool check_master_gpu = false; - for (int i = 0; i < static_cast(gpu_list_.size()); ++i) { - const int gpu_id = gpu_list_[i]; - if (gpu_id == master_gpu_device_id_) { - master_gpu_index_ = i; - check_master_gpu = true; - } - } - if (!check_master_gpu) { - Log::Fatal("Master GPU ID %d is not in GPU ID list.", master_gpu_device_id_); - } - } - - const int num_threads = OMP_NUM_THREADS(); - if (num_gpu_ > num_threads) { - Log::Fatal("Number of GPUs %d is greather than the number of threads %d. 
Please use more threads.", num_gpu_, num_threads); - } - - InitNCCL(); - - // partition data across GPUs - const data_size_t num_data_per_gpu = (this->num_data_ + num_gpu_ - 1) / num_gpu_; - std::vector all_data_indices(this->num_data_, 0); - #pragma omp parallel for schedule(static) - for (data_size_t i = 0; i < this->num_data_; ++i) { - all_data_indices[i] = i; - } - per_gpu_data_start_.resize(num_gpu_); - per_gpu_data_end_.resize(num_gpu_); - per_gpu_datasets_.resize(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - const data_size_t data_start = num_data_per_gpu * gpu_index; - const data_size_t data_end = std::min(data_start + num_data_per_gpu, this->num_data_); - const data_size_t num_data_in_gpu = data_end - data_start; - per_gpu_data_start_[gpu_index] = data_start; - per_gpu_data_end_[gpu_index] = data_end; - per_gpu_datasets_[gpu_index].reset(new Dataset(num_data_in_gpu)); - per_gpu_datasets_[gpu_index]->ReSize(num_data_in_gpu); - per_gpu_datasets_[gpu_index]->CopyFeatureMapperFrom(this->train_data_); - per_gpu_datasets_[gpu_index]->CopySubrow(this->train_data_, all_data_indices.data() + data_start, num_data_in_gpu, true, data_start, data_end, GetCUDADevice(gpu_index)); - } - // initialize per gpu objectives, training scores and tree learners - per_gpu_objective_functions_.resize(num_gpu_); - per_gpu_train_score_updater_.resize(num_gpu_); - per_gpu_gradients_.resize(num_gpu_); - per_gpu_hessians_.resize(num_gpu_); - per_gpu_tree_learners_.resize(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - per_gpu_data_start_[gpu_index]; - per_gpu_objective_functions_[gpu_index].reset(ObjectiveFunction::CreateObjectiveFunction(this->config_->objective, *(this->config_.get()))); - per_gpu_objective_functions_[gpu_index]->Init(per_gpu_datasets_[gpu_index]->metadata(), per_gpu_datasets_[gpu_index]->num_data()); - per_gpu_objective_functions_[gpu_index]->SetNCCLComm(&nccl_communicators_[gpu_index]); - per_gpu_train_score_updater_[gpu_index].reset(new CUDAScoreUpdater(per_gpu_datasets_[gpu_index].get(), this->num_tree_per_iteration_)); - per_gpu_gradients_[gpu_index].reset(new CUDAVector(num_data_in_gpu)); - per_gpu_hessians_[gpu_index].reset(new CUDAVector(num_data_in_gpu)); - per_gpu_tree_learners_[gpu_index].reset(TreeLearner::CreateTreeLearner( - this->config_->tree_learner, - this->config_->device_type, - this->config_.get())); - per_gpu_tree_learners_[gpu_index]->SetNCCL(&nccl_communicators_[gpu_index], nccl_gpu_rank_[gpu_index], GetCUDADevice(gpu_index), this->num_data_); - per_gpu_tree_learners_[gpu_index]->Init(per_gpu_datasets_[gpu_index].get(), this->is_constant_hessian_); - } - - // initialize host threads and thread data - host_threads_.resize(num_gpu_); - boosting_thread_data_.resize(num_gpu_); - train_tree_learner_thread_data_.resize(num_gpu_); - update_score_thread_data_.resize(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - boosting_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); - boosting_thread_data_[gpu_index].gpu_objective_function = per_gpu_objective_functions_[gpu_index].get(); - boosting_thread_data_[gpu_index].gradients = per_gpu_gradients_[gpu_index]->RawData(); - boosting_thread_data_[gpu_index].hessians = per_gpu_hessians_[gpu_index]->RawData(); - boosting_thread_data_[gpu_index].score = per_gpu_train_score_updater_[gpu_index]->score(); - 
train_tree_learner_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); - train_tree_learner_thread_data_[gpu_index].gpu_tree_learner = per_gpu_tree_learners_[gpu_index].get(); - train_tree_learner_thread_data_[gpu_index].gradients = per_gpu_gradients_[gpu_index]->RawData(); - train_tree_learner_thread_data_[gpu_index].hessians = per_gpu_hessians_[gpu_index]->RawData(); - train_tree_learner_thread_data_[gpu_index].num_data_in_gpu = per_gpu_data_end_[gpu_index] - per_gpu_data_start_[gpu_index]; - update_score_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); - update_score_thread_data_[gpu_index].gpu_score_updater = per_gpu_train_score_updater_[gpu_index].get(); - update_score_thread_data_[gpu_index].gpu_tree_learner = per_gpu_tree_learners_[gpu_index].get(); - } - - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); -} + nccl_topology_.reset(new NCCLTopology(this->config_->gpu_device_id, this->config_->num_gpus, this->config_->gpu_device_id_list, train_data->num_data())); -template -void NCCLGBDT::InitNCCL() { - nccl_gpu_rank_.resize(num_gpu_, -1); - nccl_communicators_.resize(num_gpu_); - ncclUniqueId nccl_unique_id; - if (Network::num_machines() == 1 || Network::rank() == 0) { - NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); - } - if (Network::num_machines() > 1) { - std::vector output_buffer(Network::num_machines()); - Network::Allgather( - reinterpret_cast(&nccl_unique_id), - sizeof(ncclUniqueId) / sizeof(char), - reinterpret_cast(output_buffer.data())); - if (Network::rank() > 0) { - nccl_unique_id = output_buffer[0]; - } - } - - if (Network::num_machines() > 1) { - std::vector num_gpus_per_machine(Network::num_machines() + 1, 0); - Network::Allgather( - reinterpret_cast(&num_gpu_), - sizeof(int) / sizeof(char), - reinterpret_cast(num_gpus_per_machine.data() + 1)); - for (int rank = 1; rank < Network::num_machines() + 1; ++rank) { - num_gpus_per_machine[rank] += num_gpus_per_machine[rank - 1]; - } - CHECK_EQ(num_gpus_per_machine[Network::rank() + 1] - num_gpus_per_machine[Network::rank()], num_gpu_); - NCCLCHECK(ncclGroupStart()); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - nccl_gpu_rank_[gpu_index] = gpu_index + num_gpus_per_machine[Network::rank()]; - NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpus_per_machine.back(), nccl_unique_id, nccl_gpu_rank_[gpu_index])); - } - NCCLCHECK(ncclGroupEnd()); - } else { - NCCLCHECK(ncclGroupStart()); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - nccl_gpu_rank_[gpu_index] = gpu_index; - NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpu_, nccl_unique_id, gpu_index)); - } - NCCLCHECK(ncclGroupEnd()); - } + nccl_topology_->InitNCCL(); - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + nccl_topology_->InitPerDevice(nccl_gbdt_components_); + nccl_topology_->RunPerDevice(nccl_gbdt_components_, [this, gbdt_config, train_data] + (NCCLGBDTComponent* nccl_gbdt_component) { nccl_gbdt_component->Init( + gbdt_config, train_data, this->num_tree_per_iteration_, this->boosting_on_gpu_, this->is_constant_hessian_ + ); + }); } template -void* NCCLGBDT::BoostingThread(void* thread_data) { - const BoostingThreadData* boosting_thread_data = reinterpret_cast(thread_data); - const int gpu_index = boosting_thread_data->gpu_index; - const ObjectiveFunction* objective_function = boosting_thread_data->gpu_objective_function; - score_t* 
gradients = boosting_thread_data->gradients; - score_t* hessians = boosting_thread_data->hessians; - const double* score = boosting_thread_data->score; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); +void NCCLGBDT::BoostingThread(NCCLGBDTComponent* thread_data) { + const ObjectiveFunction* objective_function = thread_data->objective_function(); + score_t* gradients = thread_data->gradients(); + score_t* hessians = thread_data->hessians(); + const double* score = thread_data->train_score_updater()->score(); objective_function->GetGradients(score, gradients, hessians); - return nullptr; } template @@ -206,46 +53,28 @@ void NCCLGBDT::Boosting() { if (this->objective_function_ == nullptr) { Log::Fatal("No object function provided"); } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_create(&host_threads_[gpu_index], nullptr, BoostingThread, - reinterpret_cast(&boosting_thread_data_[gpu_index]))) { - Log::Fatal("Error in creating boosting threads."); - } - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_join(host_threads_[gpu_index], nullptr)) { - Log::Fatal("Error in joining boosting threads."); - } - } + nccl_topology_->DispatchPerDevice(nccl_gbdt_components_, BoostingThread); } template double NCCLGBDT::BoostFromAverage(int class_id, bool update_scorer) { double init_score = GBDT_T::BoostFromAverage(class_id, update_scorer); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - if (std::fabs(init_score) > kEpsilon && update_scorer) { - per_gpu_train_score_updater_[gpu_index]->AddScore(init_score, class_id); - } + + if (init_score != 0.0) { + nccl_topology_->RunPerDevice(nccl_gbdt_components_, [init_score, class_id] (NCCLGBDTComponent* thread_data) { + thread_data->train_score_updater()->AddScore(init_score, class_id); + }); } - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + return init_score; } template -void* NCCLGBDT::TrainTreeLearnerThread(void* thread_data) { - TrainTreeLearnerThreadData* tree_train_learner_thread_data = reinterpret_cast(thread_data); - const int gpu_index = tree_train_learner_thread_data->gpu_index; - const int class_id = tree_train_learner_thread_data->class_id; - const data_size_t num_data_in_gpu = tree_train_learner_thread_data->num_data_in_gpu; - const score_t* gradients = tree_train_learner_thread_data->gradients + class_id * num_data_in_gpu; - const score_t* hessians = tree_train_learner_thread_data->hessians + class_id * num_data_in_gpu; - const bool is_first_tree = tree_train_learner_thread_data->is_first_time; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); - tree_train_learner_thread_data->tree.reset( - tree_train_learner_thread_data->gpu_tree_learner->Train(gradients, hessians, is_first_tree)); - return nullptr; +void NCCLGBDT::TrainTreeLearnerThread(NCCLGBDTComponent* thread_data, const int class_id, const bool is_first_tree) { + const data_size_t num_data_in_gpu = thread_data->num_data_in_gpu(); + const score_t* gradients = thread_data->gradients() + class_id * num_data_in_gpu; + const score_t* hessians = thread_data->hessians() + class_id * num_data_in_gpu; + thread_data->SetTree(thread_data->tree_learner()->Train(gradients, hessians, is_first_tree)); } template @@ -259,88 +88,61 @@ bool NCCLGBDT::TrainOneIter(const score_t* gradients, const score_t* hes } Boosting(); } else { - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - const data_size_t gpu_data_start = 
per_gpu_data_start_[gpu_index]; - const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - gpu_data_start; + nccl_topology_->RunPerDevice(nccl_gbdt_components_, [this, gradients, hessians] (NCCLGBDTComponent* thread_data) { + const data_size_t data_start_index = thread_data->data_start_index(); + const data_size_t num_data_in_gpu = thread_data->num_data_in_gpu(); + for (int class_id = 0; class_id < this->num_class_; ++class_id) { CopyFromHostToCUDADevice( - per_gpu_gradients_[gpu_index]->RawData() + class_id * num_data_in_gpu, - gradients + class_id * this->num_data_ + gpu_data_start, num_data_in_gpu, __FILE__, __LINE__); + thread_data->gradients() + class_id * num_data_in_gpu, + gradients + class_id * this->num_data_ + data_start_index, num_data_in_gpu, __FILE__, __LINE__); CopyFromHostToCUDADevice( - per_gpu_hessians_[gpu_index]->RawData() + class_id * num_data_in_gpu, - hessians + class_id * this->num_data_ + gpu_data_start, num_data_in_gpu, __FILE__, __LINE__); + thread_data->hessians() + class_id * num_data_in_gpu, + hessians + class_id * this->num_data_ + data_start_index, num_data_in_gpu, __FILE__, __LINE__); } - } - - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + }); } bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < this->num_tree_per_iteration_; ++cur_tree_id) { - std::vector> new_tree(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - new_tree[gpu_index].reset(nullptr); - } if (this->class_need_train_[cur_tree_id] && this->train_data_->num_features() > 0) { - if (this->is_use_subset_ && this->bag_data_cnt_ < this->num_data_) { + if (this->data_sample_strategy_->is_use_subset() && this->data_sample_strategy_->bag_data_cnt() < this->num_data_) { Log::Fatal("Bagging is not supported for NCCLGBDT"); } bool is_first_tree = this->models_.size() < static_cast(this->num_tree_per_iteration_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - train_tree_learner_thread_data_[gpu_index].is_first_time = is_first_tree; - train_tree_learner_thread_data_[gpu_index].class_id = cur_tree_id; - if (pthread_create(&host_threads_[gpu_index], nullptr, TrainTreeLearnerThread, - reinterpret_cast(&train_tree_learner_thread_data_[gpu_index]))) { - Log::Fatal("Error in creating tree training threads."); - } - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_join(host_threads_[gpu_index], nullptr)) { - Log::Fatal("Error in joining tree training threads."); - } - new_tree[gpu_index].reset(train_tree_learner_thread_data_[gpu_index].tree.release()); - } + nccl_topology_->DispatchPerDevice(nccl_gbdt_components_, + [is_first_tree, cur_tree_id] (NCCLGBDTComponent* thread_data) -> void { + TrainTreeLearnerThread(thread_data, cur_tree_id, is_first_tree); + }); } - if (new_tree[master_gpu_index_]->num_leaves() > 1) { - should_continue = true; - if (this->objective_function_ != nullptr && this->objective_function_->IsRenewTreeOutput()) { - Log::Fatal("Objective function with renewing is not supported for NCCLGBDT."); - } - // shrinkage by learning rate - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - new_tree[gpu_index]->Shrinkage(this->shrinkage_rate_); + nccl_topology_->DispatchPerDevice(nccl_gbdt_components_, [cur_tree_id, this, init_scores] (NCCLGBDTComponent* thread_data) -> void { + this->UpdateScoreThread(thread_data, cur_tree_id, this->config_->learning_rate, init_scores[cur_tree_id]); + }); + + 
nccl_topology_->RunOnMasterDevice(nccl_gbdt_components_, [&should_continue, this, cur_tree_id] (NCCLGBDTComponent* thread_data) -> void { + if (thread_data->new_tree()->num_leaves() > 1) { + should_continue = true; } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); - // update score - UpdateScore(new_tree, cur_tree_id); - if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) { - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - new_tree[gpu_index]->AddBias(init_scores[cur_tree_id]); - } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + for (auto& score_updater : this->valid_score_updater_) { + score_updater->AddScore(thread_data->new_tree(), cur_tree_id); } - } else { - // only add default score one-time + }); + + if (!should_continue) { if (this->models_.size() < static_cast(this->num_tree_per_iteration_)) { Log::Warning("Training stopped with no splits."); } } // add model - this->models_.push_back(std::move(new_tree[master_gpu_index_])); + nccl_topology_->RunOnMasterDevice(nccl_gbdt_components_, [this] (NCCLGBDTComponent* thread_data) -> void { + this->models_.emplace_back(thread_data->release_new_tree()); + }); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (gpu_index != master_gpu_index_) { - SetCUDADevice(gpu_index); - new_tree[gpu_index].reset(nullptr); - } - } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + nccl_topology_->RunOnNonMasterDevice(nccl_gbdt_components_, [this] (NCCLGBDTComponent* thread_data) -> void { + thread_data->clear_new_tree(); + }); } if (!should_continue) { @@ -358,45 +160,20 @@ bool NCCLGBDT::TrainOneIter(const score_t* gradients, const score_t* hes } template -void* NCCLGBDT::UpdateScoreThread(void* thread_data) { - const UpdateScoreThreadData* update_score_thread_data = reinterpret_cast(thread_data); - const int gpu_index = update_score_thread_data->gpu_index; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); - update_score_thread_data->gpu_score_updater->AddScore( - update_score_thread_data->gpu_tree_learner, - update_score_thread_data->tree, - update_score_thread_data->cur_tree_id); - return nullptr; -} - -template -void NCCLGBDT::UpdateScore(const std::vector>& tree, const int cur_tree_id) { - Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer); - // update training score - if (!this->is_use_subset_) { - if (this->num_data_ - this->bag_data_cnt_ > 0) { - Log::Fatal("bagging is not supported for NCCLGBDT."); - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - update_score_thread_data_[gpu_index].tree = tree[gpu_index].get(); - update_score_thread_data_[gpu_index].cur_tree_id = cur_tree_id; - if (pthread_create(&host_threads_[gpu_index], nullptr, UpdateScoreThread, - reinterpret_cast(&update_score_thread_data_[gpu_index]))) { - Log::Fatal("Error in creating update score threads."); - } +void NCCLGBDT::UpdateScoreThread(NCCLGBDTComponent* thread_data, const int cur_tree_id, const double shrinkage_rate, const double init_score) { + if (thread_data->new_tree()->num_leaves() > 1) { + // TODO(shiyu1994): implement bagging + if (thread_data->objective_function() != nullptr && thread_data->objective_function()->IsRenewTreeOutput()) { + // TODO(shiyu1994): implement renewing } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_join(host_threads_[gpu_index], nullptr)) { - Log::Fatal("Error in joining tree training threads."); - } + thread_data->new_tree()->Shrinkage(shrinkage_rate); + 
thread_data->train_score_updater()->AddScore( + thread_data->tree_learner(), + thread_data->new_tree(), + cur_tree_id); + if (std::fabs(init_score) > kEpsilon) { + thread_data->new_tree()->AddBias(init_score); } - } else { - Log::Fatal("bagging is not supported for NCCLGBDT."); - } - - // update validation score - for (auto& score_updater : this->valid_score_updater_) { - score_updater->AddScore(tree[master_gpu_index_].get(), cur_tree_id); } } @@ -405,18 +182,18 @@ std::vector NCCLGBDT::EvalOneMetric(const Metric* metric, const if (score == this->train_score_updater_->score()) { // delegate to per gpu train score updater std::vector tmp_score(num_data * this->num_class_, 0.0f); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - const data_size_t data_start = per_gpu_data_start_[gpu_index]; - const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - data_start; + + nccl_topology_->RunPerDevice(nccl_gbdt_components_, [this, &tmp_score] (NCCLGBDTComponent* thread_data) { + const data_size_t data_start = thread_data->data_start_index(); + const data_size_t num_data_in_gpu = thread_data->num_data_in_gpu(); for (int class_id = 0; class_id < this->num_class_; ++class_id) { - SetCUDADevice(gpu_index); CopyFromCUDADeviceToHost(tmp_score.data() + class_id * this->num_data_ + data_start, - per_gpu_train_score_updater_[gpu_index]->score() + class_id * num_data_in_gpu, + thread_data->train_score_updater()->score() + class_id * num_data_in_gpu, static_cast(num_data_in_gpu), __FILE__, __LINE__); } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); - return metric->Eval(tmp_score.data(), this->objective_function_); - } + }); + + return metric->Eval(tmp_score.data(), this->objective_function_); } else { return GBDT_T::EvalOneMetric(metric, score, num_data); } diff --git a/src/boosting/cuda/nccl_gbdt.hpp b/src/boosting/cuda/nccl_gbdt.hpp index 1d808ef5e65f..7f648b1e42d6 100644 --- a/src/boosting/cuda/nccl_gbdt.hpp +++ b/src/boosting/cuda/nccl_gbdt.hpp @@ -9,10 +9,13 @@ #ifdef USE_CUDA #include "../gbdt.h" +#include #include #include #include "cuda_score_updater.hpp" #include +#include +#include "nccl_gbdt_component.hpp" namespace LightGBM { @@ -100,17 +103,11 @@ class NCCLGBDT: public GBDT_T { } }; - static void* BoostingThread(void* thread_data); + static void BoostingThread(NCCLGBDTComponent* thread_data); - static void* TrainTreeLearnerThread(void* thread_data); + static void TrainTreeLearnerThread(NCCLGBDTComponent* thread_data, const int class_id, const bool is_first_tree); - static void* UpdateScoreThread(void* thread_data); - - void Bagging(int /*iter*/) override { - Log::Fatal("Bagging is not supported for NCCLGBDT."); - } - - void InitNCCL(); + static void UpdateScoreThread(NCCLGBDTComponent* thread_data, const int cur_tree_id, const double shrinkage_rate, const double init_score); double BoostFromAverage(int class_id, bool update_scorer) override; @@ -126,27 +123,8 @@ class NCCLGBDT: public GBDT_T { std::vector EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const override; - void SetCUDADevice(int gpu_id) const { - if (gpu_list_.empty()) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_id)); - } else { - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[gpu_id])); - } - } - int GetCUDADevice(int gpu_id) const { - if (gpu_list_.empty()) { - return gpu_id; - } else { - return gpu_list_[gpu_id]; - } - } - - int num_gpu_; int num_threads_; - int master_gpu_device_id_; - int master_gpu_index_; - std::vector gpu_list_; std::vector> 
per_gpu_objective_functions_; std::vector> per_gpu_train_score_updater_; std::vector>> per_gpu_gradients_; @@ -158,9 +136,14 @@ class NCCLGBDT: public GBDT_T { std::vector boosting_thread_data_; std::vector train_tree_learner_thread_data_; std::vector update_score_thread_data_; + + std::unique_ptr nccl_topology_; + std::vector nccl_gpu_rank_; std::vector nccl_communicators_; std::vector> per_gpu_tree_learners_; + + std::vector> nccl_gbdt_components_; }; } // namespace LightGBM diff --git a/src/boosting/cuda/nccl_gbdt_component.hpp b/src/boosting/cuda/nccl_gbdt_component.hpp new file mode 100644 index 000000000000..600f6c9b596f --- /dev/null +++ b/src/boosting/cuda/nccl_gbdt_component.hpp @@ -0,0 +1,95 @@ +/*! + * Copyright (c) 2023 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#ifndef LIGHTGBM_BOOSTING_CUDA_NCCL_GBDT_COMPONENT_HPP_ +#define LIGHTGBM_BOOSTING_CUDA_NCCL_GBDT_COMPONENT_HPP_ + +#ifdef USE_CUDA + + +#include +#include +#include "cuda_score_updater.hpp" +#include "../../treelearner/cuda/cuda_single_gpu_tree_learner.hpp" +#include + +namespace LightGBM { + +class NCCLGBDTComponent: public NCCLInfo { + public: + NCCLGBDTComponent() {} + + void Init(const Config* config, const Dataset* train_data, const int num_tree_per_iteration, const bool boosting_on_gpu, const bool is_constant_hessian) { + const data_size_t num_data_per_gpu = (train_data->num_data() + num_gpu_in_node_ - 1) / num_gpu_in_node_; + data_start_index_ = num_data_per_gpu * local_gpu_rank_; + data_end_index_ = std::min(data_start_index_ + num_data_per_gpu, train_data->num_data()); + num_data_in_gpu_ = data_end_index_ - data_start_index_; + + dataset_.reset(new Dataset(num_data_in_gpu_)); + dataset_->ReSize(num_data_in_gpu_); + dataset_->CopyFeatureMapperFrom(train_data); + std::vector used_indices(num_data_in_gpu_); + for (data_size_t data_index = data_start_index_; data_index < data_end_index_; ++data_index) { + used_indices[data_index - data_start_index_] = data_index; + } + dataset_->CopySubrowToDevice(train_data, used_indices.data(), num_data_in_gpu_, true, gpu_device_id_); + + objective_function_.reset(ObjectiveFunction::CreateObjectiveFunctionCUDA(config->objective, *config)); + train_score_updater_.reset(new CUDAScoreUpdater(dataset_.get(), num_tree_per_iteration, boosting_on_gpu)); + gradients_.reset(new CUDAVector(num_data_in_gpu_)); + hessians_.reset(new CUDAVector(num_data_in_gpu_)); + tree_learner_.reset(new CUDASingleGPUTreeLearner(config, boosting_on_gpu)); + + tree_learner_->SetNCCLInfo(nccl_communicator_, nccl_gpu_rank_, local_gpu_rank_, gpu_device_id_, train_data->num_data()); + + objective_function_->Init(dataset_->metadata(), dataset_->num_data()); + tree_learner_->Init(dataset_.get(), is_constant_hessian); + } + + ObjectiveFunction* objective_function() { return objective_function_.get(); } + + CUDAScoreUpdater* train_score_updater() { return train_score_updater_.get(); } + + score_t* gradients() { return gradients_->RawData(); } + + score_t* hessians() { return hessians_->RawData(); } + + data_size_t num_data_in_gpu() const { return num_data_in_gpu_; } + + CUDASingleGPUTreeLearner* tree_learner() { return tree_learner_.get(); } + + void SetTree(Tree* tree) { + new_tree_.reset(tree); + } + + data_size_t data_start_index() const { return data_start_index_; } + + data_size_t data_end_index() const { return data_end_index_; } + + Tree* new_tree() { return new_tree_.get(); } + + Tree* 
release_new_tree() { return new_tree_.release(); } + + void clear_new_tree() { new_tree_.reset(nullptr); } + + private: + std::unique_ptr objective_function_; + std::unique_ptr train_score_updater_; + std::unique_ptr> gradients_; + std::unique_ptr> hessians_; + std::unique_ptr dataset_; + std::unique_ptr tree_learner_; + std::unique_ptr new_tree_; + + data_size_t data_start_index_; + data_size_t data_end_index_; + data_size_t num_data_in_gpu_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA + +#endif // LIGHTGBM_BOOSTING_CUDA_NCCL_GBDT_COMPONENT_HPP_ diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 28ebee446fad..98f2f3235fd9 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -505,7 +505,7 @@ class GBDT : public GBDTBase { */ std::string OutputMetric(int iter); - double BoostFromAverage(int class_id, bool update_scorer); + virtual double BoostFromAverage(int class_id, bool update_scorer); /*! * \brief Reset gradient buffers, must be called after sample strategy is reset diff --git a/src/c_api.cpp b/src/c_api.cpp index baf934db42b1..9e7c29d69dd8 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -108,7 +108,7 @@ class SingleRowPredictor { class Booster { public: explicit Booster(const char* filename) { - boosting_.reset(Boosting::CreateBoosting("gbdt", filename)); + boosting_.reset(Boosting::CreateBoosting("gbdt", filename, std::string("cpu"), 0)); } Booster(const Dataset* train_data, @@ -122,7 +122,7 @@ class Booster { "please use continued train with input score"); } - boosting_.reset(Boosting::CreateBoosting(config_.boosting, nullptr)); + boosting_.reset(Boosting::CreateBoosting(config_.boosting, nullptr, config_.device_type, config_.num_gpus)); train_data_ = train_data; CreateObjectiveAndMetrics(); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 394614af3f33..204a41c0bba3 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -318,8 +318,9 @@ const std::unordered_set& Config::parameter_set() { "machines", "gpu_platform_id", "gpu_device_id", + "num_gpus", + "gpu_device_id_list", "gpu_use_dp", - "num_gpu", }); return params; } @@ -656,10 +657,11 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet {"machines", {"workers", "nodes"}}, {"gpu_platform_id", {}}, {"gpu_device_id", {}}, + {"num_gpus", {}}, + {"gpu_device_id_list", {}}, {"gpu_use_dp", {}}, - {"num_gpu", {}}, }); return map; } @@ -1061,8 +1065,9 @@ const std::unordered_map& Config::ParameterTypes() { {"machines", "string"}, {"gpu_platform_id", "int"}, {"gpu_device_id", "int"}, + {"num_gpus", "int"}, + {"gpu_device_id_list", "string"}, {"gpu_use_dp", "bool"}, - {"num_gpu", "int"}, }); return map; } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index e78f8a6b696c..7e77d5f819f6 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -833,9 +833,8 @@ void Dataset::ReSize(data_size_t num_data) { } } -void Dataset::CopySubrow(const Dataset* fullset, - const data_size_t* used_indices, - data_size_t num_used_indices, bool need_meta_data) { + +void Dataset::CopySubrowHostPart(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) { CHECK_EQ(num_used_indices, num_data_); std::vector group_ids, subfeature_ids; @@ -882,6 +881,13 @@ void Dataset::CopySubrow(const Dataset* fullset, } } } +} + +void Dataset::CopySubrow(const Dataset* fullset, + const data_size_t* used_indices, + data_size_t num_used_indices, bool need_meta_data) { + CopySubrowHostPart(fullset, used_indices, num_used_indices, 
need_meta_data); + // update CUDA storage for column data and metadata device_type_ = fullset->device_type_; gpu_device_id_ = fullset->gpu_device_id_; @@ -889,7 +895,28 @@ void Dataset::CopySubrow(const Dataset* fullset, #ifdef USE_CUDA if (device_type_ == std::string("cuda")) { if (cuda_column_data_ == nullptr) { - cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); + cuda_column_data_.reset(new CUDAColumnData(num_used_indices, gpu_device_id_)); + metadata_.CreateCUDAMetadata(gpu_device_id_); + } + cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); + } + #endif // USE_CUDA +} + +void Dataset::CopySubrowToDevice(const Dataset* fullset, + const data_size_t* used_indices, + data_size_t num_used_indices, bool need_meta_data, + int gpu_device_id) { + CopySubrowHostPart(fullset, used_indices, num_used_indices, need_meta_data); + + // update CUDA storage for column data and metadata + device_type_ = fullset->device_type_; + gpu_device_id_ = gpu_device_id; + + #ifdef USE_CUDA + if (device_type_ == std::string("cuda")) { + if (cuda_column_data_ == nullptr) { + cuda_column_data_.reset(new CUDAColumnData(num_used_indices, gpu_device_id_)); metadata_.CreateCUDAMetadata(gpu_device_id_); } cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index a203017cf36e..1d758b361bca 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -3,6 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #include +#include #include "binary_objective.hpp" #include "multiclass_objective.hpp" @@ -17,52 +18,58 @@ namespace LightGBM { +#ifdef USE_CUDA +ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunctionCUDA(const std::string& type, const Config& config) { + if (type == std::string("regression")) { + return new CUDARegressionL2loss(config); + } else if (type == std::string("regression_l1")) { + return new CUDARegressionL1loss(config); + } else if (type == std::string("quantile")) { + return new CUDARegressionQuantileloss(config); + } else if (type == std::string("huber")) { + return new CUDARegressionHuberLoss(config); + } else if (type == std::string("fair")) { + return new CUDARegressionFairLoss(config); + } else if (type == std::string("poisson")) { + return new CUDARegressionPoissonLoss(config); + } else if (type == std::string("binary")) { + return new CUDABinaryLogloss(config); + } else if (type == std::string("lambdarank")) { + return new CUDALambdarankNDCG(config); + } else if (type == std::string("rank_xendcg")) { + return new CUDARankXENDCG(config); + } else if (type == std::string("multiclass")) { + return new CUDAMulticlassSoftmax(config); + } else if (type == std::string("multiclassova")) { + return new CUDAMulticlassOVA(config); + } else if (type == std::string("cross_entropy")) { + Log::Warning("Objective cross_entropy is not implemented in cuda version. Fall back to boosting on CPU."); + return new CrossEntropy(config); + } else if (type == std::string("cross_entropy_lambda")) { + Log::Warning("Objective cross_entropy_lambda is not implemented in cuda version. Fall back to boosting on CPU."); + return new CrossEntropyLambda(config); + } else if (type == std::string("mape")) { + Log::Warning("Objective mape is not implemented in cuda version. 
Fall back to boosting on CPU."); + return new RegressionMAPELOSS(config); + } else if (type == std::string("gamma")) { + Log::Warning("Objective gamma is not implemented in cuda version. Fall back to boosting on CPU."); + return new RegressionGammaLoss(config); + } else if (type == std::string("tweedie")) { + Log::Warning("Objective tweedie is not implemented in cuda version. Fall back to boosting on CPU."); + return new RegressionTweedieLoss(config); + } else if (type == std::string("custom")) { + Log::Warning("Using customized objective with cuda. This requires copying gradients from CPU to GPU, which can be slow."); + return nullptr; + } +} +#endif // USE_CUDA + ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) { #ifdef USE_CUDA if (config.device_type == std::string("cuda") && config.data_sample_strategy != std::string("goss") && config.boosting != std::string("rf")) { - if (type == std::string("regression")) { - return new CUDARegressionL2loss(config); - } else if (type == std::string("regression_l1")) { - return new CUDARegressionL1loss(config); - } else if (type == std::string("quantile")) { - return new CUDARegressionQuantileloss(config); - } else if (type == std::string("huber")) { - return new CUDARegressionHuberLoss(config); - } else if (type == std::string("fair")) { - return new CUDARegressionFairLoss(config); - } else if (type == std::string("poisson")) { - return new CUDARegressionPoissonLoss(config); - } else if (type == std::string("binary")) { - return new CUDABinaryLogloss(config); - } else if (type == std::string("lambdarank")) { - return new CUDALambdarankNDCG(config); - } else if (type == std::string("rank_xendcg")) { - return new CUDARankXENDCG(config); - } else if (type == std::string("multiclass")) { - return new CUDAMulticlassSoftmax(config); - } else if (type == std::string("multiclassova")) { - return new CUDAMulticlassOVA(config); - } else if (type == std::string("cross_entropy")) { - Log::Warning("Objective cross_entropy is not implemented in cuda version. Fall back to boosting on CPU."); - return new CrossEntropy(config); - } else if (type == std::string("cross_entropy_lambda")) { - Log::Warning("Objective cross_entropy_lambda is not implemented in cuda version. Fall back to boosting on CPU."); - return new CrossEntropyLambda(config); - } else if (type == std::string("mape")) { - Log::Warning("Objective mape is not implemented in cuda version. Fall back to boosting on CPU."); - return new RegressionMAPELOSS(config); - } else if (type == std::string("gamma")) { - Log::Warning("Objective gamma is not implemented in cuda version. Fall back to boosting on CPU."); - return new RegressionGammaLoss(config); - } else if (type == std::string("tweedie")) { - Log::Warning("Objective tweedie is not implemented in cuda version. Fall back to boosting on CPU."); - return new RegressionTweedieLoss(config); - } else if (type == std::string("custom")) { - Log::Warning("Using customized objective with cuda. 
This requires copying gradients from CPU to GPU, which can be slow."); - return nullptr; - } + return CreateObjectiveFunctionCUDA(type, config); } else { #endif // USE_CUDA if (type == std::string("regression")) { diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index fa782ebaad25..da1a3f09ce90 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -23,7 +23,7 @@ namespace LightGBM { #define CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE (1024) -class CUDASingleGPUTreeLearner: public SerialTreeLearner { +class CUDASingleGPUTreeLearner: public SerialTreeLearner, public NCCLInfo { public: explicit CUDASingleGPUTreeLearner(const Config* config, const bool boosting_on_cuda); diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index 0018bdaf8b94..250019e27dbf 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -40,7 +40,7 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } } else if (device_type == std::string("cuda")) { if (learner_type == std::string("serial")) { - if (config->num_gpu == 1) { + if (config->num_gpus == 1) { return new CUDASingleGPUTreeLearner(config, boosting_on_cuda); } else { Log::Fatal("Currently cuda version only supports training on a single GPU.");
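The host-thread-per-GPU dispatch pattern that `NCCLTopology::DispatchPerDevice` introduces (one `std::thread` per GPU, each pinned to its device with `cudaSetDevice` before running a callback, with the master device restored afterwards) can be illustrated with the following standalone sketch. It is not part of the patch; the helper names and the simplified error handling are hypothetical, and NCCL communicator setup is intentionally omitted.

```cpp
// Standalone illustration of the host-thread-per-GPU dispatch pattern used by
// NCCLTopology::DispatchPerDevice. Names and error handling are simplified and
// hypothetical; NCCL communicator setup is intentionally omitted.
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <functional>
#include <thread>
#include <vector>

static void CheckCuda(cudaError_t status) {
  if (status != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    std::exit(EXIT_FAILURE);
  }
}

// Launch one host thread per device, bind each thread to its GPU, run the
// callback, then return the calling thread to the master device.
void DispatchPerDevice(const std::vector<int>& gpu_list, int master_gpu_device_id,
                       const std::function<void(int)>& func) {
  std::vector<std::thread> host_threads;
  host_threads.reserve(gpu_list.size());
  for (int device_id : gpu_list) {
    host_threads.emplace_back([device_id, &func]() {
      CheckCuda(cudaSetDevice(device_id));  // pin this host thread to its GPU
      func(device_id);                      // per-device work (e.g., gradient computation)
    });
  }
  for (auto& t : host_threads) {
    t.join();
  }
  CheckCuda(cudaSetDevice(master_gpu_device_id));  // restore the master device
}

int main() {
  int device_count = 0;
  CheckCuda(cudaGetDeviceCount(&device_count));
  if (device_count == 0) return 0;
  std::vector<int> gpu_list;
  for (int i = 0; i < device_count; ++i) gpu_list.push_back(i);
  DispatchPerDevice(gpu_list, /*master_gpu_device_id=*/0, [](int device_id) {
    std::printf("running on device %d\n", device_id);
  });
  return 0;
}
```

The patch uses the same structure but dispatches over `NCCLGBDTComponent` objects (one per GPU), so each callback sees that device's dataset partition, gradients, score updater, and tree learner.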